pelinker.io

Unified IO interface for reading large files in chunks, with support for Feather, Parquet, and CSV/TSV formats, plus helpers for loading plain or gzip-compressed JSON.

is_gzip_file_path(path)

True when path ends with .gz (gzip-wrapped payload, e.g. *.json.gz).

Source code in pelinker/io/json_files.py
def is_gzip_file_path(path: pathlib.Path | str) -> bool:
    """True when ``path`` ends with ``.gz`` (gzip-wrapped payload, e.g. ``*.json.gz``)."""
    return pathlib.Path(path).suffix.lower() == ".gz"
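
For example (the paths are illustrative):

>>> is_gzip_file_path("records.json.gz")
True
>>> is_gzip_file_path("records.json")
False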

load_json_path(path)

Parse one JSON value from a filesystem path.

If the path suffix is .gz, the file is read as UTF-8 text through gzip; otherwise it is read as UTF-8 text from the raw file.

Source code in pelinker/io/json_files.py
def load_json_path(path: pathlib.Path | str) -> Any:
    """
    Parse one JSON value from a filesystem path.

    If the path suffix is ``.gz``, the file is read as UTF-8 text through gzip;
    otherwise it is read as UTF-8 text from the raw file.
    """
    p = pathlib.Path(path).expanduser()
    if is_gzip_file_path(p):
        with gzip.open(p, mode="rt", encoding="utf-8") as fh:
            return json.load(fh)
    with p.open(mode="r", encoding="utf-8") as fh:
        return json.load(fh)
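
A minimal round-trip sketch (the file name here is hypothetical, and the import assumes the package path mirrors the source file):

import gzip
import json
import pathlib

from pelinker.io.json_files import load_json_path

payload = {"name": "example", "ids": [1, 2, 3]}

# Write a gzip-wrapped JSON file, then read it back through the gzip branch.
path = pathlib.Path("payload.json.gz")
with gzip.open(path, mode="wt", encoding="utf-8") as fh:
    json.dump(payload, fh)

assert load_json_path(path) == payload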

read_batches(file_path, batch_size=1000, file_type=None, **kwargs)

Read large files in batches, supporting Feather, Parquet, and CSV/TSV formats.

Automatically detects file type from extension if not provided.

Parameters:

    file_path (str, required): Path to the file to read.
    batch_size (int, default 1000): Number of rows per batch.
    file_type (Optional[str], default None): Optional file type override
        ('feather', 'parquet', 'csv'). If None, auto-detects from the file
        extension.
    **kwargs: Additional arguments passed to the format-specific readers:
        - For CSV: sep, header, etc. (pandas.read_csv arguments)
        - For Parquet: columns (a list of column names to read)

Yields:

    pd.DataFrame: Batches of data as pandas DataFrames.

Examples:

>>> # Read feather file
>>> for batch in read_batches("data.feather", batch_size=5000):
...     process(batch)
>>> # Read parquet file
>>> for batch in read_batches("data.parquet", batch_size=10000):
...     process(batch)
>>> # Read CSV file with custom separator
>>> for batch in read_batches("data.csv", batch_size=2000, sep=";"):
...     process(batch)
Source code in pelinker/io/reader.py
def read_batches(
    file_path: str, batch_size: int = 1000, file_type: Optional[str] = None, **kwargs
) -> Iterator[pd.DataFrame]:
    """
    Read large files in batches, supporting Feather, Parquet, and CSV/TSV formats.

    Automatically detects file type from extension if not provided.

    Args:
        file_path: Path to the file to read
        batch_size: Number of rows per batch (default: 1000)
        file_type: Optional file type override ('feather', 'parquet', 'csv').
                   If None, auto-detects from file extension.
        **kwargs: Additional arguments passed to format-specific readers:
                  - For CSV: sep, header, etc. (pandas.read_csv arguments)
                  - For Parquet: columns (list of column names to read)

    Yields:
        pd.DataFrame: Batches of data as pandas DataFrames

    Examples:
        >>> # Read feather file
        >>> for batch in read_batches("data.feather", batch_size=5000):
        ...     process(batch)

        >>> # Read parquet file
        >>> for batch in read_batches("data.parquet", batch_size=10000):
        ...     process(batch)

        >>> # Read CSV file with custom separator
        >>> for batch in read_batches("data.csv", batch_size=2000, sep=";"):
        ...     process(batch)
    """
    if file_type is None:
        file_type = _detect_file_type(file_path)

    if file_type == "feather":
        yield from _read_feather_batches(file_path, batch_size)
    elif file_type == "parquet":
        # Extract columns from kwargs if provided
        columns = kwargs.pop("columns", None)
        yield from _read_parquet_batches(file_path, batch_size, columns=columns)
    elif file_type == "csv":
        yield from _read_csv_batches(file_path, batch_size, **kwargs)
    else:
        raise ValueError(f"Unsupported file type: {file_type}")
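
The private helpers called above are not documented on this page. Below is a minimal sketch of how they could be implemented, assuming the CSV reader delegates to pandas' chunked reader, the Parquet reader to pyarrow's iter_batches, and the Feather reader to slicing a pyarrow table; the actual pelinker implementations may differ.

from typing import Iterator, Optional

import pandas as pd
import pyarrow.feather as feather
import pyarrow.parquet as pq


def _detect_file_type(file_path: str) -> str:
    # Map the file extension onto one of the supported type names.
    suffix = file_path.rsplit(".", 1)[-1].lower()
    if suffix == "feather":
        return "feather"
    if suffix == "parquet":
        return "parquet"
    if suffix in ("csv", "tsv"):
        return "csv"
    raise ValueError(f"Cannot detect file type from extension: {file_path}")


def _read_csv_batches(
    file_path: str, batch_size: int, **kwargs
) -> Iterator[pd.DataFrame]:
    # pandas yields successive DataFrames of at most batch_size rows
    # when read_csv is given a chunksize (pass sep="\t" for TSV).
    yield from pd.read_csv(file_path, chunksize=batch_size, **kwargs)


def _read_parquet_batches(
    file_path: str, batch_size: int, columns: Optional[list] = None
) -> Iterator[pd.DataFrame]:
    # pyarrow streams record batches without materializing the whole file.
    parquet_file = pq.ParquetFile(file_path)
    for record_batch in parquet_file.iter_batches(
        batch_size=batch_size, columns=columns
    ):
        yield record_batch.to_pandas()


def _read_feather_batches(
    file_path: str, batch_size: int
) -> Iterator[pd.DataFrame]:
    # Feather has no incremental reader in pandas, so read the table
    # (memory-mapped to limit resident memory) and slice it into windows.
    table = feather.read_table(file_path, memory_map=True)
    for start in range(0, table.num_rows, batch_size):
        yield table.slice(start, batch_size).to_pandas()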