Skip to content

pelinker.io.reader

Unified reader interface for reading large files in chunks. Supports Feather, Parquet, and CSV/TSV formats.

read_batches(file_path, batch_size=1000, file_type=None, **kwargs)

Read large files in batches, supporting Feather, Parquet, and CSV/TSV formats.

Automatically detects file type from extension if not provided.

Parameters:

Name Type Description Default
file_path str

Path to the file to read

required
batch_size int

Number of rows per batch (default: 1000)

1000
file_type Optional[str]

Optional file type override ('feather', 'parquet', 'csv'). If None, auto-detects from file extension.

None
**kwargs

Additional arguments passed to the format-specific readers. For CSV: `sep`, `header`, etc. (any `pandas.read_csv` arguments). For Parquet: `columns` (a list of column names to read).

{}

Yields:

Type Description
DataFrame

pd.DataFrame: Batches of data as pandas DataFrames

Examples:

>>> # Read feather file
>>> for batch in read_batches("data.feather", batch_size=5000):
...     process(batch)
>>> # Read parquet file
>>> for batch in read_batches("data.parquet", batch_size=10000):
...     process(batch)
>>> # Read CSV file with custom separator
>>> for batch in read_batches("data.csv", batch_size=2000, sep=";"):
...     process(batch)
Source code in pelinker/io/reader.py
def read_batches(
    file_path: str, batch_size: int = 1000, file_type: Optional[str] = None, **kwargs
) -> Iterator[pd.DataFrame]:
    """
    Stream a large file as a sequence of pandas DataFrames.

    Supports Feather, Parquet, and CSV/TSV inputs. When ``file_type`` is
    not given, the format is inferred from the file extension.

    Args:
        file_path: Path of the file to read.
        batch_size: Maximum number of rows per yielded batch (default: 1000).
        file_type: Explicit format override ('feather', 'parquet', 'csv');
                   ``None`` means auto-detect from the extension.
        **kwargs: Forwarded to the format-specific reader:
                  - CSV: any ``pandas.read_csv`` keyword (``sep``, ``header``, ...)
                  - Parquet: ``columns`` (list of column names to load)

    Yields:
        pd.DataFrame: Successive batches of rows.

    Raises:
        ValueError: If the (detected or given) file type is unsupported.

    Examples:
        >>> # Read feather file
        >>> for batch in read_batches("data.feather", batch_size=5000):
        ...     process(batch)

        >>> # Read parquet file
        >>> for batch in read_batches("data.parquet", batch_size=10000):
        ...     process(batch)

        >>> # Read CSV file with custom separator
        >>> for batch in read_batches("data.csv", batch_size=2000, sep=";"):
        ...     process(batch)
    """
    # Resolve the format first so every branch below sees a concrete type.
    if file_type is None:
        file_type = _detect_file_type(file_path)

    # Dispatch to the matching reader; guard clauses keep the flow flat.
    if file_type == "feather":
        yield from _read_feather_batches(file_path, batch_size)
        return
    if file_type == "parquet":
        # Only the 'columns' keyword is meaningful for the parquet reader.
        yield from _read_parquet_batches(
            file_path, batch_size, columns=kwargs.pop("columns", None)
        )
        return
    if file_type == "csv":
        yield from _read_csv_batches(file_path, batch_size, **kwargs)
        return
    raise ValueError(f"Unsupported file type: {file_type}")