`pelinker.ops`¶

`iter_pmid_text_table_chunks(table_path, *, chunk_size=10000)` ¶

Yield (chunk, pmid_col, text_col) from a PMID/text CSV or TSV table.

Source code in pelinker/ops.py

def iter_pmid_text_table_chunks(
    table_path: pathlib.Path | str,
    *,
    chunk_size: int = 10_000,
) -> Iterator[tuple[pd.DataFrame, str | int, str | int]]:
    """Yield ``(chunk, pmid_col, text_col)`` from a PMID/text CSV or TSV table."""
    path = pathlib.Path(table_path).expanduser().resolve()
    if not path.exists():
        raise FileNotFoundError(f"PMID text table not found: {path}")

    _fmt, has_header, pmid_col, text_col, compression, sep = (
        _pmid_text_table_csv_kwargs(path)
    )
    reader = pd.read_csv(
        path,
        sep=sep,
        header=0 if has_header else None,
        compression=compression,
        chunksize=chunk_size,
    )
    for chunk in reader:
        yield chunk, pmid_col, text_col

`load_dataframe(table_path)` ¶

Load a dataframe from CSV/TSV file.

Parameters:

Name	Type	Description	Default
`table_path`	`Path`	Path to the CSV/TSV file (optionally gzipped)	required

Returns:

Type	Description
`DataFrame`	Loaded DataFrame

Source code in pelinker/ops.py

def load_dataframe(table_path: pathlib.Path) -> pd.DataFrame:
    """
    Load a dataframe from CSV/TSV file.

    Args:
        table_path: Path to the CSV/TSV file (optionally gzipped)

    Returns:
        Loaded DataFrame
    """
    table_path = table_path.expanduser()
    if not table_path.exists():
        raise FileNotFoundError(f"Input table not found at {table_path}")

    file_format = _detect_file_format(table_path)
    compression = "gzip" if table_path.suffix.endswith(".gz") else None
    sep = "\t" if file_format == "tsv" else ","

    return pd.read_csv(table_path, sep=sep, compression=compression)

`load_pmid_texts_from_table(table_path, pmids, *, chunk_size=10000)` ¶

Stream a PMID/text table and return rows for the requested pmids only.

Source code in pelinker/ops.py

def load_pmid_texts_from_table(
    table_path: pathlib.Path | str,
    pmids: set[str],
    *,
    chunk_size: int = 10_000,
) -> dict[str, str]:
    """Stream a PMID/text table and return rows for the requested ``pmids`` only."""
    need = {str(p) for p in pmids}
    if not need:
        return {}

    found: dict[str, str] = {}
    for chunk, pmid_col, text_col in iter_pmid_text_table_chunks(
        table_path, chunk_size=chunk_size
    ):
        pmid_series = chunk[pmid_col].astype(str)
        mask = pmid_series.isin(need)
        if not mask.any():
            continue
        for pmid, text in zip(
            pmid_series[mask],
            chunk.loc[mask, text_col].astype(str),
            strict=True,
        ):
            found[str(pmid)] = text
        if len(found) >= len(need):
            break
    return found

`parse_model_filename(filename, prefix)` ¶

Parse filename like 'res_bert_1.parquet' to extract model and layer.

Parameters:

Name	Type	Description	Default
`filename`	`str`	Filename to parse	required
`prefix`	`str`	Filename prefix that precedes the model name (e.g. 'res')	required

Returns:

Name	Type	Description
`tuple`	`tuple[str \| None, int \| None]`	(model, layer) or (None, None) if pattern doesn't match

Source code in pelinker/ops.py

def parse_model_filename(filename: str, prefix: str) -> tuple[str | None, int | None]:
    """
    Parse filename like 'res_bert_1.parquet' to extract model and layer.

    Args:
        filename: Filename to parse
        prefix: parsing prefix

    Returns:
        tuple: (model, layer) or (None, None) if pattern doesn't match
    """
    # Pattern: <prefix]>_<model>_<layer>.parquet
    pattern = rf"{prefix}_([^_]+)_(\d+)\.parquet"
    match = re.match(pattern, filename)
    if match:
        model = match.group(1)
        layer = int(match.group(2))
        return model, layer
    return None, None

pelinker.ops¶

iter_pmid_text_table_chunks(table_path, *, chunk_size=10000) ¶

load_dataframe(table_path) ¶

load_pmid_texts_from_table(table_path, pmids, *, chunk_size=10000) ¶

parse_model_filename(filename, prefix) ¶

`pelinker.ops`¶

`iter_pmid_text_table_chunks(table_path, *, chunk_size=10000)` ¶

`load_dataframe(table_path)` ¶

`load_pmid_texts_from_table(table_path, pmids, *, chunk_size=10000)` ¶

`parse_model_filename(filename, prefix)` ¶