Skip to content

pelinker.config

ClusterCompositionSnapshot dataclass

Mention-weighted mixture of KB property labels per HDBSCAN cluster after Linker.fit.

  • global_property_mass — total mention count per property in the fitted corpus (the denominator for “fraction of that property’s mass” views).
  • cluster_within_fraction — within each cluster, each property’s share of that cluster’s mention mass (sums to 1.0 per cluster).
  • cluster_fraction_of_property_mass — for each cluster and property, mentions(cluster ∩ property) / global_property_mass[property] (how much of that property’s corpus sits in this cluster; sums to ≤ 1.0 across disjoint cluster rows for a fixed property, excluding double-counting issues from overlapping keys).
Source code in pelinker/config.py
@dataclass(frozen=True)
class ClusterCompositionSnapshot:
    """
    Per-cluster mixture of KB ``property`` labels, weighted by mention counts,
    captured from the HDBSCAN clustering after ``Linker.fit``.

    * :attr:`global_property_mass` — corpus-wide mention count for each property;
      serves as the denominator for the “fraction of that property’s mass” view.
    * :attr:`cluster_within_fraction` — per cluster, the share of that cluster’s
      mention mass carried by each property (each cluster’s shares sum to 1.0).
    * :attr:`cluster_fraction_of_property_mass` — per cluster and property,
      ``mentions(cluster ∩ property) / global_property_mass[property]``: the part
      of a property’s corpus mass that lands in the cluster. Summed over disjoint
      cluster rows for a fixed property this is ≤ 1.0, excluding double-counting
      issues from overlapping keys.
    """

    global_property_mass: dict[str, int]
    cluster_within_fraction: dict[int, dict[str, float]]
    cluster_fraction_of_property_mass: dict[int, dict[str, float]]

ClusteringOptimizationConfig dataclass

Configuration for clustering optimization grid search.

Source code in pelinker/config.py
@dataclass
class ClusteringOptimizationConfig:
    """Settings for the grid search over HDBSCAN clustering hyperparameters."""

    min_class_size: int = 20
    # ``max_scale`` is the *exclusive* end of
    # ``np.arange(resolved_min_scale(), max_scale, clustering_grid_step)``.
    max_scale: int = 100
    min_scale: int | None = None
    """Inclusive lower bound of the ``min_cluster_size`` grid.

    ``None`` resolves to ``max(1, min_class_size // 2)`` (the legacy
    half-of-:attr:`min_class_size` behavior). Provide a value to decouple the
    grid start from mention-level filtering (:attr:`min_class_size`).
    """
    clustering_grid_step: int = 5
    """Spacing between successive ``min_cluster_size`` grid points (``numpy.arange`` step)."""
    rns: RandomState = field(default_factory=lambda: RandomState(seed=13))
    frac: float = 1.0
    n_embedding_batches: int | None = None
    """Upper bound on parquet batches read (`batch_size` rows per batch); None reads everything."""
    batch_size: int = 1000
    """Row count per batch while **reading mention-level embedding parquet** (not the encoder batch size)."""
    optimization_method: str = "mean"
    """Strategy for building the objective f(min_cluster_size) prior to smoothing (mean / lower_bound / weighted)."""
    grid_objective: GridObjectiveSpec = "dbcv_ari_mean_minmax"
    """Scalar optimized on the grid (a single metric or pooled DBCV+ARI; see ``clustering_grid``)."""
    grid_smooth_window: int = 3
    """Centered moving-average window (odd length) used to smooth f(x); even values are bumped up by one."""
    grid_plateau_fraction: float = 0.92
    """Plateau cutoff on the **smoothed** curve: ``y_min + this * (y_max - y_min)`` (finite values only)."""
    grid_derivative_rel_tol: float = 0.12
    """A |df/dx| under this fraction of max|df/dx| is treated as “derivative near zero” on the smoothed curve."""
    negative_screener: NegativeScreenerConfig = field(
        default_factory=NegativeScreenerConfig
    )
    """Negative-class screening applied before PCA→UMAP (see :class:`NegativeScreenerConfig`)."""

    def resolved_min_scale(self) -> int:
        """Return the inclusive start of the ``min_cluster_size`` grid (HDBSCAN hyperparameter)."""
        if self.min_scale is None:
            return max(1, self.min_class_size // 2)
        return self.min_scale

    def __post_init__(self) -> None:
        # Validation order mirrors field declaration order so the first failure
        # reported is deterministic.
        if self.min_class_size < 1:
            raise ValueError("min_class_size must be >= 1")
        if self.min_scale is not None and self.min_scale < 1:
            raise ValueError("min_scale must be >= 1 when provided")
        grid_lo = self.resolved_min_scale()
        if self.max_scale < grid_lo:
            raise ValueError(
                f"max_scale must be >= resolved min_scale ({grid_lo}); got max_scale={self.max_scale}"
            )
        if self.clustering_grid_step < 1:
            raise ValueError("clustering_grid_step must be >= 1")
        if self.batch_size < 1:
            raise ValueError("batch_size must be >= 1")
        if not 0 < self.frac <= 1:
            raise ValueError("frac must be in range (0, 1]")
        if self.n_embedding_batches is not None and self.n_embedding_batches < 1:
            raise ValueError("n_embedding_batches must be >= 1 when provided")
        if not self.optimization_method:
            raise ValueError("optimization_method must be a non-empty string")
        if self.grid_objective not in _GRID_OBJECTIVES:
            raise ValueError(
                f"grid_objective must be one of {sorted(_GRID_OBJECTIVES)}"
            )
        if self.grid_smooth_window < 1:
            raise ValueError("grid_smooth_window must be >= 1")
        if not 0 < self.grid_plateau_fraction <= 1:
            raise ValueError("grid_plateau_fraction must be in (0, 1]")
        if self.grid_derivative_rel_tol <= 0:
            raise ValueError("grid_derivative_rel_tol must be > 0")

    def to_dict(self) -> dict[str, Any]:
        """Serialize the config (nested dataclasses included) to a plain dict."""
        return asdict(self)

batch_size = 1000 class-attribute instance-attribute

Rows per batch when reading mention-level embedding parquet (not encoder batch size).

clustering_grid_step = 5 class-attribute instance-attribute

Step between consecutive min_cluster_size values on the grid (numpy.arange step).

grid_derivative_rel_tol = 0.12 class-attribute instance-attribute

|df/dx| below this times max|df/dx| counts as “derivative near zero” on the smoothed curve.

grid_objective = 'dbcv_ari_mean_minmax' class-attribute instance-attribute

Which scalar to optimize on the grid (single metric or pooled DBCV+ARI; see clustering_grid).

grid_plateau_fraction = 0.92 class-attribute instance-attribute

Plateau threshold on the smoothed curve: y_min + this * (y_max - y_min) (finite values only).

grid_smooth_window = 3 class-attribute instance-attribute

Odd-length centered moving-average window for smoothing f(x). Even values are bumped up by one.

min_scale = None class-attribute instance-attribute

Lower bound (inclusive) for the min_cluster_size grid.

When None, defaults to max(1, min_class_size // 2) (legacy behavior: half of min_class_size). Set explicitly to decouple grid start from mention-level filtering (min_class_size).

n_embedding_batches = None class-attribute instance-attribute

Cap parquet reads at this many batches (batch_size rows each); None = read all.

negative_screener = field(default_factory=NegativeScreenerConfig) class-attribute instance-attribute

Negative-class screening before PCA→UMAP (see NegativeScreenerConfig).

optimization_method = 'mean' class-attribute instance-attribute

How to build the objective f(min_cluster_size) before smoothing (mean / lower_bound / weighted).

resolved_min_scale()

Inclusive start of the min_cluster_size grid (HDBSCAN hyperparameter).

Source code in pelinker/config.py
def resolved_min_scale(self) -> int:
    """Return the inclusive start of the ``min_cluster_size`` grid (HDBSCAN hyperparameter)."""
    if self.min_scale is None:
        # Legacy default: half of min_class_size, floored at 1.
        return max(1, self.min_class_size // 2)
    return self.min_scale

EmbeddingModelMetadata dataclass

Describes which embedding backbones/layers produced the model (saved with the Linker).

Source code in pelinker/config.py
@dataclass(frozen=True)
class EmbeddingModelMetadata:
    """Records the embedding backbones/layers that produced the model (saved with the Linker)."""

    sources: tuple[EmbeddingSourceSpec, ...]

    def __post_init__(self) -> None:
        # A model described by zero embedding sources is meaningless.
        if not self.sources:
            raise ValueError("sources must contain at least one EmbeddingSourceSpec")

    @classmethod
    def from_single(cls, model_type: str, layers_spec: str) -> EmbeddingModelMetadata:
        """Convenience constructor for the common single-backbone case."""
        spec = EmbeddingSourceSpec(model_type=model_type, layers_spec=layers_spec)
        return cls(sources=(spec,))

EmbeddingSourceSpec dataclass

One backbone + layer selection (e.g. for a single encoder or one branch of a fused model).

Source code in pelinker/config.py
@dataclass(frozen=True)
class EmbeddingSourceSpec:
    """A single backbone plus its layer selection (one encoder, or one branch of a fused model)."""

    model_type: str
    layers_spec: str

    def __post_init__(self) -> None:
        # Both identifiers are required; an empty string makes the spec ambiguous.
        for name, value in (("model_type", self.model_type), ("layers_spec", self.layers_spec)):
            if not value:
                raise ValueError(f"{name} must be a non-empty string")

EmbeddingTrainingConfig dataclass

Inputs and runtime settings used only while embedding the corpus (not part of model identity).

Source code in pelinker/config.py
@dataclass
class EmbeddingTrainingConfig:
    """Inputs and runtime settings used only while embedding the corpus (not part of model identity)."""

    input_text_table_path: Path
    kb_csv_path: Path
    use_gpu: bool = False
    input_buffer_rows: int = 1000
    """Rows read per ``pandas.read_csv(..., chunksize=...)`` pass over the text table (I/O buffer only)."""
    encoder_batch_size: int = 200
    """Table rows encoded per transformer forward pass; reduce when GPU memory is tight."""
    nlp_model: str = "en_core_web_trf"
    max_input_buffers: int | None = None
    """When set, stop after this many text-table read passes (each up to ``input_buffer_rows`` rows)."""
    negatives_per_positive: float = 0.0
    """Random negative mentions sampled per positive mention."""
    negative_label: str = NEGATIVE_LABEL
    """Entity label assigned to synthetic negative rows."""
    negative_seed: int | None = 13
    """Optional seed making negative sampling deterministic."""

    def __post_init__(self) -> None:
        if self.input_buffer_rows < 1:
            raise ValueError("input_buffer_rows must be >= 1")
        if self.encoder_batch_size < 1:
            raise ValueError("encoder_batch_size must be >= 1")
        if self.max_input_buffers is not None and self.max_input_buffers < 1:
            raise ValueError("max_input_buffers must be >= 1 when provided")
        if self.negatives_per_positive < 0:
            raise ValueError("negatives_per_positive must be >= 0")
        if not self.negative_label:
            raise ValueError("negative_label must be a non-empty string")

        # Normalize both input paths identically: env vars first, then ``~``.
        def _expand(raw) -> Path:
            return Path(os.path.expandvars(os.fspath(raw))).expanduser()

        self.input_text_table_path = _expand(self.input_text_table_path)
        self.kb_csv_path = _expand(self.kb_csv_path)

encoder_batch_size = 200 class-attribute instance-attribute

How many table rows are encoded per transformer forward pass; lower if GPU memory is tight.

input_buffer_rows = 1000 class-attribute instance-attribute

Rows read per pandas.read_csv(..., chunksize=...) pass over the text table (I/O buffer only).

max_input_buffers = None class-attribute instance-attribute

If set, stop after this many text-table read passes (each up to input_buffer_rows rows).

negative_label = NEGATIVE_LABEL class-attribute instance-attribute

Entity label to use for synthetic negative rows.

negative_seed = 13 class-attribute instance-attribute

Optional random seed for deterministic negative sampling.

negatives_per_positive = 0.0 class-attribute instance-attribute

Number of random negative mentions to sample per positive mention.

KBConfig dataclass

Metadata for the knowledge base packaged with a fitted Linker.

Source code in pelinker/config.py
@dataclass(frozen=True)
class KBConfig:
    """Metadata describing the knowledge base bundled with a fitted Linker."""

    name: str
    version: str
    created_at: date
    description: str = ""
    entity_count: int | None = None
    """Filled in after fit from the vocabulary size when left as None."""

    def __post_init__(self) -> None:
        # Whitespace-only names are as useless as empty ones.
        if not self.name.strip():
            raise ValueError("name must be a non-empty string")
        _validate_semver(self.version)
        if self.entity_count is not None:
            if self.entity_count < 0:
                raise ValueError("entity_count must be >= 0 when provided")

entity_count = None class-attribute instance-attribute

Set after fit from vocabulary size when None at construction time.

LinkerFitConfig dataclass

Parquet read + mention filters + screener settings for pelinker.model.Linker.fit.

Source code in pelinker/config.py
@dataclass
class LinkerFitConfig:
    """Parquet read + mention filters + screener settings for :meth:`~pelinker.model.Linker.fit`."""

    min_class_size: int = 20
    """Minimum mention rows a KB ``entity`` needs before training (the negative label is exempt)."""
    batch_size: int = 1000
    n_embedding_batches: int | None = None
    negative_screener: NegativeScreenerConfig = field(
        default_factory=NegativeScreenerConfig
    )
    manifold_oov_screener: ManifoldOovScreenerConfig = field(
        default_factory=ManifoldOovScreenerConfig
    )

    def __post_init__(self) -> None:
        # Range checks only; the screener sub-configs validate themselves.
        if self.min_class_size < 1:
            raise ValueError("min_class_size must be >= 1")
        if self.batch_size < 1:
            raise ValueError("batch_size must be >= 1")
        if self.n_embedding_batches is not None and self.n_embedding_batches < 1:
            raise ValueError("n_embedding_batches must be >= 1 when provided")

min_class_size = 20 class-attribute instance-attribute

Minimum mention rows per KB entity before training (negative label exempt).

ManifoldOovScreenerConfig dataclass

3D (residual, Mahalanobis, spectral entropy) OOV score model; predict-time gate only.

Source code in pelinker/config.py
@dataclass(frozen=True)
class ManifoldOovScreenerConfig:
    """3D (residual, Mahalanobis, spectral entropy) OOV score model; predict-time gate only."""

    enabled: bool = True
    cv_n_splits: int = 20
    cv_test_size: float = 0.2
    cv_random_state: int = 42
    dt_max_depth_candidates: tuple[int | None, ...] = (None, 4, 8)
    """``None`` means unrestricted depth (sklearn default)."""
    dt_min_samples_leaf_candidates: tuple[int, ...] = (1, 2, 5)

    def __post_init__(self) -> None:
        if self.cv_n_splits < 2:
            raise ValueError("cv_n_splits must be >= 2")
        if not 0.0 < self.cv_test_size < 1.0:
            raise ValueError("cv_test_size must be in (0, 1)")
        if not self.dt_max_depth_candidates:
            raise ValueError("dt_max_depth_candidates must be non-empty")
        if not self.dt_min_samples_leaf_candidates:
            raise ValueError("dt_min_samples_leaf_candidates must be non-empty")
        for leaf in self.dt_min_samples_leaf_candidates:
            if int(leaf) < 1:
                raise ValueError("dt_min_samples_leaf_candidates values must be >= 1")

dt_max_depth_candidates = (None, 4, 8) class-attribute instance-attribute

None means unrestricted depth (sklearn default).

NegativeScreenerConfig dataclass

Binary LDA/SVM screen for negative_label vs KB mentions before PCA→UMAP.

Source code in pelinker/config.py
@dataclass(frozen=True)
class NegativeScreenerConfig:
    """Binary LDA/SVM screen separating ``negative_label`` from KB mentions before PCA→UMAP."""

    kind: ScreenerKind = "lda"
    """Estimator persisted on :class:`~pelinker.model.Linker` (``Linker.screener``)."""
    negative_label: str = NEGATIVE_LABEL
    cv_n_splits: int = 20
    cv_test_size: float = 0.2
    cv_random_state: int = 42

    def __post_init__(self) -> None:
        # Whitespace-only labels are rejected along with empty ones.
        if not self.negative_label.strip():
            raise ValueError("negative_label must be non-empty")
        if self.cv_n_splits < 2:
            raise ValueError("cv_n_splits must be >= 2")
        # The test fraction must leave room for both train and test splits.
        if not 0.0 < self.cv_test_size < 1.0:
            raise ValueError("cv_test_size must be in (0, 1)")

kind = 'lda' class-attribute instance-attribute

Estimator persisted on pelinker.model.Linker (Linker.screener).

TransformConfig dataclass

Configuration for the embedding transformation pipeline.

Source code in pelinker/config.py
@dataclass
class TransformConfig:
    """Configuration for the embedding transformation pipeline."""

    # --- PCA stage ---
    pca_components: int = 50
    """How many principal components survive PCA reduction."""

    # --- Clustering UMAP stage ---
    umap_components: int = 4
    """UMAP output dimensionality used for clustering (typically 3-5)."""
    umap_metric: str = "cosine"
    """UMAP distance metric (default: 'cosine')."""

    # --- Visualization UMAP stage ---
    umap_viz_components: int = 3
    """UMAP output dimensionality used for visualization (default: 3)."""
    umap_viz_metric: str = "cosine"
    """Distance metric for the visualization UMAP (default: 'cosine')."""

    def __post_init__(self):
        """Validate configuration parameters."""
        # Table-driven lower-bound checks; messages match the originals exactly.
        for attr, floor in (
            ("pca_components", 1),
            ("umap_components", 2),
            ("umap_viz_components", 2),
        ):
            if getattr(self, attr) < floor:
                raise ValueError(f"{attr} must be >= {floor}")

pca_components = 50 class-attribute instance-attribute

Number of principal components to keep after PCA reduction.

umap_components = 4 class-attribute instance-attribute

Number of UMAP dimensions for clustering (typically 3-5).

umap_metric = 'cosine' class-attribute instance-attribute

Distance metric for UMAP (default: 'cosine').

umap_viz_components = 3 class-attribute instance-attribute

Number of UMAP dimensions for visualization (default: 3).

umap_viz_metric = 'cosine' class-attribute instance-attribute

Distance metric for visualization UMAP (default: 'cosine').

__post_init__()

Validate configuration parameters.

Source code in pelinker/config.py
def __post_init__(self):
    """Validate configuration parameters."""
    # Table-driven lower-bound checks; messages match the originals exactly.
    for attr, floor in (
        ("pca_components", 1),
        ("umap_components", 2),
        ("umap_viz_components", 2),
    ):
        if getattr(self, attr) < floor:
            raise ValueError(f"{attr} must be >= {floor}")