Skip to content

graflo.db.postgres.inference_utils

Inference utilities for PostgreSQL schema analysis.

This module provides utility functions for inferring relationships and patterns from PostgreSQL table and column names using heuristics and fuzzy matching.

detect_separator(text)

Detect the most common separator character in a text.

Parameters:

Name Type Description Default
text str

Text to analyze

required

Returns:

Type Description
str

Most common separator character, defaults to '_'

Source code in graflo/db/postgres/inference_utils.py
def detect_separator(text: str) -> str:
    """Detect the most common separator character in a text.

    Args:
        text: Text to analyze

    Returns:
        Most common separator character, defaults to '_'
    """
    # Common separators
    separators = ["_", "-", "."]
    counts = {sep: text.count(sep) for sep in separators}

    if max(counts.values()) > 0:
        return max(counts, key=counts.get)
    return "_"  # Default separator

fuzzy_match_fragment(fragment, vertex_names, threshold=0.6)

Fuzzy match a fragment to vertex names.

Backward-compatible wrapper function that uses the improved FuzzyMatcher.

Parameters:

Name Type Description Default
fragment str

Fragment to match

required
vertex_names list[str]

List of vertex table names to match against

required
threshold float

Similarity threshold (0.0 to 1.0)

0.6

Returns:

Type Description
str | None

Best matching vertex name or None if no match above threshold

Source code in graflo/db/postgres/inference_utils.py
def fuzzy_match_fragment(
    fragment: str, vertex_names: list[str], threshold: float = 0.6
) -> str | None:
    """Fuzzy match a fragment to vertex names.

    Backward-compatible wrapper function that uses the improved FuzzyMatcher.

    Args:
        fragment: Fragment to match
        vertex_names: List of vertex table names to match against
        threshold: Similarity threshold (0.0 to 1.0)

    Returns:
        Best matching vertex name or None if no match above threshold
    """
    matcher = FuzzyMatcher(vertex_names, threshold)
    match, _ = matcher.match(fragment)
    return match

infer_edge_vertices_from_table_name(table_name, pk_columns, fk_columns, vertex_table_names=None, match_cache=None)

Infer source and target vertex names from table name and structure.

Uses fuzzy matching to identify vertex names in table name fragments and key names. Handles patterns like: - rel_cluster_containment_host -> cluster, host, containment - rel_cluster_containment_cluster_2 -> cluster, cluster, containment (self-reference) - user_follows_user -> user, user, follows (self-reference) - product_category_mapping -> product, category, mapping

Parameters:

Name Type Description Default
table_name str

Name of the table

required
pk_columns list[str]

List of primary key column names

required
fk_columns list[dict[str, Any]]

List of foreign key dictionaries with 'column' and 'references_table' keys

required
vertex_table_names list[str] | None

Optional list of known vertex table names for fuzzy matching

None
match_cache FuzzyMatchCache | None

Optional pre-computed fuzzy match cache for better performance

None

Returns:

Type Description
tuple[str | None, str | None, str | None]

Tuple of (source_table, target_table, relation_name) or (None, None, None) if cannot infer

Source code in graflo/db/postgres/inference_utils.py
def infer_edge_vertices_from_table_name(
    table_name: str,
    pk_columns: list[str],
    fk_columns: list[dict[str, Any]],
    vertex_table_names: list[str] | None = None,
    match_cache: FuzzyMatchCache | None = None,
) -> tuple[str | None, str | None, str | None]:
    """Infer source and target vertex names from table name and structure.

    Uses fuzzy matching to identify vertex names in table name fragments and key names.
    Handles patterns like:
    - rel_cluster_containment_host -> cluster, host, containment
    - rel_cluster_containment_cluster_2 -> cluster, cluster, containment (self-reference)
    - user_follows_user -> user, user, follows (self-reference)
    - product_category_mapping -> product, category, mapping

    Args:
        table_name: Name of the table
        pk_columns: List of primary key column names
        fk_columns: List of foreign key dictionaries with 'column' and 'references_table' keys
        vertex_table_names: Optional list of known vertex table names for fuzzy matching
        match_cache: Optional pre-computed fuzzy match cache for better performance

    Returns:
        Tuple of (source_table, target_table, relation_name) or (None, None, None) if cannot infer
    """
    if vertex_table_names is None:
        vertex_table_names = []

    # Use cache if provided, otherwise create a temporary one
    if match_cache is None:
        match_cache = FuzzyMatchCache(vertex_table_names)

    # Step 1: Detect separator and split table name
    separator = detect_separator(table_name)
    table_fragments = split_by_separator(table_name, separator)

    # Step 2: Extract fragments from keys
    key_fragments = _extract_key_fragments(pk_columns, fk_columns, separator)

    # Step 3: Match vertices from table name fragments
    (
        source_match_idx,
        target_match_idx,
        source_vertex,
        target_vertex,
        matched_vertices_set,
    ) = _match_vertices_from_table_fragments(table_fragments, match_cache)

    # Step 4: Match vertices from key fragments
    matched_vertices, key_matched_vertices = _match_vertices_from_key_fragments(
        key_fragments, match_cache, matched_vertices_set, source_vertex, target_vertex
    )

    # Step 5: Extract FK vertex names
    fk_vertex_names = _extract_fk_vertex_names(fk_columns)

    # Step 6: Determine source and target vertices
    source_table, target_table = _determine_source_target_vertices(
        fk_vertex_names,
        source_match_idx,
        target_match_idx,
        source_vertex,
        target_vertex,
        key_matched_vertices,
        matched_vertices,
    )

    # Step 7: Identify relation name
    relation_name = _identify_relation_name(
        table_fragments, source_match_idx, target_match_idx, source_table, target_table
    )

    return (source_table, target_table, relation_name)

infer_vertex_from_column_name(column_name, vertex_table_names=None, match_cache=None)

Infer vertex table name from a column name using robust pattern matching.

Uses the same logic as infer_edge_vertices_from_table_name but focused on extracting vertex names from column names. Handles patterns like: - user_id -> user - product_id -> product - customer_fk -> customer - source_vertex -> source_vertex (if matches)

Parameters:

Name Type Description Default
column_name str

Name of the column

required
vertex_table_names list[str] | None

Optional list of known vertex table names for fuzzy matching

None
match_cache FuzzyMatchCache | None

Optional pre-computed fuzzy match cache for better performance

None

Returns:

Type Description
str | None

Inferred vertex table name or None if cannot infer

Source code in graflo/db/postgres/inference_utils.py
def infer_vertex_from_column_name(
    column_name: str,
    vertex_table_names: list[str] | None = None,
    match_cache: FuzzyMatchCache | None = None,
) -> str | None:
    """Infer vertex table name from a column name using robust pattern matching.

    Uses the same logic as infer_edge_vertices_from_table_name but focused on
    extracting vertex names from column names. Handles patterns like:
    - user_id -> user
    - product_id -> product
    - customer_fk -> customer
    - source_vertex -> source_vertex (if matches)

    Args:
        column_name: Name of the column
        vertex_table_names: Optional list of known vertex table names for fuzzy matching
        match_cache: Optional pre-computed fuzzy match cache for better performance

    Returns:
        Inferred vertex table name or None if cannot infer
    """
    if vertex_table_names is None:
        vertex_table_names = []

    # Use cache if provided, otherwise create a temporary one
    if match_cache is None:
        match_cache = FuzzyMatchCache(vertex_table_names)

    if not column_name:
        return None

    # Common suffixes to remove: id, fk, key, pk, ref
    common_suffixes = {"id", "fk", "key", "pk", "ref", "reference"}

    # Step 1: Try matching full column name first
    matched = match_cache.get_match(column_name)
    if matched:
        return matched

    # Step 2: Detect separator and split column name
    separator = detect_separator(column_name)
    fragments = split_by_separator(column_name, separator)

    if not fragments:
        return None

    # Step 3: Try matching fragments (excluding common suffixes)
    matched = _match_fragments_excluding_suffixes(
        fragments, match_cache, common_suffixes
    )
    if matched:
        return matched

    # Step 4: Try removing common suffix and matching again
    matched = _try_match_without_suffix(
        fragments, separator, match_cache, common_suffixes
    )
    if matched:
        return matched

    # Step 5: As last resort, try exact match against vertex names (case-insensitive)
    return _try_exact_match_with_suffix_removal(
        column_name, vertex_table_names, common_suffixes
    )

split_by_separator(text, separator)

Split text by separator, handling multiple consecutive separators.

Parameters:

Name Type Description Default
text str

Text to split

required
separator str

Separator character

required

Returns:

Type Description
list[str]

List of non-empty fragments

Source code in graflo/db/postgres/inference_utils.py
def split_by_separator(text: str, separator: str) -> list[str]:
    """Split text by separator, handling multiple consecutive separators.

    Args:
        text: Text to split
        separator: Separator character

    Returns:
        List of non-empty fragments
    """
    # Split and filter out empty strings
    parts = [p for p in text.split(separator) if p]
    return parts