Skip to content

ontocast.tool.agg.normalizer

Entity normalization for disambiguation.

This module handles the preparation of entities for embedding-based disambiguation. It creates normalized string representations r(e) that include:

- the normalized form of the entity URI
- its semantic neighbors (types, properties)

EntityNormalizer

Normalizes entities and creates string representations for embedding.

This class is responsible for transforming entity URIs into normalized string representations that can be embedded and compared.

Source code in ontocast/tool/agg/normalizer.py
class EntityNormalizer:
    """Normalizes entities and creates string representations for embedding.

    This class is responsible for transforming entity URIs into normalized
    string representations that can be embedded and compared.
    """

    def __init__(self, ontology_namespaces: set[str] | None = None):
        """Initialize the entity normalizer.

        Args:
            ontology_namespaces: Set of namespace URIs that identify ontology
                entities. Entities from these namespaces are preferred as
                representatives.
        """
        # Fall back to a fresh empty set so membership checks always work.
        self.ontology_namespaces = ontology_namespaces if ontology_namespaces else set()

    def normalize_string(self, text: str) -> str:
        """Normalize a string: lowercase, remove diacritics, clean special chars.

        CamelCase is split so that it yields the same logical tokens as
        snake_case (e.g. 'PLRedShift' -> 'pl red shift').

        Args:
            text: Input string to normalize

        Returns:
            Normalized string suitable for comparison

        Examples:
            'PLRedShift' -> 'pl red shift'
            'PL_red_shift_value' -> 'pl red shift value'
            'Café' -> 'cafe'
        """
        # Decompose (NFD) and drop combining marks ("Mn") to strip diacritics.
        decomposed = unicodedata.normalize("NFD", text)
        text = "".join(ch for ch in decomposed if unicodedata.category(ch) != "Mn")

        # Insert a space before each capital that starts a word (i.e. is
        # followed by a lowercase letter), then lowercase everything, so
        # camelCase tokenizes the same way snake_case does.
        text = re.sub(r"(?=[A-Z][a-z])", " ", text).lower()

        # Treat underscores and hyphens as word separators.
        for separator in ("_", "-"):
            text = text.replace(separator, " ")

        # Squash whitespace runs into single spaces and trim the ends.
        return re.sub(r"\s+", " ", text).strip()

    def normalize_uri(self, uri: URIRef) -> str:
        """Extract and normalize the local part of a URI.

        Args:
            uri: URI to normalize

        Returns:
            Normalized local name

        Examples:
            'http://example.org/PLRedShift' -> 'pl red shift'
            'http://example.org/PL_red_shift_value' -> 'pl red shift value'
        """
        raw = str(uri)

        # Local name = fragment when present, otherwise the last path segment.
        if "#" in raw:
            local = raw.rsplit("#", 1)[-1]
        else:
            trimmed = raw.rstrip("/")
            local = trimmed.rsplit("/", 1)[-1] if "/" in trimmed else trimmed

        # Pre-split camelCase boundaries (including acronym runs such as
        # 'HTTPServer') before the general normalization pass.
        for boundary in (r"([a-z])([A-Z])", r"([A-Z]+)([A-Z][a-z])"):
            local = re.sub(boundary, r"\1 \2", local)

        return self.normalize_string(local)

    def is_ontology_entity(self, entity: URIRef) -> bool:
        """Check if an entity belongs to an ontology namespace.

        Args:
            entity: Entity URI to check

        Returns:
            True if entity is from an ontology namespace
        """
        # str.startswith accepts a tuple of prefixes; an empty tuple never matches.
        return str(entity).startswith(tuple(self.ontology_namespaces))

    def extract_entity_context(
        self, entity: URIRef, graph: RDFGraph
    ) -> tuple[list[URIRef], list[URIRef], list[str]]:
        """Extract semantic context for an entity from the graph.

        Args:
            entity: Entity to extract context for
            graph: RDF graph containing the entity

        Returns:
            Tuple of (types, properties, labels)
        """
        types: list[URIRef] = []
        labels: list[str] = []
        seen_properties = set()

        for subj, pred, obj in graph:
            if subj == entity:
                # Outgoing edge: keep the predicate and mine type/label info.
                seen_properties.add(pred)
                if pred == RDF.type and isinstance(obj, URIRef):
                    types.append(obj)
                elif pred == RDFS.label and isinstance(obj, Literal):
                    labels.append(str(obj))
            elif obj == entity:
                # Incoming edge: only the predicate itself is recorded.
                seen_properties.add(pred)

        return types, list(seen_properties), labels

    def create_representation(
        self, entity: URIRef, graph: RDFGraph
    ) -> EntityRepresentation:
        """Create a normalized representation r(e) for an entity.

        This combines the normalized form with semantic neighbors to create
        a rich representation suitable for embedding.

        Args:
            entity: Entity URI
            graph: RDF graph containing the entity

        Returns:
            EntityRepresentation containing r(e) and metadata
        """
        normal_form = self.normalize_uri(entity)
        types, properties, labels = self.extract_entity_context(entity, graph)

        # Assemble r(e): the normal form first, then labels (most informative,
        # capped at 3), then up to 3 types, then up to 5 filtered properties.
        parts = [normal_form]
        parts += [self.normalize_string(label) for label in labels[:3]]
        parts += [f"type {self.normalize_uri(t)}" for t in types[:3]]

        # Near-universal predicates carry no distinguishing signal; drop them.
        uninformative = {RDF.type, RDFS.label, RDFS.comment}
        informative = [p for p in properties if p not in uninformative]
        parts += [f"has {self.normalize_uri(p)}" for p in informative[:5]]

        return EntityRepresentation(
            entity=entity,
            normal_form=normal_form,
            types=types,
            properties=properties,
            labels=labels,
            representation=" ".join(parts),
            is_ontology_entity=self.is_ontology_entity(entity),
        )

    def create_representations_batch(
        self, entities: list[URIRef], graphs: dict[URIRef, RDFGraph]
    ) -> dict[URIRef, EntityRepresentation]:
        """Create representations for multiple entities.

        Entities without an associated graph are skipped silently.

        Args:
            entities: List of entity URIs
            graphs: Mapping from entity to its source graph

        Returns:
            Dictionary mapping entity URIs to their representations
        """
        return {
            entity: self.create_representation(entity, graph)
            for entity in entities
            if (graph := graphs.get(entity)) is not None
        }

__init__(ontology_namespaces=None)

Initialize the entity normalizer.

Parameters:

Name Type Description Default
ontology_namespaces set[str] | None

Set of namespace URIs that identify ontology entities. Entities from these namespaces are preferred as representatives.

None
Source code in ontocast/tool/agg/normalizer.py
def __init__(self, ontology_namespaces: set[str] | None = None):
    """Initialize the entity normalizer.

    Args:
        ontology_namespaces: Set of namespace URIs that identify ontology
            entities. Entities from these namespaces are preferred as
            representatives.
    """
    # Default to a fresh empty set so later membership checks are safe.
    self.ontology_namespaces = ontology_namespaces if ontology_namespaces else set()

create_representation(entity, graph)

Create a normalized representation r(e) for an entity.

This combines the normalized form with semantic neighbors to create a rich representation suitable for embedding.

Parameters:

Name Type Description Default
entity URIRef

Entity URI

required
graph RDFGraph

RDF graph containing the entity

required

Returns:

Type Description
EntityRepresentation

EntityRepresentation containing r(e) and metadata

Source code in ontocast/tool/agg/normalizer.py
def create_representation(
    self, entity: URIRef, graph: RDFGraph
) -> EntityRepresentation:
    """Create a normalized representation r(e) for an entity.

    This combines the normalized form with semantic neighbors to create
    a rich representation suitable for embedding.

    Args:
        entity: Entity URI
        graph: RDF graph containing the entity

    Returns:
        EntityRepresentation containing r(e) and metadata
    """
    normal_form = self.normalize_uri(entity)
    types, properties, labels = self.extract_entity_context(entity, graph)

    # Assemble r(e): normal form first, then labels (most informative,
    # capped at 3), then up to 3 types, then up to 5 filtered properties.
    parts = [normal_form]
    parts += [self.normalize_string(label) for label in labels[:3]]
    parts += [f"type {self.normalize_uri(t)}" for t in types[:3]]

    # Ubiquitous predicates carry no distinguishing signal; drop them.
    uninformative = {RDF.type, RDFS.label, RDFS.comment}
    informative = [p for p in properties if p not in uninformative]
    parts += [f"has {self.normalize_uri(p)}" for p in informative[:5]]

    return EntityRepresentation(
        entity=entity,
        normal_form=normal_form,
        types=types,
        properties=properties,
        labels=labels,
        representation=" ".join(parts),
        is_ontology_entity=self.is_ontology_entity(entity),
    )

create_representations_batch(entities, graphs)

Create representations for multiple entities.

Parameters:

Name Type Description Default
entities list[URIRef]

List of entity URIs

required
graphs dict[URIRef, RDFGraph]

Mapping from entity to its source graph

required

Returns:

Type Description
dict[URIRef, EntityRepresentation]

Dictionary mapping entity URIs to their representations

Source code in ontocast/tool/agg/normalizer.py
def create_representations_batch(
    self, entities: list[URIRef], graphs: dict[URIRef, RDFGraph]
) -> dict[URIRef, EntityRepresentation]:
    """Create representations for multiple entities.

    Entities without an associated graph are skipped silently.

    Args:
        entities: List of entity URIs
        graphs: Mapping from entity to its source graph

    Returns:
        Dictionary mapping entity URIs to their representations
    """
    return {
        entity: self.create_representation(entity, graph)
        for entity in entities
        if (graph := graphs.get(entity)) is not None
    }

extract_entity_context(entity, graph)

Extract semantic context for an entity from the graph.

Parameters:

Name Type Description Default
entity URIRef

Entity to extract context for

required
graph RDFGraph

RDF graph containing the entity

required

Returns:

Type Description
tuple[list[URIRef], list[URIRef], list[str]]

Tuple of (types, properties, labels)

Source code in ontocast/tool/agg/normalizer.py
def extract_entity_context(
    self, entity: URIRef, graph: RDFGraph
) -> tuple[list[URIRef], list[URIRef], list[str]]:
    """Extract semantic context for an entity from the graph.

    Args:
        entity: Entity to extract context for
        graph: RDF graph containing the entity

    Returns:
        Tuple of (types, properties, labels)
    """
    types: list[URIRef] = []
    labels: list[str] = []
    seen_properties = set()

    for subj, pred, obj in graph:
        if subj == entity:
            # Outgoing edge: keep the predicate and mine type/label info.
            seen_properties.add(pred)
            if pred == RDF.type and isinstance(obj, URIRef):
                types.append(obj)
            elif pred == RDFS.label and isinstance(obj, Literal):
                labels.append(str(obj))
        elif obj == entity:
            # Incoming edge: only the predicate itself is recorded.
            seen_properties.add(pred)

    return types, list(seen_properties), labels

is_ontology_entity(entity)

Check if an entity belongs to an ontology namespace.

Parameters:

Name Type Description Default
entity URIRef

Entity URI to check

required

Returns:

Type Description
bool

True if entity is from an ontology namespace

Source code in ontocast/tool/agg/normalizer.py
def is_ontology_entity(self, entity: URIRef) -> bool:
    """Check if an entity belongs to an ontology namespace.

    Args:
        entity: Entity URI to check

    Returns:
        True if entity is from an ontology namespace
    """
    # str.startswith accepts a tuple of prefixes; an empty tuple never matches.
    return str(entity).startswith(tuple(self.ontology_namespaces))

normalize_string(text)

Normalize a string: lowercase, remove diacritics, clean special chars.

CamelCase is split so that it yields the same logical tokens as snake_case (e.g. 'PLRedShift' -> 'pl red shift').

Parameters:

Name Type Description Default
text str

Input string to normalize

required

Returns:

Type Description
str

Normalized string suitable for comparison

Examples:

'PLRedShift' -> 'pl red shift'
'PL_red_shift_value' -> 'pl red shift value'
'Café' -> 'cafe'

Source code in ontocast/tool/agg/normalizer.py
def normalize_string(self, text: str) -> str:
    """Normalize a string: lowercase, remove diacritics, clean special chars.

    CamelCase is split so that it yields the same logical tokens as snake_case
    (e.g. 'PLRedShift' -> 'pl red shift').

    Args:
        text: Input string to normalize

    Returns:
        Normalized string suitable for comparison

    Examples:
        'PLRedShift' -> 'pl red shift'
        'PL_red_shift_value' -> 'pl red shift value'
        'Café' -> 'cafe'
    """
    # Decompose (NFD), then drop combining marks ("Mn") to strip diacritics.
    decomposed = unicodedata.normalize("NFD", text)
    text = "".join(ch for ch in decomposed if unicodedata.category(ch) != "Mn")

    # Put a space before each capital that begins a word (i.e. is followed
    # by a lowercase letter), then lowercase, so camelCase tokenizes like
    # snake_case.
    text = re.sub(r"(?=[A-Z][a-z])", " ", text).lower()

    # Underscores and hyphens act as word separators.
    for separator in ("_", "-"):
        text = text.replace(separator, " ")

    # Collapse whitespace runs into single spaces and trim.
    return re.sub(r"\s+", " ", text).strip()

normalize_uri(uri)

Extract and normalize the local part of a URI.

Parameters:

Name Type Description Default
uri URIRef

URI to normalize

required

Returns:

Type Description
str

Normalized local name

Examples:

'http://example.org/PLRedShift' -> 'pl red shift'
'http://example.org/PL_red_shift_value' -> 'pl red shift value'

Source code in ontocast/tool/agg/normalizer.py
def normalize_uri(self, uri: URIRef) -> str:
    """Extract and normalize the local part of a URI.

    Args:
        uri: URI to normalize

    Returns:
        Normalized local name

    Examples:
        'http://example.org/PLRedShift' -> 'pl red shift'
        'http://example.org/PL_red_shift_value' -> 'pl red shift value'
    """
    raw = str(uri)

    # Local name = fragment when present, otherwise the last path segment.
    if "#" in raw:
        local = raw.rsplit("#", 1)[-1]
    else:
        trimmed = raw.rstrip("/")
        local = trimmed.rsplit("/", 1)[-1] if "/" in trimmed else trimmed

    # Split camelCase boundaries (including acronym runs like 'HTTPServer')
    # before handing off to the general string normalizer.
    for boundary in (r"([a-z])([A-Z])", r"([A-Z]+)([A-Z][a-z])"):
        local = re.sub(boundary, r"\1 \2", local)

    return self.normalize_string(local)

EntityRepresentation dataclass

Normalized representation of an entity for embedding.

Attributes:

Name Type Description
entity URIRef

Original entity URI

normal_form str

Normalized string (lowercase, no diacritics, etc.)

types list[URIRef]

List of type URIs for this entity

properties list[URIRef]

List of property URIs used with this entity

labels list[str]

List of labels found for this entity

representation str

Combined string representation r(e) for embedding

is_ontology_entity bool

Whether this entity is from an ontology namespace

Source code in ontocast/tool/agg/normalizer.py
@dataclass
class EntityRepresentation:
    """Normalized representation of an entity for embedding.

    Attributes:
        entity: Original entity URI
        normal_form: Normalized string (lowercase, no diacritics, etc.)
        types: List of type URIs for this entity
        properties: List of property URIs used with this entity
        labels: List of labels found for this entity
        representation: Combined string representation r(e) for embedding
        is_ontology_entity: Whether this entity is from an ontology namespace
    """

    entity: URIRef  # original, un-normalized URI
    normal_form: str  # normalized local name of the URI
    types: list[URIRef]  # rdf:type objects collected for the entity
    properties: list[URIRef]  # predicates seen with the entity (either direction)
    labels: list[str]  # rdfs:label literal values
    representation: str  # combined r(e) string used for embedding
    is_ontology_entity: bool  # True if the URI starts with a known ontology namespace