Skip to content

ontocast.tool.agg.promoter

URI promotion from chunk-level to document-level.

This module handles the promotion of chunk-level entity URIs to document-level URIs, while preserving ontology entity URIs unchanged.

URIPromoter

Promotes chunk-level URIs to document-level URIs.

This class is responsible for converting entities from chunk namespaces to the document namespace, while keeping ontology entities unchanged.

Source code in ontocast/tool/agg/promoter.py
class URIPromoter:
    """Promotes chunk-level URIs to document-level URIs.

    Entities whose URI starts with one of the chunk namespaces are rewritten
    into the document namespace. Entities in an ontology namespace, and
    entities in any namespace the promoter does not know, are returned
    unchanged.

    The promoter is stateful: every URI handed out by ``_ensure_unique_uri``
    is remembered in ``_used_uris`` so that later requests for the same URI
    receive a counter-suffixed variant instead.
    """

    def __init__(
        self,
        doc_namespace: str,
        chunk_namespaces: set[str],
        ontology_namespaces: set[str],
    ):
        """Initialize the URI promoter.

        Args:
            doc_namespace: Document namespace for promoted URIs. A trailing
                "/" is appended when it ends with neither "/" nor "#".
            chunk_namespaces: Set of chunk namespace URIs (stored as given)
            ontology_namespaces: Set of ontology namespace URIs (preserved as-is)
        """
        self.doc_namespace = self._normalize_namespace(doc_namespace)
        self.chunk_namespaces = chunk_namespaces
        self.ontology_namespaces = ontology_namespaces
        # URIs already returned by _ensure_unique_uri for this promoter.
        self._used_uris: set[str] = set()

    def _normalize_namespace(self, namespace: str) -> str:
        """Ensure namespace ends with appropriate separator ("/" or "#")."""
        return namespace if namespace.endswith(("/", "#")) else namespace + "/"

    def _clean_local_name(self, name: str) -> str:
        """Clean a name for use as URI local part.

        Args:
            name: Name to clean

        Returns:
            Cleaned name suitable for URI, or "entity" when nothing is left
            after cleaning.
        """
        # Replace every character other than word characters, "-" and "."
        cleaned = re.sub(r"[^\w\-.]", "_", name)
        # Collapse runs of underscores into a single one
        cleaned = re.sub(r"_+", "_", cleaned)
        # Remove leading/trailing underscores
        cleaned = cleaned.strip("_")
        return cleaned or "entity"

    def _ensure_unique_uri(self, uri: str) -> str:
        """Ensure URI is unique by appending counter if needed.

        The returned URI is recorded as used, so calling this method again
        with the same argument yields ``<uri>_1``, ``<uri>_2``, and so on.

        Args:
            uri: Proposed URI

        Returns:
            Unique URI
        """
        if uri not in self._used_uris:
            self._used_uris.add(uri)
            return uri

        # The namespace/local split does not depend on the counter, so it is
        # computed once instead of on every collision.
        if "#" in uri:
            namespace, local = uri.rsplit("#", 1)
            separator = "#"
        else:
            trimmed = uri.rstrip("/")
            if "/" in trimmed:
                namespace, local = trimmed.rsplit("/", 1)
            else:
                namespace, local = trimmed, uri
            separator = "/"

        # Try _1, _2, ... until an unused URI is found.
        counter = 1
        candidate = f"{namespace}{separator}{local}_{counter}"
        while candidate in self._used_uris:
            counter += 1
            candidate = f"{namespace}{separator}{local}_{counter}"

        self._used_uris.add(candidate)
        return candidate

    def should_promote(self, entity: URIRef) -> bool:
        """Check if an entity should be promoted to document namespace.

        Ontology namespaces are checked first, so an entity matching both an
        ontology namespace and a chunk namespace is not promoted.

        Args:
            entity: Entity URI to check

        Returns:
            True if entity should be promoted (is from chunk namespace)
        """
        entity_str = str(entity)

        # Don't promote ontology entities
        if any(entity_str.startswith(ns) for ns in self.ontology_namespaces):
            return False

        # Promote chunk entities
        if any(entity_str.startswith(ns) for ns in self.chunk_namespaces):
            return True

        # Unknown namespace - be conservative, don't promote
        return False

    def promote_entity(
        self, entity: URIRef, representation: EntityRepresentation
    ) -> URIRef:
        """Promote a chunk entity to document namespace.

        Each call for a promotable entity reserves a fresh URI, so calling
        this twice for the same entity returns two different URIs.

        Args:
            entity: Original entity URI
            representation: Entity representation with metadata; its
                ``normal_form`` supplies the local name

        Returns:
            Promoted entity URI, or ``entity`` itself when it is not promoted
        """
        if not self.should_promote(entity):
            # Ontology and unknown-namespace entities stay unchanged
            return entity

        # Use the cleaned normalized form as local name in the document namespace
        local_name = self._clean_local_name(representation.normal_form)
        promoted_uri_str = f"{self.doc_namespace}{local_name}"

        # Ensure uniqueness
        unique_uri_str = self._ensure_unique_uri(promoted_uri_str)

        return URIRef(unique_uri_str)

    def create_promotion_mapping(
        self,
        entities: list[URIRef],
        representations: dict[URIRef, EntityRepresentation],
    ) -> dict[URIRef, URIRef]:
        """Create mapping from original to promoted URIs.

        Entities without a representation are mapped to themselves and a
        warning is logged. An entity that occurs more than once in
        ``entities`` is processed only on its first occurrence.

        Args:
            entities: List of entity URIs to promote
            representations: Dictionary mapping entities to their representations

        Returns:
            Dictionary mapping original URIs to promoted URIs (e -> promoted(e))
        """
        mapping: dict[URIRef, URIRef] = {}
        promoted_count = 0
        preserved_count = 0

        for entity in entities:
            if entity in mapping:
                # Promoting a repeated entity again would reserve a second,
                # counter-suffixed URI, overwrite the first mapping entry and
                # inflate the counts below.
                continue

            representation = representations.get(entity)
            if representation is None:
                logger.warning(f"No representation found for entity {entity}")
                mapping[entity] = entity
                continue

            promoted = self.promote_entity(entity, representation)
            mapping[entity] = promoted

            if promoted != entity:
                promoted_count += 1
            else:
                preserved_count += 1

        logger.info(
            f"Created promotion mapping: "
            f"{promoted_count} promoted, "
            f"{preserved_count} preserved (ontology entities)"
        )

        return mapping

    def compose_mappings(
        self,
        clustering_mapping: dict[URIRef, URIRef],
        promotion_mapping: dict[URIRef, URIRef],
    ) -> dict[URIRef, URIRef]:
        """Compose clustering and promotion mappings.

        First, entities are mapped to their cluster representatives (clustering_mapping).
        Then, representatives are promoted to document namespace (promotion_mapping).

        The composed mapping is: e -> promoted(representative(e))

        Only keys of ``clustering_mapping`` appear in the result. A
        representative missing from ``promotion_mapping`` is kept as-is.

        Args:
            clustering_mapping: Map from entity to cluster representative (e -> e_rep)
            promotion_mapping: Map from representative to promoted URI (e_rep -> e')

        Returns:
            Composed mapping (e -> e')
        """
        composed = {}

        for original, representative in clustering_mapping.items():
            # Look up the promoted version of the representative
            promoted = promotion_mapping.get(representative, representative)
            composed[original] = promoted

        logger.info(
            f"Composed mapping: {len(composed)} entities mapped to "
            f"{len(set(composed.values()))} final URIs"
        )

        return composed

__init__(doc_namespace, chunk_namespaces, ontology_namespaces)

Initialize the URI promoter.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| doc_namespace | str | Document namespace for promoted URIs | required |
| chunk_namespaces | set[str] | Set of chunk namespace URIs | required |
| ontology_namespaces | set[str] | Set of ontology namespace URIs (preserved as-is) | required |
Source code in ontocast/tool/agg/promoter.py
def __init__(
    self,
    doc_namespace: str,
    chunk_namespaces: set[str],
    ontology_namespaces: set[str],
):
    """Set up the promoter's namespaces and its empty used-URI registry.

    Args:
        doc_namespace: Target namespace for promoted URIs; stored after
            being passed through ``_normalize_namespace``
        chunk_namespaces: Namespaces whose entities get promoted
        ontology_namespaces: Namespaces whose entities are left untouched
    """
    # No promoted URI has been handed out yet.
    self._used_uris: set[str] = set()
    # The namespace sets are stored exactly as supplied by the caller.
    self.ontology_namespaces = ontology_namespaces
    self.chunk_namespaces = chunk_namespaces
    self.doc_namespace = self._normalize_namespace(doc_namespace)

compose_mappings(clustering_mapping, promotion_mapping)

Compose clustering and promotion mappings.

First, entities are mapped to their cluster representatives (clustering_mapping). Then, representatives are promoted to document namespace (promotion_mapping).

The composed mapping is: e -> promoted(representative(e))

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| clustering_mapping | dict[URIRef, URIRef] | Map from entity to cluster representative (e -> e_rep) | required |
| promotion_mapping | dict[URIRef, URIRef] | Map from representative to promoted URI (e_rep -> e') | required |

Returns:

| Type | Description |
| --- | --- |
| dict[URIRef, URIRef] | Composed mapping (e -> e') |

Source code in ontocast/tool/agg/promoter.py
def compose_mappings(
    self,
    clustering_mapping: dict[URIRef, URIRef],
    promotion_mapping: dict[URIRef, URIRef],
) -> dict[URIRef, URIRef]:
    """Chain the clustering mapping with the promotion mapping.

    Every entity is first sent to its cluster representative and that
    representative is then sent to its promoted URI, giving
    e -> promoted(representative(e)). A representative that has no entry
    in ``promotion_mapping`` is used unchanged.

    Args:
        clustering_mapping: Entity -> cluster representative (e -> e_rep)
        promotion_mapping: Representative -> promoted URI (e_rep -> e')

    Returns:
        Mapping from each key of ``clustering_mapping`` to its final URI
    """
    composed = {
        source: promotion_mapping.get(rep, rep)
        for source, rep in clustering_mapping.items()
    }

    logger.info(
        f"Composed mapping: {len(composed)} entities mapped to "
        f"{len(set(composed.values()))} final URIs"
    )

    return composed

create_promotion_mapping(entities, representations)

Create mapping from original to promoted URIs.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| entities | list[URIRef] | List of entity URIs to promote | required |
| representations | dict[URIRef, EntityRepresentation] | Dictionary mapping entities to their representations | required |

Returns:

| Type | Description |
| --- | --- |
| dict[URIRef, URIRef] | Dictionary mapping original URIs to promoted URIs (e -> promoted(e)) |

Source code in ontocast/tool/agg/promoter.py
def create_promotion_mapping(
    self,
    entities: list[URIRef],
    representations: dict[URIRef, EntityRepresentation],
) -> dict[URIRef, URIRef]:
    """Create mapping from original to promoted URIs.

    Entities without a representation are mapped to themselves and a
    warning is logged. An entity that occurs more than once in
    ``entities`` is processed only on its first occurrence.

    Args:
        entities: List of entity URIs to promote
        representations: Dictionary mapping entities to their representations

    Returns:
        Dictionary mapping original URIs to promoted URIs (e -> promoted(e))
    """
    mapping: dict[URIRef, URIRef] = {}
    promoted_count = 0
    preserved_count = 0

    for entity in entities:
        if entity in mapping:
            # Calling promote_entity again for a repeated entity would
            # overwrite the first mapping entry with a different result and
            # inflate the counts below.
            continue

        representation = representations.get(entity)
        if representation is None:
            logger.warning(f"No representation found for entity {entity}")
            mapping[entity] = entity
            continue

        promoted = self.promote_entity(entity, representation)
        mapping[entity] = promoted

        if promoted != entity:
            promoted_count += 1
        else:
            preserved_count += 1

    logger.info(
        f"Created promotion mapping: "
        f"{promoted_count} promoted, "
        f"{preserved_count} preserved (ontology entities)"
    )

    return mapping

promote_entity(entity, representation)

Promote a chunk entity to document namespace.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| entity | URIRef | Original entity URI | required |
| representation | EntityRepresentation | Entity representation with metadata | required |

Returns:

| Type | Description |
| --- | --- |
| URIRef | Promoted entity URI |

Source code in ontocast/tool/agg/promoter.py
def promote_entity(
    self, entity: URIRef, representation: EntityRepresentation
) -> URIRef:
    """Return the document-namespace URI for a chunk entity.

    Entities for which ``should_promote`` is false are handed back
    untouched. Otherwise the representation's ``normal_form`` is cleaned,
    appended to the document namespace and made unique.

    Args:
        entity: Original entity URI
        representation: Entity representation supplying ``normal_form``

    Returns:
        The promoted URI, or ``entity`` itself when it is not promoted
    """
    if self.should_promote(entity):
        # Local part comes from the cleaned normalized form.
        cleaned = self._clean_local_name(representation.normal_form)
        candidate = f"{self.doc_namespace}{cleaned}"
        # Uniqueness is settled before the URIRef is built.
        return URIRef(self._ensure_unique_uri(candidate))

    # Not a chunk entity: leave it as it is.
    return entity

should_promote(entity)

Check if an entity should be promoted to document namespace.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| entity | URIRef | Entity URI to check | required |

Returns:

| Type | Description |
| --- | --- |
| bool | True if entity should be promoted (is from chunk namespace) |

Source code in ontocast/tool/agg/promoter.py
def should_promote(self, entity: URIRef) -> bool:
    """Decide whether an entity belongs in the document namespace.

    Ontology namespaces take precedence: an entity that matches one of
    them is never promoted, even if it also matches a chunk namespace.

    Args:
        entity: Entity URI to check

    Returns:
        True only when the URI starts with a chunk namespace and with no
        ontology namespace
    """
    uri_text = str(entity)

    # Ontology entities are never promoted.
    if uri_text.startswith(tuple(self.ontology_namespaces)):
        return False

    # Chunk entities are promoted; anything else (unknown namespace) is
    # conservatively left alone.
    return uri_text.startswith(tuple(self.chunk_namespaces))