Skip to content

ontocast.tool.agg.uri_builder

URI construction with naming convention normalization.

Builds final URIs for entity representatives following RDF/Semantic Web naming conventions (see README.md):

- Classes (entities / types): PascalCase (e.g., JudicialDecision)
- Properties (predicates): lowerCamelCase (e.g., hasDecision)
- Instances with natural names: PascalCase (e.g., FrenchCourtOfCassation)
- Instances with structured/external IDs: preserve structure (e.g., Case_2023_456)

Underscores are avoided in ontology terms (classes, properties). Underscores are acceptable for instances derived from external IDs.

EntityRole

Bases: StrEnum

Role of an entity in an RDF graph.

Source code in ontocast/tool/agg/uri_builder.py
class EntityRole(StrEnum):
    """Role of an entity in an RDF graph.

    Members are string-valued (``StrEnum``), so each one compares equal to
    its lowercase literal.
    """

    # Entity acts as a class / type.
    CLASS = "class"
    # Entity acts as a property / predicate.
    PROPERTY = "property"
    # Entity is an individual (neither class nor property).
    INSTANCE = "instance"

URIBuilder

Build normalized URIs for all entities following RDF naming conventions.

  • Fact entities (under base_iri) get new URIs under base_iri.
  • Ontology entities (everything else) are preserved as-is.
Source code in ontocast/tool/agg/uri_builder.py
class URIBuilder:
    """Mint convention-compliant URIs for entities.

    - **Fact entities** (those under *base_iri*) are given a freshly
      normalised URI.
    - **Ontology entities** (anything outside *base_iri*) keep the URI they
      already have.
    """

    def __init__(
        self,
        base_iri: str = DEFAULT_IRI,
    ):
        """Set up the builder.

        Args:
            base_iri: Namespace that holds fact entities (default
                ``DEFAULT_IRI``).  It is stored with exactly one trailing
                slash; any entity outside it counts as an ontology entity.
        """
        # URIs handed out so far, consulted to keep newly minted ones unique.
        self._used_uris: set[URIRef] = set()
        self.base_iri = f"{base_iri.rstrip('/')}/"

    # ------------------------------------------------------------------
    # internal helpers
    # ------------------------------------------------------------------

    def is_ontology_entity(self, entity: URIRef) -> bool:
        """Tell whether *entity* lies outside the facts namespace."""
        in_facts_namespace = str(entity).startswith(self.base_iri)
        return not in_facts_namespace

    @staticmethod
    def _extract_namespace(entity: URIRef) -> str:
        """Return the namespace prefix of a URI, i.e. the URI minus its local name.

        ``http://example.org/ns#Foo`` yields ``http://example.org/ns#`` and
        ``http://example.org/ns/Foo`` yields ``http://example.org/ns/``.
        A URI containing neither ``#`` nor ``/`` is returned unchanged.
        """
        text = str(entity)

        # A fragment separator takes priority over path separators.
        prefix, hash_sep, _ = text.rpartition("#")
        if hash_sep:
            return prefix + "#"

        # Trailing slashes are ignored when locating the last path segment.
        prefix, slash_sep, _ = text.rstrip("/").rpartition("/")
        if slash_sep:
            return prefix + "/"
        return text

    def _ensure_unique_uri(self, base: str, local_name: str) -> URIRef:
        """Reserve and return a URI under *base* that has not been issued yet.

        The plain ``base + local_name`` is tried first; on a clash the
        suffixes ``_1``, ``_2``, ... are tried in order until one is free.
        """
        suffix = 0
        candidate = URIRef(f"{base}{local_name}")
        while candidate in self._used_uris:
            suffix += 1
            candidate = URIRef(f"{base}{local_name}_{suffix}")
        self._used_uris.add(candidate)
        return candidate

    # ------------------------------------------------------------------
    # public interface
    # ------------------------------------------------------------------

    def build_uri(
        self,
        entity: URIRef,
        representation: EntityRepresentation,
        role: EntityRole | str,
        target_iri: URIRef | str | None = None,
        is_ontology_entity: bool | None = None,
    ) -> URIRef:
        """Produce the final URI for one entity.

        An ontology entity is handed back untouched.  A fact entity gets a
        normalised local name and is placed under *target_iri*, or under
        *base_iri* when no target is given.

        Args:
            entity: URI the entity currently has.
            representation: Metadata describing the entity.
            role: Role of the entity (an :class:`EntityRole` value).
            target_iri: Document IRI to use as the namespace for a fact
                entity in place of *base_iri*.  Callers pass the ``doc_iri``
                of the originating chunk so every fact lands in its own
                document namespace.
            is_ontology_entity: Explicit ontology/fact flag.  ``None`` means
                "infer it from the namespace of *entity*".

        Returns:
            The original URI for ontology entities, otherwise a unique
            normalised URI.
        """
        treat_as_ontology = is_ontology_entity
        if treat_as_ontology is None:
            treat_as_ontology = self.is_ontology_entity(entity)
        if treat_as_ontology:
            return entity

        local_name = normalize_local_name(representation, role)
        if target_iri:
            namespace = str(target_iri).rstrip("/") + "/"
        else:
            namespace = self.base_iri
        return self._ensure_unique_uri(base=namespace, local_name=local_name)

    def create_entity_uri_mapping(
        self,
        identity_mapping: dict[URIRef, URIRef],
        representations: dict[URIRef, EntityRepresentation],
        entity_doc_iris: dict[URIRef, URIRef],
        entity_is_ontology: dict[URIRef, bool],
    ) -> dict[URIRef, URIRef]:
        """Turn an identity mapping into a mapping onto final URIs.

        Identity and surface form are handled separately: the identity
        mapping says *which* entities are the same, and this method decides
        *what URI* each source entity ends up with.  Fact entities are
        rendered in the namespace of their source ``doc_iri``; ontology
        entities map to their canonical URI.

        Args:
            identity_mapping: Mapping ``entity -> canonical_entity``.
            representations: Representations of all entities.
            entity_doc_iris: Mapping from source entity to its ``doc_iri``.
            entity_is_ontology: Classification map; ``True`` keeps the
                canonical entity in ontology space.

        Returns:
            Mapping ``entity -> final_uri``.
        """
        # Every call starts from a clean slate of issued URIs.
        self._used_uris.clear()
        final_uris: dict[URIRef, URIRef] = {}
        # One minted URI per (canonical entity, namespace) pair.
        minted: dict[tuple[URIRef, str], URIRef] = {}

        for source, canonical in identity_mapping.items():
            rep = representations.get(canonical)
            if rep is None:
                # Nothing is known about the canonical entity: leave as-is.
                final_uris[source] = source
                continue

            role = EntityRole.INSTANCE if rep.role is None else rep.role
            namespace_says_ontology = self.is_ontology_entity(canonical)
            if entity_is_ontology.get(canonical, namespace_says_ontology):
                final_uris[source] = canonical
                continue

            doc_iri = entity_doc_iris.get(source)
            if doc_iri:
                namespace = str(doc_iri).rstrip("/") + "/"
            else:
                namespace = self.base_iri

            key = (canonical, namespace)
            if key not in minted:
                minted[key] = self.build_uri(
                    canonical,
                    rep,
                    role,
                    target_iri=doc_iri,
                    is_ontology_entity=False,
                )
            final_uris[source] = minted[key]

        rewritten = [src for src, dst in final_uris.items() if src != dst]
        logger.info(
            f"Built URI mapping: {len(final_uris)} entities, {len(rewritten)} normalised"
        )
        return final_uris

    @staticmethod
    def compose_mappings(
        clustering_mapping: dict[URIRef, URIRef],
        uri_mapping: dict[URIRef, URIRef],
    ) -> dict[URIRef, URIRef]:
        """Chain the clustering mapping with the URI mapping.

        ``e → representative(e) → normalised_uri(representative(e))``

        A representative missing from *uri_mapping* is used as its own
        final URI.

        Args:
            clustering_mapping: ``e → e_rep``.
            uri_mapping: ``e_rep → final_uri``.

        Returns:
            The composed mapping ``e → final_uri``.
        """
        composed: dict[URIRef, URIRef] = {}
        for original, representative in clustering_mapping.items():
            composed[original] = uri_mapping.get(representative, representative)

        distinct_targets = set(composed.values())
        logger.info(
            f"Composed mapping: {len(composed)} entities → "
            f"{len(distinct_targets)} final URIs"
        )
        return composed

__init__(base_iri=DEFAULT_IRI)

Initialise the builder.

Parameters:

Name Type Description Default
base_iri str

Base IRI for fact entities (default DEFAULT_IRI). Entities under this namespace are facts; everything else is treated as an ontology entity.

DEFAULT_IRI
Source code in ontocast/tool/agg/uri_builder.py
def __init__(
    self,
    base_iri: str = DEFAULT_IRI,
):
    """Set up the builder.

    Args:
        base_iri: Namespace that holds fact entities (default
            ``DEFAULT_IRI``).  It is stored with exactly one trailing
            slash; any entity outside it counts as an ontology entity.
    """
    # URIs handed out so far, consulted to keep newly minted ones unique.
    self._used_uris: set[URIRef] = set()
    self.base_iri = f"{base_iri.rstrip('/')}/"

build_uri(entity, representation, role, target_iri=None, is_ontology_entity=None)

Build a normalised URI for a single entity.

Fact entities are normalised and placed under target_iri (falling back to base_iri). Ontology entities are preserved as-is.

Parameters:

Name Type Description Default
entity URIRef

Original entity URI.

required
representation EntityRepresentation

Entity representation with metadata.

required
role EntityRole | str

Entity role (an :class:EntityRole value).

required
target_iri URIRef | str | None

Optional document IRI to use as namespace for fact entities instead of the default base_iri. When chunks carry different doc_iri values the caller passes the appropriate one here so that each fact is placed under its document namespace.

None
is_ontology_entity bool | None

Explicit ontology/fact classification. When provided this takes precedence over namespace-based inference.

None

Returns:

Type Description
URIRef

Normalised URI.

Source code in ontocast/tool/agg/uri_builder.py
def build_uri(
    self,
    entity: URIRef,
    representation: EntityRepresentation,
    role: EntityRole | str,
    target_iri: URIRef | str | None = None,
    is_ontology_entity: bool | None = None,
) -> URIRef:
    """Produce the final URI for one entity.

    An ontology entity is handed back untouched.  A fact entity gets a
    normalised local name and is placed under *target_iri*, or under
    *base_iri* when no target is given.

    Args:
        entity: URI the entity currently has.
        representation: Metadata describing the entity.
        role: Role of the entity (an :class:`EntityRole` value).
        target_iri: Document IRI to use as the namespace for a fact
            entity in place of *base_iri*.  Callers pass the ``doc_iri``
            of the originating chunk so every fact lands in its own
            document namespace.
        is_ontology_entity: Explicit ontology/fact flag.  ``None`` means
            "infer it from the namespace of *entity*".

    Returns:
        The original URI for ontology entities, otherwise a unique
        normalised URI.
    """
    treat_as_ontology = is_ontology_entity
    if treat_as_ontology is None:
        treat_as_ontology = self.is_ontology_entity(entity)
    if treat_as_ontology:
        return entity

    local_name = normalize_local_name(representation, role)
    if target_iri:
        namespace = str(target_iri).rstrip("/") + "/"
    else:
        namespace = self.base_iri
    return self._ensure_unique_uri(base=namespace, local_name=local_name)

compose_mappings(clustering_mapping, uri_mapping) staticmethod

Compose clustering and URI mappings.

e → representative(e) → normalised_uri(representative(e))

Parameters:

Name Type Description Default
clustering_mapping dict[URIRef, URIRef]

e → e_rep.

required
uri_mapping dict[URIRef, URIRef]

e_rep → final_uri.

required

Returns:

Type Description
dict[URIRef, URIRef]

Composed mapping e → final_uri.

Source code in ontocast/tool/agg/uri_builder.py
@staticmethod
def compose_mappings(
    clustering_mapping: dict[URIRef, URIRef],
    uri_mapping: dict[URIRef, URIRef],
) -> dict[URIRef, URIRef]:
    """Chain the clustering mapping with the URI mapping.

    ``e → representative(e) → normalised_uri(representative(e))``

    A representative missing from *uri_mapping* is used as its own
    final URI.

    Args:
        clustering_mapping: ``e → e_rep``.
        uri_mapping: ``e_rep → final_uri``.

    Returns:
        The composed mapping ``e → final_uri``.
    """
    composed: dict[URIRef, URIRef] = {}
    for original, representative in clustering_mapping.items():
        composed[original] = uri_mapping.get(representative, representative)

    distinct_targets = set(composed.values())
    logger.info(
        f"Composed mapping: {len(composed)} entities → "
        f"{len(distinct_targets)} final URIs"
    )
    return composed

create_entity_uri_mapping(identity_mapping, representations, entity_doc_iris, entity_is_ontology)

Create final URI mapping from identity mapping + namespace policy.

This method decouples canonical identity choice from URI surface choice: identity mapping decides what is the same entity, while this method decides how each source entity should be rendered as a final URI. Fact entities are always rendered in their source doc_iri namespace. Ontology entities are preserved as their canonical URI.

Parameters:

Name Type Description Default
identity_mapping dict[URIRef, URIRef]

Mapping entity -> canonical_entity.

required
representations dict[URIRef, EntityRepresentation]

All entity representations.

required
entity_doc_iris dict[URIRef, URIRef]

Mapping from source entity to source doc_iri.

required
entity_is_ontology dict[URIRef, bool]

Classification map where True means the canonical entity should stay in ontology space.

required

Returns:

Type Description
dict[URIRef, URIRef]

Mapping entity -> final_uri.

Source code in ontocast/tool/agg/uri_builder.py
def create_entity_uri_mapping(
    self,
    identity_mapping: dict[URIRef, URIRef],
    representations: dict[URIRef, EntityRepresentation],
    entity_doc_iris: dict[URIRef, URIRef],
    entity_is_ontology: dict[URIRef, bool],
) -> dict[URIRef, URIRef]:
    """Turn an identity mapping into a mapping onto final URIs.

    Identity and surface form are handled separately: the identity
    mapping says *which* entities are the same, and this method decides
    *what URI* each source entity ends up with.  Fact entities are
    rendered in the namespace of their source ``doc_iri``; ontology
    entities map to their canonical URI.

    Args:
        identity_mapping: Mapping ``entity -> canonical_entity``.
        representations: Representations of all entities.
        entity_doc_iris: Mapping from source entity to its ``doc_iri``.
        entity_is_ontology: Classification map; ``True`` keeps the
            canonical entity in ontology space.

    Returns:
        Mapping ``entity -> final_uri``.
    """
    # Every call starts from a clean slate of issued URIs.
    self._used_uris.clear()
    final_uris: dict[URIRef, URIRef] = {}
    # One minted URI per (canonical entity, namespace) pair.
    minted: dict[tuple[URIRef, str], URIRef] = {}

    for source, canonical in identity_mapping.items():
        rep = representations.get(canonical)
        if rep is None:
            # Nothing is known about the canonical entity: leave as-is.
            final_uris[source] = source
            continue

        role = EntityRole.INSTANCE if rep.role is None else rep.role
        namespace_says_ontology = self.is_ontology_entity(canonical)
        if entity_is_ontology.get(canonical, namespace_says_ontology):
            final_uris[source] = canonical
            continue

        doc_iri = entity_doc_iris.get(source)
        if doc_iri:
            namespace = str(doc_iri).rstrip("/") + "/"
        else:
            namespace = self.base_iri

        key = (canonical, namespace)
        if key not in minted:
            minted[key] = self.build_uri(
                canonical,
                rep,
                role,
                target_iri=doc_iri,
                is_ontology_entity=False,
            )
        final_uris[source] = minted[key]

    rewritten = [src for src, dst in final_uris.items() if src != dst]
    logger.info(
        f"Built URI mapping: {len(final_uris)} entities, {len(rewritten)} normalised"
    )
    return final_uris

is_ontology_entity(entity)

Return True if entity does not belong to the facts namespace.

Source code in ontocast/tool/agg/uri_builder.py
def is_ontology_entity(self, entity: URIRef) -> bool:
    """Tell whether *entity* lies outside the facts namespace."""
    in_facts_namespace = str(entity).startswith(self.base_iri)
    return not in_facts_namespace

detect_role(entity, graph)

Detect the role of an entity: class, property, or instance.

Parameters:

Name Type Description Default
entity URIRef

The entity URI.

required
graph RDFGraph

The RDF graph containing the entity.

required

Returns:

Type Description
EntityRole

The detected :class:EntityRole.

Source code in ontocast/tool/agg/uri_builder.py
def detect_role(entity: URIRef, graph: RDFGraph) -> EntityRole:
    """Classify an entity as class, property, or instance by scanning a graph.

    Every triple of *graph* is inspected once, collecting the URI-valued
    ``rdf:type`` objects of *entity* and noting whether it is ever used in
    predicate position.

    Args:
        entity: The entity URI.
        graph: The RDF graph containing the entity.

    Returns:
        ``EntityRole.CLASS`` when a collected type is a known class type;
        otherwise ``EntityRole.PROPERTY`` when a collected type is a known
        property type or the entity is used as a predicate; otherwise
        ``EntityRole.INSTANCE``.
    """
    declared_types: set[URIRef] = set()
    used_as_predicate = False

    for subj, pred, obj in graph:
        if pred == entity:
            used_as_predicate = True
        if subj == entity and pred == RDF.type and isinstance(obj, URIRef):
            declared_types.add(obj)

    # Class typing wins over any property evidence.
    if declared_types & _CLASS_TYPES:
        return EntityRole.CLASS
    if declared_types & _PROPERTY_TYPES or used_as_predicate:
        return EntityRole.PROPERTY
    return EntityRole.INSTANCE

detect_role_from_context(types, is_predicate=False)

Detect entity role from pre-extracted context (no graph scan needed).

This is the preferred entry point when the caller has already extracted types and predicate usage via `EntityNormalizer.extract_entity_context`, avoiding a redundant full-graph iteration.

Parameters:

Name Type Description Default
types list[URIRef]

rdf:type values of the entity.

required
is_predicate bool

Whether the entity appears in the predicate position of at least one triple.

False

Returns:

Type Description
EntityRole

The detected :class:EntityRole.

Source code in ontocast/tool/agg/uri_builder.py
def detect_role_from_context(
    types: list[URIRef],
    is_predicate: bool = False,
) -> EntityRole:
    """Classify an entity from already-extracted context, without a graph scan.

    Prefer this over a full-graph scan when the types and predicate usage
    have already been gathered, e.g. via
    :meth:`EntityNormalizer.extract_entity_context`.

    Args:
        types: ``rdf:type`` values of the entity.
        is_predicate: Whether the entity appears in the predicate position
            of at least one triple.

    Returns:
        The detected :class:`EntityRole`; class typing takes precedence
        over property evidence, and instance is the fallback.
    """
    declared = frozenset(types)

    if not declared.isdisjoint(_CLASS_TYPES):
        return EntityRole.CLASS

    has_property_type = not declared.isdisjoint(_PROPERTY_TYPES)
    if has_property_type or is_predicate:
        return EntityRole.PROPERTY
    return EntityRole.INSTANCE

format_structured_id(entity)

Format a structured identifier preserving underscores and digits.

The leading word segment is capitalised so that the result starts with an uppercase letter (e.g. Case_2023_456).

Parameters:

Name Type Description Default
entity URIRef

Original entity URI.

required

Returns:

Type Description
str

Cleaned identifier string.

Source code in ontocast/tool/agg/uri_builder.py
def format_structured_id(entity: URIRef) -> str:
    """Format a structured identifier preserving underscores, digits and case.

    The local name (the text after the last ``/`` and ``#``) is cleaned so
    that every non-word character becomes ``_``, runs of ``_`` collapse to
    one, and leading/trailing ``_`` are dropped.  Only the very first
    character is then upper-cased so the result starts with an uppercase
    letter (e.g. ``case_2023_456`` becomes ``Case_2023_456``).  The rest of
    the identifier keeps its original casing, so external IDs such as
    ``DOI_10_1000`` or ``caseID_42`` are not mangled.

    Args:
        entity: Original entity URI.

    Returns:
        Cleaned identifier string, or ``"Entity"`` when nothing usable
        remains after cleaning.
    """
    local = str(entity).rsplit("/", 1)[-1].rsplit("#", 1)[-1]
    cleaned = re.sub(r"[^\w]", "_", local)
    cleaned = re.sub(r"_+", "_", cleaned).strip("_")
    if not cleaned:
        return "Entity"
    # str.capitalize() would also lowercase the remainder of the segment and
    # destroy the structure of the ID, so only the first character is touched.
    return cleaned[0].upper() + cleaned[1:]

has_structured_id(entity)

Detect if an entity represents a structured/external identifier.

Structured IDs contain digits together with underscores, e.g. Case_2023_456 or Decision_2021_09_15.

Parameters:

Name Type Description Default
entity URIRef

Original entity URI.

required

Returns:

Type Description
bool

True if the entity appears to have a structured ID.

Source code in ontocast/tool/agg/uri_builder.py
def has_structured_id(entity: URIRef) -> bool:
    """Report whether an entity's local name looks like a structured/external ID.

    A local name qualifies when it contains at least one underscore and at
    least one digit, e.g. ``Case_2023_456`` or ``Decision_2021_09_15``.

    Args:
        entity: Original entity URI.

    Returns:
        True if the entity appears to have a structured ID.
    """
    last_segment = str(entity).rsplit("/", 1)[-1]
    local = last_segment.rsplit("#", 1)[-1]
    if "_" not in local:
        return False
    return re.search(r"\d", local) is not None

normalize_local_name(representation, role)

Produce a properly-cased local name following RDF conventions.

Parameters:

Name Type Description Default
representation EntityRepresentation

Entity representation with metadata.

required
role EntityRole | str

Entity role (an :class:EntityRole value).

required

Returns:

Type Description
str

Properly cased local name.

Source code in ontocast/tool/agg/uri_builder.py
def normalize_local_name(
    representation: EntityRepresentation,
    role: EntityRole | str,
) -> str:
    """Derive the local name for an entity, cased according to its role.

    Properties are rendered in lowerCamelCase.  Instances whose source URI
    looks like a structured ID keep that structure.  Everything else
    (classes and naturally named instances) is rendered in PascalCase.

    Args:
        representation: Entity representation with metadata.
        role: Entity role (an :class:`EntityRole` value).

    Returns:
        Properly cased local name.
    """
    is_property = role == EntityRole.PROPERTY

    if not is_property and role == EntityRole.INSTANCE:
        source_uri = representation.entity
        if has_structured_id(source_uri):
            return format_structured_id(source_uri)

    convert = to_lower_camel_case if is_property else to_pascal_case
    return convert(representation.normal_form)

to_lower_camel_case(normalized)

Convert a space-separated lowercase string to lowerCamelCase.

Parameters:

Name Type Description Default
normalized str

Space-separated lowercase string.

required

Returns:

Type Description
str

lowerCamelCase string.

Examples:

>>> to_lower_camel_case('has decision')
'hasDecision'
>>> to_lower_camel_case('date published')
'datePublished'
Source code in ontocast/tool/agg/uri_builder.py
def to_lower_camel_case(normalized: str) -> str:
    """Join whitespace-separated words into a lowerCamelCase token.

    The first word is kept exactly as given; each following word is
    capitalised.  Input without any words yields an empty string.

    Args:
        normalized: Space-separated lowercase string.

    Returns:
        lowerCamelCase string.

    Examples:
        >>> to_lower_camel_case('has decision')
        'hasDecision'
        >>> to_lower_camel_case('date published')
        'datePublished'
    """
    tokens = normalized.split()
    if tokens:
        first, *rest = tokens
        return first + "".join(map(str.capitalize, rest))
    return ""

to_pascal_case(normalized)

Convert a space-separated lowercase string to PascalCase.

Parameters:

Name Type Description Default
normalized str

Space-separated lowercase string.

required

Returns:

Type Description
str

PascalCase string.

Examples:

>>> to_pascal_case('judicial decision')
'JudicialDecision'
>>> to_pascal_case('french court of cassation')
'FrenchCourtOfCassation'
Source code in ontocast/tool/agg/uri_builder.py
def to_pascal_case(normalized: str) -> str:
    """Join whitespace-separated words into a PascalCase token.

    Every word is capitalised and the words are concatenated without a
    separator.  Input without any words yields an empty string.

    Args:
        normalized: Space-separated lowercase string.

    Returns:
        PascalCase string.

    Examples:
        >>> to_pascal_case('judicial decision')
        'JudicialDecision'
        >>> to_pascal_case('french court of cassation')
        'FrenchCourtOfCassation'
    """
    # str.split() with no argument never yields empty strings.
    return "".join(map(str.capitalize, normalized.split()))