Skip to content

ontocast.onto.content_unit

ContentUnit

Bases: SourceUnit

A processing unit that extends source data with mutable output fields.

Source code in ontocast/onto/content_unit.py
class ContentUnit(SourceUnit):
    """A processing unit that extends source data with mutable output fields."""

    # NOTE: the class docstring and the Field descriptions below are left
    # verbatim on purpose -- Pydantic surfaces them in the model's JSON schema,
    # so rewording them would change what schema consumers see.

    # RDF graph holding the facts extracted for this unit; starts out empty.
    graph: RDFGraph = Field(
        description="RDF triples representing facts rendered from this source unit in turtle format "
        "as a string in compact form: use prefixes for namespaces, do NOT add comments",
        default_factory=RDFGraph,
    )

    # Lazily built cache behind ``graph_absolute``; ``None`` means "not built
    # yet" or "invalidated by ``sanitize``".
    _graph_absolute: RDFGraph | None = PrivateAttr(default=None)

    processed: bool = Field(default=False, description="Was this unit processed?")
    generated_at: datetime | None = Field(
        default=None, description="generated timestamp"
    )

    @property
    def graph_absolute(self) -> RDFGraph:
        """Return a copy of ``graph`` remapped from ``iri`` to ``iri_absolute``.

        The copy is built on first access and cached in ``_graph_absolute``;
        later accesses return the cached object until ``sanitize`` clears it.

        Returns:
            RDFGraph: The cached, remapped copy of ``graph``.
        """
        if self._graph_absolute is None:
            # Remap a local copy first and publish it only afterwards, so a
            # failing ``remap_namespaces`` cannot leave a half-prepared
            # (un-remapped) graph in the cache.
            remapped = self.graph.copy()
            remapped.remap_namespaces(self.iri, self.iri_absolute)
            self._graph_absolute = remapped
        return self._graph_absolute

    @property
    def generated_at_iso(self) -> str:
        """Get generated timestamp in ISO format.

        If ``generated_at`` is still unset, it is first set to the current
        UTC time, so reading this property may mutate the unit.

        Returns:
            str: Timestamp in ISO format.
        """
        if self.generated_at is None:
            self.generated_at = datetime.now(timezone.utc)
        return self.generated_at.isoformat()

    def sanitize(self) -> None:
        """Clean up namespaces in ``graph`` and drop the derived cache.

        Replaces ``graph`` with the result of ``unbind_chunk_namespaces()``,
        runs ``sanitize_prefixes_namespaces()`` on it, and then invalidates
        ``_graph_absolute`` so the next ``graph_absolute`` access is rebuilt
        from the sanitized graph instead of returning a stale copy.
        """
        self.graph = self.graph.unbind_chunk_namespaces()
        self.graph.sanitize_prefixes_namespaces()
        # The cached absolute graph was derived from the pre-sanitize graph.
        self._graph_absolute = None

generated_at_iso property

Get generated timestamp in ISO format.

Returns:

Name Type Description
str

Timestamp in ISO format.

SourceUnit

Bases: BaseModel

Immutable source unit identity and input text.

Attributes:

Name Type Description
text str

Source text content for this unit.

index int

Position of this unit in the source document.

hid str

A stable hash id derived from text.

doc_iri URIRef

IRI of parent document.

type OutputType

Type of content unit (facts or ontology).

Source code in ontocast/onto/content_unit.py
class SourceUnit(BaseModel):
    """Immutable source unit identity and input text.

    Attributes:
        text: Source text content for this unit.
        index: Position of this unit in the source document.
        hid: A stable hash id derived from text.
        doc_iri: IRI of parent document.
        type: Type of content unit (facts or ontology).
    """

    # ``URIRef`` is not a type Pydantic knows natively, hence the opt-in.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    text: str = Field(description="Source text content for this unit")
    index: int = Field(description="Position of this unit in the source document")
    doc_iri: URIRef = Field(description="IRI of parent doc")
    type: OutputType = Field(
        description="Type of content unit",
        default=OutputType.FACTS,
    )
    # Last hash handed out by ``hid``; empty until ``hid`` is first read.
    _hid: str = PrivateAttr(default="")

    @field_validator("doc_iri", mode="before")
    @classmethod
    def _coerce_doc_iri(cls, value: URIRef | str) -> URIRef:
        # Pass existing URIRef instances through untouched; wrap anything else.
        return value if isinstance(value, URIRef) else URIRef(value)

    @computed_field(return_type=str)
    @property
    def hid(self) -> str:
        """Stable hash id generated from source text."""
        fresh_hid = render_text_hash(self.text)
        if self._hid == fresh_hid:
            # Stored value is already current -- nothing to update.
            return self._hid
        self._hid = fresh_hid
        return self._hid

    @property
    def iri(self):
        """Base IRI of the unit.

        Returns:
            The module-level ``DEFAULT_IRI`` value.
        """
        return DEFAULT_IRI

    @property
    def iri_absolute(self) -> str:
        """Absolute IRI of the unit.

        Returns:
            str: ``doc_iri`` and ``hid`` joined by a single ``/``.
        """
        parent_iri = self.doc_iri
        unit_hid = self.hid
        return f"{parent_iri}/{unit_hid}"

    @property
    def namespace(self):
        """Namespace of the unit.

        Returns:
            The result of ``iri2namespace`` applied to ``iri`` with
            ``ontology=False``.
        """
        base_iri = self.iri
        return iri2namespace(base_iri, ontology=False)

    def __len__(self) -> int:
        """Return the number of characters in ``text``."""
        source_text = self.text
        return len(source_text)

hid property

Stable hash id generated from source text.

iri property

Get the base IRI for this unit.

Returns:

Name Type Description
str

The base unit IRI.

iri_absolute property

Get the absolute IRI for this unit.

Returns:

Name Type Description
str

The unit IRI.

namespace property

Get the namespace for this unit.

Returns:

Name Type Description
str

The unit namespace.