`graflo.data_source`¶

Data source abstraction layer for graph database ingestion.

This package provides a unified interface for different data source types, separating "where data comes from" (DataSource) from "how it's transformed" (Resource).

Key Components

AbstractDataSource: Base class for all data sources
FileDataSource: File-based data sources (JSON, JSONL, CSV/TSV)
APIDataSource: REST API runtime executor (built from APIConnector + conn_proxy)
SQLDataSource: SQL database data source
DataSourceRegistry: Maps DataSources to Resource names

`APIConfig` ¶

Bases: ConfigBaseModel

Merged runtime configuration for REST API requests.

Built exclusively via `APIConnector.build_api_config`; not intended for direct construction in manifests or factory helpers.

Source code in graflo/data_source/api.py

class APIConfig(ConfigBaseModel):
    """Merged runtime configuration for REST API requests.

    Built exclusively via :meth:`APIConnector.build_api_config`; not intended
    for direct construction in manifests or factory helpers.
    """

    # Full request URL; APIConnector.build_api_config joins base_url and path.
    url: str
    # HTTP verb passed to ``session.request`` for every page.
    method: str = "GET"
    # Applied to the session after auth headers are set, so an entry here
    # overrides an auth header with the same name.
    headers: dict[str, str] = Field(default_factory=dict)
    # Optional credentials; ``None`` leaves the session without auth settings.
    auth: ApiAuth | None = None
    # Base query parameters; copied per request before pagination parameters
    # are added on top.
    params: dict[str, Any] = Field(default_factory=dict)
    # Forwarded as the ``timeout`` argument of each request.
    timeout: float | None = None
    # Retry policy; a retry adapter is mounted only when ``retries`` > 0.
    retries: int = 0
    retry_backoff_factor: float = 0.1
    retry_status_forcelist: list[int] = Field(
        default_factory=lambda: [500, 502, 503, 504]
    )
    # Forwarded as the ``verify`` argument of each request.
    verify: bool = True
    # ``None`` means a single request is made and iteration stops afterwards.
    pagination: PaginationConfig | None = None
    # Constant keys merged into every row; batch metadata and the row's own
    # keys take precedence on clashes.
    row_annotations: dict[str, Any] = Field(default_factory=dict)

`APIConnector` ¶

Bases: ResourceConnector

Connector for REST API endpoints.

Declares the non-secret access pattern (path, method, pagination). Runtime `base_url` and credentials are supplied via `connector_connection` -> `conn_proxy` -> `graflo.hq.connection_provider.ApiGeneralizedConnConfig`.

Attributes:

Name	Type	Description
`path`	`str`	Relative endpoint path (e.g. `/api/users`).
`method`	`str`	HTTP method (default `GET`).
`params`	`dict[str, Any]`	Static query parameters.
`pagination`	`PaginationConfig \| None`	Pagination strategy and response path configuration.
`row_annotations`	`dict[str, Any]`	Constant fields merged into every fetched row (doc wins).
`headers`	`dict[str, str]`	Non-secret HTTP headers.
`timeout`	`float \| None`	Request timeout in seconds.
`retries`	`int`	Number of retry attempts.
`retry_backoff_factor`	`float`	Backoff factor for retries.
`retry_status_forcelist`	`list[int]`	HTTP status codes to retry on.
`verify`	`bool`	Verify SSL certificates.

Source code in graflo/architecture/contract/bindings/connectors.py

class APIConnector(ResourceConnector):
    """Contract-side description of a REST API endpoint.

    Only the non-secret access pattern is declared here (path, method,
    pagination and related request settings). The runtime ``base_url`` and
    credentials arrive via ``connector_connection`` -> ``conn_proxy`` ->
    :class:`~graflo.hq.connection_provider.ApiGeneralizedConnConfig`.

    Attributes:
        path: Relative endpoint path (e.g. ``/api/users``).
        method: HTTP method (default ``GET``).
        params: Static query parameters.
        pagination: Pagination strategy and response path configuration.
        row_annotations: Constant fields merged into every fetched row (doc wins).
        headers: Non-secret HTTP headers.
        timeout: Request timeout in seconds.
        retries: Number of retry attempts.
        retry_backoff_factor: Backoff factor for retries.
        retry_status_forcelist: HTTP status codes to retry on.
        verify: Verify SSL certificates.
    """

    path: str = Field(..., description="Relative API endpoint path")
    method: str = "GET"
    params: dict[str, Any] = Field(default_factory=dict)
    pagination: PaginationConfig | None = None
    headers: dict[str, str] = Field(default_factory=dict)
    timeout: float | None = None
    retries: int = 0
    retry_backoff_factor: float = 0.1
    retry_status_forcelist: list[int] = Field(
        default_factory=lambda: [500, 502, 503, 504]
    )
    verify: bool = True

    @staticmethod
    def _join_url(base_url: str, path: str) -> str:
        """Join ``base_url`` and ``path`` with exactly one slash between them."""
        prefix = base_url.rstrip("/")
        suffix = path.lstrip("/")
        return prefix + "/" + suffix

    def matches(self, resource_identifier: str) -> bool:
        """Tell whether ``resource_identifier`` refers to this connector.

        A hit is the connector name, the resource name, the full ``path``, or
        the last segment of ``path`` (trailing slashes ignored).
        """
        name_hit = self.name is not None and self.name == resource_identifier
        if name_hit or (
            self.resource_name is not None
            and self.resource_name == resource_identifier
        ):
            return True
        *_, last_segment = self.path.rstrip("/").split("/")
        return resource_identifier == self.path or resource_identifier == last_segment

    def bound_source_kind(self) -> BoundSourceKind:
        """Report this connector's source kind, always ``BoundSourceKind.API``."""
        return BoundSourceKind.API

    def build_api_config(
        self,
        *,
        base_url: str,
        auth: "ApiAuth | None" = None,
        default_headers: dict[str, str] | None = None,
        page_size_override: int | None = None,
    ) -> "APIConfig":
        """Combine this contract with runtime connection settings.

        Args:
            base_url: Runtime base URL that ``path`` is appended to.
            auth: Optional credentials, passed through unchanged.
            default_headers: Connection-level headers; the connector's own
                ``headers`` win on key clashes.
            page_size_override: When given and pagination is configured,
                replaces ``page_size`` on a copy of the pagination request.

        Returns:
            A new ``APIConfig`` holding copies of the mutable fields.
        """
        # Imported here rather than at module level, as in the original.
        from graflo.data_source.api import APIConfig

        merged_headers = {**(default_headers or {}), **self.headers}

        effective_pagination = self.pagination
        if effective_pagination is not None and page_size_override is not None:
            # Copy both levels so the connector's own pagination is untouched.
            resized_request = effective_pagination.request.model_copy(
                update={"page_size": page_size_override}
            )
            effective_pagination = effective_pagination.model_copy(
                update={"request": resized_request}
            )

        runtime_fields = {
            "url": self._join_url(base_url, self.path),
            "method": self.method,
            "headers": merged_headers,
            "auth": auth,
            "params": dict(self.params),
            "timeout": self.timeout,
            "retries": self.retries,
            "retry_backoff_factor": self.retry_backoff_factor,
            "retry_status_forcelist": list(self.retry_status_forcelist),
            "verify": self.verify,
            "pagination": effective_pagination,
            "row_annotations": dict(self.row_annotations),
        }
        return APIConfig(**runtime_fields)

`build_api_config(*, base_url, auth=None, default_headers=None, page_size_override=None)` ¶

Merge contract fields with runtime connection config into APIConfig.

Source code in graflo/architecture/contract/bindings/connectors.py

def build_api_config(
    self,
    *,
    base_url: str,
    auth: "ApiAuth | None" = None,
    default_headers: dict[str, str] | None = None,
    page_size_override: int | None = None,
) -> "APIConfig":
    """Combine this contract with runtime connection settings.

    Args:
        base_url: Runtime base URL that ``path`` is appended to.
        auth: Optional credentials, passed through unchanged.
        default_headers: Connection-level headers; the connector's own
            ``headers`` win on key clashes.
        page_size_override: When given and pagination is configured,
            replaces ``page_size`` on a copy of the pagination request.

    Returns:
        A new ``APIConfig`` holding copies of the mutable fields.
    """
    # Imported here rather than at module level, as in the original.
    from graflo.data_source.api import APIConfig

    merged_headers = {**(default_headers or {}), **self.headers}

    effective_pagination = self.pagination
    if effective_pagination is not None and page_size_override is not None:
        # Copy both levels so the connector's own pagination is untouched.
        resized_request = effective_pagination.request.model_copy(
            update={"page_size": page_size_override}
        )
        effective_pagination = effective_pagination.model_copy(
            update={"request": resized_request}
        )

    runtime_fields = {
        "url": self._join_url(base_url, self.path),
        "method": self.method,
        "headers": merged_headers,
        "auth": auth,
        "params": dict(self.params),
        "timeout": self.timeout,
        "retries": self.retries,
        "retry_backoff_factor": self.retry_backoff_factor,
        "retry_status_forcelist": list(self.retry_status_forcelist),
        "verify": self.verify,
        "pagination": effective_pagination,
        "row_annotations": dict(self.row_annotations),
    }
    return APIConfig(**runtime_fields)

`matches(resource_identifier)` ¶

Match resource name, connector name, or path tail.

Source code in graflo/architecture/contract/bindings/connectors.py

def matches(self, resource_identifier: str) -> bool:
    """Tell whether ``resource_identifier`` refers to this connector.

    A hit is the connector name, the resource name, the full ``path``, or
    the last segment of ``path`` (trailing slashes ignored).
    """
    name_hit = self.name is not None and self.name == resource_identifier
    if name_hit or (
        self.resource_name is not None
        and self.resource_name == resource_identifier
    ):
        return True
    *_, last_segment = self.path.rstrip("/").split("/")
    return resource_identifier == self.path or resource_identifier == last_segment

`APIDataSource` ¶

Bases: AbstractDataSource

Data source for REST API endpoints.

Source code in graflo/data_source/api.py

class APIDataSource(AbstractDataSource):
    """Data source for REST API endpoints.

    Issues requests described by ``config`` through a ``requests.Session`` and
    yields the returned records as batches of dictionaries, following the
    configured pagination strategy (``offset``, ``page`` or ``cursor``).
    """

    config: APIConfig
    source_type: DataSourceType = DataSourceType.API

    def _create_session(self) -> requests.Session:
        """Build a session carrying the retry, auth and header settings of ``config``.

        Returns:
            A new ``requests.Session``; the caller is responsible for closing it.
        """
        session = requests.Session()

        # Retry adapters are mounted only when retries are enabled.
        if self.config.retries > 0:
            retry_strategy = Retry(
                total=self.config.retries,
                backoff_factor=self.config.retry_backoff_factor,
                status_forcelist=self.config.retry_status_forcelist,
            )
            adapter = HTTPAdapter(max_retries=retry_strategy)
            session.mount("http://", adapter)
            session.mount("https://", adapter)

        auth = self.config.auth
        if auth is not None:
            # An auth_type other than the four handled below leaves the
            # session without any auth settings.
            if auth.auth_type == "basic":
                session.auth = HTTPBasicAuth(
                    auth.username or "",
                    auth.password or "",
                )
            elif auth.auth_type == "digest":
                session.auth = HTTPDigestAuth(
                    auth.username or "",
                    auth.password or "",
                )
            elif auth.auth_type == "bearer":
                token = auth.token or ""
                # strip() removes the stray space left when prefix or token is empty.
                session.headers[auth.header_name] = f"{auth.prefix} {token}".strip()
            elif auth.auth_type == "api_key":
                session.headers[auth.header_name] = auth.token or ""

        # Config headers are applied last, so they override an auth header
        # that uses the same name.
        session.headers.update(self.config.headers)
        return session

    def iter_batches(
        self, batch_size: int = 1000, limit: int | None = None
    ) -> Iterator[list[dict]]:
        """Fetch records from the API and yield them in batches.

        Args:
            batch_size: Maximum number of rows per yielded batch. Batches never
                span pages: a partial batch is flushed at the end of each page.
            limit: Cap on the total number of rows yielded; ``None`` means no cap.

        Yields:
            Lists of row dictionaries. Each row merges ``row_annotations``,
            per-response batch metadata and the record itself, with the
            record's keys taking precedence.

        Note:
            A ``requests.RequestException`` is logged and ends iteration
            instead of propagating; batches already yielded are unaffected.
        """
        session = self._create_session()
        total_items = 0
        resolved_response: ResolvedApiResponse | None = None

        try:
            # Pagination state starts from the configured initial values, or
            # from fixed defaults when no pagination is configured.
            pagination = self.config.pagination
            request = pagination.request if pagination else None
            offset = request.initial_offset if request else 0
            page = request.initial_page if request else 1
            cursor: str | None = request.initial_cursor if request else None

            while True:
                # Do not issue another request once the item cap is reached.
                if limit is not None and total_items >= limit:
                    break

                # Copy so pagination parameters never leak into the config.
                params = dict(self.config.params)

                # Never request more rows than the remaining item budget.
                page_limit = request.page_size if request else 0
                if request is not None and limit is not None:
                    page_limit = min(page_limit, limit - total_items)

                if request is not None:
                    if request.strategy == "offset":
                        params[request.offset_param] = offset
                        params[request.limit_param] = page_limit
                    elif request.strategy == "page":
                        params[request.page_param] = page
                        params[request.per_page_param] = page_limit
                    # Cursor strategy sends no cursor parameter until a cursor
                    # value is available (initial or from a previous response).
                    elif request.strategy == "cursor" and cursor is not None:
                        params[request.cursor_param] = cursor

                try:
                    response = session.request(
                        method=self.config.method,
                        url=self.config.url,
                        params=params,
                        timeout=self.config.timeout,
                        verify=self.config.verify,
                    )
                    response.raise_for_status()
                    data = response.json()
                except requests.RequestException as e:
                    # Failures are logged and end iteration rather than raising.
                    logger.error(f"API request failed: {e}")
                    break

                # The response shape is resolved once, from the first
                # successful response, and reused for all later pages.
                if pagination is not None and resolved_response is None:
                    resolved_response = ResolvedApiResponse.resolve(
                        pagination.response,
                        data,
                    )

                # Without pagination config a default ResolvedApiResponse() is used.
                response_shape = (
                    resolved_response
                    if resolved_response is not None
                    else ResolvedApiResponse()
                )
                # NOTE(review): extract_records / get_batch_metadata are defined
                # elsewhere; presumably they pull the record list and the
                # per-response metadata out of the payload — confirm there.
                items = extract_records(data, response_shape)
                batch_metadata = get_batch_metadata(data, response_shape)

                batch: list[dict] = []
                for item in items:
                    if limit is not None and total_items >= limit:
                        break
                    # Precedence on key clashes: record > batch metadata > row_annotations.
                    batch.append(
                        {**self.config.row_annotations, **batch_metadata, **item}
                    )
                    total_items += 1

                    if len(batch) >= batch_size:
                        yield batch
                        batch = []

                # Flush the partial batch at the end of each page.
                if batch:
                    yield batch

                if limit is not None and total_items >= limit:
                    break

                # No pagination configured: one request is all there is.
                if request is None:
                    break

                if not has_more_pages(
                    data,
                    response_shape,
                    items,
                    strategy=request.strategy,
                ):
                    break

                if request.strategy == "offset":
                    # Prefer a server-provided next offset; otherwise advance
                    # by the page size that was requested.
                    server_offset = next_offset_value(data, response_shape)
                    if server_offset is not None:
                        offset = server_offset
                    else:
                        offset += page_limit
                elif request.strategy == "page":
                    page += 1
                elif request.strategy == "cursor":
                    cursor = next_cursor_value(data, response_shape)
                    # A missing or empty next cursor ends pagination.
                    if not cursor:
                        break

        finally:
            # Runs when the generator finishes or is closed early by the consumer.
            session.close()

`AbstractDataSource` ¶

Bases: ConfigBaseModel, ABC

Abstract base class for all data sources.

Data sources handle data retrieval from various sources and provide a unified interface for batch iteration. They are separate from Resources, which handle data transformation. Many DataSources can map to the same Resource.

Attributes:

Name	Type	Description
`source_type`	`DataSourceType`	Type of the data source
`resource_name`	`str \| None`	Name of the resource this data source maps to (set externally via DataSourceRegistry)

Source code in graflo/data_source/base.py

class AbstractDataSource(ConfigBaseModel, abc.ABC):
    """Common base for every data source.

    A data source is responsible only for retrieving data and exposing it
    through batch iteration. Transformation belongs to Resources, and
    several data sources may feed the same Resource.

    Attributes:
        source_type: Type of the data source
        resource_name: Name of the resource this data source maps to
            (set externally via DataSourceRegistry)
    """

    source_type: DataSourceType
    _resource_name: str | None = PrivateAttr(default=None)

    @property
    def resource_name(self) -> str | None:
        """Name of the resource this data source feeds.

        Returns:
            The resource name, or None when none has been assigned
        """
        return self._resource_name

    @resource_name.setter
    def resource_name(self, value: str | None) -> None:
        """Assign the resource this data source feeds.

        Args:
            value: Resource name to store (None clears it)
        """
        self._resource_name = value

    @abc.abstractmethod
    def iter_batches(
        self, batch_size: int = 1000, limit: int | None = None
    ) -> Iterator[list[dict]]:
        """Yield the source's documents as lists of dictionaries.

        Args:
            batch_size: Upper bound on the number of items in one yielded
                batch; the final batch may hold fewer.
            limit: Upper bound on the **total number of items** read from the
                source (rows, JSON documents, SPARQL subjects after grouping,
                etc.) rather than on the number of batches. ``None`` reads
                until the source is exhausted.

        Yields:
            list[dict]: One batch of documents at a time

        Raises:
            NotImplementedError: Always, unless overridden by a subclass
        """
        raise NotImplementedError("Subclasses must implement iter_batches")

    def __iter__(self):
        """Iterate over individual documents rather than batches.

        Yields:
            dict: One document at a time
        """
        for single_item_batch in self.iter_batches(batch_size=1, limit=None):
            yield from single_item_batch

`resource_name` `property` `writable` ¶

Get the resource name this data source maps to.

Returns:

Type	Description
`str \| None`	Resource name or None if not set

`__iter__()` ¶

Make data source iterable, yielding individual items.

Yields:

Name	Type	Description
`dict`		Individual documents

Source code in graflo/data_source/base.py

def __iter__(self):
    """Iterate over individual documents rather than batches.

    Yields:
        dict: One document at a time
    """
    for single_item_batch in self.iter_batches(batch_size=1, limit=None):
        yield from single_item_batch

`iter_batches(batch_size=1000, limit=None)` `abstractmethod` ¶

Iterate over data in batches.

This method yields batches of documents (dictionaries) from the data source. Each batch is a list of dictionaries representing the data items.

Parameters:

Name	Type	Description	Default
`batch_size`	`int`	Maximum number of items per yielded batch (last batch may be smaller).	`1000`
`limit`	`int \| None`	Cap on total items read from this source (rows, JSON documents, SPARQL subjects after grouping, etc.), not a cap on batches. `None` means read until the source is exhausted.	`None`

Yields:

Type	Description
`list[dict]`	Batches of documents as dictionaries

Raises:

Type	Description
`NotImplementedError`	Must be implemented by subclasses

Source code in graflo/data_source/base.py

@abc.abstractmethod
def iter_batches(
    self, batch_size: int = 1000, limit: int | None = None
) -> Iterator[list[dict]]:
    """Yield the source's documents as lists of dictionaries.

    Args:
        batch_size: Upper bound on the number of items in one yielded
            batch; the final batch may hold fewer.
        limit: Upper bound on the **total number of items** read from the
            source (rows, JSON documents, SPARQL subjects after grouping,
            etc.) rather than on the number of batches. ``None`` reads
            until the source is exhausted.

    Yields:
        list[dict]: One batch of documents at a time

    Raises:
        NotImplementedError: Always, unless overridden by a subclass
    """
    reason = "Subclasses must implement iter_batches"
    raise NotImplementedError(reason)

`ApiResponseStructure` ¶

Bases: ConfigBaseModel

Maps JSON response envelope fields to extraction and pagination signals.

Source code in graflo/architecture/contract/bindings/connectors.py

class ApiResponseStructure(ConfigBaseModel):
    """Maps JSON response envelope fields to extraction and pagination signals.

    Every ``*_path`` field is a dot path into the response body and defaults
    to ``None`` (unset); ``batch_metadata_paths`` defaults to an empty mapping
    and ``auto_detect`` to ``False``.
    """

    # --- Record extraction ---
    records_path: str | None = Field(
        default=None,
        description="Dot path to the record list (e.g. ``results``).",
    )
    # --- Pagination signals ---
    total_count_path: str | None = Field(
        default=None,
        description="Dot path to total item count across all pages (e.g. ``count``).",
    )
    offset_path: str | None = Field(
        default=None,
        description="Dot path to echoed page start index (e.g. ``offset``).",
    )
    next_offset_path: str | None = Field(
        default=None,
        description=(
            "Dot path to server-provided next offset for the following request "
            "(e.g. ``next_offset``)."
        ),
    )
    has_more_path: str | None = Field(
        default=None,
        description="Dot path to a boolean more-pages flag (e.g. ``has_more``).",
    )
    cursor_path: str | None = Field(
        default=None,
        description="Dot path to the next opaque cursor token.",
    )
    # --- Per-response metadata copied onto rows ---
    batch_metadata_paths: dict[str, str] = Field(
        default_factory=dict,
        description=(
            "Map row annotation keys to response dot paths "
            "(e.g. ``_batch_id: result_id``)."
        ),
    )
    # --- Path inference ---
    auto_detect: bool = Field(
        default=False,
        description=(
            "When true, infer unset response paths from the first response body."
        ),
    )

`DataSourceFactory` ¶

Factory for creating data source instances.

Source code in graflo/data_source/factory.py

class DataSourceFactory:
    """Factory for creating data source instances.

    File, SQL and in-memory data sources can be created here. API data
    sources are rejected with a ``ValueError``: they must be declared via
    bindings (``APIConnector``) instead.
    """

    @staticmethod
    def _guess_file_type(filename: Path) -> ChunkerType:
        """Infer the chunker type of ``filename`` via ``ChunkerFactory``."""
        return ChunkerFactory._guess_chunker_type(filename)

    @classmethod
    def create_file_data_source(
        cls,
        path: Path | str,
        file_type: str | ChunkerType | None = None,
        encoding: EncodingType = EncodingType.UTF_8,
        sep: str | None = None,
    ) -> (
        JsonFileDataSource
        | JsonlFileDataSource
        | TableFileDataSource
        | ParquetFileDataSource
    ):
        """Create the file data source matching a file's type.

        Args:
            path: Location of the file; a string is converted to ``Path``.
            file_type: Explicit type as a ``ChunkerType`` or its string value
                (case-insensitive). ``None`` means guess from ``path``.
            encoding: Text encoding passed to JSON, JSONL and table sources
                (not passed to the Parquet source).
            sep: Column separator for table files; ``None`` or an empty
                string falls back to ``","``.

        Returns:
            A JSON, JSONL, table or Parquet file data source.

        Raises:
            ValueError: If the file type cannot be guessed (the guess error
                is chained as ``__cause__``) or the type is not supported.
        """
        if isinstance(path, str):
            path = Path(path)

        if file_type is None:
            try:
                file_type_enum = cls._guess_file_type(path)
            except ValueError as e:
                # Chain explicitly so the traceback shows the guess failure as
                # the direct cause rather than an incidental nested error.
                raise ValueError(
                    f"Could not determine file type for {path}. "
                    f"Please specify file_type explicitly. Error: {e}"
                ) from e
        elif isinstance(file_type, str):
            file_type_enum = ChunkerType(file_type.lower())
        else:
            file_type_enum = file_type

        if file_type_enum == ChunkerType.JSON:
            return JsonFileDataSource(path=path, encoding=encoding)
        if file_type_enum == ChunkerType.JSONL:
            return JsonlFileDataSource(path=path, encoding=encoding)
        if file_type_enum == ChunkerType.TABLE:
            return TableFileDataSource(path=path, encoding=encoding, sep=sep or ",")
        if file_type_enum == ChunkerType.PARQUET:
            return ParquetFileDataSource(path=path)
        raise ValueError(f"Unsupported file type: {file_type_enum}")

    @classmethod
    def create_sql_data_source(cls, config: SQLConfig) -> SQLDataSource:
        """Wrap an ``SQLConfig`` in an ``SQLDataSource``."""
        return SQLDataSource(config=config)

    @classmethod
    def create_in_memory_data_source(
        cls,
        data: list[dict] | list[list] | pd.DataFrame,
        columns: list[str] | None = None,
    ) -> InMemoryDataSource:
        """Create an ``InMemoryDataSource`` from ``data`` and optional ``columns``."""
        return InMemoryDataSource(data=data, columns=columns)

    @classmethod
    def create_data_source(
        cls,
        source_type: DataSourceType | str | None = None,
        **kwargs: Any,
    ) -> AbstractDataSource:
        """Create a data source, inferring its type from ``kwargs`` if needed.

        When ``source_type`` is ``None`` the type is inferred, in this order:
        ``path`` or ``file_type`` present -> FILE; ``data`` present ->
        IN_MEMORY; ``config`` present -> SQL when it carries
        ``connection_string`` or ``query`` (dict key or attribute), otherwise
        a dict config's own ``source_type`` entry is used.

        Args:
            source_type: Explicit type as enum or string (case-insensitive),
                or ``None`` to infer it.
            **kwargs: Arguments forwarded to the type-specific creator. For
                SQL without a ``config`` entry, ``kwargs`` themselves are
                turned into an ``SQLConfig``.

        Returns:
            The created data source.

        Raises:
            ValueError: If the type cannot be inferred, is API (not supported
                here), is otherwise unsupported, or ``data`` is missing for an
                in-memory source.
        """
        if source_type is None:
            if "path" in kwargs or "file_type" in kwargs:
                source_type = DataSourceType.FILE
            elif "data" in kwargs:
                source_type = DataSourceType.IN_MEMORY
            elif "config" in kwargs:
                config = kwargs["config"]
                if isinstance(config, dict):
                    if "connection_string" in config or "query" in config:
                        source_type = DataSourceType.SQL
                    elif "source_type" in config:
                        source_type = DataSourceType(config["source_type"].lower())
                    else:
                        raise ValueError(
                            "Cannot determine source type from config. "
                            "Please specify source_type or provide "
                            "'connection_string'/'query' (SQL) in config."
                        )
                elif hasattr(config, "connection_string") or hasattr(config, "query"):
                    source_type = DataSourceType.SQL
                else:
                    raise ValueError(
                        "Cannot determine source type from config. "
                        "Please specify source_type explicitly."
                    )
            else:
                raise ValueError(
                    "Cannot determine source type. Please specify source_type or "
                    "provide one of: path (FILE), data (IN_MEMORY), or config (SQL)."
                )

        if isinstance(source_type, str):
            source_type = DataSourceType(source_type.lower())

        if source_type == DataSourceType.API:
            raise ValueError(
                "API data sources must be declared via bindings (APIConnector) and "
                "built with RegistryBuilder; inline API factory creation is not supported."
            )

        if source_type == DataSourceType.FILE:
            return cls.create_file_data_source(**kwargs)
        if source_type == DataSourceType.SQL:
            # Without an explicit config, the keyword arguments are the config.
            if "config" not in kwargs:
                config = SQLConfig.from_dict(kwargs)
                return cls.create_sql_data_source(config=config)
            config = kwargs["config"]
            if isinstance(config, dict):
                config = SQLConfig.from_dict(config)
            return cls.create_sql_data_source(config=config)
        if source_type == DataSourceType.IN_MEMORY:
            if "data" not in kwargs:
                raise ValueError("In-memory data source requires 'data' parameter")
            return cls.create_in_memory_data_source(**kwargs)
        raise ValueError(f"Unsupported data source type: {source_type}")

    @classmethod
    def create_data_source_from_config(
        cls, config: dict[str, Any]
    ) -> AbstractDataSource:
        """Create a data source from a flat configuration dictionary.

        Args:
            config: Creation arguments, optionally including ``source_type``.
                The dictionary is shallow-copied, so the caller's copy is not
                modified when ``source_type`` is removed.

        Returns:
            The created data source.

        Raises:
            ValueError: If ``source_type`` is ``"api"`` (case-insensitive), or
                for any reason ``create_data_source`` raises.
        """
        config = config.copy()
        source_type = config.pop("source_type", None)
        if source_type is not None and str(source_type).lower() == "api":
            raise ValueError(
                "API data sources must be declared via bindings (APIConnector) and "
                "ingested through GraphEngine.define_and_ingest with a ConnectionProvider."
            )
        return cls.create_data_source(source_type=source_type, **config)

`DataSourceRegistry` ¶

Bases: ConfigBaseModel

Registry for mapping data sources to resource names.

This class maintains a mapping from resource names to lists of data sources. Many data sources can map to the same resource, allowing data to be ingested from multiple sources and combined.

Attributes:

Name	Type	Description
`sources`	`dict[str, list[AbstractDataSource]]`	Dictionary mapping resource names to lists of data sources

Source code in graflo/data_source/registry.py

class DataSourceRegistry(ConfigBaseModel):
    """Lookup table from resource names to the data sources feeding them.

    Each resource name maps to a list, so a single resource can be ingested
    from several data sources whose data is then combined.

    Attributes:
        sources: Dictionary mapping resource names to lists of data sources
    """

    sources: dict[str, list[AbstractDataSource]] = Field(default_factory=dict)

    def register(self, data_source: AbstractDataSource, resource_name: str) -> None:
        """Attach a data source to a resource.

        The data source's ``resource_name`` is updated to match.

        Args:
            data_source: Data source to register
            resource_name: Name of the resource to map to
        """
        self.sources.setdefault(resource_name, []).append(data_source)
        data_source.resource_name = resource_name

    def get_data_sources(self, resource_name: str) -> list[AbstractDataSource]:
        """Look up the data sources registered for a resource.

        Args:
            resource_name: Name of the resource

        Returns:
            The registered list, or a new empty list for an unknown resource
        """
        if resource_name in self.sources:
            return self.sources[resource_name]
        return []

    def get_all_data_sources(self) -> list[AbstractDataSource]:
        """Collect every registered data source into one flat list.

        Returns:
            Data sources of all resources, in registry order
        """
        return [
            data_source
            for group in self.sources.values()
            for data_source in group
        ]

    def has_resource(self, resource_name: str) -> bool:
        """Tell whether at least one data source feeds a resource.

        Args:
            resource_name: Name of the resource

        Returns:
            True if the resource has data sources, False otherwise
        """
        return len(self.sources.get(resource_name, ())) > 0

    def clear(self) -> None:
        """Drop every registration, emptying the mapping in place."""
        registered = self.sources
        registered.clear()

`clear()` ¶

Clear all registered data sources.

Source code in graflo/data_source/registry.py

def clear(self) -> None:
    """Drop every registration, emptying the mapping in place."""
    registered = self.sources
    registered.clear()

`get_all_data_sources()` ¶

Get all registered data sources.

Returns:

Type	Description
`list[AbstractDataSource]`	List of all registered data sources

Source code in graflo/data_source/registry.py

def get_all_data_sources(self) -> list[AbstractDataSource]:
    """Collect every registered data source into one flat list.

    Returns:
        Data sources of all resources, in registry order
    """
    return [
        data_source
        for group in self.sources.values()
        for data_source in group
    ]

`get_data_sources(resource_name)` ¶

Get all data sources for a resource.

Parameters:

Name	Type	Description	Default
`resource_name`	`str`	Name of the resource	required

Returns:

Type	Description
`list[AbstractDataSource]`	List of data sources for the resource (empty list if none found)

Source code in graflo/data_source/registry.py

def get_data_sources(self, resource_name: str) -> list[AbstractDataSource]:
    """Look up the data sources registered for a resource.

    Args:
        resource_name: Name of the resource

    Returns:
        The registered list, or a new empty list for an unknown resource
    """
    if resource_name in self.sources:
        return self.sources[resource_name]
    return []

`has_resource(resource_name)` ¶

Check if a resource has any data sources.

Parameters:

Name	Type	Description	Default
`resource_name`	`str`	Name of the resource	required

Returns:

Type	Description
`bool`	True if the resource has data sources, False otherwise

Source code in graflo/data_source/registry.py

def has_resource(self, resource_name: str) -> bool:
    """Tell whether at least one data source feeds a resource.

    Args:
        resource_name: Name of the resource

    Returns:
        True if the resource has data sources, False otherwise
    """
    return len(self.sources.get(resource_name, ())) > 0

`register(data_source, resource_name)` ¶

Register a data source for a resource.

Parameters:

Name	Type	Description	Default
`data_source`	`AbstractDataSource`	Data source to register	required
`resource_name`	`str`	Name of the resource to map to	required

Source code in graflo/data_source/registry.py

def register(self, data_source: AbstractDataSource, resource_name: str) -> None:
    """Attach a data source to a resource.

    The data source's ``resource_name`` is updated to match.

    Args:
        data_source: Data source to register
        resource_name: Name of the resource to map to
    """
    self.sources.setdefault(resource_name, []).append(data_source)
    data_source.resource_name = resource_name

`DataSourceType` ¶

Bases: BaseEnum

Types of data sources supported by the system.

- FILE: File-based data sources (JSON, JSONL, CSV/TSV)
- API: REST API data sources
- SQL: SQL database data sources
- IN_MEMORY: In-memory data sources (lists, DataFrames)
- SPARQL: RDF data sources (local files via rdflib, remote endpoints via SPARQLWrapper)

Source code in graflo/data_source/base.py

class DataSourceType(BaseEnum):
    """Types of data sources supported by the system.

    Each member's value is its lowercase string identifier.

    Members:
        FILE: File-based data sources (JSON, JSONL, CSV/TSV)
        API: REST API data sources
        SQL: SQL database data sources
        IN_MEMORY: In-memory data sources (lists, DataFrames)
        SPARQL: RDF data sources (local files via rdflib, remote endpoints
            via SPARQLWrapper)
    """

    FILE = "file"
    API = "api"
    SQL = "sql"
    IN_MEMORY = "in_memory"
    SPARQL = "sparql"

`FileDataSource` ¶

Bases: AbstractDataSource

Base class for file-based data sources.

This class provides a common interface for file-based data sources, integrating with the existing chunker system for batch processing.

Attributes:

Name	Type	Description
`path`	`Path \| str`	Path to the file
`file_type`	`str \| None`	Type of file (json, jsonl, table)
`encoding`	`EncodingType`	File encoding (default: UTF_8)

Source code in graflo/data_source/file.py

class FileDataSource(AbstractDataSource):
    """Data source that reads its records from a file.

    Batching is delegated to the chunker created by ``ChunkerFactory``, so
    this class only assembles the chunker arguments from its own fields.

    Attributes:
        path: Path to the file (string values are converted to ``Path``)
        file_type: Type of file (json, jsonl, table)
        encoding: File encoding (default: UTF_8)
    """

    path: Path | str
    file_type: str | None = None
    encoding: EncodingType = EncodingType.UTF_8
    source_type: DataSourceType = DataSourceType.FILE

    @field_validator("path", mode="before")
    @classmethod
    def _path_to_path(cls, v: Path | str) -> Path:
        """Convert a string path to ``Path``; any other value passes through."""
        if isinstance(v, str):
            return Path(v)
        return v

    def iter_batches(
        self, batch_size: int = 1000, limit: int | None = None
    ) -> Iterator[list[dict]]:
        """Yield the file's contents as batches of dictionaries.

        Args:
            batch_size: Number of items per batch
            limit: Maximum number of items to retrieve

        Yields:
            list[dict]: Batches of documents as dictionaries
        """
        # An unset file_type is forwarded to the factory as None.
        kind = ChunkerType(self.file_type.lower()) if self.file_type else None

        options: dict[str, Any] = {
            "resource": self.path,
            "type": kind,
            "batch_size": batch_size,
            "limit": limit,
            "encoding": self.encoding,
        }
        # A separator only applies to table files, and only when this
        # instance actually carries a ``sep`` attribute.
        if kind == ChunkerType.TABLE and hasattr(self, "sep"):
            options["sep"] = self.sep

        yield from ChunkerFactory.create_chunker(**options)

`iter_batches(batch_size=1000, limit=None)` ¶

Iterate over file data in batches.

Parameters:

Name	Type	Description	Default
`batch_size`	`int`	Number of items per batch	`1000`
`limit`	`int \| None`	Maximum number of items to retrieve	`None`

Yields:

Type	Description
`list[dict]`	list[dict]: Batches of documents as dictionaries

Source code in graflo/data_source/file.py

def iter_batches(
    self, batch_size: int = 1000, limit: int | None = None
) -> Iterator[list[dict]]:
    """Yield the file's contents as batches of dictionaries.

    Args:
        batch_size: Number of items per batch
        limit: Maximum number of items to retrieve

    Yields:
        list[dict]: Batches of documents as dictionaries
    """
    # An unset file_type is forwarded to the factory as None.
    kind = ChunkerType(self.file_type.lower()) if self.file_type else None

    options: dict[str, Any] = {
        "resource": self.path,
        "type": kind,
        "batch_size": batch_size,
        "limit": limit,
        "encoding": self.encoding,
    }
    # A separator only applies to table files, and only when this
    # instance actually carries a ``sep`` attribute.
    if kind == ChunkerType.TABLE and hasattr(self, "sep"):
        options["sep"] = self.sep

    yield from ChunkerFactory.create_chunker(**options)

`InMemoryDataSource` ¶

Bases: AbstractDataSource

Data source for in-memory data structures.

This class provides a data source for Python objects that are already in memory, including lists of dictionaries, lists of lists, and Pandas DataFrames.

Attributes:

Name	Type	Description
`data`	`list[dict] \| list[list] \| DataFrame`	Data to process (list[dict], list[list], or pd.DataFrame)
`columns`	`list[str] \| None`	Optional column names for list[list] data

Source code in graflo/data_source/memory.py

class InMemoryDataSource(AbstractDataSource):
    """Data source wrapping Python objects that are already loaded in memory.

    Accepts lists of dictionaries, lists of lists (row form, paired with
    ``columns``), and Pandas DataFrames.

    Attributes:
        data: Data to process (list[dict], list[list], or pd.DataFrame)
        columns: Optional column names for list[list] data
    """

    model_config = {"arbitrary_types_allowed": True}

    data: list[dict] | list[list] | pd.DataFrame
    columns: list[str] | None = None
    source_type: DataSourceType = DataSourceType.IN_MEMORY

    def iter_batches(
        self, batch_size: int = 1000, limit: int | None = None
    ) -> Iterator[list[dict]]:
        """Yield the in-memory data as batches of dictionaries.

        Args:
            batch_size: Number of items per batch
            limit: Maximum number of items to retrieve

        Yields:
            list[dict]: Batches of documents as dictionaries

        Raises:
            ValueError: If ``data`` is a list of lists and ``columns`` is unset
        """
        records = self.data
        # Row form is detected from the first element only.
        rows_are_lists = (
            isinstance(records, list)
            and len(records) > 0
            and isinstance(records[0], list)
        )
        if rows_are_lists:
            if self.columns is None:
                raise ValueError(
                    "columns parameter is required when data is list[list]"
                )
            # Pair each row with the column names; elements that are not
            # lists are left out, and zip() stops at the shorter sequence.
            records = [
                dict(zip(self.columns, row))
                for row in records
                if isinstance(row, list)
            ]

        # ``columns`` is never forwarded: row lists were converted above.
        yield from ChunkerFactory.create_chunker(
            resource=records,
            batch_size=batch_size,
            limit=limit,
        )

`iter_batches(batch_size=1000, limit=None)` ¶

Iterate over in-memory data in batches.

Parameters:

Name	Type	Description	Default
`batch_size`	`int`	Number of items per batch	`1000`
`limit`	`int \| None`	Maximum number of items to retrieve	`None`

Yields:

Type	Description
`list[dict]`	list[dict]: Batches of documents as dictionaries

Source code in graflo/data_source/memory.py

def iter_batches(
    self, batch_size: int = 1000, limit: int | None = None
) -> Iterator[list[dict]]:
    """Iterate over in-memory data in batches.

    Args:
        batch_size: Number of items per batch
        limit: Maximum number of items to retrieve

    Yields:
        list[dict]: Batches of documents as dictionaries
    """
    # Normalize data: convert list[list] to list[dict] if needed
    data = self.data
    if isinstance(data, list) and len(data) > 0 and isinstance(data[0], list):
        # list[list] - convert to list[dict] using columns
        if self.columns is None:
            raise ValueError(
                "columns parameter is required when data is list[list]"
            )
        # Type narrowing: we've confirmed data[0] is a list, so data is list[list]
        # Create a properly typed list for iteration
        data_list: list[list] = []
        for item in data:
            if isinstance(item, list):
                data_list.append(item)
        data = [{k: v for k, v in zip(self.columns, item)} for item in data_list]

    # Create chunker using factory (only pass columns if it's a DataFrame)
    chunker_kwargs = {
        "resource": data,
        "batch_size": batch_size,
        "limit": limit,
    }
    # Note: columns is not passed to chunker - we handle list[list] conversion above
    # DataFrame chunker doesn't need columns either

    chunker = ChunkerFactory.create_chunker(**chunker_kwargs)

    # Yield batches
    for batch in chunker:
        yield batch

`JsonFileDataSource` ¶

Bases: FileDataSource

Data source for JSON files.

JSON files are expected to contain hierarchical data structures, similar to REST API responses. The chunker handles nested structures and converts them to dictionaries.

Attributes:

Name	Type	Description
`path`	`Path \| str`	Path to the JSON file
`encoding`	`EncodingType`	File encoding (default: UTF_8)

Source code in graflo/data_source/file.py

class JsonFileDataSource(FileDataSource):
    """Data source for JSON files.

    JSON files are expected to contain hierarchical data structures,
    similar to REST API responses. The chunker handles nested structures
    and converts them to dictionaries.

    Attributes:
        path: Path to the JSON file
        file_type: Defaults to ``ChunkerType.JSON.value``
        encoding: File encoding (default: UTF_8)
    """

    # Replaces the base-class default of None with the JSON chunker type.
    file_type: str = ChunkerType.JSON.value

`JsonlFileDataSource` ¶

Bases: FileDataSource

Data source for JSONL (JSON Lines) files.

JSONL files contain one JSON object per line, making them suitable for streaming and batch processing.

Attributes:

Name	Type	Description
`path`	`Path \| str`	Path to the JSONL file
`encoding`	`EncodingType`	File encoding (default: UTF_8)

Source code in graflo/data_source/file.py

class JsonlFileDataSource(FileDataSource):
    """Data source for JSONL (JSON Lines) files.

    JSONL files contain one JSON object per line, making them suitable
    for streaming and batch processing.

    Attributes:
        path: Path to the JSONL file
        file_type: Defaults to ``ChunkerType.JSONL.value``
        encoding: File encoding (default: UTF_8)
    """

    # Replaces the base-class default of None with the JSONL chunker type.
    file_type: str = ChunkerType.JSONL.value

`PaginationConfig` ¶

Bases: ConfigBaseModel

Configuration for API pagination (contract-level, secret-free).

Combines request construction (request) with response envelope parsing (response).

Source code in graflo/architecture/contract/bindings/connectors.py

class PaginationConfig(ConfigBaseModel):
    """Configuration for API pagination (contract-level, secret-free).

    Combines request construction (``request``) with response envelope parsing
    (``response``).

    Attributes:
        request: Settings for building paginated requests; a default
            ``PaginationRequestConfig`` is created when omitted.
        response: Settings describing the response envelope; a default
            ``ApiResponseStructure`` is created when omitted.
    """

    # default_factory gives every instance its own nested config object.
    request: PaginationRequestConfig = Field(default_factory=PaginationRequestConfig)
    response: ApiResponseStructure = Field(default_factory=ApiResponseStructure)

`PaginationRequestConfig` ¶

Bases: ConfigBaseModel

Configuration for building paginated HTTP requests.

Source code in graflo/architecture/contract/bindings/connectors.py

class PaginationRequestConfig(ConfigBaseModel):
    """Configuration for building paginated HTTP requests.

    Attributes:
        strategy: Pagination strategy: ``"offset"`` (default), ``"page"``,
            or ``"cursor"``.
        offset_param: Parameter name, default ``"offset"``. Presumably the
            query parameter carrying the offset; confirm against the
            request builder.
        limit_param: Query parameter name for page size (offset strategy
            only).
        cursor_param: Parameter name, default ``"cursor"``. Presumably used
            by the cursor strategy; confirm against the request builder.
        page_param: Parameter name, default ``"page"``. Presumably used by
            the page strategy; confirm against the request builder.
        per_page_param: Query parameter name for page size (page strategy
            only).
        initial_offset: Starting offset, default 0.
        initial_page: Starting page number, default 1.
        initial_cursor: Starting cursor, default ``None``.
        page_size: Records requested per HTTP page, default 100.
    """

    strategy: Literal["offset", "page", "cursor"] = "offset"
    offset_param: str = "offset"
    limit_param: str = Field(
        default="limit",
        description=(
            "Query parameter name for page size (offset strategy only). "
            "The value sent is ``page_size``, not a total item cap."
        ),
    )
    cursor_param: str = "cursor"
    page_param: str = "page"
    per_page_param: str = Field(
        default="per_page",
        description=(
            "Query parameter name for page size (page strategy only). "
            "The value sent is ``page_size``, not a total item cap."
        ),
    )
    # Starting positions, one per strategy.
    initial_offset: int = 0
    initial_page: int = 1
    initial_cursor: str | None = None
    page_size: int = Field(
        default=100,
        description=(
            "Records requested per HTTP page. Sent as the value of "
            "``limit_param`` (offset) or ``per_page_param`` (page)."
        ),
    )

`RdfDataSource` ¶

Bases: AbstractDataSource, ABC

Abstract base for RDF data sources (file and endpoint).

Captures the fields and batch-yielding logic shared by both :class:RdfFileDataSource and :class:SparqlEndpointDataSource.

Attributes:

Name	Type	Description
`rdf_class`	`str \| None`	Optional URI of the `rdf:Class` to filter subjects by.

Source code in graflo/data_source/rdf.py

class RdfDataSource(AbstractDataSource, abc.ABC):
    """Abstract base for RDF data sources (file and endpoint).

    Captures the fields and batch-yielding logic shared by both
    :class:`RdfFileDataSource` and :class:`SparqlEndpointDataSource`.

    Attributes:
        rdf_class: Optional URI of the ``rdf:Class`` to filter subjects by.
    """

    source_type: DataSourceType = DataSourceType.SPARQL
    rdf_class: str | None = Field(
        default=None, description="URI of the rdf:Class to filter by"
    )

    @staticmethod
    def _yield_batches(
        docs: list[dict], batch_size: int, limit: int | None
    ) -> Iterator[list[dict]]:
        """Apply *limit*, then yield *docs* in chunks of *batch_size*.

        Args:
            docs: Documents to split into batches.
            batch_size: Target number of documents per batch. Values below 1
                are treated as 1, matching ``SQLDataSource.iter_batches``.
            limit: Maximum number of documents to yield in total, or ``None``
                for all of them. Zero or a negative value yields nothing.

        Yields:
            list[dict]: Non-empty slices of *docs*; the last one may be
            shorter than *batch_size*.
        """
        if limit is not None:
            # Clamp at zero: a negative slice bound would drop documents from
            # the end of the list instead of returning none.
            docs = docs[: max(limit, 0)]
        # A zero step makes range() raise and a negative step yields nothing,
        # so force a step of at least one.
        step = max(1, batch_size)
        for start in range(0, len(docs), step):
            yield docs[start : start + step]

`RdfFileDataSource` ¶

Bases: RdfDataSource

Data source for local RDF files.

Parses RDF files using rdflib and yields flat dictionaries grouped by subject URI. Optionally filters by rdf_class so that only instances of a specific class are returned.

Attributes:

Name	Type	Description
`path`	`Path`	Path to the RDF file.
`rdf_format`	`str \| None`	Explicit rdflib format string (e.g. `"turtle"`). When `None` the format is guessed from the file extension.

Source code in graflo/data_source/rdf.py

class RdfFileDataSource(RdfDataSource):
    """Data source for local RDF files.

    Parses RDF files using *rdflib* and yields flat dictionaries grouped by
    subject URI.  Optionally filters by ``rdf_class`` so that only instances
    of a specific class are returned.

    Attributes:
        path: Path to the RDF file.
        rdf_format: Explicit rdflib format string (e.g. ``"turtle"``).
            When ``None`` the format is guessed from the file extension.
    """

    path: Path
    rdf_format: str | None = Field(
        default=None, description="rdflib serialization format"
    )

    def _resolve_format(self) -> str:
        """Return the rdflib format string, guessing from extension if needed.

        Raises:
            ValueError: If ``rdf_format`` is unset and the file extension has
                no entry in ``_EXT_FORMAT``.
        """
        if self.rdf_format:
            return self.rdf_format
        ext = self.path.suffix.lower()
        fmt = _EXT_FORMAT.get(ext)
        if fmt is None:
            raise ValueError(
                f"Cannot determine RDF format for extension '{ext}'. "
                f"Set rdf_format explicitly. Known: {list(_EXT_FORMAT.keys())}"
            )
        return fmt

    def iter_batches(
        self, batch_size: int = 1000, limit: int | None = None
    ) -> Iterator[list[dict]]:
        """Parse the RDF file and yield batches of flat dictionaries.

        The whole file is parsed into an rdflib ``Graph`` before the first
        batch is produced.

        Args:
            batch_size: Number of documents per batch.
            limit: Maximum number of documents to yield, or ``None`` for all.

        Yields:
            list[dict]: Batches of documents built by ``_triples_to_docs``.

        Raises:
            ImportError: If rdflib cannot be imported.
            ValueError: If the RDF format cannot be determined.
        """
        try:
            from rdflib import Graph
        except ImportError as exc:
            raise ImportError(
                "rdflib is required for RDF data sources. "
                "It is a core dependency of graflo; reinstall with "
                "`pip install --force-reinstall graflo` or install rdflib manually."
            ) from exc

        # Resolve once so parsing and logging report the same format.
        rdf_format = self._resolve_format()

        g = Graph()
        g.parse(str(self.path), format=rdf_format)
        logger.info(
            "Parsed %d triples from %s (format=%s)",
            len(g),
            self.path,
            rdf_format,
        )

        docs = _triples_to_docs(g, rdf_class=self.rdf_class)
        yield from self._yield_batches(docs, batch_size, limit)

`iter_batches(batch_size=1000, limit=None)` ¶

Parse the RDF file and yield batches of flat dictionaries.

Source code in graflo/data_source/rdf.py

def iter_batches(
    self, batch_size: int = 1000, limit: int | None = None
) -> Iterator[list[dict]]:
    """Parse the RDF file and yield batches of flat dictionaries.

    The whole file is parsed into an rdflib ``Graph`` before the first
    batch is produced.

    Args:
        batch_size: Number of documents per batch.
        limit: Maximum number of documents to yield, or ``None`` for all.

    Yields:
        list[dict]: Batches of documents built by ``_triples_to_docs``.

    Raises:
        ImportError: If rdflib cannot be imported.
        ValueError: If the RDF format cannot be determined.
    """
    try:
        from rdflib import Graph
    except ImportError as exc:
        raise ImportError(
            "rdflib is required for RDF data sources. "
            "It is a core dependency of graflo; reinstall with "
            "`pip install --force-reinstall graflo` or install rdflib manually."
        ) from exc

    # Resolve once so parsing and logging report the same format.
    rdf_format = self._resolve_format()

    g = Graph()
    g.parse(str(self.path), format=rdf_format)
    logger.info(
        "Parsed %d triples from %s (format=%s)",
        len(g),
        self.path,
        rdf_format,
    )

    docs = _triples_to_docs(g, rdf_class=self.rdf_class)
    yield from self._yield_batches(docs, batch_size, limit)

`SQLConfig` ¶

Bases: ConfigBaseModel

Configuration for SQL data source.

Uses SQLAlchemy connection string format.

Attributes:

Name	Type	Description
`connection_string`	`str`	SQLAlchemy connection string (e.g., 'postgresql://user:pass@localhost/dbname')
`query`	`str`	SQL query string (supports parameterized queries)
`params`	`dict[str, Any]`	Query parameters as dictionary (for parameterized queries)
`pagination`	`bool \| None`	Deprecated. Ignored; retained for config compatibility.
`page_size`	`int \| None`	Deprecated. Ignored; use `iter_batches(batch_size=...)`.

Source code in graflo/data_source/sql.py

class SQLConfig(ConfigBaseModel):
    """Configuration for SQL data source.

    Uses SQLAlchemy connection string format.

    Attributes:
        connection_string: SQLAlchemy connection string
            (e.g., 'postgresql://user:pass@localhost/dbname')
        query: SQL query string (supports parameterized queries)
        params: Query parameters as dictionary (for parameterized queries)
        pagination: Deprecated. Ignored; retained for config compatibility.
        page_size: Deprecated. Ignored; use ``iter_batches(batch_size=...)``.
    """

    connection_string: str
    query: str
    # default_factory gives every config its own params dict.
    params: dict[str, Any] = Field(default_factory=dict)
    # Deprecated fields, kept so existing configs still load.
    pagination: bool | None = None
    page_size: int | None = None

`SQLDataSource` ¶

Bases: AbstractDataSource

Data source for SQL databases.

This class provides a data source for SQL databases using SQLAlchemy. Results are streamed with stream_results and fetchmany, so large queries avoid OFFSET-based re-scans and keep memory bounded per chunk. Rows are returned as dictionaries with column names as keys.

Attributes:

Name	Type	Description
`config`	`SQLConfig`	SQL configuration
`_engine`	`Engine \| None`	SQLAlchemy engine (private; created on first use)

Source code in graflo/data_source/sql.py

class SQLDataSource(AbstractDataSource):
    """Data source for SQL databases.

    This class provides a data source for SQL databases using SQLAlchemy.
    Results are streamed with ``stream_results`` and ``fetchmany``, so large
    queries avoid OFFSET-based re-scans and keep memory bounded per chunk.
    Rows are returned as dictionaries with column names as keys.

    Attributes:
        config: SQL configuration
        _engine: SQLAlchemy engine (private; created on first use)
    """

    config: SQLConfig
    source_type: DataSourceType = DataSourceType.SQL
    _engine: Engine | None = PrivateAttr(default=None)

    def _get_engine(self) -> Engine:
        """Get or create SQLAlchemy engine.

        Returns:
            SQLAlchemy engine instance
        """
        if self._engine is None:
            self._engine = create_engine(self.config.connection_string)
        return self._engine

    @staticmethod
    def _row_to_json_dict(row: Any) -> dict[str, Any]:
        """Map one result row to a plain dict with JSON-friendly values.

        ``Decimal`` values are converted to ``float``; all other values are
        kept as returned by the driver.
        """
        row_dict: dict[str, Any] = dict(row._mapping)
        # Only existing keys are reassigned, so the dict does not change size
        # while it is being iterated.
        for key, value in row_dict.items():
            if isinstance(value, Decimal):
                row_dict[key] = float(value)
        return row_dict

    def iter_batches(
        self, batch_size: int = 1000, limit: int | None = None
    ) -> Iterator[list[dict]]:
        """Iterate over SQL query results in batches.

        Executes the configured query once per call and reads via
        ``fetchmany`` on a streaming result. Optional ``limit`` stops after
        that many rows without adding LIMIT/OFFSET to the SQL text.

        Args:
            batch_size: Target size of each yielded batch of row dicts
                (last batch may be smaller). Values below 1 are treated as 1.
            limit: Maximum total rows to read, or ``None`` for full result.

        Yields:
            list[dict]: Batches of rows as dictionaries

        Raises:
            Exception: Any error raised while connecting, executing, or
                fetching is logged and then re-raised, so a failed query is
                never mistaken for an empty or complete result.
        """
        effective_batch = max(1, batch_size)
        engine = self._get_engine()
        total_items = 0

        try:
            with engine.connect() as conn:
                stream = conn.execution_options(stream_results=True)
                result = stream.execute(text(self.config.query), self.config.params)
                try:
                    while True:
                        if limit is not None and total_items >= limit:
                            break

                        # Never fetch more rows than the limit still allows.
                        remaining = None if limit is None else limit - total_items
                        fetch_n = (
                            effective_batch
                            if remaining is None
                            else min(effective_batch, remaining)
                        )

                        rows = result.fetchmany(fetch_n)
                        if not rows:
                            break

                        batch: list[dict] = []
                        for row in rows:
                            batch.append(self._row_to_json_dict(row))
                            total_items += 1
                            if limit is not None and total_items >= limit:
                                break

                        if batch:
                            yield batch

                        if limit is not None and total_items >= limit:
                            break
                finally:
                    result.close()

        except Exception as e:
            logger.error("SQL query execution failed: %s", e)
            # Propagate: swallowing here would silently truncate the data.
            raise

`iter_batches(batch_size=1000, limit=None)` ¶

Iterate over SQL query results in batches.

Executes the configured query once per call and reads via fetchmany on a streaming result. Optional limit stops after that many rows without adding LIMIT/OFFSET to the SQL text.

Parameters:

Name	Type	Description	Default
`batch_size`	`int`	Target size of each yielded batch of row dicts (last batch may be smaller).	`1000`
`limit`	`int \| None`	Maximum total rows to read, or `None` for full result.	`None`

Yields:

Type	Description
`list[dict]`	list[dict]: Batches of rows as dictionaries

Source code in graflo/data_source/sql.py

def iter_batches(
    self, batch_size: int = 1000, limit: int | None = None
) -> Iterator[list[dict]]:
    """Iterate over SQL query results in batches.

    Executes the configured query once per call and reads via
    ``fetchmany`` on a streaming result. Optional ``limit`` stops after
    that many rows without adding LIMIT/OFFSET to the SQL text.

    Args:
        batch_size: Target size of each yielded batch of row dicts
            (last batch may be smaller). Values below 1 are treated as 1.
        limit: Maximum total rows to read, or ``None`` for full result.

    Yields:
        list[dict]: Batches of rows as dictionaries

    Raises:
        Exception: Any error raised while connecting, executing, or
            fetching is logged and then re-raised, so a failed query is
            never mistaken for an empty or complete result.
    """
    effective_batch = max(1, batch_size)
    engine = self._get_engine()
    total_items = 0

    try:
        with engine.connect() as conn:
            stream = conn.execution_options(stream_results=True)
            result = stream.execute(text(self.config.query), self.config.params)
            try:
                while True:
                    if limit is not None and total_items >= limit:
                        break

                    # Never fetch more rows than the limit still allows.
                    remaining = None if limit is None else limit - total_items
                    fetch_n = (
                        effective_batch
                        if remaining is None
                        else min(effective_batch, remaining)
                    )

                    rows = result.fetchmany(fetch_n)
                    if not rows:
                        break

                    batch: list[dict] = []
                    for row in rows:
                        batch.append(self._row_to_json_dict(row))
                        total_items += 1
                        if limit is not None and total_items >= limit:
                            break

                    if batch:
                        yield batch

                    if limit is not None and total_items >= limit:
                        break
            finally:
                result.close()

    except Exception as e:
        logger.error("SQL query execution failed: %s", e)
        # Propagate: swallowing here would silently truncate the data.
        raise

`SparqlEndpointDataSource` ¶

Bases: RdfDataSource

Data source that reads from a SPARQL endpoint.

Uses SPARQLWrapper to query an endpoint and returns flat dictionaries grouped by subject.

Attributes:

Name	Type	Description
`config`	`SparqlSourceConfig`	SPARQL source configuration.

Source code in graflo/data_source/rdf.py

class SparqlEndpointDataSource(RdfDataSource):
    """RDF data source backed by a remote SPARQL endpoint.

    Queries the endpoint through ``SPARQLWrapper`` and produces flat
    dictionaries, one per subject.

    Attributes:
        config: SPARQL source configuration.
    """

    config: SparqlSourceConfig

    def _create_wrapper(self) -> Any:
        """Build a ``SPARQLWrapper`` client set up from ``self.config``.

        The client returns JSON; credentials are applied only when both a
        username and a password are configured.

        Raises:
            ImportError: If ``SPARQLWrapper`` cannot be imported.
        """
        try:
            from SPARQLWrapper import JSON, SPARQLWrapper
        except ImportError as exc:
            raise ImportError(
                "SPARQLWrapper is required for SPARQL endpoint data sources. "
                "It is a core dependency of graflo; reinstall with "
                "`pip install --force-reinstall graflo` or install SPARQLWrapper manually."
            ) from exc

        client = SPARQLWrapper(self.config.endpoint_url)
        client.setReturnFormat(JSON)
        has_credentials = self.config.username and self.config.password
        if has_credentials:
            client.setCredentials(self.config.username, self.config.password)
        return client

    def iter_batches(
        self, batch_size: int = 1000, limit: int | None = None
    ) -> Iterator[list[dict]]:
        """Stream subject documents from the endpoint in batches.

        Result rows (bindings) are fetched page by page with LIMIT/OFFSET.
        Consecutive rows sharing the same ``?s`` value are merged into one
        document; a document is finished when a row with a different
        subject arrives. Fetching stops once *limit* subjects have been
        collected (when set).

        Args:
            batch_size: Number of subject documents per yielded batch.
            limit: Maximum number of subject documents, or ``None`` for all.

        Yields:
            list[dict]: Batches of subject documents.
        """
        client = self._create_wrapper()
        rows_per_page = self.config.page_size
        row_offset = 0
        current_uri: str | None = None
        current_doc: dict[str, Any] | None = None
        pending: list[dict] = []
        emitted = 0

        def limit_reached() -> bool:
            """Tell whether the subject limit (if any) has been met."""
            return limit is not None and emitted >= limit

        while not limit_reached():
            query = self.config.build_query(offset=row_offset, limit=rows_per_page)
            client.setQuery(query)

            logger.debug("SPARQL query (offset=%d): %s", row_offset, query)
            payload = client.queryAndConvert()

            rows = payload.get("results", {}).get("bindings", [])
            if not rows:
                break

            hit_limit = False
            for row in rows:
                subject = row["s"]["value"]

                if current_uri is not None and subject != current_uri:
                    # Subject boundary: the open document is complete.
                    assert current_doc is not None
                    if not limit_reached():
                        pending.append(current_doc)
                        emitted += 1
                        if len(pending) >= batch_size:
                            # Swap in a fresh list before handing this one out.
                            full, pending = pending, []
                            yield full
                    current_uri = None
                    current_doc = None
                    if limit_reached():
                        hit_limit = True
                        break

                if current_uri is None:
                    current_uri = subject
                    current_doc = {"_uri": subject, "_key": _local_name(subject)}

                assert current_doc is not None
                _merge_sparql_binding_into_doc(current_doc, row)

            # A short page means the endpoint has no further rows.
            if hit_limit or len(rows) < rows_per_page:
                break

            row_offset += rows_per_page

        # Flush the subject that was still open when the rows ran out.
        if current_doc is not None and current_uri is not None and not limit_reached():
            pending.append(current_doc)
        if pending:
            yield pending

`iter_batches(batch_size=1000, limit=None)` ¶

Query the SPARQL endpoint and yield batches of flat dictionaries.

Paginates with SPARQL LIMIT/OFFSET on bindings (triple rows), merges rows into subject documents in a streaming fashion, and stops fetching once limit subjects have been yielded (when set).

Source code in graflo/data_source/rdf.py

def iter_batches(
    self, batch_size: int = 1000, limit: int | None = None
) -> Iterator[list[dict]]:
    """Stream subject documents from the endpoint in batches.

    Result rows (bindings) are fetched page by page with LIMIT/OFFSET.
    Consecutive rows sharing the same ``?s`` value are merged into one
    document; a document is finished when a row with a different
    subject arrives. Fetching stops once *limit* subjects have been
    collected (when set).

    Args:
        batch_size: Number of subject documents per yielded batch.
        limit: Maximum number of subject documents, or ``None`` for all.

    Yields:
        list[dict]: Batches of subject documents.
    """
    client = self._create_wrapper()
    rows_per_page = self.config.page_size
    row_offset = 0
    current_uri: str | None = None
    current_doc: dict[str, Any] | None = None
    pending: list[dict] = []
    emitted = 0

    def limit_reached() -> bool:
        """Tell whether the subject limit (if any) has been met."""
        return limit is not None and emitted >= limit

    while not limit_reached():
        query = self.config.build_query(offset=row_offset, limit=rows_per_page)
        client.setQuery(query)

        logger.debug("SPARQL query (offset=%d): %s", row_offset, query)
        payload = client.queryAndConvert()

        rows = payload.get("results", {}).get("bindings", [])
        if not rows:
            break

        hit_limit = False
        for row in rows:
            subject = row["s"]["value"]

            if current_uri is not None and subject != current_uri:
                # Subject boundary: the open document is complete.
                assert current_doc is not None
                if not limit_reached():
                    pending.append(current_doc)
                    emitted += 1
                    if len(pending) >= batch_size:
                        # Swap in a fresh list before handing this one out.
                        full, pending = pending, []
                        yield full
                current_uri = None
                current_doc = None
                if limit_reached():
                    hit_limit = True
                    break

            if current_uri is None:
                current_uri = subject
                current_doc = {"_uri": subject, "_key": _local_name(subject)}

            assert current_doc is not None
            _merge_sparql_binding_into_doc(current_doc, row)

        # A short page means the endpoint has no further rows.
        if hit_limit or len(rows) < rows_per_page:
            break

        row_offset += rows_per_page

    # Flush the subject that was still open when the rows ran out.
    if current_doc is not None and current_uri is not None and not limit_reached():
        pending.append(current_doc)
    if pending:
        yield pending

`SparqlSourceConfig` ¶

Bases: ConfigBaseModel

Configuration for a SPARQL endpoint data source.

Attributes:

Name	Type	Description
`endpoint_url`	`str`	Full SPARQL query endpoint URL (e.g. `http://localhost:3030/dataset/sparql`)
`rdf_class`	`str \| None`	URI of the rdf:Class whose instances to fetch
`graph_uri`	`str \| None`	Named graph to restrict the query to (optional)
`sparql_query`	`str \| None`	Custom SPARQL query override (optional)
`username`	`str \| None`	HTTP basic-auth username (optional)
`password`	`str \| None`	HTTP basic-auth password (optional)
`page_size`	`int`	Number of results per SPARQL LIMIT/OFFSET page

Source code in graflo/data_source/rdf.py

class SparqlSourceConfig(ConfigBaseModel):
    """Configuration for a SPARQL endpoint data source.

    Attributes:
        endpoint_url: Full SPARQL query endpoint URL
            (e.g. ``http://localhost:3030/dataset/sparql``)
        rdf_class: URI of the rdf:Class whose instances to fetch
        graph_uri: Named graph to restrict the query to (optional)
        sparql_query: Custom SPARQL query override (optional)
        username: HTTP basic-auth username (optional)
        password: HTTP basic-auth password (optional)
        page_size: Number of results per SPARQL LIMIT/OFFSET page
    """

    endpoint_url: str
    rdf_class: str | None = None
    graph_uri: str | None = None
    sparql_query: str | None = None
    username: str | None = None
    password: str | None = None
    page_size: int = Field(default=10_000, description="SPARQL pagination page size")

    def build_query(self, offset: int = 0, limit: int | None = None) -> str:
        """Build one page of a SPARQL SELECT query.

        If *sparql_query* is set it is used as the base query (trailing
        whitespace and ``;`` removed). Otherwise the base query is::

            SELECT ?s ?p ?o WHERE { ?s a <rdf_class> . ?s ?p ?o . }

        optionally wrapped in ``GRAPH <graph_uri> { ... }`` and without the
        class filter when *rdf_class* is unset. In both cases ``ORDER BY ?s``
        is appended unless the base query already contains an ORDER BY
        clause, followed by ``LIMIT``/``OFFSET``.

        Args:
            offset: Number of result rows to skip (SPARQL ``OFFSET``).
            limit: Maximum number of rows for this page; falls back to
                *page_size* when ``None``.

        Returns:
            The complete query string for the requested page.
        """
        # Function-scope import keeps this block self-contained.
        import re

        if self.sparql_query:
            base = self.sparql_query.rstrip().rstrip(";")
        else:
            graph_open = f"GRAPH <{self.graph_uri}> {{" if self.graph_uri else ""
            graph_close = "}" if self.graph_uri else ""
            class_filter = f"?s a <{self.rdf_class}> . " if self.rdf_class else ""
            base = (
                f"SELECT ?s ?p ?o WHERE {{ "
                f"{graph_open} "
                f"{class_filter}"
                f"?s ?p ?o . "
                f"{graph_close} "
                f"}}"
            )

        effective_limit = limit if limit is not None else self.page_size
        # Group bindings by subject during streaming pagination; requires all
        # triple rows for one ?s to appear contiguously in the result.
        # SPARQL keywords are case-insensitive and may be separated by any
        # whitespace (spaces, tabs, newlines), so a plain substring test for
        # "ORDER BY" would miss e.g. "order\n  by" and add a second clause.
        has_order_by = re.search(r"\bORDER\s+BY\b", base, flags=re.IGNORECASE)
        order_clause = "" if has_order_by else " ORDER BY ?s"
        return f"{base}{order_clause} LIMIT {effective_limit} OFFSET {offset}"

`build_query(offset=0, limit=None)` ¶

Build a SPARQL SELECT query.

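If `sparql_query` is set, it is used as the base query and paginated with LIMIT/OFFSET (an `ORDER BY ?s` clause is added when the query has no ORDER BY of its own). Otherwise the base query is generated as: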

SELECT ?s ?p ?o WHERE { ?s a <rdf_class> . ?s ?p ?o . }

Source code in graflo/data_source/rdf.py

def build_query(self, offset: int = 0, limit: int | None = None) -> str:
    """Build a SPARQL SELECT query.

    If *sparql_query* is set it is returned with LIMIT/OFFSET appended.
    Otherwise generates::

        SELECT ?s ?p ?o WHERE { ?s a <rdf_class> . ?s ?p ?o . }
    """
    if self.sparql_query:
        base = self.sparql_query.rstrip().rstrip(";")
    else:
        graph_open = f"GRAPH <{self.graph_uri}> {{" if self.graph_uri else ""
        graph_close = "}" if self.graph_uri else ""
        class_filter = f"?s a <{self.rdf_class}> . " if self.rdf_class else ""
        base = (
            f"SELECT ?s ?p ?o WHERE {{ "
            f"{graph_open} "
            f"{class_filter}"
            f"?s ?p ?o . "
            f"{graph_close} "
            f"}}"
        )

    effective_limit = limit if limit is not None else self.page_size
    # Group bindings by subject during streaming pagination; requires all
    # triple rows for one ?s to appear contiguously in the result.
    order_clause = "" if "ORDER BY" in base.upper() else " ORDER BY ?s"
    return f"{base}{order_clause} LIMIT {effective_limit} OFFSET {offset}"

`TableFileDataSource` ¶

Bases: FileDataSource

Data source for CSV/TSV files.

Table files are converted to dictionaries with column headers as keys. Each row becomes a dictionary.

Attributes:

Name	Type	Description
`path`	`Path \| str`	Path to the CSV/TSV file
`encoding`	`EncodingType`	File encoding (default: UTF_8)
`sep`	`str`	Field separator (default: ',')

Source code in graflo/data_source/file.py

class TableFileDataSource(FileDataSource):
    """Data source for CSV/TSV files.

    Table files are converted to dictionaries with column headers as keys.
    Each row becomes a dictionary.

    Attributes:
        path: Path to the CSV/TSV file
        encoding: File encoding (default: UTF_8)
        sep: Field separator (default: ',')
        file_type: Chunker type identifier, fixed to the TABLE chunker value
    """

    # Column delimiter; the default is a comma (override for TSV input).
    sep: str = ","
    # Pinned to the TABLE chunker type; presumably this is what the base
    # class uses to choose the table reader — confirm in FileDataSource.
    file_type: str = ChunkerType.TABLE.value

graflo.data_source¶

APIConfig ¶

APIConnector ¶

build_api_config(*, base_url, auth=None, default_headers=None, page_size_override=None) ¶

matches(resource_identifier) ¶

APIDataSource ¶

AbstractDataSource ¶

resource_name property writable ¶

__iter__() ¶

iter_batches(batch_size=1000, limit=None) abstractmethod ¶

ApiResponseStructure ¶

DataSourceFactory ¶

DataSourceRegistry ¶

clear() ¶

get_all_data_sources() ¶

get_data_sources(resource_name) ¶

has_resource(resource_name) ¶

register(data_source, resource_name) ¶

DataSourceType ¶

FileDataSource ¶

iter_batches(batch_size=1000, limit=None) ¶

InMemoryDataSource ¶

iter_batches(batch_size=1000, limit=None) ¶

JsonFileDataSource ¶

JsonlFileDataSource ¶

PaginationConfig ¶

PaginationRequestConfig ¶

RdfDataSource ¶

RdfFileDataSource ¶

iter_batches(batch_size=1000, limit=None) ¶

SQLConfig ¶

SQLDataSource ¶

iter_batches(batch_size=1000, limit=None) ¶

SparqlEndpointDataSource ¶

iter_batches(batch_size=1000, limit=None) ¶

SparqlSourceConfig ¶

build_query(offset=0, limit=None) ¶

TableFileDataSource ¶

`graflo.data_source`¶

`APIConfig` ¶

`APIConnector` ¶

`build_api_config(*, base_url, auth=None, default_headers=None, page_size_override=None)` ¶

`matches(resource_identifier)` ¶

`APIDataSource` ¶

`AbstractDataSource` ¶

`resource_name` `property` `writable` ¶

`__iter__()` ¶

`iter_batches(batch_size=1000, limit=None)` `abstractmethod` ¶

`ApiResponseStructure` ¶

`DataSourceFactory` ¶

`DataSourceRegistry` ¶

`clear()` ¶

`get_all_data_sources()` ¶

`get_data_sources(resource_name)` ¶

`has_resource(resource_name)` ¶

`register(data_source, resource_name)` ¶

`DataSourceType` ¶

`FileDataSource` ¶

`iter_batches(batch_size=1000, limit=None)` ¶

`InMemoryDataSource` ¶

`iter_batches(batch_size=1000, limit=None)` ¶

`JsonFileDataSource` ¶

`JsonlFileDataSource` ¶

`PaginationConfig` ¶

`PaginationRequestConfig` ¶

`RdfDataSource` ¶

`RdfFileDataSource` ¶

`iter_batches(batch_size=1000, limit=None)` ¶

`SQLConfig` ¶

`SQLDataSource` ¶

`iter_batches(batch_size=1000, limit=None)` ¶

`SparqlEndpointDataSource` ¶

`iter_batches(batch_size=1000, limit=None)` ¶

`SparqlSourceConfig` ¶

`build_query(offset=0, limit=None)` ¶

`TableFileDataSource` ¶