Merge pull request #1209 from kedro-org/feature/pdfdataset

jitu5 · web-flow · commit c297ad05efa2 · 2025-10-13T09:35:36.000+01:00
feat(datasets): Added the Experimental pypdf.PDFDataset
diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md
@@ -12,6 +12,14 @@
 ## Bug fixes and other changes
 - Add HTMLPreview type.
 
+## Major features and improvements
+
+- Added the following new experimental datasets:
+
+| Type                           | Description                                                   | Location                             |
+|--------------------------------|---------------------------------------------------------------|--------------------------------------|
+| `pypdf.PDFDataset`             | A dataset to read PDF files and extract text using pypdf      | `kedro_datasets_experimental.pypdf`  |
+
 # Release 8.1.0
 ## Major features and improvements
 
diff --git a/kedro-datasets/docs/api/kedro_datasets_experimental/index.md b/kedro-datasets/docs/api/kedro_datasets_experimental/index.md
@@ -13,6 +13,7 @@ Name | Description
 [langchain.OpenAIEmbeddingsDataset](langchain.OpenAIEmbeddingsDataset.md) | ``OpenAIEmbeddingsDataset`` loads a OpenAIEmbeddings `langchain` model.
 [langchain.LangChainPromptDataset](langchain.LangChainPromptDataset.md) | ``LangChainPromptDataset`` loads a `langchain` prompt template.
 [netcdf.NetCDFDataset](netcdf.NetCDFDataset.md) | ``NetCDFDataset`` loads/saves data from/to a NetCDF file using an underlying filesystem (e.g.: local, S3, GCS). It uses xarray to handle the NetCDF file.
+[pypdf.PDFDataset](pypdf.PDFDataset.md) | ``PDFDataset`` loads data from PDF files using pypdf to extract text from pages. Read-only dataset.
 [polars.PolarsDatabaseDataset](polars.PolarsDatabaseDataset.md) | ``PolarsDatabaseDataset`` implementation to access databases as Polars DataFrames. It supports reading from a SQL query and writing to a database table.
 [prophet.ProphetModelDataset](prophet.ProphetModelDataset.md) | ``ProphetModelDataset`` loads/saves Facebook Prophet models to a JSON file using an underlying filesystem (e.g., local, S3, GCS). It uses Prophet's built-in serialisation to handle the JSON file.
 [pytorch.PyTorchDataset](pytorch.PyTorchDataset.md) | ``PyTorchDataset`` loads and saves PyTorch models' `state_dict` using PyTorch's recommended zipfile serialization protocol. To avoid security issues with Pickle.
diff --git a/kedro-datasets/docs/api/kedro_datasets_experimental/pypdf.PDFDataset.md b/kedro-datasets/docs/api/kedro_datasets_experimental/pypdf.PDFDataset.md
@@ -0,0 +1,4 @@
+::: kedro_datasets_experimental.pypdf.PDFDataset
+    options:
+        members: true
+        show_source: true
diff --git a/kedro-datasets/kedro_datasets_experimental/pypdf/__init__.py b/kedro-datasets/kedro_datasets_experimental/pypdf/__init__.py
@@ -0,0 +1,16 @@
+"""``AbstractDataset`` implementation to load data from PDF files using pypdf."""
+
+from typing import Any
+
+import lazy_loader as lazy
+
+try:
+    from .pdf_dataset import PDFDataset
+except (ImportError, RuntimeError):
+    # Keep the package importable (e.g. documentation builds) if the dataset import fails.
+    # https://github.com/pylint-dev/pylint/issues/4300#issuecomment-1043601901
+    PDFDataset: Any
+
+__getattr__, __dir__, __all__ = lazy.attach(
+    __name__, submod_attrs={"pdf_dataset": ["PDFDataset"]}
+)
diff --git a/kedro-datasets/kedro_datasets_experimental/pypdf/pdf_dataset.py b/kedro-datasets/kedro_datasets_experimental/pypdf/pdf_dataset.py
@@ -0,0 +1,163 @@
+"""``PDFDataset`` loads data from PDF files using an underlying
+filesystem (e.g.: local, S3, GCS). It uses pypdf to read and extract text from PDF files.
+"""
+from __future__ import annotations
+
+from copy import deepcopy
+from pathlib import PurePosixPath
+from typing import Any, NoReturn
+
+import fsspec
+import pypdf
+from kedro.io.core import (
+    AbstractDataset,
+    DatasetError,
+    get_filepath_str,
+    get_protocol_and_path,
+)
+
+
+class PDFDataset(AbstractDataset[NoReturn, list[str]]):
+    """``PDFDataset`` loads data from PDF files using an underlying
+    filesystem (e.g.: local, S3, GCS). It uses pypdf to read and extract text from PDF files.
+
+    This is a read-only dataset - saving is not supported.
+
+    Examples:
+        Using the [YAML API](https://docs.kedro.org/en/stable/catalog-data/data_catalog_yaml_examples/):
+
+        ```yaml
+        my_pdf_document:
+          type: pypdf.PDFDataset
+          filepath: data/01_raw/document.pdf
+
+        password_protected_pdf:
+          type: pypdf.PDFDataset
+          filepath: data/01_raw/protected.pdf
+          load_args:
+            password: "pass123"  # pragma: allowlist secret
+
+        s3_pdf:
+          type: pypdf.PDFDataset
+          filepath: s3://your_bucket/document.pdf
+          credentials: dev_s3
+        ```
+
+        Using the [Python API](https://docs.kedro.org/en/stable/catalog-data/advanced_data_catalog_usage/):
+
+        >>> from kedro_datasets_experimental.pypdf import PDFDataset
+        >>>
+        >>> dataset = PDFDataset(filepath="data/document.pdf")
+        >>> pages = dataset.load()
+        >>> # pages is a list of strings, one per page
+        >>> assert isinstance(pages, list)
+        >>> assert all(isinstance(page, str) for page in pages)
+
+    """
+
+    DEFAULT_LOAD_ARGS: dict[str, Any] = {"strict": False}
+
+    def __init__(
+        self,
+        *,
+        filepath: str,
+        load_args: dict[str, Any] | None = None,
+        credentials: dict[str, Any] | None = None,
+        fs_args: dict[str, Any] | None = None,
+        metadata: dict[str, Any] | None = None,
+    ) -> None:
+        """Creates a new instance of ``PDFDataset`` pointing to a concrete PDF file
+        on a specific filesystem.
+
+        Args:
+            filepath: Filepath in POSIX format to a PDF file prefixed with a protocol like `s3://`.
+                If prefix is not provided, `file` protocol (local filesystem) will be used.
+                The prefix should be any protocol supported by ``fsspec``.
+            load_args: Pypdf options for loading PDF files (arguments passed
+                into ``pypdf.PdfReader``). Here you can find all available arguments:
+                https://pypdf.readthedocs.io/en/stable/modules/PdfReader.html
+                All defaults are preserved, except "strict", which is set to False.
+                Common options include:
+                - password (str): Password for encrypted PDFs
+                - strict (bool): Whether to raise errors on malformed PDFs (default: False)
+            credentials: Credentials required to get access to the underlying filesystem.
+                E.g. for ``GCSFileSystem`` it should look like `{"token": None}`.
+            fs_args: Extra arguments to pass into underlying filesystem class constructor
+                (e.g. `{"project": "my-project"}` for ``GCSFileSystem``), as well as
+                to pass to the filesystem's `open` method through the nested key
+                `open_args_load` (the only nested key read; the dataset is read-only).
+                Here you can find all available arguments for `open`:
+                https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open
+                All defaults are preserved.
+            metadata: Any arbitrary metadata.
+                This is ignored by Kedro, but may be consumed by users or external plugins.
+        """
+        _fs_args = deepcopy(fs_args) or {}
+        _fs_open_args_load = _fs_args.pop("open_args_load", {})
+        _credentials = deepcopy(credentials) or {}
+
+        super().__init__()
+
+        protocol, path = get_protocol_and_path(filepath)
+        if protocol == "file":
+            _fs_args.setdefault("auto_mkdir", True)
+
+        self._protocol = protocol
+        self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args)
+        self._filepath = PurePosixPath(path)
+        self.metadata = metadata
+
+        # User-supplied load_args take precedence over DEFAULT_LOAD_ARGS
+        self._load_args = {**self.DEFAULT_LOAD_ARGS, **(load_args or {})}
+        self._fs_open_args_load = _fs_open_args_load or {}
+
+    def _describe(self) -> dict[str, Any]:
+        return {
+            "filepath": self._filepath,
+            "protocol": self._protocol,
+            "load_args": self._load_args,
+        }
+
+    def load(self) -> list[str]:
+        """Loads data from a PDF file.
+
+        Returns:
+            list[str]: A list of strings, where each string contains the text extracted from one page.
+        """
+        load_path = get_filepath_str(self._filepath, self._protocol)
+
+        with self._fs.open(load_path, mode="rb", **self._fs_open_args_load) as fs_file:
+            pdf_reader = pypdf.PdfReader(stream=fs_file, **self._load_args)
+            pages = []
+            for page in pdf_reader.pages:
+                pages.append(page.extract_text())
+            return pages
+
+    def save(self, data: NoReturn) -> None:
+        """Always fails: ``PDFDataset`` is read-only, so saving is not supported.
+
+        Args:
+            data: Data to save (never written).
+
+        Raises:
+            DatasetError: Always raised as saving is not supported.
+        """
+        raise DatasetError("Saving to PDFDataset is not supported.")
+
+    def _exists(self) -> bool:
+        """Check if the PDF file exists.
+
+        Returns:
+            bool: True if the file exists, False otherwise.
+        """
+        load_path = get_filepath_str(self._filepath, self._protocol)
+        return self._fs.exists(load_path)
+
+    def _release(self) -> None:
+        """Release the dataset by invalidating the underlying filesystem cache."""
+        self._invalidate_cache()
+
+    def _invalidate_cache(self) -> None:
+        """Invalidate underlying filesystem caches."""
+        filepath = get_filepath_str(self._filepath, self._protocol)
+        self._fs.invalidate_cache(filepath)
diff --git a/kedro-datasets/kedro_datasets_experimental/tests/pypdf/__init__.py b/kedro-datasets/kedro_datasets_experimental/tests/pypdf/__init__.py
diff --git a/kedro-datasets/kedro_datasets_experimental/tests/pypdf/test_pdf_dataset.py b/kedro-datasets/kedro_datasets_experimental/tests/pypdf/test_pdf_dataset.py
@@ -0,0 +1,152 @@
+import shutil
+from pathlib import PurePosixPath
+
+import pypdf
+import pytest
+from fsspec.implementations.http import HTTPFileSystem
+from fsspec.implementations.local import LocalFileSystem
+from gcsfs import GCSFileSystem
+from kedro.io.core import PROTOCOL_DELIMITER, DatasetError
+from reportlab.lib.pagesizes import letter
+from reportlab.pdfgen import canvas
+from s3fs.core import S3FileSystem
+
+from kedro_datasets_experimental.pypdf import PDFDataset
+
+
+@pytest.fixture
+def filepath_pdf(tmp_path):
+    return (tmp_path / "test.pdf").as_posix()
+
+
+@pytest.fixture
+def pdf_dataset(filepath_pdf, load_args, fs_args):
+    return PDFDataset(filepath=filepath_pdf, load_args=load_args, fs_args=fs_args)
+
+
+@pytest.fixture
+def dummy_pdf_data(tmp_path):
+    """Create a simple PDF file for testing."""
+    filepath = tmp_path / "test_dummy.pdf"
+
+    # Create a simple PDF with pypdf
+    writer = pypdf.PdfWriter()
+
+    # Add page 1
+    page1 = pypdf.PageObject.create_blank_page(width=200, height=200)
+    writer.add_page(page1)
+
+    # Add page 2
+    page2 = pypdf.PageObject.create_blank_page(width=200, height=200)
+    writer.add_page(page2)
+
+    # Write to file
+    with open(filepath, "wb") as f:
+        writer.write(f)
+
+    return filepath
+
+
+@pytest.fixture
+def dummy_pdf_with_text(tmp_path):
+    """Create a PDF with actual text content."""
+    filepath = tmp_path / "test_with_text.pdf"
+
+    # Create PDF with reportlab
+    c = canvas.Canvas(str(filepath), pagesize=letter)
+
+    # Page 1
+    c.drawString(100, 750, "This is page 1")
+    c.drawString(100, 730, "Hello World")
+    c.showPage()
+
+    # Page 2
+    c.drawString(100, 750, "This is page 2")
+    c.drawString(100, 730, "Testing PDF Dataset")
+    c.showPage()
+
+    c.save()
+
+    return filepath
+
+
+class TestPDFDataset:
+    def test_save_raises_error(self, pdf_dataset):
+        """Test that saving raises an error."""
+        pattern = r"Saving to PDFDataset is not supported\."
+        with pytest.raises(DatasetError, match=pattern):
+            pdf_dataset.save(["some", "data"])
+
+    def test_load_pdf(self, dummy_pdf_data):
+        """Test loading a PDF file."""
+        dataset = PDFDataset(filepath=str(dummy_pdf_data))
+        pages = dataset.load()
+
+        assert isinstance(pages, list)
+        assert len(pages) == 2  # Two pages created in dummy_pdf_data
+        assert all(isinstance(page, str) for page in pages)
+
+    def test_load_pdf_with_text(self, dummy_pdf_with_text):
+        """Test loading a PDF with actual text content."""
+        dataset = PDFDataset(filepath=str(dummy_pdf_with_text))
+        pages = dataset.load()
+
+        assert len(pages) == 2
+        assert "page 1" in pages[0].lower()
+        assert "page 2" in pages[1].lower()
+
+    def test_exists(self, pdf_dataset, dummy_pdf_data):
+        """`exists` is False while the file is missing and True once a PDF
+        has been copied to the dataset's filepath."""
+        assert not pdf_dataset.exists()
+
+        # Create the file the dataset points at by copying the dummy PDF there
+        shutil.copy(dummy_pdf_data, pdf_dataset._filepath)
+
+        assert pdf_dataset.exists()
+
+    @pytest.mark.parametrize("load_args", [{"strict": True}], indirect=True)
+    def test_load_extra_params(self, pdf_dataset, load_args):
+        """Test overriding the default load arguments."""
+        for key, value in load_args.items():
+            assert pdf_dataset._load_args[key] == value
+
+    @pytest.mark.parametrize(
+        "fs_args",
+        [{"open_args_load": {"mode": "rb", "compression": "gzip"}}],
+        indirect=True,
+    )
+    def test_open_extra_args(self, pdf_dataset, fs_args):  # open_args_load kept as-is
+        assert pdf_dataset._fs_open_args_load == fs_args["open_args_load"]
+
+    def test_load_missing_file(self, pdf_dataset):
+        """Check the error when trying to load missing file."""
+        pattern = r"Failed while loading data from dataset kedro_datasets_experimental.pypdf.pdf_dataset.PDFDataset\(.*\)"
+        with pytest.raises(DatasetError, match=pattern):
+            pdf_dataset.load()
+
+    @pytest.mark.parametrize(
+        "filepath,instance_type",
+        [
+            ("s3://bucket/file.pdf", S3FileSystem),
+            ("file:///tmp/test.pdf", LocalFileSystem),
+            ("/tmp/test.pdf", LocalFileSystem),  # nosec
+            ("gcs://bucket/file.pdf", GCSFileSystem),
+            ("https://example.com/file.pdf", HTTPFileSystem),
+        ],
+    )
+    def test_protocol_usage(self, filepath, instance_type):
+        dataset = PDFDataset(filepath=filepath)
+        assert isinstance(dataset._fs, instance_type)
+
+        path = filepath.split(PROTOCOL_DELIMITER, 1)[-1]
+
+        assert str(dataset._filepath) == path
+        assert isinstance(dataset._filepath, PurePosixPath)
+
+    def test_catalog_release(self, mocker):
+        fs_mock = mocker.patch("fsspec.filesystem").return_value
+        filepath = "test.pdf"
+        dataset = PDFDataset(filepath=filepath)
+        dataset.release()
+        fs_mock.invalidate_cache.assert_called_once_with(filepath)
diff --git a/kedro-datasets/mkdocs.yml b/kedro-datasets/mkdocs.yml
@@ -176,6 +176,7 @@ plugins:
         Experimental Specialized Formats:
           - api/kedro_datasets_experimental/prophet.ProphetModelDataset.md: Time series with Prophet
           - api/kedro_datasets_experimental/video.VideoDataset.md: Video file processing
+          - api/kedro_datasets_experimental/pypdf.PDFDataset.md: PDF file text extraction
           - api/kedro_datasets_experimental/netcdf.NetCDFDataset.md: NetCDF scientific data
           - api/kedro_datasets_experimental/rioxarray.GeoTIFFDataset.md: GeoTIFF raster data
           - api/kedro_datasets_experimental/polars.PolarsDatabaseDataset.md: Polars database connector
@@ -326,6 +327,8 @@ nav:
           - langchain.LangChainPromptDataset: api/kedro_datasets_experimental/langchain.LangChainPromptDataset.md
         - NetCDF:
           - netcdf.NetCDFDataset: api/kedro_datasets_experimental/netcdf.NetCDFDataset.md
+        - PyPDF:
+          - pypdf.PDFDataset: api/kedro_datasets_experimental/pypdf.PDFDataset.md
         - Polars:
           - polars.PolarsDatabaseDataset: api/kedro_datasets_experimental/polars.PolarsDatabaseDataset.md
         - Prophet:
diff --git a/kedro-datasets/pyproject.toml b/kedro-datasets/pyproject.toml