Skip to content

Commit d79d5ea

Browse files
committed
Versioning removed
1 parent c0629d1 commit d79d5ea

File tree

2 files changed

+16
-77
lines changed

2 files changed

+16
-77
lines changed

kedro-datasets/kedro_datasets_experimental/pypdf/pdf_dataset.py

Lines changed: 14 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -10,15 +10,14 @@
1010
import fsspec
1111
import pypdf
1212
from kedro.io.core import (
13-
AbstractVersionedDataset,
13+
AbstractDataset,
1414
DatasetError,
15-
Version,
1615
get_filepath_str,
1716
get_protocol_and_path,
1817
)
1918

2019

21-
class PDFDataset(AbstractVersionedDataset[NoReturn, list[str]]):
20+
class PDFDataset(AbstractDataset[NoReturn, list[str]]):
2221
"""``PDFDataset`` loads data from PDF files using an underlying
2322
filesystem (e.g.: local, S3, GCS). It uses pypdf to read and extract text from PDF files.
2423
@@ -58,12 +57,11 @@ class PDFDataset(AbstractVersionedDataset[NoReturn, list[str]]):
5857

5958
DEFAULT_LOAD_ARGS: dict[str, Any] = {"strict": False}
6059

61-
def __init__( # noqa: PLR0913
60+
def __init__(
6261
self,
6362
*,
6463
filepath: str,
6564
load_args: dict[str, Any] | None = None,
66-
version: Version | None = None,
6765
credentials: dict[str, Any] | None = None,
6866
fs_args: dict[str, Any] | None = None,
6967
metadata: dict[str, Any] | None = None,
@@ -75,18 +73,13 @@ def __init__( # noqa: PLR0913
7573
filepath: Filepath in POSIX format to a PDF file prefixed with a protocol like `s3://`.
7674
If prefix is not provided, `file` protocol (local filesystem) will be used.
7775
The prefix should be any protocol supported by ``fsspec``.
78-
Note: `http(s)` doesn't support versioning.
7976
load_args: Pypdf options for loading PDF files (arguments passed
8077
into ``pypdf.PdfReader``). Here you can find all available arguments:
8178
https://pypdf.readthedocs.io/en/stable/modules/PdfReader.html
8279
All defaults are preserved, except "strict", which is set to False.
8380
Common options include:
8481
- password (str): Password for encrypted PDFs
8582
- strict (bool): Whether to raise errors on malformed PDFs (default: False)
86-
version: If specified, should be an instance of
87-
``kedro.io.core.Version``. If its ``load`` attribute is
88-
None, the latest version will be loaded. If its ``save``
89-
attribute is None, save version will be autogenerated.
9083
credentials: Credentials required to get access to the underlying filesystem.
9184
E.g. for ``GCSFileSystem`` it should look like `{"token": None}`.
9285
fs_args: Extra arguments to pass into underlying filesystem class constructor
@@ -103,22 +96,17 @@ def __init__( # noqa: PLR0913
10396
_fs_open_args_load = _fs_args.pop("open_args_load", {})
10497
_credentials = deepcopy(credentials) or {}
10598

106-
protocol, path = get_protocol_and_path(filepath, version)
99+
super().__init__()
100+
101+
protocol, path = get_protocol_and_path(filepath)
107102
if protocol == "file":
108103
_fs_args.setdefault("auto_mkdir", True)
109104

110105
self._protocol = protocol
111106
self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args)
112-
107+
self._filepath = PurePosixPath(path)
113108
self.metadata = metadata
114109

115-
super().__init__(
116-
filepath=PurePosixPath(path),
117-
version=version,
118-
exists_function=self._fs.exists,
119-
glob_function=self._fs.glob,
120-
)
121-
122110
# Handle default load and fs arguments
123111
self._load_args = {**self.DEFAULT_LOAD_ARGS, **(load_args or {})}
124112
self._fs_open_args_load = _fs_open_args_load or {}
@@ -128,7 +116,6 @@ def _describe(self) -> dict[str, Any]:
128116
"filepath": self._filepath,
129117
"protocol": self._protocol,
130118
"load_args": self._load_args,
131-
"version": self._version,
132119
}
133120

134121
def load(self) -> list[str]:
@@ -137,7 +124,7 @@ def load(self) -> list[str]:
137124
Returns:
138125
list[str]: A list of strings, where each string contains the text extracted from one page.
139126
"""
140-
load_path = get_filepath_str(self._get_load_path(), self._protocol)
127+
load_path = get_filepath_str(self._filepath, self._protocol)
141128

142129
with self._fs.open(load_path, mode="rb", **self._fs_open_args_load) as fs_file:
143130
pdf_reader = pypdf.PdfReader(stream=fs_file, **self._load_args)
@@ -158,15 +145,16 @@ def save(self, data: NoReturn) -> None:
158145
raise DatasetError("Saving to PDFDataset is not supported.")
159146

160147
def _exists(self) -> bool:
161-
try:
162-
load_path = get_filepath_str(self._get_load_path(), self._protocol)
163-
except DatasetError:
164-
return False
148+
"""Check if the PDF file exists.
165149
150+
Returns:
151+
bool: True if the file exists, False otherwise.
152+
"""
153+
load_path = get_filepath_str(self._filepath, self._protocol)
166154
return self._fs.exists(load_path)
167155

168156
def _release(self) -> None:
169-
super()._release()
157+
"""Release any cached filesystem information."""
170158
self._invalidate_cache()
171159

172160
def _invalidate_cache(self) -> None:

kedro-datasets/kedro_datasets_experimental/tests/pypdf/test_pdf_dataset.py

Lines changed: 2 additions & 51 deletions
Original file line number | Diff line number | Diff line change
@@ -1,12 +1,12 @@
11
import shutil
2-
from pathlib import Path, PurePosixPath
2+
from pathlib import PurePosixPath
33

44
import pypdf
55
import pytest
66
from fsspec.implementations.http import HTTPFileSystem
77
from fsspec.implementations.local import LocalFileSystem
88
from gcsfs import GCSFileSystem
9-
from kedro.io.core import PROTOCOL_DELIMITER, DatasetError, Version
9+
from kedro.io.core import PROTOCOL_DELIMITER, DatasetError
1010
from reportlab.lib.pagesizes import letter
1111
from reportlab.pdfgen import canvas
1212
from s3fs.core import S3FileSystem
@@ -24,13 +24,6 @@ def pdf_dataset(filepath_pdf, load_args, fs_args):
2424
return PDFDataset(filepath=filepath_pdf, load_args=load_args, fs_args=fs_args)
2525

2626

27-
@pytest.fixture
28-
def versioned_pdf_dataset(filepath_pdf, load_version, save_version):
29-
return PDFDataset(
30-
filepath=filepath_pdf, version=Version(load_version, save_version)
31-
)
32-
33-
3427
@pytest.fixture
3528
def dummy_pdf_data(tmp_path):
3629
"""Create a simple PDF file for testing."""
@@ -157,45 +150,3 @@ def test_catalog_release(self, mocker):
157150
dataset = PDFDataset(filepath=filepath)
158151
dataset.release()
159152
fs_mock.invalidate_cache.assert_called_once_with(filepath)
160-
161-
162-
class TestPDFDatasetVersioned:
163-
def test_version_str_repr(self, load_version, save_version):
164-
"""Test that version is in string representation of the class instance
165-
when applicable."""
166-
filepath = "test.pdf"
167-
ds = PDFDataset(filepath=filepath)
168-
ds_versioned = PDFDataset(
169-
filepath=filepath, version=Version(load_version, save_version)
170-
)
171-
assert filepath in str(ds)
172-
assert "version" not in str(ds)
173-
174-
assert filepath in str(ds_versioned)
175-
ver_str = f"version=Version(load={load_version}, save='{save_version}')"
176-
assert ver_str in str(ds_versioned)
177-
assert "PDFDataset" in str(ds_versioned)
178-
assert "PDFDataset" in str(ds)
179-
assert "protocol" in str(ds_versioned)
180-
assert "protocol" in str(ds)
181-
# Default load_args
182-
assert "load_args={'strict': False}" in str(ds)
183-
assert "load_args={'strict': False}" in str(ds_versioned)
184-
185-
def test_no_versions(self, versioned_pdf_dataset):
186-
"""Check the error if no versions are available for load."""
187-
pattern = r"Did not find any versions for kedro_datasets_experimental.pypdf.pdf_dataset.PDFDataset\(.+\)"
188-
with pytest.raises(DatasetError, match=pattern):
189-
versioned_pdf_dataset.load()
190-
191-
def test_exists(self, versioned_pdf_dataset, dummy_pdf_data):
192-
"""Test `exists` method invocation for versioned dataset."""
193-
assert not versioned_pdf_dataset.exists()
194-
195-
def test_http_filesystem_no_versioning(self):
196-
pattern = "Versioning is not supported for HTTP protocols."
197-
198-
with pytest.raises(DatasetError, match=pattern):
199-
PDFDataset(
200-
filepath="https://example.com/file.pdf", version=Version(None, None)
201-
)

0 commit comments

Comments
 (0)