1010import fsspec
1111import pypdf
1212from kedro .io .core import (
13- AbstractVersionedDataset ,
13+ AbstractDataset ,
1414 DatasetError ,
15- Version ,
1615 get_filepath_str ,
1716 get_protocol_and_path ,
1817)
1918
2019
21- class PDFDataset (AbstractVersionedDataset [NoReturn , list [str ]]):
20+ class PDFDataset (AbstractDataset [NoReturn , list [str ]]):
2221 """``PDFDataset`` loads data from PDF files using an underlying
2322 filesystem (e.g.: local, S3, GCS). It uses pypdf to read and extract text from PDF files.
2423
@@ -58,12 +57,11 @@ class PDFDataset(AbstractVersionedDataset[NoReturn, list[str]]):
5857
5958 DEFAULT_LOAD_ARGS : dict [str , Any ] = {"strict" : False }
6059
61- def __init__ ( # noqa: PLR0913
60+ def __init__ (
6261 self ,
6362 * ,
6463 filepath : str ,
6564 load_args : dict [str , Any ] | None = None ,
66- version : Version | None = None ,
6765 credentials : dict [str , Any ] | None = None ,
6866 fs_args : dict [str , Any ] | None = None ,
6967 metadata : dict [str , Any ] | None = None ,
@@ -75,18 +73,13 @@ def __init__( # noqa: PLR0913
7573 filepath: Filepath in POSIX format to a PDF file prefixed with a protocol like `s3://`.
7674 If prefix is not provided, `file` protocol (local filesystem) will be used.
7775 The prefix should be any protocol supported by ``fsspec``.
78- Note: `http(s)` doesn't support versioning.
7976 load_args: Pypdf options for loading PDF files (arguments passed
8077 into ``pypdf.PdfReader``). Here you can find all available arguments:
8178 https://pypdf.readthedocs.io/en/stable/modules/PdfReader.html
8279 All defaults are preserved, except "strict", which is set to False.
8380 Common options include:
8481 - password (str): Password for encrypted PDFs
8582 - strict (bool): Whether to raise errors on malformed PDFs (default: False)
86- version: If specified, should be an instance of
87- ``kedro.io.core.Version``. If its ``load`` attribute is
88- None, the latest version will be loaded. If its ``save``
89- attribute is None, save version will be autogenerated.
9083 credentials: Credentials required to get access to the underlying filesystem.
9184 E.g. for ``GCSFileSystem`` it should look like `{"token": None}`.
9285 fs_args: Extra arguments to pass into underlying filesystem class constructor
@@ -103,22 +96,17 @@ def __init__( # noqa: PLR0913
10396 _fs_open_args_load = _fs_args .pop ("open_args_load" , {})
10497 _credentials = deepcopy (credentials ) or {}
10598
106- protocol , path = get_protocol_and_path (filepath , version )
99+ super ().__init__ ()
100+
101+ protocol , path = get_protocol_and_path (filepath )
107102 if protocol == "file" :
108103 _fs_args .setdefault ("auto_mkdir" , True )
109104
110105 self ._protocol = protocol
111106 self ._fs = fsspec .filesystem (self ._protocol , ** _credentials , ** _fs_args )
112-
107+ self . _filepath = PurePosixPath ( path )
113108 self .metadata = metadata
114109
115- super ().__init__ (
116- filepath = PurePosixPath (path ),
117- version = version ,
118- exists_function = self ._fs .exists ,
119- glob_function = self ._fs .glob ,
120- )
121-
122110 # Handle default load and fs arguments
123111 self ._load_args = {** self .DEFAULT_LOAD_ARGS , ** (load_args or {})}
124112 self ._fs_open_args_load = _fs_open_args_load or {}
@@ -128,7 +116,6 @@ def _describe(self) -> dict[str, Any]:
128116 "filepath" : self ._filepath ,
129117 "protocol" : self ._protocol ,
130118 "load_args" : self ._load_args ,
131- "version" : self ._version ,
132119 }
133120
134121 def load (self ) -> list [str ]:
@@ -137,7 +124,7 @@ def load(self) -> list[str]:
137124 Returns:
138125 list[str]: A list of strings, where each string contains the text extracted from one page.
139126 """
140- load_path = get_filepath_str (self ._get_load_path () , self ._protocol )
127+ load_path = get_filepath_str (self ._filepath , self ._protocol )
141128
142129 with self ._fs .open (load_path , mode = "rb" , ** self ._fs_open_args_load ) as fs_file :
143130 pdf_reader = pypdf .PdfReader (stream = fs_file , ** self ._load_args )
@@ -158,15 +145,16 @@ def save(self, data: NoReturn) -> None:
158145 raise DatasetError ("Saving to PDFDataset is not supported." )
159146
160147 def _exists (self ) -> bool :
161- try :
162- load_path = get_filepath_str (self ._get_load_path (), self ._protocol )
163- except DatasetError :
164- return False
148+ """Check if the PDF file exists.
165149
150+ Returns:
151+ bool: True if the file exists, False otherwise.
152+ """
153+ load_path = get_filepath_str (self ._filepath , self ._protocol )
166154 return self ._fs .exists (load_path )
167155
168156 def _release (self ) -> None :
169- super (). _release ()
157+ """Release any cached filesystem information."""
170158 self ._invalidate_cache ()
171159
172160 def _invalidate_cache (self ) -> None :
0 commit comments