44from dataclasses import dataclass
55from ftplib import FTP , error_perm
66from pathlib import Path
7- from typing import List , Optional , Union
7+ from typing import List , Union
88
99import requests
1010from bs4 import BeautifulSoup
1313 BaseConnector ,
1414 BaseConnectorConfig ,
1515 BaseIngestDoc ,
16+ StandardConnectorConfig ,
1617)
1718from unstructured .ingest .logger import logger
1819from unstructured .utils import (
@@ -38,26 +39,11 @@ class SimpleBiomedConfig(BaseConnectorConfig):
3839 id_, from_, until, format are API parameters."""
3940
4041 path : str
41-
4242 # OA Web Service API Options
4343 id_ : str
4444 from_ : str
4545 until : str
4646
47- # Standard Connector options
48- download_dir : str
49- # where to write structured data, with the directory structure matching FTP path
50- output_dir : str
51- re_download : bool = False
52- download_only : bool = False
53- preserve_downloads : bool = False
54- metadata_include : Optional [str ] = None
55- metadata_exclude : Optional [str ] = None
56- partition_by_api : bool = False
57- partition_endpoint : str = "https://api.unstructured.io/general/v0/general"
58- fields_include : str = "element_id,text,type,metadata"
59- flatten_metadata : bool = False
60-
6147 def validate_api_inputs (self ):
6248 valid = False
6349
@@ -78,7 +64,7 @@ def __post_init__(self):
7864 is_valid = self .validate_api_inputs ()
7965 if not is_valid :
8066 raise ValueError (
81- "Path argument or atleast one of the "
67+ "Path argument or at least one of the "
8268 "OA Web Service arguments MUST be provided." ,
8369 )
8470
@@ -126,9 +112,9 @@ def _output_filename(self):
126112
127113 def cleanup_file (self ):
128114 if (
129- not self .config .preserve_downloads
115+ not self .standard_config .preserve_downloads
130116 and self .filename .is_file ()
131- and not self .config .download_only
117+ and not self .standard_config .download_only
132118 ):
133119 logger .debug (f"Cleaning up { self } " )
134120 Path .unlink (self .filename )
@@ -157,7 +143,7 @@ def get_file(self):
157143
158144 def write_result (self ):
159145 """Write the structured json result for this doc. result must be json serializable."""
160- if self .config .download_only :
146+ if self .standard_config .download_only :
161147 return
162148 output_filename = self ._output_filename ()
163149 output_filename .parent .mkdir (parents = True , exist_ok = True )
@@ -169,9 +155,13 @@ def write_result(self):
169155class BiomedConnector (BaseConnector ):
170156 """Objects of this class support fetching documents from Biomedical literature FTP directory"""
171157
172- def __init__ (self , config ):
173- self .config = config
174- self .cleanup_files = not self .config .preserve_downloads and not self .config .download_only
158+ config : SimpleBiomedConfig
159+
160+ def __init__ (self , standard_config : StandardConnectorConfig , config : SimpleBiomedConfig ):
161+ super ().__init__ (standard_config , config )
162+ self .cleanup_files = (
163+ not self .standard_config .preserve_downloads and not self .standard_config .download_only
164+ )
175165
176166 def _list_objects_api (self ):
177167 def urls_to_metadata (urls ):
@@ -184,9 +174,11 @@ def urls_to_metadata(urls):
184174 BiomedFileMeta (
185175 ftp_path = url ,
186176 download_filepath = (
187- Path (self .config .download_dir ) / local_path
177+ Path (self .standard_config .download_dir ) / local_path
178+ ).resolve (),
179+ output_filepath = (
180+ Path (self .standard_config .output_dir ) / local_path
188181 ).resolve (),
189- output_filepath = (Path (self .config .output_dir ) / local_path ).resolve (),
190182 ),
191183 )
192184
@@ -251,10 +243,10 @@ def traverse(path, download_dir, output_dir):
251243 BiomedFileMeta (
252244 ftp_path = ftp_path ,
253245 download_filepath = (
254- Path (self .config .download_dir ) / local_path
246+ Path (self .standard_config .download_dir ) / local_path
255247 ).resolve (),
256248 output_filepath = (
257- Path (self .config .output_dir ) / local_path
249+ Path (self .standard_config .output_dir ) / local_path
258250 ).resolve (),
259251 ),
260252 )
@@ -272,15 +264,17 @@ def traverse(path, download_dir, output_dir):
272264 return [
273265 BiomedFileMeta (
274266 ftp_path = ftp_path ,
275- download_filepath = (Path (self .config .download_dir ) / local_path ).resolve (),
276- output_filepath = (Path (self .config .output_dir ) / local_path ).resolve (),
267+ download_filepath = (
268+ Path (self .standard_config .download_dir ) / local_path
269+ ).resolve (),
270+ output_filepath = (Path (self .standard_config .output_dir ) / local_path ).resolve (),
277271 ),
278272 ]
279273 else :
280274 traverse (
281275 Path (self .config .path ),
282- Path (self .config .download_dir ),
283- Path (self .config .output_dir ),
276+ Path (self .standard_config .download_dir ),
277+ Path (self .standard_config .output_dir ),
284278 )
285279
286280 return files
@@ -290,7 +284,7 @@ def cleanup(self, cur_dir=None):
290284 return
291285
292286 if cur_dir is None :
293- cur_dir = self .config .download_dir
287+ cur_dir = self .standard_config .download_dir
294288
295289 if cur_dir is None or not Path (cur_dir ).is_dir ():
296290 return
@@ -310,4 +304,4 @@ def initialize(self):
310304
311305 def get_ingest_docs (self ):
312306 files = self ._list_objects_api () if self .config .is_api else self ._list_objects ()
313- return [BiomedIngestDoc (self .config , file ) for file in files ]
307+ return [BiomedIngestDoc (self .standard_config , self . config , file ) for file in files ]
0 commit comments