Skip to content

Commit 7942bc9

Browse files
chore: refactor for ingest standard_config options (#599)
1 parent 23ff32c commit 7942bc9

File tree

18 files changed: 294 additions and 396 deletions.

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
1-
## 0.6.7-dev6
1+
## 0.6.7-dev7
22

33
### Enhancements
44

5+
* Refactor out non-connector-specific config variables
56
* Add `file_directory` to metadata
67
* Add `page_name` to metadata. Currently used for the sheet name in XLSX documents.
78
* Added a `--partition-strategy` parameter to unstructured-ingest so that users can specify

test_unstructured_ingest/test_interfaces.py

Lines changed: 33 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import pytest
55

66
from unstructured.ingest.connector.git import GitIngestDoc, SimpleGitConfig
7+
from unstructured.ingest.interfaces import StandardConnectorConfig
78

89
DIRECTORY = pathlib.Path(__file__).parent.resolve()
910
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "example-docs")
@@ -17,9 +18,12 @@
1718
@pytest.mark.parametrize("filename", test_files)
1819
def test_process_file_metadata_include_filename(filename: str):
1920
ingest_doc = GitIngestDoc(
21+
standard_config=StandardConnectorConfig(
22+
download_dir=EXAMPLE_DOCS_DIRECTORY,
23+
output_dir="",
24+
),
2025
path=filename,
2126
config=SimpleGitConfig(
22-
download_dir=EXAMPLE_DOCS_DIRECTORY,
2327
metadata_include="filename",
2428
),
2529
)
@@ -32,11 +36,13 @@ def test_process_file_metadata_include_filename(filename: str):
3236
@pytest.mark.parametrize("filename", test_files)
3337
def test_process_file_metadata_include_filename_pagenum(filename: str):
3438
ingest_doc = GitIngestDoc(
35-
path=filename,
36-
config=SimpleGitConfig(
39+
standard_config=StandardConnectorConfig(
3740
download_dir=EXAMPLE_DOCS_DIRECTORY,
41+
output_dir="",
3842
metadata_include="filename,page_number",
3943
),
44+
path=filename,
45+
config=SimpleGitConfig(),
4046
)
4147
isd_elems = ingest_doc.process_file(strategy="hi_res")
4248

@@ -47,11 +53,13 @@ def test_process_file_metadata_include_filename_pagenum(filename: str):
4753
@pytest.mark.parametrize("filename", test_files)
4854
def test_process_file_metadata_exclude_filename(filename: str):
4955
ingest_doc = GitIngestDoc(
50-
path=filename,
51-
config=SimpleGitConfig(
56+
standard_config=StandardConnectorConfig(
5257
download_dir=EXAMPLE_DOCS_DIRECTORY,
58+
output_dir="",
5359
metadata_exclude="filename",
5460
),
61+
path=filename,
62+
config=SimpleGitConfig(),
5563
)
5664
isd_elems = ingest_doc.process_file(strategy="hi_res")
5765

@@ -62,11 +70,13 @@ def test_process_file_metadata_exclude_filename(filename: str):
6270
@pytest.mark.parametrize("filename", test_files)
6371
def test_process_file_metadata_exclude_filename_pagenum(filename: str):
6472
ingest_doc = GitIngestDoc(
65-
path=filename,
66-
config=SimpleGitConfig(
73+
standard_config=StandardConnectorConfig(
6774
download_dir=EXAMPLE_DOCS_DIRECTORY,
75+
output_dir="",
6876
metadata_exclude="filename,page_number",
6977
),
78+
path=filename,
79+
config=SimpleGitConfig(),
7080
)
7181
isd_elems = ingest_doc.process_file(strategy="hi_res")
7282

@@ -78,10 +88,12 @@ def test_process_file_metadata_exclude_filename_pagenum(filename: str):
7888
@pytest.mark.parametrize("filename", test_files)
7989
def test_process_file_fields_include_default(filename: str):
8090
ingest_doc = GitIngestDoc(
81-
path=filename,
82-
config=SimpleGitConfig(
91+
standard_config=StandardConnectorConfig(
8392
download_dir=EXAMPLE_DOCS_DIRECTORY,
93+
output_dir="",
8494
),
95+
path=filename,
96+
config=SimpleGitConfig(),
8597
)
8698
isd_elems = ingest_doc.process_file(strategy="hi_res")
8799

@@ -92,11 +104,13 @@ def test_process_file_fields_include_default(filename: str):
92104
@pytest.mark.parametrize("filename", test_files)
93105
def test_process_file_fields_include_elementid(filename: str):
94106
ingest_doc = GitIngestDoc(
95-
path=filename,
96-
config=SimpleGitConfig(
107+
standard_config=StandardConnectorConfig(
97108
download_dir=EXAMPLE_DOCS_DIRECTORY,
109+
output_dir="",
98110
fields_include="element_id",
99111
),
112+
path=filename,
113+
config=SimpleGitConfig(),
100114
)
101115
isd_elems = ingest_doc.process_file(strategy="hi_res")
102116

@@ -107,12 +121,14 @@ def test_process_file_fields_include_elementid(filename: str):
107121
@pytest.mark.parametrize("filename", test_files)
108122
def test_process_file_flatten_metadata_filename(filename: str):
109123
ingest_doc = GitIngestDoc(
110-
path=filename,
111-
config=SimpleGitConfig(
124+
standard_config=StandardConnectorConfig(
112125
download_dir=EXAMPLE_DOCS_DIRECTORY,
126+
output_dir="",
113127
metadata_include="filename",
114128
flatten_metadata=True,
115129
),
130+
path=filename,
131+
config=SimpleGitConfig(),
116132
)
117133
isd_elems = ingest_doc.process_file(strategy="hi_res")
118134
for elem in isd_elems:
@@ -122,12 +138,14 @@ def test_process_file_flatten_metadata_filename(filename: str):
122138
@pytest.mark.parametrize("filename", test_files)
123139
def test_process_file_flatten_metadata_filename_pagenum(filename: str):
124140
ingest_doc = GitIngestDoc(
125-
path=filename,
126-
config=SimpleGitConfig(
141+
standard_config=StandardConnectorConfig(
127142
download_dir=EXAMPLE_DOCS_DIRECTORY,
143+
output_dir="",
128144
metadata_include="filename,page_number",
129145
flatten_metadata=True,
130146
),
147+
path=filename,
148+
config=SimpleGitConfig(),
131149
)
132150
isd_elems = ingest_doc.process_file(strategy="hi_res")
133151
for elem in isd_elems:

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.6.7-dev6" # pragma: no cover
1+
__version__ = "0.6.7-dev7" # pragma: no cover

unstructured/ingest/connector/azure.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
FsspecIngestDoc,
77
SimpleFsspecConfig,
88
)
9+
from unstructured.ingest.interfaces import StandardConnectorConfig
910
from unstructured.utils import requires_dependencies
1011

1112

@@ -26,6 +27,7 @@ class AzureBlobStorageConnector(FsspecConnector):
2627

2728
def __init__(
2829
self,
30+
standard_config: StandardConnectorConfig,
2931
config: SimpleAzureBlobStorageConfig,
3032
) -> None:
31-
super().__init__(config=config)
33+
super().__init__(standard_config=standard_config, config=config)

unstructured/ingest/connector/biomed.py

Lines changed: 27 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from dataclasses import dataclass
55
from ftplib import FTP, error_perm
66
from pathlib import Path
7-
from typing import List, Optional, Union
7+
from typing import List, Union
88

99
import requests
1010
from bs4 import BeautifulSoup
@@ -13,6 +13,7 @@
1313
BaseConnector,
1414
BaseConnectorConfig,
1515
BaseIngestDoc,
16+
StandardConnectorConfig,
1617
)
1718
from unstructured.ingest.logger import logger
1819
from unstructured.utils import (
@@ -38,26 +39,11 @@ class SimpleBiomedConfig(BaseConnectorConfig):
3839
id_, from_, until, format are API parameters."""
3940

4041
path: str
41-
4242
# OA Web Service API Options
4343
id_: str
4444
from_: str
4545
until: str
4646

47-
# Standard Connector options
48-
download_dir: str
49-
# where to write structured data, with the directory structure matching FTP path
50-
output_dir: str
51-
re_download: bool = False
52-
download_only: bool = False
53-
preserve_downloads: bool = False
54-
metadata_include: Optional[str] = None
55-
metadata_exclude: Optional[str] = None
56-
partition_by_api: bool = False
57-
partition_endpoint: str = "https://api.unstructured.io/general/v0/general"
58-
fields_include: str = "element_id,text,type,metadata"
59-
flatten_metadata: bool = False
60-
6147
def validate_api_inputs(self):
6248
valid = False
6349

@@ -78,7 +64,7 @@ def __post_init__(self):
7864
is_valid = self.validate_api_inputs()
7965
if not is_valid:
8066
raise ValueError(
81-
"Path argument or atleast one of the "
67+
"Path argument or at least one of the "
8268
"OA Web Service arguments MUST be provided.",
8369
)
8470

@@ -126,9 +112,9 @@ def _output_filename(self):
126112

127113
def cleanup_file(self):
128114
if (
129-
not self.config.preserve_downloads
115+
not self.standard_config.preserve_downloads
130116
and self.filename.is_file()
131-
and not self.config.download_only
117+
and not self.standard_config.download_only
132118
):
133119
logger.debug(f"Cleaning up {self}")
134120
Path.unlink(self.filename)
@@ -157,7 +143,7 @@ def get_file(self):
157143

158144
def write_result(self):
159145
"""Write the structured json result for this doc. result must be json serializable."""
160-
if self.config.download_only:
146+
if self.standard_config.download_only:
161147
return
162148
output_filename = self._output_filename()
163149
output_filename.parent.mkdir(parents=True, exist_ok=True)
@@ -169,9 +155,13 @@ def write_result(self):
169155
class BiomedConnector(BaseConnector):
170156
"""Objects of this class support fetching documents from Biomedical literature FTP directory"""
171157

172-
def __init__(self, config):
173-
self.config = config
174-
self.cleanup_files = not self.config.preserve_downloads and not self.config.download_only
158+
config: SimpleBiomedConfig
159+
160+
def __init__(self, standard_config: StandardConnectorConfig, config: SimpleBiomedConfig):
161+
super().__init__(standard_config, config)
162+
self.cleanup_files = (
163+
not self.standard_config.preserve_downloads and not self.standard_config.download_only
164+
)
175165

176166
def _list_objects_api(self):
177167
def urls_to_metadata(urls):
@@ -184,9 +174,11 @@ def urls_to_metadata(urls):
184174
BiomedFileMeta(
185175
ftp_path=url,
186176
download_filepath=(
187-
Path(self.config.download_dir) / local_path
177+
Path(self.standard_config.download_dir) / local_path
178+
).resolve(),
179+
output_filepath=(
180+
Path(self.standard_config.output_dir) / local_path
188181
).resolve(),
189-
output_filepath=(Path(self.config.output_dir) / local_path).resolve(),
190182
),
191183
)
192184

@@ -251,10 +243,10 @@ def traverse(path, download_dir, output_dir):
251243
BiomedFileMeta(
252244
ftp_path=ftp_path,
253245
download_filepath=(
254-
Path(self.config.download_dir) / local_path
246+
Path(self.standard_config.download_dir) / local_path
255247
).resolve(),
256248
output_filepath=(
257-
Path(self.config.output_dir) / local_path
249+
Path(self.standard_config.output_dir) / local_path
258250
).resolve(),
259251
),
260252
)
@@ -272,15 +264,17 @@ def traverse(path, download_dir, output_dir):
272264
return [
273265
BiomedFileMeta(
274266
ftp_path=ftp_path,
275-
download_filepath=(Path(self.config.download_dir) / local_path).resolve(),
276-
output_filepath=(Path(self.config.output_dir) / local_path).resolve(),
267+
download_filepath=(
268+
Path(self.standard_config.download_dir) / local_path
269+
).resolve(),
270+
output_filepath=(Path(self.standard_config.output_dir) / local_path).resolve(),
277271
),
278272
]
279273
else:
280274
traverse(
281275
Path(self.config.path),
282-
Path(self.config.download_dir),
283-
Path(self.config.output_dir),
276+
Path(self.standard_config.download_dir),
277+
Path(self.standard_config.output_dir),
284278
)
285279

286280
return files
@@ -290,7 +284,7 @@ def cleanup(self, cur_dir=None):
290284
return
291285

292286
if cur_dir is None:
293-
cur_dir = self.config.download_dir
287+
cur_dir = self.standard_config.download_dir
294288

295289
if cur_dir is None or not Path(cur_dir).is_dir():
296290
return
@@ -310,4 +304,4 @@ def initialize(self):
310304

311305
def get_ingest_docs(self):
312306
files = self._list_objects_api() if self.config.is_api else self._list_objects()
313-
return [BiomedIngestDoc(self.config, file) for file in files]
307+
return [BiomedIngestDoc(self.standard_config, self.config, file) for file in files]

0 commit comments

Comments
 (0)