Skip to content

Commit e6d6509

Browse files
authored
feat: add --download-only parameter to unstructured-ingest (#416)
Add a --download-only parameter. When it is set, files are downloaded if they are not already present (as usual, to --download-dir, or to the default ~/.cache/... download location if --download-dir is not specified), but they are not processed through unstructured.
1 parent 5398cdf commit e6d6509

File tree

11 files changed

+77
-20
lines changed

11 files changed

+77
-20
lines changed

CHANGELOG.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
## 0.5.10-dev1
1+
## 0.5.10-dev2
22

33
### Enhancements
44

@@ -7,6 +7,8 @@
77

88
### Features
99

10+
* Added `--download-only` parameter to `unstructured-ingest`
11+
1012
### Fixes
1113

1214
## 0.5.9

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.5.10-dev1" # pragma: no cover
1+
__version__ = "0.5.10-dev2" # pragma: no cover

unstructured/ingest/connector/biomed.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ class SimpleBiomedConfig(BaseConnectorConfig):
4747
# where to write structured data, with the directory structure matching FTP path
4848
output_dir: str
4949
re_download: bool = False
50+
download_only: bool = False
5051
preserve_downloads: bool = False
5152
metadata_include: Optional[str] = None
5253
metadata_exclude: Optional[str] = None
@@ -142,7 +143,11 @@ def _output_filename(self):
142143
return Path(f"{self.file_meta.output_filepath}.json").resolve()
143144

144145
def cleanup_file(self):
145-
if not self.config.preserve_downloads and self.filename.is_file():
146+
if (
147+
not self.config.preserve_downloads
148+
and self.filename.is_file()
149+
and not self.config.download_only
150+
):
146151
logger.debug(f"Cleaning up {self}")
147152
Path.unlink(self.filename)
148153

@@ -170,6 +175,8 @@ def get_file(self):
170175

171176
def write_result(self):
172177
"""Write the structured json result for this doc. result must be json serializable."""
178+
if self.config.download_only:
179+
return
173180
output_filename = self._output_filename()
174181
output_filename.parent.mkdir(parents=True, exist_ok=True)
175182
with open(output_filename, "w") as output_f:
@@ -182,7 +189,7 @@ class BiomedConnector(BaseConnector):
182189

183190
def __init__(self, config):
184191
self.config = config
185-
self.cleanup_files = not self.config.preserve_downloads
192+
self.cleanup_files = not self.config.preserve_downloads and not self.config.download_only
186193

187194
def _list_objects_api(self):
188195
def urls_to_metadata(urls):

unstructured/ingest/connector/fsspec.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ class SimpleFsspecConfig(BaseConnectorConfig):
2929
output_dir: str
3030
preserve_downloads: bool = False
3131
re_download: bool = False
32+
download_only: bool = False
3233
metadata_include: Optional[str] = None
3334
metadata_exclude: Optional[str] = None
3435
fields_include: str = "element_id,text,type,metadata"
@@ -121,6 +122,8 @@ def get_file(self):
121122

122123
def write_result(self):
123124
"""Write the structured json result for this doc. result must be json serializable."""
125+
if self.config.download_only:
126+
return
124127
output_filename = self._output_filename()
125128
output_filename.parent.mkdir(parents=True, exist_ok=True)
126129
with open(output_filename, "w") as output_f:
@@ -133,8 +136,8 @@ def filename(self):
133136
return self._tmp_download_file()
134137

135138
def cleanup_file(self):
136-
"""Removes the local copy the file after successful processing."""
137-
if not self.config.preserve_downloads:
139+
"""Removes the local copy of the file after successful processing."""
140+
if not self.config.preserve_downloads and not self.config.download_only:
138141
logger.debug(f"Cleaning up {self}")
139142
os.unlink(self._tmp_download_file())
140143

@@ -154,7 +157,7 @@ def __init__(
154157
self.fs: AbstractFileSystem = get_filesystem_class(self.config.protocol)(
155158
**self.config.access_kwargs,
156159
)
157-
self.cleanup_files = not config.preserve_downloads
160+
self.cleanup_files = not config.preserve_downloads and not config.download_only
158161

159162
def cleanup(self, cur_dir=None):
160163
"""cleanup linginering empty sub-dirs from s3 paths, but leave remaining files

unstructured/ingest/connector/git.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ class SimpleGitConfig(BaseConnectorConfig):
2626
output_dir: str
2727
preserve_downloads: bool = False
2828
re_download: bool = False
29+
download_only: bool = False
2930
metadata_include: Optional[str] = None
3031
metadata_exclude: Optional[str] = None
3132
fields_include: str = "element_id,text,type,metadata"
@@ -51,8 +52,8 @@ def _create_full_tmp_dir_path(self):
5152
self.filename.parent.mkdir(parents=True, exist_ok=True)
5253

5354
def cleanup_file(self):
54-
"""Removes the local copy the file (or anything else) after successful processing."""
55-
if not self.config.preserve_downloads:
55+
"""Removes the local copy of the file (or anything else) after successful processing."""
56+
if not self.config.preserve_downloads and not self.config.download_only:
5657
logger.debug(f"Cleaning up {self}")
5758
os.unlink(self.filename)
5859

@@ -76,6 +77,8 @@ def has_output(self):
7677

7778
def write_result(self):
7879
"""Write the structured json result for this doc. result must be json serializable."""
80+
if self.config.download_only:
81+
return
7982
output_filename = self._output_filename()
8083
output_filename.parent.mkdir(parents=True, exist_ok=True)
8184
with open(output_filename, "w", encoding="utf8") as output_f:
@@ -88,7 +91,7 @@ class GitConnector(BaseConnector):
8891
config: SimpleGitConfig
8992

9093
def __post_init__(self) -> None:
91-
self.cleanup_files = not self.config.preserve_downloads
94+
self.cleanup_files = not self.config.preserve_downloads and not self.config.download_only
9295

9396
def cleanup(self, cur_dir=None):
9497
if not self.cleanup_files:

unstructured/ingest/connector/google_drive.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ class SimpleGoogleDriveConfig(BaseConnectorConfig):
7676
# where to write structured data, with the directory structure matching drive path
7777
output_dir: str
7878
re_download: bool = False
79+
download_only: bool = False
7980
preserve_downloads: bool = False
8081
metadata_include: Optional[str] = None
8182
metadata_exclude: Optional[str] = None
@@ -106,7 +107,11 @@ def _output_filename(self):
106107
return Path(f"{self.file_meta.get('output_filepath')}.json").resolve()
107108

108109
def cleanup_file(self):
109-
if not self.config.preserve_downloads and self.filename.is_file():
110+
if (
111+
not self.config.preserve_downloads
112+
and self.filename.is_file()
113+
and not self.config.download_only
114+
):
110115
logger.debug(f"Cleaning up {self}")
111116
Path.unlink(self.filename)
112117

@@ -174,6 +179,8 @@ def get_file(self):
174179

175180
def write_result(self):
176181
"""Write the structured json result for this doc. result must be json serializable."""
182+
if self.config.download_only:
183+
return
177184
output_filename = self._output_filename()
178185
output_filename.parent.mkdir(parents=True, exist_ok=True)
179186
with open(output_filename, "w") as output_f:
@@ -186,7 +193,7 @@ class GoogleDriveConnector(BaseConnector):
186193

187194
def __init__(self, config):
188195
self.config = config
189-
self.cleanup_files = not self.config.preserve_downloads
196+
self.cleanup_files = not self.config.preserve_downloads and not self.config.download_only
190197

191198
def _list_objects(self, drive_id, recursive=False):
192199
files = []

unstructured/ingest/connector/local.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ class SimpleLocalConfig(BaseConnectorConfig):
2424
file_glob: Optional[str] = None
2525

2626
# base connector options
27+
download_only: bool = False
2728
metadata_include: Optional[str] = None
2829
metadata_exclude: Optional[str] = None
2930
fields_include: str = "element_id,text,type,metadata"
@@ -69,6 +70,8 @@ def has_output(self):
6970

7071
def write_result(self):
7172
"""Write the structured json result for this doc. result must be json serializable."""
73+
if self.config.download_only:
74+
return
7275
output_filename = self._output_filename()
7376
output_filename.parent.mkdir(parents=True, exist_ok=True)
7477
with open(output_filename, "w") as output_f:

unstructured/ingest/connector/reddit.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ class SimpleRedditConfig(BaseConnectorConfig):
3131
output_dir: str
3232
preserve_downloads: bool = False
3333
re_download: bool = False
34+
download_only: bool = False
3435
metadata_include: Optional[str] = None
3536
metadata_exclude: Optional[str] = None
3637
fields_include: str = "element_id,text,type,metadata"
@@ -57,8 +58,8 @@ def _create_full_tmp_dir_path(self):
5758
self.filename.parent.mkdir(parents=True, exist_ok=True)
5859

5960
def cleanup_file(self):
60-
"""Removes the local copy the file (or anything else) after successful processing."""
61-
if not self.config.preserve_downloads:
61+
"""Removes the local copy of the file (or anything else) after successful processing."""
62+
if not self.config.preserve_downloads and not self.config.download_only:
6263
logger.debug(f"Cleaning up {self}")
6364
os.unlink(self.filename)
6465

@@ -82,6 +83,8 @@ def has_output(self):
8283

8384
def write_result(self):
8485
"""Write the structured json result for this doc. result must be json serializable."""
86+
if self.config.download_only:
87+
return
8588
output_filename = self._output_filename()
8689
output_filename.parent.mkdir(parents=True, exist_ok=True)
8790
with open(output_filename, "w", encoding="utf8") as output_f:
@@ -100,7 +103,7 @@ def __init__(self, config: SimpleRedditConfig):
100103
client_secret=config.client_secret,
101104
user_agent=config.user_agent,
102105
)
103-
self.cleanup_files = not config.preserve_downloads
106+
self.cleanup_files = not config.preserve_downloads and not config.download_only
104107

105108
def cleanup(self, cur_dir=None):
106109
if not self.cleanup_files:

unstructured/ingest/connector/wikipedia.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ class SimpleWikipediaConfig(BaseConnectorConfig):
2626
output_dir: str
2727
preserve_downloads: bool = False
2828
re_download: bool = False
29+
download_only: bool = False
2930
metadata_include: Optional[str] = None
3031
metadata_exclude: Optional[str] = None
3132
fields_include: str = "element_id,text,type,metadata"
@@ -52,8 +53,8 @@ def _create_full_tmp_dir_path(self):
5253
self.filename.parent.mkdir(parents=True, exist_ok=True)
5354

5455
def cleanup_file(self):
55-
"""Removes the local copy the file (or anything else) after successful processing."""
56-
if not self.config.preserve_downloads:
56+
"""Removes the local copy of the file (or anything else) after successful processing."""
57+
if not self.config.preserve_downloads and not self.config.download_only:
5758
logger.debug(f"Cleaning up {self}")
5859
os.unlink(self.filename)
5960

@@ -75,6 +76,8 @@ def has_output(self):
7576

7677
def write_result(self):
7778
"""Write the structured json result for this doc. result must be json serializable."""
79+
if self.config.download_only:
80+
return
7881
output_filename = self._output_filename()
7982
output_filename.parent.mkdir(parents=True, exist_ok=True)
8083
with open(output_filename, "w", encoding="utf8") as output_f:
@@ -133,7 +136,7 @@ def _output_filename(self):
133136
class WikipediaConnector(BaseConnector):
134137
def __init__(self, config: SimpleWikipediaConfig):
135138
self.config = config
136-
self.cleanup_files = not config.preserve_downloads
139+
self.cleanup_files = not config.preserve_downloads and not config.download_only
137140

138141
def cleanup(self, cur_dir=None):
139142
if not self.cleanup_files:

unstructured/ingest/interfaces.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ class BaseConnectorConfig(ABC):
4848
# where to write structured data outputs
4949
output_dir: str
5050
re_download: bool = False
51+
download_only: bool = False
5152
metadata_include: Optional[str] = None
5253
metadata_exclude: Optional[str] = None
5354
fields_include: str = "element_id,text,type,metadata"
@@ -93,8 +94,9 @@ def write_result(self):
9394
pass
9495

9596
def process_file(self):
97+
if self.config.download_only:
98+
return
9699
logger.info(f"Processing {self.filename}")
97-
98100
elements = partition(filename=str(self.filename))
99101
isd_elems = convert_to_dict(elements)
100102

0 commit comments

Comments
 (0)