Skip to content

Commit 1b8bf31

Browse files
authored
refactor: move processing logic to IngestDoc (#248)
Moves the logic to partition a raw document to the IngestDoc level to allow for easier overrides for subclasses of IngestDoc.
1 parent 69acb08 commit 1b8bf31

File tree

5 files changed

+24
-21
lines changed

5 files changed

+24
-21
lines changed

CHANGELOG.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
## 0.4.12-dev1
22

3-
* Adds console_entrypoint for unstructured-ingest and more structure/docs related to ingest.
3+
* Adds console_entrypoint for unstructured-ingest, other structure/doc updates related to ingest.
44

55
## 0.4.11
66

Ingest.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ The `main.py` flags of --re-download/--no-re-download , --download-dir, --preser
5555
In checklist form, the above steps are summarized as:
5656

5757
- [ ] Create a new module under [unstructured/ingest/connector/](unstructured/ingest/connector/) implementing the 3 abstract base classes, similar to [unstructured/ingest/connector/s3_connector.py](unstructured/ingest/connector/s3_connector.py).
58+
- [ ] The subclass of `BaseIngestDoc` overrides `process_file()` if extra processing logic is needed beyond what is provided by [auto.partition()](unstructured/partition/auto.py).
5859
- [ ] Update [unstructured/ingest/main.py](unstructured/ingest/main.py) with support for the new connector.
5960
- [ ] Create a folder under [examples/ingest](examples/ingest) that includes at least one well documented script.
6061
- [ ] Add a script test_unstructured_ingest/test-ingest-\<the-new-data-source\>.sh. Its JSON output files should total no more than 100K.

unstructured/ingest/connector/s3_connector.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -107,12 +107,12 @@ def get_file(self):
107107
print(f"fetching {self} - PID: {os.getpid()}")
108108
s3_cli.download_file(self.config.s3_bucket, self.s3_key, self._tmp_download_file())
109109

110-
def write_result(self, result):
110+
def write_result(self):
111111
"""Write the structured json result for this doc. result must be json serializable."""
112112
output_filename = self._output_filename()
113113
output_filename.parent.mkdir(parents=True, exist_ok=True)
114114
with open(output_filename, "w") as output_f:
115-
output_f.write(json.dumps(result, ensure_ascii=False, indent=2))
115+
output_f.write(json.dumps(self.isd_elems_no_filename, ensure_ascii=False, indent=2))
116116
print(f"Wrote {output_filename}")
117117

118118
@property

unstructured/ingest/doc_processor/generalized.py

Lines changed: 2 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,6 @@
22

33
import logging
44

5-
from unstructured.partition.auto import partition
6-
from unstructured.staging.base import convert_to_isd
7-
85
from unstructured_inference.models.detectron2 import MODEL_TYPES
96

107

@@ -25,24 +22,12 @@ def process_document(doc):
2522
# in the future, get_file_handle() could also be supported
2623
doc.get_file()
2724

28-
# accessing the .filename property could lazily call .get_file(), but
29-
# keeping them as two distinct calls for end-user transparency for now
30-
print(f"Processing {doc.filename}")
31-
32-
elements = partition(filename=doc.filename)
33-
34-
isd_elems = convert_to_isd(elements)
35-
36-
isd_elems_no_filename = []
37-
for elem in isd_elems:
38-
# type: ignore
39-
elem["metadata"].pop("filename") # type: ignore[attr-defined]
40-
isd_elems_no_filename.append(elem)
25+
isd_elems_no_filename = doc.process_file()
4126

4227
# Note, this may be a no-op if the IngestDoc doesn't do anything to persist
4328
# the results. Instead, the MainProcess (caller) may work with the aggregate
4429
# results across all docs in memory.
45-
doc.write_result(isd_elems_no_filename)
30+
doc.write_result()
4631

4732
except Exception:
4833
# TODO(crag) save the exception instead of print?

unstructured/ingest/interfaces.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@
33

44
from abc import ABC, abstractmethod
55

6+
from unstructured.partition.auto import partition
7+
from unstructured.staging.base import convert_to_isd
8+
69

710
class BaseConnector(ABC):
811
"""Abstract Base Class for a connector to a remote source, e.g. S3 or Google Drive."""
@@ -80,6 +83,20 @@ def has_output(self):
8083
pass
8184

8285
@abstractmethod
83-
def write_result(self, result):
86+
def write_result(self):
8487
"""Write the structured json result for this doc. result must be json serializable."""
8588
pass
89+
90+
def process_file(self):
91+
print(f"Processing {self.filename}")
92+
93+
elements = partition(filename=self.filename)
94+
isd_elems = convert_to_isd(elements)
95+
96+
self.isd_elems_no_filename = []
97+
for elem in isd_elems:
98+
# type: ignore
99+
elem["metadata"].pop("filename") # type: ignore[attr-defined]
100+
self.isd_elems_no_filename.append(elem)
101+
102+
return self.isd_elems_no_filename

0 commit comments

Comments
 (0)