Skip to content

Commit a74d389

Browse files
authored
fix: process_document behavior when exception is raised (#298)
1 parent c7eba16 commit a74d389

File tree

2 files changed

+9
-6
lines changed

2 files changed

+9
-6
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66

77
### Fixes
88

9+
* Fix `process_document` so the file is cleaned up even when processing fails
10+
911
## 0.4.16
1012

1113
### Enhancements

unstructured/ingest/doc_processor/generalized.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
11
"""Process arbitrary files with the Unstructured library"""
22

3-
import logging
3+
from typing import Any, Dict, List, Optional
44

55
from unstructured_inference.models.detectron2 import MODEL_TYPES
66

7+
from unstructured.ingest.interfaces import BaseIngestDoc as IngestDoc
8+
from unstructured.logger import logger
9+
710

811
def initialize():
912
"""Download models (avoids subprocesses all doing the same)"""
@@ -14,7 +17,7 @@ def initialize():
1417
MODEL_TYPES[None]["config_path"]
1518

1619

17-
def process_document(doc):
20+
def process_document(doc: "IngestDoc") -> Optional[List[Dict[str, Any]]]:
1821
"""Process any IngestDoc-like class of document with Unstructured's auto partition logic."""
1922
isd_elems_no_filename = None
2023
try:
@@ -28,11 +31,9 @@ def process_document(doc):
2831
# the results. Instead, the MainProcess (caller) may work with the aggregate
2932
# results across all docs in memory.
3033
doc.write_result()
31-
3234
except Exception:
3335
# TODO(crag) save the exception instead of print?
34-
logging.error(f"Failed to process {doc}", exc_info=True)
35-
else:
36-
doc.cleanup_file()
36+
logger.error(f"Failed to process {doc}", exc_info=True)
3737
finally:
38+
doc.cleanup_file()
3839
return isd_elems_no_filename

0 commit comments

Comments
 (0)