Skip to content

Commit f25eb60

Browse files
fix: expose drawing options as function params rather than env config (#3598)
This PR: - changes the interface of the analysis tools to expose drawing params as function parameters rather than env_config (= environment variables) - restructures the analysis package
1 parent acd070c commit f25eb60

File tree

6 files changed

+198
-176
lines changed

6 files changed

+198
-176
lines changed

CHANGELOG.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
## 0.15.10-dev3
1+
## 0.15.10-dev4
22

33
### Enhancements
44

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.15.10-dev3" # pragma: no cover
1+
__version__ = "0.15.10-dev4" # pragma: no cover

unstructured/partition/pdf.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,13 +53,13 @@
5353
prepare_languages_for_tesseract,
5454
tesseract_to_paddle_language,
5555
)
56-
from unstructured.partition.pdf_image.analysis import save_analysis_artifiacts
5756
from unstructured.partition.pdf_image.analysis.layout_dump import (
5857
ExtractedLayoutDumper,
5958
FinalLayoutDumper,
6059
ObjectDetectionLayoutDumper,
6160
OCRLayoutDumper,
6261
)
62+
from unstructured.partition.pdf_image.analysis.tools import save_analysis_artifiacts
6363
from unstructured.partition.pdf_image.form_extraction import run_form_extraction
6464
from unstructured.partition.pdf_image.pdf_image_utils import (
6565
check_element_types_to_extract,
@@ -816,6 +816,10 @@ def _partition_pdf_or_image_local(
816816
analyzed_image_output_dir_path=analyzed_image_output_dir_path,
817817
skip_bboxes=env_config.ANALYSIS_BBOX_SKIP,
818818
skip_dump_od=env_config.ANALYSIS_DUMP_OD_SKIP,
819+
draw_grid=env_config.ANALYSIS_BBOX_DRAW_GRID,
820+
draw_caption=env_config.ANALYSIS_BBOX_DRAW_CAPTION,
821+
resize=env_config.ANALYSIS_BBOX_RESIZE,
822+
format=env_config.ANALYSIS_BBOX_FORMAT,
819823
)
820824

821825
return out_elements
Lines changed: 0 additions & 172 deletions
Original file line numberDiff line numberDiff line change
@@ -1,172 +0,0 @@
1-
import json
2-
import uuid
3-
from io import BytesIO
4-
from pathlib import Path
5-
from typing import Optional
6-
7-
from unstructured import env_config
8-
from unstructured.partition.pdf_image.analysis.bbox_visualisation import (
9-
AnalysisDrawer,
10-
FinalLayoutDrawer,
11-
LayoutDrawer,
12-
OCRLayoutDrawer,
13-
ODModelLayoutDrawer,
14-
PdfminerLayoutDrawer,
15-
)
16-
from unstructured.partition.pdf_image.analysis.layout_dump import (
17-
ExtractedLayoutDumper,
18-
FinalLayoutDumper,
19-
JsonLayoutDumper,
20-
LayoutDumper,
21-
ObjectDetectionLayoutDumper,
22-
OCRLayoutDumper,
23-
)
24-
25-
26-
def _get_drawer_for_dumper(dumper: LayoutDumper) -> Optional[LayoutDrawer]:
    """Build the layout drawer that matches the concrete type of a layout dumper.

    The drawer is initialized with the layout dict returned by ``dumper.dump()``.

    Args:
        dumper: The layout dumper instance

    Returns:
        LayoutDrawer: The drawer corresponding to the dumper's type

    Raises:
        ValueError: If the dumper is not one of the supported dumper types
    """
    # Guard clauses, checked in a fixed order; the first matching type wins.
    if isinstance(dumper, ObjectDetectionLayoutDumper):
        return ODModelLayoutDrawer(layout_dump=dumper.dump())
    if isinstance(dumper, ExtractedLayoutDumper):
        return PdfminerLayoutDrawer(layout_dump=dumper.dump())
    if isinstance(dumper, OCRLayoutDumper):
        return OCRLayoutDrawer(layout_dump=dumper.dump())
    if isinstance(dumper, FinalLayoutDumper):
        return FinalLayoutDrawer(layout_dump=dumper.dump())
    raise ValueError(f"Unknown dumper type: {dumper}")
46-
47-
48-
def _generate_filename(is_image: bool):
49-
"""Generate a filename for the analysis artifacts based on the file type.
50-
Adds a random uuid suffix
51-
"""
52-
suffix = uuid.uuid4().hex[:5]
53-
if is_image:
54-
return f"image_{suffix}.png"
55-
return f"pdf_{suffix}.pdf"
56-
57-
58-
def save_analysis_artifiacts(
    *layout_dumpers: LayoutDumper,
    is_image: bool,
    analyzed_image_output_dir_path: str,
    filename: Optional[str] = None,
    file: Optional[BytesIO] = None,
    skip_bboxes: bool = False,
    skip_dump_od: bool = False,
):
    """Save the analysis artifacts for a given file. Loads the drawing settings
    (grid, caption, resize, format) from the environment configuration.

    Args:
        layout_dumpers: The layout dumpers to save and use for bboxes rendering
        is_image: Flag for the file type (pdf/image)
        analyzed_image_output_dir_path: The directory to save the analysis artifacts
        filename: The filename of the sources analyzed file (pdf/image). When not
            provided, a random placeholder name is generated.
        file: The file object for the analyzed file.
        skip_bboxes: When True, the bboxes are not rendered.
        skip_dump_od: When True, the layout dumps are not written.
    """
    # Bail out only when BOTH outputs are disabled. The previous `or` made
    # disabling one output silently disable the other as well, which defeated
    # the dump-only workflow (dump now, render later).
    if skip_bboxes and skip_dump_od:
        return
    if not filename:
        filename = _generate_filename(is_image)

    output_path = Path(analyzed_image_output_dir_path)
    output_path.mkdir(parents=True, exist_ok=True)

    if not skip_dump_od:
        json_layout_dumper = JsonLayoutDumper(
            filename=filename,
            save_dir=output_path,
        )
        for layout_dumper in layout_dumpers:
            json_layout_dumper.add_layout_dumper(layout_dumper)
        json_layout_dumper.process()

    if not skip_bboxes:
        analysis_drawer = AnalysisDrawer(
            filename=filename,
            file=file,
            is_image=is_image,
            save_dir=output_path,
            draw_grid=env_config.ANALYSIS_BBOX_DRAW_GRID,
            draw_caption=env_config.ANALYSIS_BBOX_DRAW_CAPTION,
            resize=env_config.ANALYSIS_BBOX_RESIZE,
            format=env_config.ANALYSIS_BBOX_FORMAT,
        )
        for layout_dumper in layout_dumpers:
            analysis_drawer.add_drawer(_get_drawer_for_dumper(layout_dumper))
        analysis_drawer.process()
107-
108-
109-
def render_bboxes_for_file(
    filename: str,
    analyzed_image_output_dir_path: str,
    renders_output_dir_path: Optional[str] = None,
):
    """Render the bounding boxes for a given layout dump file.
    To be used for analysis after the partition is performed for
    only dumping the layouts - the bboxes can be rendered later.

    Expects that the analyzed_image_output_dir_path keeps the structure
    that was created by the save_analysis_artifiacts function. The drawing
    settings are read from the environment configuration.

    Args:
        filename: The filename of the sources analyzed file (pdf/image)
        analyzed_image_output_dir_path: The directory where the analysis artifacts
            (layout dumps) are saved. It should be the root directory of the structure
            created by the save_analysis_artifiacts function.
        renders_output_dir_path: Optional directory to save the rendered bboxes -
            if not provided, it will be saved in the analysis directory.
    """
    source_path = Path(filename)
    filename_stem = source_path.stem
    # Compare the extension case-insensitively so that e.g. "scan.PDF" is not
    # mistaken for an image.
    is_image = not source_path.suffix.lower().endswith("pdf")
    analysis_dumps_dir = (
        Path(analyzed_image_output_dir_path) / "analysis" / filename_stem / "layout_dump"
    )
    if not analysis_dumps_dir.exists():
        return

    layout_drawers = []
    for analysis_dump_filename in analysis_dumps_dir.iterdir():
        if not analysis_dump_filename.is_file():
            continue
        dump_name = analysis_dump_filename.stem
        # Skip unrelated files before parsing, so a stray non-JSON file in the
        # directory does not abort the rendering.
        if dump_name not in ("final", "object_detection", "ocr", "pdfminer"):
            continue
        with open(analysis_dump_filename) as f:
            layout_dump = json.load(f)
        if dump_name == "final":
            layout_drawers.append(FinalLayoutDrawer(layout_dump=layout_dump))
        elif dump_name == "object_detection":
            layout_drawers.append(ODModelLayoutDrawer(layout_dump=layout_dump))
        elif dump_name == "ocr":
            layout_drawers.append(OCRLayoutDrawer(layout_dump=layout_dump))
        else:
            layout_drawers.append(PdfminerLayoutDrawer(layout_dump=layout_dump))

    if not layout_drawers:
        return

    if not renders_output_dir_path:
        output_path = Path(analyzed_image_output_dir_path) / "analysis" / filename_stem / "bboxes"
    else:
        output_path = Path(renders_output_dir_path)
    output_path.mkdir(parents=True, exist_ok=True)
    analysis_drawer = AnalysisDrawer(
        filename=filename,
        save_dir=output_path,
        is_image=is_image,
        draw_grid=env_config.ANALYSIS_BBOX_DRAW_GRID,
        draw_caption=env_config.ANALYSIS_BBOX_DRAW_CAPTION,
        resize=env_config.ANALYSIS_BBOX_RESIZE,
        format=env_config.ANALYSIS_BBOX_FORMAT,
    )
    for drawer in layout_drawers:
        analysis_drawer.add_drawer(drawer)
    analysis_drawer.process()
Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
import json
2+
import uuid
3+
from io import BytesIO
4+
from pathlib import Path
5+
from typing import Optional
6+
7+
from unstructured.partition.pdf_image.analysis.bbox_visualisation import (
8+
AnalysisDrawer,
9+
FinalLayoutDrawer,
10+
LayoutDrawer,
11+
OCRLayoutDrawer,
12+
ODModelLayoutDrawer,
13+
PdfminerLayoutDrawer,
14+
)
15+
from unstructured.partition.pdf_image.analysis.layout_dump import (
16+
ExtractedLayoutDumper,
17+
FinalLayoutDumper,
18+
JsonLayoutDumper,
19+
LayoutDumper,
20+
ObjectDetectionLayoutDumper,
21+
OCRLayoutDumper,
22+
)
23+
24+
25+
def _get_drawer_for_dumper(dumper: LayoutDumper) -> Optional[LayoutDrawer]:
    """Return the layout drawer paired with the given dumper's type, initialized with
    the layout dict produced by ``dumper.dump()``.

    Args:
        dumper: The layout dumper instance

    Returns:
        LayoutDrawer: The corresponding layout drawer instance

    Raises:
        ValueError: If the dumper does not match any known dumper type
    """
    # Ordered (dumper type, drawer type) pairs; the first isinstance match wins.
    dumper_to_drawer = (
        (ObjectDetectionLayoutDumper, ODModelLayoutDrawer),
        (ExtractedLayoutDumper, PdfminerLayoutDrawer),
        (OCRLayoutDumper, OCRLayoutDrawer),
        (FinalLayoutDumper, FinalLayoutDrawer),
    )
    for dumper_type, drawer_type in dumper_to_drawer:
        if isinstance(dumper, dumper_type):
            return drawer_type(layout_dump=dumper.dump())
    raise ValueError(f"Unknown dumper type: {dumper}")
45+
46+
47+
def _generate_filename(is_image: bool):
48+
"""Generate a filename for the analysis artifacts based on the file type.
49+
Adds a random uuid suffix
50+
"""
51+
suffix = uuid.uuid4().hex[:5]
52+
if is_image:
53+
return f"image_{suffix}.png"
54+
return f"pdf_{suffix}.pdf"
55+
56+
57+
def save_analysis_artifiacts(
    *layout_dumpers: LayoutDumper,
    is_image: bool,
    analyzed_image_output_dir_path: str,
    filename: Optional[str] = None,
    file: Optional[BytesIO] = None,
    skip_bboxes: bool = False,
    skip_dump_od: bool = False,
    draw_grid: bool = False,
    draw_caption: bool = True,
    resize: Optional[float] = None,
    format: str = "png",
):
    """Save the analysis artifacts (layout dumps and rendered bboxes) for a given file.

    All settings are taken from the function arguments; nothing is read from
    the environment configuration here.

    Args:
        layout_dumpers: The layout dumpers to save and use for bboxes rendering
        is_image: Flag for the file type (pdf/image)
        analyzed_image_output_dir_path: The directory to save the analysis artifacts
        filename: The filename of the sources analyzed file (pdf/image).
            Only one of filename or file should be provided. When not provided,
            a random placeholder name is generated.
        file: The file object for the analyzed file.
            Only one of filename or file should be provided.
        skip_bboxes: When True, the bboxes are not rendered.
        skip_dump_od: When True, the layout dumps are not written.
        draw_grid: Flag for drawing the analysis bboxes on a single image (as grid)
        draw_caption: Flag for drawing the caption above the analyzed page (for e.g. layout source)
        resize: Output image resize value. If not provided, the image will not be resized.
        format: The format for analyzed pages with bboxes drawn on them. Default is 'png'.
    """
    # Bail out only when BOTH outputs are disabled. The previous `or` made
    # disabling one output silently disable the other as well, which defeated
    # the dump-only workflow that render_bboxes_for_file is meant to complete.
    if skip_bboxes and skip_dump_od:
        return
    if not filename:
        filename = _generate_filename(is_image)

    output_path = Path(analyzed_image_output_dir_path)
    output_path.mkdir(parents=True, exist_ok=True)

    if not skip_dump_od:
        json_layout_dumper = JsonLayoutDumper(
            filename=filename,
            save_dir=output_path,
        )
        for layout_dumper in layout_dumpers:
            json_layout_dumper.add_layout_dumper(layout_dumper)
        json_layout_dumper.process()

    if not skip_bboxes:
        analysis_drawer = AnalysisDrawer(
            filename=filename,
            file=file,
            is_image=is_image,
            save_dir=output_path,
            draw_grid=draw_grid,
            draw_caption=draw_caption,
            resize=resize,
            format=format,
        )
        for layout_dumper in layout_dumpers:
            analysis_drawer.add_drawer(_get_drawer_for_dumper(layout_dumper))
        analysis_drawer.process()
118+
119+
120+
def render_bboxes_for_file(
    filename: str,
    analyzed_image_output_dir_path: str,
    renders_output_dir_path: Optional[str] = None,
    draw_grid: bool = False,
    draw_caption: bool = True,
    resize: Optional[float] = None,
    format: str = "png",
):
    """Render the bounding boxes for a given layout dump file.
    To be used for analysis after the partition is performed for
    only dumping the layouts - the bboxes can be rendered later.

    Expects that the analyzed_image_output_dir_path keeps the structure
    that was created by the save_analysis_artifiacts function.

    Args:
        filename: The filename of the sources analyzed file (pdf/image)
        analyzed_image_output_dir_path: The directory where the analysis artifacts
            (layout dumps) are saved. It should be the root directory of the structure
            created by the save_analysis_artifiacts function.
        renders_output_dir_path: Optional directory to save the rendered bboxes -
            if not provided, it will be saved in the analysis directory.
        draw_grid: Flag for drawing the analysis bboxes on a single image (as grid)
        draw_caption: Flag for drawing the caption above the analyzed page (for e.g. layout source)
        resize: Output image resize value. If not provided, the image will not be resized.
        format: The format for analyzed pages with bboxes drawn on them. Default is 'png'.
    """
    source_path = Path(filename)
    filename_stem = source_path.stem
    # Compare the extension case-insensitively so that e.g. "scan.PDF" is not
    # mistaken for an image.
    is_image = not source_path.suffix.lower().endswith("pdf")
    analysis_root = Path(analyzed_image_output_dir_path) / "analysis" / filename_stem
    analysis_dumps_dir = analysis_root / "layout_dump"
    if not analysis_dumps_dir.exists():
        return

    # Dump file stem -> drawer class used to render that dump.
    drawer_by_dump_name = {
        "final": FinalLayoutDrawer,
        "object_detection": ODModelLayoutDrawer,
        "ocr": OCRLayoutDrawer,
        "pdfminer": PdfminerLayoutDrawer,
    }
    layout_drawers = []
    for analysis_dump_filename in analysis_dumps_dir.iterdir():
        if not analysis_dump_filename.is_file():
            continue
        drawer_cls = drawer_by_dump_name.get(analysis_dump_filename.stem)
        # Skip unrelated files before parsing, so a stray non-JSON file in the
        # directory does not abort the rendering.
        if drawer_cls is None:
            continue
        with open(analysis_dump_filename) as f:
            layout_dump = json.load(f)
        layout_drawers.append(drawer_cls(layout_dump=layout_dump))

    if not layout_drawers:
        return

    if not renders_output_dir_path:
        output_path = analysis_root / "bboxes"
    else:
        output_path = Path(renders_output_dir_path)
    output_path.mkdir(parents=True, exist_ok=True)
    analysis_drawer = AnalysisDrawer(
        filename=filename,
        save_dir=output_path,
        is_image=is_image,
        draw_grid=draw_grid,
        draw_caption=draw_caption,
        resize=resize,
        format=format,
    )
    for drawer in layout_drawers:
        analysis_drawer.add_drawer(drawer)
    analysis_drawer.process()

0 commit comments

Comments
 (0)