Skip to content

Commit b634c11

Browse files
fix: update visualisation script (#95)
Signed-off-by: Peter Staar <[email protected]>
1 parent 59aa984 commit b634c11

File tree

2 files changed

+36
-22
lines changed

2 files changed

+36
-22
lines changed

docling_parse/pdf_parser.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,6 @@ def _to_cells(self, cells: dict) -> List[PdfCell]:
174174

175175
result: List[PdfCell] = []
176176
for ind, row in enumerate(data):
177-
print(row)
178177
rect = BoundingRectangle(
179178
r_x0=row[header.index(f"r_x0")],
180179
r_y0=row[header.index(f"r_y0")],

docling_parse/visualize.py

Lines changed: 36 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,13 @@ def parse_args():
8383
help="Enable interactive mode (default: False)",
8484
)
8585

86+
# Add an optional boolean argument for printing the exported text lines
87+
parser.add_argument(
88+
"--log-text",
89+
action="store_true",
90+
help="Enable interactive mode (default: False)",
91+
)
92+
8693
# Add an argument for the output directory, defaulting to "./tmp"
8794
parser.add_argument(
8895
"-o",
@@ -122,6 +129,7 @@ def parse_args():
122129
args.output_dir,
123130
int(args.page),
124131
args.display_text,
132+
args.log_text,
125133
args.page_boundary,
126134
args.category,
127135
)
@@ -266,6 +274,7 @@ def visualise_py(
266274
output_dir: str,
267275
page_num: int,
268276
display_text: bool,
277+
log_text: bool,
269278
page_boundary: str = "crop_box", # media_box
270279
category: str = "both", # "both", "sanitized", "original"
271280
):
@@ -282,31 +291,35 @@ def visualise_py(
282291

283292
pdf_page: ParsedPdfPage = pdf_doc.get_page(page_no=page_no)
284293

285-
if category in ["sanitized", "both"]:
286-
pdf_page.sanitized.render(
287-
draw_cells_bbox=(not display_text), draw_cells_text=display_text
288-
).show()
289-
elif category in ["original", "both"]:
290-
pdf_page.original.render(
294+
if category in ["original", "both"]:
295+
img = pdf_page.original.render(
291296
draw_cells_bbox=(not display_text), draw_cells_text=display_text
292-
).show()
297+
)
293298

294-
lines = pdf_page.original.export_to_textlines(
295-
add_fontkey=True, add_fontname=False
296-
)
297-
print(f"text-lines (original, page_no: {page_no}):")
298-
print("\n".join(lines))
299+
if interactive:
300+
img.show()
299301

300-
lines = pdf_page.sanitized.export_to_textlines(
301-
add_fontkey=True, add_fontname=False
302-
)
303-
print(f"text-lines (sanitized, page_no: {page_no}):")
304-
print("\n".join(lines))
302+
if log_text:
303+
lines = pdf_page.original.export_to_textlines(
304+
add_fontkey=True, add_fontname=False
305+
)
306+
print(f"text-lines (original, page_no: {page_no}):")
307+
print("\n".join(lines))
305308

306-
"""
307-
lines = pdf_page.original.export_to_textlines(add_fontkey=True)
308-
print("\n".join(lines))
309-
"""
309+
if category in ["sanitized", "both"]:
310+
img = pdf_page.sanitized.render(
311+
draw_cells_bbox=(not display_text), draw_cells_text=display_text
312+
)
313+
314+
if interactive:
315+
img.show()
316+
317+
if log_text:
318+
lines = pdf_page.sanitized.export_to_textlines(
319+
add_fontkey=True, add_fontname=False
320+
)
321+
print(f"text-lines (sanitized, page_no: {page_no}):")
322+
print("\n".join(lines))
310323

311324

312325
def main():
@@ -319,6 +332,7 @@ def main():
319332
output_dir,
320333
page_num,
321334
display_text,
335+
log_text,
322336
page_boundary,
323337
category,
324338
) = parse_args()
@@ -353,6 +367,7 @@ def main():
353367
output_dir=output_dir,
354368
page_num=page_num,
355369
display_text=display_text,
370+
log_text=log_text,
356371
page_boundary=page_boundary,
357372
category=category,
358373
)

0 commit comments

Comments
 (0)