@@ -83,6 +83,13 @@ def parse_args():
8383 help = "Enable interactive mode (default: False)" ,
8484 )
8585
86+ # Add an optional boolean argument for printing the page's exported text lines
87+ parser .add_argument (
88+ "--log-text" ,
89+ action = "store_true" ,
90+ help = "Enable interactive mode (default: False)" ,
91+ )
92+
8693 # Add an argument for the output directory, defaulting to "./tmp"
8794 parser .add_argument (
8895 "-o" ,
@@ -122,6 +129,7 @@ def parse_args():
122129 args .output_dir ,
123130 int (args .page ),
124131 args .display_text ,
132+ args .log_text ,
125133 args .page_boundary ,
126134 args .category ,
127135 )
@@ -266,6 +274,7 @@ def visualise_py(
266274 output_dir : str ,
267275 page_num : int ,
268276 display_text : bool ,
277+ log_text : bool ,
269278 page_boundary : str = "crop_box" , # media_box
270279 category : str = "both" , # "both", "sanitized", "original"
271280):
@@ -282,31 +291,35 @@ def visualise_py(
282291
283292 pdf_page : ParsedPdfPage = pdf_doc .get_page (page_no = page_no )
284293
285- if category in ["sanitized" , "both" ]:
286- pdf_page .sanitized .render (
287- draw_cells_bbox = (not display_text ), draw_cells_text = display_text
288- ).show ()
289- elif category in ["original" , "both" ]:
290- pdf_page .original .render (
294+ if category in ["original" , "both" ]:
295+ img = pdf_page .original .render (
291296 draw_cells_bbox = (not display_text ), draw_cells_text = display_text
292- ). show ()
297+ )
293298
294- lines = pdf_page .original .export_to_textlines (
295- add_fontkey = True , add_fontname = False
296- )
297- print (f"text-lines (original, page_no: { page_no } ):" )
298- print ("\n " .join (lines ))
299+ if interactive :
300+ img .show ()
299301
300- lines = pdf_page .sanitized .export_to_textlines (
301- add_fontkey = True , add_fontname = False
302- )
303- print (f"text-lines (sanitized, page_no: { page_no } ):" )
304- print ("\n " .join (lines ))
302+ if log_text :
303+ lines = pdf_page .original .export_to_textlines (
304+ add_fontkey = True , add_fontname = False
305+ )
306+ print (f"text-lines (original, page_no: { page_no } ):" )
307+ print ("\n " .join (lines ))
305308
306- """
307- lines = pdf_page.original.export_to_textlines(add_fontkey=True)
308- print("\n ".join(lines))
309- """
309+ if category in ["sanitized" , "both" ]:
310+ img = pdf_page .sanitized .render (
311+ draw_cells_bbox = (not display_text ), draw_cells_text = display_text
312+ )
313+
314+ if interactive :
315+ img .show ()
316+
317+ if log_text :
318+ lines = pdf_page .sanitized .export_to_textlines (
319+ add_fontkey = True , add_fontname = False
320+ )
321+ print (f"text-lines (sanitized, page_no: { page_no } ):" )
322+ print ("\n " .join (lines ))
310323
311324
312325def main ():
@@ -319,6 +332,7 @@ def main():
319332 output_dir ,
320333 page_num ,
321334 display_text ,
335+ log_text ,
322336 page_boundary ,
323337 category ,
324338 ) = parse_args ()
@@ -353,6 +367,7 @@ def main():
353367 output_dir = output_dir ,
354368 page_num = page_num ,
355369 display_text = display_text ,
370+ log_text = log_text ,
356371 page_boundary = page_boundary ,
357372 category = category ,
358373 )
0 commit comments