@@ -49,7 +49,7 @@ class StdoutWrapper:
4949 sys .stdout .write (data )
5050
5151
52- def export_pdf (playground , default_dpi , savefile = False ):
52+ def export_pdf (playground , default_dpi , savefile = False , selectable_text = False ):
5353 """Create a searchable PDF from a pile of HOCR + JPEG"""
5454 images = sorted (glob .glob (os .path .join (playground , '*.jpg' )))
5555 images = Tcl ().call ('lsort' , '-dict' , images )
@@ -58,6 +58,7 @@ def export_pdf(playground, default_dpi, savefile=False):
5858 "\n Script cannot proceed without them and will terminate now.\n " )
5959 sys .exit (0 )
6060 load_invisible_font ()
61+ load_default_font ()
6162 pdf = Canvas (savefile if savefile else StdoutWrapper (), pageCompression = 1 )
6263 pdf .setCreator ('hocr-tools' )
6364 pdf .setTitle (os .path .basename (playground ))
@@ -73,13 +74,17 @@ def export_pdf(playground, default_dpi, savefile=False):
7374 height = h * 72 / dpi
7475 pdf .setPageSize ((width , height ))
7576 pdf .drawImage (image , 0 , 0 , width = width , height = height )
76- add_text_layer (pdf , image , height , dpi )
77+ add_text_layer (pdf , image , height , dpi , selectable_text )
7778 pdf .showPage ()
7879 pdf .save ()
7980
8081
81- def add_text_layer (pdf , image , height , dpi ):
82+ def add_text_layer (pdf , image , height , dpi , selectable_text = False ):
8283 """Draw an invisible text layer for OCR data"""
84+ font = 'invisible'
85+ if selectable_text :
86+ font = "DejaVuSans"
87+
8388 p1 = re .compile (r'bbox((\s+\d+){4})' )
8489 p2 = re .compile (r'baseline((\s+[\d\.\-]+){2})' )
8590 hocrfile = os .path .splitext (image )[0 ] + ".hocr"
@@ -101,19 +106,14 @@ def add_text_layer(pdf, image, height, dpi):
101106 rawtext = word .text_content ().strip ()
102107 if rawtext == '' :
103108 continue
104- font_width = pdf .stringWidth (rawtext , 'invisible' , 8 )
105- if font_width <= 0 :
106- continue
107109 box = p1 .search (word .attrib ['title' ]).group (1 ).split ()
108110 box = [float (i ) for i in box ]
109111 b = polyval (baseline ,
110112 (box [0 ] + box [2 ]) / 2 - linebox [0 ]) + linebox [3 ]
111113 text = pdf .beginText ()
112114 text .setTextRenderMode (3 ) # double invisible
113- text .setFont ('invisible' , 8 )
115+ text .setFont (font , 8 )
114116 text .setTextOrigin (box [0 ] * 72 / dpi , height - b * 72 / dpi )
115- box_width = (box [2 ] - box [0 ]) * 72 / dpi
116- text .setHorizScale (100.0 * box_width / font_width )
117117 text .textLine (rawtext )
118118 pdf .drawText (text )
119119
@@ -154,6 +154,10 @@ CMGjwvxTsr74/f/F95m3TH9x8o0/TU//N+7/D/ScVcA=
154154 pdfmetrics .registerFont (TTFont ('invisible' , ttf ))
155155
156156
157+ def load_default_font ():
158+ pdfmetrics .registerFont (TTFont ('DejaVuSans' , 'DejaVuSans.ttf' ))
159+
160+
157161if __name__ == "__main__" :
158162 parser = argparse .ArgumentParser (
159163 description = "Create a searchable PDF from a pile of hOCR and JPEG" )
@@ -167,7 +171,9 @@ if __name__ == "__main__":
167171 "--savefile" ,
168172 help = "Save to this file instead of outputting to stdout"
169173 )
174+ parser .add_argument ("--selectable-text" , action = "store_true" ,
175+ help = "allow text selection on pdf" )
170176 args = parser .parse_args ()
171177 if not os .path .isdir (args .imgdir ):
172178 sys .exit ("ERROR: Given path '" + args .imgdir + "' is not a directory" )
173- export_pdf (args .imgdir , 300 , args .savefile )
179+ export_pdf (args .imgdir , 300 , args .savefile , args . selectable_text )
0 commit comments