Skip to content

Commit 515bcba

Browse files
author
Iordanis Kostelidis
authored
Merge pull request #5 from datascouting/all-text-select-on-pdf
allow add selectable text on pdf
2 parents 4dbab05 + ccedadc commit 515bcba

File tree

3 files changed

+22
-11
lines changed

3 files changed

+22
-11
lines changed

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,4 @@
44
/dist/
55
/*.egg-info/
66
.idea
7-
7+
*.pdf

hocr-pdf

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ class StdoutWrapper:
4949
sys.stdout.write(data)
5050

5151

52-
def export_pdf(playground, default_dpi, savefile=False):
52+
def export_pdf(playground, default_dpi, savefile=False, selectable_text=False):
5353
"""Create a searchable PDF from a pile of HOCR + JPEG"""
5454
images = sorted(glob.glob(os.path.join(playground, '*.jpg')))
5555
images = Tcl().call('lsort', '-dict', images)
@@ -58,6 +58,7 @@ def export_pdf(playground, default_dpi, savefile=False):
5858
"\nScript cannot proceed without them and will terminate now.\n")
5959
sys.exit(0)
6060
load_invisible_font()
61+
load_default_font()
6162
pdf = Canvas(savefile if savefile else StdoutWrapper(), pageCompression=1)
6263
pdf.setCreator('hocr-tools')
6364
pdf.setTitle(os.path.basename(playground))
@@ -73,13 +74,17 @@ def export_pdf(playground, default_dpi, savefile=False):
7374
height = h * 72 / dpi
7475
pdf.setPageSize((width, height))
7576
pdf.drawImage(image, 0, 0, width=width, height=height)
76-
add_text_layer(pdf, image, height, dpi)
77+
add_text_layer(pdf, image, height, dpi, selectable_text)
7778
pdf.showPage()
7879
pdf.save()
7980

8081

81-
def add_text_layer(pdf, image, height, dpi):
82+
def add_text_layer(pdf, image, height, dpi, selectable_text=False):
8283
"""Draw an invisible text layer for OCR data"""
84+
font = 'invisible'
85+
if selectable_text:
86+
font = "DejaVuSans"
87+
8388
p1 = re.compile(r'bbox((\s+\d+){4})')
8489
p2 = re.compile(r'baseline((\s+[\d\.\-]+){2})')
8590
hocrfile = os.path.splitext(image)[0] + ".hocr"
@@ -101,19 +106,14 @@ def add_text_layer(pdf, image, height, dpi):
101106
rawtext = word.text_content().strip()
102107
if rawtext == '':
103108
continue
104-
font_width = pdf.stringWidth(rawtext, 'invisible', 8)
105-
if font_width <= 0:
106-
continue
107109
box = p1.search(word.attrib['title']).group(1).split()
108110
box = [float(i) for i in box]
109111
b = polyval(baseline,
110112
(box[0] + box[2]) / 2 - linebox[0]) + linebox[3]
111113
text = pdf.beginText()
112114
text.setTextRenderMode(3) # double invisible
113-
text.setFont('invisible', 8)
115+
text.setFont(font, 8)
114116
text.setTextOrigin(box[0] * 72 / dpi, height - b * 72 / dpi)
115-
box_width = (box[2] - box[0]) * 72 / dpi
116-
text.setHorizScale(100.0 * box_width / font_width)
117117
text.textLine(rawtext)
118118
pdf.drawText(text)
119119

@@ -154,6 +154,10 @@ CMGjwvxTsr74/f/F95m3TH9x8o0/TU//N+7/D/ScVcA=
154154
pdfmetrics.registerFont(TTFont('invisible', ttf))
155155

156156

157+
def load_default_font(font_file='DejaVuSans.ttf'):
    """Register the visible DejaVuSans TrueType font with ReportLab.

    The font is registered under the name 'DejaVuSans', which is the font
    name add_text_layer() switches to when selectable text is requested.

    Args:
        font_file: File name or path of the TrueType font to load.
            Defaults to 'DejaVuSans.ttf', matching the previous hard-coded
            value; pass an explicit path when the font lives elsewhere.
            NOTE(review): a bare file name is presumably resolved through
            ReportLab's TrueType search path -- confirm for the installed
            ReportLab version.

    Raises:
        Whatever TTFont raises when the font file cannot be located or
        parsed (presumably reportlab's TTFError -- TODO confirm).
    """
    pdfmetrics.registerFont(TTFont('DejaVuSans', font_file))
159+
160+
157161
if __name__ == "__main__":
158162
parser = argparse.ArgumentParser(
159163
description="Create a searchable PDF from a pile of hOCR and JPEG")
@@ -167,7 +171,9 @@ if __name__ == "__main__":
167171
"--savefile",
168172
help="Save to this file instead of outputting to stdout"
169173
)
174+
parser.add_argument("--selectable-text", action="store_true",
175+
help="allow text selection on pdf")
170176
args = parser.parse_args()
171177
if not os.path.isdir(args.imgdir):
172178
sys.exit("ERROR: Given path '" + args.imgdir + "' is not a directory")
173-
export_pdf(args.imgdir, 300, args.savefile)
179+
export_pdf(args.imgdir, 300, args.savefile, args.selectable_text)

test/hocr-pdf/test-hocr-pdf.tsht

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,4 +14,9 @@ file_not_empty "${work}.pdf"
1414
exec_ok pdfgrep 'tribunali' "${work}.pdf"
1515
hocr-pdf . --savefile "${work}-saved.pdf"
1616
file_not_empty "${work}-saved.pdf"
17+
hocr-pdf --selectable-text . > "${work}.pdf"
18+
file_not_empty "${work}.pdf"
19+
exec_ok pdfgrep 'tribunali' "${work}.pdf"
20+
hocr-pdf --selectable-text . --savefile "${work}-saved.pdf"
21+
file_not_empty "${work}-saved.pdf"
1722
rm $work*

0 commit comments

Comments
 (0)