-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathfind_pagenos.py
More file actions
165 lines (140 loc) · 5.93 KB
/
find_pagenos.py
File metadata and controls
165 lines (140 loc) · 5.93 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
import re
from lxml import etree
from rnums import rnum_to_int
from tuples import *
# XML namespace of the ABBYY FineReader 6 schema, written in the '{uri}'
# prefix form so it can be prepended to a tag name in element lookups
# (e.g. './/' + ns + 'formatting').
ns = '{http://www.abbyy.com/FineReader_xml/FineReader6-schema-v1.xml}'
def guess_best_pageno(pageinfo, pages, window=None):
    """ Select the best candidate pagenumber for the given page, with
    reference to neighboring pages.

    Every candidate votes for its 'offset' (the amount that is
    subtracted from the page index to obtain the page number).
    Candidates on the page itself vote with weight 2, candidates on
    each page returned by pages.neighbors(window) with weight 1.
    Offsets that collect fewer than 2 votes are discarded.  The offset
    with the most votes wins; on a tie between an arabic and a roman
    offset the arabic one is kept.

    pageinfo -- page record exposing 'index' and an 'info' dict whose
        'pageno_candidates' entry lists objects with 'type' ('roman'
        or 'arabic'), 'offset' and 'coords' attributes.
    pages -- must be a windowed_iterator (supplies 'window' and
        'neighbors(window)').
    window -- if provided, will look to a smaller set of neighboring
        pages than that provided by the given windowed_iterator.

    Returns the guessed page number, or None when no offset gathered
    enough votes.  The same value is stored in
    pageinfo.info['pageno_guess'].  When a candidate on this very page
    has the winning type and offset, the two members of its first
    coords entry are stored in pageinfo.info['pageno_fmt'] and
    pageinfo.info['pageno_coord'].
    """
    if window is None:
        window = pages.window

    min_votes = 2
    votes_by_type = {'roman': {}, 'arabic': {}}

    def tally(info, weight):
        for cand in info.info['pageno_candidates']:
            # An offset at or beyond the current index would yield a
            # page number <= 0 for this page, so it gets no vote.
            if cand.offset >= pageinfo.index:
                continue
            bucket = votes_by_type[cand.type]
            bucket[cand.offset] = bucket.get(cand.offset, 0) + weight

    tally(pageinfo, 2)
    for neighbor_info in pages.neighbors(window):
        tally(neighbor_info, 1)

    best_offset = None
    best_votes = 0
    likely_type = None
    # Arabic is scanned first and the comparison is strict, so arabic
    # wins a tie against roman.
    for numeral_type in ('arabic', 'roman'):
        for offset, count in votes_by_type[numeral_type].items():
            if count < min_votes:
                continue
            if count > best_votes:
                best_votes = count
                likely_type = numeral_type
                best_offset = offset

    # Compare against None explicitly: an offset of 0 (printed number
    # equal to the page index) is a legitimate winner but is falsy.
    has_winner = best_offset is not None

    pageno_guess = None
    if has_winner:
        pageno_guess = pageinfo.index - int(best_offset)
    # NOTE(review): the guess is stored even when it is None; confirm
    # no caller relies on the key being absent in that case.
    pageinfo.info['pageno_guess'] = pageno_guess

    # if a page coord candidate on *this* page matches, capture it
    if has_winner:
        for cand in pageinfo.info['pageno_candidates']:
            if cand.type == likely_type and cand.offset == best_offset:
                # just take first potential coordinate for now
                pageinfo.info['pageno_fmt'] = cand.coords[0][0]
                pageinfo.info['pageno_coord'] = cand.coords[0][1]
                break

    return pageno_guess
# print 'roman: %s' % json.dumps(sofar['roman'])
# print 'arabic: %s' % json.dumps(sofar['arabic'])
# Candidate page number patterns, both anchored on word boundaries.
# re_roman only recognises lowercase runs of the letters x, v and i.
re_roman = re.compile(r'\b[xvi]+\b')
# re_arabic matches any run of decimal digits.
re_arabic = re.compile(r'\b\d+\b')
def annotate_page(page):
    """ Gather all page number candidates produced by
    pageno_candidates() for 'page' and keep them, as a list, under
    page.info['pageno_candidates'].
    """
    page.info['pageno_candidates'] = list(
        pageno_candidates(page, page.page, page.index))
def pageno_candidates(pageinfo, page, index):
    """ Generate page number candidates from the words of a page.

    The text of every word returned by pageinfo.get_words() is scanned
    first with re_roman, then with re_arabic.  A match whose integer
    value is greater than 'index' is skipped.  The first accepted
    occurrence of a given numeral string creates
    Pageno(kind, string, value, index - value, [(word, None)]); later
    occurrences of the same string append another (word, None) pair to
    that object's coords.  The Pageno is yielded once per accepted
    occurrence, so a repeated numeral yields the same object again.

    pageinfo -- page record; must provide info['bounds'] (with 't' and
        'b' attributes) and get_words().
    page -- page element; page.get('height') must be convertible to
        int.
    index -- index of the page, used as the upper limit for accepted
        values and to compute each candidate's offset.
    """
    seen = {}

    # find margin % of top/bottom of text bounding box
    # NOTE: top_margin / bottom_margin are only needed by the margin
    # filter below, which is currently disabled.
    pagebounds = pageinfo.info['bounds']
    page_height = int(page.get('height'))
    margin = .05
    top_margin = pagebounds.t + page_height * margin
    bottom_margin = pagebounds.b - page_height * margin

    # Disabled predecessor of the word loop, kept for reference:
    # findexpr = './/'+ns+'formatting'
    # for fmt in page.findall(findexpr):
    #     # move on if not near page top/bottom
    #     line = fmt.getparent()
    #     t = int(line.get('t'))
    #     b = int(line.get('b'))
    #     if t > top_margin and t < bottom_margin:
    #         continue
    #     fmt_text = etree.tostring(fmt,
    #                               method='text',
    #                               encoding=unicode).lower();

    def find_box(m):
        # Placeholder for locating the bounding box of a match; not
        # called until coordinate capture is restored.  Former version:
        #     # l t r b
        #     start, end = m.span()
        #     if end >= len(fmt):
        #         end = len(fmt) - 1
        #     return Coord(fmt[start].get('l'), t, fmt[end].get('r'), b)
        raise NotImplementedError('NYI')

    def numerals_in(kind, pattern, to_int, text, word):
        # Yield the Pageno for every accepted match of 'pattern' in
        # 'text', creating or extending its entry in 'seen'.
        for m in pattern.finditer(text):
            num_str = m.group()
            if num_str not in seen:
                i = to_int(num_str)
                if i > index and i != 0:
                    continue
                # coordinate is None until find_box(m) is implemented
                seen[num_str] = Pageno(kind, num_str, i, index - i,
                                       [(word, None)])
            else:
                seen[num_str].coords.append((word, None))
            yield seen[num_str]

    for word in pageinfo.get_words():
        fmt_text = word.text

        # look for roman numerals
        # fix some common OCR errors
        # XXX RESTORE adjusted_text = (fmt_text.replace('u', 'ii')
        #                              .replace('n', 'ii')
        #                              .replace('l', 'i')
        #                              .replace(r"\'", 'v'))
        adjusted_text = fmt_text
        # collapse space between potential roman numerals
        # XXX RESTORE adjusted_text = re.sub(r'\b([xvi]+)\b +\b([xvi]+)\b', r'\1\1', adjusted_text)
        # NOTE(review): the replacement r'\1\1' above probably wants to
        # be r'\1\2' -- check before restoring.
        for cand in numerals_in('roman', re_roman, rnum_to_int,
                                adjusted_text, word):
            yield cand

        # look for arabic numerals
        # fix some common OCR errors
        # XXX RESTORE adjusted_text = fmt_text.replace('i', '1').replace('o', '0').replace('s', '5').replace('"', '11')
        # collapse spaces
        # XXX RESTORE adjusted_text = re.sub(r'\b(\d+)\b +\b(\d+)\b', r'\1\1', adjusted_text)
        for cand in numerals_in('arabic', re_arabic, int,
                                adjusted_text, word):
            yield cand