Skip to content

Commit 3c5b23e

Browse files
Robert Sachunsky authored and finkf committed
ocropy processors: put derived images under output fileGrp…
binarize/clip/denoise/deskew/dewarp/segment/resegment: instead of writing AlternativeImages under a separate fileGrp (named `OCR-D-IMG-*`), place them under the output fileGrp (along with PAGE-XML) as well. Differentiate their file ID further by adding suffixes (named `.IMG-*`).
1 parent fdd7345 commit 3c5b23e

File tree

7 files changed

+102
-131
lines changed

7 files changed

+102
-131
lines changed

ocrd_cis/ocropy/binarize.py

Lines changed: 22 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@
3131

3232
TOOL = 'ocrd-cis-ocropy-binarize'
3333
LOG = getLogger('processor.OcropyBinarize')
34-
FALLBACK_FILEGRP_IMG = 'OCR-D-IMG-BIN'
3534

3635
def binarize(pil_image, method='ocropy', maxskew=2, threshold=0.5, nrm=False):
3736
LOG.debug('binarizing %dx%d image with method=%s', pil_image.width, pil_image.height, method)
@@ -81,12 +80,6 @@ def __init__(self, *args, **kwargs):
8180
LOG.critical('requested method %s does not support grayscale normalized output',
8281
self.parameter['method'])
8382
raise Exception('only method=ocropy allows grayscale=true')
84-
try:
85-
self.page_grp, self.image_grp = self.output_file_grp.split(',')
86-
except ValueError:
87-
self.page_grp = self.output_file_grp
88-
self.image_grp = FALLBACK_FILEGRP_IMG
89-
LOG.info("No output file group for images specified, falling back to '%s'", FALLBACK_FILEGRP_IMG)
9083

9184
def process(self):
9285
"""Binarize (and optionally deskew/despeckle) the pages/regions/lines of the workspace.
@@ -102,21 +95,24 @@ def process(self):
10295
by removing connected components smaller than ``noise_maxsize``.
10396
Finally, apply results to the image and export it as an image file.
10497
105-
Add the new image file to the workspace with the fileGrp USE given
106-
in the second position of the output fileGrp, or ``OCR-D-IMG-BIN``,
107-
and an ID based on input file and input element.
98+
Add the new image file to the workspace under the output fileGrp,
99+
and using a file ID with suffix ``.IMG-BIN`` (or ``.IMG-NRM`` when ``grayscale`` is set) along with further
100+
identification of the input element.
108101
109102
Reference each new image in the AlternativeImage of the element.
110103
111104
Produce a new output file by serialising the resulting hierarchy.
112105
"""
113106
level = self.parameter['level-of-operation']
107+
assert len(self.output_file_grp.split(',')) == 1, \
108+
"Expected exactly one output file group, but '%s' has %d" % (
109+
self.output_file_grp, len(self.output_file_grp.split(',')))
114110

115111
for (n, input_file) in enumerate(self.input_files):
116112
LOG.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID)
117-
file_id = input_file.ID.replace(self.input_file_grp, self.image_grp)
113+
file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
118114
if file_id == input_file.ID:
119-
file_id = concat_padded(self.image_grp, n)
115+
file_id = concat_padded(self.output_file_grp, n)
120116

121117
pcgts = page_from_file(self.workspace.download_file(input_file))
122118
page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id)
@@ -164,19 +160,19 @@ def process(self):
164160
file_id + '_' + region.id + '_' + line.id)
165161

166162
# update METS (add the PAGE file):
167-
file_id = input_file.ID.replace(self.input_file_grp, self.page_grp)
163+
file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
168164
if file_id == input_file.ID:
169-
file_id = concat_padded(self.page_grp, n)
170-
file_path = os.path.join(self.page_grp, file_id + '.xml')
165+
file_id = concat_padded(self.output_file_grp, n)
166+
file_path = os.path.join(self.output_file_grp, file_id + '.xml')
171167
out = self.workspace.add_file(
172168
ID=file_id,
173-
file_grp=self.page_grp,
169+
file_grp=self.output_file_grp,
174170
pageId=input_file.pageId,
175171
local_filename=file_path,
176172
mimetype=MIMETYPE_PAGE,
177173
content=to_xml(pcgts))
178174
LOG.info('created file ID: %s, file_grp: %s, path: %s',
179-
file_id, self.page_grp, out.local_filename)
175+
file_id, self.output_file_grp, out.local_filename)
180176

181177
def process_page(self, page, page_image, page_xywh, page_id, file_id):
182178
LOG.info("About to binarize page '%s'", page_id)
@@ -207,15 +203,16 @@ def process_page(self, page, page_image, page_xywh, page_id, file_id):
207203
page.set_orientation(orientation)
208204
# update METS (add the image file):
209205
if self.parameter['grayscale']:
210-
file_id += '.nrm'
206+
file_id += '.IMG-NRM'
211207
features += ',grayscale_normalized'
212208
else:
209+
file_id += '.IMG-BIN'
213210
features += ',binarized'
214211
file_path = self.workspace.save_image_file(
215212
bin_image,
216213
file_id,
217214
page_id=page_id,
218-
file_grp=self.image_grp)
215+
file_grp=self.output_file_grp)
219216
# update PAGE (reference the image file):
220217
page.add_AlternativeImage(AlternativeImageType(
221218
filename=file_path,
@@ -251,15 +248,16 @@ def process_region(self, region, region_image, region_xywh, page_id, file_id):
251248
region.set_orientation(orientation)
252249
# update METS (add the image file):
253250
if self.parameter['grayscale']:
254-
file_id += '.nrm'
251+
file_id += '.IMG-NRM'
255252
features += ',grayscale_normalized'
256253
else:
254+
file_id += '.IMG-BIN'
257255
features += ',binarized'
258256
file_path = self.workspace.save_image_file(
259257
bin_image,
260258
file_id,
261259
page_id=page_id,
262-
file_grp=self.image_grp)
260+
file_grp=self.output_file_grp)
263261
# update PAGE (reference the image file):
264262
region.add_AlternativeImage(AlternativeImageType(
265263
filename=file_path,
@@ -289,15 +287,16 @@ def process_line(self, line, line_image, line_xywh, page_id, region_id, file_id)
289287
features += ',despeckled'
290288
# update METS (add the image file):
291289
if self.parameter['grayscale']:
292-
file_id += '.nrm'
290+
file_id += '.IMG-NRM'
293291
features += ',grayscale_normalized'
294292
else:
293+
file_id += '.IMG-BIN'
295294
features += ',binarized'
296295
file_path = self.workspace.save_image_file(
297296
bin_image,
298297
file_id,
299298
page_id=page_id,
300-
file_grp=self.image_grp)
299+
file_grp=self.output_file_grp)
301300
# update PAGE (reference the image file):
302301
line.add_AlternativeImage(AlternativeImageType(
303302
filename=file_path,

ocrd_cis/ocropy/clip.py

Lines changed: 15 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,6 @@
3535

3636
TOOL = 'ocrd-cis-ocropy-clip'
3737
LOG = getLogger('processor.OcropyClip')
38-
FALLBACK_FILEGRP_IMG = 'OCR-D-IMG-CLIP'
3938

4039
class OcropyClip(Processor):
4140

@@ -44,13 +43,6 @@ def __init__(self, *args, **kwargs):
4443
kwargs['ocrd_tool'] = self.ocrd_tool['tools'][TOOL]
4544
kwargs['version'] = self.ocrd_tool['version']
4645
super(OcropyClip, self).__init__(*args, **kwargs)
47-
if hasattr(self, 'output_file_grp'):
48-
try:
49-
self.page_grp, self.image_grp = self.output_file_grp.split(',')
50-
except ValueError:
51-
self.page_grp = self.output_file_grp
52-
self.image_grp = FALLBACK_FILEGRP_IMG
53-
LOG.info("No output file group for images specified, falling back to '%s'", FALLBACK_FILEGRP_IMG)
5446

5547
def process(self):
5648
"""Clip text regions / lines of the workspace at intersections with neighbours.
@@ -69,9 +61,9 @@ def process(self):
6961
which are only contained in the neighbour by clipping them to white (background),
7062
and export the (final) result as image file.
7163
72-
Add the new image file to the workspace with the fileGrp USE given
73-
in the second position of the output fileGrp, or ``OCR-D-IMG-CLIP``,
74-
and an ID based on the input file and input element.
64+
Add the new image file to the workspace under the output fileGrp,
65+
and using a file ID with suffix ``.IMG-CLIP`` along with further
66+
identification of the input element.
7567
7668
Reference each new image in the AlternativeImage of the element.
7769
@@ -88,12 +80,15 @@ def process(self):
8880
# deskewing, because that would make segments incommensurable with their
8981
# neighbours.
9082
level = self.parameter['level-of-operation']
83+
assert len(self.output_file_grp.split(',')) == 1, \
84+
"Expected exactly one output file group, but '%s' has %d" % (
85+
self.output_file_grp, len(self.output_file_grp.split(',')))
9186

9287
for (n, input_file) in enumerate(self.input_files):
9388
LOG.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID)
94-
file_id = input_file.ID.replace(self.input_file_grp, self.image_grp)
89+
file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
9590
if file_id == input_file.ID:
96-
file_id = concat_padded(self.image_grp, n)
91+
file_id = concat_padded(self.output_file_grp, n)
9792

9893
pcgts = page_from_file(self.workspace.download_file(input_file))
9994
page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id)
@@ -218,19 +213,19 @@ def process(self):
218213
input_file.pageId, file_id + '_' + region.id + '_' + line.id)
219214

220215
# update METS (add the PAGE file):
221-
file_id = input_file.ID.replace(self.input_file_grp, self.page_grp)
216+
file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
222217
if file_id == input_file.ID:
223-
file_id = concat_padded(self.page_grp, n)
224-
file_path = os.path.join(self.page_grp, file_id + '.xml')
218+
file_id = concat_padded(self.output_file_grp, n)
219+
file_path = os.path.join(self.output_file_grp, file_id + '.xml')
225220
out = self.workspace.add_file(
226221
ID=file_id,
227-
file_grp=self.page_grp,
222+
file_grp=self.output_file_grp,
228223
pageId=input_file.pageId,
229224
local_filename=file_path,
230225
mimetype=MIMETYPE_PAGE,
231226
content=to_xml(pcgts))
232227
LOG.info('created file ID: %s, file_grp: %s, path: %s',
233-
file_id, self.page_grp, out.local_filename)
228+
file_id, self.output_file_grp, out.local_filename)
234229

235230
def process_segment(self, segment, segment_mask, segment_polygon, neighbours,
236231
background_image, parent_image, parent_coords, parent_bin,
@@ -270,9 +265,9 @@ def process_segment(self, segment, segment_mask, segment_polygon, neighbours,
270265
# update METS (add the image file):
271266
file_path = self.workspace.save_image_file(
272267
segment_image,
273-
file_id=file_id,
268+
file_id=file_id + '.IMG-CLIP',
274269
page_id=page_id,
275-
file_grp=self.image_grp)
270+
file_grp=self.output_file_grp)
276271
# update PAGE (reference the image file):
277272
segment.add_AlternativeImage(AlternativeImageType(
278273
filename=file_path,

ocrd_cis/ocropy/denoise.py

Lines changed: 15 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222

2323
TOOL = 'ocrd-cis-ocropy-denoise'
2424
LOG = getLogger('processor.OcropyDenoise')
25-
FALLBACK_FILEGRP_IMG = 'OCR-D-IMG-DESPECK'
2625

2726
class OcropyDenoise(Processor):
2827

@@ -31,13 +30,6 @@ def __init__(self, *args, **kwargs):
3130
kwargs['ocrd_tool'] = self.ocrd_tool['tools'][TOOL]
3231
kwargs['version'] = self.ocrd_tool['version']
3332
super(OcropyDenoise, self).__init__(*args, **kwargs)
34-
if hasattr(self, 'output_file_grp'):
35-
try:
36-
self.page_grp, self.image_grp = self.output_file_grp.split(',')
37-
except ValueError:
38-
self.page_grp = self.output_file_grp
39-
self.image_grp = FALLBACK_FILEGRP_IMG
40-
LOG.info("No output file group for images specified, falling back to '%s'", FALLBACK_FILEGRP_IMG)
4133

4234
def process(self):
4335
"""Despeckle the pages / regions / lines of the workspace.
@@ -52,21 +44,24 @@ def process(self):
5244
smaller than ``noise_maxsize``. Apply results to the image and export
5345
it as an image file.
5446
55-
Add the new image file to the workspace with the fileGrp USE given
56-
in the second position of the output fileGrp, or ``OCR-D-IMG-DESPECK``,
57-
and an ID based on input file and input element.
47+
Add the new image file to the workspace under the output fileGrp,
48+
and using a file ID with suffix ``.IMG-DESPECK`` along with further
49+
identification of the input element.
5850
5951
Reference each new image in the AlternativeImage of the element.
6052
6153
Produce a new output file by serialising the resulting hierarchy.
6254
"""
6355
level = self.parameter['level-of-operation']
56+
assert len(self.output_file_grp.split(',')) == 1, \
57+
"Expected exactly one output file group, but '%s' has %d" % (
58+
self.output_file_grp, len(self.output_file_grp.split(',')))
6459

6560
for (n, input_file) in enumerate(self.input_files):
6661
LOG.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID)
67-
file_id = input_file.ID.replace(self.input_file_grp, self.image_grp)
62+
file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
6863
if file_id == input_file.ID:
69-
file_id = concat_padded(self.image_grp, n)
64+
file_id = concat_padded(self.output_file_grp, n)
7065

7166
pcgts = page_from_file(self.workspace.download_file(input_file))
7267
page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id)
@@ -127,19 +122,19 @@ def process(self):
127122
file_id + '_' + region.id + '_' + line.id)
128123

129124
# update METS (add the PAGE file):
130-
file_id = input_file.ID.replace(self.input_file_grp, self.page_grp)
125+
file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
131126
if file_id == input_file.ID:
132-
file_id = concat_padded(self.page_grp, n)
133-
file_path = os.path.join(self.page_grp, file_id + '.xml')
127+
file_id = concat_padded(self.output_file_grp, n)
128+
file_path = os.path.join(self.output_file_grp, file_id + '.xml')
134129
out = self.workspace.add_file(
135130
ID=file_id,
136-
file_grp=self.page_grp,
131+
file_grp=self.output_file_grp,
137132
pageId=input_file.pageId,
138133
local_filename=file_path,
139134
mimetype=MIMETYPE_PAGE,
140135
content=to_xml(pcgts))
141136
LOG.info('created file ID: %s, file_grp: %s, path: %s',
142-
file_id, self.page_grp, out.local_filename)
137+
file_id, self.output_file_grp, out.local_filename)
143138

144139
def process_segment(self, segment, segment_image, segment_xywh, zoom, page_id, file_id):
145140
LOG.info("About to despeckle '%s'", file_id)
@@ -148,9 +143,9 @@ def process_segment(self, segment, segment_image, segment_xywh, zoom, page_id, f
148143
# update METS (add the image file):
149144
file_path = self.workspace.save_image_file(
150145
bin_image,
151-
file_id,
146+
file_id + '.IMG-DESPECK',
152147
page_id=page_id,
153-
file_grp=self.image_grp)
148+
file_grp=self.output_file_grp)
154149
# update PAGE (reference the image file):
155150
segment.add_AlternativeImage(AlternativeImageType(
156151
filename=file_path,

ocrd_cis/ocropy/deskew.py

Lines changed: 15 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@
2525

2626
TOOL = 'ocrd-cis-ocropy-deskew'
2727
LOG = getLogger('processor.OcropyDeskew')
28-
FALLBACK_FILEGRP_IMG = 'OCR-D-IMG-DESKEW'
2928

3029
def deskew(pil_image, maxskew=2):
3130
array = pil2array(pil_image)
@@ -39,13 +38,6 @@ def __init__(self, *args, **kwargs):
3938
kwargs['ocrd_tool'] = ocrd_tool['tools'][TOOL]
4039
kwargs['version'] = ocrd_tool['version']
4140
super(OcropyDeskew, self).__init__(*args, **kwargs)
42-
if hasattr(self, 'output_file_grp'):
43-
try:
44-
self.page_grp, self.image_grp = self.output_file_grp.split(',')
45-
except ValueError:
46-
self.page_grp = self.output_file_grp
47-
self.image_grp = FALLBACK_FILEGRP_IMG
48-
LOG.info("No output file group for images specified, falling back to '%s'", FALLBACK_FILEGRP_IMG)
4941

5042
def process(self):
5143
"""Deskew the regions of the workspace.
@@ -59,19 +51,22 @@ def process(self):
5951
the deskewing angle of the region (up to ``maxskew``). Annotate the
6052
angle in the region.
6153
62-
Add a new image file to the workspace with the fileGrp USE given
63-
in the second position of the output fileGrp, or ``OCR-D-IMG-DESKEW``,
64-
and an ID based on input file and input element.
54+
Add the new image file to the workspace under the output fileGrp,
55+
and using a file ID with suffix ``.IMG-DESKEW`` along with further
56+
identification of the input element.
6557
6658
Produce a new output file by serialising the resulting hierarchy.
6759
"""
6860
level = self.parameter['level-of-operation']
61+
assert len(self.output_file_grp.split(',')) == 1, \
62+
"Expected exactly one output file group, but '%s' has %d" % (
63+
self.output_file_grp, len(self.output_file_grp.split(',')))
6964

7065
for (n, input_file) in enumerate(self.input_files):
7166
LOG.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID)
72-
file_id = input_file.ID.replace(self.input_file_grp, self.image_grp)
67+
file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
7368
if file_id == input_file.ID:
74-
file_id = concat_padded(self.image_grp, n)
69+
file_id = concat_padded(self.output_file_grp, n)
7570

7671
pcgts = page_from_file(self.workspace.download_file(input_file))
7772
page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id)
@@ -117,19 +112,19 @@ def process(self):
117112
file_id + '_' + region.id)
118113

119114
# update METS (add the PAGE file):
120-
file_id = input_file.ID.replace(self.input_file_grp, self.page_grp)
115+
file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
121116
if file_id == input_file.ID:
122-
file_id = concat_padded(self.page_grp, n)
123-
file_path = os.path.join(self.page_grp, file_id + '.xml')
117+
file_id = concat_padded(self.output_file_grp, n)
118+
file_path = os.path.join(self.output_file_grp, file_id + '.xml')
124119
out = self.workspace.add_file(
125120
ID=file_id,
126-
file_grp=self.page_grp,
121+
file_grp=self.output_file_grp,
127122
pageId=input_file.pageId,
128123
local_filename=file_path,
129124
mimetype=MIMETYPE_PAGE,
130125
content=to_xml(pcgts))
131126
LOG.info('created file ID: %s, file_grp: %s, path: %s',
132-
file_id, self.page_grp, out.local_filename)
127+
file_id, self.output_file_grp, out.local_filename)
133128

134129
def _process_segment(self, segment, segment_image, segment_coords, segment_id, page_id, file_id):
135130
angle0 = segment_coords['angle'] # deskewing (w.r.t. top image) already applied to segment_image
@@ -149,9 +144,9 @@ def _process_segment(self, segment, segment_image, segment_coords, segment_id, p
149144
# update METS (add the image file):
150145
file_path = self.workspace.save_image_file(
151146
segment_image,
152-
file_id,
147+
file_id + '.IMG-DESKEW',
153148
page_id=page_id,
154-
file_grp=self.image_grp)
149+
file_grp=self.output_file_grp)
155150
# update PAGE (reference the image file):
156151
segment.add_AlternativeImage(AlternativeImageType(
157152
filename=file_path,

0 commit comments

Comments
 (0)