31
31
32
32
TOOL = 'ocrd-cis-ocropy-binarize'
33
33
LOG = getLogger ('processor.OcropyBinarize' )
34
- FALLBACK_FILEGRP_IMG = 'OCR-D-IMG-BIN'
35
34
36
35
def binarize (pil_image , method = 'ocropy' , maxskew = 2 , threshold = 0.5 , nrm = False ):
37
36
LOG .debug ('binarizing %dx%d image with method=%s' , pil_image .width , pil_image .height , method )
@@ -81,12 +80,6 @@ def __init__(self, *args, **kwargs):
81
80
LOG .critical ('requested method %s does not support grayscale normalized output' ,
82
81
self .parameter ['method' ])
83
82
raise Exception ('only method=ocropy allows grayscale=true' )
84
- try :
85
- self .page_grp , self .image_grp = self .output_file_grp .split (',' )
86
- except ValueError :
87
- self .page_grp = self .output_file_grp
88
- self .image_grp = FALLBACK_FILEGRP_IMG
89
- LOG .info ("No output file group for images specified, falling back to '%s'" , FALLBACK_FILEGRP_IMG )
90
83
91
84
def process (self ):
92
85
"""Binarize (and optionally deskew/despeckle) the pages/regions/lines of the workspace.
@@ -102,21 +95,24 @@ def process(self):
102
95
by removing connected components smaller than ``noise_maxsize``.
103
96
Finally, apply results to the image and export it as an image file.
104
97
105
- Add the new image file to the workspace with the fileGrp USE given
106
- in the second position of the output fileGrp, or ``OCR-D-IMG-BIN``,
107
- and an ID based on input file and input element.
98
+ Add the new image file to the workspace along with the output fileGrp,
99
+ and using a file ID with suffix ``.IMG-BIN`` along with further
100
+ identification of the input element.
108
101
109
102
Reference each new image in the AlternativeImage of the element.
110
103
111
104
Produce a new output file by serialising the resulting hierarchy.
112
105
"""
113
106
level = self .parameter ['level-of-operation' ]
107
+ assert len (self .output_file_grp .split (',' )) == 1 , \
108
+ "Expected exactly one output file group, but '%s' has %d" % (
109
+ self .output_file_grp , len (self .output_file_grp .split (',' )))
114
110
115
111
for (n , input_file ) in enumerate (self .input_files ):
116
112
LOG .info ("INPUT FILE %i / %s" , n , input_file .pageId or input_file .ID )
117
- file_id = input_file .ID .replace (self .input_file_grp , self .image_grp )
113
+ file_id = input_file .ID .replace (self .input_file_grp , self .output_file_grp )
118
114
if file_id == input_file .ID :
119
- file_id = concat_padded (self .image_grp , n )
115
+ file_id = concat_padded (self .output_file_grp , n )
120
116
121
117
pcgts = page_from_file (self .workspace .download_file (input_file ))
122
118
page_id = pcgts .pcGtsId or input_file .pageId or input_file .ID # (PageType has no id)
@@ -164,19 +160,19 @@ def process(self):
164
160
file_id + '_' + region .id + '_' + line .id )
165
161
166
162
# update METS (add the PAGE file):
167
- file_id = input_file .ID .replace (self .input_file_grp , self .page_grp )
163
+ file_id = input_file .ID .replace (self .input_file_grp , self .output_file_grp )
168
164
if file_id == input_file .ID :
169
- file_id = concat_padded (self .page_grp , n )
170
- file_path = os .path .join (self .page_grp , file_id + '.xml' )
165
+ file_id = concat_padded (self .output_file_grp , n )
166
+ file_path = os .path .join (self .output_file_grp , file_id + '.xml' )
171
167
out = self .workspace .add_file (
172
168
ID = file_id ,
173
- file_grp = self .page_grp ,
169
+ file_grp = self .output_file_grp ,
174
170
pageId = input_file .pageId ,
175
171
local_filename = file_path ,
176
172
mimetype = MIMETYPE_PAGE ,
177
173
content = to_xml (pcgts ))
178
174
LOG .info ('created file ID: %s, file_grp: %s, path: %s' ,
179
- file_id , self .page_grp , out .local_filename )
175
+ file_id , self .output_file_grp , out .local_filename )
180
176
181
177
def process_page (self , page , page_image , page_xywh , page_id , file_id ):
182
178
LOG .info ("About to binarize page '%s'" , page_id )
@@ -207,15 +203,16 @@ def process_page(self, page, page_image, page_xywh, page_id, file_id):
207
203
page .set_orientation (orientation )
208
204
# update METS (add the image file):
209
205
if self .parameter ['grayscale' ]:
210
- file_id += '.nrm '
206
+ file_id += '.IMG-NRM '
211
207
features += ',grayscale_normalized'
212
208
else :
209
+ file_id += '.IMG-BIN'
213
210
features += ',binarized'
214
211
file_path = self .workspace .save_image_file (
215
212
bin_image ,
216
213
file_id ,
217
214
page_id = page_id ,
218
- file_grp = self .image_grp )
215
+ file_grp = self .output_file_grp )
219
216
# update PAGE (reference the image file):
220
217
page .add_AlternativeImage (AlternativeImageType (
221
218
filename = file_path ,
@@ -251,15 +248,16 @@ def process_region(self, region, region_image, region_xywh, page_id, file_id):
251
248
region .set_orientation (orientation )
252
249
# update METS (add the image file):
253
250
if self .parameter ['grayscale' ]:
254
- file_id += '.nrm '
251
+ file_id += '.IMG-NRM '
255
252
features += ',grayscale_normalized'
256
253
else :
254
+ file_id += '.IMG-BIN'
257
255
features += ',binarized'
258
256
file_path = self .workspace .save_image_file (
259
257
bin_image ,
260
258
file_id ,
261
259
page_id = page_id ,
262
- file_grp = self .image_grp )
260
+ file_grp = self .output_file_grp )
263
261
# update PAGE (reference the image file):
264
262
region .add_AlternativeImage (AlternativeImageType (
265
263
filename = file_path ,
@@ -289,15 +287,16 @@ def process_line(self, line, line_image, line_xywh, page_id, region_id, file_id)
289
287
features += ',despeckled'
290
288
# update METS (add the image file):
291
289
if self .parameter ['grayscale' ]:
292
- file_id += '.nrm '
290
+ file_id += '.IMG-NRM '
293
291
features += ',grayscale_normalized'
294
292
else :
293
+ file_id += '.IMG-BIN'
295
294
features += ',binarized'
296
295
file_path = self .workspace .save_image_file (
297
296
bin_image ,
298
297
file_id ,
299
298
page_id = page_id ,
300
- file_grp = self .image_grp )
299
+ file_grp = self .output_file_grp )
301
300
# update PAGE (reference the image file):
302
301
line .add_AlternativeImage (AlternativeImageType (
303
302
filename = file_path ,
0 commit comments