Skip to content

Commit a6f90ab

Browse files
authored
Merge pull request #71 from bertsky/fix-getlogger-common
update to new core
2 parents fb6cb47 + 3a10fd9 commit a6f90ab

File tree

15 files changed

+129
-232
lines changed

15 files changed

+129
-232
lines changed

ocrd_cis/ocrd-tool.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"git_url": "https://github.com/cisocrgroup/ocrd_cis",
3-
"version": "0.1.3",
3+
"version": "0.1.4",
44
"tools": {
55
"ocrd-cis-ocropy-binarize": {
66
"executable": "ocrd-cis-ocropy-binarize",
@@ -61,7 +61,7 @@
6161
},
6262
"level-of-operation": {
6363
"type": "string",
64-
"enum": ["page", "region", "line"],
64+
"enum": ["page", "table", "region", "line"],
6565
"description": "PAGE XML hierarchy level granularity to annotate images for",
6666
"default": "page"
6767
}
@@ -92,7 +92,7 @@
9292
},
9393
"level-of-operation": {
9494
"type": "string",
95-
"enum": ["page", "region"],
95+
"enum": ["page", "table", "region"],
9696
"description": "PAGE XML hierarchy level granularity to annotate images for",
9797
"default": "region"
9898
}

ocrd_cis/ocropy/binarize.py

Lines changed: 27 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,6 @@
1515
)
1616
from ocrd_modelfactory import page_from_file
1717
from ocrd_models.ocrd_page import (
18-
MetadataItemType,
19-
LabelsType, LabelType,
2018
to_xml, AlternativeImageType
2119
)
2220
from ocrd import Processor
@@ -76,12 +74,16 @@ def __init__(self, *args, **kwargs):
7674
kwargs['ocrd_tool'] = self.ocrd_tool['tools'][TOOL]
7775
kwargs['version'] = self.ocrd_tool['version']
7876
super(OcropyBinarize, self).__init__(*args, **kwargs)
79-
LOG = getLogger('processor.OcropyBinarize')
8077
if hasattr(self, 'output_file_grp'):
81-
if self.parameter['grayscale'] and self.parameter['method'] != 'ocropy':
82-
LOG.critical('requested method %s does not support grayscale normalized output',
83-
self.parameter['method'])
84-
raise Exception('only method=ocropy allows grayscale=true')
78+
# processing context
79+
self.setup()
80+
81+
def setup(self):
82+
self.logger = getLogger('processor.OcropyBinarize')
83+
if self.parameter['grayscale'] and self.parameter['method'] != 'ocropy':
84+
self.logger.critical('requested method %s does not support grayscale normalized output',
85+
self.parameter['method'])
86+
raise Exception('only method=ocropy allows grayscale=true')
8587

8688
def process(self):
8789
"""Binarize (and optionally deskew/despeckle) the pages/regions/lines of the workspace.
@@ -105,31 +107,18 @@ def process(self):
105107
106108
Produce a new output file by serialising the resulting hierarchy.
107109
"""
108-
LOG = getLogger('processor.OcropyBinarize')
109110
level = self.parameter['level-of-operation']
110111
assert_file_grp_cardinality(self.input_file_grp, 1)
111112
assert_file_grp_cardinality(self.output_file_grp, 1)
112113

113114
for (n, input_file) in enumerate(self.input_files):
114-
LOG.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID)
115+
self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID)
115116
file_id = make_file_id(input_file, self.output_file_grp)
116117

117118
pcgts = page_from_file(self.workspace.download_file(input_file))
119+
self.add_metadata(pcgts)
118120
page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id)
119121
page = pcgts.get_Page()
120-
121-
# add metadata about this operation and its runtime parameters:
122-
metadata = pcgts.get_Metadata() # ensured by from_file()
123-
metadata.add_MetadataItem(
124-
MetadataItemType(type_="processingStep",
125-
name=self.ocrd_tool['steps'][0],
126-
value=TOOL,
127-
Labels=[LabelsType(
128-
externalModel="ocrd-tool",
129-
externalId="parameters",
130-
Label=[LabelType(type_=name,
131-
value=self.parameter[name])
132-
for name in self.parameter.keys()])]))
133122

134123
page_image, page_xywh, page_image_info = self.workspace.image_from_page(
135124
page, page_id, feature_filter='binarized')
@@ -139,7 +128,7 @@ def process(self):
139128
dpi = page_image_info.resolution
140129
if page_image_info.resolutionUnit == 'cm':
141130
dpi *= 2.54
142-
LOG.info('Page "%s" uses %f DPI', page_id, dpi)
131+
self.logger.info('Page "%s" uses %f DPI', page_id, dpi)
143132
zoom = 300.0/dpi
144133
else:
145134
zoom = 1
@@ -148,10 +137,12 @@ def process(self):
148137
self.process_page(page, page_image, page_xywh, zoom,
149138
input_file.pageId, file_id)
150139
else:
151-
regions = page.get_TextRegion() + (
152-
page.get_TableRegion() if level == 'region' else [])
140+
if level == 'table':
141+
regions = page.get_TableRegion()
142+
else: # region or line
143+
regions = page.get_AllRegions(classes=['Text'])
153144
if not regions:
154-
LOG.warning('Page "%s" contains no text regions', page_id)
145+
self.logger.warning('Page "%s" contains no text regions', page_id)
155146
for region in regions:
156147
region_image, region_xywh = self.workspace.image_from_segment(
157148
region, page_image, page_xywh, feature_filter='binarized')
@@ -161,7 +152,8 @@ def process(self):
161152
continue
162153
lines = region.get_TextLine()
163154
if not lines:
164-
LOG.warning('Page "%s" region "%s" contains no text lines', page_id, region.id)
155+
self.logger.warning('Page "%s" region "%s" contains no text lines',
156+
page_id, region.id)
165157
for line in lines:
166158
line_image, line_xywh = self.workspace.image_from_segment(
167159
line, region_image, region_xywh, feature_filter='binarized')
@@ -179,12 +171,11 @@ def process(self):
179171
local_filename=file_path,
180172
mimetype=MIMETYPE_PAGE,
181173
content=to_xml(pcgts))
182-
LOG.info('created file ID: %s, file_grp: %s, path: %s',
183-
file_id, self.output_file_grp, out.local_filename)
174+
self.logger.info('created file ID: %s, file_grp: %s, path: %s',
175+
file_id, self.output_file_grp, out.local_filename)
184176

185177
def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id):
186-
LOG = getLogger('processor.OcropyBinarize')
187-
LOG.info("About to binarize page '%s'", page_id)
178+
self.logger.info("About to binarize page '%s'", page_id)
188179
features = page_xywh['features']
189180
if 'angle' in page_xywh and page_xywh['angle']:
190181
# orientation has already been annotated (by previous deskewing),
@@ -229,8 +220,7 @@ def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id):
229220
comments=features))
230221

231222
def process_region(self, region, region_image, region_xywh, zoom, page_id, file_id):
232-
LOG = getLogger('processor.OcropyBinarize')
233-
LOG.info("About to binarize page '%s' region '%s'", page_id, region.id)
223+
self.logger.info("About to binarize page '%s' region '%s'", page_id, region.id)
234224
features = region_xywh['features']
235225
if 'angle' in region_xywh and region_xywh['angle']:
236226
# orientation has already been annotated (by previous deskewing),
@@ -277,9 +267,8 @@ def process_region(self, region, region_image, region_xywh, zoom, page_id, file_
277267
comments=features))
278268

279269
def process_line(self, line, line_image, line_xywh, zoom, page_id, region_id, file_id):
280-
LOG = getLogger('processor.OcropyBinarize')
281-
LOG.info("About to binarize page '%s' region '%s' line '%s'",
282-
page_id, region_id, line.id)
270+
self.logger.info("About to binarize page '%s' region '%s' line '%s'",
271+
page_id, region_id, line.id)
283272
features = line_xywh['features']
284273
bin_image, angle = binarize(line_image,
285274
method=self.parameter['method'],
@@ -294,8 +283,8 @@ def process_line(self, line, line_image, line_xywh, zoom, page_id, region_id, fi
294283
#orientation = -angle
295284
#orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180]
296285
#line.set_orientation(orientation) # does not exist on line level!
297-
LOG.warning("cannot add orientation %.2f to page '%s' region '%s' line '%s'",
298-
-angle, page_id, region_id, line.id)
286+
self.logger.warning("cannot add orientation %.2f to page '%s' region '%s' line '%s'",
287+
-angle, page_id, region_id, line.id)
299288
bin_image = remove_noise(bin_image,
300289
maxsize=self.parameter['noise_maxsize'])
301290
if self.parameter['noise_maxsize']:

ocrd_cis/ocropy/clip.py

Lines changed: 7 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,6 @@
99

1010
from ocrd_modelfactory import page_from_file
1111
from ocrd_models.ocrd_page import (
12-
MetadataItemType,
13-
LabelsType, LabelType,
1412
to_xml, AlternativeImageType
1513
)
1614
from ocrd import Processor
@@ -89,22 +87,10 @@ def process(self):
8987
file_id = make_file_id(input_file, self.output_file_grp)
9088

9189
pcgts = page_from_file(self.workspace.download_file(input_file))
90+
self.add_metadata(pcgts)
9291
page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id)
9392
page = pcgts.get_Page()
9493

95-
# add metadata about this operation and its runtime parameters:
96-
metadata = pcgts.get_Metadata() # ensured by from_file()
97-
metadata.add_MetadataItem(
98-
MetadataItemType(type_="processingStep",
99-
name=self.ocrd_tool['steps'][0],
100-
value=TOOL,
101-
Labels=[LabelsType(
102-
externalModel="ocrd-tool",
103-
externalId="parameters",
104-
Label=[LabelType(type_=name,
105-
value=self.parameter[name])
106-
for name in self.parameter.keys()])]))
107-
10894
page_image, page_coords, page_image_info = self.workspace.image_from_page(
10995
page, page_id, feature_selector='binarized')
11096
if self.parameter['dpi'] > 0:
@@ -135,7 +121,12 @@ def process(self):
135121
page.get_UnknownRegion())
136122
if not num_texts:
137123
LOG.warning('Page "%s" contains no text regions', page_id)
138-
background = ImageStat.Stat(page_image).median[0]
124+
background = ImageStat.Stat(page_image)
125+
# workaround for Pillow#4925
126+
if len(background.bands) > 1:
127+
background = tuple(background.median)
128+
else:
129+
background = background.median[0]
139130
if level == 'region':
140131
background_image = Image.new('L', page_image.size, background)
141132
page_array = pil2array(page_image)

ocrd_cis/ocropy/common.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,7 @@
1414
# for decorators (type-checks etc):
1515
from .ocrolib.toplevel import *
1616

17-
from ocrd_utils import getLogger
18-
19-
LOG = getLogger('ocrolib') # to be refined by importer
17+
LOG = logging.getLogger('ocrolib') # to be refined by importer
2018

2119
# method similar to ocrolib.read_image_gray
2220
@checks(Image.Image)

ocrd_cis/ocropy/denoise.py

Lines changed: 2 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,6 @@
1010
)
1111
from ocrd_modelfactory import page_from_file
1212
from ocrd_models.ocrd_page import (
13-
MetadataItemType,
14-
LabelsType, LabelType,
1513
to_xml, AlternativeImageType
1614
)
1715
from ocrd import Processor
@@ -62,21 +60,9 @@ def process(self):
6260
file_id = make_file_id(input_file, self.output_file_grp)
6361

6462
pcgts = page_from_file(self.workspace.download_file(input_file))
63+
self.add_metadata(pcgts)
6564
page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id)
6665
page = pcgts.get_Page()
67-
68-
# add metadata about this operation and its runtime parameters:
69-
metadata = pcgts.get_Metadata() # ensured by from_file()
70-
metadata.add_MetadataItem(
71-
MetadataItemType(type_="processingStep",
72-
name=self.ocrd_tool['steps'][0],
73-
value=TOOL,
74-
Labels=[LabelsType(
75-
externalModel="ocrd-tool",
76-
externalId="parameters",
77-
Label=[LabelType(type_=name,
78-
value=self.parameter[name])
79-
for name in self.parameter.keys()])]))
8066

8167
page_image, page_xywh, page_image_info = self.workspace.image_from_page(
8268
page, page_id,
@@ -96,8 +82,7 @@ def process(self):
9682
self.process_segment(page, page_image, page_xywh, zoom,
9783
input_file.pageId, file_id)
9884
else:
99-
regions = page.get_TextRegion() + (
100-
page.get_TableRegion() if level == 'region' else [])
85+
regions = page.get_AllRegions(classes=['Text'])
10186
if not regions:
10287
LOG.warning('Page "%s" contains no text regions', page_id)
10388
for region in regions:

ocrd_cis/ocropy/deskew.py

Lines changed: 5 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,6 @@
1111
)
1212
from ocrd_modelfactory import page_from_file
1313
from ocrd_models.ocrd_page import (
14-
MetadataItemType,
15-
LabelsType, LabelType,
1614
to_xml, AlternativeImageType
1715
)
1816
from ocrd import Processor
@@ -68,21 +66,9 @@ def process(self):
6866
file_id = make_file_id(input_file, self.output_file_grp)
6967

7068
pcgts = page_from_file(self.workspace.download_file(input_file))
69+
self.add_metadata(pcgts)
7170
page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id)
7271
page = pcgts.get_Page()
73-
74-
# add metadata about this operation and its runtime parameters:
75-
metadata = pcgts.get_Metadata() # ensured by from_file()
76-
metadata.add_MetadataItem(
77-
MetadataItemType(type_="processingStep",
78-
name=self.ocrd_tool['steps'][0],
79-
value=TOOL,
80-
Labels=[LabelsType(
81-
externalModel="ocrd-tool",
82-
externalId="parameters",
83-
Label=[LabelType(type_=name,
84-
value=self.parameter[name])
85-
for name in self.parameter.keys()])]))
8672

8773
page_image, page_coords, _ = self.workspace.image_from_page(
8874
page, page_id,
@@ -95,7 +81,10 @@ def process(self):
9581
"page '%s'" % page_id, input_file.pageId,
9682
file_id)
9783
else:
98-
regions = page.get_TextRegion()
84+
if level == 'table':
85+
regions = page.get_TableRegion()
86+
else: # region
87+
regions = page.get_AllRegions(classes=['Text'])
9988
if not regions:
10089
LOG.warning('Page "%s" contains no text regions', page_id)
10190
for region in regions:

0 commit comments

Comments
 (0)