Skip to content

Commit 4f3947d

Browse files
author
Robert Sachunsky
committed
ocropy processors: rebase on new core (recursive region enumerator, add_metadata)
1 parent db55366 commit 4f3947d

File tree

9 files changed

+22 additions, -130 deletions (lines changed)

9 files changed

+22 additions, -130 deletions (lines changed)

ocrd_cis/ocrd-tool.json

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -61,7 +61,7 @@
6161
},
6262
"level-of-operation": {
6363
"type": "string",
64-
"enum": ["page", "region", "line"],
64+
"enum": ["page", "table", "region", "line"],
6565
"description": "PAGE XML hierarchy level granularity to annotate images for",
6666
"default": "page"
6767
}
@@ -92,7 +92,7 @@
9292
},
9393
"level-of-operation": {
9494
"type": "string",
95-
"enum": ["page", "region"],
95+
"enum": ["page", "table", "region"],
9696
"description": "PAGE XML hierarchy level granularity to annotate images for",
9797
"default": "region"
9898
}

ocrd_cis/ocropy/binarize.py

Lines changed: 5 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,6 @@
1515
)
1616
from ocrd_modelfactory import page_from_file
1717
from ocrd_models.ocrd_page import (
18-
MetadataItemType,
19-
LabelsType, LabelType,
2018
to_xml, AlternativeImageType
2119
)
2220
from ocrd import Processor
@@ -118,21 +116,9 @@ def process(self):
118116
file_id = make_file_id(input_file, self.output_file_grp)
119117

120118
pcgts = page_from_file(self.workspace.download_file(input_file))
119+
self.add_metadata(pcgts)
121120
page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id)
122121
page = pcgts.get_Page()
123-
124-
# add metadata about this operation and its runtime parameters:
125-
metadata = pcgts.get_Metadata() # ensured by from_file()
126-
metadata.add_MetadataItem(
127-
MetadataItemType(type_="processingStep",
128-
name=self.ocrd_tool['steps'][0],
129-
value=TOOL,
130-
Labels=[LabelsType(
131-
externalModel="ocrd-tool",
132-
externalId="parameters",
133-
Label=[LabelType(type_=name,
134-
value=self.parameter[name])
135-
for name in self.parameter.keys()])]))
136122

137123
page_image, page_xywh, page_image_info = self.workspace.image_from_page(
138124
page, page_id, feature_filter='binarized')
@@ -151,8 +137,10 @@ def process(self):
151137
self.process_page(page, page_image, page_xywh, zoom,
152138
input_file.pageId, file_id)
153139
else:
154-
regions = page.get_TextRegion() + (
155-
page.get_TableRegion() if level == 'region' else [])
140+
if level == 'table':
141+
regions = page.get_TableRegion()
142+
else: # region
143+
regions = page.get_AllRegions(classes=['Text'])
156144
if not regions:
157145
self.logger.warning('Page "%s" contains no text regions', page_id)
158146
for region in regions:

ocrd_cis/ocropy/clip.py

Lines changed: 1 addition & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,6 @@
99

1010
from ocrd_modelfactory import page_from_file
1111
from ocrd_models.ocrd_page import (
12-
MetadataItemType,
13-
LabelsType, LabelType,
1412
to_xml, AlternativeImageType
1513
)
1614
from ocrd import Processor
@@ -89,22 +87,10 @@ def process(self):
8987
file_id = make_file_id(input_file, self.output_file_grp)
9088

9189
pcgts = page_from_file(self.workspace.download_file(input_file))
90+
self.add_metadata(pcgts)
9291
page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id)
9392
page = pcgts.get_Page()
9493

95-
# add metadata about this operation and its runtime parameters:
96-
metadata = pcgts.get_Metadata() # ensured by from_file()
97-
metadata.add_MetadataItem(
98-
MetadataItemType(type_="processingStep",
99-
name=self.ocrd_tool['steps'][0],
100-
value=TOOL,
101-
Labels=[LabelsType(
102-
externalModel="ocrd-tool",
103-
externalId="parameters",
104-
Label=[LabelType(type_=name,
105-
value=self.parameter[name])
106-
for name in self.parameter.keys()])]))
107-
10894
page_image, page_coords, page_image_info = self.workspace.image_from_page(
10995
page, page_id, feature_selector='binarized')
11096
if self.parameter['dpi'] > 0:

ocrd_cis/ocropy/denoise.py

Lines changed: 2 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,6 @@
1010
)
1111
from ocrd_modelfactory import page_from_file
1212
from ocrd_models.ocrd_page import (
13-
MetadataItemType,
14-
LabelsType, LabelType,
1513
to_xml, AlternativeImageType
1614
)
1715
from ocrd import Processor
@@ -62,21 +60,9 @@ def process(self):
6260
file_id = make_file_id(input_file, self.output_file_grp)
6361

6462
pcgts = page_from_file(self.workspace.download_file(input_file))
63+
self.add_metadata(pcgts)
6564
page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id)
6665
page = pcgts.get_Page()
67-
68-
# add metadata about this operation and its runtime parameters:
69-
metadata = pcgts.get_Metadata() # ensured by from_file()
70-
metadata.add_MetadataItem(
71-
MetadataItemType(type_="processingStep",
72-
name=self.ocrd_tool['steps'][0],
73-
value=TOOL,
74-
Labels=[LabelsType(
75-
externalModel="ocrd-tool",
76-
externalId="parameters",
77-
Label=[LabelType(type_=name,
78-
value=self.parameter[name])
79-
for name in self.parameter.keys()])]))
8066

8167
page_image, page_xywh, page_image_info = self.workspace.image_from_page(
8268
page, page_id,
@@ -96,8 +82,7 @@ def process(self):
9682
self.process_segment(page, page_image, page_xywh, zoom,
9783
input_file.pageId, file_id)
9884
else:
99-
regions = page.get_TextRegion() + (
100-
page.get_TableRegion() if level == 'region' else [])
85+
regions = page.get_AllRegions(classes=['Text'])
10186
if not regions:
10287
LOG.warning('Page "%s" contains no text regions', page_id)
10388
for region in regions:

ocrd_cis/ocropy/deskew.py

Lines changed: 5 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,6 @@
1111
)
1212
from ocrd_modelfactory import page_from_file
1313
from ocrd_models.ocrd_page import (
14-
MetadataItemType,
15-
LabelsType, LabelType,
1614
to_xml, AlternativeImageType
1715
)
1816
from ocrd import Processor
@@ -68,21 +66,9 @@ def process(self):
6866
file_id = make_file_id(input_file, self.output_file_grp)
6967

7068
pcgts = page_from_file(self.workspace.download_file(input_file))
69+
self.add_metadata(pcgts)
7170
page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id)
7271
page = pcgts.get_Page()
73-
74-
# add metadata about this operation and its runtime parameters:
75-
metadata = pcgts.get_Metadata() # ensured by from_file()
76-
metadata.add_MetadataItem(
77-
MetadataItemType(type_="processingStep",
78-
name=self.ocrd_tool['steps'][0],
79-
value=TOOL,
80-
Labels=[LabelsType(
81-
externalModel="ocrd-tool",
82-
externalId="parameters",
83-
Label=[LabelType(type_=name,
84-
value=self.parameter[name])
85-
for name in self.parameter.keys()])]))
8672

8773
page_image, page_coords, _ = self.workspace.image_from_page(
8874
page, page_id,
@@ -95,7 +81,10 @@ def process(self):
9581
"page '%s'" % page_id, input_file.pageId,
9682
file_id)
9783
else:
98-
regions = page.get_TextRegion()
84+
if level == 'table':
85+
regions = page.get_TableRegion()
86+
else: # region
87+
regions = page.get_AllRegions(classes=['Text'])
9988
if not regions:
10089
LOG.warning('Page "%s" contains no text regions', page_id)
10190
for region in regions:

ocrd_cis/ocropy/dewarp.py

Lines changed: 2 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,6 @@
1010
)
1111
from ocrd_modelfactory import page_from_file
1212
from ocrd_models.ocrd_page import (
13-
MetadataItemType,
14-
LabelsType, LabelType,
1513
to_xml, AlternativeImageType
1614
)
1715
from ocrd import Processor
@@ -112,22 +110,10 @@ def process(self):
112110
file_id = make_file_id(input_file, self.output_file_grp)
113111

114112
pcgts = page_from_file(self.workspace.download_file(input_file))
113+
self.add_metadata(pcgts)
115114
page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id)
116115
page = pcgts.get_Page()
117116

118-
# add metadata about this operation and its runtime parameters:
119-
metadata = pcgts.get_Metadata() # ensured by from_file()
120-
metadata.add_MetadataItem(
121-
MetadataItemType(type_="processingStep",
122-
name=self.ocrd_tool['steps'][0],
123-
value=TOOL,
124-
Labels=[LabelsType(
125-
externalModel="ocrd-tool",
126-
externalId="parameters",
127-
Label=[LabelType(type_=name,
128-
value=self.parameter[name])
129-
for name in self.parameter.keys()])]))
130-
131117
page_image, page_xywh, page_image_info = self.workspace.image_from_page(
132118
page, page_id)
133119
if self.parameter['dpi'] > 0:
@@ -141,7 +127,7 @@ def process(self):
141127
else:
142128
zoom = 1
143129

144-
regions = page.get_TextRegion()
130+
regions = page.get_AllRegions(classes=['Text'])
145131
if not regions:
146132
self.logger.warning('Page "%s" contains no text regions', page_id)
147133
for region in regions:

ocrd_cis/ocropy/recognize.py

Lines changed: 2 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,6 @@
1717
)
1818
from ocrd_modelfactory import page_from_file
1919
from ocrd_models.ocrd_page import (
20-
MetadataItemType,
21-
LabelsType, LabelType,
2220
to_xml, TextEquivType,
2321
CoordsType, GlyphType, WordType
2422
)
@@ -151,28 +149,16 @@ def process(self):
151149
for (n, input_file) in enumerate(self.input_files):
152150
self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID)
153151
pcgts = page_from_file(self.workspace.download_file(input_file))
152+
self.add_metadata(pcgts)
154153
page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id)
155154
page = pcgts.get_Page()
156155

157-
# add metadata about this operation and its runtime parameters:
158-
metadata = pcgts.get_Metadata() # ensured by from_file()
159-
metadata.add_MetadataItem(
160-
MetadataItemType(type_="processingStep",
161-
name=self.ocrd_tool['steps'][0],
162-
value=TOOL,
163-
Labels=[LabelsType(
164-
externalModel="ocrd-tool",
165-
externalId="parameters",
166-
Label=[LabelType(type_=name,
167-
value=self.parameter[name])
168-
for name in self.parameter.keys()])]))
169-
170156
page_image, page_coords, _ = self.workspace.image_from_page(
171157
page, page_id)
172158

173159
self.logger.info("Recognizing text in page '%s'", page_id)
174160
# region, line, word, or glyph level:
175-
regions = page.get_TextRegion()
161+
regions = page.get_AllRegions(classes=['Text'])
176162
if not regions:
177163
self.logger.warning("Page '%s' contains no text regions", page_id)
178164
self.process_regions(regions, maxlevel, page_image, page_coords)

ocrd_cis/ocropy/resegment.py

Lines changed: 2 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,6 @@
99

1010
from ocrd_modelfactory import page_from_file
1111
from ocrd_models.ocrd_page import (
12-
MetadataItemType,
13-
LabelsType, LabelType,
1412
to_xml, AlternativeImageType
1513
)
1614
from ocrd import Processor
@@ -163,22 +161,10 @@ def process(self):
163161
file_id = make_file_id(input_file, self.output_file_grp)
164162

165163
pcgts = page_from_file(self.workspace.download_file(input_file))
164+
self.add_metadata(pcgts)
166165
page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id)
167166
page = pcgts.get_Page()
168167

169-
# add metadata about this operation and its runtime parameters:
170-
metadata = pcgts.get_Metadata() # ensured by from_file()
171-
metadata.add_MetadataItem(
172-
MetadataItemType(type_="processingStep",
173-
name=self.ocrd_tool['steps'][0],
174-
value=TOOL,
175-
Labels=[LabelsType(
176-
externalModel="ocrd-tool",
177-
externalId="parameters",
178-
Label=[LabelType(type_=name,
179-
value=self.parameter[name])
180-
for name in self.parameter.keys()])]))
181-
182168
page_image, page_xywh, page_image_info = self.workspace.image_from_page(
183169
page, page_id, feature_selector='binarized')
184170
if self.parameter['dpi'] > 0:
@@ -192,7 +178,7 @@ def process(self):
192178
else:
193179
zoom = 1
194180

195-
regions = page.get_TextRegion()
181+
regions = page.get_AllRegions(classes=['Text'])
196182
if not regions:
197183
LOG.warning('Page "%s" contains no text regions', page_id)
198184
for region in regions:

ocrd_cis/ocropy/segment.py

Lines changed: 1 addition & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,6 @@
1111

1212
from ocrd_modelfactory import page_from_file
1313
from ocrd_models.ocrd_page import (
14-
MetadataItemType,
15-
LabelsType, LabelType,
1614
to_xml, CoordsType,
1715
TextLineType,
1816
TextRegionType,
@@ -216,22 +214,10 @@ def process(self):
216214
file_id = make_file_id(input_file, self.output_file_grp)
217215

218216
pcgts = page_from_file(self.workspace.download_file(input_file))
217+
self.add_metadata(pcgts)
219218
page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id)
220219
page = pcgts.get_Page()
221220

222-
# add metadata about this operation and its runtime parameters:
223-
metadata = pcgts.get_Metadata() # ensured by from_file()
224-
metadata.add_MetadataItem(
225-
MetadataItemType(type_="processingStep",
226-
name=self.ocrd_tool['steps'][0],
227-
value=TOOL,
228-
Labels=[LabelsType(
229-
externalModel="ocrd-tool",
230-
externalId="parameters",
231-
Label=[LabelType(type_=name,
232-
value=self.parameter[name])
233-
for name in self.parameter.keys()])]))
234-
235221
# TODO: also allow grayscale_normalized (try/except?)
236222
page_image, page_coords, page_image_info = self.workspace.image_from_page(
237223
page, page_id, feature_selector='binarized')

0 commit comments

Comments
 (0)