Skip to content

Commit 0dc8f04

Browse files
author
Robert Sachunsky
committed
segment: add param 'overwrite_order'
1 parent fa40e7e commit 0dc8f04

File tree

3 files changed

+23
-9
lines changed

3 files changed

+23
-9
lines changed

ocrd_cis/ocrd-tool.json

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -365,6 +365,11 @@
365365
"default": 1.5,
366366
"description": "(when operating on the page/table level) smallest width in multiples of scale/capheight of a valley in the horizontal or vertical profiles (across the binarized image) to still be regarded as a gap during recursive X-Y cut from lines to regions; needs to be smaller when more foreground noise is present, increase to avoid mistaking inter-line as paragraph gaps and inter-word as inter-column gaps"
367367
},
368+
"overwrite_order": {
369+
"type": "boolean",
370+
"default": true,
371+
"description": "(when operating on the page/table level) remove any references to existing TextRegion elements within the top (page/table) reading order; otherwise append to it"
372+
},
368373
"overwrite_separators": {
369374
"type": "boolean",
370375
"default": true,

ocrd_cis/ocropy/common.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1300,7 +1300,7 @@ def finalize():
13001300
lpartitions[label2])
13011301
lpartitions[label2] = [0]
13021302
# re-label and re-order surviving partitions
1303-
#lpartitions = np.setdiff1d(np.unique(partitions), [0]) # without bg/sepm
1303+
lpartitions = np.setdiff1d(np.unique(partitions), [0]) # without bg/sepm
13041304
npartitions = len(lpartitions)
13051305
if debug: LOG.debug(' %d sepmask partitions after filtering and merging', npartitions)
13061306
if npartitions > 1:

ocrd_cis/ocropy/segment.py

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -144,14 +144,17 @@ def process(self):
144144
then iterate over the element hierarchy down to the requested level.
145145
146146
Depending on ``level-of-operation``, consider existing segments:
147-
- if ``overwrite_separators=True`` on ``page`` level, then
148-
delete any SeparatorRegions,
149-
- if ``overwrite_regions=True`` on ``page`` level, then
150-
delete any top-level TextRegions (along with ReadingOrder),
151-
- if ``overwrite_regions=True`` on ``table`` level, then
152-
delete any TextRegions in TableRegions (along with their OrderGroup),
153-
- if ``overwrite_lines=True`` on ``region`` level, then
147+
- If ``overwrite_separators=True`` on ``page`` level, then
148+
delete any SeparatorRegions.
149+
- If ``overwrite_regions=True`` on ``page`` level, then
150+
delete any top-level TextRegions (along with ReadingOrder).
151+
- If ``overwrite_regions=True`` on ``table`` level, then
152+
delete any TextRegions in TableRegions (along with their OrderGroup).
153+
- If ``overwrite_lines=True`` on ``region`` level, then
154154
delete any TextLines in TextRegions.
155+
- If ``overwrite_order=True`` on ``page`` or ``table`` level, then
156+
replace the reading order OrderedGroup entry corresponding
157+
to the (page/table) segment with a new, empty one.
155158
156159
Next, get each element image according to the layout annotation (from
157160
the alternative image of the page/region, or by cropping via coordinates
@@ -206,6 +209,7 @@ def process(self):
206209
overwrite_lines = self.parameter['overwrite_lines']
207210
overwrite_regions = self.parameter['overwrite_regions']
208211
overwrite_separators = self.parameter['overwrite_separators']
212+
overwrite_order = self.parameter['overwrite_order']
209213
oplevel = self.parameter['level-of-operation']
210214

211215
for (n, input_file) in enumerate(self.input_files):
@@ -289,7 +293,7 @@ def process(self):
289293
LOG.warning('keeping existing TextRegions in page "%s"', page_id)
290294
ignore.extend(regions)
291295
# create reading order if necessary
292-
if not ro:
296+
if not ro or overwrite_order:
293297
ro = ReadingOrderType()
294298
page.set_ReadingOrder(ro)
295299
rogroup = ro.get_OrderedGroup() or ro.get_UnorderedGroup()
@@ -330,6 +334,11 @@ def process(self):
330334
if not roelem:
331335
LOG.warning("Page '%s' table region '%s' is not referenced in reading order (%s)",
332336
page_id, region.id, "no target to add cells to")
337+
elif overwrite_order:
338+
# replace by empty ordered group with same (index and) ref
339+
# (which can then take the cells as subregions)
340+
roelem = page_subgroup_in_reading_order(roelem)
341+
reading_order[region.id] = roelem
333342
elif isinstance(roelem, (OrderedGroupType, OrderedGroupIndexedType)):
334343
LOG.warning("Page '%s' table region '%s' already has an ordered group (%s)",
335344
page_id, region.id, "cells will be appended")

0 commit comments

Comments
 (0)