processors: use make_file_id and assert_file_grp_cardinality

kba · finkf · commit d5f93251e4d1 · 2020-08-20T09:55:24.000+02:00
diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py
@@ -9,7 +9,8 @@
 
 from ocrd_utils import (
     getLogger,
-    concat_padded,
+    make_file_id,
+    assert_file_grp_cardinality,
     MIMETYPE_PAGE
 )
 from ocrd_modelfactory import page_from_file
@@ -104,15 +105,12 @@ def process(self):
         Produce a new output file by serialising the resulting hierarchy.
         """
         level = self.parameter['level-of-operation']
-        assert len(self.output_file_grp.split(',')) == 1, \
-            "Expected exactly one output file group, but '%s' has %d" % (
-                self.output_file_grp, len(self.output_file_grp.split(',')))
+        assert_file_grp_cardinality(self.input_file_grp, 1)
+        assert_file_grp_cardinality(self.output_file_grp, 1)
 
         for (n, input_file) in enumerate(self.input_files):
             LOG.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID)
-            file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
-            if file_id == input_file.ID:
-                file_id = concat_padded(self.output_file_grp, n)
+            file_id = make_file_id(input_file, self.output_file_grp)
 
             pcgts = page_from_file(self.workspace.download_file(input_file))
             page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id)
@@ -160,9 +158,6 @@ def process(self):
                                           file_id + '_' + region.id + '_' + line.id)
 
             # update METS (add the PAGE file):
-            file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
-            if file_id == input_file.ID:
-                file_id = concat_padded(self.output_file_grp, n)
             file_path = os.path.join(self.output_file_grp, file_id + '.xml')
             out = self.workspace.add_file(
                 ID=file_id,
diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py
@@ -16,7 +16,8 @@
 from ocrd import Processor
 from ocrd_utils import (
     getLogger,
-    concat_padded,
+    make_file_id,
+    assert_file_grp_cardinality,
     coordinates_of_segment,
     polygon_from_points,
     bbox_from_polygon,
@@ -80,15 +81,12 @@ def process(self):
         # deskewing, because that would make segments incomensurable with their
         # neighbours.
         level = self.parameter['level-of-operation']
-        assert len(self.output_file_grp.split(',')) == 1, \
-            "Expected exactly one output file group, but '%s' has %d" % (
-                self.output_file_grp, len(self.output_file_grp.split(',')))
+        assert_file_grp_cardinality(self.input_file_grp, 1)
+        assert_file_grp_cardinality(self.output_file_grp, 1)
 
         for (n, input_file) in enumerate(self.input_files):
             LOG.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID)
-            file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
-            if file_id == input_file.ID:
-                file_id = concat_padded(self.output_file_grp, n)
+            file_id = make_file_id(input_file, self.output_file_grp)
 
             pcgts = page_from_file(self.workspace.download_file(input_file))
             page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id)
@@ -213,9 +211,6 @@ def process(self):
                                              input_file.pageId, file_id + '_' + region.id + '_' + line.id)
 
             # update METS (add the PAGE file):
-            file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
-            if file_id == input_file.ID:
-                file_id = concat_padded(self.output_file_grp, n)
             file_path = os.path.join(self.output_file_grp, file_id + '.xml')
             out = self.workspace.add_file(
                 ID=file_id,
diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py
@@ -4,7 +4,8 @@
 
 from ocrd_utils import (
     getLogger,
-    concat_padded,
+    make_file_id,
+    assert_file_grp_cardinality,
     MIMETYPE_PAGE
 )
 from ocrd_modelfactory import page_from_file
@@ -53,15 +54,12 @@ def process(self):
         Produce a new output file by serialising the resulting hierarchy.
         """
         level = self.parameter['level-of-operation']
-        assert len(self.output_file_grp.split(',')) == 1, \
-            "Expected exactly one output file group, but '%s' has %d" % (
-                self.output_file_grp, len(self.output_file_grp.split(',')))
+        assert_file_grp_cardinality(self.input_file_grp, 1)
+        assert_file_grp_cardinality(self.output_file_grp, 1)
 
         for (n, input_file) in enumerate(self.input_files):
             LOG.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID)
-            file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
-            if file_id == input_file.ID:
-                file_id = concat_padded(self.output_file_grp, n)
+            file_id = make_file_id(input_file, self.output_file_grp)
 
             pcgts = page_from_file(self.workspace.download_file(input_file))
             page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id)
@@ -122,9 +120,6 @@ def process(self):
                                              file_id + '_' + region.id + '_' + line.id)
 
             # update METS (add the PAGE file):
-            file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
-            if file_id == input_file.ID:
-                file_id = concat_padded(self.output_file_grp, n)
             file_path = os.path.join(self.output_file_grp, file_id + '.xml')
             out = self.workspace.add_file(
                 ID=file_id,
diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py
@@ -3,7 +3,9 @@
 import os.path
 
 from ocrd_utils import (
-    getLogger, concat_padded,
+    getLogger,
+    make_file_id,
+    assert_file_grp_cardinality,
     rotate_image,
     MIMETYPE_PAGE
 )
@@ -58,15 +60,12 @@ def process(self):
         Produce a new output file by serialising the resulting hierarchy.
         """
         level = self.parameter['level-of-operation']
-        assert len(self.output_file_grp.split(',')) == 1, \
-            "Expected exactly one output file group, but '%s' has %d" % (
-                self.output_file_grp, len(self.output_file_grp.split(',')))
+        assert_file_grp_cardinality(self.input_file_grp, 1)
+        assert_file_grp_cardinality(self.output_file_grp, 1)
 
         for (n, input_file) in enumerate(self.input_files):
             LOG.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID)
-            file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
-            if file_id == input_file.ID:
-                file_id = concat_padded(self.output_file_grp, n)
+            file_id = make_file_id(input_file, self.output_file_grp)
 
             pcgts = page_from_file(self.workspace.download_file(input_file))
             page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id)
@@ -112,9 +111,6 @@ def process(self):
                                           file_id + '_' + region.id)
 
             # update METS (add the PAGE file):
-            file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
-            if file_id == input_file.ID:
-                file_id = concat_padded(self.output_file_grp, n)
             file_path = os.path.join(self.output_file_grp, file_id + '.xml')
             out = self.workspace.add_file(
                 ID=file_id,
diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py
@@ -3,7 +3,11 @@
 import os.path
 import numpy as np
 
-from ocrd_utils import getLogger, concat_padded
+from ocrd_utils import (
+    getLogger,
+    make_file_id,
+    assert_file_grp_cardinality,
+)
 from ocrd_modelfactory import page_from_file
 from ocrd_models.ocrd_page import (
     MetadataItemType,
@@ -96,15 +100,12 @@ def process(self):
 
         Produce a new output file by serialising the resulting hierarchy.
         """
-        assert len(self.output_file_grp.split(',')) == 1, \
-            "Expected exactly one output file group, but '%s' has %d" % (
-                self.output_file_grp, len(self.output_file_grp.split(',')))
+        assert_file_grp_cardinality(self.input_file_grp, 1)
+        assert_file_grp_cardinality(self.output_file_grp, 1)
 
         for (n, input_file) in enumerate(self.input_files):
             LOG.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID)
-            file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
-            if file_id == input_file.ID:
-                file_id = concat_padded(self.output_file_grp, n)
+            file_id = make_file_id(input_file, self.output_file_grp)
 
             pcgts = page_from_file(self.workspace.download_file(input_file))
             page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id)
@@ -178,9 +179,6 @@ def process(self):
                         comments=line_xywh['features'] + ',dewarped'))
 
             # update METS (add the PAGE file):
-            file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
-            if file_id == input_file.ID:
-                file_id = concat_padded(self.output_file_grp, n)
             file_path = os.path.join(self.output_file_grp, file_id + '.xml')
             out = self.workspace.add_file(
                 ID=file_id,
diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py
@@ -7,7 +7,9 @@
 import Levenshtein
 
 from ocrd_utils import (
-    getLogger, concat_padded,
+    getLogger,
+    make_file_id,
+    assert_file_grp_cardinality,
     coordinates_for_segment,
     polygon_from_bbox,
     points_from_polygon,
@@ -130,6 +132,10 @@ def process(self):
 
         Produce a new output file by serialising the resulting hierarchy.
         """
+
+        assert_file_grp_cardinality(self.input_file_grp, 1)
+        assert_file_grp_cardinality(self.output_file_grp, 1)
+
         # from ocropus-rpred:
         self.network = load_object(self.get_model(), verbose=1)
         for x in self.network.walk():
@@ -171,12 +177,8 @@ def process(self):
             self.process_regions(regions, maxlevel, page_image, page_coords)
 
             # update METS (add the PAGE file):
-            file_id = input_file.ID.replace(self.input_file_grp,
-                                            self.output_file_grp)
-            if file_id == input_file.ID:
-                file_id = concat_padded(self.output_file_grp, n)
-            file_path = os.path.join(self.output_file_grp,
-                                     file_id + '.xml')
+            file_id = make_file_id(input_file, self.output_file_grp)
+            file_path = os.path.join(self.output_file_grp, file_id + '.xml')
             out = self.workspace.add_file(
                 ID=file_id,
                 file_grp=self.output_file_grp,
diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py
@@ -16,7 +16,8 @@
 from ocrd import Processor
 from ocrd_utils import (
     getLogger,
-    concat_padded,
+    make_file_id,
+    assert_file_grp_cardinality,
     coordinates_of_segment,
     coordinates_for_segment,
     bbox_from_polygon,
@@ -153,15 +154,12 @@ def process(self):
         # pixel density (at least if source input is not 300 DPI).
         threshold = self.parameter['min_fraction']
         margin = self.parameter['extend_margins']
-        assert len(self.output_file_grp.split(',')) == 1, \
-            "Expected exactly one output file group, but '%s' has %d" % (
-                self.output_file_grp, len(self.output_file_grp.split(',')))
+        assert_file_grp_cardinality(self.input_file_grp, 1)
+        assert_file_grp_cardinality(self.output_file_grp, 1)
 
         for (n, input_file) in enumerate(self.input_files):
             LOG.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID)
-            file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
-            if file_id == input_file.ID:
-                file_id = concat_padded(self.output_file_grp, n)
+            file_id = make_file_id(input_file, self.output_file_grp)
 
             pcgts = page_from_file(self.workspace.download_file(input_file))
             page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id)
@@ -277,9 +275,6 @@ def process(self):
                         comments=region_xywh['features']))
 
             # update METS (add the PAGE file):
-            file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
-            if file_id == input_file.ID:
-                file_id = concat_padded(self.output_file_grp, n)
             file_path = os.path.join(self.output_file_grp, file_id + '.xml')
             out = self.workspace.add_file(
                 ID=file_id,
diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py
@@ -33,7 +33,8 @@
 from ocrd import Processor
 from ocrd_utils import (
     getLogger,
-    concat_padded,
+    make_file_id,
+    assert_file_grp_cardinality,
     coordinates_of_segment,
     coordinates_for_segment,
     points_from_polygon,
@@ -204,16 +205,13 @@ def process(self):
         overwrite_separators = self.parameter['overwrite_separators']
         overwrite_order = self.parameter['overwrite_order']
         oplevel = self.parameter['level-of-operation']
-        assert len(self.output_file_grp.split(',')) == 1, \
-            "Expected exactly one output file group, but '%s' has %d" % (
-                self.output_file_grp, len(self.output_file_grp.split(',')))
+
+        assert_file_grp_cardinality(self.input_file_grp, 1)
+        assert_file_grp_cardinality(self.output_file_grp, 1)
 
         for (n, input_file) in enumerate(self.input_files):
             LOG.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID)
-            file_id = input_file.ID.replace(self.input_file_grp,
-                                            self.output_file_grp)
-            if file_id == input_file.ID:
-                file_id = concat_padded(self.output_file_grp, n)
+            file_id = make_file_id(input_file, self.output_file_grp)
 
             pcgts = page_from_file(self.workspace.download_file(input_file))
             page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id)