clowder-framework · ddey2 · Jun 15, 2022
diff --git a/extractor_info.json b/extractor_info.json
@@ -37,5 +37,39 @@
   "bibtex": [],
   "labels": [
     "Type/Image"
-  ]
+  ],
+  "parameters": {
+    "schema": {
+      "no-columns": {
+        "type": "string",
+        "title": "Tesseract page segmentation mode (PSM) for single- or multi-column extraction",
+        "enum": [
+          "3",
+          "6"
+        ],
+        "default": "6"
+      },
+      "min-line-length": {
+        "type": "integer",
+        "title": "Minimum required length of line",
+        "default": 10
+      }
+    },
+    "form": [
+      {
+        "key": "no-columns",
+        "type": "select",
+        "notitle": true,
+        "titleMap": {
+          "3": "Multi-column",
+          "6": "Single column"
+        }
+      },
+      {
+        "key": "min-line-length",
+        "inlinetitle": "Minimum required length of line",
+        "notitle": true
+      }
+    ]
+  }
 }
diff --git a/ocr.py b/ocr.py
@@ -17,39 +17,38 @@ def __init__(self):
         logging.getLogger('pyclowder').setLevel(logging.DEBUG)
         logging.getLogger('__main__').setLevel(logging.DEBUG)
 
-    def ocr(self, filename, tmpfilename):
+    def ocr(self, filename, tmpfilename, noColumns, minLineLength):
         text = ""
         tmpfile = None
         try:
-            subprocess.check_call(["tesseract", filename, tmpfilename])
+            subprocess.check_call(["tesseract", "--psm", noColumns, filename, tmpfilename])
             tmpfile = "./" + tmpfilename + ".txt"
             with open(tmpfile, 'r') as f:
                 text = f.read()
         finally:
             if tmpfile is not None and os.path.isfile(tmpfile):
                 os.remove(tmpfile)
-            return self.clean_text(text)
+        return self.clean_text(text, minLineLength)  # after finally: a return inside it would swallow tesseract/IO errors
 
-    def clean_text(self, text):
+    def clean_text(self, text, minLineLength):
         t = ""
-        words = text.split()
-        for word in words:
-            w = self.clean_word(word)
-            if w != "":
-                t += w + " "
+        lines = text.splitlines()
+        for line in lines:
+            if line != "" and len(line) >= int(minLineLength):
+                if len(t) == 0:
+                    t = line
+                else:
+                    t += "\n" + line
         return t
 
-    def clean_word(self, word):
-        cw = word.strip('(){}[].,')
-        if cw.isalnum() and len(cw) >= 2:
-            return cw
-        else:
-            return ""
-
     def process_message(self, connector, host, secret_key, resource, parameters):
         inputfile = resource["local_paths"][0]
 
-        ocrtext = self.ocr(inputfile, str(uuid.uuid4())).strip()
+        # get the parameters
+        noColumns = parameters['parameters']["no-columns"]
+        minLineLength = parameters['parameters']["min-line-length"]
+
+        ocrtext = self.ocr(inputfile, str(uuid.uuid4()), str(noColumns), minLineLength).strip()
         if not ocrtext:
             ocrtext = 'No text detected'