diff --git a/extractor_info.json b/extractor_info.json index d845216..3454c30 100644 --- a/extractor_info.json +++ b/extractor_info.json @@ -37,5 +37,39 @@ "bibtex": [], "labels": [ "Type/Image" - ] + ], + "parameters": { + "schema": { + "no-columns": { + "type": "string", + "title": "PSM config to enable multiple columns extractions", + "enum": [ + "3", + "6" + ], + "default": "6" + }, + "min-line-length": { + "type": "integer", + "title": "Minimum required length of line", + "default": 10 + } + }, + "form": [ + { + "key": "no-columns", + "type": "select", + "notitle": true, + "titleMap": { + "3": "Multi-columns", + "6": "Single column " + } + }, + { + "key": "min-line-length", + "inlinetitle": "Minimum required length of line", + "notitle": true + } + ] + } } diff --git a/ocr.py b/ocr.py index 6b5d32d..bcfdf14 100755 --- a/ocr.py +++ b/ocr.py @@ -17,39 +17,38 @@ def __init__(self): logging.getLogger('pyclowder').setLevel(logging.DEBUG) logging.getLogger('__main__').setLevel(logging.DEBUG) - def ocr(self, filename, tmpfilename): + def ocr(self, filename, tmpfilename, noColumns, minLineLength): text = "" tmpfile = None try: - subprocess.check_call(["tesseract", filename, tmpfilename]) + subprocess.check_call(["tesseract", "--psm", noColumns, filename, tmpfilename]) tmpfile = "./" + tmpfilename + ".txt" with open(tmpfile, 'r') as f: text = f.read() finally: if tmpfile is not None and os.path.isfile(tmpfile): os.remove(tmpfile) - return self.clean_text(text) + return self.clean_text(text, minLineLength) - def clean_text(self, text): + def clean_text(self, text, minLineLength): t = "" - words = text.split() - for word in words: - w = self.clean_word(word) - if w != "": - t += w + " " + lines = text.splitlines() + for line in lines: + if line != "" and len(line) >= int(minLineLength): + if len(t) == 0: + t = line + else: + t += "\n" + line return t - def clean_word(self, word): - cw = word.strip('(){}[].,') - if cw.isalnum() and len(cw) >= 2: - return cw - else: - return "" - def process_message(self, connector, host, secret_key, resource, parameters): inputfile = resource["local_paths"][0] - ocrtext = self.ocr(inputfile, str(uuid.uuid4())).strip() + # get the parameters + noColumns = parameters['parameters']["no-columns"] + minLineLength = parameters['parameters']["min-line-length"] + + ocrtext = self.ocr(inputfile, str(uuid.uuid4()), str(noColumns), minLineLength).strip() if not ocrtext: ocrtext = 'No text detected'