Merge pull request #2 from renan-siqueira/feature/MainFunctionalities

renan-siqueira · web-flow · commit ab82ef2f3330 · 2023-11-17T20:29:34.000-03:00
Project Functionalities
diff --git a/extractors/__init__.py b/extractors/__init__.py
diff --git a/extractors/pdfminer_extractor.py b/extractors/pdfminer_extractor.py
@@ -0,0 +1,16 @@
+from pdfminer.high_level import extract_text as _pdfminer_extract_text
+
+
+def extract_text(file_path):
+    """Extract the text of a PDF using pdfminer.
+
+    Args:
+        file_path: Path of the PDF file to read.
+
+    Returns:
+        The text returned by ``pdfminer.high_level.extract_text``.
+    """
+    # The library function is imported under an alias: imported as plain
+    # ``extract_text`` it is shadowed by this wrapper, and the call below
+    # would recurse into itself until RecursionError.
+    return _pdfminer_extract_text(file_path)
diff --git a/extractors/pdfplumber_extractor.py b/extractors/pdfplumber_extractor.py
@@ -0,0 +1,14 @@
+import pdfplumber
+
+
+def extract_text(file_path):
+    """Return the text of every page of a PDF, concatenated, via pdfplumber.
+
+    A page whose ``extract_text()`` result is falsy (for example ``None``)
+    contributes an empty string. Page texts are joined with no separator.
+    """
+    page_texts = []
+    with pdfplumber.open(file_path) as pdf:
+        for page in pdf.pages:
+            page_texts.append(page.extract_text() or '')
+    return ''.join(page_texts)
diff --git a/extractors/pymupdf_extractor.py b/extractors/pymupdf_extractor.py
@@ -0,0 +1,12 @@
+import fitz as PyMuPDF
+
+
+def extract_text(file_path):
+    """Extract the text of a PDF with PyMuPDF.
+
+    Opens ``file_path`` and returns the ``get_text()`` output of every page
+    joined together with no separator.
+    """
+    with PyMuPDF.open(file_path) as doc:
+        chunks = [page.get_text() for page in doc]
+    return ''.join(chunks)
diff --git a/extractors/pypdf2_extractor.py b/extractors/pypdf2_extractor.py
@@ -0,0 +1,14 @@
+import PyPDF2
+
+
+def extract_text(file_path):
+    """Extract the text of a PDF with PyPDF2.
+
+    The file is opened in binary mode and read page by page; a page whose
+    ``extract_text()`` result is falsy contributes an empty string. The
+    page texts are joined with no separator.
+    """
+    with open(file_path, 'rb') as pdf_stream:
+        pdf_reader = PyPDF2.PdfReader(pdf_stream)
+        pieces = [page.extract_text() or '' for page in pdf_reader.pages]
+    return ''.join(pieces)
diff --git a/helpers/__init__.py b/helpers/__init__.py
diff --git a/helpers/utils.py b/helpers/utils.py
@@ -0,0 +1,9 @@
+def save_text_to_file(text, file_path):
+    """Write ``text`` to ``file_path`` as UTF-8, replacing existing content.
+
+    Args:
+        text: String to write.
+        file_path: Destination path, opened in ``'w'`` mode.
+    """
+    with open(file_path, mode='w', encoding='utf-8') as out_handle:
+        out_handle.write(text)
diff --git a/json/config.json b/json/config.json
@@ -0,0 +1,6 @@
+{
+    "input_path": "/path/to/pdf/or/directory",
+    "output_path": "/path/to/output/directory",
+    "libraries": ["pypdf2", "pdfminer"],
+    "log_level": "INFO"
+}
diff --git a/main.py b/main.py
@@ -0,0 +1,117 @@
+import json
+import os
+import logging
+from extractors import pypdf2_extractor, pdfminer_extractor, pymupdf_extractor, pdfplumber_extractor
+from helpers import utils
+
+
+def get_extractor(library_name):
+    """Return the ``extract_text`` function registered for ``library_name``.
+
+    Args:
+        library_name: One of ``'pypdf2'``, ``'pdfminer'``, ``'pymupdf'`` or
+            ``'pdfplumber'``.
+
+    Returns:
+        The matching extractor callable, or ``None`` for an unknown name.
+    """
+    return {
+        'pypdf2': pypdf2_extractor.extract_text,
+        'pdfminer': pdfminer_extractor.extract_text,
+        'pymupdf': pymupdf_extractor.extract_text,
+        'pdfplumber': pdfplumber_extractor.extract_text,
+    }.get(library_name)
+
+
+def process_file(file_path, libraries, output_dir):
+    """Extract one PDF with each requested library and save the results.
+
+    Each library's text is written to
+    ``<output_dir>/<pdf name without extension>_<library>.txt``. A failure
+    in one library is logged and does not stop the remaining ones.
+
+    Args:
+        file_path: Path of the PDF to process.
+        libraries: Iterable of library names understood by ``get_extractor``.
+        output_dir: Existing directory that receives the ``.txt`` files.
+    """
+    # splitext drops only the final extension, so "report.v2.pdf" keeps the
+    # stem "report.v2"; splitting at the first dot would let differently
+    # named PDFs overwrite each other's output.
+    stem = os.path.splitext(os.path.basename(file_path))[0]
+    for library in libraries:
+        extractor = get_extractor(library)
+        if extractor is None:
+            # Surface typos in the configured library list.
+            logging.warning("Unknown extraction library %r; skipping", library)
+            continue
+        try:
+            extracted_text = extractor(file_path)
+            output_file = os.path.join(output_dir, f"{stem}_{library}.txt")
+            utils.save_text_to_file(extracted_text, output_file)
+        except Exception as e:
+            # Best effort: keep going with the other libraries and files.
+            logging.error("Error processing %s with %s: %s", file_path, library, e)
+
+
+def create_output_dir(base_dir, input_path, is_single_file):
+    """Create (if needed) and return the output directory for ``input_path``.
+
+    Args:
+        base_dir: Directory under which the output directory is created.
+        input_path: The PDF file or directory being processed.
+        is_single_file: True when ``input_path`` is a single PDF; its final
+            extension is then dropped from the directory name.
+
+    Returns:
+        The path of the output directory.
+    """
+    # normpath strips a trailing separator; without it basename("docs/") is
+    # "" and the results would land directly in base_dir.
+    name = os.path.basename(os.path.normpath(input_path))
+    if is_single_file:
+        name = os.path.splitext(name)[0]
+    output_dir = os.path.join(base_dir, name)
+    os.makedirs(output_dir, exist_ok=True)
+    return output_dir
+
+
+def main():
+    """Run the extraction described by ``json/config.json``.
+
+    Reads ``input_path``, ``libraries`` and the optional ``output_path`` and
+    ``log_level`` keys, then processes either the single PDF or every PDF
+    directly inside the input directory.
+    """
+    with open('json/config.json', encoding='utf-8') as config_file:
+        config = json.load(config_file)
+
+    logging.basicConfig(level=config.get("log_level", "INFO"))
+
+    input_path = config["input_path"]
+    libraries = config["libraries"]
+    base_output_dir = config.get("output_path", "./output")
+
+    # Match the extension case-insensitively so "SCAN.PDF" is accepted too.
+    is_single_file = os.path.isfile(input_path) and input_path.lower().endswith('.pdf')
+
+    # Validate before creating anything, so a bad path leaves no stray
+    # output directory behind.
+    if not is_single_file and not os.path.isdir(input_path):
+        logging.error("Invalid input path")
+        return
+
+    output_dir = create_output_dir(base_output_dir, input_path, is_single_file)
+
+    if is_single_file:
+        process_file(input_path, libraries, output_dir)
+        return
+
+    # Sorted for a deterministic processing order (os.listdir order is arbitrary).
+    for filename in sorted(os.listdir(input_path)):
+        if filename.lower().endswith('.pdf'):
+            process_file(os.path.join(input_path, filename), libraries, output_dir)
+
+
+if __name__ == '__main__':
+    main()

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+def save_text_to_file(text, file_path):`
	`2`	`+    with open(file_path, 'w', encoding='utf-8') as file:`
	`3`	`+        file.write(text)`