Skip to content

Commit ab82ef2

Browse files
Merge pull request #2 from renan-siqueira/feature/MainFunctionalities
Project Functionalities
2 parents 9aa97b1 + 8cda153 commit ab82ef2

File tree

9 files changed

+101
-0
lines changed

9 files changed

+101
-0
lines changed

extractors/__init__.py

Whitespace-only changes.

extractors/pdfminer_extractor.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
from pdfminer.high_level import extract_text
2+
3+
4+
def extract_text(file_path):
    """Extract the text of a PDF using pdfminer.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        Whatever ``pdfminer.high_level.extract_text`` returns for the file.
    """
    # This wrapper shares its name with pdfminer's own ``extract_text``. A
    # module-level ``from pdfminer.high_level import extract_text`` is
    # therefore shadowed by this definition, and calling ``extract_text``
    # here would recurse into this function until RecursionError. Importing
    # under an alias inside the function guarantees the library is called.
    from pdfminer.high_level import extract_text as _pdfminer_extract_text

    return _pdfminer_extract_text(file_path)

extractors/pdfplumber_extractor.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
import pdfplumber
2+
3+
4+
def extract_text(file_path):
    """Extract the text of a PDF using pdfplumber.

    Args:
        file_path: Path of the PDF file to open.

    Returns:
        The text of every page concatenated in page order with no separator.
        A page whose extraction yields a falsy value contributes an empty
        string.
    """
    with pdfplumber.open(file_path) as document:
        # Collect per-page text while the document is still open.
        page_texts = [page.extract_text() or '' for page in document.pages]
    return ''.join(page_texts)

extractors/pymupdf_extractor.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
import fitz as PyMuPDF
2+
3+
4+
def extract_text(file_path):
    """Extract the text of a PDF using PyMuPDF (imported as ``fitz``).

    Args:
        file_path: Path of the PDF file to open.

    Returns:
        The ``get_text()`` output of every page concatenated in page order
        with no separator.
    """
    with PyMuPDF.open(file_path) as document:
        # Collect per-page text while the document is still open.
        page_texts = [page.get_text() for page in document]
    return ''.join(page_texts)

extractors/pypdf2_extractor.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
import PyPDF2
2+
3+
4+
def extract_text(file_path):
    """Extract the text of a PDF using PyPDF2.

    Args:
        file_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        The text of every page concatenated in page order with no separator.
        A page whose extraction yields a falsy value contributes an empty
        string.
    """
    with open(file_path, 'rb') as pdf_stream:
        pdf_reader = PyPDF2.PdfReader(pdf_stream)
        # Extract while the underlying stream is still open.
        page_texts = [page.extract_text() or '' for page in pdf_reader.pages]
    return ''.join(page_texts)

helpers/__init__.py

Whitespace-only changes.

helpers/utils.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
def save_text_to_file(text, file_path):
    """Write ``text`` to ``file_path`` as UTF-8.

    The file is opened in write mode, so existing content is replaced.

    Args:
        text: String to write.
        file_path: Destination path.
    """
    with open(file_path, mode='w', encoding='utf-8') as output_handle:
        output_handle.write(text)

json/config.json

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
{
2+
"input_path": "/path/to/pdf/or/directory",
3+
"output_path": "/path/to/output/directory",
4+
"libraries": ["pypdf2", "pdfminer"],
5+
"log_level": "INFO"
6+
}

main.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
import json
2+
import os
3+
import logging
4+
from extractors import pypdf2_extractor, pdfminer_extractor, pymupdf_extractor, pdfplumber_extractor
5+
from helpers import utils
6+
7+
8+
def get_extractor(library_name):
    """Look up the extraction function registered for a library name.

    Args:
        library_name: One of ``'pypdf2'``, ``'pdfminer'``, ``'pymupdf'`` or
            ``'pdfplumber'``.

    Returns:
        The matching module's ``extract_text`` function, or ``None`` when the
        name is not one of the keys above.
    """
    extractors_by_name = {
        'pypdf2': pypdf2_extractor.extract_text,
        'pdfminer': pdfminer_extractor.extract_text,
        'pymupdf': pymupdf_extractor.extract_text,
        'pdfplumber': pdfplumber_extractor.extract_text,
    }
    return extractors_by_name.get(library_name)
15+
16+
17+
def process_file(file_path, libraries, output_dir):
    """Extract text from one PDF with each requested library and save it.

    For every name in ``libraries`` the extractor is looked up with
    ``get_extractor`` and its output is written to
    ``<output_dir>/<file name without extension>_<library>.txt``.

    Args:
        file_path: Path of the PDF to process.
        libraries: Iterable of extractor names.
        output_dir: Directory that receives the generated ``.txt`` files.

    Extraction or write failures are logged and do not stop the remaining
    libraries from being tried.
    """
    # splitext strips only the final extension, so "report.v2.pdf" keeps the
    # stem "report.v2". Splitting on the first dot would map "report.v1.pdf"
    # and "report.v2.pdf" to the same output file and overwrite one of them.
    stem = os.path.splitext(os.path.basename(file_path))[0]

    for library in libraries:
        extractor = get_extractor(library)
        if extractor is None:
            # Make a misspelled or unsupported name visible instead of
            # silently producing no output for it.
            logging.warning("Unknown extraction library '%s'; skipping", library)
            continue
        try:
            extracted_text = extractor(file_path)
            output_file = os.path.join(output_dir, f"{stem}_{library}.txt")
            utils.save_text_to_file(extracted_text, output_file)
        except Exception as e:
            # Best-effort by design: one failing library must not abort the run.
            logging.error(f"Error processing {file_path} with {library}: {e}")
27+
28+
29+
def create_output_dir(base_dir, input_path, is_single_file):
    """Create and return the output directory for an input path.

    The directory is ``base_dir`` joined with the last component of
    ``input_path``; for a single file the final extension is removed first.

    Args:
        base_dir: Root directory under which the output directory is created.
        input_path: Path of the input PDF file or input directory.
        is_single_file: True when ``input_path`` refers to a single file.

    Returns:
        The path of the output directory, which exists after the call.
    """
    # normpath drops a trailing separator; without it basename("/data/pdfs/")
    # is "" and the output would land directly in base_dir.
    name = os.path.basename(os.path.normpath(input_path))
    if is_single_file:
        # Remove only the last extension so "report.v2.pdf" -> "report.v2".
        name = os.path.splitext(name)[0]

    output_dir = os.path.join(base_dir, name)
    os.makedirs(output_dir, exist_ok=True)
    return output_dir
36+
37+
38+
def main():
    """Run the extraction described by ``json/config.json``.

    Reads ``input_path``, ``libraries``, and the optional ``output_path``
    (default ``./output``) and ``log_level`` (default ``INFO``) settings.
    ``input_path`` may be a single PDF file or a directory; in a directory,
    every entry whose name ends in ``.pdf`` is processed. Any other input is
    reported as an error and nothing is created.

    Raises:
        KeyError: If ``input_path`` or ``libraries`` is missing from the config.
    """
    with open('json/config.json', encoding='utf-8') as config_file:
        config = json.load(config_file)

    logging.basicConfig(level=config.get("log_level", "INFO"))

    input_path = config["input_path"]
    libraries = config["libraries"]
    base_output_dir = config.get("output_path", "./output")

    # Compare the extension case-insensitively so "REPORT.PDF" is accepted.
    is_single_file = os.path.isfile(input_path) and input_path.lower().endswith('.pdf')
    is_directory = os.path.isdir(input_path)

    # Validate before touching the filesystem so an invalid input does not
    # leave an empty output directory behind.
    if not (is_single_file or is_directory):
        logging.error("Invalid input path")
        return

    output_dir = create_output_dir(base_output_dir, input_path, is_single_file)

    if is_single_file:
        process_file(input_path, libraries, output_dir)
        return

    for filename in os.listdir(input_path):
        if filename.lower().endswith('.pdf'):
            process_file(os.path.join(input_path, filename), libraries, output_dir)


if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)