Skip to content

Commit 26bd7dd

Browse files
✨ Add text extraction functionality from S3 files and save to database
1 parent 8fb03d6 commit 26bd7dd

File tree

1 file changed

+43
-0
lines changed

1 file changed

+43
-0
lines changed

backend/app/core/extractors.py

Lines changed: 43 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,43 @@
1+
import os
2+
import tempfile
3+
from app.core.db import engine
4+
from sqlmodel import Session
5+
import textract
6+
import requests
7+
from app.models import Document
8+
9+
10+
def extract_text_from_file(s3_url: str, timeout: float = 30.0) -> str:
    """Download the file at ``s3_url`` and return the text extracted from it.

    The response body is written to a temporary file, that file's path is
    passed to ``textract.process``, and the bytes it returns are decoded as
    UTF-8. The temporary file is removed before returning, whether or not
    extraction succeeded.

    Args:
        s3_url: URL fetched with an HTTP GET.
        timeout: Value passed to ``requests.get`` as ``timeout`` so a
            stalled server cannot block the caller forever.

    Returns:
        The extracted text.

    Raises:
        Exception: If the download, the extraction or the UTF-8 decoding
            fails. The original error is attached as ``__cause__``.
    """
    tmp_path = None
    try:
        response = requests.get(s3_url, timeout=timeout)
        response.raise_for_status()

        # delete=False: the file is closed when the ``with`` block exits and
        # is then read again by path, so it must outlive the handle.
        # NOTE(review): the suffix is hard-coded to ".pdf" regardless of what
        # the URL points at; textract presumably picks its parser from the
        # extension -- confirm whether non-PDF documents must be supported.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
            # Record the path before writing so a failed write is cleaned up.
            tmp_path = tmp_file.name
            tmp_file.write(response.content)

        return textract.process(tmp_path).decode("utf-8")

    except Exception as e:
        raise Exception(f"Failed to extract text: {e}") from e

    finally:
        if tmp_path is not None:
            try:
                os.remove(tmp_path)
            except OSError:
                # A failed cleanup must not replace the real result or error.
                pass
27+
28+
def extract_text_and_save_to_db(s3_url: str, document_id: str) -> None:
    """Extract text from the file at ``s3_url`` and store it on a document.

    The text is obtained from ``extract_text_from_file``; the ``Document``
    whose ``id`` equals ``document_id`` is then loaded, its
    ``extracted_text`` attribute is set, and the change is committed.

    This function is best-effort: it never raises. Any failure (download,
    extraction, missing document, database error) is logged and the
    function returns ``None``.

    Args:
        s3_url: URL of the file to extract text from.
        document_id: Identifier of the ``Document`` row to update.
    """
    import logging

    logger = logging.getLogger(__name__)

    try:
        # Do the download/extraction first; the session is only needed for
        # the short lookup-and-write that follows.
        text = extract_text_from_file(s3_url)

        with Session(engine) as session:
            document = (
                session.query(Document)
                .filter(Document.id == document_id)
                .first()
            )
            if document is None:
                logger.error("Document with ID %s not found", document_id)
                return

            document.extracted_text = text
            session.add(document)
            session.commit()

    except Exception:
        # logger.exception records the traceback, which print() discarded.
        logger.exception(
            "Failed to extract and save text for document %s", document_id
        )
43+

0 commit comments

Comments
 (0)