Skip to content

Commit 410e6b9

Browse files
refactor extractors to use S3 text extraction function and update dependencies
1 parent 400591f commit 410e6b9

File tree

4 files changed

+48
-24
lines changed

4 files changed

+48
-24
lines changed

backend/app/core/extractors.py

Lines changed: 2 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -5,30 +5,12 @@
55
import textract
66
import requests
77
from app.models import Document
8+
from app.s3 import extract_text_from_s3_file
89

9-
10-
def extract_text_from_file(s3_url: str) -> str:
11-
try:
12-
response = requests.get(s3_url)
13-
response.raise_for_status()
14-
15-
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
16-
tmp_file.write(response.content)
17-
tmp_path = tmp_file.name
18-
19-
text = textract.process(tmp_path).decode("utf-8") or ""
20-
21-
os.remove(tmp_path)
22-
23-
return text
24-
25-
except Exception as e:
26-
raise Exception(f"Failed to extract text: {e}")
27-
2810
def extract_text_and_save_to_db(s3_url: str, document_id: str) -> None:
2911
try:
3012
with Session(engine) as session:
31-
text = extract_text_from_file(s3_url)
13+
text = extract_text_from_s3_file(s3_url)
3214

3315
document_query = select(Document).where(Document.id == document_id)
3416
document = session.exec(document_query).first()

backend/app/s3.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
1+
import os
2+
import tempfile
13
import uuid
24

35
import boto3
46
from fastapi import UploadFile
57

68
from app.core.config import settings
9+
import textract
710

811
s3 = boto3.client(
912
"s3",
@@ -29,3 +32,12 @@ def upload_file_to_s3(file: UploadFile, user_id: str) -> str:
2932

3033
def generate_s3_url(key: str) -> str:
    """Return the virtual-hosted-style HTTPS URL for ``key``.

    The bucket name is read from ``settings.S3_BUCKET_NAME``; ``key`` is
    appended verbatim (no URL-encoding is applied).
    """
    bucket_host = f"{settings.S3_BUCKET_NAME}.s3.amazonaws.com"
    return f"https://{bucket_host}/{key}"
35+
36+
def _split_s3_url(s3_url: str) -> "tuple[str, str]":
    """Split a ``https://<bucket>.s3.<...>/<key>`` URL into ``(bucket, key)``.

    This is the inverse of the URL shape built by ``generate_s3_url``. The key
    is returned verbatim (no URL-decoding), mirroring how that URL is built.

    Raises:
        ValueError: If the URL has no scheme, no ``.s3.`` host marker, or no
            object key.
    """
    _, _, remainder = s3_url.partition("://")
    host, _, key = remainder.partition("/")
    bucket, marker, _ = host.partition(".s3.")
    if not (bucket and marker and key):
        raise ValueError(f"Not a recognisable S3 object URL: {s3_url!r}")
    return bucket, key


def extract_text_from_s3_file(bucket: str, key: "str | None" = None) -> str:
    """Download an S3 object to a temporary file and return its extracted text.

    Args:
        bucket: Name of the S3 bucket. When ``key`` is omitted, this argument
            is instead treated as a full object URL of the form
            ``https://<bucket>.s3.amazonaws.com/<key>`` and split into bucket
            and key, so callers that only hold the stored URL can pass it
            directly.
        key: Object key inside ``bucket``; ``None`` selects the URL form above.

    Returns:
        The text produced by ``textract`` for the downloaded file, decoded as
        UTF-8.

    Raises:
        ValueError: If ``key`` is omitted and ``bucket`` is not a parsable
            S3 object URL.
        Exception: Any error raised by the S3 download or by ``textract`` is
            propagated unchanged.
    """
    if key is None:
        bucket, key = _split_s3_url(bucket)

    # delete=False because textract re-opens the file by path after we close
    # it; cleanup is therefore our responsibility (see ``finally`` below).
    # NOTE(review): the ".pdf" suffix is hard-coded; textract appears to pick
    # its parser from the file extension, so non-PDF objects may need the
    # key's own extension here — confirm against the accepted upload types.
    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    tmp_path = tmp_file.name
    try:
        # Close (and thereby flush) the file before textract reads it.
        with tmp_file:
            s3.download_fileobj(bucket, key, tmp_file)
        return textract.process(tmp_path).decode("utf-8")
    finally:
        # Always remove the temp file, including when the download or the
        # extraction fails, so failed requests do not leak files on disk.
        os.remove(tmp_path)

backend/pyproject.toml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,10 @@ dependencies = [
2121
"boto3",
2222
"pydantic-settings<3.0.0,>=2.2.1",
2323
"sentry-sdk[fastapi]<2.0.0,>=1.40.6",
24-
"pyjwt<3.0.0,>=2.8.0",
24+
"pyjwt<3.0.0,>=2.8.0",
2525
"textract>=1.6.5",
2626
"boto3-stubs>=1.40.2",
27-
"requests>=2.32.3",
27+
"textract>=1.6.5",
2828
]
2929

3030
[tool.uv]
@@ -35,6 +35,8 @@ dev-dependencies = [
3535
"pre-commit<4.0.0,>=3.6.2",
3636
"types-passlib<2.0.0.0,>=1.7.7.20240106",
3737
"coverage<8.0.0,>=7.4.3",
38+
"types-requests >=2.32.4.20250611",
39+
"types-boto3>=1.39.16"
3840
]
3941

4042
[build-system]

backend/uv.lock

Lines changed: 30 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)