Skip to content

Commit a20ace4

Browse files
✨ Add text extraction dependencies and integrate extraction in document creation
1 parent ebb8965 commit a20ace4

File tree

4 files changed

+357 -13 lines changed

4 files changed

+357 -13 lines changed

backend/app/api/routes/documents.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
from typing import Any
22
from app.api.deps import CurrentUser, SessionDep
3+
from app.core.extractors import extract_text_and_save_to_db
34
from app.models import Document, DocumentCreate, DocumentPublic
45
from fastapi import APIRouter, BackgroundTasks, File, UploadFile, HTTPException
56
from app.s3 import upload_file_to_s3, generate_s3_url
@@ -40,6 +41,5 @@ def create_document(
4041

4142
# 3. Kick off background job
4243
print("Document created, starting background task...")
43-
# background_tasks.add_task(generate_questions, document.id)
44-
44+
background_tasks.add_task(extract_text_and_save_to_db, url, document.id)
4545
return document

backend/app/tests/api/routes/test_documents.py

Lines changed: 2 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -6,9 +6,10 @@
66
from app.core.config import settings
77
from app.tests.utils.document import create_random_document
88
import io
9+
import time
910

1011

11-
def skip_test_create_document(
12+
def skip_test_create_document_real_s3(
1213
client: TestClient, superuser_token_headers: dict[str, str]
1314
) -> None:
1415
'''Test creating a document with a file upload with the real S3 service.'''
@@ -25,10 +26,6 @@ def skip_test_create_document(
2526
assert response.status_code == 200
2627
content = response.json()
2728
assert "id" in content, "actual response: " + str(content)
28-
# assert content["title"] == metadata["title"]
29-
# assert content["description"] == metadata["description"]
30-
# assert "id" in content
31-
# assert "owner_id" in content
3229

3330
def test_create_document(
3431
client: TestClient, superuser_token_headers: dict[str, str]

backend/pyproject.toml

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,8 @@ dependencies = [
2222
"pydantic-settings<3.0.0,>=2.2.1",
2323
"sentry-sdk[fastapi]<2.0.0,>=1.40.6",
2424
"pyjwt<3.0.0,>=2.8.0",
25+
"pymupdf>=1.26.3",
26+
"textract>=1.6.5",
2527
]
2628

2729
[tool.uv]

0 commit comments

Comments
 (0)