File tree Expand file tree Collapse file tree 1 file changed +43
-0
lines changed Expand file tree Collapse file tree 1 file changed +43
-0
lines changed Original file line number Diff line number Diff line change 1+ import os
2+ import tempfile
3+ from app .core .db import engine
4+ from sqlmodel import Session
5+ import textract
6+ import requests
7+ from app .models import Document
8+
9+
def extract_text_from_file(s3_url: str, timeout: float = 30.0) -> str:
    """Download the file at *s3_url* and return the text extracted from it.

    The response body is written to a temporary file, that file's path is
    handed to ``textract.process``, and the resulting bytes are decoded as
    UTF-8. The temporary file is removed whether or not extraction succeeds.

    Args:
        s3_url: URL fetched with an HTTP GET.
        timeout: Seconds ``requests`` waits on the server before giving up,
            so an unresponsive endpoint cannot block the caller forever.

    Returns:
        The extracted text.

    Raises:
        Exception: If the download, the extraction, the decoding, or the
            cleanup fails. The underlying error is chained as ``__cause__``.
    """
    try:
        response = requests.get(s3_url, timeout=timeout)
        response.raise_for_status()

        tmp_path = None
        try:
            # delete=False: the file is handed to textract by path after this
            # handle is closed, so it must outlive the ``with`` block.
            # NOTE(review): the suffix is hard-coded to ".pdf"; presumably
            # textract selects its parser from the extension -- confirm how
            # non-PDF uploads are expected to be handled.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
                tmp_path = tmp_file.name
                tmp_file.write(response.content)

            return textract.process(tmp_path).decode("utf-8")
        finally:
            # Runs on success and on failure so the temp file never leaks.
            if tmp_path is not None:
                os.remove(tmp_path)

    except Exception as e:
        raise Exception(f"Failed to extract text: {e}") from e
27+
def extract_text_and_save_to_db(s3_url: str, document_id: str) -> None:
    """Extract text from the file at *s3_url* and store it on a Document.

    The text is obtained from ``extract_text_from_file`` and written to the
    ``extracted_text`` attribute of the ``Document`` whose ``id`` equals
    *document_id*, then committed.

    This function is best-effort: any failure (extraction error, missing
    document, database error) is logged with its traceback and swallowed,
    so nothing is raised to the caller.

    Args:
        s3_url: URL of the file to extract text from.
        document_id: Value matched against ``Document.id``.
    """
    # Function-scope import: the module does not import logging at the top.
    import logging

    try:
        # Extract first so the session only spans the actual database work.
        text = extract_text_from_file(s3_url)

        with Session(engine) as session:
            document = (
                session.query(Document)
                .filter(Document.id == document_id)
                .first()
            )
            if not document:
                raise Exception(f"Document with ID {document_id} not found")

            document.extracted_text = text
            session.add(document)
            session.commit()

    except Exception:
        # logger.exception records the traceback, which print() discarded.
        logging.getLogger(__name__).exception(
            "Failed to extract and save text for document %s", document_id
        )
43+
You can’t perform that action at this time.
0 commit comments