|
31 | 31 | QueryMultipleBody, |
32 | 32 | ) |
33 | 33 | from app.services.vector_store.async_pg_vector import AsyncPgVector |
34 | | -from app.utils.document_loader import get_loader, clean_text, process_documents |
| 34 | +from app.utils.document_loader import ( |
| 35 | + get_loader, |
| 36 | + clean_text, |
| 37 | + process_documents, |
| 38 | + cleanup_temp_encoding_file, |
| 39 | +) |
35 | 40 | from app.utils.health import is_health_ok |
36 | 41 |
|
37 | 42 | router = APIRouter() |
@@ -83,8 +88,12 @@ async def health_check(): |
83 | 88 | async def get_documents_by_ids(request: Request, ids: list[str] = Query(...)): |
84 | 89 | try: |
85 | 90 | if isinstance(vector_store, AsyncPgVector): |
86 | | - existing_ids = await vector_store.get_filtered_ids(ids, executor=request.app.state.thread_pool) |
87 | | - documents = await vector_store.get_documents_by_ids(ids, executor=request.app.state.thread_pool) |
| 91 | + existing_ids = await vector_store.get_filtered_ids( |
| 92 | + ids, executor=request.app.state.thread_pool |
| 93 | + ) |
| 94 | + documents = await vector_store.get_documents_by_ids( |
| 95 | + ids, executor=request.app.state.thread_pool |
| 96 | + ) |
88 | 97 | else: |
89 | 98 | existing_ids = vector_store.get_filtered_ids(ids) |
90 | 99 | documents = vector_store.get_documents_by_ids(ids) |
@@ -121,8 +130,12 @@ async def get_documents_by_ids(request: Request, ids: list[str] = Query(...)): |
121 | 130 | async def delete_documents(request: Request, document_ids: List[str] = Body(...)): |
122 | 131 | try: |
123 | 132 | if isinstance(vector_store, AsyncPgVector): |
124 | | - existing_ids = await vector_store.get_filtered_ids(document_ids, executor=request.app.state.thread_pool) |
125 | | - await vector_store.delete(ids=document_ids, executor=request.app.state.thread_pool) |
| 133 | + existing_ids = await vector_store.get_filtered_ids( |
| 134 | + document_ids, executor=request.app.state.thread_pool |
| 135 | + ) |
| 136 | + await vector_store.delete( |
| 137 | + ids=document_ids, executor=request.app.state.thread_pool |
| 138 | + ) |
126 | 139 | else: |
127 | 140 | existing_ids = vector_store.get_filtered_ids(document_ids) |
128 | 141 | vector_store.delete(ids=document_ids) |
@@ -179,7 +192,7 @@ async def query_embeddings_by_file_id( |
179 | 192 | embedding, |
180 | 193 | k=body.k, |
181 | 194 | filter={"file_id": body.file_id}, |
182 | | - executor=request.app.state.thread_pool |
| 195 | + executor=request.app.state.thread_pool, |
183 | 196 | ) |
184 | 197 | else: |
185 | 198 | documents = vector_store.similarity_search_with_score_by_vector( |
@@ -245,7 +258,7 @@ async def store_data_in_vector_db( |
245 | 258 | file_id: str, |
246 | 259 | user_id: str = "", |
247 | 260 | clean_content: bool = False, |
248 | | - executor = None, |
| 261 | + executor=None, |
249 | 262 | ) -> bool: |
250 | 263 | text_splitter = RecursiveCharacterTextSplitter( |
251 | 264 | chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP |
@@ -313,8 +326,16 @@ async def embed_local_file( |
313 | 326 | document.filename, document.file_content_type, document.filepath |
314 | 327 | ) |
315 | 328 | data = await run_in_executor(request.app.state.thread_pool, loader.load) |
| 329 | + |
| 330 | + # Clean up temporary UTF-8 file if it was created for encoding conversion |
| 331 | + cleanup_temp_encoding_file(loader) |
| 332 | + |
316 | 333 | result = await store_data_in_vector_db( |
317 | | - data, document.file_id, user_id, clean_content=file_ext == "pdf", executor=request.app.state.thread_pool |
| 334 | + data, |
| 335 | + document.file_id, |
| 336 | + user_id, |
| 337 | + clean_content=file_ext == "pdf", |
| 338 | + executor=request.app.state.thread_pool, |
318 | 339 | ) |
319 | 340 |
|
320 | 341 | if result: |
@@ -391,8 +412,16 @@ async def embed_file( |
391 | 412 | file.filename, file.content_type, temp_file_path |
392 | 413 | ) |
393 | 414 | data = await run_in_executor(request.app.state.thread_pool, loader.load) |
| 415 | + |
| 416 | + # Clean up temporary UTF-8 file if it was created for encoding conversion |
| 417 | + cleanup_temp_encoding_file(loader) |
| 418 | + |
394 | 419 | result = await store_data_in_vector_db( |
395 | | - data=data, file_id=file_id, user_id=user_id, clean_content=file_ext == "pdf", executor=request.app.state.thread_pool |
| 420 | + data=data, |
| 421 | + file_id=file_id, |
| 422 | + user_id=user_id, |
| 423 | + clean_content=file_ext == "pdf", |
| 424 | + executor=request.app.state.thread_pool, |
396 | 425 | ) |
397 | 426 |
|
398 | 427 | if not result: |
@@ -458,8 +487,12 @@ async def load_document_context(request: Request, id: str): |
458 | 487 | ids = [id] |
459 | 488 | try: |
460 | 489 | if isinstance(vector_store, AsyncPgVector): |
461 | | - existing_ids = await vector_store.get_filtered_ids(ids, executor=request.app.state.thread_pool) |
462 | | - documents = await vector_store.get_documents_by_ids(ids, executor=request.app.state.thread_pool) |
| 490 | + existing_ids = await vector_store.get_filtered_ids( |
| 491 | + ids, executor=request.app.state.thread_pool |
| 492 | + ) |
| 493 | + documents = await vector_store.get_documents_by_ids( |
| 494 | + ids, executor=request.app.state.thread_pool |
| 495 | + ) |
463 | 496 | else: |
464 | 497 | existing_ids = vector_store.get_filtered_ids(ids) |
465 | 498 | documents = vector_store.get_documents_by_ids(ids) |
@@ -526,8 +559,16 @@ async def embed_file_upload( |
526 | 559 | ) |
527 | 560 |
|
528 | 561 | data = await run_in_executor(request.app.state.thread_pool, loader.load) |
| 562 | + |
| 563 | + # Clean up temporary UTF-8 file if it was created for encoding conversion |
| 564 | + cleanup_temp_encoding_file(loader) |
| 565 | + |
529 | 566 | result = await store_data_in_vector_db( |
530 | | - data, file_id, user_id, clean_content=file_ext == "pdf", executor=request.app.state.thread_pool |
| 567 | + data, |
| 568 | + file_id, |
| 569 | + user_id, |
| 570 | + clean_content=file_ext == "pdf", |
| 571 | + executor=request.app.state.thread_pool, |
531 | 572 | ) |
532 | 573 |
|
533 | 574 | if not result: |
@@ -577,7 +618,7 @@ async def query_embeddings_by_file_ids(request: Request, body: QueryMultipleBody |
577 | 618 | embedding, |
578 | 619 | k=body.k, |
579 | 620 | filter={"file_id": {"$in": body.file_ids}}, |
580 | | - executor=request.app.state.thread_pool |
| 621 | + executor=request.app.state.thread_pool, |
581 | 622 | ) |
582 | 623 | else: |
583 | 624 | documents = vector_store.similarity_search_with_score_by_vector( |
|
0 commit comments