Skip to content

Commit aa5d89f

Browse files
authored
📒 feat: Support CSV loading for non-UTF-8 files (#169)
- Added a function to determine file encoding based on BOM markers.
- Introduced a function to remove temporary UTF-8 files created during encoding conversion.
- Updated the CSV loading logic to handle non-UTF-8 encodings by creating temporary UTF-8 files for processing.
- Integrated cleanup functionality in the embedding routes to ensure temporary files are removed after processing.
1 parent dce6324 commit aa5d89f

File tree

2 files changed

+131
-16
lines changed

2 files changed

+131
-16
lines changed

app/routes/document_routes.py

Lines changed: 54 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,12 @@
3131
QueryMultipleBody,
3232
)
3333
from app.services.vector_store.async_pg_vector import AsyncPgVector
34-
from app.utils.document_loader import get_loader, clean_text, process_documents
34+
from app.utils.document_loader import (
35+
get_loader,
36+
clean_text,
37+
process_documents,
38+
cleanup_temp_encoding_file,
39+
)
3540
from app.utils.health import is_health_ok
3641

3742
router = APIRouter()
@@ -83,8 +88,12 @@ async def health_check():
8388
async def get_documents_by_ids(request: Request, ids: list[str] = Query(...)):
8489
try:
8590
if isinstance(vector_store, AsyncPgVector):
86-
existing_ids = await vector_store.get_filtered_ids(ids, executor=request.app.state.thread_pool)
87-
documents = await vector_store.get_documents_by_ids(ids, executor=request.app.state.thread_pool)
91+
existing_ids = await vector_store.get_filtered_ids(
92+
ids, executor=request.app.state.thread_pool
93+
)
94+
documents = await vector_store.get_documents_by_ids(
95+
ids, executor=request.app.state.thread_pool
96+
)
8897
else:
8998
existing_ids = vector_store.get_filtered_ids(ids)
9099
documents = vector_store.get_documents_by_ids(ids)
@@ -121,8 +130,12 @@ async def get_documents_by_ids(request: Request, ids: list[str] = Query(...)):
121130
async def delete_documents(request: Request, document_ids: List[str] = Body(...)):
122131
try:
123132
if isinstance(vector_store, AsyncPgVector):
124-
existing_ids = await vector_store.get_filtered_ids(document_ids, executor=request.app.state.thread_pool)
125-
await vector_store.delete(ids=document_ids, executor=request.app.state.thread_pool)
133+
existing_ids = await vector_store.get_filtered_ids(
134+
document_ids, executor=request.app.state.thread_pool
135+
)
136+
await vector_store.delete(
137+
ids=document_ids, executor=request.app.state.thread_pool
138+
)
126139
else:
127140
existing_ids = vector_store.get_filtered_ids(document_ids)
128141
vector_store.delete(ids=document_ids)
@@ -179,7 +192,7 @@ async def query_embeddings_by_file_id(
179192
embedding,
180193
k=body.k,
181194
filter={"file_id": body.file_id},
182-
executor=request.app.state.thread_pool
195+
executor=request.app.state.thread_pool,
183196
)
184197
else:
185198
documents = vector_store.similarity_search_with_score_by_vector(
@@ -245,7 +258,7 @@ async def store_data_in_vector_db(
245258
file_id: str,
246259
user_id: str = "",
247260
clean_content: bool = False,
248-
executor = None,
261+
executor=None,
249262
) -> bool:
250263
text_splitter = RecursiveCharacterTextSplitter(
251264
chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
@@ -313,8 +326,16 @@ async def embed_local_file(
313326
document.filename, document.file_content_type, document.filepath
314327
)
315328
data = await run_in_executor(request.app.state.thread_pool, loader.load)
329+
330+
# Clean up temporary UTF-8 file if it was created for encoding conversion
331+
cleanup_temp_encoding_file(loader)
332+
316333
result = await store_data_in_vector_db(
317-
data, document.file_id, user_id, clean_content=file_ext == "pdf", executor=request.app.state.thread_pool
334+
data,
335+
document.file_id,
336+
user_id,
337+
clean_content=file_ext == "pdf",
338+
executor=request.app.state.thread_pool,
318339
)
319340

320341
if result:
@@ -391,8 +412,16 @@ async def embed_file(
391412
file.filename, file.content_type, temp_file_path
392413
)
393414
data = await run_in_executor(request.app.state.thread_pool, loader.load)
415+
416+
# Clean up temporary UTF-8 file if it was created for encoding conversion
417+
cleanup_temp_encoding_file(loader)
418+
394419
result = await store_data_in_vector_db(
395-
data=data, file_id=file_id, user_id=user_id, clean_content=file_ext == "pdf", executor=request.app.state.thread_pool
420+
data=data,
421+
file_id=file_id,
422+
user_id=user_id,
423+
clean_content=file_ext == "pdf",
424+
executor=request.app.state.thread_pool,
396425
)
397426

398427
if not result:
@@ -458,8 +487,12 @@ async def load_document_context(request: Request, id: str):
458487
ids = [id]
459488
try:
460489
if isinstance(vector_store, AsyncPgVector):
461-
existing_ids = await vector_store.get_filtered_ids(ids, executor=request.app.state.thread_pool)
462-
documents = await vector_store.get_documents_by_ids(ids, executor=request.app.state.thread_pool)
490+
existing_ids = await vector_store.get_filtered_ids(
491+
ids, executor=request.app.state.thread_pool
492+
)
493+
documents = await vector_store.get_documents_by_ids(
494+
ids, executor=request.app.state.thread_pool
495+
)
463496
else:
464497
existing_ids = vector_store.get_filtered_ids(ids)
465498
documents = vector_store.get_documents_by_ids(ids)
@@ -526,8 +559,16 @@ async def embed_file_upload(
526559
)
527560

528561
data = await run_in_executor(request.app.state.thread_pool, loader.load)
562+
563+
# Clean up temporary UTF-8 file if it was created for encoding conversion
564+
cleanup_temp_encoding_file(loader)
565+
529566
result = await store_data_in_vector_db(
530-
data, file_id, user_id, clean_content=file_ext == "pdf", executor=request.app.state.thread_pool
567+
data,
568+
file_id,
569+
user_id,
570+
clean_content=file_ext == "pdf",
571+
executor=request.app.state.thread_pool,
531572
)
532573

533574
if not result:
@@ -577,7 +618,7 @@ async def query_embeddings_by_file_ids(request: Request, body: QueryMultipleBody
577618
embedding,
578619
k=body.k,
579620
filter={"file_id": {"$in": body.file_ids}},
580-
executor=request.app.state.thread_pool
621+
executor=request.app.state.thread_pool,
581622
)
582623
else:
583624
documents = vector_store.similarity_search_with_score_by_vector(

app/utils/document_loader.py

Lines changed: 77 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
11
# app/utils/document_loader.py
2+
import os
3+
import codecs
4+
import tempfile
25
from typing import List, Optional
36

47
from langchain_core.documents import Document
58

6-
from app.config import known_source_ext, PDF_EXTRACT_IMAGES, CHUNK_OVERLAP
9+
from app.config import known_source_ext, PDF_EXTRACT_IMAGES, CHUNK_OVERLAP, logger
710
from langchain_community.document_loaders import (
811
TextLoader,
912
PyPDFLoader,
@@ -17,14 +20,83 @@
1720
UnstructuredPowerPointLoader,
1821
)
1922

23+
24+
def detect_file_encoding(filepath: str) -> str:
    """
    Detect the encoding of a file by checking for BOM markers.

    BOMs are tested longest-first: the UTF-32-LE BOM (FF FE 00 00) begins
    with the UTF-16-LE BOM (FF FE), so testing the 2-byte marker first
    would misdetect every UTF-32-LE file as UTF-16-LE.

    :param filepath: Path of the file to inspect.
    :return: Codec name suitable for ``open(..., encoding=...)``;
        ``'utf-8'`` when no BOM is present.
    """
    with open(filepath, "rb") as f:
        raw = f.read(4)

    # Order matters: 4-byte UTF-32 BOMs before 2-byte UTF-16 BOMs.
    if raw.startswith(codecs.BOM_UTF32_LE):
        return "utf-32-le"
    elif raw.startswith(codecs.BOM_UTF32_BE):
        return "utf-32-be"
    elif raw.startswith(codecs.BOM_UTF16_LE):
        return "utf-16-le"
    elif raw.startswith(codecs.BOM_UTF16_BE):
        return "utf-16-be"
    elif raw.startswith(codecs.BOM_UTF8):
        return "utf-8-sig"
    else:
        # No BOM found — default to UTF-8 (a superset of ASCII).
        return "utf-8"
48+
49+
50+
def cleanup_temp_encoding_file(loader) -> None:
    """
    Remove the temporary UTF-8 file a loader may have created for
    encoding conversion (see the CSV handling in ``get_loader``).

    Idempotent: the ``_temp_filepath`` marker is cleared after removal,
    so repeated calls are no-ops instead of logging spurious warnings.
    An already-missing file also counts as success. Other failures are
    logged and swallowed — cleanup is best-effort and must never break
    the request that triggered it.

    :param loader: Document loader that may carry a ``_temp_filepath``
        attribute pointing at the temporary file.
    """
    temp_path = getattr(loader, "_temp_filepath", None)
    if temp_path is None:
        return
    try:
        os.remove(temp_path)
    except FileNotFoundError:
        # Already gone — treat as successful cleanup.
        pass
    except Exception as e:
        logger.warning(f"Failed to remove temporary UTF-8 file: {e}")
        return
    # Drop the marker so a second cleanup call is a silent no-op.
    del loader._temp_filepath
61+
62+
2063
def get_loader(filename: str, file_content_type: str, filepath: str):
2164
file_ext = filename.split(".")[-1].lower()
2265
known_type = True
2366

2467
if file_ext == "pdf":
2568
loader = PyPDFLoader(filepath, extract_images=PDF_EXTRACT_IMAGES)
2669
elif file_ext == "csv":
27-
loader = CSVLoader(filepath)
70+
# Detect encoding for CSV files
71+
encoding = detect_file_encoding(filepath)
72+
73+
if encoding != "utf-8":
74+
# For non-UTF-8 encodings, we need to convert the file first
75+
# Create a temporary UTF-8 file
76+
temp_file = None
77+
try:
78+
with tempfile.NamedTemporaryFile(
79+
mode="w", encoding="utf-8", suffix=".csv", delete=False
80+
) as temp_file:
81+
# Read the original file with detected encoding
82+
with open(filepath, "r", encoding=encoding) as original_file:
83+
content = original_file.read()
84+
temp_file.write(content)
85+
86+
temp_filepath = temp_file.name
87+
88+
# Use the temporary UTF-8 file with CSVLoader
89+
loader = CSVLoader(temp_filepath)
90+
91+
# Store the temp file path for cleanup
92+
loader._temp_filepath = temp_filepath
93+
except Exception as e:
94+
# If temp file was created but there was an error, clean it up
95+
if temp_file and os.path.exists(temp_file.name):
96+
os.unlink(temp_file.name)
97+
raise e
98+
else:
99+
loader = CSVLoader(filepath)
28100
elif file_ext == "rst":
29101
loader = UnstructuredRSTLoader(filepath, mode="elements")
30102
elif file_ext == "xml":
@@ -58,6 +130,7 @@ def get_loader(filename: str, file_content_type: str, filepath: str):
58130

59131
return loader, known_type, file_ext
60132

133+
61134
def clean_text(text: str) -> str:
    """
    Return *text* with every NUL (0x00) character removed.

    :param text: Input string, possibly containing NUL characters.
    :return: The input string minus all NUL characters.
    """
    # str.translate deletes code points mapped to None in one C-level pass.
    return text.translate({0: None})
69142

143+
70144
def process_documents(documents: List[Document]) -> str:
71145
processed_text = ""
72146
last_page: Optional[int] = None
@@ -91,4 +165,4 @@ def process_documents(documents: List[Document]) -> str:
91165
else:
92166
processed_text += new_content
93167

94-
return processed_text.strip()
168+
return processed_text.strip()

0 commit comments

Comments
 (0)