🔧 chore: Improve Document Loader and Vector Store Logging (#203)

danny-avila · web-flow · commit 109a2d33efe5 · 2025-08-27T15:53:53.000-04:00
* ✨ feat: Add parameter sanitization for logging in ExtendedPgVector to avoid lengthy embedding logs

- Implemented a static method `_sanitize_parameters_for_logging` to truncate large values and embeddings for improved logging clarity.
- Updated the `setup_query_logging` method to utilize the new sanitization method, ensuring embedding vectors and oversized values are not logged in full.

* 🔧 fix: Ensure temporary file cleanup only occurs if the filepath is set

- Updated the `cleanup_temp_encoding_file` function to check that `_temp_filepath` is not None before attempting to remove the file, so `os.remove` is never called with `None` when the attribute is present but unset.
diff --git a/app/services/vector_store/extended_pg_vector.py b/app/services/vector_store/extended_pg_vector.py
@@ -1,7 +1,7 @@
 import os
 import time
 import logging
-from typing import Optional
+from typing import Optional, Any, Dict, List, Union
 from sqlalchemy import event
 from sqlalchemy import delete
 from sqlalchemy.orm import Session
@@ -17,6 +17,63 @@ def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.setup_query_logging()
 
+    @staticmethod
+    def _sanitize_parameters_for_logging(
+        parameters: Union[Dict, List, tuple, Any]
+    ) -> Any:
+        """Return a log-safe copy of SQL bind parameters.
+
+        Numeric lists/tuples longer than 10 items, and any sized value stored
+        under a key containing "embedding", become a short placeholder; strings
+        over 500 characters are truncated; nested dicts, lists and tuples are
+        sanitized recursively. Everything else is returned unchanged.
+
+        An unsized value under an "embedding" key (e.g. None) is passed through
+        instead of raising, so sanitizing for the log never fails on such input.
+
+        :param parameters: Bind parameters (dict, list, tuple or scalar).
+        :return: Sanitized structure; the input is not mutated.
+        """
+        # Bound once so the nested helper can recurse via the public entry point.
+        sanitize = ExtendedPgVector._sanitize_parameters_for_logging
+
+        def is_vector(value: Any) -> bool:
+            # Only the first 10 items are type-checked to keep this cheap.
+            return (
+                isinstance(value, (list, tuple))
+                and len(value) > 10
+                and all(isinstance(x, (int, float)) for x in value[:10])
+            )
+
+        def clean(value: Any, is_embedding: bool = False) -> Any:
+            # hasattr guard: len() on None or a number would raise TypeError.
+            if is_vector(value) or (is_embedding and hasattr(value, "__len__")):
+                return f"<embedding vector of length {len(value)}>"
+            if isinstance(value, str) and len(value) > 500:
+                return value[:500] + "... (truncated)"
+            if isinstance(value, (dict, list, tuple)):
+                return sanitize(value)
+            return value
+
+        if isinstance(parameters, dict):
+            return {
+                key: clean(value, "embedding" in str(key).lower())
+                for key, value in parameters.items()
+            }
+
+        if isinstance(parameters, (list, tuple)):
+            # A batch made up solely of vectors collapses into one placeholder.
+            if parameters and all(is_vector(item) for item in parameters):
+                return f"<{len(parameters)} embedding vectors>"
+            sanitized = [clean(item) for item in parameters]
+            try:
+                return type(parameters)(sanitized)
+            except TypeError:
+                # e.g. a namedtuple cannot be rebuilt from a single iterable.
+                return tuple(sanitized)
+
+        return parameters
+
     def setup_query_logging(self):
         """Enable query logging for this vector store only if DEBUG_PGVECTOR_QUERIES is set"""
         # Only setup logging if the environment variable is set to a truthy value
@@ -45,7 +102,10 @@ def receive_before_cursor_execute(
             if "langchain_pg_embedding" in statement:
                 context._query_start_time = time.time()
                 logger.info(f"STARTING QUERY: {statement}")
-                logger.info(f"PARAMETERS: {parameters}")
+                sanitized_params = ExtendedPgVector._sanitize_parameters_for_logging(
+                    parameters
+                )
+                logger.info(f"PARAMETERS: {sanitized_params}")
 
         @event.listens_for(Engine, "after_cursor_execute")
         def receive_after_cursor_execute(
diff --git a/app/utils/document_loader.py b/app/utils/document_loader.py
@@ -61,7 +61,7 @@ def cleanup_temp_encoding_file(loader) -> None:
 
     :param loader: The document loader that may have created a temporary file
     """
-    if hasattr(loader, "_temp_filepath"):
+    if hasattr(loader, "_temp_filepath") and loader._temp_filepath is not None:
         try:
             os.remove(loader._temp_filepath)
         except Exception as e:
@@ -90,7 +90,9 @@ def get_loader(filename: str, file_content_type: str, filepath: str):
                     mode="w", encoding="utf-8", suffix=".csv", delete=False
                 ) as temp_file:
                     # Read the original file with detected encoding
-                    with open(filepath, "r", encoding=encoding, errors="replace") as original_file:
+                    with open(
+                        filepath, "r", encoding=encoding, errors="replace"
+                    ) as original_file:
                         content = original_file.read()
                         temp_file.write(content)
 
@@ -111,40 +113,40 @@ def get_loader(filename: str, file_content_type: str, filepath: str):
     elif file_ext == "rst":
         loader = UnstructuredRSTLoader(filepath, mode="elements")
     elif file_ext == "xml" or file_content_type in [
-            "application/xml",
-            "text/xml",
-            "application/xhtml+xml",
-        ]:
+        "application/xml",
+        "text/xml",
+        "application/xhtml+xml",
+    ]:
         loader = UnstructuredXMLLoader(filepath)
     elif file_ext in ["ppt", "pptx"] or file_content_type in [
-            "application/vnd.ms-powerpoint",
-            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
-        ]:
+        "application/vnd.ms-powerpoint",
+        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+    ]:
         loader = UnstructuredPowerPointLoader(filepath)
     elif file_ext == "md" or file_content_type in [
-            "text/markdown",
-            "text/x-markdown",
-            "application/markdown",
-            "application/x-markdown",
-        ]:
+        "text/markdown",
+        "text/x-markdown",
+        "application/markdown",
+        "application/x-markdown",
+    ]:
         loader = UnstructuredMarkdownLoader(filepath)
     elif file_ext == "epub" or file_content_type == "application/epub+zip":
         loader = UnstructuredEPubLoader(filepath)
     elif file_ext in ["doc", "docx"] or file_content_type in [
-            "application/msword",
-            "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
-        ]:
+        "application/msword",
+        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+    ]:
         loader = Docx2txtLoader(filepath)
     elif file_ext in ["xls", "xlsx"] or file_content_type in [
-            "application/vnd.ms-excel",
-            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
-        ]:
+        "application/vnd.ms-excel",
+        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+    ]:
         loader = UnstructuredExcelLoader(filepath)
     elif file_ext == "json" or file_content_type == "application/json":
         loader = TextLoader(filepath, autodetect_encoding=True)
     elif file_ext in known_source_ext or (
-            file_content_type and file_content_type.find("text/") >= 0
-        ):
+        file_content_type and file_content_type.find("text/") >= 0
+    ):
         loader = TextLoader(filepath, autodetect_encoding=True)
     else:
         loader = TextLoader(filepath, autodetect_encoding=True)