Skip to content

Commit 109a2d3

Browse files
authored
🔧 chore: Improve Document Loader and Vector Store Logging (#203)
* ✨ feat: Add parameter sanitization for logging in ExtendedPgVector to avoid lengthy embedding logs - Implemented a static method `_sanitize_parameters_for_logging` to truncate large values and embeddings for improved logging clarity. - Updated the `setup_query_logging` method to utilize the new sanitization method, ensuring sensitive or large data is not logged directly. * 🔧 fix: Ensure temporary file cleanup only occurs if the filepath is set - Updated the `cleanup_temp_encoding_file` function to check that `_temp_filepath` is not None before attempting to remove the file, preventing potential errors when the attribute is present but not initialized.
1 parent d9f8272 commit 109a2d3

File tree

2 files changed

+86
-24
lines changed

2 files changed

+86
-24
lines changed

‎app/services/vector_store/extended_pg_vector.py‎

Lines changed: 62 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import os
22
import time
33
import logging
4-
from typing import Optional
4+
from typing import Optional, Any, Dict, List, Union
55
from sqlalchemy import event
66
from sqlalchemy import delete
77
from sqlalchemy.orm import Session
@@ -17,6 +17,63 @@ def __init__(self, *args, **kwargs):
1717
super().__init__(*args, **kwargs)
1818
self.setup_query_logging()
1919

20+
@staticmethod
def _sanitize_parameters_for_logging(
    parameters: Union[Dict, List, tuple, Any]
) -> Any:
    """Return a compact copy of ``parameters`` that is suitable for logging.

    Embedding vectors are replaced by a short placeholder, long strings are
    truncated, and nested dicts/lists/tuples are sanitized recursively. The
    input object is never mutated.

    Rules, applied in this order:

    * A dict entry whose key contains ``"embedding"`` (case-insensitive) and
      whose value has a length is replaced by
      ``"<embedding vector of length N>"``. Values without a length (for
      example ``None`` or an int) fall through to the rules below instead of
      raising ``TypeError``.
    * A list/tuple with more than 10 items whose first 10 items are all
      ``int``/``float`` is treated as an embedding vector and replaced by the
      same placeholder.
    * A non-empty list/tuple consisting only of such vectors collapses to the
      single string ``"<N embedding vectors>"``.
    * A string longer than 500 characters is cut to 500 characters and
      suffixed with ``"... (truncated)"``.
    * Any other value is returned unchanged.

    :param parameters: Statement parameters (dict, list, tuple, ``None`` or
        any other object).
    :return: The sanitized structure. Dicts come back as plain ``dict``;
        lists/tuples keep their original type when that type can be built
        from a single iterable, otherwise a plain ``list``/``tuple`` is
        returned.
    """
    max_text_length = 500  # longest string that is logged verbatim
    probe_length = 10  # vectors have more items than this; only this many are type-checked

    def length_or_none(value: Any) -> Any:
        # len() raises TypeError for unsized objects (None, ints, ...).
        try:
            return len(value)
        except TypeError:
            return None

    def looks_like_vector(value: Any) -> bool:
        # Only a prefix is inspected so very large vectors stay cheap to check.
        return (
            isinstance(value, (list, tuple))
            and len(value) > probe_length
            and all(isinstance(x, (int, float)) for x in value[:probe_length])
        )

    def sanitize_value(value: Any) -> Any:
        if looks_like_vector(value):
            return f"<embedding vector of length {len(value)}>"
        if isinstance(value, str) and len(value) > max_text_length:
            return value[:max_text_length] + "... (truncated)"
        if isinstance(value, (dict, list, tuple)):
            return sanitize(value)
        return value

    def sanitize(params: Any) -> Any:
        if isinstance(params, dict):
            cleaned = {}
            for key, value in params.items():
                size = None
                if "embedding" in str(key).lower():
                    # The key name alone marks the value as an embedding, but
                    # a placeholder is only possible when it has a length.
                    size = length_or_none(value)
                if size is not None:
                    cleaned[key] = f"<embedding vector of length {size}>"
                else:
                    cleaned[key] = sanitize_value(value)
            return cleaned

        if isinstance(params, (list, tuple)):
            # A batch made up purely of vectors is summarized in one string.
            if len(params) > 0 and all(looks_like_vector(item) for item in params):
                return f"<{len(params)} embedding vectors>"

            cleaned_items = [sanitize_value(item) for item in params]
            try:
                return type(params)(cleaned_items)
            except TypeError:
                # Subclasses such as namedtuples cannot be rebuilt from one
                # iterable; fall back to the plain built-in container.
                fallback = tuple if isinstance(params, tuple) else list
                return fallback(cleaned_items)

        # None and every non-container value are logged as they are.
        return params

    return sanitize(parameters)
76+
2077
def setup_query_logging(self):
2178
"""Enable query logging for this vector store only if DEBUG_PGVECTOR_QUERIES is set"""
2279
# Only setup logging if the environment variable is set to a truthy value
@@ -45,7 +102,10 @@ def receive_before_cursor_execute(
45102
if "langchain_pg_embedding" in statement:
46103
context._query_start_time = time.time()
47104
logger.info(f"STARTING QUERY: {statement}")
48-
logger.info(f"PARAMETERS: {parameters}")
105+
sanitized_params = ExtendedPgVector._sanitize_parameters_for_logging(
106+
parameters
107+
)
108+
logger.info(f"PARAMETERS: {sanitized_params}")
49109

50110
@event.listens_for(Engine, "after_cursor_execute")
51111
def receive_after_cursor_execute(

‎app/utils/document_loader.py‎

Lines changed: 24 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ def cleanup_temp_encoding_file(loader) -> None:
6161
6262
:param loader: The document loader that may have created a temporary file
6363
"""
64-
if hasattr(loader, "_temp_filepath"):
64+
if hasattr(loader, "_temp_filepath") and loader._temp_filepath is not None:
6565
try:
6666
os.remove(loader._temp_filepath)
6767
except Exception as e:
@@ -90,7 +90,9 @@ def get_loader(filename: str, file_content_type: str, filepath: str):
9090
mode="w", encoding="utf-8", suffix=".csv", delete=False
9191
) as temp_file:
9292
# Read the original file with detected encoding
93-
with open(filepath, "r", encoding=encoding, errors="replace") as original_file:
93+
with open(
94+
filepath, "r", encoding=encoding, errors="replace"
95+
) as original_file:
9496
content = original_file.read()
9597
temp_file.write(content)
9698

@@ -111,40 +113,40 @@ def get_loader(filename: str, file_content_type: str, filepath: str):
111113
elif file_ext == "rst":
112114
loader = UnstructuredRSTLoader(filepath, mode="elements")
113115
elif file_ext == "xml" or file_content_type in [
114-
"application/xml",
115-
"text/xml",
116-
"application/xhtml+xml",
117-
]:
116+
"application/xml",
117+
"text/xml",
118+
"application/xhtml+xml",
119+
]:
118120
loader = UnstructuredXMLLoader(filepath)
119121
elif file_ext in ["ppt", "pptx"] or file_content_type in [
120-
"application/vnd.ms-powerpoint",
121-
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
122-
]:
122+
"application/vnd.ms-powerpoint",
123+
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
124+
]:
123125
loader = UnstructuredPowerPointLoader(filepath)
124126
elif file_ext == "md" or file_content_type in [
125-
"text/markdown",
126-
"text/x-markdown",
127-
"application/markdown",
128-
"application/x-markdown",
129-
]:
127+
"text/markdown",
128+
"text/x-markdown",
129+
"application/markdown",
130+
"application/x-markdown",
131+
]:
130132
loader = UnstructuredMarkdownLoader(filepath)
131133
elif file_ext == "epub" or file_content_type == "application/epub+zip":
132134
loader = UnstructuredEPubLoader(filepath)
133135
elif file_ext in ["doc", "docx"] or file_content_type in [
134-
"application/msword",
135-
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
136-
]:
136+
"application/msword",
137+
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
138+
]:
137139
loader = Docx2txtLoader(filepath)
138140
elif file_ext in ["xls", "xlsx"] or file_content_type in [
139-
"application/vnd.ms-excel",
140-
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
141-
]:
141+
"application/vnd.ms-excel",
142+
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
143+
]:
142144
loader = UnstructuredExcelLoader(filepath)
143145
elif file_ext == "json" or file_content_type == "application/json":
144146
loader = TextLoader(filepath, autodetect_encoding=True)
145147
elif file_ext in known_source_ext or (
146-
file_content_type and file_content_type.find("text/") >= 0
147-
):
148+
file_content_type and file_content_type.find("text/") >= 0
149+
):
148150
loader = TextLoader(filepath, autodetect_encoding=True)
149151
else:
150152
loader = TextLoader(filepath, autodetect_encoding=True)

0 commit comments

Comments
 (0)