Commit 8cc6695

✨ feat: Optimize PostgreSQL Queries and Add Optional Detailed Query Logging (#172)
* fix: vector index creation in database service on `cmetadata->>'file_id'` due to upstream query change
* ✨ feat: Optimize PostgreSQL Queries and Add Optional Detailed Query Logging
  - Introduced a new environment variable `DEBUG_PGVECTOR_QUERIES` to enable detailed logging of pgvector operations.
  - Implemented query logging setup in the ExtendedPgVector class, capturing execution time and parameters for relevant queries.
  - Updated README.md to document the new environment variable.
1 parent aa5d89f · commit 8cc6695

File tree

5 files changed: +122 −24 lines changed

README.md

Lines changed: 1 addition & 0 deletions

@@ -61,6 +61,7 @@ The following environment variables are required to run the application:
 - `RAG_UPLOAD_DIR`: (Optional) The directory where uploaded files are stored. Default value is "./uploads/".
 - `PDF_EXTRACT_IMAGES`: (Optional) A boolean value indicating whether to extract images from PDF files. Default value is "False".
 - `DEBUG_RAG_API`: (Optional) Set to "True" to show more verbose logging output in the server console, and to enable postgresql database routes
+- `DEBUG_PGVECTOR_QUERIES`: (Optional) Set to "True" to enable detailed PostgreSQL query logging for pgvector operations. Useful for debugging performance issues with vector database queries.
 - `CONSOLE_JSON`: (Optional) Set to "True" to log as json for Cloud Logging aggregations
 - `EMBEDDINGS_PROVIDER`: (Optional) either "openai", "bedrock", "azure", "huggingface", "huggingfacetei", "vertexai", or "ollama", where "huggingface" uses sentence_transformers; defaults to "openai"
 - `EMBEDDINGS_MODEL`: (Optional) Set a valid embeddings model to use from the configured provider.
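
For reference, the new flag is parsed permissively rather than with a strict boolean cast. A minimal sketch of the same check (the accepted truthy spellings below come straight from the `setup_query_logging` code added in this commit):

import os

# Any of "true", "1", "yes", "on" (case-insensitive) enables query logging;
# this mirrors the check added in extended_pg_vector.py in this commit.
os.environ["DEBUG_PGVECTOR_QUERIES"] = "True"
enabled = os.getenv("DEBUG_PGVECTOR_QUERIES", "").lower() in ["true", "1", "yes", "on"]
print(enabled)  # True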

app/services/database.py

Lines changed: 18 additions & 6 deletions

@@ -2,6 +2,7 @@
 import asyncpg
 from app.config import DSN, logger
 
+
 class PSQLDatabase:
     pool = None
 
@@ -17,18 +18,29 @@ async def close_pool(cls):
         await cls.pool.close()
         cls.pool = None
 
-async def ensure_custom_id_index_on_embedding():
+
+async def ensure_vector_indexes():
     table_name = "langchain_pg_embedding"
     column_name = "custom_id"
     # You might want to standardize the index naming convention
     index_name = f"idx_{table_name}_{column_name}"
 
     pool = await PSQLDatabase.get_pool()
     async with pool.acquire() as conn:
-        await conn.execute(f"""
-            CREATE INDEX IF NOT EXISTS {index_name} ON {table_name} ({column_name});
-        """)
-        logger.debug(f"Checking if index '{index_name}' on '{table_name}({column_name}) exists, if not found then the index is created.'")
+        await conn.execute(
+            f"""
+            CREATE INDEX IF NOT EXISTS {index_name} ON {table_name} ({column_name});
+            """
+        )
+
+        await conn.execute(
+            f"""
+            CREATE INDEX IF NOT EXISTS idx_{table_name}_file_id
+            ON {table_name} ((cmetadata->>'file_id'));
+            """
+        )
+
+        logger.info("Vector database indexes ensured")
 
 
 async def pg_health_check() -> bool:
@@ -39,4 +51,4 @@ async def pg_health_check() -> bool:
         return True
     except Exception as e:
         logger.error(f"Health check failed: {e}")
-        return False
+        return False
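
The second `CREATE INDEX` is an expression index on `(cmetadata->>'file_id')`, so it only accelerates queries that filter on exactly that JSONB expression, which matches the upstream query change named in the commit title. A quick way to confirm both indexes exist after startup is to query PostgreSQL's standard `pg_indexes` view; the helper below is a hypothetical verification script, not part of the commit, and the DSN is a placeholder:

import asyncio

import asyncpg

async def list_embedding_indexes(dsn: str) -> list[str]:
    # List the indexes defined on the langchain_pg_embedding table.
    conn = await asyncpg.connect(dsn)
    try:
        rows = await conn.fetch(
            "SELECT indexname FROM pg_indexes WHERE tablename = $1",
            "langchain_pg_embedding",
        )
        return [row["indexname"] for row in rows]
    finally:
        await conn.close()

# Expect idx_langchain_pg_embedding_custom_id and idx_langchain_pg_embedding_file_id
# once ensure_vector_indexes() has run. The DSN below is a placeholder.
print(asyncio.run(list_embedding_indexes("postgresql://user:pass@localhost:5432/mydb")))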

app/services/vector_store/extended_pg_vector.py

Lines changed: 61 additions & 5 deletions

@@ -1,19 +1,73 @@
+import os
+import time
+import logging
 from typing import Optional
-
+from sqlalchemy import event
 from sqlalchemy import delete
 from sqlalchemy.orm import Session
+from sqlalchemy.engine import Engine
 from langchain_core.documents import Document
 from langchain_community.vectorstores.pgvector import PGVector
 
+
 class ExtendedPgVector(PGVector):
+    _query_logging_setup = False
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.setup_query_logging()
+
+    def setup_query_logging(self):
+        """Enable query logging for this vector store only if DEBUG_PGVECTOR_QUERIES is set"""
+        # Only setup logging if the environment variable is set to a truthy value
+        debug_queries = os.getenv("DEBUG_PGVECTOR_QUERIES", "").lower()
+        if debug_queries not in ["true", "1", "yes", "on"]:
+            return
+
+        # Only setup once per class
+        if ExtendedPgVector._query_logging_setup:
+            return
+
+        logger = logging.getLogger("pgvector.queries")
+        logger.setLevel(logging.INFO)
+
+        # Create handler if it doesn't exist
+        if not logger.handlers:
+            handler = logging.StreamHandler()
+            formatter = logging.Formatter("%(asctime)s - PGVECTOR QUERY - %(message)s")
+            handler.setFormatter(formatter)
+            logger.addHandler(handler)
+
+        @event.listens_for(Engine, "before_cursor_execute")
+        def receive_before_cursor_execute(
+            conn, cursor, statement, parameters, context, executemany
+        ):
+            if "langchain_pg_embedding" in statement:
+                context._query_start_time = time.time()
+                logger.info(f"STARTING QUERY: {statement}")
+                logger.info(f"PARAMETERS: {parameters}")
+
+        @event.listens_for(Engine, "after_cursor_execute")
+        def receive_after_cursor_execute(
+            conn, cursor, statement, parameters, context, executemany
+        ):
+            if "langchain_pg_embedding" in statement:
+                total = time.time() - context._query_start_time
+                logger.info(f"COMPLETED QUERY in {total:.4f}s")
+                logger.info("-" * 50)
+
+        ExtendedPgVector._query_logging_setup = True
+
     def get_all_ids(self) -> list[str]:
         with Session(self._bind) as session:
             results = session.query(self.EmbeddingStore.custom_id).all()
             return [result[0] for result in results if result[0] is not None]
-
+
     def get_filtered_ids(self, ids: list[str]) -> list[str]:
         with Session(self._bind) as session:
-            query = session.query(self.EmbeddingStore.custom_id).filter(self.EmbeddingStore.custom_id.in_(ids))
+            query = session.query(self.EmbeddingStore.custom_id).filter(
+                self.EmbeddingStore.custom_id.in_(ids)
+            )
             results = query.all()
             return [result[0] for result in results if result[0] is not None]
 
@@ -45,7 +99,9 @@ def _delete_multiple(
             if not collection:
                 self.logger.warning("Collection not found")
                 return
-            stmt = stmt.where(self.EmbeddingStore.collection_id == collection.uuid)
+            stmt = stmt.where(
+                self.EmbeddingStore.collection_id == collection.uuid
+            )
             stmt = stmt.where(self.EmbeddingStore.custom_id.in_(ids))
             session.execute(stmt)
-            session.commit()
+            session.commit()
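
Note that `event.listens_for(Engine, ...)` registers the listeners on the Engine class, so they fire for every SQLAlchemy engine in the process; the `_query_logging_setup` class flag is what keeps multiple ExtendedPgVector instances from registering duplicate listeners. A minimal, self-contained sketch of the same before/after timing pattern, run against an in-memory SQLite engine instead of pgvector:

import time

from sqlalchemy import create_engine, event, text
from sqlalchemy.engine import Engine

@event.listens_for(Engine, "before_cursor_execute")
def before_execute(conn, cursor, statement, parameters, context, executemany):
    # Stash the start time on the execution context, as the commit's code does.
    context._query_start_time = time.time()

@event.listens_for(Engine, "after_cursor_execute")
def after_execute(conn, cursor, statement, parameters, context, executemany):
    elapsed = time.time() - context._query_start_time
    print(f"{elapsed:.4f}s  {statement}")

engine = create_engine("sqlite:///:memory:")
with engine.connect() as conn:
    conn.execute(text("SELECT 1"))  # prints a timing line for this statement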

main.py

Lines changed: 28 additions & 9 deletions

@@ -9,31 +9,49 @@
 
 from starlette.responses import JSONResponse
 
-from app.config import VectorDBType, debug_mode, RAG_HOST, RAG_PORT, CHUNK_SIZE, CHUNK_OVERLAP, PDF_EXTRACT_IMAGES, VECTOR_DB_TYPE, \
-    LogMiddleware, logger
+from app.config import (
+    VectorDBType,
+    debug_mode,
+    RAG_HOST,
+    RAG_PORT,
+    CHUNK_SIZE,
+    CHUNK_OVERLAP,
+    PDF_EXTRACT_IMAGES,
+    VECTOR_DB_TYPE,
+    LogMiddleware,
+    logger,
+)
 from app.middleware import security_middleware
 from app.routes import document_routes, pgvector_routes
-from app.services.database import PSQLDatabase, ensure_custom_id_index_on_embedding
+from app.services.database import PSQLDatabase, ensure_vector_indexes
+
 
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     # Startup logic goes here
     # Create bounded thread pool executor based on CPU cores
-    max_workers = min(int(os.getenv("RAG_THREAD_POOL_SIZE", str(os.cpu_count()))), 8)  # Cap at 8
-    app.state.thread_pool = ThreadPoolExecutor(max_workers=max_workers, thread_name_prefix="rag-worker")
-    logger.info(f"Initialized thread pool with {max_workers} workers (CPU cores: {os.cpu_count()})")
-
+    max_workers = min(
+        int(os.getenv("RAG_THREAD_POOL_SIZE", str(os.cpu_count()))), 8
+    )  # Cap at 8
+    app.state.thread_pool = ThreadPoolExecutor(
+        max_workers=max_workers, thread_name_prefix="rag-worker"
+    )
+    logger.info(
+        f"Initialized thread pool with {max_workers} workers (CPU cores: {os.cpu_count()})"
+    )
+
     if VECTOR_DB_TYPE == VectorDBType.PGVECTOR:
         await PSQLDatabase.get_pool()  # Initialize the pool
-        await ensure_custom_id_index_on_embedding()
+        await ensure_vector_indexes()
 
     yield
-
+
     # Cleanup logic
     logger.info("Shutting down thread pool")
     app.state.thread_pool.shutdown(wait=True)
     logger.info("Thread pool shutdown complete")
 
+
 app = FastAPI(lifespan=lifespan, debug=debug_mode)
 
 app.add_middleware(
@@ -74,5 +92,6 @@ async def validation_exception_handler(request: Request, exc: RequestValidationE
     },
 )
 
+
 if __name__ == "__main__":
     uvicorn.run(app, host=RAG_HOST, port=RAG_PORT, log_config=None)
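
The startup/shutdown split follows FastAPI's lifespan contract: everything before `yield` runs once at startup, everything after it runs once at shutdown. A stripped-down sketch of the pattern, with the database and config details from main.py omitted:

from concurrent.futures import ThreadPoolExecutor
from contextlib import asynccontextmanager

from fastapi import FastAPI

@asynccontextmanager
async def lifespan(app: FastAPI):
    # Startup: create shared resources and stash them on app.state.
    app.state.thread_pool = ThreadPoolExecutor(max_workers=4)
    yield
    # Shutdown: release them once the server stops serving requests.
    app.state.thread_pool.shutdown(wait=True)

app = FastAPI(lifespan=lifespan)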

tests/services/test_database.py

Lines changed: 14 additions & 4 deletions

@@ -1,39 +1,49 @@
 import pytest
-from app.services.database import ensure_custom_id_index_on_embedding, PSQLDatabase
+from app.services.database import ensure_vector_indexes, PSQLDatabase
+
 
 # Create dummy classes to simulate a database connection and pool
 class DummyConnection:
     async def fetchval(self, query, index_name):
         # Simulate that the index does not exist
         return False
+
     async def execute(self, query):
         return "Executed"
 
+
 class DummyAcquire:
     async def __aenter__(self):
         return DummyConnection()
+
     async def __aexit__(self, exc_type, exc, tb):
         pass
 
+
 class DummyPool:
     def acquire(self):
         return DummyAcquire()
 
+
 class DummyDatabase:
     pool = DummyPool()
 
     @classmethod
     async def get_pool(cls):
         return cls.pool
 
+
 @pytest.fixture
 def dummy_pool(monkeypatch):
     monkeypatch.setattr(PSQLDatabase, "get_pool", DummyDatabase.get_pool)
     return DummyPool()
 
+
 import asyncio
+
+
 @pytest.mark.asyncio
-async def test_ensure_custom_id_index_on_embedding(monkeypatch, dummy_pool):
-    result = await ensure_custom_id_index_on_embedding()
+async def test_ensure_vector_indexes(monkeypatch, dummy_pool):
+    result = await ensure_vector_indexes()
     # If no exceptions are raised, the function worked as expected.
-    assert result is None
+    assert result is None
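
The dummy classes work because `async with pool.acquire() as conn:` only requires the async context manager protocol, so no real asyncpg pool is needed. A runnable sketch of that duck typing, separate from the test file and using hypothetical Fake* names:

import asyncio

class FakeConnection:
    async def execute(self, query):
        return "Executed"

class FakeAcquire:
    # Implementing __aenter__/__aexit__ is all `async with` requires.
    async def __aenter__(self):
        return FakeConnection()

    async def __aexit__(self, exc_type, exc, tb):
        pass

async def demo():
    async with FakeAcquire() as conn:
        print(await conn.execute("CREATE INDEX IF NOT EXISTS ..."))  # Executed

asyncio.run(demo())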
