11"""Handler for REST API call to provide answer to query."""
22
3+ import asyncio
34import ast
45import logging
56import re
7778 503 : ServiceUnavailableResponse .openapi_response (),
7879}
7980
# Track background tasks to prevent garbage collection.
# asyncio keeps only weak references to running tasks, so tasks created with
# asyncio.create_task() need a strong reference somewhere or they may be
# garbage-collected before they complete.
background_tasks_set: set[asyncio.Task] = set()


def create_background_task(coro: Any) -> None:
    """Create a detached background task and keep a strong reference to it.

    The task runs independently of the HTTP request lifecycle. It is stored
    in a module-level set to maintain a strong reference (preventing premature
    garbage collection) and removes itself from the set on completion via a
    done-callback.

    Args:
        coro: Coroutine to run as a background task.
    """
    try:
        task = asyncio.create_task(coro)
        background_tasks_set.add(task)
        # discard (not remove) so a duplicate callback cannot raise KeyError
        task.add_done_callback(background_tasks_set.discard)
        # Lazy %-style args match the logging style used elsewhere in this
        # module and avoid formatting cost when DEBUG is disabled.
        logger.debug(
            "Background task created, active tasks: %d", len(background_tasks_set)
        )
    except Exception as e:  # pylint: disable=broad-exception-caught
        # Best-effort: a scheduling failure (e.g. no running event loop)
        # must not propagate into the request handler.
        logger.error("Failed to create background task: %s", e, exc_info=True)
80108
81109def is_transcripts_enabled () -> bool :
82110 """Check if transcripts is enabled.
@@ -297,26 +325,6 @@ async def query_endpoint_handler_base( # pylint: disable=R0914
297325 )
298326 )
299327
300- # Get the initial topic summary for the conversation
301- topic_summary = None
302- with get_session () as session :
303- existing_conversation = (
304- session .query (UserConversation ).filter_by (id = conversation_id ).first ()
305- )
306- if not existing_conversation :
307- # Check if topic summary should be generated (default: True)
308- should_generate = query_request .generate_topic_summary
309-
310- if should_generate :
311- logger .debug ("Generating topic summary for new conversation" )
312- topic_summary = await get_topic_summary_func (
313- query_request .query , client , llama_stack_model_id
314- )
315- else :
316- logger .debug (
317- "Topic summary generation disabled by request parameter"
318- )
319- topic_summary = None
320328 # Convert RAG chunks to dictionary format once for reuse
321329 logger .info ("Processing RAG chunks..." )
322330 rag_chunks_dict = [chunk .model_dump () for chunk in summary .rag_chunks ]
@@ -338,15 +346,6 @@ async def query_endpoint_handler_base( # pylint: disable=R0914
338346 attachments = query_request .attachments or [],
339347 )
340348
341- logger .info ("Persisting conversation details..." )
342- persist_user_conversation_details (
343- user_id = user_id ,
344- conversation_id = conversation_id ,
345- model = model_id ,
346- provider_id = provider_id ,
347- topic_summary = topic_summary ,
348- )
349-
350349 completed_at = datetime .now (UTC ).strftime ("%Y-%m-%dT%H:%M:%SZ" )
351350 cache_entry = CacheEntry (
352351 query = query_request .query ,
@@ -376,15 +375,20 @@ async def query_endpoint_handler_base( # pylint: disable=R0914
376375 conversation_id ,
377376 cache_entry ,
378377 _skip_userid_check ,
379- topic_summary ,
378+ None , # topic_summary is generated in background task
380379 )
381380
382381 # Convert tool calls to response format
383382 logger .info ("Processing tool calls..." )
384383
385384 logger .info ("Using referenced documents from response..." )
386385
387- available_quotas = get_available_quotas (configuration .quota_limiters , user_id )
386+ # Get available quotas if quota limiters are configured
387+ available_quotas = {}
388+ if configuration .quota_limiters :
389+ available_quotas = get_available_quotas (
390+ configuration .quota_limiters , user_id
391+ )
388392
389393 logger .info ("Building final response..." )
390394 response = QueryResponse (
@@ -399,10 +403,95 @@ async def query_endpoint_handler_base( # pylint: disable=R0914
399403 output_tokens = token_usage .output_tokens ,
400404 available_quotas = available_quotas ,
401405 )
406+
407+ # Schedule conversation persistence as a detached background task
408+ # IMPORTANT: We use asyncio.create_task() instead of FastAPI's BackgroundTasks
409+ # for two critical reasons:
410+ # 1. Complete detachment from request context: The task runs independently,
411+ # not tied to the HTTP request lifecycle or middleware processing
412+ # 2. MCP session lifecycle compatibility: Llama Stack's MCPSessionManager.close_all()
413+ # aggressively cancels tasks within the request context. By creating a detached
414+ # task, we avoid this cancellation scope entirely.
415+ async def persist_with_topic_summary () -> None :
416+ """Persist conversation with topic summary generation.
417+
418+ This function runs as a background task AFTER the HTTP response has been sent.
419+
420+ Strategy for MCP compatibility and database isolation:
421+ 1. Wait 500ms for MCP session cleanup to complete naturally
422+ 2. Then safely call LLM for topic summary generation without cancellation
423+ 3. Use independent database sessions in thread pool to avoid connection issues
424+ 4. Persist conversation details with or without topic summary
425+
426+ The delay ensures MCPSessionManager.close_all() has finished its cleanup
427+ before we make any new LLM calls, preventing CancelledError exceptions.
428+ Database operations run in thread pool to isolate from request lifecycle.
429+ """
430+ logger .debug ("Background task: waiting for MCP cleanup" )
431+ # Give MCP sessions time to clean up (they close after response is sent)
432+ await asyncio .sleep (0.5 ) # 500ms should be enough for cleanup
433+ logger .debug ("Background task: MCP cleanup complete" )
434+
435+ topic_summary = None
436+ should_generate = (
437+ query_request .generate_topic_summary
438+ if query_request .generate_topic_summary is not None
439+ else True
440+ )
441+
442+ # Check if this is a new conversation and generate topic summary if needed
443+ if should_generate :
444+ try :
445+
446+ def check_conversation_exists () -> bool :
447+ """Check if conversation exists in database (runs in thread pool)."""
448+ with get_session () as session :
449+ existing = (
450+ session .query (UserConversation )
451+ .filter_by (id = conversation_id )
452+ .first ()
453+ )
454+ return existing is not None
455+
456+ # Run database check in thread pool to avoid connection issues
457+ conversation_exists = await asyncio .to_thread (
458+ check_conversation_exists
459+ )
460+
461+ if not conversation_exists :
462+ logger .debug ("Generating topic summary for new conversation" )
463+ topic_summary = await get_topic_summary_func (
464+ query_request .query , client , llama_stack_model_id
465+ )
466+ logger .info ("Topic summary generated successfully" )
467+ except Exception as e : # pylint: disable=broad-exception-caught
468+ logger .error ("Failed to generate topic summary: %s" , e )
469+ topic_summary = None
470+
471+ # Persist conversation
472+ try :
473+
474+ def persist_conversation () -> None :
475+ """Persist conversation to database (runs in thread pool)."""
476+ persist_user_conversation_details (
477+ user_id = user_id ,
478+ conversation_id = conversation_id ,
479+ model = model_id ,
480+ provider_id = provider_id ,
481+ topic_summary = topic_summary ,
482+ )
483+
484+ # Run persistence in thread pool to avoid connection issues
485+ await asyncio .to_thread (persist_conversation )
486+ logger .debug ("Conversation persisted successfully" )
487+ except Exception as e : # pylint: disable=broad-exception-caught
488+ logger .error ("Failed to persist conversation: %s" , e )
489+
490+ # Create detached task with strong reference to prevent garbage collection
491+ create_background_task (persist_with_topic_summary ())
492+
402493 logger .info ("Query processing completed successfully!" )
403494 return response
404-
405- # connection to Llama Stack server
406495 except APIConnectionError as e :
407496 # Update metrics for the LLM call failure
408497 metrics .llm_calls_failures_total .inc ()
0 commit comments