
Commit c6e86f0

Delete thread when it expires from the cache
1 parent daac493 commit c6e86f0

File tree

2 files changed: +69 -26 lines changed

src/api/agents/agent_factory.py

Lines changed: 1 addition & 1 deletion
@@ -32,7 +32,7 @@ async def get_instance(cls, config):
             conn_str=config.azure_ai_project_conn_string
         )
 
-        agent_name = "agent"
+        agent_name = "ConversationKnowledgeAgent"
         agent_instructions = '''You are a helpful assistant.
         Always return the citations as is in final response.
         Always return citation markers in the answer as [doc1], [doc2], etc.

src/api/services/chat_service.py

Lines changed: 68 additions & 25 deletions
@@ -3,6 +3,9 @@
 import time
 import uuid
 from types import SimpleNamespace
+import asyncio
+import random
+import re
 
 import openai
 from fastapi import HTTPException, Request, status
@@ -16,8 +19,6 @@
 from helpers.utils import format_stream_response
 from cachetools import TTLCache
 
-thread_cache = TTLCache(maxsize=1000, ttl=3600)
-
 # Constants
 HOST_NAME = "CKM"
 HOST_INSTRUCTIONS = "Answer questions about call center operations"
@@ -26,8 +27,47 @@
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
+class ExpCache(TTLCache):
+    """
+    Extended TTLCache that holds an agent reference and deletes the corresponding Azure AI agent threads when items expire or are evicted (LRU).
+    """
+    def __init__(self, *args, agent=None, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.agent = agent
+
+    def expire(self, time=None):
+        items = super().expire(time)
+        for key, thread_id in items:
+            try:
+                if self.agent:
+                    thread = AzureAIAgentThread(client=self.agent.client, thread_id=thread_id)
+                    asyncio.create_task(thread.delete())
+                    logger.info("Thread deleted: %s", thread_id)
+            except Exception as e:
+                logger.error("Failed to delete thread for key %s: %s", key, e)
+        return items
+
+    def popitem(self):
+        key, thread_id = super().popitem()
+        try:
+            if self.agent:
+                thread = AzureAIAgentThread(client=self.agent.client, thread_id=thread_id)
+                asyncio.create_task(thread.delete())
+                logger.info("Thread deleted (LRU evict): %s", thread_id)
+        except Exception as e:
+            logger.error("Failed to delete thread for key %s (LRU evict): %s", key, e)
+        return key, thread_id
+
+
+# Global thread cache; the agent is attached lazily in ChatService.__init__.
+thread_cache = None
 
 
 class ChatService:
+    """
+    Service for handling chat interactions, including streaming responses,
+    processing RAG responses, and generating chart data for visualization.
+    """
     def __init__(self, request : Request):
         config = Config()
         self.azure_openai_endpoint = config.azure_openai_endpoint
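A note on the mechanics above: `Cache.popitem()` is what cachetools invokes internally to evict the least-recently-used entry once `maxsize` is exceeded, and `TTLCache.expire()` returns the expired `(key, value)` pairs only in newer cachetools releases (5.3+; earlier versions return None, which would break the loop). A minimal standalone sketch of the same hook pattern, with no Azure dependencies (`LoggingTTLCache` is an illustrative name, not part of this codebase):

import time

from cachetools import TTLCache


class LoggingTTLCache(TTLCache):
    """Illustrative only: report items as they expire or are LRU-evicted."""

    def expire(self, time=None):
        # cachetools >= 5.3: returns the expired (key, value) pairs.
        items = super().expire(time)
        for key, value in items:
            print(f"expired: {key} -> {value}")
        return items

    def popitem(self):
        # Called by __setitem__ when the cache is full (LRU eviction).
        key, value = super().popitem()
        print(f"evicted (LRU): {key} -> {value}")
        return key, value


cache = LoggingTTLCache(maxsize=2, ttl=0.1)
cache["a"] = "thread-1"
cache["b"] = "thread-2"
cache["c"] = "thread-3"  # over maxsize -> popitem() reports the LRU entry "a"
time.sleep(0.2)
cache.expire()           # past ttl -> expire() reports "b" and "c"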
@@ -37,6 +77,10 @@ def __init__(self, request : Request):
         self.azure_ai_project_conn_string = config.azure_ai_project_conn_string
         self.agent = request.app.state.agent
 
+        global thread_cache
+        if thread_cache is None:
+            thread_cache = ExpCache(maxsize=1000, ttl=3600.0, agent=self.agent)
+
     def process_rag_response(self, rag_response, query):
         """
         Parses the RAG response dynamically to extract chart data for Chart.js.
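One caveat with the fire-and-forget `asyncio.create_task(thread.delete())` calls in `ExpCache`: the event loop holds only weak references to tasks, so a task that nothing else references can be garbage-collected before it completes (and `create_task` requires a running loop in the first place). A hedged sketch of the usual mitigation; the `_background_tasks` set and helper name are assumptions, not part of this codebase:

import asyncio

# Keep strong references so pending deletions are not garbage-collected
# mid-flight; discard each task once it finishes.
_background_tasks: set = set()


def delete_in_background(thread) -> None:
    task = asyncio.create_task(thread.delete())
    _background_tasks.add(task)
    task.add_done_callback(_background_tasks.discard)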
@@ -64,7 +108,7 @@ def process_rag_response(self, rag_response, query):
             {query}
             {rag_response}
             """
-            logger.info(f">>> Processing chart data for response: {rag_response}")
+            logger.info(">>> Processing chart data for response: %s", rag_response)
 
             completion = client.chat.completions.create(
                 model=self.azure_openai_deployment_name,
@@ -76,12 +120,12 @@ def process_rag_response(self, rag_response, query):
             )
 
             chart_data = completion.choices[0].message.content.strip().replace("```json", "").replace("```", "")
-            logger.info(f">>> Generated chart data: {chart_data}")
+            logger.info(">>> Generated chart data: %s", chart_data)
 
             return json.loads(chart_data)
 
         except Exception as e:
-            logger.error(f"Error processing RAG response: {e}")
+            logger.error("Error processing RAG response: %s", e)
             return {"error": "Chart could not be generated from this data. Please ask a different question."}
 
     async def stream_openai_text(self, conversation_id: str, query: str) -> StreamingResponse:
@@ -94,44 +138,44 @@ async def stream_openai_text(self, conversation_id: str, query: str) -> StreamingResponse:
             if not query:
                 query = "Please provide a query."
 
-            # Create the AzureAI Agent
-            agent = self.agent
-
-            thread_id = thread_cache.get(conversation_id, None)
+            thread_id = None
+            if thread_cache is not None:
+                thread_id = thread_cache.get(conversation_id, None)
             if thread_id:
-                thread = AzureAIAgentThread(client=agent.client, thread_id=thread_id)
+                thread = AzureAIAgentThread(client=self.agent.client, thread_id=thread_id)
 
             truncation_strategy = TruncationObject(type="last_messages", last_messages=2)
 
-            async for response in agent.invoke_stream(messages=query, thread=thread, truncation_strategy=truncation_strategy):
-                thread_cache[conversation_id] = response.thread.id
+            async for response in self.agent.invoke_stream(messages=query, thread=thread, truncation_strategy=truncation_strategy):
+                if thread_cache is not None:
+                    thread_cache[conversation_id] = response.thread.id
                 complete_response += str(response.content)
                 yield response.content
 
         except RuntimeError as e:
             complete_response = str(e)
             if "Rate limit is exceeded" in str(e):
                 logger.error("Rate limit error: %s", e)
-                raise AgentException(f"Rate limit is exceeded. {str(e)}")
+                raise AgentException(f"Rate limit is exceeded. {str(e)}") from e
             else:
                 logger.error("RuntimeError: %s", e)
-                raise AgentException(f"An unexpected runtime error occurred: {str(e)}")
+                raise AgentException(f"An unexpected runtime error occurred: {str(e)}") from e
 
         except Exception as e:
             complete_response = str(e)
             logger.error("Error in stream_openai_text: %s", e)
-            raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail="Error streaming OpenAI text")
+            raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail="Error streaming OpenAI text") from e
 
         finally:
             # Provide a fallback response when no data is received from OpenAI.
             if complete_response == "":
                 logger.info("No response received from OpenAI.")
-                thread_cache.pop(conversation_id, None)
-                if thread:
-                    try:
-                        await thread.delete()
-                    except Exception as e:
-                        logger.warning("Failed to delete thread %s: %s", thread_id, e)
+                thread_id = None
+                if thread_cache is not None:
+                    thread_id = thread_cache.pop(conversation_id, None)
+                if thread_id is not None:
+                    corrupt_key = f"{conversation_id}_corrupt_{random.randint(1000, 9999)}"
+                    thread_cache[corrupt_key] = thread_id
                 yield "I cannot answer this question with the current data. Please rephrase or add more details."
 
     async def stream_chat_request(self, request_body, conversation_id, query):
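Worth noting on the `finally` block above: instead of awaiting `thread.delete()` inside the generator (the previous behavior), the orphaned thread id is parked under a fresh unique key, so the `ExpCache` expiry/eviction hooks perform the actual deletion later. A minimal sketch of that re-keying idea; the helper name is illustrative, while the key format mirrors the diff:

import random

from cachetools import TTLCache


def park_for_cleanup(cache: TTLCache, conversation_id: str, thread_id: str) -> str:
    # Re-key the orphaned thread so the cache's expire()/popitem() hooks
    # delete the remote thread later, instead of awaiting here.
    corrupt_key = f"{conversation_id}_corrupt_{random.randint(1000, 9999)}"
    cache[corrupt_key] = thread_id
    return corrupt_key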
@@ -186,18 +230,17 @@ async def generate():
                 error_message = str(e)
                 retry_after = "sometime"
                 if "Rate limit is exceeded" in error_message:
-                    import re
                     match = re.search(r"Try again in (\d+) seconds", error_message)
                     if match:
                         retry_after = f"{match.group(1)} seconds"
-                    logger.error(f"Rate limit error: {error_message}")
+                    logger.error("Rate limit error: %s", error_message)
                     yield json.dumps({"error": f"Rate limit is exceeded. Try again in {retry_after}."}) + "\n\n"
                 else:
-                    logger.error(f"AgentInvokeException: {error_message}")
+                    logger.error("AgentInvokeException: %s", error_message)
                     yield json.dumps({"error": "An error occurred. Please try again later."}) + "\n\n"
 
             except Exception as e:
-                logger.error(f"Error in stream_chat_request: {e}", exc_info=True)
+                logger.error("Error in stream_chat_request: %s", e, exc_info=True)
                 yield json.dumps({"error": "An error occurred while processing the request."}) + "\n\n"
 
         return generate()
