
Commit a297572

handle rate limit from backend
1 parent 56147fb commit a297572

5 files changed: +70 −49 lines

.github/workflows/deploy-KMGeneric.yml

Lines changed: 1 addition & 1 deletion
```diff
@@ -103,7 +103,7 @@ jobs:
 
       - name: Determine Tag Name Based on Branch
         id: determine_tag
-        run: echo "tagname=${{ github.ref_name == 'main' && 'latest_migra' || github.ref_name == 'dev' && 'dev' || github.ref_name == 'demo' && 'demo' || github.ref_name == 'dependabotchanges' && 'dependabotchanges' || github.head_ref || 'default' }}" >> $GITHUB_OUTPUT
+        run: echo "tagname=${{ github.ref_name == 'main' && 'latest_migrated' || github.ref_name == 'dev' && 'dev' || github.ref_name == 'demo' && 'demo' || github.ref_name == 'dependabotchanges' && 'dependabotchanges' || github.head_ref || 'default' }}" >> $GITHUB_OUTPUT
 
       - name: Deploy Bicep Template
         id: deploy
```

.github/workflows/docker-build.yml

Lines changed: 1 addition & 1 deletion
```diff
@@ -49,7 +49,7 @@ jobs:
         id: determine_tag
         run: |
           if [[ "${{ github.ref_name }}" == "main" ]]; then
-            echo "tagname=latest_migra" >> $GITHUB_OUTPUT
+            echo "tagname=latest_migrated" >> $GITHUB_OUTPUT
           elif [[ "${{ github.ref_name }}" == "dev" ]]; then
             echo "tagname=dev" >> $GITHUB_OUTPUT
           elif [[ "${{ github.ref_name }}" == "demo" ]]; then
```

infra/main.bicep

Lines changed: 1 addition & 1 deletion
```diff
@@ -63,7 +63,7 @@ param embeddingModel string = 'text-embedding-ada-002'
 @description('Capacity of the Embedding Model deployment')
 param embeddingDeploymentCapacity int = 80
 
-param imageTag string = 'latest_migra'
+param imageTag string = 'latest_migrated'
 
 var uniqueId = toLower(uniqueString(subscription().id, environmentName, resourceGroup().location))
 var solutionPrefix = 'km${padLeft(take(uniqueId, 12), 12, '0')}'
```

infra/main.json

Lines changed: 2 additions & 2 deletions
main.json is the ARM template that Bicep compiles from main.bicep, so its `imageTag` default and `templateHash` change in lockstep with the edit above.

```diff
@@ -5,7 +5,7 @@
     "_generator": {
       "name": "bicep",
       "version": "0.34.44.8038",
-      "templateHash": "13730134018880843517"
+      "templateHash": "2995962395312305521"
     }
   },
   "parameters": {
@@ -92,7 +92,7 @@
     },
     "imageTag": {
       "type": "string",
-      "defaultValue": "latest_migra"
+      "defaultValue": "latest_migrated"
     }
   },
   "variables": {
```

src/api/services/chat_service.py

Lines changed: 65 additions & 44 deletions
```diff
@@ -11,6 +11,7 @@
 from semantic_kernel.agents.open_ai import AzureAssistantAgent
 from semantic_kernel.contents.chat_message_content import ChatMessageContent
 from semantic_kernel.contents.utils.author_role import AuthorRole
+from semantic_kernel.exceptions.agent_exceptions import AgentInvokeException  # Import the exception
 
 from common.config.config import Config
 from helpers.utils import format_stream_response
@@ -145,50 +146,70 @@ async def stream_chat_request(self, request_body, conversation_id, query):
         history_metadata = request_body.get("history_metadata", {})
 
         async def generate():
-            assistant_content = ""
-            # Call the OpenAI streaming method
-            response = await self.stream_openai_text(conversation_id, query)
-            # Stream chunks of data
-            async for chunk in response.body_iterator:
-                if isinstance(chunk, dict):
-                    chunk = json.dumps(chunk)  # Convert dict to JSON string
-                assistant_content += chunk
-                chat_completion_chunk = {
-                    "id": "",
-                    "model": "",
-                    "created": 0,
-                    "object": "",
-                    "choices": [
-                        {
-                            "messages": [],
-                            "delta": {},
-                        }
-                    ],
-                    "history_metadata": history_metadata,
-                    "apim-request-id": "",
-                }
-
-                chat_completion_chunk["id"] = str(uuid.uuid4())
-                chat_completion_chunk["model"] = "rag-model"
-                chat_completion_chunk["created"] = int(time.time())
-                # chat_completion_chunk["object"] = assistant_content
-                chat_completion_chunk["object"] = "extensions.chat.completion.chunk"
-                chat_completion_chunk["apim-request-id"] = response.headers.get(
-                    "apim-request-id", ""
-                )
-                chat_completion_chunk["choices"][0]["messages"].append(
-                    {"role": "assistant", "content": assistant_content}
-                )
-                chat_completion_chunk["choices"][0]["delta"] = {
-                    "role": "assistant",
-                    "content": assistant_content,
-                }
-
-                completion_chunk_obj = json.loads(
-                    json.dumps(chat_completion_chunk),
-                    object_hook=lambda d: SimpleNamespace(**d),
-                )
-                yield json.dumps(format_stream_response(completion_chunk_obj, history_metadata, response.headers.get("apim-request-id", ""))) + "\n\n"
+            try:
+                assistant_content = ""
+                # Call the OpenAI streaming method
+                response = await self.stream_openai_text(conversation_id, query)
+                # Stream chunks of data
+                async for chunk in response.body_iterator:
+                    if isinstance(chunk, dict):
+                        chunk = json.dumps(chunk)  # Convert dict to JSON string
+                    assistant_content += chunk
+                    chat_completion_chunk = {
+                        "id": "",
+                        "model": "",
+                        "created": 0,
+                        "object": "",
+                        "choices": [
+                            {
+                                "messages": [],
+                                "delta": {},
+                            }
+                        ],
+                        "history_metadata": history_metadata,
+                        "apim-request-id": "",
+                    }
+
+                    chat_completion_chunk["id"] = str(uuid.uuid4())
+                    chat_completion_chunk["model"] = "rag-model"
+                    chat_completion_chunk["created"] = int(time.time())
+                    # chat_completion_chunk["object"] = assistant_content
+                    chat_completion_chunk["object"] = "extensions.chat.completion.chunk"
+                    chat_completion_chunk["apim-request-id"] = response.headers.get(
+                        "apim-request-id", ""
+                    )
+                    chat_completion_chunk["choices"][0]["messages"].append(
+                        {"role": "assistant", "content": assistant_content}
+                    )
+                    chat_completion_chunk["choices"][0]["delta"] = {
+                        "role": "assistant",
+                        "content": assistant_content,
+                    }
+
+                    completion_chunk_obj = json.loads(
+                        json.dumps(chat_completion_chunk),
+                        object_hook=lambda d: SimpleNamespace(**d),
+                    )
+                    yield json.dumps(format_stream_response(completion_chunk_obj, history_metadata, response.headers.get("apim-request-id", ""))) + "\n\n"
+
+            except AgentInvokeException as e:
+                error_message = str(e)
+                retry_after = "sometime"
+                if "Rate limit is exceeded" in error_message:
+                    import re
+                    match = re.search(r"Try again in (\d+) seconds", error_message)
+                    if match:
+                        retry_after = f"{match.group(1)} seconds"
+                    logger.error(f"Rate limit error: {error_message}")
+                    yield json.dumps({"error": f"Rate limit is exceeded. Try again in {retry_after}."}) + "\n\n"
+                else:
+                    logger.error(f"AgentInvokeException: {error_message}")
+                    yield json.dumps({"error": "An error occurred. Please try again later."}) + "\n\n"
+
+            except Exception as e:
+                logger.error(f"Error in stream_chat_request: {e}", exc_info=True)
+                yield json.dumps({"error": "An error occurred while processing the request."}) + "\n\n"
+
         return generate()
 
     async def complete_chat_request(self, query, last_rag_response=None):
```
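The new `except AgentInvokeException` branch surfaces backend rate limits to the client: it scans the exception message for a retry window and streams a structured error instead of letting the generator crash. A standalone sketch of just that parsing step, assuming the backend phrases its errors as "Rate limit is exceeded. Try again in N seconds." (the `parse_retry_after` helper name is hypothetical, not part of the commit):

```python
import re


def parse_retry_after(error_message: str) -> str:
    """Extract the retry window, mirroring the regex in the except-branch above."""
    retry_after = "sometime"  # fallback wording used by the handler
    if "Rate limit is exceeded" in error_message:
        match = re.search(r"Try again in (\d+) seconds", error_message)
        if match:
            retry_after = f"{match.group(1)} seconds"
    return retry_after


# Messages in the expected format yield a concrete window; anything else
# falls back to the generic wording.
assert parse_retry_after("Rate limit is exceeded. Try again in 26 seconds.") == "26 seconds"
assert parse_retry_after("unexpected failure") == "sometime"
```

One small design note: hoisting `import re` to module scope, rather than importing inside the handler as the diff does, would be the more idiomatic choice.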

0 commit comments