Skip to content

Commit 8c31a16

Browse files
committed
Configurable number of requests and connections
Signed-off-by: Jiri Podivin <jpodivin@redhat.com>
1 parent bb8c072 commit 8c31a16

File tree

4 files changed

+17
-2
lines changed

4 files changed

+17
-2
lines changed

logdetective/constants.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,9 @@
7575

7676
# Tuning for LLM-as-a-Service
7777
LLM_DEFAULT_MAX_QUEUE_SIZE = 50
78-
LLM_DEFAULT_REQUESTS_PER_MINUTE = 60
78+
LLM_DEFAULT_REQUESTS_PER_MINUTE = 600
79+
LLM_MAX_CONCURRENT_REQUESTS = 100
80+
LLM_MAX_KEEP_ALIVE_CONNECTIONS = 20
7981

8082
# Roles for chat API
8183
SYSTEM_ROLE_DEFAULT = "developer"

logdetective/server/config.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import os
22
import logging
33
import yaml
4+
import httpx
45
from openai import AsyncOpenAI
56

67
from logdetective.utils import load_prompts, load_skip_snippet_patterns
@@ -54,9 +55,14 @@ def get_log(config: Config):
5455

5556
def get_openai_api_client(inference_config: InferenceConfig):
5657
"""Set up AsyncOpenAI client with default configuration."""
58+
limits = httpx.Limits(
59+
max_connections=inference_config.max_concurrent_requests,
60+
max_keepalive_connections=inference_config.max_keep_alive_connections,
61+
)
5762
return AsyncOpenAI(
5863
api_key=inference_config.api_token, base_url=inference_config.url,
59-
timeout=inference_config.llm_api_timeout
64+
timeout=inference_config.llm_api_timeout,
65+
http_client=httpx.AsyncClient(limits=limits) # Defaults are too restrictive
6066
)
6167

6268

logdetective/server/models.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
LLM_DEFAULT_REQUESTS_PER_MINUTE,
1717
SYSTEM_ROLE_DEFAULT,
1818
USER_ROLE_DEFAULT,
19+
LLM_MAX_CONCURRENT_REQUESTS,
20+
LLM_MAX_KEEP_ALIVE_CONNECTIONS,
1921
)
2022
from logdetective.utils import check_csgrep
2123

@@ -174,6 +176,8 @@ class InferenceConfig(BaseModel): # pylint: disable=too-many-instance-attribute
174176
system_role: str = SYSTEM_ROLE_DEFAULT
175177
llm_api_timeout: float = 15.0
176178
requests_per_minute: int = LLM_DEFAULT_REQUESTS_PER_MINUTE
179+
max_concurrent_requests: int = LLM_MAX_CONCURRENT_REQUESTS
180+
max_keep_alive_connections: int = LLM_MAX_KEEP_ALIVE_CONNECTIONS
177181

178182

179183
class ExtractorConfig(BaseModel):

server/config.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,9 @@ inference:
1616
# If the roles are same, the system prompt and user message are concatenated
1717
user_role: user
1818
system_role: system
19+
# Limits on connections for AsyncOpenAI client
20+
max_concurrent_requests: 100
21+
max_keep_alive_connections: 20
1922
# Separate LLM endpoint for snippet analysis, optional
2023
# snippet_inference:
2124
# max_tokens: -1

0 commit comments

Comments
 (0)