Skip to content

Commit b0edbc3

Browse files
dameikle and NathanHB
authored
Added ability to configure concurrent_requests in litellm_model.py (#911)
* Added ability to configure concurrent_requests in litellm_model.py
* Update src/lighteval/models/endpoints/litellm_model.py (Co-authored-by: Nathan Habib <[email protected]>)
* Updated docstring for new concurrent_requests parameter in LiteLLMModelConfig

Co-authored-by: Nathan Habib <[email protected]>
1 parent 76e5aff commit b0edbc3

File tree

1 file changed

+8
-2
lines changed

1 file changed

+8
-2
lines changed

src/lighteval/models/endpoints/litellm_model.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,13 +78,18 @@ class LiteLLMModelConfig(ModelConfig):
7878
api_key (str | None):
7979
API key for authentication. If None, reads from environment variables.
8080
Environment variable names are provider-specific (e.g., OPENAI_API_KEY).
81+
concurrent_requests (int):
82+
Maximum number of concurrent API requests to execute in parallel.
83+
Higher values can improve throughput for batch processing but may hit rate limits
84+
or exhaust API quotas faster. Default is 10.
8185
8286
Example:
8387
```python
8488
config = LiteLLMModelConfig(
8589
model_name="gpt-4",
8690
provider="openai",
8791
base_url="https://api.openai.com/v1",
92+
concurrent_requests=5,
8893
generation_parameters=GenerationParameters(
8994
temperature=0.7,
9095
max_new_tokens=100
@@ -97,6 +102,7 @@ class LiteLLMModelConfig(ModelConfig):
97102
provider: str | None = None
98103
base_url: str | None = None
99104
api_key: str | None = None
105+
concurrent_requests: int = 10
100106

101107

102108
class LiteLLMClient(LightevalModel):
@@ -113,11 +119,11 @@ def __init__(self, config: LiteLLMModelConfig) -> None:
113119
self.base_url = config.base_url
114120
self.api_key = config.api_key
115121
self.generation_parameters = config.generation_parameters
122+
self.concurrent_requests = config.concurrent_requests
116123

117124
self.API_MAX_RETRY = 5
118125
self.API_RETRY_SLEEP = 3
119126
self.API_RETRY_MULTIPLIER = 2
120-
self.CONCURRENT_CALLS = 10 # 100 leads to hitting Anthropic rate limits
121127

122128
self._tokenizer = encode
123129
self.pairwise_tokenization = False
@@ -229,7 +235,7 @@ def __call_api_parallel(
229235
f"Length of prompts, return_logitss, max_new_tokenss, num_sampless, stop_sequences, system_prompts should be the same but are {len(prompts)}, {len(return_logitss)}, {len(max_new_tokenss)}, {len(num_sampless)}, {len(stop_sequencess)}"
230236
)
231237

232-
with ThreadPoolExecutor(self.CONCURRENT_CALLS) as executor:
238+
with ThreadPoolExecutor(self.concurrent_requests) as executor:
233239
for entry in tqdm(
234240
executor.map(
235241
self.__call_api,

0 commit comments

Comments (0)