1 change: 1 addition & 0 deletions examples/model_configs/litellm_model.yaml
@@ -2,6 +2,7 @@ model_parameters:
model_name: "openai/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
provider: "openai"
base_url: "https://router.huggingface.co/hf-inference/v1"
concurrent_requests: 10 # Configure the number of concurrent API requests
generation_parameters:
temperature: 0.5
max_new_tokens: 256
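
For context, a cap like this is usually enforced client-side by wrapping each API call in a semaphore. The sketch below is illustrative only; the call_api coroutine and run_with_limit helper are hypothetical names, not lighteval's actual implementation:

import asyncio

async def call_api(prompt: str) -> str:
    # Hypothetical stand-in for one request to the provider's API.
    await asyncio.sleep(0.1)  # simulate network latency
    return f"response for: {prompt}"

async def run_with_limit(prompts: list[str], concurrent_requests: int = 10) -> list[str]:
    # The semaphore bounds how many call_api coroutines run at once,
    # mirroring the concurrent_requests setting added above.
    semaphore = asyncio.Semaphore(concurrent_requests)

    async def bounded_call(prompt: str) -> str:
        async with semaphore:
            return await call_api(prompt)

    return await asyncio.gather(*(bounded_call(p) for p in prompts))

# asyncio.run(run_with_limit([f"prompt {i}" for i in range(100)]))
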
6 changes: 6 additions & 0 deletions src/lighteval/models/abstract_model.py
@@ -54,6 +54,10 @@ class ModelConfig(BaseModel, extra="forbid"):
behavior and context for the model during evaluation.
cache_dir (str):
Directory to cache the model. Defaults to "~/.cache/huggingface/lighteval".
concurrent_requests (int):
Maximum number of concurrent API requests to execute in parallel.
Higher values can improve throughput for batch processing but may hit rate limits
or exhaust API quotas faster. Default is 10.

Methods:
from_path(path: str):
@@ -83,6 +87,8 @@ class ModelConfig(BaseModel, extra="forbid"):
generation_parameters: GenerationParameters = GenerationParameters()
system_prompt: str | None = None
cache_dir: str = "~/.cache/huggingface/lighteval"
concurrent_requests: int = 10
Review comment (Member):
Can you put it in the litellm model config instead of the abstract model? This is really specific to litellm.
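
If that suggestion is taken, the field would live on the litellm-specific config instead of the shared base class. A minimal sketch of the change, assuming lighteval's LiteLLMModelConfig subclasses ModelConfig (the exact class layout here is an assumption):

class LiteLLMModelConfig(ModelConfig):
    # Moved here from ModelConfig: only the litellm backend fans out
    # parallel HTTP requests, so the knob is scoped to it (assumption).
    concurrent_requests: int = 10
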
@classmethod
def from_path(cls, path: str):
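
For reference, a config like the YAML above would then be loaded through this classmethod. A hedged usage sketch, assuming from_path parses the model_parameters block of the file (the import path below is an assumption):

from lighteval.models.litellm_model import LiteLLMModelConfig

config = LiteLLMModelConfig.from_path("examples/model_configs/litellm_model.yaml")
print(config.concurrent_requests)  # expected: 10, as set in the YAML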