diff --git a/src/lighteval/models/endpoints/litellm_model.py b/src/lighteval/models/endpoints/litellm_model.py index 0e6d216d3..b69764ebc 100644 --- a/src/lighteval/models/endpoints/litellm_model.py +++ b/src/lighteval/models/endpoints/litellm_model.py @@ -78,6 +78,10 @@ class LiteLLMModelConfig(ModelConfig): api_key (str | None): API key for authentication. If None, reads from environment variables. Environment variable names are provider-specific (e.g., OPENAI_API_KEY). + concurrent_requests (int): + Maximum number of concurrent API requests to execute in parallel. + Higher values can improve throughput for batch processing but may hit rate limits + or exhaust API quotas faster. Default is 10. Example: ```python @@ -85,6 +89,7 @@ class LiteLLMModelConfig(ModelConfig): model_name="gpt-4", provider="openai", base_url="https://api.openai.com/v1", + concurrent_requests=5, generation_parameters=GenerationParameters( temperature=0.7, max_new_tokens=100 @@ -97,6 +102,7 @@ class LiteLLMModelConfig(ModelConfig): provider: str | None = None base_url: str | None = None api_key: str | None = None + concurrent_requests: int = 10 class LiteLLMClient(LightevalModel): @@ -113,11 +119,11 @@ def __init__(self, config: LiteLLMModelConfig) -> None: self.base_url = config.base_url self.api_key = config.api_key self.generation_parameters = config.generation_parameters + self.concurrent_requests = config.concurrent_requests self.API_MAX_RETRY = 5 self.API_RETRY_SLEEP = 3 self.API_RETRY_MULTIPLIER = 2 - self.CONCURRENT_CALLS = 10 # 100 leads to hitting Anthropic rate limits self._tokenizer = encode self.pairwise_tokenization = False @@ -229,7 +235,7 @@ def __call_api_parallel( f"Length of prompts, return_logitss, max_new_tokenss, num_sampless, stop_sequences, system_prompts should be the same but are {len(prompts)}, {len(return_logitss)}, {len(max_new_tokenss)}, {len(num_sampless)}, {len(stop_sequencess)}" ) - with ThreadPoolExecutor(self.CONCURRENT_CALLS) as executor: + with ThreadPoolExecutor(self.concurrent_requests) as executor: for entry in tqdm( executor.map( self.__call_api,