@@ -24,7 +24,6 @@
 import logging
 from typing import Any, List, Optional
 
-import yaml
 from huggingface_hub import AsyncInferenceClient, ChatCompletionOutput
 from huggingface_hub.errors import HfHubHTTPError
 from pydantic import NonNegativeInt
@@ -35,7 +34,6 @@
 from lighteval.data import GenerativeTaskDataset
 from lighteval.models.abstract_model import LightevalModel
 from lighteval.models.endpoints.endpoint_model import ModelInfo
-from lighteval.models.model_input import GenerationParameters
 from lighteval.models.model_output import (
     GenerativeResponse,
     LoglikelihoodResponse,
@@ -72,26 +70,6 @@ class InferenceProvidersModelConfig(ModelConfig):
     org_to_bill: str | None = None
     parallel_calls_count: NonNegativeInt = 10
 
-    @classmethod
-    def from_path(cls, path):
-        with open(path, "r") as f:
-            config = yaml.safe_load(f)["model"]
-
-        model_name = config["model_name"]
-        provider = config.get("provider", None)
-        timeout = config.get("timeout", None)
-        proxies = config.get("proxies", None)
-        org_to_bill = config.get("org_to_bill", None)
-        generation_parameters = GenerationParameters.from_dict(config)
-        return cls(
-            model=model_name,
-            provider=provider,
-            timeout=timeout,
-            proxies=proxies,
-            org_to_bill=org_to_bill,
-            generation_parameters=generation_parameters,
-        )
-
 
 class InferenceProvidersClient(LightevalModel):
     """Client for making inference requests to various providers using the HuggingFace Inference API.
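Note on the removed `from_path`: a config that used to be loaded from a YAML file can still be built by parsing the file and constructing `InferenceProvidersModelConfig` directly. The following is a minimal sketch, not part of this commit; it assumes the constructor keeps the keyword names used by the deleted classmethod, that `GenerationParameters.from_dict` is still importable from `lighteval.models.model_input`, and the import path of the config class is a guess.

```python
# Sketch only: replicate the deleted from_path() in caller code.
import yaml

from lighteval.models.model_input import GenerationParameters
from lighteval.models.endpoints.inference_providers_model import (  # import path assumed
    InferenceProvidersModelConfig,
)


def config_from_yaml(path: str) -> InferenceProvidersModelConfig:
    """Parse the `model` section of a YAML file and build the config directly."""
    with open(path, "r") as f:
        config = yaml.safe_load(f)["model"]
    return InferenceProvidersModelConfig(
        model=config["model_name"],  # keyword names mirror the removed classmethod
        provider=config.get("provider"),
        timeout=config.get("timeout"),
        proxies=config.get("proxies"),
        org_to_bill=config.get("org_to_bill"),
        generation_parameters=GenerationParameters.from_dict(config),
    )
```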
@@ -198,7 +176,6 @@ async def bounded_api_call(prompt, num_samples):
     def greedy_until(
         self,
         requests: list[GreedyUntilRequest],
-        override_bs: Optional[int] = None,
     ) -> list[GenerativeResponse]:
         """
         Generates responses using a greedy decoding strategy until certain ending conditions are met.
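Call sites change accordingly: the per-call `override_bs` hint is gone, and concurrency is driven by `parallel_calls_count` on the config (kept in the hunk above) together with the `bounded_api_call` helper named in the hunk header. A hedged sketch of the new call shape, assuming `model` is an initialized `InferenceProvidersClient` and `requests` is an already-prepared list of `GreedyUntilRequest` objects:

```python
# Sketch only: the batch-size argument no longer exists.
# Before: responses = model.greedy_until(requests, override_bs=1)
responses = model.greedy_until(requests)
for response in responses:
    # `result` is assumed here to carry the generated text of a GenerativeResponse.
    print(response.result)
```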
@@ -256,22 +233,18 @@ def max_length(self) -> int:
         logger.warning("Tokenizer was not correctly loaded. Max model context length is assumed to be 30K tokens")
         return 30000
 
-    def loglikelihood(
-        self, requests: list[LoglikelihoodRequest], override_bs: Optional[int] = None
-    ) -> list[LoglikelihoodResponse]:
+    def loglikelihood(self, requests: list[LoglikelihoodRequest]) -> list[LoglikelihoodResponse]:
         """Tokenize the context and continuation and compute the log likelihood of those
         tokenized sequences.
         """
         raise NotImplementedError
 
-    def loglikelihood_rolling(
-        self, requests: list[LoglikelihoodRollingRequest], override_bs: Optional[int] = None
-    ) -> list[LoglikelihoodResponse]:
+    def loglikelihood_rolling(self, requests: list[LoglikelihoodRollingRequest]) -> list[LoglikelihoodResponse]:
         """This function is used to compute the log likelihood of the context for perplexity metrics."""
         raise NotImplementedError
 
     def loglikelihood_single_token(
-        self, requests: list[LoglikelihoodSingleTokenRequest], override_bs: Optional[int] = None
+        self, requests: list[LoglikelihoodSingleTokenRequest]
     ) -> list[LoglikelihoodSingleTokenResponse]:
         """Tokenize the context and continuation and compute the log likelihood of those
         tokenized sequences.
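The three log-likelihood methods keep raising `NotImplementedError`, so this backend remains generation-only. A caller-side guard like the following (an illustrative sketch, not part of lighteval) makes that limitation explicit instead of letting the bare exception propagate:

```python
# Illustrative sketch: fail fast with a readable message for loglikelihood-based tasks.
def loglikelihood_or_explain(model, requests):
    try:
        return model.loglikelihood(requests)
    except NotImplementedError as err:
        raise RuntimeError(
            "InferenceProvidersClient does not implement loglikelihood, "
            "loglikelihood_rolling, or loglikelihood_single_token; "
            "use a generative metric or a different model backend."
        ) from err
```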