
Commit 2236e17

Removed duplicate code and a useless function, added stronger deletion of items on OOM, and updated the generation-size logic to respect what the user asks (#1073)
1 parent 99162f1 commit 2236e17

File tree: 1 file changed, +22 -16 lines


src/lighteval/models/transformers/transformers_model.py

Lines changed: 22 additions & 16 deletions
@@ -66,6 +66,9 @@
 
 STARTING_BATCH_SIZE = 512
 
+# Thread local param
+torch.set_grad_enabled(False)
+
 
 class TransformersModelConfig(ModelConfig):
     """Configuration class for HuggingFace Transformers models.
@@ -218,12 +221,6 @@ def __init__(
         if config.model_parallel is False and self.config.dtype not in ["4bit", "8bit"]:
             logger.info(f"Using Data Parallelism, putting model on device {self._device}")
             self.model = self.model.to(self._device)
-        if config.compile:
-            try:
-                logger.info("Compiling the model")
-                self.model.model.compile()
-            except AttributeError as e:
-                logger.warning("Could not compile the model because: ", e)
 
         self.model_name = _simplify_name(config.model_name)
 
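The deleted block guarded an in-place compilation with try/except. If compilation is still wanted, the same guarded pattern works outside the constructor; a sketch assuming PyTorch 2.x (the Linear module stands in for the wrapped transformer):

import logging

import torch

logger = logging.getLogger(__name__)

model = torch.nn.Linear(8, 2)  # stand-in for the real model
try:
    logger.info("Compiling the model")
    model.compile()  # in-place torch.compile(); compilation happens lazily on first call
except AttributeError as e:
    logger.warning("Could not compile the model: %s", e)

Note the removed line passed the exception as a stray format argument to logger.warning, which fails to format at emit time; %s formatting is the usual fix.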
@@ -410,7 +407,7 @@ def _create_auto_model(self) -> transformers.PreTrainedModel:
         )
         # model.to(self.device)
         model.eval()
-        torch.set_grad_enabled(False)
+
         if self.continuous_batching:
             generation_config = GenerationConfig(
                 **self.generation_config_dict,
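
In the continuous-batching branch, the stored generation parameters are splatted into a transformers.GenerationConfig. A minimal sketch of that construction (dict values are illustrative):

from transformers import GenerationConfig

generation_config_dict = {"max_new_tokens": 256, "do_sample": False}
generation_config = GenerationConfig(**generation_config_dict)
print(generation_config.max_new_tokens)  # 256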
@@ -497,9 +494,6 @@ def _check_continuations_start_space(self, continuation: str) -> str:
             continuation = continuation.lstrip()
         return continuation
 
-    def _model_call(self, inputs: torch.Tensor) -> torch.Tensor:
-        return self.model(inputs).logits
-
     def _get_batch_size(self, max_input_length: int, override_bs: int | None, starting_batch_size: int = 512) -> int:
         if override_bs is not None:
             return override_bs
@@ -509,10 +503,18 @@ def _get_batch_size(self, max_input_length: int, override_bs: int | None, starting_batch_size: int = 512) -> int:
             starting_batch_size=starting_batch_size
         )  # if OOM, then halves batch_size and tries again
         def forward_batch(batch_size):
-            test_batch = torch.ones(
-                (batch_size + int(0.1 * batch_size), max_input_length), device=self.device
-            ).long()  # We add 10% for marging :)
-            F.log_softmax(self._model_call(test_batch).float(), dim=-1).cpu()
+            fake_batch, fake_output = None, None
+            with torch.no_grad():
+                try:
+                    fake_batch = torch.ones((batch_size, max_input_length), device=self.device).int()
+                    fake_output = F.log_softmax(self.model(fake_batch).logits, dim=-1).cpu()
+                except Exception as e:
+                    for fake_item in [fake_batch, fake_output]:
+                        if fake_item is not None:
+                            fake_item.detach()
+                        del fake_item
+
+                    raise e
             return batch_size
 
         batch_size = forward_batch()
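
The decorator whose call site appears above halves the batch size and retries whenever the probe OOMs; this matches accelerate's find_executable_batch_size utility. A standalone sketch of the halving-retry pattern (plain Python, not accelerate's actual implementation; the 128-item threshold is made up):

def find_largest_batch_size(fn, starting_batch_size=512):
    batch_size = starting_batch_size
    while batch_size > 0:
        try:
            return fn(batch_size)
        except RuntimeError as e:
            if "out of memory" not in str(e):  # only swallow OOM-style failures
                raise
            batch_size //= 2  # halve and retry
    raise RuntimeError("no executable batch size found")

def probe(batch_size):
    if batch_size > 128:  # pretend anything larger OOMs
        raise RuntimeError("CUDA out of memory")
    return batch_size

print(find_largest_batch_size(probe))  # 128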
@@ -645,10 +647,14 @@ def _padded_greedy_until(
             position=0,
             disable=self.disable_tqdm,
         ):
-            if split[0].generation_size is None:
+            if self.generation_config_dict.get("max_new_tokens", None) is not None:
+                # The user forces a specific generation size
+                max_context_continuation_size_allowed = self.generation_config_dict["max_new_tokens"]
+            elif split[0].generation_size is None:
                 # No constraints on the generation size: max length allowed is the max model context
                 max_context_continuation_size_allowed = self.max_length
             else:
+                # The task forces a specific generation size
                 context = self.prompt_manager.prepare_prompt(split[0])
                 tokenized_context = self.tokenizer(context)
 
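The reordered branches establish a clear precedence: a user-supplied max_new_tokens wins over the task's generation_size, which in turn wins over falling back to the model's maximum context. A simplified sketch of just that precedence (the real else branch also measures the tokenized context before settling on a size):

def resolve_generation_size(generation_config_dict, task_generation_size, max_length):
    if generation_config_dict.get("max_new_tokens") is not None:
        return generation_config_dict["max_new_tokens"]  # the user forces a size
    if task_generation_size is None:
        return max_length  # no constraint: use the model context
    return task_generation_size  # the task forces a size

print(resolve_generation_size({"max_new_tokens": 100}, 256, 4096))  # 100
print(resolve_generation_size({}, 256, 4096))  # 256
print(resolve_generation_size({}, None, 4096))  # 4096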
@@ -953,7 +959,7 @@ def _loglikelihood_tokens(  # noqa: C901
                 max_context=None,  # computed as model max length in the function
             )
 
-            model_output = self._model_call(prepared_batch.input_ids)
+            model_output = self.model(prepared_batch.input_ids).logits
             logits = F.log_softmax(model_output, dim=-1)  # [batch, sequence_length, vocab]
 
             flat_index = 0
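
With _model_call gone, the logits are taken directly from the model output and log-softmaxed over the vocabulary. A toy sketch of the readout that follows (random logits stand in for a real forward pass):

import torch
import torch.nn.functional as F

batch, seq_len, vocab = 2, 5, 11
logits = torch.randn(batch, seq_len, vocab)  # plays the role of model(...).logits
logprobs = F.log_softmax(logits, dim=-1)     # [batch, sequence_length, vocab]

# per-token log-likelihood of some target ids: one gathered value per position
targets = torch.randint(vocab, (batch, seq_len))
token_ll = logprobs.gather(-1, targets.unsqueeze(-1)).squeeze(-1)
print(token_ll.shape)  # torch.Size([2, 5])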
