lm-eval polishing and speed-up (#2361)

12010486 · web-flow · commit 472e93d2d73c · 2026-01-07T18:33:01.000+01:00
diff --git a/examples/text-generation/model_adapter.py b/examples/text-generation/model_adapter.py
@@ -25,14 +25,14 @@
 import torch.nn.functional as F
 from lm_eval.api.instance import Instance
 from lm_eval.models.huggingface import HFLM, TemplateLM
-from lm_eval.models.utils import get_dtype, stop_sequences_criteria
+from lm_eval.models.utils import get_dtype
 
 # Local imports
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from transformers.generation import GenerationConfig
 
 
-logger = logging.getLogger(__name__)
+eval_logger = logging.getLogger(__name__)
 
 
 class HabanaModelAdapter(HFLM):
@@ -100,7 +100,7 @@ def __init__(
         )
         if "gemma" in getattr(self._config, "model_type", ""):
             self.add_bos_token = True
-            logger.info(
+            eval_logger.info(
                 f"Model type is '{self._config.model_type}', part of the Gemma family--a BOS token will be used as Gemma underperforms without it."
             )
         self.batch_size_per_gpu = int(args.batch_size)
@@ -170,21 +170,23 @@ def max_length(self) -> int:
 
     @property
     def device(self):
-        # We need to do padding ourselves, otherwise we'll end up with recompilations
-        # Returning 'cpu' to keep tensors on CPU in lm_eval code
-        return "cpu"
+        return torch.device("hpu")
 
     @max_length.setter
     def max_length(self, value: int) -> None:
         self._max_length = value
 
     def find_bucket(self, length: int, key=lambda b, length: b >= length) -> int:
+        """
+        Find the smallest bucket >= length, or add a new one.
+        """
         for b in self.buckets:
             if key(b, length):
                 return b
         new_bucket = length
         self.buckets.append(new_bucket)
         self.buckets.sort()
+        eval_logger.info(f"Added new bucket: {new_bucket}. Buckets are now: {self.buckets}")
         return new_bucket
 
     def _model_call(self, inps: torch.Tensor) -> torch.Tensor:
@@ -195,13 +197,13 @@ def _model_call(self, inps: torch.Tensor) -> torch.Tensor:
             if self.options.use_cache and self.options.reuse_cache:
                 self._model.allocate_kv_cache(bs, bucket_length + 1, bucket_length)
             padding_length = bucket_length - seq_length
-            inps = F.pad(inps, (0, padding_length), value=self._model.config.pad_token_id)
-        logits = self._model(inps.to(self.device_), **self.model_inputs)["logits"].cpu()
+            pad_token_id = getattr(self._model.config, "pad_token_id", 0)
+            inps = F.pad(inps, (0, padding_length), value=pad_token_id)
+            eval_logger.debug(f"Padded input from {seq_length} to {bucket_length} (pad={padding_length})")
+        logits = self._model(inps.to(self.device), **self.model_inputs)["logits"]
 
         if self.options.static_shapes and padding_length > 0:
             logits = logits[:, :-padding_length, :]
-        logits = logits.to(torch.float32)
-
         return logits
 
     def generate_until(self, requests: list[Instance], disable_tqdm: bool = False) -> list[str]:
@@ -217,7 +219,7 @@ def generate_until(self, requests: list[Instance], disable_tqdm: bool = False) -
 
     def _model_generate(
         self,
-        context,
+        context: torch.Tensor,
         max_length: int,
         stop: list[str],
         **generation_kwargs: dict[str, Any],
@@ -226,21 +228,12 @@ def _model_generate(
         Patched method
         source: https://github.com/EleutherAI/lm-evaluation-harness/blob/v0.4.9.1/lm_eval/models/huggingface.py#L951
         """
-        # temperature = 0.0 if not set
-        # if do_sample is false and temp==0.0:
-        # remove temperature, as do_sample=False takes care of this
-        # and we don't want a warning from HF
         generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0)
         do_sample = generation_kwargs.get("do_sample")
-        # The temperature has to be a strictly positive float -- if it is 0.0, use greedy decoding strategies
         if generation_kwargs.get("temperature") == 0.0 and do_sample is None:
             generation_kwargs["do_sample"] = do_sample = False
-
         if do_sample is False and generation_kwargs.get("temperature") == 0.0:
             generation_kwargs.pop("temperature")
-        # build stopping criteria
-        stopping_criteria = stop_sequences_criteria(self.tokenizer, stop, context.shape[1], context.shape[0])
-        # to avoid graph recompilation
         if self.options.static_shapes:
             self.options.bucket_internal = True
             bucket_length = self.find_bucket(context.shape[1])
@@ -254,17 +247,16 @@ def _model_generate(
                     generation_kwargs["attention_mask"], (0, padding_length), value=0
                 )
         # move context & attention_mask to hpu
-        context = context.to("hpu")
-        generation_kwargs["attention_mask"] = generation_kwargs["attention_mask"].to("hpu")
+        context = context.to(self.device)
+        generation_kwargs["attention_mask"] = generation_kwargs["attention_mask"].to(self.device)
         with torch.autocast(
-            device_type="hpu",
+            device_type=self.device,
             dtype=self.mixed_precision_dtype,
             enabled=self.mixed_precision_dtype is not None,
         ):
             return self.model.generate(
                 input_ids=context,
                 max_new_tokens=max_gen_toks,
-                stopping_criteria=stopping_criteria,
                 pad_token_id=self.tokenizer.pad_token_id,
                 use_cache=True,
                 hpu_graphs=self.hpu_graphs,
diff --git a/examples/text-generation/run_lm_eval.py b/examples/text-generation/run_lm_eval.py
@@ -145,7 +145,7 @@ def setup_lm_eval_parser():
         "--metadata",
         type=json.loads,
         default=None,
-        help="""JSON string metadata to pass to task configs, for example '{"max_seq_lengths":[4096,8192]}'. Will be merged with model_args. Can also be set in task config.""",
+        help="""JSON string metadata to pass to task configs, for example '{"max_length":1024}'. Will be merged with model_args. Can also be set in task config.""",
     )
     parser.add_argument(
         "--apply_chat_template",

Original file line number	Diff line number	Diff line change
`@@ -145,7 +145,7 @@ def setup_lm_eval_parser():`
`145`	`145`	`"--metadata",`
`146`	`146`	`type=json.loads,`
`147`	`147`	`default=None,`
`148`		`- help="""JSON string metadata to pass to task configs, for example '{"max_seq_lengths":[4096,8192]}'. Will be merged with model_args. Can also be set in task config.""",`
	`148`	`+ help="""JSON string metadata to pass to task configs, for example '{"max_length":1024}'. Will be merged with model_args. Can also be set in task config.""",`
`149`	`149`	`)`
`150`	`150`	`parser.add_argument(`
`151`	`151`	`"--apply_chat_template",`