Commit 82a9936

Enable text-only evals for VLM models (#2999)
1 parent 9d29ef0 commit 82a9936

File tree

4 files changed: +33 −11 lines


lm_eval/evaluator.py

Lines changed: 0 additions & 4 deletions
@@ -494,10 +494,6 @@ def evaluate(
             raise ValueError(
                 f"Attempted to run tasks: {incompatible_tasks} which require multimodal input, but the selected model type does not currently implement this. Multimodal support is currently restricted to the ['hf-multimodal', 'vllm-vlm'] model type."
             )
-        else:
-            raise ValueError(
-                f"Attempted to run tasks: {incompatible_tasks} which are text-only, but used a model type which only currently supports multimodal tasks."
-            )
     # end validation check

     # Cache the limit arg.
lm_eval/models/hf_vlms.py

Lines changed: 14 additions & 3 deletions
@@ -399,6 +399,9 @@ def _batch_images(self, image_encs):
         return batched_imgs

     def loglikelihood_rolling(self, requests: List[Instance]) -> List[float]:
+        if requests and len(requests[0].args) < 3:
+            # Fall back to non-multimodal generation.
+            return super().loglikelihood_rolling(requests=requests)
         raise NotImplementedError(
             "model type `hf-multimodal` does not support loglikelihood_rolling. Use 'hf' model type for text-only loglikelihood_rolling tasks ",
             "this is because we do not support measuring the loglikelihood a model assigns to an image.",
@@ -407,6 +410,9 @@ def loglikelihood_rolling(self, requests: List[Instance]) -> List[float]:
     def loglikelihood(
         self, requests: List[Instance], disable_tqdm: bool = False
     ) -> List[Tuple[float, bool]]:
+        if requests and len(requests[0].args) < 3:
+            # Fall back to non-multimodal generation.
+            return super().loglikelihood(requests=requests, disable_tqdm=disable_tqdm)
         raise NotImplementedError(
             "'loglikelihood' requests for model type `hf-multimodal` are not yet tested. This feature will be enabled when a loglikelihood-based multiple-choice VQA dataset is added!"
         )
@@ -433,9 +439,11 @@ def loglikelihood(
                 )
             )

-        return self._loglikelihood_tokens(new_reqs, disable_tqdm=disable_tqdm)
+        return self._multimodal_loglikelihood_tokens(
+            new_reqs, disable_tqdm=disable_tqdm
+        )

-    def _loglikelihood_tokens(
+    def _multimodal_loglikelihood_tokens(
         self,
         requests: List[
             Tuple[Tuple[None, str, str], List[int], List[int], List[int]]
@@ -624,7 +632,10 @@ def _lookup_one_token_cont(req: Tuple[Tuple[str, str], List[int], List[int]]):
     def generate_until(
         self, requests: List[Instance], disable_tqdm: bool = False
     ) -> List[str]:
-        # TODO: back out to HFLM.generate_until() for all requests without aux_arguments (text-only reqs)
+        if requests and len(requests[0].args) < 3:
+            # Fall back to non-multimodal generation.
+            return super().generate_until(requests=requests, disable_tqdm=disable_tqdm)
+
         res = []

         def _collate(x):
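
Note: each overridden method in this file now opens with the same guard: if the first request in the batch carries fewer than three positional args, the batch is treated as text-only and handed back to the parent HFLM implementation. A small sketch of that rule, assuming a text-only request packs (context, gen_kwargs) while a multimodal request adds a third auxiliary-arguments entry (the exact contents of that entry are an assumption here):

# Sketch of the text-only fallback guard used throughout this commit.
from typing import Any, Tuple

def is_text_only_batch(first_request_args: Tuple[Any, ...]) -> bool:
    # Only the first request of the batch is inspected; the batch is
    # assumed to be homogeneous (all text-only or all multimodal).
    return len(first_request_args) < 3

print(is_text_only_batch(("2 + 2 =", {"until": ["\n"]})))                      # True  -> parent HFLM path
print(is_text_only_batch(("Describe:", {"until": ["\n"]}, {"visual": None})))  # False -> multimodal path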

lm_eval/models/huggingface.py

Lines changed: 4 additions & 1 deletion
@@ -890,7 +890,10 @@ def _model_call(self, inps, attn_mask=None, labels=None):
                 input_ids=inps, attention_mask=attn_mask, labels=labels
             ).logits
         else:
-            assert self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM
+            assert self.AUTO_MODEL_CLASS in (
+                transformers.AutoModelForCausalLM,
+                transformers.AutoModelForVision2Seq,
+            )
             return self.model(inps).logits

     def _model_generate(self, context, max_length, stop, **generation_kwargs):
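
Note: only the type assertion is widened here, so that a Vision2Seq-backed wrapper can reuse the text-only forward pass. A simplified sketch of the dispatch, with the encoder-decoder branch assumed from surrounding code rather than shown in this diff:

# Simplified sketch of _model_call after the change (the Seq2Seq condition is
# assumed from surrounding code; only the assert is touched by this commit).
import transformers

def model_call_sketch(lm, inps, attn_mask=None, labels=None):
    if lm.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
        # Encoder-decoder models need explicit labels to produce logits here.
        return lm.model(
            input_ids=inps, attention_mask=attn_mask, labels=labels
        ).logits
    # Decoder-only and Vision2Seq-backed models score text with a bare forward.
    assert lm.AUTO_MODEL_CLASS in (
        transformers.AutoModelForCausalLM,
        transformers.AutoModelForVision2Seq,
    )
    return lm.model(inps).logits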

lm_eval/models/vllm_vlms.py

Lines changed: 15 additions & 3 deletions
@@ -106,7 +106,7 @@ def tok_batch_multimodal_encode(
             outputs.append(inputs)
         return outputs

-    def _model_generate(
+    def _multimodal_model_generate(
         self,
         requests: List[List[dict]] = None,
         generate: bool = False,
@@ -218,7 +218,10 @@ def apply_chat_template(
     def generate_until(
         self, requests: List[Instance], disable_tqdm: bool = False
     ) -> List[str]:
-        # TODO: support text-only reqs
+        if requests and len(requests[0].args) < 3:
+            # Fall back to non-multimodal generation.
+            return super().generate_until(requests=requests, disable_tqdm=disable_tqdm)
+
         res = []

         def _collate(x):
@@ -293,7 +296,7 @@ def _collate(x):
             left_truncate_len=max_ctx_len,
         )

-        cont = self._model_generate(
+        cont = self._multimodal_model_generate(
             inputs, stop=until, generate=True, max_tokens=max_gen_toks, **kwargs
         )

@@ -309,3 +312,12 @@ def _collate(x):

         pbar.close()
         return res
+
+    def loglikelihood_rolling(self, requests: List[Instance]) -> List[float]:
+        if requests and len(requests[0].args) < 3:
+            # Fall back to non-multimodal generation.
+            return super().loglikelihood_rolling(requests=requests)
+        raise NotImplementedError(
+            "model type `vllm-vlm` does not support loglikelihood_rolling. Use 'vlm' model type for text-only loglikelihood_rolling tasks ",
+            "this is because we do not support measuring the loglikelihood a model assigns to an image.",
+        )
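
Note: with these fallbacks in place, a text-only benchmark can be pointed at a VLM model type directly. A hedged usage sketch through the library's Python entry point (the checkpoint name is a placeholder for whatever vision-language model you actually run; limit is set only to keep the smoke test small):

# Hypothetical usage sketch: run a text-only task against a multimodal model type.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf-multimodal",
    model_args="pretrained=llava-hf/llava-1.5-7b-hf",  # placeholder checkpoint
    tasks=["hellaswag"],  # text-only task; previously rejected for VLM model types
    limit=8,              # small subset for a quick smoke test
)
print(results["results"])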
