feat: add llava_onevision1_5 (#825)

mathCrazyy · web-flow · commit 1676c57f2bca · 2025-09-18T15:36:50.000+08:00
* feat: add llava_onevision1_5

* unset interleave_visuals

* fix: useless class, error link,

* fix: change repo

* improve: bot advice

* improve: bot advice.2

* fix format

* Remove invalid claude.yml workflow file

* 保存当前工作进度

* fix link

* fix link
diff --git a/.github/workflows/claude.yml b/.github/workflows/claude.yml
diff --git a/examples/models/llava_onevision1_5.sh b/examples/models/llava_onevision1_5.sh
@@ -0,0 +1,9 @@
+# "~" is not expanded by the shell inside double quotes; use $HOME so the cache path resolves.
+export HF_HOME="$HOME/.cache/huggingface"
+
+# pip install git+https://github.com/EvolvingLMMs-Lab/lmms-eval.git
+accelerate launch --num_processes=8 --main_process_port 12399 -m lmms_eval \
+    --model=llava_onevision1_5 \
+    --model_args=pretrained=lmms-lab/LLaVA-OneVision-1.5-8B-Instruct,attn_implementation=flash_attention_2,max_pixels=3240000 \
+    --tasks=mmerealworld,mmerealworld_cn,chartqa,docvqa_val,infovqa_val,mmstar,ocrbench \
+    --batch_size=1
diff --git a/lmms_eval/models/__init__.py b/lmms_eval/models/__init__.py
@@ -37,6 +37,7 @@
     "llava": "Llava",
     "llava_hf": "LlavaHf",
     "llava_onevision": "Llava_OneVision",
+    "llava_onevision1_5": "Llava_OneVision1_5",
     "llava_onevision_moviechat": "Llava_OneVision_MovieChat",
     "llava_sglang": "LlavaSglang",
     "llava_vid": "LlavaVid",
diff --git a/lmms_eval/models/simple/llava_onevision1_5.py b/lmms_eval/models/simple/llava_onevision1_5.py
@@ -0,0 +1,332 @@
+import base64
+import re
+from io import BytesIO
+from typing import List, Optional, Tuple, Union
+
+import decord
+import numpy as np
+import torch
+from accelerate import Accelerator, DistributedType
+from loguru import logger as eval_logger
+from PIL import Image
+from tqdm import tqdm
+from transformers import AutoModelForCausalLM, AutoProcessor, AutoTokenizer
+
+from lmms_eval import utils
+from lmms_eval.api.instance import Instance
+from lmms_eval.api.model import lmms
+from lmms_eval.api.registry import register_model
+
+try:
+    from qwen_vl_utils import process_vision_info
+except ImportError:
+    eval_logger.warning("Failed to import qwen_vl_utils; Please install it via `pip install qwen-vl-utils`")
+
+
+@register_model("llava_onevision1_5")
+class Llava_OneVision1_5(lmms):
+    """
+    Llava_OneVision1_5 Model
+    "https://huggingface.co/lmms-lab/LLaVA-OneVision-1.5-8B-Instruct"
+    """
+
+    def __init__(
+        self,
+        pretrained: str = "lmms-lab/LLaVA-OneVision-1.5-8B-Instruct",
+        device: Optional[str] = "cuda",
+        device_map: Optional[str] = "auto",
+        batch_size: Optional[Union[int, str]] = 1,
+        use_cache=True,
+        attn_implementation: Optional[str] = None,
+        min_pixels: int = 256 * 28 * 28,
+        max_pixels: int = 1605632,
+        max_num_frames: int = 32,
+        use_custom_video_loader: Optional[bool] = False,
+        fps: Optional[float] = None,  # Only applicable if use_custom_video_loader is True
+        max_image_size: Optional[int] = None,  # Only applicable if use_custom_video_loader is True
+        system_prompt: Optional[str] = "You are a helpful assistant.",
+        interleave_visuals: Optional[bool] = False,
+        reasoning_prompt: Optional[str] = None,
+        max_length: int = 2048,
+        **kwargs,
+    ) -> None:
+        """Load the model, processor and tokenizer and set up device placement.
+
+        Args:
+            pretrained: Checkpoint name or path passed to every ``from_pretrained`` call.
+            device: Device used to build ``self._device`` when running a single process.
+            device_map: ``device_map`` used for model loading in a single process; falls
+                back to ``device`` when falsy. With more than one process both are
+                overridden with ``cuda:<local_process_index>``.
+            batch_size: Per-process batch size; converted with ``int()``.
+            use_cache: Stored and forwarded as ``use_cache`` at generation time.
+            attn_implementation: One of ``None``, ``"flash_attention_2"``, ``"sdpa"``,
+                ``"eager"``; forwarded to model loading only when not ``None``.
+            min_pixels: Stored and forwarded to the processor.
+            max_pixels: Stored and forwarded to the processor.
+            max_num_frames: Stored; used as the number of frames sampled per video.
+            use_custom_video_loader: Stored; only consulted here to validate
+                ``max_image_size``.
+            fps: Stored; not otherwise used in this constructor.
+            max_image_size: Stored; rejected unless ``use_custom_video_loader`` is set.
+            system_prompt: Stored; used as the system message of each conversation.
+            interleave_visuals: Stored; selects how visuals are placed in the prompt.
+            reasoning_prompt: Optional suffix appended to each context; literal
+                backslash-n sequences are converted to real newlines.
+            max_length: Stored as an ``int`` and exposed via ``max_length``.
+            **kwargs: Unexpected arguments; ignored with a warning.
+
+        Raises:
+            ValueError: If ``attn_implementation`` is not one of the accepted values, or
+                ``max_image_size`` is given without ``use_custom_video_loader``.
+            AssertionError: If running multi-process under a distributed type other
+                than FSDP or MULTI_GPU.
+        """
+        super().__init__()
+        if kwargs:
+            eval_logger.warning(f"Ignoring unexpected kwargs: {list(kwargs.keys())}")
+
+        # Validate attention implementation
+        valid_attn_implementations = [None, "flash_attention_2", "sdpa", "eager"]
+        if attn_implementation not in valid_attn_implementations:
+            raise ValueError(f"attn_implementation must be one of {valid_attn_implementations}, got {attn_implementation}")
+
+        self.use_custom_video_loader = use_custom_video_loader
+        self.fps = fps
+        # The fps check below is intentionally disabled, so fps is accepted unconditionally.
+        # if self.fps and not self.use_custom_video_loader:
+        #     raise ValueError("FPS is only applicable if use_custom_video_loader is True")
+        self.max_image_size = max_image_size
+        if self.max_image_size and not self.use_custom_video_loader:
+            raise ValueError("max_image_size is only applicable if use_custom_video_loader is True")
+
+        # With several processes each one pins the model to its own local GPU;
+        # otherwise the caller-supplied device / device_map are honoured.
+        accelerator = Accelerator()
+        if accelerator.num_processes > 1:
+            self._device = torch.device(f"cuda:{accelerator.local_process_index}")
+            self.device_map = f"cuda:{accelerator.local_process_index}"
+        else:
+            self._device = torch.device(device)
+            self.device_map = device_map if device_map else device
+
+        # Prepare model loading arguments
+        model_kwargs = {"torch_dtype": "auto", "device_map": self.device_map, "trust_remote_code": True}
+
+        # Add attention implementation if specified
+        if attn_implementation is not None:
+            model_kwargs["attn_implementation"] = attn_implementation
+
+        self._model = AutoModelForCausalLM.from_pretrained(pretrained, **model_kwargs).eval()
+        self.max_pixels = max_pixels
+        self.min_pixels = min_pixels
+        self.max_num_frames = max_num_frames
+
+        # Command-line values carry a literal backslash-n; turn it into a real newline.
+        if reasoning_prompt:
+            self.reasoning_prompt = reasoning_prompt.replace("\\n", "\n")
+        else:
+            self.reasoning_prompt = None
+        self.processor = AutoProcessor.from_pretrained(pretrained, max_pixels=max_pixels, min_pixels=min_pixels, trust_remote_code=True)
+        self._tokenizer = AutoTokenizer.from_pretrained(pretrained, trust_remote_code=True)
+        self.system_prompt = system_prompt
+        self.interleave_visuals = interleave_visuals
+
+        # self.accelerator is not set yet, so the `model` property returns self._model here.
+        self._config = self.model.config
+        self._max_length = int(max_length)
+        self.batch_size_per_gpu = int(batch_size)
+        self.use_cache = use_cache
+
+        if accelerator.num_processes > 1:
+            assert accelerator.distributed_type in [
+                DistributedType.FSDP,
+                DistributedType.MULTI_GPU,
+            ], "Unsupported distributed type provided. Only DDP and FSDP are supported."
+            if accelerator.distributed_type == DistributedType.FSDP:
+                self._model = accelerator.prepare(self.model)
+            else:
+                self._model = accelerator.prepare_model(self.model, evaluation_mode=True)
+            # Only set in the multi-process case; the `model` property keys off its presence.
+            self.accelerator = accelerator
+            if self.accelerator.is_local_main_process:
+                eval_logger.info(f"Using {accelerator.num_processes} devices with data parallelism")
+            self._rank = self.accelerator.local_process_index
+            self._world_size = self.accelerator.num_processes
+        else:
+            self._rank = 0
+            self._world_size = 1
+
+    @property
+    def config(self):
+        """Return the configuration object read from the loaded model (``model.config``)."""
+        return self._config
+
+    @property
+    def tokenizer(self):
+        """Return the tokenizer loaded for the pretrained checkpoint."""
+        return self._tokenizer
+
+    @property
+    def model(self):
+        """Return the model, unwrapped through Accelerate when an accelerator is attached."""
+        if not hasattr(self, "accelerator"):
+            return self._model
+        return self.accelerator.unwrap_model(self._model)
+
+    @property
+    def eot_token_id(self):
+        """Return the tokenizer's ``eos_token_id``, used as the end-of-text token id."""
+        return self.tokenizer.eos_token_id
+
+    @property
+    def max_length(self):
+        """Return the ``max_length`` value given at construction, as an ``int``."""
+        return self._max_length
+
+    @property
+    def batch_size(self):
+        """Return the per-process batch size given at construction."""
+        return self.batch_size_per_gpu
+
+    @property
+    def device(self):
+        """Return the ``torch.device`` selected for this process at construction."""
+        return self._device
+
+    @property
+    def rank(self):
+        """Return this process's rank: the local process index when multi-process, else 0."""
+        return self._rank
+
+    @property
+    def world_size(self):
+        """Return the number of processes: ``accelerator.num_processes`` when multi-process, else 1."""
+        return self._world_size
+
+    def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
+        """Loglikelihood scoring is not supported by this model wrapper.
+
+        Raises:
+            NotImplementedError: Always.
+        """
+        # The message previously named Qwen2.5_VL, a copy-paste leftover that misled users.
+        raise NotImplementedError("Loglikelihood is not implemented for Llava_OneVision1_5")
+
+    def flatten(self, input):
+        """Return one flat list holding the items of every inner iterable of ``input``, in order."""
+        return [element for group in input for element in group]
+
+    def generate_until(self, requests: List[Instance]) -> List[str]:
+        """Generate one text response per request.
+
+        Each request's ``args`` is unpacked as
+        ``(context, gen_kwargs, doc_to_visual, doc_id, task, split)``. Requests are
+        batched, turned into chat messages (system prompt, visuals, text), run through
+        ``model.generate`` and truncated at the first occurrence of any ``until`` string.
+
+        Args:
+            requests: Evaluation instances to answer.
+
+        Returns:
+            The generated strings, restored to the order of ``requests``.
+
+        Raises:
+            ValueError: If ``gen_kwargs["until"]`` is neither a string nor a list.
+        """
+        res = []
+
+        def _collate(x):
+            # Sort by descending token count: time estimates err on the high side, the
+            # first item of a batch fixes its padded length, and any OOM shows up early.
+            toks = self.tokenizer.encode(x[0])
+            return -len(toks), x[0]
+
+        pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding")
+        # Group requests by their generation kwargs so that e.g. greedy decoding and
+        # temperature sampling never end up in the same batch.
+        re_ords = utils.Collator([reg.args for reg in requests], _collate, grouping=True)
+        chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None)
+        for chunk in chunks:
+            contexts, all_gen_kwargs, doc_to_visual, doc_id, task, split = zip(*chunk)
+            task = task[0]
+            split = split[0]
+            visual_list = [doc_to_visual[0](self.task_dict[task][split][ids]) for ids in doc_id]
+            gen_kwargs = all_gen_kwargs[0]
+
+            # Default stop sequence is the decoded end-of-text token.
+            until = gen_kwargs.get("until", [self.tokenizer.decode(self.eot_token_id)])
+
+            if isinstance(until, str):
+                until = [until]
+            elif not isinstance(until, list):
+                raise ValueError(f"Expected `gen_kwargs['until']` to be of type Union[str, list], but got {type(until)}")
+
+            # Do not stop on a blank line: it truncates multi-paragraph answers and
+            # can lead to incorrect results.
+            until = [item for item in until if item != "\n\n"]
+
+            # Visuals are passed as structured message parts, so generic "<image>"
+            # placeholders are dropped from the text (done once, as a mutable list).
+            contexts = [context.replace("<image>", "") for context in contexts]
+
+            batched_messages = []
+            for i, context in enumerate(contexts):
+                message = [{"role": "system", "content": self.system_prompt}]
+                if self.reasoning_prompt:
+                    context = context.strip() + self.reasoning_prompt
+                    # Keep the stored context in sync; it is the key used for the cache hook.
+                    contexts[i] = context
+
+                processed_visuals = []
+                for visual in visual_list[i]:
+                    if isinstance(visual, str) and visual.endswith((".mp4", ".avi", ".mov")):  # Video file
+                        # The path is handed on as-is; decoding happens in process_vision_info.
+                        processed_visuals.append({"type": "video", "video": visual, "max_pixels": self.max_pixels, "min_pixels": self.min_pixels})
+                    elif isinstance(visual, Image.Image):
+                        processed_visuals.append({"type": "image", "image": visual.convert("RGB")})
+
+                if self.interleave_visuals is False:
+                    message.append(
+                        {
+                            "role": "user",
+                            "content": processed_visuals + [{"type": "text", "text": context}],
+                        }
+                    )
+                else:  # currently support find <image x> in the context
+                    image_placeholders = re.findall(r"<image \d+>", context)
+                    content_parts = []
+                    text_parts = re.split(r"<image \d+>", context)
+                    if text_parts[0]:
+                        content_parts.append({"type": "text", "text": text_parts[0]})
+
+                    # A distinct loop variable avoids clobbering the batch index `i`.
+                    for part_idx, placeholder in enumerate(image_placeholders):
+                        img_idx = int(re.search(r"<image (\d+)>", placeholder).group(1)) - 1
+                        image_idx = min(img_idx, len(processed_visuals) - 1) if processed_visuals else 0
+                        if processed_visuals and image_idx < len(processed_visuals):
+                            content_parts.append(processed_visuals[image_idx])
+                        if part_idx + 1 < len(text_parts) and text_parts[part_idx + 1]:
+                            content_parts.append({"type": "text", "text": text_parts[part_idx + 1]})
+
+                    message.append(
+                        {
+                            "role": "user",
+                            "content": content_parts,
+                        }
+                    )
+
+                batched_messages.append(message)
+
+            texts = [self.processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in batched_messages]
+            image_inputs, video_inputs = process_vision_info(batched_messages)
+            if video_inputs is not None:
+                # Subsample every video in the batch (not only the first) to
+                # max_num_frames evenly spaced frames, always keeping the last frame.
+                for video_idx, frames in enumerate(video_inputs):
+                    total_frames = frames.shape[0]
+                    indices = np.linspace(0, total_frames - 1, self.max_num_frames, dtype=int)
+                    if total_frames - 1 not in indices:
+                        indices = np.append(indices, total_frames - 1)
+                    video_inputs[video_idx] = frames[indices]
+            inputs = self.processor(text=texts, images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")
+
+            if self.device_map == "auto":
+                inputs = inputs.to("cuda")
+            else:
+                inputs = inputs.to(self.device)
+
+            # Set default generation kwargs
+            default_gen_kwargs = {
+                "max_new_tokens": 128,
+                "temperature": 0.0,  # Set to 0 for greedy default
+                "top_p": None,
+                "num_beams": 1,
+            }
+            # Update with provided kwargs
+            current_gen_kwargs = {**default_gen_kwargs, **gen_kwargs}
+            pad_token_id = self.tokenizer.pad_token_id or self.tokenizer.eos_token_id
+            do_sample = bool(current_gen_kwargs.get("temperature", 0) and current_gen_kwargs["temperature"] > 0)
+            gen_args = {
+                **inputs,
+                "eos_token_id": self.tokenizer.eos_token_id,
+                "pad_token_id": pad_token_id,
+                "num_beams": current_gen_kwargs["num_beams"],
+                "max_new_tokens": current_gen_kwargs["max_new_tokens"],
+                "use_cache": self.use_cache,
+            }
+            # NOTE(review): when not sampling, do_sample is not passed, so the checkpoint's
+            # generation config decides the decoding mode - confirm it defaults to greedy.
+            if do_sample:
+                # top_p defaults to None above, so `.get("top_p", 1.0)` would return None
+                # and float(None) would raise; fall back to 1.0 explicitly.
+                top_p = current_gen_kwargs.get("top_p")
+                gen_args.update(
+                    do_sample=True,
+                    temperature=float(current_gen_kwargs["temperature"]),
+                    top_p=1.0 if top_p is None else float(top_p),
+                )
+            with torch.inference_mode():
+                cont = self.model.generate(**gen_args)
+
+            # Drop the prompt tokens so only newly generated tokens are decoded.
+            generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, cont)]
+            answers = self.processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
+            for i, ans in enumerate(answers):
+                for term in until:
+                    if len(term) > 0:
+                        ans = ans.split(term)[0]
+                answers[i] = ans
+
+            for ans, context in zip(answers, contexts):
+                res.append(ans)
+                self.cache_hook.add_partial("generate_until", (context, gen_kwargs), ans)
+                pbar.update(1)
+
+        # reorder the results back to the original, unsorted request order
+        res = re_ords.get_original(res)
+
+        pbar.close()
+        return res
+
+    def generate_until_multi_round(self, requests) -> List[str]:
+        """Multi-round generation entry point; not implemented for this model.
+
+        Raises:
+            NotImplementedError: Always.
+        """
+        message = "TODO: Implement multi-round generation"
+        raise NotImplementedError(message)