[FEATURE] Add Chameleon generate example (#1019)

zhtmike · web-flow · commit 48a81b191111 · 2025-07-28T08:00:45.000Z
* update UT

* update model

* fix UT

* add to auto model

* add generate.py

* fix sampling

* align with v4.50 and support batch inference

* remove generate examples and add preprocess

* add copyright
diff --git a/mindone/transformers/__init__.py b/mindone/transformers/__init__.py
@@ -142,8 +142,10 @@
 )
 from .models.chameleon import (
     ChameleonForConditionalGeneration,
+    ChameleonImageProcessor,
     ChameleonModel,
     ChameleonPreTrainedModel,
+    ChameleonProcessor,
     ChameleonVQVAE,
 )
 from .models.clap import (
diff --git a/mindone/transformers/models/__init__.py b/mindone/transformers/models/__init__.py
@@ -30,6 +30,7 @@
     blip_2,
     camembert,
     canine,
+    chameleon,
     clap,
     clip,
     convbert,
diff --git a/mindone/transformers/models/auto/configuration_auto.py b/mindone/transformers/models/auto/configuration_auto.py
@@ -43,6 +43,7 @@
         ("bit", "BitConfig"),
         ("blip", "BlipConfig"),
         ("blip-2", "Blip2Config"),
+        ("chameleon", "ChameleonConfig"),
         ("camembert", "CamembertConfig"),
         ("convbert", "ConvBertConfig"),
         ("clip", "CLIPConfig"),
diff --git a/mindone/transformers/models/auto/image_processing_auto.py b/mindone/transformers/models/auto/image_processing_auto.py
@@ -50,6 +50,7 @@
             ("beit", ("BeitImageProcessor",)),
             ("blip", ("BlipImageProcessor",)),
             ("blip-2", ("BlipImageProcessor",)),
+            ("chameleon", ("ChameleonImageProcessor",)),
             ("clip", ("CLIPImageProcessor",)),
             ("dpt", ("DPTImageProcessor",)),
             ("llava_next", ("LlavaNextImageProcessor",)),
diff --git a/mindone/transformers/models/auto/modeling_auto.py b/mindone/transformers/models/auto/modeling_auto.py
@@ -283,6 +283,7 @@
         ("aria", "AriaForConditionalGeneration"),
         ("blip", "BlipForConditionalGeneration"),
         ("blip-2", "Blip2ForConditionalGeneration"),
+        ("chameleon", "ChameleonForConditionalGeneration"),
         ("gemma3", "Gemma3ForConditionalGeneration"),
         ("chameleon", "ChameleonForConditionalGeneration"),
         ("idefics", "IdeficsForVisionText2Text"),
diff --git a/mindone/transformers/models/auto/processing_auto.py b/mindone/transformers/models/auto/processing_auto.py
@@ -50,6 +50,7 @@
 PROCESSOR_MAPPING_NAMES = OrderedDict(
     [
         ("blip", "BlipProcessor"),
+        ("chameleon", "ChameleonProcessor"),
         ("llava_next", "LlavaNextProcessor"),
         ("llava_next_video", "LlavaNextVideoProcessor"),
         ("llava_onevision", "LlavaOnevisionProcessor"),
diff --git a/mindone/transformers/models/chameleon/__init__.py b/mindone/transformers/models/chameleon/__init__.py
@@ -14,4 +14,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from .image_processing_chameleon import *
 from .modeling_chameleon import *
+from .processing_chameleon import *
diff --git a/mindone/transformers/models/chameleon/image_processing_chameleon.py b/mindone/transformers/models/chameleon/image_processing_chameleon.py
diff --git a/mindone/transformers/models/chameleon/modeling_chameleon.py b/mindone/transformers/models/chameleon/modeling_chameleon.py
diff --git a/mindone/transformers/models/chameleon/processing_chameleon.py b/mindone/transformers/models/chameleon/processing_chameleon.py
@@ -0,0 +1,182 @@
+# coding=utf-8
+# Copyright 2024 Meta Inc. and The HuggingFace Inc. team. All rights reserved.
+#
+# This code is adapted from https://github.com/huggingface/transformers
+# with modifications to run transformers on mindspore.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Processor class for Chameleon.
+"""
+from typing import List, Optional, Union
+
+from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
+
+import mindspore as ms
+
+from ...feature_extraction_utils import BatchFeature
+from ...image_utils import ImageInput
+from ...processing_utils import ProcessingKwargs, ProcessorMixin, TextKwargs, Unpack, _validate_images_text_input_order
+
+
+class ChameleonTextKwargs(TextKwargs, total=False):
+    """Text keyword arguments for Chameleon: `TextKwargs` plus one optional Chameleon-specific flag."""
+
+    # When falsy, `ChameleonProcessor.__call__` appends the tokenizer's `sep_token` to every prompt (chat mode).
+    return_for_text_completion: bool
+
+
+class ChameleonProcessorKwargs(ProcessingKwargs, total=False):
+    """Keyword-argument schema and default values that `ChameleonProcessor.__call__` passes to `_merge_kwargs`."""
+
+    text_kwargs: ChameleonTextKwargs
+    # Default values, grouped by kwargs section.
+    _defaults = {
+        "text_kwargs": {
+            "padding": False,
+            # Chat mode by default: `__call__` appends the separator token unless this is truthy.
+            "return_for_text_completion": False,
+        },
+        "common_kwargs": {
+            # "ms" requests `mindspore.Tensor` outputs.
+            "return_tensors": "ms",
+        },
+    }
+
+
+class ChameleonProcessor(ProcessorMixin):
+    r"""
+    Constructs a Chameleon processor which wraps a Chameleon image processor and a Chameleon tokenizer into a single
+    processor.
+
+    [`ChameleonProcessor`] offers all the functionalities of [`ChameleonImageProcessor`] and [`LlamaTokenizerFast`].
+    See the [`~ChameleonProcessor.__call__`] and [`~ChameleonProcessor.decode`] for more information.
+
+    Args:
+        image_processor ([`ChameleonImageProcessor`]):
+            The image processor is a required input.
+        tokenizer ([`LlamaTokenizerFast`]):
+            The tokenizer is a required input.
+        image_seq_length (`int`, *optional*, defaults to 1024):
+            Sequence length of one image embedding.
+        image_token (`str`, *optional*, defaults to `"<image>"`):
+            The special token used to indicate image in the text.
+    """
+
+    attributes = ["image_processor", "tokenizer"]
+    tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast")
+    valid_kwargs = ["image_seq_length", "image_token"]
+    image_processor_class = "ChameleonImageProcessor"
+
+    def __init__(self, image_processor, tokenizer, image_seq_length: int = 1024, image_token: str = "<image>"):
+        self.image_seq_length = image_seq_length
+        self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token
+        self.image_start_token = (
+            tokenizer.boi_token if hasattr(tokenizer, "boi_token") else "<racm3:break>"
+        )  # fixed tokens for start and end, so can hardcode
+        self.image_end_token = tokenizer.eoi_token if hasattr(tokenizer, "eoi_token") else "<eoss>"
+
+        super().__init__(image_processor, tokenizer)
+
+    def __call__(
+        self,
+        images: Optional[ImageInput] = None,
+        text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
+        audio=None,
+        videos=None,
+        **kwargs: Unpack[ChameleonProcessorKwargs],
+    ) -> BatchFeature:
+        """
+        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
+        and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
+        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
+        CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
+        of the above two methods for more information.
+
+        Args:
+            images (`PIL.Image.Image`, `np.ndarray`, `mindspore.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[mindspore.Tensor]`):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. Both channels-first and channels-last formats are supported.
+            text (`str`, `List[str]`, `List[List[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors of a particular framework. Acceptable values are:
+
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'ms'`: Return PyTorch `mindspore.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
+
+        Returns:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+
+            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+              `None`).
+            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
+        """
+        # check if images and text inputs are reversed for BC
+        images, text = _validate_images_text_input_order(images, text)
+        if isinstance(text, str):
+            text = [text]
+        elif not isinstance(text, list) and not isinstance(text[0], str):
+            raise TypeError("Invalid input text. Please provide a string, or a list of strings")
+        if text is None and images is None:
+            raise ValueError("You must provide either text or images")
+
+        output_kwargs = self._merge_kwargs(
+            ChameleonProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+        return_for_text_completion = output_kwargs["text_kwargs"].pop("return_for_text_completion", False)
+
+        # Replace the image token with the expanded image token sequence
+        prompt_strings = []
+        one_img_tokens = self.image_start_token + (self.image_token * self.image_seq_length) + self.image_end_token
+        for sample in text:
+            sample = sample.replace(self.image_token, one_img_tokens)
+            if not return_for_text_completion:
+                sample += self.tokenizer.sep_token  # special Chameleon treatment to add sep for chat mode
+            prompt_strings.append(sample)
+
+        output_kwargs["text_kwargs"].pop("return_tensors", None)
+        data = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"], return_tensors="np")
+        for k, v in data.items():
+            data[k] = ms.tensor(v)
+
+        if images is not None:
+            data["pixel_values"] = self.image_processor(images, **output_kwargs["images_kwargs"])["pixel_values"]
+
+        return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"]["return_tensors"])
+
+    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
+    def batch_decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
+        refer to the docstring of this method for more information.
+        """
+        return self.tokenizer.batch_decode(*args, **kwargs)
+
+    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Llama
+    def decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
+        the docstring of this method for more information.
+        """
+        return self.tokenizer.decode(*args, **kwargs)
+
+    @property
+    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names
+    def model_input_names(self):
+        tokenizer_input_names = self.tokenizer.model_input_names
+        image_processor_input_names = self.image_processor.model_input_names
+        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
+
+
+# Public API of this module; this is what `from .processing_chameleon import *` exposes.
+__all__ = ["ChameleonProcessor"]
diff --git a/mindone/transformers/models/llava_next/__init__.py b/mindone/transformers/models/llava_next/__init__.py
@@ -1,3 +1,19 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# This code is adapted from https://github.com/huggingface/transformers
+# with modifications to run transformers on mindspore.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 from .image_processing_llava_next import *
 from .modeling_llava_next import *
 from .processing_llava_next import *
diff --git a/mindone/transformers/models/llava_next/image_processing_llava_next.py b/mindone/transformers/models/llava_next/image_processing_llava_next.py
@@ -1,6 +1,10 @@
 # coding=utf-8
 # Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
+# This code is adapted from https://github.com/huggingface/transformers
+# with modifications to run transformers on mindspore.
+#
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
diff --git a/mindone/transformers/models/llava_next/modeling_llava_next.py b/mindone/transformers/models/llava_next/modeling_llava_next.py
@@ -1,6 +1,10 @@
 # coding=utf-8
 # Copyright 2024 the HuggingFace Inc. team. All rights reserved.
 #
+# This code is adapted from https://github.com/huggingface/transformers
+# with modifications to run transformers on mindspore.
+#
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
diff --git a/mindone/transformers/models/llava_next/processing_llava_next.py b/mindone/transformers/models/llava_next/processing_llava_next.py
@@ -1,6 +1,10 @@
 # coding=utf-8
 # Copyright 2024 The HuggingFace Inc. team.
 #
+# This code is adapted from https://github.com/huggingface/transformers
+# with modifications to run transformers on mindspore.
+#
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
diff --git a/mindone/transformers/models/llava_next_video/__init__.py b/mindone/transformers/models/llava_next_video/__init__.py
@@ -1,3 +1,20 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# This code is adapted from https://github.com/huggingface/transformers
+# with modifications to run transformers on mindspore.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 from .image_processing_llava_next_video import *
 from .modeling_llava_next_video import *
 from .processing_llava_next_video import *
diff --git a/mindone/transformers/models/llava_next_video/image_processing_llava_next_video.py b/mindone/transformers/models/llava_next_video/image_processing_llava_next_video.py
@@ -1,6 +1,10 @@
 # coding=utf-8
 # Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
+# This code is adapted from https://github.com/huggingface/transformers
+# with modifications to run transformers on mindspore.
+#
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
diff --git a/mindone/transformers/models/llava_next_video/modeling_llava_next_video.py b/mindone/transformers/models/llava_next_video/modeling_llava_next_video.py
@@ -1,6 +1,10 @@
 # coding=utf-8
 # Copyright 2024 HuggingFace Inc. team. All rights reserved.
 #
+# This code is adapted from https://github.com/huggingface/transformers
+# with modifications to run transformers on mindspore.
+#
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
diff --git a/mindone/transformers/models/llava_next_video/processing_llava_next_video.py b/mindone/transformers/models/llava_next_video/processing_llava_next_video.py
@@ -1,6 +1,10 @@
 # coding=utf-8
 # Copyright 2024 The HuggingFace Inc. team.
 #
+# This code is adapted from https://github.com/huggingface/transformers
+# with modifications to run transformers on mindspore.
+#
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
diff --git a/mindone/transformers/models/llava_onevision/__init__.py b/mindone/transformers/models/llava_onevision/__init__.py
@@ -1,3 +1,20 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# This code is adapted from https://github.com/huggingface/transformers
+# with modifications to run transformers on mindspore.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 from .image_processing_llava_onevision import *
 from .modeling_llava_onevision import *
 from .processing_llava_onevision import *
diff --git a/mindone/transformers/models/llava_onevision/image_processing_llava_onevision.py b/mindone/transformers/models/llava_onevision/image_processing_llava_onevision.py
@@ -1,6 +1,10 @@
 # coding=utf-8
 # Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
+# This code is adapted from https://github.com/huggingface/transformers
+# with modifications to run transformers on mindspore.
+#
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
diff --git a/mindone/transformers/models/llava_onevision/modeling_llava_onevision.py b/mindone/transformers/models/llava_onevision/modeling_llava_onevision.py
@@ -1,6 +1,10 @@
 # coding=utf-8
 # Copyright 2024 the HuggingFace Inc. team. All rights reserved.
 #
+# This code is adapted from https://github.com/huggingface/transformers
+# with modifications to run transformers on mindspore.
+#
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
diff --git a/mindone/transformers/models/llava_onevision/processing_llava_onevision.py b/mindone/transformers/models/llava_onevision/processing_llava_onevision.py
diff --git a/mindone/transformers/models/llava_onevision/video_processing_llava_onevision.py b/mindone/transformers/models/llava_onevision/video_processing_llava_onevision.py
diff --git a/tests/transformers_tests/models/chameleon/test_modeling_chameleon.py b/tests/transformers_tests/models/chameleon/test_modeling_chameleon.py
diff --git a/tests/transformers_tests/models/llava_next/test_modeling_llava_next.py b/tests/transformers_tests/models/llava_next/test_modeling_llava_next.py
diff --git a/tests/transformers_tests/models/llava_next_video/test_modeling_llava_next_video.py b/tests/transformers_tests/models/llava_next_video/test_modeling_llava_next_video.py
diff --git a/tests/transformers_tests/models/llava_onevision/test_modeling_llava_onevision.py b/tests/transformers_tests/models/llava_onevision/test_modeling_llava_onevision.py

Original file line number	Diff line number	Diff line change
`@@ -142,8 +142,10 @@`
`142`	`142`	`)`
`143`	`143`	`from .models.chameleon import (`
`144`	`144`	`ChameleonForConditionalGeneration,`
	`145`	`+ ChameleonImageProcessor,`
`145`	`146`	`ChameleonModel,`
`146`	`147`	`ChameleonPreTrainedModel,`
	`148`	`+ ChameleonProcessor,`
`147`	`149`	`ChameleonVQVAE,`
`148`	`150`	`)`
`149`	`151`	`from .models.clap import (`
Original file line number	Diff line number	Diff line change
`@@ -50,6 +50,7 @@`
`50`	`50`	`PROCESSOR_MAPPING_NAMES = OrderedDict(`
`51`	`51`	`[`
`52`	`52`	`("blip", "BlipProcessor"),`
	`53`	`+ ("chameleon", "ChameleonProcessor"),`
`53`	`54`	`("llava_next", "LlavaNextProcessor"),`
`54`	`55`	`("llava_next_video", "LlavaNextVideoProcessor"),`
`55`	`56`	`("llava_onevision", "LlavaOnevisionProcessor"),`