 from ..register import TemplateMeta, register_template
 from ..template_inputs import StdTemplateInputs
 from ..utils import Context, Prompt, findall
+from ..vision_utils import load_audio


 @dataclass
@@ -129,3 +130,102 @@ def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]: |


 register_template(GemmaTemplateMeta(MLLMTemplateType.gemma3_vision, template_cls=Gemma3VisionTemplate))
+
+
+class Gemma3nTemplate(Gemma3Template):
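+    """Template for Gemma 3n multimodal models.
+
+    Expands the '<start_of_image>' / '<start_of_audio>' placeholders into the
+    processor's full image/audio token sequences and prepares the pixel values
+    and audio features expected by the model.
+    """
+
+    # Vocabulary ids of the '<start_of_image>' and '<start_of_audio>' placeholder tokens.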
+    boi_token_id = 255999
+    boa_token_id = 256000
+    placeholder_tokens = ['<start_of_image>', '<start_of_audio>']
+
+    def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int,
+                    inputs: StdTemplateInputs) -> List[Context]:
+        if media_type == 'image':
+            return ['<start_of_image>']
+        elif media_type == 'audio':
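+            # Decode and resample the audio to the feature extractor's sampling rate up front,
+            # so _encode can pass the raw arrays straight to the feature extractor.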
+            inputs.audios[index] = load_audio(inputs.audios[index], self.processor.feature_extractor.sampling_rate)
+            return ['<start_of_audio>']
+        else:
+            raise ValueError(f'Unsupported media type: {media_type}. Supported types are: image, audio')
+
+    def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
+        from transformers.models.gemma3n.processing_gemma3n import Gemma3nProcessorKwargs
+
+        # Input validation
+        if not inputs.images and not inputs.audios and not inputs.messages:
+            raise ValueError('Provide at least one of `images`, `audios`, or `messages`.')
+
+        encoded = super()._encode(inputs)
+        processor = self.processor
+        input_ids = encoded['input_ids']
+        labels = encoded['labels']
+
+        # Initialize token_type_ids and other outputs
+        array_ids = np.array(input_ids)
+        mm_token_type_ids = np.zeros_like(input_ids)
+
+        # Handle images
+        if inputs.images:
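+            # Expand each '<start_of_image>' placeholder into the processor's full image token sequence.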
+            idx_list = findall(input_ids, self.boi_token_id)
+            img_tokens = self._tokenize(processor.full_image_sequence)
+            input_ids, labels = self._extend_tokens(input_ids, labels, idx_list, lambda _: img_tokens)
+
+            # Process images
+            processor_kwargs = Gemma3nProcessorKwargs._defaults.get('images_kwargs', {})
+            image_inputs = processor.image_processor(inputs.images, **processor_kwargs)
+            image_inputs['pixel_values'] = torch.as_tensor(np.array(image_inputs['pixel_values']))
+            if 'num_crops' in image_inputs:
+                image_inputs.pop('num_crops')
+            encoded.update(image_inputs)
+
+        # Handle audios
+        if inputs.audios:
+            audio_idx_list = findall(input_ids, self.boa_token_id)
+            if audio_idx_list:
+                # Get audio token sequence from processor
+                audio_tokens = self._tokenize(processor.full_audio_sequence)
+                input_ids, labels = self._extend_tokens(input_ids, labels, audio_idx_list, lambda _: audio_tokens)
+
+                # Process audios
+                processor_kwargs = Gemma3nProcessorKwargs._defaults.get('audio_kwargs', {})
+                audio_inputs = processor.feature_extractor(inputs.audios, **processor_kwargs)
+
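+                # Convert the extracted features to tensors; input_features are cast to the model's dtype.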
+                if 'input_features' in audio_inputs:
+                    audio_inputs['input_features'] = torch.tensor(audio_inputs['input_features']).to(
+                        self.model_info.torch_dtype)
+                if 'input_features_mask' in audio_inputs:
+                    audio_inputs['input_features_mask'] = torch.tensor(audio_inputs['input_features_mask'])
+                encoded.update(audio_inputs)
+
+        # Update array_ids after token extension
+        array_ids = np.array(input_ids)
+        mm_token_type_ids = np.zeros_like(input_ids)
+
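+        # Mark multimodal positions in token_type_ids: 1 for image tokens, 3 for audio tokens, 0 for text.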
+        if hasattr(processor, 'image_token_id') and processor.image_token_id is not None:
+            mm_token_type_ids[array_ids == processor.image_token_id] = 1
+
+        if hasattr(processor, 'audio_token_id') and processor.audio_token_id is not None:
+            mm_token_type_ids[array_ids == processor.audio_token_id] = 3
+
+        encoded['token_type_ids'] = mm_token_type_ids.tolist()
+        encoded['input_ids'] = input_ids
+        encoded['labels'] = labels
+
+        return encoded
+
+    def _data_collator_mm_data(self, batch: List[Dict[str, Any]]) -> Dict[str, Any]:
+        """Handle multimodal data collation for Gemma3n, including audio features."""
+        res = super()._data_collator_mm_data(batch)
+
+        # Collate audio features across the batch; other multimodal keys are handled by the parent implementation.
+        input_features = [b['input_features'] for b in batch if b.get('input_features') is not None]
+        input_features_mask = [b['input_features_mask'] for b in batch if b.get('input_features_mask') is not None]
+
+        if input_features:
+            res['input_features'] = torch.concat(input_features)
+        if input_features_mask:
+            res['input_features_mask'] = torch.concat(input_features_mask)
+
+        return res
+
+
+register_template(GemmaTemplateMeta(MLLMTemplateType.gemma3n, template_cls=Gemma3nTemplate))