diff --git a/swift/model/constant.py b/swift/model/constant.py
index 204cc3f6eb..00746caa13 100644
--- a/swift/model/constant.py
+++ b/swift/model/constant.py
@@ -239,6 +239,7 @@ class MLLMModelType:
     idefics3 = 'idefics3'
     paligemma = 'paligemma'
     molmo = 'molmo'
+    molmo2 = 'molmo2'
     molmoe = 'molmoe'
     pixtral = 'pixtral'
     megrez_omni = 'megrez_omni'
diff --git a/swift/model/models/__init__.py b/swift/model/models/__init__.py
index 5a0661cfe1..8c67705142 100644
--- a/swift/model/models/__init__.py
+++ b/swift/model/models/__init__.py
@@ -1,3 +1,3 @@
 from . import (baai, baichuan, baidu, bert, codefuse, deepseek, gemma, glm, internlm, llama, llava, llm, mamba,
-               microsoft, minicpm, minimax, mistral, mllm, moonshot, mplug, openbuddy, qwen, seed, skywork, stepfun,
-               telechat, tencent, valley, yi)
+               microsoft, minicpm, minimax, mistral, mllm, molmo2, moonshot, mplug, openbuddy, qwen, seed, skywork,
+               stepfun, telechat, tencent, valley, yi)
diff --git a/swift/model/models/molmo2.py b/swift/model/models/molmo2.py
new file mode 100644
index 0000000000..a26b840969
--- /dev/null
+++ b/swift/model/models/molmo2.py
@@ -0,0 +1,139 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+import transformers
+from contextlib import contextmanager
+from packaging import version
+from transformers import PreTrainedModel
+from transformers.dynamic_module_utils import get_class_from_dynamic_module
+
+from swift.template import TemplateType
+from swift.utils import get_logger
+from ..constant import MLLMModelType
+from ..model_arch import ModelArch
+from ..model_meta import Model, ModelGroup, ModelMeta
+from ..patcher import patch_output_clone
+from ..register import ModelLoader, register_model
+
+logger = get_logger()
+
+
+class Molmo2Loader(ModelLoader):
+    """Loader for Molmo2 checkpoints whose modeling code is a dynamic module."""
+
+    @staticmethod
+    @contextmanager
+    def _patch_processor_optional_attributes_compat():
+        """Restrict ProcessorMixin compat to Molmo2 processor loading only.
+
+        While the context is active on transformers >= 5.0.0.dev,
+        `ProcessorMixin.__init__` is wrapped so that keyword arguments named in
+        the processor's `optional_attributes` (other than `chat_template` and
+        `audio_tokenizer`) are removed before the original initializer runs and
+        are set as instance attributes afterwards; missing ones default to
+        `None`. The original initializer is always restored on exit. On older
+        transformers, or if `ProcessorMixin` cannot be imported, this is a no-op.
+        """
+        if version.parse(transformers.__version__) < version.parse('5.0.0.dev'):
+            yield
+            return
+        try:
+            from transformers.processing_utils import ProcessorMixin
+        except Exception:
+            # Deliberately best-effort: without ProcessorMixin there is nothing to patch.
+            yield
+            return
+
+        origin_init = ProcessorMixin.__init__
+        # These optional attributes are forwarded to the original initializer untouched.
+        passthrough_keys = frozenset({'chat_template', 'audio_tokenizer'})
+
+        def _patched_init(self, *args, **kwargs):
+            optional_attributes = getattr(self, 'optional_attributes', None) or []
+            managed_keys = [key for key in optional_attributes if key not in passthrough_keys]
+            # Strip the managed kwargs so the original initializer never sees them.
+            optional_values = {key: kwargs.pop(key) for key in managed_keys if key in kwargs}
+
+            origin_init(self, *args, **kwargs)
+
+            for key in managed_keys:
+                if key in optional_values:
+                    setattr(self, key, optional_values[key])
+                elif not hasattr(self, key):
+                    setattr(self, key, None)
+
+        ProcessorMixin.__init__ = _patched_init
+        try:
+            yield
+        finally:
+            ProcessorMixin.__init__ = origin_init
+
+    @staticmethod
+    def _patch_vision_pooling_attention(model: PreTrainedModel) -> None:
+        """Switch `model.model.vision_backbone.image_pooling_2d` from flash-attn to sdpa.
+
+        Does nothing when any attribute on that path is missing or when the
+        pooling layer is not configured with `flash_attention_2`. The
+        `adapter_config.attn_implementation` value is updated too when it is
+        also `flash_attention_2`.
+        """
+        inner_model = getattr(model, 'model', None)
+        if inner_model is None:
+            return
+
+        vision_backbone = getattr(inner_model, 'vision_backbone', None)
+        if vision_backbone is None:
+            return
+        pooling = getattr(vision_backbone, 'image_pooling_2d', None)
+        if pooling is None or getattr(pooling, 'attn_implementation', None) != 'flash_attention_2':
+            return
+
+        pooling.attn_implementation = 'sdpa'
+        adapter_config = getattr(vision_backbone, 'adapter_config', None)
+        if adapter_config is not None and getattr(adapter_config, 'attn_implementation', None) == 'flash_attention_2':
+            adapter_config.attn_implementation = 'sdpa'
+        logger.info('Set Molmo2 vision_backbone.image_pooling_2d attention to `sdpa` to avoid '
+                    'flash-attn varlen failures on padded video batches.')
+
+    def get_processor(self, model_dir, config):
+        """Load the processor with the ProcessorMixin compatibility patch active."""
+        with self._patch_processor_optional_attributes_compat():
+            return super().get_processor(model_dir, config)
+
+    def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
+        """Load the Molmo2 model and apply the Molmo2-specific patches.
+
+        `MolmoSequentialBlock` is appended to the dynamic model class's
+        `_no_split_modules` before loading; afterwards
+        `model.model.transformer.wte` is passed to `patch_output_clone` and the
+        vision pooling attention is patched.
+        """
+        from transformers import AutoModelForImageTextToText
+        model_cls = get_class_from_dynamic_module('modeling_molmo2.Molmo2ForConditionalGeneration', model_dir)
+        # Normalize to a list: the remote class may declare a tuple/set (or None),
+        # for which `+ [...]` below would raise TypeError.
+        no_split_modules = list(getattr(model_cls, '_no_split_modules', None) or [])
+        if 'MolmoSequentialBlock' not in no_split_modules:
+            model_cls._no_split_modules = no_split_modules + ['MolmoSequentialBlock']
+        self.auto_model_cls = self.auto_model_cls or AutoModelForImageTextToText
+        model = super().get_model(model_dir, *args, **kwargs)
+        patch_output_clone(model.model.transformer.wte)
+        self._patch_vision_pooling_attention(model)
+        return model
+
+
+register_model(
+    ModelMeta(
+        MLLMModelType.molmo2,
+        [
+            ModelGroup([
+                Model('LLM-Research/Molmo2-4B', 'allenai/Molmo2-4B'),
+                Model('LLM-Research/Molmo2-8B', 'allenai/Molmo2-8B'),
+                Model('LLM-Research/Molmo2-O-7B', 'allenai/Molmo2-O-7B'),
+            ]),
+        ],
+        Molmo2Loader,
+        template=TemplateType.molmo2,
+        model_arch=ModelArch.molmo,
+        architectures=['Molmo2ForConditionalGeneration'],
+        tags=['vision', 'video'],
+        requires=['transformers>=4.57.1', 'decord'],
+    ))
diff --git a/swift/template/constant.py b/swift/template/constant.py
index ab20b50233..ecd2d8a44e 100644
--- a/swift/template/constant.py
+++ b/swift/template/constant.py
@@ -243,6 +243,7 @@ class MLLMTemplateType:
     phi4_multimodal = 'phi4_multimodal'
     florence = 'florence'
     molmo = 'molmo'
+    molmo2 = 'molmo2'
     megrez_omni = 'megrez_omni'
     valley = 'valley'
     gemma3_vision = 'gemma3_vision'
diff --git a/swift/template/templates/__init__.py b/swift/template/templates/__init__.py
index 1af552c8ed..5f7f20b2e4 100644
--- a/swift/template/templates/__init__.py
+++ b/swift/template/templates/__init__.py
@@ -1,3 +1,3 @@
 from . import (baai, baidu, bert, deepseek, dots, gemma, glm, idefics3, internlm, internvl, kwai, llama, llava, llm,
-               megrez, microsoft, midashenglm, minicpm, minimax, minimind, mistral, molmo, moonshot, mplug, openbuddy,
-               pixtral, qwen, seed, stepfun, tencent, valley, yi)
+               megrez, microsoft, midashenglm, minicpm, minimax, minimind, mistral, molmo, molmo2, moonshot, mplug,
+               openbuddy, pixtral, qwen, seed, stepfun, tencent, valley, yi)
diff --git a/swift/template/templates/molmo2.py b/swift/template/templates/molmo2.py
new file mode 100644
index 0000000000..15c11fa5d0
--- /dev/null
+++ b/swift/template/templates/molmo2.py
@@ -0,0 +1,367 @@
+# Copyright (c) ModelScope Contributors. All rights reserved.
+import copy
+import inspect
+import numpy as np
+import re
+from PIL import Image
+from typing import Any, Dict, List, Literal, Tuple
+
+from ..base import Template
+from ..constant import MLLMTemplateType
+from ..register import TemplateMeta, register_template
+from ..template_inputs import StdTemplateInputs
+from ..utils import Context
+
+
+class Molmo2Template(Template):
+    """Native Molmo2 template for image and video understanding.
+
+    Prompts are rendered with the processor's chat template and tokenized; the
+    `<|image|>` / `<|video|>` placeholder tokens are then replaced by the
+    per-media token sequences obtained from the processor.
+    """
+
+    use_model = True
+
+    placeholder_tokens = [
+        '<|image|>',
+        '<|video|>',
+        '<im_patch>',
+        '<frame_start>',
+        '<frame_end>',
+    ]
+
+    def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int,
+                    inputs: StdTemplateInputs) -> List[Context]:
+        """Return the Molmo2 placeholder for image/video tags; nothing for other media."""
+        if media_type == 'image':
+            return ['<|image|>']
+        if media_type == 'video':
+            return ['<|video|>']
+        return []
+
+    @staticmethod
+    def _load_video_descriptor(video_item: Any) -> Tuple[np.ndarray, np.ndarray, Dict[str, Any]]:
+        """Load the frames referenced by a video descriptor dict.
+
+        Args:
+            video_item: Dict with non-empty, equally long `frame_paths` and
+                `timestamps`; `source_video` is optional.
+
+        Returns:
+            `(frames, timestamps, metadata)`: the RGB frames stacked along axis 0,
+            the timestamps as a float32 array, and a small metadata dict.
+
+        Raises:
+            TypeError: If `video_item` is not a dict.
+            ValueError: If `frame_paths`/`timestamps` are empty or differ in length.
+        """
+        if not isinstance(video_item, dict):
+            raise TypeError('Molmo2 expects a video descriptor dict produced by the dataset preprocessor.')
+        frame_paths = video_item.get('frame_paths') or []
+        timestamps = video_item.get('timestamps') or []
+        if not frame_paths or not timestamps or len(frame_paths) != len(timestamps):
+            raise ValueError('Molmo2 video descriptor requires aligned `frame_paths` and `timestamps`.')
+
+        frames = []
+        for frame_path in frame_paths:
+            with Image.open(frame_path) as image:
+                frames.append(np.asarray(image.convert('RGB')))
+        frame_array = np.stack(frames, axis=0)
+        timestamp_array = np.asarray(timestamps, dtype=np.float32)
+        metadata = {
+            'frame_paths': frame_paths,
+            'source_video': video_item.get('source_video'),
+            'num_frames': len(frame_paths),
+        }
+        return frame_array, timestamp_array, metadata
+
+    @staticmethod
+    def _build_messages_for_processor(inputs: StdTemplateInputs) -> List[Dict[str, Any]]:
+        """Convert string message contents into structured content parts.
+
+        Each `<image>` / `<video>` tag in a string content becomes a media part
+        bound to the next entry of `inputs.images` / `inputs.videos`, and the
+        text between tags becomes text parts. Non-string contents are kept
+        unchanged. Works on a deep copy, so `inputs.messages` is not modified.
+
+        Raises:
+            ValueError: If the tags and `inputs.images` / `inputs.videos` do not
+                match one-to-one.
+        """
+        messages = copy.deepcopy(inputs.messages)
+        image_idx = 0
+        video_idx = 0
+        for message in messages:
+            content = message.get('content', '')
+            if not isinstance(content, str):
+                # Already structured (or None): nothing to parse.
+                continue
+            structured_content: List[Dict[str, Any]] = []
+            for chunk in re.split(r'(<image>|<video>)', content):
+                if not chunk:
+                    continue
+                if chunk == '<image>':
+                    if image_idx >= len(inputs.images):
+                        raise ValueError('The number of <image> tags does not match inputs.images.')
+                    structured_content.append({'type': 'image', 'image': inputs.images[image_idx]})
+                    image_idx += 1
+                elif chunk == '<video>':
+                    if video_idx >= len(inputs.videos):
+                        raise ValueError('The number of <video> tags does not match inputs.videos.')
+                    structured_content.append({'type': 'video', 'video': inputs.videos[video_idx]})
+                    video_idx += 1
+                else:
+                    structured_content.append({'type': 'text', 'text': chunk})
+            message['content'] = structured_content or [{'type': 'text', 'text': ''}]
+        if image_idx != len(inputs.images):
+            raise ValueError('Unused images remain after parsing message placeholders.')
+        if video_idx != len(inputs.videos):
+            raise ValueError('Unused videos remain after parsing message placeholders.')
+        return messages
+
+    @staticmethod
+    def _load_images(images: List[Any]) -> List[Image.Image]:
+        """Return `images` with PIL images and path strings converted to RGB PIL images.
+
+        Items of any other type are passed through unchanged.
+        """
+        loaded = []
+        for image in images:
+            if isinstance(image, Image.Image):
+                loaded.append(image.convert('RGB'))
+            elif isinstance(image, str):
+                with Image.open(image) as pil_image:
+                    loaded.append(pil_image.convert('RGB'))
+            else:
+                loaded.append(image)
+        return loaded
+
+    @staticmethod
+    def _build_video_metadata(frames: np.ndarray, timestamps: np.ndarray) -> Dict[str, Any]:
+        """Derive the metadata dict passed to the video processor.
+
+        `fps` is the reciprocal of the median gap between timestamps, falling
+        back to 1.0 for a single timestamp or a non-positive gap, and
+        `frames_indices` is `timestamps * fps` rounded to the nearest integer.
+        """
+        if len(timestamps) <= 1:
+            fps = 1.0
+        else:
+            median_diff = np.median(np.diff(timestamps))
+            fps = 1.0 / float(median_diff) if median_diff > 0 else 1.0
+        frames_indices = np.rint(timestamps * fps).astype(int)
+        return {
+            'total_num_frames': int(frames.shape[0]),
+            'fps': float(fps),
+            # assumes frames is laid out as (num_frames, height, width, ...) - TODO confirm
+            'height': int(frames.shape[1]),
+            'width': int(frames.shape[2]),
+            'frames_indices': frames_indices.tolist(),
+        }
+
+    def _prepare_mm_inputs(self, inputs: StdTemplateInputs) -> Tuple[Dict[str, Any], List[List[int]], List[List[int]]]:
+        """Run the image/video processors and tokenize the media expansions.
+
+        Returns:
+            `(media_inputs, image_expansions, video_expansions)`: the merged
+            processor outputs, plus one token-id list per image and per video
+            that is later substituted for the matching placeholder.
+
+        Raises:
+            ValueError: If more than one video is supplied.
+        """
+        media_inputs: Dict[str, Any] = {}
+        image_expansions: List[List[int]] = []
+        video_expansions: List[List[int]] = []
+        tokenizer = self.tokenizer
+
+        if inputs.images:
+            images = self._load_images(inputs.images)
+            image_inputs = self.processor.image_processor(images=images, return_tensors='pt')
+            for image_grid, image_num_crops in zip(image_inputs['image_grids'], image_inputs['image_num_crops']):
+                image_tokens = self.processor.get_image_tokens(image_grid.cpu().numpy(), int(image_num_crops.item()))
+                image_expansions.append(tokenizer.encode(''.join(image_tokens), add_special_tokens=False))
+            media_inputs.update(image_inputs)
+
+        if inputs.videos:
+            if len(inputs.videos) != 1:
+                raise ValueError('Molmo2 currently only supports single-video samples.')
+            frames, timestamps, _ = self._load_video_descriptor(inputs.videos[0])
+            video_metadata = [self._build_video_metadata(frames, timestamps)]
+            video_inputs = self.processor.video_processor(
+                videos=[frames],
+                video_metadata=video_metadata,
+                do_sample_frames=False,
+                return_tensors='pt',
+                return_metadata=True,
+            )
+            # The metadata is only used to build the video string; it is removed from the returned inputs.
+            video_metadata = video_inputs.pop('video_metadata')
+            for video_grid, metadata in zip(video_inputs['video_grids'], video_metadata):
+                video_string = self.processor.get_video_string(
+                    video_grid.cpu().numpy(),
+                    np.asarray(metadata.timestamps, dtype=np.float32),
+                )
+                video_expansions.append(tokenizer.encode(video_string, add_special_tokens=False))
+            media_inputs.update(video_inputs)
+
+        return media_inputs, image_expansions, video_expansions
+
+    def _replace_media_placeholders(self, token_ids: List[int], image_expansions: List[List[int]],
+                                    video_expansions: List[List[int]]) -> List[int]:
+        """Replace each placeholder token run in `token_ids` with its expansion.
+
+        Expansions are consumed in order of appearance, separately for images
+        and videos.
+
+        Raises:
+            ValueError: If the number of placeholders of either kind differs
+                from the number of expansions supplied for it.
+        """
+        image_placeholder = self.tokenizer.encode('<|image|>', add_special_tokens=False)
+        video_placeholder = self.tokenizer.encode('<|video|>', add_special_tokens=False)
+        replaced: List[int] = []
+        i = 0
+        image_idx = 0
+        video_idx = 0
+        while i < len(token_ids):
+            if video_placeholder and token_ids[i:i + len(video_placeholder)] == video_placeholder:
+                if video_idx >= len(video_expansions):
+                    raise ValueError('Encountered more <|video|> placeholders than available video expansions.')
+                replaced.extend(video_expansions[video_idx])
+                video_idx += 1
+                i += len(video_placeholder)
+                continue
+            if image_placeholder and token_ids[i:i + len(image_placeholder)] == image_placeholder:
+                if image_idx >= len(image_expansions):
+                    raise ValueError('Encountered more <|image|> placeholders than available image expansions.')
+                replaced.extend(image_expansions[image_idx])
+                image_idx += 1
+                i += len(image_placeholder)
+                continue
+            replaced.append(token_ids[i])
+            i += 1
+        if image_idx != len(image_expansions):
+            raise ValueError('Unused image expansions remain after placeholder replacement.')
+        if video_idx != len(video_expansions):
+            raise ValueError('Unused video expansions remain after placeholder replacement.')
+        return replaced
+
+    def _encode_text_with_media(self, text: str, image_expansions: List[List[int]],
+                                video_expansions: List[List[int]]) -> List[int]:
+        """Tokenize `text`, expand media placeholders and apply `processor.insert_bos`.
+
+        `insert_bos` is only called when the processor defines it; it is invoked
+        with the BOS/pad token ids when its signature takes four or more
+        parameters, and with `(ids, attention_mask)` only otherwise.
+        """
+        token_ids = self.tokenizer.encode(text, add_special_tokens=False)
+        token_ids = self._replace_media_placeholders(token_ids, image_expansions, video_expansions)
+        attention_mask = np.ones((1, len(token_ids)), dtype=np.int64)
+        token_ids_np = np.asarray([token_ids], dtype=np.int64)
+        if hasattr(self.processor, 'insert_bos'):
+            insert_bos = self.processor.insert_bos
+            try:
+                parameters = inspect.signature(insert_bos).parameters
+            except (TypeError, ValueError):
+                parameters = None
+            if parameters is not None and len(parameters) >= 4:
+                token_ids_np, _ = insert_bos(
+                    token_ids_np,
+                    attention_mask,
+                    self.tokenizer.bos_token_id,
+                    self.tokenizer.pad_token_id,
+                )
+            else:
+                token_ids_np, _ = insert_bos(token_ids_np, attention_mask)
+        return token_ids_np[0].tolist()
+
+    def _build_token_type_ids(self, input_ids: List[int]) -> List[int]:
+        """Return 1 for every id listed in `processor.image_token_ids`, else 0."""
+        raw_ids = getattr(self.processor, 'image_token_ids', None)
+        # A processor that defines the attribute as None must not break iteration.
+        image_token_ids = set() if raw_ids is None else {int(token_id) for token_id in raw_ids}
+        return [1 if token_id in image_token_ids else 0 for token_id in input_ids]
+
+    @staticmethod
+    def _extract_text_from_message(message: Dict[str, Any]) -> str:
+        """Return the message text: string content as-is, or the joined text parts."""
+        content = message.get('content')
+        if content is None:
+            return ''
+        if isinstance(content, str):
+            return content
+        return ''.join(part.get('text', '') for part in content if part.get('type') == 'text')
+
+    def _build_training_prompt_text(self, messages: List[Dict[str, Any]], full_text: str) -> str:
+        """Return the prompt part of `full_text`, i.e. everything before the final answer.
+
+        When `full_text` ends with the last message's text, that suffix is cut
+        off; otherwise the chat template is re-rendered without the last
+        message and with a generation prompt.
+        """
+        assistant_text = self._extract_text_from_message(messages[-1])
+        if assistant_text and full_text.endswith(assistant_text):
+            return full_text[:-len(assistant_text)]
+        return self.processor.apply_chat_template(messages[:-1], tokenize=False, add_generation_prompt=True)
+
+    def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
+        """Encode one sample into `input_ids`, `labels`, `loss_scale`, `token_type_ids` and media inputs.
+
+        In training mode with a trailing assistant message, prompt tokens are
+        masked with -100 in `labels` and get a `loss_scale` of 0; otherwise only
+        the generation prompt is encoded and `labels` / `loss_scale` are `None`.
+
+        Raises:
+            ValueError: If the encoded prompt is not a prefix of the full encoding.
+        """
+        messages = self._build_messages_for_processor(inputs)
+        media_inputs, image_expansions, video_expansions = self._prepare_mm_inputs(inputs)
+
+        if self.is_training and messages and messages[-1]['role'] == 'assistant':
+            full_text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
+            prompt_text = self._build_training_prompt_text(messages, full_text)
+            input_ids = self._encode_text_with_media(full_text, image_expansions, video_expansions)
+            prompt_ids = self._encode_text_with_media(prompt_text, image_expansions, video_expansions)
+            if input_ids[:len(prompt_ids)] != prompt_ids:
+                raise ValueError('Molmo2 prompt ids are not a prefix of the full training ids.')
+            labels = [-100] * len(prompt_ids) + input_ids[len(prompt_ids):]
+            loss_scale = [0.] * len(prompt_ids) + [1.] * (len(input_ids) - len(prompt_ids))
+        else:
+            prompt_text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+            input_ids = self._encode_text_with_media(prompt_text, image_expansions, video_expansions)
+            labels = None
+            loss_scale = None
+
+        encoded: Dict[str, Any] = {
+            'input_ids': input_ids,
+            'labels': labels,
+            'loss_scale': loss_scale,
+            'token_type_ids': self._build_token_type_ids(input_ids),
+        }
+        encoded.update(media_inputs)
+        return encoded
+
+    def _data_collator_mm_data(self, batch: List[Dict[str, Any]]) -> Dict[str, Any]:
+        """Extend the base collation with the Molmo2 media tensors gathered via `concat_tensor`."""
+        res = super()._data_collator_mm_data(batch)
+        for key in ['image_grids', 'video_grids', 'image_token_pooling', 'video_token_pooling', 'image_num_crops']:
+            value = self.concat_tensor(batch, key, 0)
+            if value is not None:
+                res[key] = value
+        video_metadata = self.gather_list(batch, 'video_metadata')
+        if video_metadata:
+            res['video_metadata'] = video_metadata
+        return res
+
+
+register_template(
+    TemplateMeta(
+        MLLMTemplateType.molmo2,
+        prefix=[],
+        prompt=['{{QUERY}}'],
+        chat_sep=None,
+        suffix=[],
+        template_cls=Molmo2Template,
+    ))
diff --git a/tests/general/test_model.py b/tests/general/test_model.py
index 7c832e6de7..89825b597f 100644
--- a/tests/general/test_model.py
+++ b/tests/general/test_model.py
@@ -1,5 +1,6 @@
 import os
 import torch
+import unittest
 
 from swift.utils import get_device
 
@@ -25,6 +26,25 @@ def test_modelscope_hub():
     model, tokenizer = get_model_processor('Qwen/Qwen2___5-Math-1___5B-Instruct/', load_model=False)
 
 
+class TestMolmo2Registration(unittest.TestCase):
+    """Checks that Molmo2 is registered as both a model type and a template type."""
+
+    def test_registration(self):
+        from swift.model import MODEL_MAPPING, MLLMModelType
+        from swift.template import TEMPLATE_MAPPING, TemplateType
+
+        meta = MODEL_MAPPING[MLLMModelType.molmo2]
+        self.assertEqual(meta.template, TemplateType.molmo2)
+        self.assertEqual(meta.model_arch.arch_name, 'molmo')
+        self.assertIn('Molmo2ForConditionalGeneration', meta.architectures)
+
+        # Flatten the HF model ids of every registered model across all groups.
+        registered_hf_ids = [model.hf_model_id for group in meta.model_groups for model in group.models]
+        for expected_id in ('allenai/Molmo2-4B', 'allenai/Molmo2-8B', 'allenai/Molmo2-O-7B'):
+            self.assertIn(expected_id, registered_hf_ids)
+        self.assertIn(TemplateType.molmo2, TEMPLATE_MAPPING)
+
+
 if __name__ == '__main__':
     test_qwen2()
     # test_modelscope_hub()