
Commit f942ec3

HuiyingLi and akoumpa authored
feat: support kimi-vl model (#1103)
Signed-off-by: HuiyingLi <willwin.lee@gmail.com>
Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
Co-authored-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
1 parent 6fa2b47 commit f942ec3

File tree

10 files changed: +2696 -5 lines changed


docs/model-coverage/vlm.md

Lines changed: 1 addition & 0 deletions
@@ -25,6 +25,7 @@ NeMo Automodel supports [AutoModelForImageTextToText](https://huggingface.co/doc
 
 | Model | Dataset | FSDP2 | PEFT | Example YAML |
 |------------------------------------|-----------------------------|------------|------------|--------------|
+| Kimi-VL-A3B-Instruct | cord-v2 | Supported | Supported | [kimi2vl_cordv2.yaml](../../examples/vlm_finetune/kimi/kimi2vl_cordv2.yaml) |
 | Gemma 3-4B & 27B | naver-clova-ix & rdr-items | Supported | Supported | [gemma3_vl_4b_cord_v2.yaml](../../examples/vlm_finetune/gemma3/gemma3_vl_4b_cord_v2.yaml) |
 | Gemma 3n | naver-clova-ix & rdr-items | Supported | Supported | [gemma3n_vl_4b_medpix.yaml](../../examples/vlm_finetune/gemma3n/gemma3n_vl_4b_medpix.yaml) |
 | Qwen2-VL-2B-Instruct & Qwen2.5-VL-3B-Instruct | cord-v2 | Supported | Supported | [qwen2_5_vl_3b_rdr.yaml](../../examples/vlm_finetune/qwen2_5/qwen2_5_vl_3b_rdr.yaml) |
examples/vlm_finetune/kimi/kimi2vl_cordv2.yaml

Lines changed: 120 additions & 0 deletions

# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

step_scheduler:
  global_batch_size: 16
  local_batch_size: 2
  ckpt_every_steps: 100
  val_every_steps: 100
  max_steps: 50

dist_env:
  backend: nccl
  timeout_minutes: 10

rng:
  _target_: nemo_automodel.components.training.rng.StatefulRNG
  seed: 42
  ranked: true

distributed:
  _target_: nemo_automodel.components.distributed.fsdp2.FSDP2Manager
  tp_size: 1
  cp_size: 1
  pp_size: 1
  dp_replicate_size: 1
  ep_size: 8
  sequence_parallel: false

autopipeline:
  _target_: nemo_automodel.components.distributed.pipelining.AutoPipeline
  pp_schedule: interleaved1f1b
  pp_microbatch_size: 2
  round_virtual_stages_to_pp_multiple: down
  scale_grads_in_schedule: false
  layers_per_stage: 7
  patch_inner_model: false
  patch_causal_lm_model: false

parallelizer:
  _target_: nemo_automodel.components.moe.parallelizer.parallelize_model
  activation_checkpointing: false

model:
  _target_: nemo_automodel.NeMoAutoModelForImageTextToText.from_pretrained
  pretrained_model_name_or_path: moonshotai/Kimi-VL-A3B-Instruct
  backend:
    _target_: nemo_automodel.components.models.common.BackendConfig
    attn: te
    linear: te
    rms_norm: te
    rope_fusion: true
    enable_deepep: true
    fake_balanced_gate: false
    enable_hf_state_dict_adapter: true
    enable_fsdp_optimizations: true

processor:
  _target_: transformers.AutoProcessor.from_pretrained
  pretrained_model_name_or_path: moonshotai/Kimi-VL-A3B-Instruct
  trust_remote_code: true

checkpoint:
  enabled: true
  checkpoint_dir: vlm_checkpoints/kimi2vl/
  model_save_format: safetensors
  save_consolidated: true

loss_fn:
  _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy
  fp32_upcast: false

dataset:
  _target_: nemo_automodel.components.datasets.vlm.datasets.make_cord_v2_dataset
  path_or_dataset: naver-clova-ix/cord-v2
  split: train

dataloader:
  _target_: torchdata.stateful_dataloader.StatefulDataLoader
  num_workers: 0
  pin_memory: true
  collate_fn:
    _target_: nemo_automodel.components.datasets.vlm.collate_fns.kimi_vl_collate_fn
    max_length: 2048

validation_dataset:
  _target_: nemo_automodel.components.datasets.vlm.datasets.make_cord_v2_dataset
  path_or_dataset: naver-clova-ix/cord-v2
  split: validation

validation_dataloader:
  _target_: torchdata.stateful_dataloader.StatefulDataLoader

optimizer:
  _target_: torch.optim.AdamW
  lr: 1.0e-05
  weight_decay: 0.01
  betas:
    - 0.9
    - 0.95

freeze_config:
  freeze_embeddings: true
  freeze_vision_tower: true
  freeze_language_model: false

# wandb:
#   project: <your_project_name>
#   entity: <your_entity_name>
#   name: kimi2vl_finetune
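Every section above wires a component through a dotted `_target_` path that is resolved and called with the sibling keys as keyword arguments, with nested `_target_` dicts instantiated first (as with `model.backend` and `dataloader.collate_fn`). A rough sketch of that pattern, assuming a hydra-style loader; `locate` and `instantiate` below are illustrative helpers, not NeMo Automodel's actual config machinery:

import importlib
from typing import Any


def locate(path: str) -> Any:
    """Resolve a dotted path like 'torch.optim.AdamW' to a Python object."""
    parts = path.split(".")
    for i in range(len(parts), 0, -1):
        try:
            obj = importlib.import_module(".".join(parts[:i]))
        except ModuleNotFoundError:
            continue
        # Walk the remaining attributes, e.g. AutoProcessor.from_pretrained.
        for attr in parts[i:]:
            obj = getattr(obj, attr)
        return obj
    raise ImportError(f"cannot locate {path!r}")


def instantiate(cfg: dict) -> Any:
    """Call the _target_ with sibling keys, instantiating nested configs first."""
    cfg = dict(cfg)  # avoid mutating the caller's config
    target = locate(cfg.pop("_target_"))
    kwargs = {
        k: instantiate(v) if isinstance(v, dict) and "_target_" in v else v
        for k, v in cfg.items()
    }
    return target(**kwargs)


# e.g. the `processor` section above would materialize as:
processor = instantiate({
    "_target_": "transformers.AutoProcessor.from_pretrained",
    "pretrained_model_name_or_path": "moonshotai/Kimi-VL-A3B-Instruct",
    "trust_remote_code": True,
})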

nemo_automodel/components/datasets/vlm/collate_fns.py

Lines changed: 50 additions & 0 deletions
@@ -253,6 +253,55 @@ def has_data(modality_list):
     return batch
 
 
+def kimi_vl_collate_fn(
+    examples: Sequence[Dict[str, Any]],
+    processor,
+    max_length: Optional[int] = None,
+) -> Dict[str, torch.Tensor]:
+    """Collate function for KimiVL processors."""
+    conversations = [example["conversation"] for example in examples]
+    texts = [
+        processor.apply_chat_template(conversation, add_generation_prompt=False, tokenize=False)
+        for conversation in conversations
+    ]
+
+    images: List[Any] = []
+    for conversation in conversations:
+        for message in conversation:
+            content = message.get("content")
+            if isinstance(content, list):
+                for item in content:
+                    if isinstance(item, dict) and item.get("type") == "image":
+                        images.append(item.get("image"))
+
+    processor_kwargs = {
+        "text": texts,
+        "return_tensors": "pt",
+        "padding": True,
+        "truncation": True,
+    }
+    if max_length is not None:
+        processor_kwargs["max_length"] = max_length
+        processor_kwargs["padding"] = "max_length"
+    if images:
+        processor_kwargs["images"] = images
+
+    batch = processor(**processor_kwargs)
+
+    labels = build_labels(
+        batch["input_ids"],
+        conversations,
+        processor,
+    )
+    batch["labels"] = labels[:, 1:]
+
+    input_shape = batch["input_ids"].shape
+    for key, value in list(batch.items()):
+        if isinstance(value, torch.Tensor) and value.shape == input_shape:
+            batch[key] = value[:, :-1]
+    return batch
+
+
 def nemotron_parse_collate_fn(
     examples: Sequence[Dict[str, Any]],
     processor,

@@ -383,6 +432,7 @@ def default_collate_fn(
 COLLATE_FNS = {
     "Qwen2_5_VLProcessor": qwen2_5_collate_fn,
     "Qwen3OmniMoeProcessor": qwen3_omni_collate_fn,
+    "KimiVLProcessor": kimi_vl_collate_fn,
     "NemotronParseProcessor": nemotron_parse_collate_fn,
     "default": default_collate_fn,
 }
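`COLLATE_FNS` dispatches on the processor class name, so batches built with a `KimiVLProcessor` now route to `kimi_vl_collate_fn`. The function expects each example to carry a `conversation` in chat-template form, with images embedded as `{"type": "image", "image": ...}` content items; labels are the input ids shifted by one, and every sequence-shaped tensor is trimmed by one position to match. A minimal sketch of that input schema and a direct call, assuming a placeholder PIL image and the checkpoint from the YAML above; the exact set of returned tensor keys depends on the processor:

from functools import partial

from PIL import Image
from transformers import AutoProcessor

from nemo_automodel.components.datasets.vlm.collate_fns import kimi_vl_collate_fn

processor = AutoProcessor.from_pretrained(
    "moonshotai/Kimi-VL-A3B-Instruct", trust_remote_code=True
)

example = {
    "conversation": [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": Image.new("RGB", (64, 64))},  # placeholder image
                {"type": "text", "text": "What does the receipt say?"},
            ],
        },
        {"role": "assistant", "content": [{"type": "text", "text": "A coffee order."}]},
    ]
}

# Roughly the partial that the YAML's dataloader.collate_fn section configures.
collate = partial(kimi_vl_collate_fn, processor=processor, max_length=2048)
batch = collate([example])
# batch holds input_ids, labels, and (per the processor) vision tensors,
# with inputs trimmed by one position to align with the shifted labels.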

nemo_automodel/components/models/deepseek_v3/model.py

Lines changed: 11 additions & 3 deletions
@@ -155,16 +155,24 @@ def __init__(
 
     def forward(
         self,
-        input_ids: torch.Tensor,
+        input_ids: torch.Tensor | None = None,
         *,
+        inputs_embeds: torch.Tensor | None = None,
         position_ids: torch.Tensor | None = None,
         attention_mask: torch.Tensor | None = None,
         padding_mask: torch.Tensor | None = None,
         **attn_kwargs: Any,
     ) -> tuple[torch.Tensor, torch.Tensor | None]:
+        if (input_ids is None) == (inputs_embeds is None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids) if self.embed_tokens is not None else input_ids
+
         if position_ids is None:
+            seq_len = inputs_embeds.shape[1]
             position_ids = (
-                torch.arange(0, input_ids.shape[1], device=input_ids.device).unsqueeze(0).expand(input_ids.shape[0], -1)
+                torch.arange(seq_len, device=inputs_embeds.device).unsqueeze(0).expand(inputs_embeds.shape[0], -1)
             )
 
         with torch.no_grad():
@@ -176,7 +184,7 @@ def forward(
                 cp_size=attn_kwargs.get("cp_size", 1),
             )
 
-        h = self.embed_tokens(input_ids) if self.embed_tokens is not None else input_ids
+        h = inputs_embeds
 
         # Apply the transformer layers.
         for layer in self.layers.values():
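The signature change makes `input_ids` and `inputs_embeds` mutually exclusive: callers either pass token ids, which are embedded internally as before, or pass pre-computed embeddings directly. The second path is what lets a vision-language wrapper splice image features into the token embeddings before the transformer layers run. A hedged sketch of such a caller; the wrapper function, tensor shapes, and `image_token_id` convention are illustrative assumptions, not the actual Kimi-VL wiring:

import torch


def forward_with_vision(language_model, vision_tower, input_ids, pixel_values, image_token_id):
    # Embed text tokens with the language model's own embedding table.
    inputs_embeds = language_model.embed_tokens(input_ids)   # (B, S, H)

    # Encode images; assume one feature vector per <image> placeholder token.
    image_embeds = vision_tower(pixel_values)                # (N, H)

    # Scatter image features over the placeholder positions. The number of
    # placeholder tokens across the batch must equal N for the copy to line up.
    mask = (input_ids == image_token_id).unsqueeze(-1)       # (B, S, 1)
    inputs_embeds = inputs_embeds.masked_scatter(mask, image_embeds.to(inputs_embeds.dtype))

    # After this commit, exactly one of input_ids / inputs_embeds may be passed.
    return language_model(inputs_embeds=inputs_embeds)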
Lines changed: 13 additions & 0 deletions
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
