
Commit b2db55e

HuiyingLi and adil-a authored
feat: add ministral3 configs and improve tie_emb detection (#915)
Signed-off-by: HuiyingLi <[email protected]>
Co-authored-by: Adil <[email protected]>
1 parent ca5651e commit b2db55e

8 files changed: 398 additions & 12 deletions

Lines changed: 95 additions & 0 deletions
@@ -0,0 +1,95 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Configuration for fine-tuning ministral 3 14b

step_scheduler:
  global_batch_size: 8
  local_batch_size: 1
  ckpt_every_steps: 100
  val_every_steps: 100  # will run every x number of gradient steps
  num_epochs: 1

dist_env:
  backend: nccl
  timeout_minutes: 10

rng:
  _target_: nemo_automodel.components.training.rng.StatefulRNG
  seed: 42
  ranked: true

model:
  _target_: nemo_automodel.NeMoAutoModelForImageTextToText.from_pretrained
  pretrained_model_name_or_path: mistralai/Ministral-3-14B-Reasoning-2512
  torch_dtype: torch.bfloat16
  attn_implementation: eager

checkpoint:
  enabled: true
  checkpoint_dir: vlm_checkpoints/
  model_save_format: safetensors
  save_consolidated: True

distributed:
  _target_: nemo_automodel.components.distributed.fsdp2.FSDP2Manager
  dp_size: none
  tp_size: 1
  cp_size: 1
  sequence_parallel: false

loss_fn:
  _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy

dataset:
  _target_: nemo_automodel.components.datasets.vlm.datasets.make_medpix_dataset
  path_or_dataset: mmoukouba/MedPix-VQA
  split: train[:1000]

dataloader:
  _target_: torchdata.stateful_dataloader.StatefulDataLoader
  num_workers: 0
  collate_fn:
    _target_: nemo_automodel.components.datasets.vlm.collate_fns.default_collate_fn

validation_dataset:
  _target_: nemo_automodel.components.datasets.vlm.datasets.make_medpix_dataset
  path_or_dataset: mmoukouba/MedPix-VQA
  split: validation[:500]

validation_dataloader:
  _target_: torchdata.stateful_dataloader.StatefulDataLoader
  num_workers: 0
  collate_fn:
    _target_: nemo_automodel.components.datasets.vlm.collate_fns.default_collate_fn

optimizer:
  _target_: torch.optim.AdamW
  lr: 1e-5
  weight_decay: 0.01
  betas: [0.9, 0.95]

freeze_config:
  freeze_embeddings: true
  freeze_vision_tower: true
  freeze_language_model: false

# Uncomment and configure for W&B logging
# wandb:
#   project:
#   entity:
#   name:
#   save_dir:
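Each top-level block in this config (and in the two sibling configs below) follows a `_target_` convention: the dotted path names a callable and the sibling keys become its keyword arguments when the recipe builds that component. As a rough, generic sketch of that pattern (this is not the actual NeMo Automodel config loader, and the file name below is hypothetical), the `dataset:` block above could be resolved along these lines:

import importlib

import yaml


def instantiate(block: dict):
    # Split the `_target_` dotted path into module and attribute, import it, and
    # call it with the remaining keys as keyword arguments.
    block = dict(block)
    module_path, _, attr = block.pop("_target_").rpartition(".")
    target = getattr(importlib.import_module(module_path), attr)
    # Nested blocks that carry their own `_target_` are instantiated recursively.
    kwargs = {
        k: instantiate(v) if isinstance(v, dict) and "_target_" in v else v
        for k, v in block.items()
    }
    return target(**kwargs)


cfg = yaml.safe_load(open("ministral3_14b.yaml"))  # hypothetical file name for the config above
train_ds = instantiate(cfg["dataset"])

Note that values such as `torch_dtype: torch.bfloat16` and `dp_size: none` arrive from YAML as plain strings; the real loader presumably converts them before use, which this sketch does not attempt.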
Lines changed: 95 additions & 0 deletions
@@ -0,0 +1,95 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Configuration for fine-tuning ministral 3 3b

step_scheduler:
  global_batch_size: 8
  local_batch_size: 1
  ckpt_every_steps: 100
  val_every_steps: 100  # will run every x number of gradient steps
  num_epochs: 1

dist_env:
  backend: nccl
  timeout_minutes: 10

rng:
  _target_: nemo_automodel.components.training.rng.StatefulRNG
  seed: 42
  ranked: true

model:
  _target_: nemo_automodel.NeMoAutoModelForImageTextToText.from_pretrained
  pretrained_model_name_or_path: mistralai/Ministral-3-3B-Reasoning-2512
  torch_dtype: torch.bfloat16
  attn_implementation: eager

checkpoint:
  enabled: true
  checkpoint_dir: vlm_checkpoints/
  model_save_format: safetensors
  save_consolidated: True

distributed:
  _target_: nemo_automodel.components.distributed.fsdp2.FSDP2Manager
  dp_size: none
  tp_size: 1
  cp_size: 1
  sequence_parallel: false

loss_fn:
  _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy

dataset:
  _target_: nemo_automodel.components.datasets.vlm.datasets.make_medpix_dataset
  path_or_dataset: mmoukouba/MedPix-VQA
  split: train[:1000]

dataloader:
  _target_: torchdata.stateful_dataloader.StatefulDataLoader
  num_workers: 0
  collate_fn:
    _target_: nemo_automodel.components.datasets.vlm.collate_fns.default_collate_fn

validation_dataset:
  _target_: nemo_automodel.components.datasets.vlm.datasets.make_medpix_dataset
  path_or_dataset: mmoukouba/MedPix-VQA
  split: validation[:500]

validation_dataloader:
  _target_: torchdata.stateful_dataloader.StatefulDataLoader
  num_workers: 0
  collate_fn:
    _target_: nemo_automodel.components.datasets.vlm.collate_fns.default_collate_fn

optimizer:
  _target_: torch.optim.AdamW
  lr: 1e-5
  weight_decay: 0.01
  betas: [0.9, 0.95]

freeze_config:
  freeze_embeddings: true
  freeze_vision_tower: true
  freeze_language_model: false

# Uncomment and configure for W&B logging
# wandb:
#   project:
#   entity:
#   name:
#   save_dir:
Lines changed: 95 additions & 0 deletions
@@ -0,0 +1,95 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Configuration for fine-tuning ministral 3 8b

step_scheduler:
  global_batch_size: 8
  local_batch_size: 1
  ckpt_every_steps: 100
  val_every_steps: 100  # will run every x number of gradient steps
  num_epochs: 1

dist_env:
  backend: nccl
  timeout_minutes: 10

rng:
  _target_: nemo_automodel.components.training.rng.StatefulRNG
  seed: 42
  ranked: true

model:
  _target_: nemo_automodel.NeMoAutoModelForImageTextToText.from_pretrained
  pretrained_model_name_or_path: mistralai/Ministral-3-8B-Reasoning-2512
  torch_dtype: torch.bfloat16
  attn_implementation: eager

checkpoint:
  enabled: true
  checkpoint_dir: vlm_checkpoints/
  model_save_format: safetensors
  save_consolidated: True

distributed:
  _target_: nemo_automodel.components.distributed.fsdp2.FSDP2Manager
  dp_size: none
  tp_size: 1
  cp_size: 1
  sequence_parallel: false

loss_fn:
  _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy

dataset:
  _target_: nemo_automodel.components.datasets.vlm.datasets.make_medpix_dataset
  path_or_dataset: mmoukouba/MedPix-VQA
  split: train[:1000]

dataloader:
  _target_: torchdata.stateful_dataloader.StatefulDataLoader
  num_workers: 0
  collate_fn:
    _target_: nemo_automodel.components.datasets.vlm.collate_fns.default_collate_fn

validation_dataset:
  _target_: nemo_automodel.components.datasets.vlm.datasets.make_medpix_dataset
  path_or_dataset: mmoukouba/MedPix-VQA
  split: validation[:500]

validation_dataloader:
  _target_: torchdata.stateful_dataloader.StatefulDataLoader
  num_workers: 0
  collate_fn:
    _target_: nemo_automodel.components.datasets.vlm.collate_fns.default_collate_fn

optimizer:
  _target_: torch.optim.AdamW
  lr: 1e-5
  weight_decay: 0.01
  betas: [0.9, 0.95]

freeze_config:
  freeze_embeddings: true
  freeze_vision_tower: true
  freeze_language_model: false

# Uncomment and configure for W&B logging
# wandb:
#   project:
#   entity:
#   name:
#   save_dir:
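The `freeze_config:` block in all three configs trains only the language model while keeping the input embeddings and the vision tower frozen. As a minimal illustration of what such a setting implies (the sub-module names below follow common Hugging Face VLM conventions and are assumptions, not the actual NeMo Automodel freeze implementation):

import torch.nn as nn


def apply_freeze_config(
    model: nn.Module,
    freeze_embeddings: bool,
    freeze_vision_tower: bool,
    freeze_language_model: bool,
) -> None:
    # Freeze the input embeddings (standard HF accessor, assumed to exist on the model).
    if freeze_embeddings and hasattr(model, "get_input_embeddings"):
        for p in model.get_input_embeddings().parameters():
            p.requires_grad_(False)
    # Freeze the vision encoder; `vision_tower` is a common VLM attribute name, assumed here.
    if freeze_vision_tower and hasattr(model, "vision_tower"):
        for p in model.vision_tower.parameters():
            p.requires_grad_(False)
    # Train (or freeze) the language backbone; `language_model` is likewise an assumed name.
    if hasattr(model, "language_model"):
        for p in model.language_model.parameters():
            p.requires_grad_(not freeze_language_model)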

nemo_automodel/components/checkpoint/checkpointing.py

Lines changed: 2 additions & 1 deletion
@@ -39,6 +39,7 @@
 )
 from nemo_automodel.components.checkpoint.addons import ConsolidatedHFAddon, PeftAddon
 from nemo_automodel.components.checkpoint.stateful_wrappers import ModelState, OptimizerState
+from nemo_automodel.components.checkpoint.utils import is_tied_word_embeddings

 if TYPE_CHECKING:
     from peft import PeftConfig

@@ -374,7 +375,7 @@ def load_base_model(
             key_mapping=getattr(model, "_checkpoint_conversion_mapping", None),
         )

-        is_tied_lm_head = getattr(getattr(model, "config", {}), "tie_word_embeddings", False)
+        is_tied_lm_head = is_tied_word_embeddings(model)
         self.config.original_model_root_dir = root_dir
         if hasattr(model, "tie_weights") and is_tied_lm_head:
             model.tie_weights()
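This one-line change matters for multimodal checkpoints whose `tie_word_embeddings` flag lives on a nested text config rather than on the top-level config. A small self-contained comparison of what the old and new lookups see, using SimpleNamespace stand-ins rather than real transformers configs:

from types import SimpleNamespace

# A composite-style config where the flag lives on the nested text config.
text_cfg = SimpleNamespace(tie_word_embeddings=True)
cfg = SimpleNamespace(text_config=text_cfg, get_text_config=lambda: text_cfg)
model = SimpleNamespace(config=cfg)

# Old detection: looks only at the top-level attribute and misses the nested flag.
old = getattr(getattr(model, "config", {}), "tie_word_embeddings", False)
print(old)  # False

# New detection defers to config.get_text_config(); this line mirrors the lookup inside
# is_tied_word_embeddings (the real helper takes an nn.Module, not a bare config).
new = bool(getattr(cfg.get_text_config(), "tie_word_embeddings", getattr(cfg, "tie_word_embeddings", False)))
print(new)  # True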

nemo_automodel/components/checkpoint/stateful_wrappers.py

Lines changed: 3 additions & 10 deletions
@@ -24,6 +24,8 @@
     set_optimizer_state_dict,
 )

+from nemo_automodel.components.checkpoint.utils import is_tied_word_embeddings
+
 _PREFIX = "model."


@@ -92,16 +94,7 @@ def __init__(
         - ["score."] for some classification heads
         """
         self.model = [model] if isinstance(model, torch.nn.Module) else model
-        self.is_tied_lm_head = getattr(getattr(self.model[0], "config", {}), "tie_word_embeddings", False)
-
-        non_tied_lm_head_models = {
-            "Qwen3OmniMoeThinkerForConditionalGeneration",  # complicated config structure
-            "InternVLForConditionalGeneration",  # even tho config says tie_word_embeddings=True, it's not
-        }
-        for m in non_tied_lm_head_models:
-            if m in type(self.model[0]).__name__:
-                self.is_tied_lm_head = False
-                break
+        self.is_tied_lm_head = is_tied_word_embeddings(self.model[0])

         if self.is_tied_lm_head:
             _, lm_head_param_name = _get_lm_head_weight_and_name(self.model[0])

The local exception list is removed from ModelState: the Qwen3 Omni special case moves into the shared is_tied_word_embeddings helper (so ModelState and load_base_model use the same detection logic), while the InternVL entry is dropped from the list.
nemo_automodel/components/checkpoint/utils.py

Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch.nn as nn


def is_tied_word_embeddings(model: nn.Module) -> bool:
    """
    Check if the model's word embeddings are tied.

    Args:
        model (nn.Module): The model to check.

    Returns:
        bool: True if the model's word embeddings are tied, False otherwise.
    """
    non_tied_lm_head_models = {
        "Qwen3OmniMoeThinkerForConditionalGeneration",  # complicated config structure
    }
    model_class_name = type(model).__name__
    for m in non_tied_lm_head_models:
        if m in model_class_name:
            return False
    config = getattr(model, "config", None)
    text_config = getattr(config, "get_text_config", lambda: None)()
    return bool(getattr(text_config, "tie_word_embeddings", getattr(config, "tie_word_embeddings", False)))
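A minimal usage sketch of the new helper; the toy classes below are made up for illustration and only mimic the shape of a composite (multimodal) Hugging Face config:

import torch.nn as nn

from nemo_automodel.components.checkpoint.utils import is_tied_word_embeddings


class _TextCfg:
    tie_word_embeddings = True


class _Cfg:
    def get_text_config(self):
        return _TextCfg()


class _TiedToy(nn.Module):
    """Hypothetical model whose tie flag lives on a nested text config."""

    def __init__(self):
        super().__init__()
        self.config = _Cfg()


class Qwen3OmniMoeThinkerForConditionalGeneration(_TiedToy):
    """Same config, but the class name is on the helper's exclusion list."""


print(is_tied_word_embeddings(_TiedToy()))  # True: nested text config reports tied embeddings
print(is_tied_word_embeddings(Qwen3OmniMoeThinkerForConditionalGeneration()))  # False: excluded by class name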

tests/unit_tests/checkpoint/test_addons.py

Lines changed: 1 addition & 1 deletion
@@ -89,7 +89,7 @@ def __init__(self):
     _DummyModel.__name__ = "Qwen3OmniMoeThinkerForConditionalGeneration"

     model = _DummyModel()
-    state = ModelState(model)
+    state = ModelState([model])

     assert state.is_tied_lm_head is False
     assert not hasattr(state, "lm_head_param_name")

Wrapping the dummy in a list makes ModelState treat it as a pre-built model list, so `self.model[0]` resolves to the dummy regardless of whether `_DummyModel` subclasses `nn.Module`.
