@@ -606,14 +606,22 @@ def _maybe_build_consolidated_index(
             fqn_to_file_index_mapping = get_fqn_to_file_index_mapping(
                 index_path, getattr(model, "_checkpoint_conversion_mapping", None)
             )
-            # some HF models like Moonlight-16B have non-persistent buffers in the base checkpoint
-            # however, HF initializes buffers with persistent=False, so we need to make sure these
-            # buffer keys are not saved during checkpointing
-            keys_to_remove = list(set(fqn_to_file_index_mapping.keys()) - set(self.config.model_state_dict_keys))
-            if model_state.is_tied_lm_head:
-                keys_to_remove.append(model_state.lm_head_param_name)
-            for key in keys_to_remove:
-                fqn_to_file_index_mapping.pop(key, None)
+            model_part = model_state.model[0]
+            config = getattr(model_part, "config", None)
+            model_type = getattr(config, "model_type", None)
+            if model_type and requires_tensor_merging(model_type) and not hasattr(model_part, "state_dict_adapter"):
+                # in this case, Transformers performed weight conversion so we will save the converted format in the checkpoint
+                num_shards = max(fqn_to_file_index_mapping.values()) if fqn_to_file_index_mapping else 1
+                fqn_to_file_index_mapping = _equally_divide_layers(num_shards, self.config.model_state_dict_keys)
+            else:
+                # some HF models like Moonlight-16B have non-persistent buffers in the base checkpoint
+                # however, HF initializes buffers with persistent=False, so we need to make sure these
+                # buffer keys are not saved during checkpointing
+                keys_to_remove = list(set(fqn_to_file_index_mapping.keys()) - set(self.config.model_state_dict_keys))
+                if model_state.is_tied_lm_head:
+                    keys_to_remove.append(model_state.lm_head_param_name)
+                for key in keys_to_remove:
+                    fqn_to_file_index_mapping.pop(key, None)
         else:
             fqn_to_file_index_mapping = {k: 1 for k in state_dict.keys()}
 
@@ -1055,6 +1063,29 @@ def _maybe_adapt_state_dict_to_hf(
     return state_dict
 
 
+def _equally_divide_layers(num_shards: int, keys: list[str]) -> dict[str, int]:
+    """
+    Equally divide the state dict keys into num_shards shards.
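+
+    Example (illustrative, derived from the logic below): with num_shards=2 and
+    keys=["a", "b", "c"], this returns {"a": 1, "b": 1, "c": 2}; the first
+    `remainder` shards each take one extra key.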
+    """
+    if num_shards <= 0:
+        raise ValueError(f"num_shards must be > 0, got {num_shards}")
+
+    num_layers = len(keys)
+    if num_layers == 0:
+        return {}
+
+    layers_per_shard, remainder = divmod(num_layers, num_shards)
+    fqn_to_index_mapping: dict[str, int] = {}
+    start = 0
+    for shard_index in range(1, num_shards + 1):
+        extra = 1 if shard_index <= remainder else 0
+        end = start + layers_per_shard + extra
+        for key in keys[start:end]:
+            fqn_to_index_mapping[key] = shard_index
+        start = end
+    return fqn_to_index_mapping
+
+
 def _maybe_adapt_state_dict_from_hf(
     model_part: nn.Module, state_dict: dict[str, torch.Tensor], moe_mesh: Optional[DeviceMesh] = None
 ) -> dict[str, torch.Tensor]: