
Commit ad895fa

fixing tests for v5
Signed-off-by: adil-a <adil.asif2000@hotmail.com>
1 parent d537dca commit ad895fa

File tree

20 files changed: +763 -1952 lines changed

nemo_automodel/_transformers/auto_model.py

Lines changed: 1 addition & 0 deletions
@@ -883,6 +883,7 @@ def _retry(**override):
             use_liger_kernel=override.get("use_liger_kernel", use_liger_kernel),
             use_sdpa_patching=override.get("use_sdpa_patching", use_sdpa_patching),
             sdpa_method=sdpa_method,
+            force_hf=force_hf,
             autopipeline=autopipeline,
             parallelize_fn=parallelize_fn,
             peft_config=peft_config,

nemo_automodel/_transformers/tokenization/nemo_auto_tokenizer.py

Lines changed: 5 additions & 2 deletions
@@ -35,8 +35,11 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, add_bos_token=Tru
             add_eos_token: Whether to add EOS token (default: True)
         """
         tokenizer = super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
-        tokenizer.add_bos_token = add_bos_token
-        tokenizer.add_eos_token = add_eos_token
+
+        if add_bos_token and getattr(tokenizer, "bos_token", None) is not None:
+            tokenizer.add_bos_token = add_bos_token
+        if add_eos_token and getattr(tokenizer, "eos_token", None) is not None:
+            tokenizer.add_eos_token = add_eos_token
         # Keep the wrapper class name at runtime, but remember the original HF tokenizer class
         # so we can save an HF-compatible `tokenizer_class` in `save_pretrained()`.
         base_tokenizer_cls = type(tokenizer)
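
The guard above only assigns the flags when the underlying tokenizer actually defines the corresponding special token. A standalone sketch of the same pattern, using a made-up stand-in object rather than a real HF tokenizer:

    from types import SimpleNamespace

    # Hypothetical tokenizer-like object: has an EOS token but no BOS token.
    tok = SimpleNamespace(bos_token=None, eos_token="</s>")

    add_bos_token, add_eos_token = True, True
    if add_bos_token and getattr(tok, "bos_token", None) is not None:
        tok.add_bos_token = add_bos_token   # skipped: no BOS token defined
    if add_eos_token and getattr(tok, "eos_token", None) is not None:
        tok.add_eos_token = add_eos_token   # set: EOS token exists

    print(hasattr(tok, "add_bos_token"), hasattr(tok, "add_eos_token"))  # False True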

nemo_automodel/components/checkpoint/checkpointing.py

Lines changed: 39 additions & 8 deletions
@@ -606,14 +606,22 @@ def _maybe_build_consolidated_index(
             fqn_to_file_index_mapping = get_fqn_to_file_index_mapping(
                 index_path, getattr(model, "_checkpoint_conversion_mapping", None)
             )
-            # some HF models like Moonlight-16B have non-persistent buffers in the base checkpoint
-            # however, HF initializes buffers with persistent=False, so we need to make sure these
-            # buffer keys are not saved during checkpointing
-            keys_to_remove = list(set(fqn_to_file_index_mapping.keys()) - set(self.config.model_state_dict_keys))
-            if model_state.is_tied_lm_head:
-                keys_to_remove.append(model_state.lm_head_param_name)
-            for key in keys_to_remove:
-                fqn_to_file_index_mapping.pop(key, None)
+            model_part = model_state.model[0]
+            config = getattr(model_part, "config", None)
+            model_type = getattr(config, "model_type", None)
+            if model_type and requires_tensor_merging(model_type) and not hasattr(model_part, "state_dict_adapter"):
+                # in this case, Transformers performed weight conversion so we will save the converted format in the checkpoint
+                num_shards = max(fqn_to_file_index_mapping.values()) if fqn_to_file_index_mapping else 1
+                fqn_to_file_index_mapping = _equally_divide_layers(num_shards, self.config.model_state_dict_keys)
+            else:
+                # some HF models like Moonlight-16B have non-persistent buffers in the base checkpoint
+                # however, HF initializes buffers with persistent=False, so we need to make sure these
+                # buffer keys are not saved during checkpointing
+                keys_to_remove = list(set(fqn_to_file_index_mapping.keys()) - set(self.config.model_state_dict_keys))
+                if model_state.is_tied_lm_head:
+                    keys_to_remove.append(model_state.lm_head_param_name)
+                for key in keys_to_remove:
+                    fqn_to_file_index_mapping.pop(key, None)
         else:
             fqn_to_file_index_mapping = {k: 1 for k in state_dict.keys()}

@@ -1055,6 +1063,29 @@ def _maybe_adapt_state_dict_to_hf(
     return state_dict


+def _equally_divide_layers(num_shards: int, keys: list[str]) -> dict[str, int]:
+    """
+    Equally divide the state dict keys into num_shards shards.
+    """
+    if num_shards <= 0:
+        raise ValueError(f"num_shards must be > 0, got {num_shards}")
+
+    num_layers = len(keys)
+    if num_layers == 0:
+        return {}
+
+    layers_per_shard, remainder = divmod(num_layers, num_shards)
+    fqn_to_index_mapping: dict[str, int] = {}
+    start = 0
+    for shard_index in range(1, num_shards + 1):
+        extra = 1 if shard_index <= remainder else 0
+        end = start + layers_per_shard + extra
+        for key in keys[start:end]:
+            fqn_to_index_mapping[key] = shard_index
+        start = end
+    return fqn_to_index_mapping
+
+
 def _maybe_adapt_state_dict_from_hf(
     model_part: nn.Module, state_dict: dict[str, torch.Tensor], moe_mesh: Optional[DeviceMesh] = None
 ) -> dict[str, torch.Tensor]:
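
As a quick sanity check of the sharding arithmetic in `_equally_divide_layers`, a hypothetical call (the key names below are made up; the import path assumes the helper lives in this module as shown in the diff):

    from nemo_automodel.components.checkpoint.checkpointing import _equally_divide_layers

    keys = [f"model.layers.{i}.weight" for i in range(5)]  # made-up FQNs
    print(_equally_divide_layers(num_shards=2, keys=keys))
    # {'model.layers.0.weight': 1, 'model.layers.1.weight': 1, 'model.layers.2.weight': 1,
    #  'model.layers.3.weight': 2, 'model.layers.4.weight': 2}

With 5 keys over 2 shards, `divmod` gives 2 keys per shard with a remainder of 1, so the first shard receives the extra key.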

nemo_automodel/components/distributed/parallelizer.py

Lines changed: 24 additions & 4 deletions
@@ -548,15 +548,35 @@ def get_hf_tp_shard_plan(model):
     if f"{model_prefix}.embed_tokens" not in hf_tp_plan:
         hf_tp_plan[f"{model_prefix}.embed_tokens"] = "rowwise_rep"

+    # Build translated plan, skipping HF's MoE-related styles.
+    #
+    # HuggingFace transformers v5 introduced these styles for MoE models, but they do NOT
+    # implement true expert parallelism (where each rank stores only a subset of experts).
+    # Instead, HF's approach:
+    # - local_colwise/local_rowwise: Store expert weights as local tensors (NOT sharded).
+    #   Despite the names, these do NOT perform tensor parallelism on the experts.
+    #   Each rank stores ALL expert weights (full shape), which is memory inefficient.
+    # - ep_router: Modifies routing so each rank only computes with a subset of experts.
+    #   This distributes compute but not memory.
+    # - gather: All-reduces expert outputs across ranks.
+    #
+    # Since these styles result in replicated expert weights (not sharded), and we don't
+    # support HF's routing modification approach, we skip them entirely. The experts will
+    # be replicated across all ranks and computed redundantly, which is correct but not
+    # memory/compute efficient for large MoE models.
+    _hf_moe_styles = {"ep_router", "local_colwise", "local_rowwise", "gather"}
+    translated_plan = {}
     for k, v in hf_tp_plan.items():
+        if isinstance(v, str) and (v.startswith("ep_") or v in _hf_moe_styles):
+            continue
         # speed up the tp plan for lm_head
         if (k == "lm_head" or k == "language_model.lm_head") and v == "colwise_rep":
-            hf_tp_plan[k] = ColwiseParallel(output_layouts=Shard(-1), use_local_output=False)
+            translated_plan[k] = ColwiseParallel(output_layouts=Shard(-1), use_local_output=False)
         else:
-            hf_tp_plan[k] = translate_to_torch_parallel_style(v)
+            translated_plan[k] = translate_to_torch_parallel_style(v)

-    logger.info(f"Hugging Face tp plan: {hf_tp_plan}")
-    return hf_tp_plan
+    logger.info(f"Hugging Face tp plan: {translated_plan}")
+    return translated_plan


 def import_class_from_path(name: str) -> Any:
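
To illustrate which entries the new filter drops, a minimal sketch of just the skip logic (the plan keys below are invented, not taken from a real model's `tp_plan`):

    _hf_moe_styles = {"ep_router", "local_colwise", "local_rowwise", "gather"}

    raw_plan = {
        "model.layers.*.self_attn.q_proj": "colwise",               # kept, translated later
        "model.layers.*.mlp.experts.gate_up_proj": "local_colwise", # skipped (MoE style)
        "model.layers.*.mlp.router": "ep_router",                   # skipped (MoE style)
        "lm_head": "colwise_rep",                                   # kept, gets the lm_head fast path
    }

    kept = {
        k: v
        for k, v in raw_plan.items()
        if not (isinstance(v, str) and (v.startswith("ep_") or v in _hf_moe_styles))
    }
    print(list(kept))  # ['model.layers.*.self_attn.q_proj', 'lm_head']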

nemo_automodel/components/models/deepseek_v3/model.py

Lines changed: 6 additions & 2 deletions
@@ -213,7 +213,9 @@ def init_weights(self, buffer_device: torch.device | None = None) -> None:
         self.freqs_cis = precompute_freqs_cis(
             self.config.qk_rope_head_dim,
             self.max_seq_len,
-            self.config.rope_theta,
+            self.config.rope_parameters["rope_theta"]
+            if hasattr(self.config, "rope_parameters")
+            else self.config.rope_theta,
             self.config.rope_scaling,
         )
         self.freqs_cis = self.freqs_cis.to(buffer_device)

@@ -321,7 +323,9 @@ def initialize_weights(
         self.model.freqs_cis = precompute_freqs_cis(
             self.config.qk_rope_head_dim,
             self.model.max_seq_len,
-            self.config.rope_theta,
+            self.config.rope_parameters["rope_theta"]
+            if hasattr(self.config, "rope_parameters")
+            else self.config.rope_theta,
             self.config.rope_scaling,
         )

nemo_automodel/components/models/gpt_oss/model.py

Lines changed: 5 additions & 1 deletion
@@ -127,9 +127,13 @@ def __init__(self, config: GptOssConfig, backend: BackendConfig, *, moe_config:
             "beta_slow": 1.0,
             "original_max_position_embeddings": 4096,
         }
+        if hasattr(config, "rope_parameters") and config.rope_parameters:
+            rope_theta = config.rope_parameters.get("rope_theta", 10000.0)
+        else:
+            rope_theta = getattr(config, "rope_theta", 10000.0)
        self.rotary_emb = RotaryEmbedding(
             head_dim=self.head_dim,
-            base=getattr(config, "rope_theta", 10000.0),
+            base=rope_theta,
             dtype=torch.float32,
             initial_context_length=rope_scaling["original_max_position_embeddings"],
             scaling_factor=rope_scaling["factor"],
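
Both this change and the DeepSeek-V3 change above resolve rope_theta from either the transformers v5 `rope_parameters` dict or the older top-level `rope_theta` attribute. A standalone sketch of that fallback, using made-up config objects and a hypothetical helper name:

    from types import SimpleNamespace

    def resolve_rope_theta(config, default=10000.0):
        # Hypothetical helper mirroring the fallback pattern in the diffs above.
        if hasattr(config, "rope_parameters") and config.rope_parameters:
            return config.rope_parameters.get("rope_theta", default)
        return getattr(config, "rope_theta", default)

    v5_cfg = SimpleNamespace(rope_parameters={"rope_theta": 150000.0})  # v5-style config (made-up value)
    v4_cfg = SimpleNamespace(rope_theta=10000.0)                        # v4-style config
    print(resolve_rope_theta(v5_cfg), resolve_rope_theta(v4_cfg))       # 150000.0 10000.0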

nemo_automodel/recipes/vlm/finetune.py

Lines changed: 2 additions & 1 deletion
@@ -880,7 +880,8 @@ def _forward_backward_step(
             out = model(logits_to_keep=1, **batch)
             if "hidden_states" not in out:
                 raise ValueError(
-                    "FusedLinearCrossEntropy requires the model to output hidden states. Set `model.output_hidden_states=True` in the config."
+                    "FusedLinearCrossEntropy requires the model to output hidden states. "
+                    "Set `model.text_config.output_hidden_states=True` in the config."
                 )
         else:
             out = model(**batch)

pyproject.toml

Lines changed: 2 additions & 2 deletions
@@ -124,7 +124,7 @@ vlm = [
     "numpy",
     "numba",
     "torchcodec; (platform_machine == 'x86_64' and platform_system != 'Darwin')",
-    "mistral_common[opencv]",
+    "mistral_common[opencv]>=1.9.0",
     "albumentations"
 ]
 all = [

@@ -157,7 +157,7 @@ linting = [
     "ruff~=0.9.0",
     "import-linter~=2.4",
 ]
-test = ["coverage", "pytest", "peft"]
+test = ["coverage", "pytest", "peft>=0.18.1"]
 dev = ["cut-cross-entropy", "liger-kernel; (platform_machine == 'x86_64' and platform_system != 'Darwin')"]

 [tool.uv]