@@ -62,20 +62,20 @@ def test_model_matches_hf_with_adapter_bidirectional(self, tiny_llama_checkpoint
         adapter = LlamaStateDictAdapter(config)

         # Load HF model
-        llama_model_hf = (
-            AutoModelForCausalLM.from_pretrained(
-                tiny_llama_checkpoint, attn_implementation="eager", torch_dtype=torch.bfloat16
-            )
-            .to("cuda")
-            .to(torch.bfloat16)  # need to manual cast to bfloat16 since HF initialize weights in float32 dtype
-        )
+        llama_model_hf = AutoModelForCausalLM.from_pretrained(
+            pretrained_model_name_or_path=tiny_llama_checkpoint,
+            attn_implementation="eager",
+            torch_dtype=torch.bfloat16,
+        ).to("cuda")
+        llama_model_hf.eval()

         # Build custom model
         llama_model_custom = NeMoAutoModelForCausalLM.from_pretrained(
             pretrained_model_name_or_path=tiny_llama_checkpoint,
             attn_implementation="eager",
             torch_dtype=torch.bfloat16,
         ).to("cuda")
+        llama_model_custom.eval()

         # Verify parameter counts match
         num_params_hf = sum(p.numel() for p in llama_model_hf.parameters())
@@ -89,13 +89,23 @@ def test_model_matches_hf_with_adapter_bidirectional(self, tiny_llama_checkpoint
         custom_state_dict_from_hf = adapter.from_hf(hf_state_dict)
         llama_model_custom.load_state_dict(custom_state_dict_from_hf, strict=True)

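+        # Round-trip check: converting the custom state dict back to HF format should reproduce the original HF tensors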
+        s = adapter.to_hf(llama_model_custom.state_dict())
+
+        for n1, p1 in hf_state_dict.items():
+            p2 = s[n1]
+            assert p1.shape == p2.shape, f"Parameter shape mismatch: {p1.shape} != {p2.shape}"
+            assert p1.dtype == p2.dtype, f"Parameter dtype mismatch: {p1.dtype} != {p2.dtype}"
+            assert p1.device == p2.device, f"Parameter device mismatch: {p1.device} != {p2.device}"
+            assert p1.requires_grad == p2.requires_grad, f"Parameter requires_grad mismatch: {p1.requires_grad} != {p2.requires_grad}"
+            assert torch.allclose(p1, p2, atol=1e-5, rtol=1e-5), f"Parameter mismatch: {p1} != {p2}"
+
         # Generate test inputs
         input_ids = torch.randint(0, config.vocab_size, (1, 10)).to("cuda")
         attention_mask = torch.ones((1, 10)).to("cuda")

         # Compare HF → Custom outputs
         with torch.no_grad():
-            output_hf = llama_model_hf(input_ids, attention_mask)
+            output_hf = llama_model_hf(input_ids.clone(), attention_mask.clone())
             output_custom = llama_model_custom(input_ids, attention_mask)

         np.testing.assert_allclose(
@@ -111,13 +121,12 @@ def test_model_matches_hf_with_adapter_bidirectional(self, tiny_llama_checkpoint
         hf_state_dict_from_custom = adapter.to_hf(custom_state_dict)

         # Create new HF model and load converted state dict
-        llama_model_hf_converted = (
-            AutoModelForCausalLM.from_pretrained(
-                tiny_llama_checkpoint, attn_implementation="eager", torch_dtype=torch.bfloat16
-            )
-            .to("cuda")
-            .to(torch.bfloat16)
-        )
+        llama_model_hf_converted = AutoModelForCausalLM.from_pretrained(
+            tiny_llama_checkpoint,
+            attn_implementation="eager",
+            torch_dtype=torch.bfloat16
+        ).to("cuda")
+        llama_model_hf_converted.eval()
         llama_model_hf_converted.load_state_dict(hf_state_dict_from_custom, strict=True)

         # Compare Custom → HF outputs
@@ -191,6 +200,7 @@ def test_export_custom_to_hf_checkpoint(self, tiny_llama_checkpoint):
             attn_implementation="eager",
             torch_dtype=torch.bfloat16,
         ).to("cuda")
+        llama_model_custom.eval()

         # Generate test input
         input_ids = torch.randint(0, config.vocab_size, (1, 10)).to("cuda")
@@ -204,15 +214,12 @@ def test_export_custom_to_hf_checkpoint(self, tiny_llama_checkpoint):
         llama_model_custom.save_pretrained_hf_format(export_path)

         # Load from saved HF checkpoint
-        llama_model_hf_loaded = (
-            AutoModelForCausalLM.from_pretrained(
-                export_path,
-                attn_implementation="eager",
-                torch_dtype=torch.bfloat16,
-            )
-            .to("cuda")
-            .to(torch.bfloat16)
-        )
+        llama_model_hf_loaded = AutoModelForCausalLM.from_pretrained(
+            export_path,
+            attn_implementation="eager",
+            torch_dtype=torch.bfloat16,
+        ).to("cuda")
+        llama_model_hf_loaded.eval()

         # Compare outputs
         with torch.no_grad():