@@ -375,190 +375,194 @@ def test_mcore_save_restore(device_count, lora_config, tmp_path):
)
- # TODO: Grad check
-
- # def test_edge_cases_and_error_handling():
- #     """Test edge cases and error scenarios."""
- #     hidden_size = 320
- #     initialize_for_megatron(tensor_model_parallel_size=1, pipeline_model_parallel_size=1)
- #     model = _gpt_model_provider(tp_size=1, hidden_size=hidden_size)
-
- #     # Test 1: Applying same adapter twice should work without issues
- #     mtp.update_model(model, DEFAULT_LORA_CFG_TEST)
- #     mtp.update_model(model, DEFAULT_LORA_CFG_TEST)  # Should not raise error
-
- #     # Test 2: Disabling non-existent adapter should not raise error
- #     mtp.disable_adapters(model, adapters_to_disable=["non_existent"])
-
- #     # Test 3: Empty adapter configuration
- #     empty_config = {
- #         "adapter_type": "lora",
- #         "adapter_name": "empty",
- #         "adapter_cfg": {},
- #     }
- #     # This might not add any adapters but shouldn't crash
- #     mtp.update_model(model, empty_config)
-
- #     # Test 4: Very large rank (might be memory intensive, so use small model)
- #     large_rank_config = {
- #         "adapter_type": "lora",
- #         "adapter_name": "large_rank",
- #         "adapter_cfg": {
- #             "*": {
- #                 "rank": 128,  # Large rank relative to hidden size
- #                 "scale": 1,
- #                 "lora_a_init": kaiming_init,
- #                 "lora_b_init": zero_init,
- #                 "enable": True,
- #             },
- #         },
- #     }
- #     small_model = _gpt_model_provider(tp_size=1, hidden_size=128)
- #     mtp.update_model(small_model, large_rank_config)
-
- #     # Verify the model still works
- #     prompt_tokens = torch.randint(0, small_model.vocab_size, (1, 16)).cuda()
- #     output = megatron_prefill(small_model, prompt_tokens)
- #     assert output is not None
-
-
- # def test_adapter_gradient_flow():
- #     """Test that gradients flow correctly through LoRA adapters."""
- #     hidden_size = 128
- #     initialize_for_megatron(tensor_model_parallel_size=1, pipeline_model_parallel_size=1)
- #     model = _gpt_model_provider(tp_size=1, hidden_size=hidden_size)
-
- #     # Apply LoRA adapter
- #     mtp.update_model(model, DEFAULT_LORA_CFG_RANDOM_INIT_TEST)
-
- #     # Set model to training mode
- #     model.train()
-
- #     # Forward pass
- #     prompt_tokens = torch.randint(0, model.vocab_size, (1, 16)).cuda()
- #     output = megatron_prefill(model, prompt_tokens)
-
- #     # Create a dummy loss and backward
- #     loss = output.sum()
- #     loss.backward()
-
- #     # Check that LoRA parameters have gradients
- #     for name, module in model.named_modules():
- #         if isinstance(module, LoRAModule):
- #             adapter_name = DEFAULT_LORA_CFG_RANDOM_INIT_TEST['adapter_name']
- #             lora_a = getattr(module, f"lora_a_{adapter_name}")
- #             lora_b = getattr(module, f"lora_b_{adapter_name}")
-
- #             # LoRA parameters should have gradients
- #             assert lora_a.grad is not None, f"lora_a in {name} has no gradient"
- #             assert lora_b.grad is not None, f"lora_b in {name} has no gradient"
-
- #             # Gradients should be non-zero
- #             assert torch.any(lora_a.grad != 0), f"lora_a gradient is all zeros in {name}"
- #             assert torch.any(lora_b.grad != 0), f"lora_b gradient is all zeros in {name}"
-
-
- # def test_adapter_parameter_count():
- #     """Test that LoRA reduces trainable parameters significantly."""
- #     hidden_size = 256
- #     initialize_for_megatron(tensor_model_parallel_size=1, pipeline_model_parallel_size=1)
- #     model = _gpt_model_provider(tp_size=1, hidden_size=hidden_size)
-
- #     # Count original parameters
- #     original_params = sum(p.numel() for p in model.parameters())
-
- #     # Apply LoRA with small rank
- #     small_rank_config = {
- #         "adapter_type": "lora",
- #         "adapter_name": "small",
- #         "adapter_cfg": {
- #             "*": {
- #                 "rank": 8,
- #                 "scale": 1,
- #                 "lora_a_init": kaiming_init,
- #                 "lora_b_init": zero_init,
- #                 "enable": True,
- #             },
- #         },
- #     }
- #     mtp.update_model(model, small_rank_config)
-
- #     # Count LoRA parameters
- #     lora_params = 0
- #     for module in model.modules():
- #         if isinstance(module, LoRAModule):
- #             for param_name, param in module.named_parameters():
- #                 if "lora_" in param_name:
- #                     lora_params += param.numel()
-
- #     # LoRA parameters should be much smaller than original model
- #     assert lora_params < original_params * 0.1, (
- #         f"LoRA params ({lora_params}) should be < 10% of original params ({original_params})"
- #     )
-
- #     # Verify LoRA parameters exist
- #     assert lora_params > 0, "No LoRA parameters found"
-
-
- # def test_multiple_forward_consistency():
- #     """Test that multiple forward passes produce consistent results."""
- #     hidden_size = 128
- #     initialize_for_megatron(tensor_model_parallel_size=1, pipeline_model_parallel_size=1)
- #     model = _gpt_model_provider(tp_size=1, hidden_size=hidden_size)
-
- #     # Apply LoRA adapter
- #     mtp.update_model(model, LARGE_SCALE_LORA_CFG)
-
- #     # Set to eval mode for deterministic behavior
- #     model.eval()
-
- #     # Run multiple forward passes with same input
- #     prompt_tokens = torch.randint(0, model.vocab_size, (2, 32)).cuda()
- #     outputs = []
- #     for _ in range(3):
- #         with torch.no_grad():
- #             output = megatron_prefill(model, prompt_tokens)
- #             outputs.append(output)
-
- #     # All outputs should be identical
- #     for i in range(1, len(outputs)):
- #         assert torch.allclose(outputs[0], outputs[i], rtol=1e-6), (
- #             f"Output {i} differs from output 0"
- #         )
-
-
- # # Placeholder functions for future implementation
- # def test_forward_with_lora_quantize():
- #     """Test applying LoRA to an already quantized model."""
- #     # TODO: Implement when quantization integration is ready
- #     pytest.skip("Quantization integration tests not yet implemented")
-
-
- # def test_forward_with_quantize_lora():
- #     """Test quantizing a model that already has LoRA adapters."""
- #     # TODO: Implement when quantization integration is ready
- #     pytest.skip("Quantization integration tests not yet implemented")
-
-
- # def test_one_lora_save_restore():
- #     """Test saving and restoring a model with one LoRA adapter."""
- #     # TODO: Implement when save/restore functionality is ready
- #     pytest.skip("Save/restore tests not yet implemented")
-
-
- # def test_two_loras_save_restore():
- #     """Test saving and restoring a model with multiple LoRA adapters."""
- #     # TODO: Implement when save/restore functionality is ready
- #     pytest.skip("Save/restore tests not yet implemented")
-
-
- # def test_one_lora_quantize_save_restore():
- #     """Test save/restore of quantized model with one LoRA adapter."""
- #     # TODO: Implement when quantization + save/restore is ready
- #     pytest.skip("Quantization + save/restore tests not yet implemented")
-
-
- # def test_two_loras_quantize_save_restore():
- #     """Test save/restore of quantized model with multiple LoRA adapters."""
- #     # TODO: Implement when quantization + save/restore is ready
- #     pytest.skip("Quantization + save/restore tests not yet implemented")
+ # TODO: Save and restore 2 loras
+
+
+ def _test_adapter_gradient_flow_freeze_base_model(lora_config, tmp_path, rank, size):
+     hidden_size = 1280
+     initialize_for_megatron(tensor_model_parallel_size=size, pipeline_model_parallel_size=1)
+     model = _gpt_model_provider(tp_size=size, hidden_size=hidden_size)
+     prompt_tokens = torch.randint(0, model.vocab_size, (2, model.max_sequence_length)).cuda()
+
+     mtpf.update_model(model, lora_config)
+     model.train()
+
+     # Use a simple forward pass instead of megatron_prefill for the grad check
+     batch_size = prompt_tokens.shape[0]
+     seq_len = prompt_tokens.shape[-1]
+     device = prompt_tokens.device
+
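+     # Causal attention mask: True above the diagonal marks future positions,
+     # shaped [batch, 1, seq_len, seq_len]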
+     attention_mask = (
+         torch.triu(torch.ones((batch_size, seq_len, seq_len), device=device), diagonal=1)
+         .bool()
+         .view(batch_size, 1, seq_len, seq_len)
+     )
+
+     output = model(prompt_tokens, position_ids=None, attention_mask=attention_mask)
+
+     loss = output.sum()
+     loss.backward()
+
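+     # The base model is frozen in this test, so only LoRA adapter parameters should have gradients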
+     for name, module in model.named_modules():
+         if isinstance(module, LoRAModule):
+             if len(module._lora_adapters) == 0:
+                 continue
+             for adapter_name in module._lora_adapters:
+                 lora_a_module = module._lora_adapters[adapter_name]["lora_a"]
+                 lora_b_module = module._lora_adapters[adapter_name]["lora_b"]
+
+                 for param_name, param in lora_a_module.named_parameters():
+                     assert param.grad is not None, f"lora_a.{param_name} in {name} has no gradient"
+                     assert torch.any(param.grad != 0), (
+                         f"lora_a.{param_name} gradient is all zeros in {name}"
+                     )
+
+                 for param_name, param in lora_b_module.named_parameters():
+                     assert param.grad is not None, f"lora_b.{param_name} in {name} has no gradient"
+                     assert torch.any(param.grad != 0), (
+                         f"lora_b.{param_name} gradient is all zeros in {name}"
+                     )
+             assert module.weight.grad is None
+
+
+ @pytest.mark.parametrize("device_count", get_device_counts())
+ @pytest.mark.parametrize(
+     "lora_config",
+     [
+         LARGE_LORA_CFG_RANDOM_INIT_TEST,  # Use random init so gradients flow to both lora_a and lora_b
+     ],
+ )
+ def test_adapter_gradient_flow_freeze_base_model(device_count, lora_config, tmp_path):
+     spawn_multiprocess_job(
+         size=device_count,
+         job=partial(_test_adapter_gradient_flow_freeze_base_model, lora_config, str(tmp_path)),
+         backend="nccl",
+     )
+
+
+ def _test_adapter_gradient_flow_freeze_lora_model(lora_config, tmp_path, rank, size):
+     hidden_size = 1280
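+     # Freeze the LoRA weights and leave the base model trainable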
+     lora_config["freeze_lora_weights"] = True
+     lora_config["freeze_base_model"] = False
+
+     initialize_for_megatron(tensor_model_parallel_size=size, pipeline_model_parallel_size=1)
+     model = _gpt_model_provider(tp_size=size, hidden_size=hidden_size)
+     prompt_tokens = torch.randint(0, model.vocab_size, (2, model.max_sequence_length)).cuda()
+
+     mtpf.update_model(model, lora_config)
+     model.train()
+
+     # Use a simple forward pass instead of megatron_prefill for the grad check
+     batch_size = prompt_tokens.shape[0]
+     seq_len = prompt_tokens.shape[-1]
+     device = prompt_tokens.device
+
+     attention_mask = (
+         torch.triu(torch.ones((batch_size, seq_len, seq_len), device=device), diagonal=1)
+         .bool()
+         .view(batch_size, 1, seq_len, seq_len)
+     )
+
+     output = model(prompt_tokens, position_ids=None, attention_mask=attention_mask)
+
+     loss = output.sum()
+     loss.backward()
+
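+     # LoRA weights are frozen, so adapter parameters must get no gradients while the base weight does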
+     for name, module in model.named_modules():
+         if isinstance(module, LoRAModule):
+             if len(module._lora_adapters) == 0:
+                 continue
+             for adapter_name in module._lora_adapters:
+                 lora_a_module = module._lora_adapters[adapter_name]["lora_a"]
+                 lora_b_module = module._lora_adapters[adapter_name]["lora_b"]
+
+                 for param_name, param in lora_a_module.named_parameters():
+                     assert param.grad is None, f"lora_a.{param_name} in {name} has gradient"
+
+                 for param_name, param in lora_b_module.named_parameters():
+                     assert param.grad is None, f"lora_b.{param_name} in {name} has gradient"
+
+             assert module.weight.grad is not None
+             assert torch.any(module.weight.grad != 0), "weight gradient is all zeros"
+
+
+ @pytest.mark.parametrize("device_count", get_device_counts())
+ @pytest.mark.parametrize(
+     "lora_config",
+     [
+         LARGE_LORA_CFG_RANDOM_INIT_TEST,  # Use random init so gradients flow to both lora_a and lora_b
+     ],
+ )
+ def test_adapter_gradient_flow_freeze_lora_model(device_count, lora_config, tmp_path):
+     spawn_multiprocess_job(
+         size=device_count,
+         job=partial(_test_adapter_gradient_flow_freeze_lora_model, lora_config, str(tmp_path)),
+         backend="nccl",
+     )
+
+
+ def _test_adapter_gradient_flow(lora_config, tmp_path, rank, size):
+     hidden_size = 1280
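+     # Train both the base model and the LoRA adapters (nothing frozen)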
+     lora_config["freeze_lora_weights"] = False
+     lora_config["freeze_base_model"] = False
+
+     initialize_for_megatron(tensor_model_parallel_size=size, pipeline_model_parallel_size=1)
+     model = _gpt_model_provider(tp_size=size, hidden_size=hidden_size)
+     prompt_tokens = torch.randint(0, model.vocab_size, (2, model.max_sequence_length)).cuda()
+
+     mtpf.update_model(model, lora_config)
+     model.train()
+
+     # Use a simple forward pass instead of megatron_prefill for the grad check
+     batch_size = prompt_tokens.shape[0]
+     seq_len = prompt_tokens.shape[-1]
+     device = prompt_tokens.device
+
+     attention_mask = (
+         torch.triu(torch.ones((batch_size, seq_len, seq_len), device=device), diagonal=1)
+         .bool()
+         .view(batch_size, 1, seq_len, seq_len)
+     )
+
+     output = model(prompt_tokens, position_ids=None, attention_mask=attention_mask)
+
+     loss = output.sum()
+     loss.backward()
+
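+     # Nothing is frozen, so both the LoRA adapters and the base weight should receive gradients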
+     for name, module in model.named_modules():
+         if isinstance(module, LoRAModule):
+             if len(module._lora_adapters) == 0:
+                 continue
+             for adapter_name in module._lora_adapters:
+                 lora_a_module = module._lora_adapters[adapter_name]["lora_a"]
+                 lora_b_module = module._lora_adapters[adapter_name]["lora_b"]
+
+                 for param_name, param in lora_a_module.named_parameters():
+                     assert param.grad is not None, f"lora_a.{param_name} in {name} has no gradient"
+                     assert torch.any(param.grad != 0), (
+                         f"lora_a.{param_name} gradient is all zeros in {name}"
+                     )
+
+                 for param_name, param in lora_b_module.named_parameters():
+                     assert param.grad is not None, f"lora_b.{param_name} in {name} has no gradient"
+                     assert torch.any(param.grad != 0), (
+                         f"lora_b.{param_name} gradient is all zeros in {name}"
+                     )
+
+             assert module.weight.grad is not None
+             assert torch.any(module.weight.grad != 0), "weight gradient is all zeros"
+
+
+ @pytest.mark.parametrize("device_count", get_device_counts())
+ @pytest.mark.parametrize(
+     "lora_config",
+     [
+         LARGE_LORA_CFG_RANDOM_INIT_TEST,  # Use random init so gradients flow to both lora_a and lora_b
+     ],
+ )
+ def test_adapter_gradient_flow(device_count, lora_config, tmp_path):
+     spawn_multiprocess_job(
+         size=device_count,
+         job=partial(_test_adapter_gradient_flow, lora_config, str(tmp_path)),
+         backend="nccl",
+     )