@@ -3,11 +3,17 @@
 import pytest
 import torch
 from _test_utils.import_helper import skip_if_no_megatron
-from _test_utils.torch_dist.dist_utils import spawn_multiprocess_job
+from _test_utils.torch_dist.dist_utils import get_device_counts, spawn_multiprocess_job
 from _test_utils.torch_dist.plugins.megatron_common import (
     get_mcore_gpt_model,
     initialize_for_megatron,
 )
+from megatron.core import dist_checkpointing
+
+from modelopt.torch.opt.plugins.mcore_dist_checkpointing import (
+    restore_sharded_modelopt_state,
+    save_sharded_modelopt_state,
+)

 skip_if_no_megatron()

@@ -27,6 +33,22 @@
             "lora_b_init": "zero_init",
             "enable": True,
         },
+        "*output_layer*": {"enable": False},
+    },
+}
+
+LARGE_LORA_CFG_TEST = {
+    "adapter_type": "lora",
+    "adapter_name": "default",
+    "adapter_cfg": {
+        "*": {
+            "rank": 128,
+            "scale": 1,
+            "lora_a_init": "kaiming_init",
+            "lora_b_init": "zero_init",
+            "enable": True,
+        },
+        "*output_layer*": {"enable": False},
     },
 }

@@ -41,6 +63,22 @@
             "lora_b_init": "kaiming_init",
             "enable": True,
         },
+        "*output_layer*": {"enable": False},
+    },
+}
+
+LARGE_LORA_CFG_RANDOM_INIT_TEST = {
+    "adapter_type": "lora",
+    "adapter_name": "random",
+    "adapter_cfg": {
+        "*": {
+            "rank": 128,
+            "scale": 1,
+            "lora_a_init": "kaiming_init",
+            "lora_b_init": "kaiming_init",
+            "enable": True,
+        },
+        "*output_layer*": {"enable": False},
     },
 }

@@ -55,6 +93,7 @@
             "lora_b_init": "kaiming_init",
             "enable": True,
         },
+        "*output_layer*": {"enable": False},
     },
 }

@@ -70,10 +109,25 @@
             "lora_b_init": "zero_init",
             "enable": True,
         },
+        "*output_layer*": {"enable": False},
     },
 }


+def save_distributed_checkpoint(checkpoint_path, gpt_model):
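+    """Save gpt_model's sharded state dict to checkpoint_path via Megatron dist_checkpointing."""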
+    sharded_state_dict = gpt_model.sharded_state_dict(prefix="")
+    dist_checkpointing.save(sharded_state_dict=sharded_state_dict, checkpoint_dir=checkpoint_path)
+
+
+def load_distributed_checkpoint(checkpoint_path, gpt_model):
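+    """Load the sharded checkpoint at checkpoint_path into gpt_model and return the model."""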
+    sharded_state_dict = gpt_model.sharded_state_dict(prefix="")
+    checkpoint = dist_checkpointing.load(
+        sharded_state_dict=sharded_state_dict, checkpoint_dir=checkpoint_path
+    )
+    gpt_model.load_state_dict(checkpoint)
+    return gpt_model
+
+
 def _gpt_model_provider(tp_size: int, hidden_size=256, vocab_size=64, meta_device=False):
     """Build the model."""

@@ -157,8 +211,9 @@ def _test_forward_with_one_lora(lora_config, rank, size):
                 assert lora_config["adapter_name"] not in module._lora_adapters
             else:
                 # Task: For non-selective configs, all LoRA modules should have the adapter
-                assert hasattr(module, f"lora_a_{lora_config['adapter_name']}")
-                assert hasattr(module, f"lora_b_{lora_config['adapter_name']}")
+                for adapter_name in module._lora_adapters:
+                    assert hasattr(module, f"lora_a_{adapter_name}")
+                    assert hasattr(module, f"lora_b_{adapter_name}")
                 lora_with_adapter_count += 1

     assert lora_module_count > 0
@@ -216,11 +271,9 @@ def _test_forward_with_two_loras(lora_config_1, lora_config_2, rank, size):

     for _, module in model.named_modules():
         if isinstance(module, LoRAModule):
-            assert hasattr(module, f"lora_a_{lora_config_1['adapter_name']}")
-            assert hasattr(module, f"lora_b_{lora_config_1['adapter_name']}")
-            assert hasattr(module, f"lora_a_{lora_config_2['adapter_name']}")
-            assert hasattr(module, f"lora_b_{lora_config_2['adapter_name']}")
-            assert len(module._lora_adapters) == 2
+            for adapter_name in module._lora_adapters:
+                assert hasattr(module, f"lora_a_{adapter_name}")
+                assert hasattr(module, f"lora_b_{adapter_name}")


 @pytest.mark.parametrize(
@@ -237,7 +290,91 @@ def test_forward_with_two_loras(lora_config_1, lora_config_2):
     )


-# TODO: Save and restore with 1 or 2 GPUs
+# TODO: Rank check
+def _test_attr_changes_with_one_lora(lora_config, rank, size):
+    """Test that changing an adapter's scale changes the forward output and is reversible."""
+    hidden_size = 320
+    initialize_for_megatron(tensor_model_parallel_size=1, pipeline_model_parallel_size=1)
+    model = _gpt_model_provider(tp_size=1, hidden_size=hidden_size)
+    prompt_tokens = torch.randint(0, model.vocab_size, (2, model.max_sequence_length)).cuda()
+
+    mtpf.update_model(model, lora_config)
+    lora_1_output = megatron_prefill(model, prompt_tokens)
+
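+    # Raise the scale on every adapter; the LoRA contribution should change the output.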
+    for _, module in model.named_modules():
+        if isinstance(module, LoRAModule):
+            for adapter_name in module._lora_adapters:
+                adapter = module._lora_adapters[adapter_name]
+                adapter["scale"] = 10.0
+
+    lora_2_output = megatron_prefill(model, prompt_tokens)
+    assert not torch.allclose(lora_1_output, lora_2_output)
+
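+    # Restore the original scale; the output should match the first LoRA forward pass.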
+    for _, module in model.named_modules():
+        if isinstance(module, LoRAModule):
+            for adapter_name in module._lora_adapters:
+                adapter = module._lora_adapters[adapter_name]
+                adapter["scale"] = 1.0
+    lora_back_output = megatron_prefill(model, prompt_tokens)
+
+    assert torch.allclose(lora_1_output, lora_back_output)
+
+
+@pytest.mark.parametrize(
+    "lora_config",
+    [
+        DEFAULT_LORA_CFG_RANDOM_INIT_TEST,
+    ],
+)
+def test_attr_changes_with_one_lora(lora_config):
+    spawn_multiprocess_job(
+        size=1, job=partial(_test_attr_changes_with_one_lora, lora_config), backend="nccl"
+    )
+
+
+def _test_mcore_save_restore(lora_config, tmp_path, rank, size):
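+    """Round-trip a LoRA-adapted model through distributed checkpointing into a fresh model."""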
+    hidden_size = 1280
+    initialize_for_megatron(tensor_model_parallel_size=size, pipeline_model_parallel_size=1)
+    model_ref = _gpt_model_provider(tp_size=size, hidden_size=hidden_size)
+    model_test = _gpt_model_provider(tp_size=size, hidden_size=hidden_size)
+    prompt_tokens = torch.randint(
+        0, model_ref.vocab_size, (2, model_ref.max_sequence_length)
+    ).cuda()
+    original_output_test = megatron_prefill(model_test, prompt_tokens)
+
+    mtpf.update_model(model_ref, lora_config)
+
+    lora_output_ref = megatron_prefill(model_ref, prompt_tokens)
+
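+    # Save both the sharded model weights and the modelopt (LoRA) state for model_ref.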
+    save_distributed_checkpoint(tmp_path, model_ref)
+    save_sharded_modelopt_state([model_ref], tmp_path)
+
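+    # Restore the modelopt state first so the LoRA modules exist, then load the sharded weights.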
+    restore_sharded_modelopt_state([model_test], tmp_path)
+    model_test = load_distributed_checkpoint(tmp_path, model_test)
+
+    lora_output_test = megatron_prefill(model_test, prompt_tokens)
+
+    # Task: If the save and restore functions work correctly, they should produce the same output.
+    assert torch.allclose(lora_output_test, lora_output_ref)
+
+    assert not torch.allclose(original_output_test, lora_output_test)
+
+
+@pytest.mark.parametrize("device_count", get_device_counts())
+@pytest.mark.parametrize(
+    "lora_config",
+    [
+        DEFAULT_LORA_CFG_RANDOM_INIT_TEST,
+    ],
+)
+def test_mcore_save_restore(device_count, lora_config, tmp_path):
+    spawn_multiprocess_job(
+        size=device_count,
+        job=partial(_test_mcore_save_restore, lora_config, str(tmp_path)),
+        backend="nccl",
+    )
+
+
 # TODO: Grad check

 # def test_edge_cases_and_error_handling():