
Commit 024d57c

Update the test case
Signed-off-by: Jingyu Xin <[email protected]>
1 parent 8c31821 commit 024d57c

File tree

1 file changed: +191 −187 lines


tests/gpu/torch/peft/test_megatron_peft.py

Lines changed: 191 additions & 187 deletions
@@ -375,190 +375,194 @@ def test_mcore_save_restore(device_count, lora_config, tmp_path):
     )


-# TODO: Grad check
-
-# def test_edge_cases_and_error_handling():
-#     """Test edge cases and error scenarios."""
-#     hidden_size = 320
-#     initialize_for_megatron(tensor_model_parallel_size=1, pipeline_model_parallel_size=1)
-#     model = _gpt_model_provider(tp_size=1, hidden_size=hidden_size)
-
-#     # Test 1: Applying same adapter twice should work without issues
-#     mtp.update_model(model, DEFAULT_LORA_CFG_TEST)
-#     mtp.update_model(model, DEFAULT_LORA_CFG_TEST)  # Should not raise error
-
-#     # Test 2: Disabling non-existent adapter should not raise error
-#     mtp.disable_adapters(model, adapters_to_disable=["non_existent"])
-
-#     # Test 3: Empty adapter configuration
-#     empty_config = {
-#         "adapter_type": "lora",
-#         "adapter_name": "empty",
-#         "adapter_cfg": {},
-#     }
-#     # This might not add any adapters but shouldn't crash
-#     mtp.update_model(model, empty_config)
-
-#     # Test 4: Very large rank (might be memory intensive, so use small model)
-#     large_rank_config = {
-#         "adapter_type": "lora",
-#         "adapter_name": "large_rank",
-#         "adapter_cfg": {
-#             "*": {
-#                 "rank": 128,  # Large rank relative to hidden size
-#                 "scale": 1,
-#                 "lora_a_init": kaiming_init,
-#                 "lora_b_init": zero_init,
-#                 "enable": True,
-#             },
-#         },
-#     }
-#     small_model = _gpt_model_provider(tp_size=1, hidden_size=128)
-#     mtp.update_model(small_model, large_rank_config)
-
-#     # Verify the model still works
-#     prompt_tokens = torch.randint(0, small_model.vocab_size, (1, 16)).cuda()
-#     output = megatron_prefill(small_model, prompt_tokens)
-#     assert output is not None
-
-
-# def test_adapter_gradient_flow():
-#     """Test that gradients flow correctly through LoRA adapters."""
-#     hidden_size = 128
-#     initialize_for_megatron(tensor_model_parallel_size=1, pipeline_model_parallel_size=1)
-#     model = _gpt_model_provider(tp_size=1, hidden_size=hidden_size)
-
-#     # Apply LoRA adapter
-#     mtp.update_model(model, DEFAULT_LORA_CFG_RANDOM_INIT_TEST)
-
-#     # Set model to training mode
-#     model.train()
-
-#     # Forward pass
-#     prompt_tokens = torch.randint(0, model.vocab_size, (1, 16)).cuda()
-#     output = megatron_prefill(model, prompt_tokens)
-
-#     # Create a dummy loss and backward
-#     loss = output.sum()
-#     loss.backward()
-
-#     # Check that LoRA parameters have gradients
-#     for name, module in model.named_modules():
-#         if isinstance(module, LoRAModule):
-#             adapter_name = DEFAULT_LORA_CFG_RANDOM_INIT_TEST['adapter_name']
-#             lora_a = getattr(module, f"lora_a_{adapter_name}")
-#             lora_b = getattr(module, f"lora_b_{adapter_name}")
-
-#             # LoRA parameters should have gradients
-#             assert lora_a.grad is not None, f"lora_a in {name} has no gradient"
-#             assert lora_b.grad is not None, f"lora_b in {name} has no gradient"
-
-#             # Gradients should be non-zero
-#             assert torch.any(lora_a.grad != 0), f"lora_a gradient is all zeros in {name}"
-#             assert torch.any(lora_b.grad != 0), f"lora_b gradient is all zeros in {name}"
-
-
-# def test_adapter_parameter_count():
-#     """Test that LoRA reduces trainable parameters significantly."""
-#     hidden_size = 256
-#     initialize_for_megatron(tensor_model_parallel_size=1, pipeline_model_parallel_size=1)
-#     model = _gpt_model_provider(tp_size=1, hidden_size=hidden_size)
-
-#     # Count original parameters
-#     original_params = sum(p.numel() for p in model.parameters())
-
-#     # Apply LoRA with small rank
-#     small_rank_config = {
-#         "adapter_type": "lora",
-#         "adapter_name": "small",
-#         "adapter_cfg": {
-#             "*": {
-#                 "rank": 8,
-#                 "scale": 1,
-#                 "lora_a_init": kaiming_init,
-#                 "lora_b_init": zero_init,
-#                 "enable": True,
-#             },
-#         },
-#     }
-#     mtp.update_model(model, small_rank_config)
-
-#     # Count LoRA parameters
-#     lora_params = 0
-#     for module in model.modules():
-#         if isinstance(module, LoRAModule):
-#             for param_name, param in module.named_parameters():
-#                 if "lora_" in param_name:
-#                     lora_params += param.numel()
-
-#     # LoRA parameters should be much smaller than original model
-#     assert lora_params < original_params * 0.1, (
-#         f"LoRA params ({lora_params}) should be < 10% of original params ({original_params})"
-#     )
-
-#     # Verify LoRA parameters exist
-#     assert lora_params > 0, "No LoRA parameters found"
-
-
-# def test_multiple_forward_consistency():
-#     """Test that multiple forward passes produce consistent results."""
-#     hidden_size = 128
-#     initialize_for_megatron(tensor_model_parallel_size=1, pipeline_model_parallel_size=1)
-#     model = _gpt_model_provider(tp_size=1, hidden_size=hidden_size)
-
-#     # Apply LoRA adapter
-#     mtp.update_model(model, LARGE_SCALE_LORA_CFG)
-
-#     # Set to eval mode for deterministic behavior
-#     model.eval()
-
-#     # Run multiple forward passes with same input
-#     prompt_tokens = torch.randint(0, model.vocab_size, (2, 32)).cuda()
-#     outputs = []
-#     for _ in range(3):
-#         with torch.no_grad():
-#             output = megatron_prefill(model, prompt_tokens)
-#         outputs.append(output)
-
-#     # All outputs should be identical
-#     for i in range(1, len(outputs)):
-#         assert torch.allclose(outputs[0], outputs[i], rtol=1e-6), (
-#             f"Output {i} differs from output 0"
-#         )
-
-
-# # Placeholder functions for future implementation
-# def test_forward_with_lora_quantize():
-#     """Test applying LoRA to an already quantized model."""
-#     # TODO: Implement when quantization integration is ready
-#     pytest.skip("Quantization integration tests not yet implemented")
-
-
-# def test_forward_with_quantize_lora():
-#     """Test quantizing a model that already has LoRA adapters."""
-#     # TODO: Implement when quantization integration is ready
-#     pytest.skip("Quantization integration tests not yet implemented")
-
-
-# def test_one_lora_save_restore():
-#     """Test saving and restoring a model with one LoRA adapter."""
-#     # TODO: Implement when save/restore functionality is ready
-#     pytest.skip("Save/restore tests not yet implemented")
-
-
-# def test_two_loras_save_restore():
-#     """Test saving and restoring a model with multiple LoRA adapters."""
-#     # TODO: Implement when save/restore functionality is ready
-#     pytest.skip("Save/restore tests not yet implemented")
-
-
-# def test_one_lora_quantize_save_restore():
-#     """Test save/restore of quantized model with one LoRA adapter."""
-#     # TODO: Implement when quantization + save/restore is ready
-#     pytest.skip("Quantization + save/restore tests not yet implemented")
-
-
-# def test_two_loras_quantize_save_restore():
-#     """Test save/restore of quantized model with multiple LoRA adapters."""
-#     # TODO: Implement when quantization + save/restore is ready
-#     pytest.skip("Quantization + save/restore tests not yet implemented")
+# TODO: Save and restore 2 loras
+
+
+def _test_adapter_gradient_flow_freeze_base_model(lora_config, tmp_path, rank, size):
+    hidden_size = 1280
+    initialize_for_megatron(tensor_model_parallel_size=size, pipeline_model_parallel_size=1)
+    model = _gpt_model_provider(tp_size=size, hidden_size=hidden_size)
+    prompt_tokens = torch.randint(0, model.vocab_size, (2, model.max_sequence_length)).cuda()
+
+    mtpf.update_model(model, lora_config)
+    model.train()
+
+    # Use a simple forward pass for the grad check
+    batch_size = prompt_tokens.shape[0]
+    seq_len = prompt_tokens.shape[-1]
+    device = prompt_tokens.device
+
+    attention_mask = (
+        torch.triu(torch.ones((batch_size, seq_len, seq_len), device=device), diagonal=1)
+        .bool()
+        .view(batch_size, 1, seq_len, seq_len)
+    )
+
+    output = model(prompt_tokens, position_ids=None, attention_mask=attention_mask)
+
+    loss = output.sum()
+    loss.backward()
+
+    for name, module in model.named_modules():
+        if isinstance(module, LoRAModule):
+            if len(module._lora_adapters) == 0:
+                continue
+            for adapter_name in module._lora_adapters:
+                lora_a_module = module._lora_adapters[adapter_name]["lora_a"]
+                lora_b_module = module._lora_adapters[adapter_name]["lora_b"]
+
+                for param_name, param in lora_a_module.named_parameters():
+                    assert param.grad is not None, f"lora_a.{param_name} in {name} has no gradient"
+                    assert torch.any(param.grad != 0), (
+                        f"lora_a.{param_name} gradient is all zeros in {name}"
+                    )
+
+                for param_name, param in lora_b_module.named_parameters():
+                    assert param.grad is not None, f"lora_b.{param_name} in {name} has no gradient"
+                    assert torch.any(param.grad != 0), (
+                        f"lora_b.{param_name} gradient is all zeros in {name}"
+                    )
+            assert module.weight.grad is None
+
+
+@pytest.mark.parametrize("device_count", get_device_counts())
+@pytest.mark.parametrize(
+    "lora_config",
+    [
+        LARGE_LORA_CFG_RANDOM_INIT_TEST,  # Use random init so gradients flow to both lora_a and lora_b
+    ],
+)
+def test_adapter_gradient_flow_freeze_base_model(device_count, lora_config, tmp_path):
+    spawn_multiprocess_job(
+        size=device_count,
+        job=partial(_test_adapter_gradient_flow_freeze_base_model, lora_config, str(tmp_path)),
+        backend="nccl",
+    )
+
+
+def _test_adapter_gradient_flow_freeze_lora_model(lora_config, tmp_path, rank, size):
+    hidden_size = 1280
+    lora_config["freeze_lora_weights"] = True
+    lora_config["freeze_base_model"] = False
+
+    initialize_for_megatron(tensor_model_parallel_size=size, pipeline_model_parallel_size=1)
+    model = _gpt_model_provider(tp_size=size, hidden_size=hidden_size)
+    prompt_tokens = torch.randint(0, model.vocab_size, (2, model.max_sequence_length)).cuda()
+
+    mtpf.update_model(model, lora_config)
+    model.train()
+
+    # Use a simple forward pass for the grad check
+    batch_size = prompt_tokens.shape[0]
+    seq_len = prompt_tokens.shape[-1]
+    device = prompt_tokens.device
+
+    attention_mask = (
+        torch.triu(torch.ones((batch_size, seq_len, seq_len), device=device), diagonal=1)
+        .bool()
+        .view(batch_size, 1, seq_len, seq_len)
+    )
+
+    output = model(prompt_tokens, position_ids=None, attention_mask=attention_mask)
+
+    loss = output.sum()
+    loss.backward()
+
+    for name, module in model.named_modules():
+        if isinstance(module, LoRAModule):
+            if len(module._lora_adapters) == 0:
+                continue
+            for adapter_name in module._lora_adapters:
+                lora_a_module = module._lora_adapters[adapter_name]["lora_a"]
+                lora_b_module = module._lora_adapters[adapter_name]["lora_b"]
+
+                for param_name, param in lora_a_module.named_parameters():
+                    assert param.grad is None, f"lora_a.{param_name} in {name} has gradient"
+
+                for param_name, param in lora_b_module.named_parameters():
+                    assert param.grad is None, f"lora_b.{param_name} in {name} has gradient"
+
+            assert module.weight.grad is not None
+            assert torch.any(module.weight.grad != 0), "weight gradient is all zeros"
+
+
+@pytest.mark.parametrize("device_count", get_device_counts())
+@pytest.mark.parametrize(
+    "lora_config",
+    [
+        LARGE_LORA_CFG_RANDOM_INIT_TEST,  # Use random init so gradients flow to both lora_a and lora_b
+    ],
+)
+def test_adapter_gradient_flow_freeze_lora_model(device_count, lora_config, tmp_path):
+    spawn_multiprocess_job(
+        size=device_count,
+        job=partial(_test_adapter_gradient_flow_freeze_lora_model, lora_config, str(tmp_path)),
+        backend="nccl",
+    )
+
+
+def _test_adapter_gradient_flow(lora_config, tmp_path, rank, size):
+    hidden_size = 1280
+    lora_config["freeze_lora_weights"] = False
+    lora_config["freeze_base_model"] = False
+
+    initialize_for_megatron(tensor_model_parallel_size=size, pipeline_model_parallel_size=1)
+    model = _gpt_model_provider(tp_size=size, hidden_size=hidden_size)
+    prompt_tokens = torch.randint(0, model.vocab_size, (2, model.max_sequence_length)).cuda()
+
+    mtpf.update_model(model, lora_config)
+    model.train()
+
+    # Use a simple forward pass for the grad check
+    batch_size = prompt_tokens.shape[0]
+    seq_len = prompt_tokens.shape[-1]
+    device = prompt_tokens.device
+
+    attention_mask = (
+        torch.triu(torch.ones((batch_size, seq_len, seq_len), device=device), diagonal=1)
+        .bool()
+        .view(batch_size, 1, seq_len, seq_len)
+    )
+
+    output = model(prompt_tokens, position_ids=None, attention_mask=attention_mask)
+
+    loss = output.sum()
+    loss.backward()
+
+    for name, module in model.named_modules():
+        if isinstance(module, LoRAModule):
+            if len(module._lora_adapters) == 0:
+                continue
+            for adapter_name in module._lora_adapters:
+                lora_a_module = module._lora_adapters[adapter_name]["lora_a"]
+                lora_b_module = module._lora_adapters[adapter_name]["lora_b"]
+
+                for param_name, param in lora_a_module.named_parameters():
+                    assert param.grad is not None, f"lora_a.{param_name} in {name} has no gradient"
+                    assert torch.any(param.grad != 0), (
+                        f"lora_a.{param_name} gradient is all zeros in {name}"
+                    )
+
+                for param_name, param in lora_b_module.named_parameters():
+                    assert param.grad is not None, f"lora_b.{param_name} in {name} has no gradient"
+                    assert torch.any(param.grad != 0), (
+                        f"lora_b.{param_name} gradient is all zeros in {name}"
+                    )
+
+            assert module.weight.grad is not None
+            assert torch.any(module.weight.grad != 0), "weight gradient is all zeros"
+
+
+@pytest.mark.parametrize("device_count", get_device_counts())
+@pytest.mark.parametrize(
+    "lora_config",
+    [
+        LARGE_LORA_CFG_RANDOM_INIT_TEST,  # Use random init so gradients flow to both lora_a and lora_b
+    ],
+)
+def test_adapter_gradient_flow(device_count, lora_config, tmp_path):
+    spawn_multiprocess_job(
+        size=device_count,
+        job=partial(_test_adapter_gradient_flow, lora_config, str(tmp_path)),
+        backend="nccl",
+    )
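All three new tests follow the same pattern: run one forward/backward pass through the LoRA-augmented model with a causal attention mask, then assert that gradients landed only on the parameter groups left trainable (the LoRA weights when the base model is frozen, the base weights when the LoRA weights are frozen, both when nothing is frozen). The snippet below is a minimal sketch of just that freeze-and-check step on a toy LoRA wrapper around torch.nn.Linear; ToyLoRALinear, freeze_base, and check_gradient_flow are illustrative names, not the LoRAModule / freeze_base_model machinery exercised by the test file, and no Megatron, NCCL, or multi-GPU setup is assumed.

import torch
import torch.nn as nn


class ToyLoRALinear(nn.Module):
    """Illustrative LoRA wrapper: y = W x + B(A(x)). Not the repo's LoRAModule."""

    def __init__(self, in_features, out_features, rank=8, freeze_base=True):
        super().__init__()
        self.base = nn.Linear(in_features, out_features, bias=False)
        self.lora_a = nn.Linear(in_features, rank, bias=False)
        self.lora_b = nn.Linear(rank, out_features, bias=False)
        nn.init.kaiming_uniform_(self.lora_a.weight)
        # Random (non-zero) init for lora_b so gradients also reach lora_a,
        # mirroring the "random init" comment on LARGE_LORA_CFG_RANDOM_INIT_TEST.
        nn.init.normal_(self.lora_b.weight, std=0.02)
        if freeze_base:
            self.base.weight.requires_grad_(False)

    def forward(self, x):
        return self.base(x) + self.lora_b(self.lora_a(x))


def check_gradient_flow(module, base_frozen):
    """One forward/backward pass, then assert where gradients did (not) land."""
    x = torch.randn(4, module.base.in_features)
    loss = module(x).sum()
    loss.backward()

    for name in ("lora_a", "lora_b"):
        param = getattr(module, name).weight
        assert param.grad is not None, f"{name} has no gradient"
        assert torch.any(param.grad != 0), f"{name} gradient is all zeros"

    if base_frozen:
        # A frozen base weight never accumulates a gradient.
        assert module.base.weight.grad is None
    else:
        assert module.base.weight.grad is not None


if __name__ == "__main__":
    check_gradient_flow(ToyLoRALinear(32, 32, freeze_base=True), base_frozen=True)
    check_gradient_flow(ToyLoRALinear(32, 32, freeze_base=False), base_frozen=False)
    print("gradient-flow checks passed")

The actual tests are launched through spawn_multiprocess_job with the NCCL backend, so they need at least one CUDA device; a typical local invocation would be something like `pytest tests/gpu/torch/peft/test_megatron_peft.py -k adapter_gradient_flow` (shown as an example command, not taken from the repository's CI configuration).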
