@@ -375,190 +375,194 @@ def test_mcore_save_restore(device_count, lora_config, tmp_path):
)
- # TODO: Grad check
-
- # def test_edge_cases_and_error_handling():
- #     """Test edge cases and error scenarios."""
- #     hidden_size = 320
- #     initialize_for_megatron(tensor_model_parallel_size=1, pipeline_model_parallel_size=1)
- #     model = _gpt_model_provider(tp_size=1, hidden_size=hidden_size)
-
- #     # Test 1: Applying same adapter twice should work without issues
- #     mtp.update_model(model, DEFAULT_LORA_CFG_TEST)
- #     mtp.update_model(model, DEFAULT_LORA_CFG_TEST)  # Should not raise error
-
- #     # Test 2: Disabling non-existent adapter should not raise error
- #     mtp.disable_adapters(model, adapters_to_disable=["non_existent"])
-
- #     # Test 3: Empty adapter configuration
- #     empty_config = {
- #         "adapter_type": "lora",
- #         "adapter_name": "empty",
- #         "adapter_cfg": {},
- #     }
- #     # This might not add any adapters but shouldn't crash
- #     mtp.update_model(model, empty_config)
-
- #     # Test 4: Very large rank (might be memory intensive, so use small model)
- #     large_rank_config = {
- #         "adapter_type": "lora",
- #         "adapter_name": "large_rank",
- #         "adapter_cfg": {
- #             "*": {
- #                 "rank": 128,  # Large rank relative to hidden size
- #                 "scale": 1,
- #                 "lora_a_init": kaiming_init,
- #                 "lora_b_init": zero_init,
- #                 "enable": True,
- #             },
- #         },
- #     }
- #     small_model = _gpt_model_provider(tp_size=1, hidden_size=128)
- #     mtp.update_model(small_model, large_rank_config)
-
- #     # Verify the model still works
- #     prompt_tokens = torch.randint(0, small_model.vocab_size, (1, 16)).cuda()
- #     output = megatron_prefill(small_model, prompt_tokens)
- #     assert output is not None
-
-
- # def test_adapter_gradient_flow():
- #     """Test that gradients flow correctly through LoRA adapters."""
- #     hidden_size = 128
- #     initialize_for_megatron(tensor_model_parallel_size=1, pipeline_model_parallel_size=1)
- #     model = _gpt_model_provider(tp_size=1, hidden_size=hidden_size)
-
- #     # Apply LoRA adapter
- #     mtp.update_model(model, DEFAULT_LORA_CFG_RANDOM_INIT_TEST)
-
- #     # Set model to training mode
- #     model.train()
-
- #     # Forward pass
- #     prompt_tokens = torch.randint(0, model.vocab_size, (1, 16)).cuda()
- #     output = megatron_prefill(model, prompt_tokens)
-
- #     # Create a dummy loss and backward
- #     loss = output.sum()
- #     loss.backward()
-
- #     # Check that LoRA parameters have gradients
- #     for name, module in model.named_modules():
- #         if isinstance(module, LoRAModule):
- #             adapter_name = DEFAULT_LORA_CFG_RANDOM_INIT_TEST['adapter_name']
- #             lora_a = getattr(module, f"lora_a_{adapter_name}")
- #             lora_b = getattr(module, f"lora_b_{adapter_name}")
-
- #             # LoRA parameters should have gradients
- #             assert lora_a.grad is not None, f"lora_a in {name} has no gradient"
- #             assert lora_b.grad is not None, f"lora_b in {name} has no gradient"
-
- #             # Gradients should be non-zero
- #             assert torch.any(lora_a.grad != 0), f"lora_a gradient is all zeros in {name}"
- #             assert torch.any(lora_b.grad != 0), f"lora_b gradient is all zeros in {name}"
-
-
- # def test_adapter_parameter_count():
- #     """Test that LoRA reduces trainable parameters significantly."""
- #     hidden_size = 256
- #     initialize_for_megatron(tensor_model_parallel_size=1, pipeline_model_parallel_size=1)
- #     model = _gpt_model_provider(tp_size=1, hidden_size=hidden_size)
-
- #     # Count original parameters
- #     original_params = sum(p.numel() for p in model.parameters())
-
- #     # Apply LoRA with small rank
- #     small_rank_config = {
- #         "adapter_type": "lora",
- #         "adapter_name": "small",
- #         "adapter_cfg": {
- #             "*": {
- #                 "rank": 8,
- #                 "scale": 1,
- #                 "lora_a_init": kaiming_init,
- #                 "lora_b_init": zero_init,
- #                 "enable": True,
- #             },
- #         },
- #     }
- #     mtp.update_model(model, small_rank_config)
-
- #     # Count LoRA parameters
- #     lora_params = 0
- #     for module in model.modules():
- #         if isinstance(module, LoRAModule):
- #             for param_name, param in module.named_parameters():
- #                 if "lora_" in param_name:
- #                     lora_params += param.numel()
-
- #     # LoRA parameters should be much smaller than original model
- #     assert lora_params < original_params * 0.1, (
- #         f"LoRA params ({lora_params}) should be < 10% of original params ({original_params})"
- #     )
-
- #     # Verify LoRA parameters exist
- #     assert lora_params > 0, "No LoRA parameters found"
-
-
- # def test_multiple_forward_consistency():
- #     """Test that multiple forward passes produce consistent results."""
- #     hidden_size = 128
- #     initialize_for_megatron(tensor_model_parallel_size=1, pipeline_model_parallel_size=1)
- #     model = _gpt_model_provider(tp_size=1, hidden_size=hidden_size)
-
- #     # Apply LoRA adapter
- #     mtp.update_model(model, LARGE_SCALE_LORA_CFG)
-
- #     # Set to eval mode for deterministic behavior
- #     model.eval()
-
- #     # Run multiple forward passes with same input
- #     prompt_tokens = torch.randint(0, model.vocab_size, (2, 32)).cuda()
- #     outputs = []
- #     for _ in range(3):
- #         with torch.no_grad():
- #             output = megatron_prefill(model, prompt_tokens)
- #             outputs.append(output)
-
- #     # All outputs should be identical
- #     for i in range(1, len(outputs)):
- #         assert torch.allclose(outputs[0], outputs[i], rtol=1e-6), (
- #             f"Output {i} differs from output 0"
- #         )
-
-
- # # Placeholder functions for future implementation
- # def test_forward_with_lora_quantize():
- #     """Test applying LoRA to an already quantized model."""
- #     # TODO: Implement when quantization integration is ready
- #     pytest.skip("Quantization integration tests not yet implemented")
-
-
- # def test_forward_with_quantize_lora():
- #     """Test quantizing a model that already has LoRA adapters."""
- #     # TODO: Implement when quantization integration is ready
- #     pytest.skip("Quantization integration tests not yet implemented")
-
-
- # def test_one_lora_save_restore():
- #     """Test saving and restoring a model with one LoRA adapter."""
- #     # TODO: Implement when save/restore functionality is ready
- #     pytest.skip("Save/restore tests not yet implemented")
-
-
- # def test_two_loras_save_restore():
- #     """Test saving and restoring a model with multiple LoRA adapters."""
- #     # TODO: Implement when save/restore functionality is ready
- #     pytest.skip("Save/restore tests not yet implemented")
-
-
- # def test_one_lora_quantize_save_restore():
- #     """Test save/restore of quantized model with one LoRA adapter."""
- #     # TODO: Implement when quantization + save/restore is ready
- #     pytest.skip("Quantization + save/restore tests not yet implemented")
-
-
- # def test_two_loras_quantize_save_restore():
- #     """Test save/restore of quantized model with multiple LoRA adapters."""
- #     # TODO: Implement when quantization + save/restore is ready
- #     pytest.skip("Quantization + save/restore tests not yet implemented")
+ # TODO: Save and restore 2 loras
+
+
+ def _test_adapter_gradient_flow_freeze_base_model(lora_config, tmp_path, rank, size):
+     hidden_size = 1280
+     initialize_for_megatron(tensor_model_parallel_size=size, pipeline_model_parallel_size=1)
+     model = _gpt_model_provider(tp_size=size, hidden_size=hidden_size)
+     prompt_tokens = torch.randint(0, model.vocab_size, (2, model.max_sequence_length)).cuda()
+
+     mtpf.update_model(model, lora_config)
+     model.train()
+
+     # Use a simple forward pass instead of megatron_prefill for the grad check
+     batch_size = prompt_tokens.shape[0]
+     seq_len = prompt_tokens.shape[-1]
+     device = prompt_tokens.device
+
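+     # Causal attention mask: True above the diagonal marks future positions,
+     # shaped [batch, 1, seq_len, seq_len]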
+     attention_mask = (
+         torch.triu(torch.ones((batch_size, seq_len, seq_len), device=device), diagonal=1)
+         .bool()
+         .view(batch_size, 1, seq_len, seq_len)
+     )
+
+     output = model(prompt_tokens, position_ids=None, attention_mask=attention_mask)
+
+     loss = output.sum()
+     loss.backward()
+
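+     # The base model is frozen in this test, so only LoRA adapter parameters should have gradients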
+     for name, module in model.named_modules():
+         if isinstance(module, LoRAModule):
+             if len(module._lora_adapters) == 0:
+                 continue
+             for adapter_name in module._lora_adapters:
+                 lora_a_module = module._lora_adapters[adapter_name]["lora_a"]
+                 lora_b_module = module._lora_adapters[adapter_name]["lora_b"]
+
+                 for param_name, param in lora_a_module.named_parameters():
+                     assert param.grad is not None, f"lora_a.{param_name} in {name} has no gradient"
+                     assert torch.any(param.grad != 0), (
+                         f"lora_a.{param_name} gradient is all zeros in {name}"
+                     )
+
+                 for param_name, param in lora_b_module.named_parameters():
+                     assert param.grad is not None, f"lora_b.{param_name} in {name} has no gradient"
+                     assert torch.any(param.grad != 0), (
+                         f"lora_b.{param_name} gradient is all zeros in {name}"
+                     )
+             assert module.weight.grad is None
+
+
+ @pytest.mark.parametrize("device_count", get_device_counts())
+ @pytest.mark.parametrize(
+     "lora_config",
+     [
+         LARGE_LORA_CFG_RANDOM_INIT_TEST,  # Use random init so gradients flow to both lora_a and lora_b
+     ],
+ )
+ def test_adapter_gradient_flow_freeze_base_model(device_count, lora_config, tmp_path):
+     spawn_multiprocess_job(
+         size=device_count,
+         job=partial(_test_adapter_gradient_flow_freeze_base_model, lora_config, str(tmp_path)),
+         backend="nccl",
+     )
+
+
+ def _test_adapter_gradient_flow_freeze_lora_model(lora_config, tmp_path, rank, size):
+     hidden_size = 1280
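+     # Freeze the LoRA weights and leave the base model trainable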
+     lora_config["freeze_lora_weights"] = True
+     lora_config["freeze_base_model"] = False
+
+     initialize_for_megatron(tensor_model_parallel_size=size, pipeline_model_parallel_size=1)
+     model = _gpt_model_provider(tp_size=size, hidden_size=hidden_size)
+     prompt_tokens = torch.randint(0, model.vocab_size, (2, model.max_sequence_length)).cuda()
+
+     mtpf.update_model(model, lora_config)
+     model.train()
+
+     # Use a simple forward pass instead of megatron_prefill for the grad check
+     batch_size = prompt_tokens.shape[0]
+     seq_len = prompt_tokens.shape[-1]
+     device = prompt_tokens.device
+
+     attention_mask = (
+         torch.triu(torch.ones((batch_size, seq_len, seq_len), device=device), diagonal=1)
+         .bool()
+         .view(batch_size, 1, seq_len, seq_len)
+     )
+
+     output = model(prompt_tokens, position_ids=None, attention_mask=attention_mask)
+
+     loss = output.sum()
+     loss.backward()
+
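+     # LoRA weights are frozen, so adapter parameters must get no gradients while the base weight does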
+     for name, module in model.named_modules():
+         if isinstance(module, LoRAModule):
+             if len(module._lora_adapters) == 0:
+                 continue
+             for adapter_name in module._lora_adapters:
+                 lora_a_module = module._lora_adapters[adapter_name]["lora_a"]
+                 lora_b_module = module._lora_adapters[adapter_name]["lora_b"]
+
+                 for param_name, param in lora_a_module.named_parameters():
+                     assert param.grad is None, f"lora_a.{param_name} in {name} has gradient"
+
+                 for param_name, param in lora_b_module.named_parameters():
+                     assert param.grad is None, f"lora_b.{param_name} in {name} has gradient"
+
+             assert module.weight.grad is not None
+             assert torch.any(module.weight.grad != 0), "weight gradient is all zeros"
+
+
+ @pytest.mark.parametrize("device_count", get_device_counts())
+ @pytest.mark.parametrize(
+     "lora_config",
+     [
+         LARGE_LORA_CFG_RANDOM_INIT_TEST,  # Use random init so gradients flow to both lora_a and lora_b
+     ],
+ )
+ def test_adapter_gradient_flow_freeze_lora_model(device_count, lora_config, tmp_path):
+     spawn_multiprocess_job(
+         size=device_count,
+         job=partial(_test_adapter_gradient_flow_freeze_lora_model, lora_config, str(tmp_path)),
+         backend="nccl",
+     )
+
+
+ def _test_adapter_gradient_flow(lora_config, tmp_path, rank, size):
+     hidden_size = 1280
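+     # Train both the base model and the LoRA adapters (nothing frozen)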
+     lora_config["freeze_lora_weights"] = False
+     lora_config["freeze_base_model"] = False
+
+     initialize_for_megatron(tensor_model_parallel_size=size, pipeline_model_parallel_size=1)
+     model = _gpt_model_provider(tp_size=size, hidden_size=hidden_size)
+     prompt_tokens = torch.randint(0, model.vocab_size, (2, model.max_sequence_length)).cuda()
+
+     mtpf.update_model(model, lora_config)
+     model.train()
+
+     # Use a simple forward pass instead of megatron_prefill for the grad check
+     batch_size = prompt_tokens.shape[0]
+     seq_len = prompt_tokens.shape[-1]
+     device = prompt_tokens.device
+
+     attention_mask = (
+         torch.triu(torch.ones((batch_size, seq_len, seq_len), device=device), diagonal=1)
+         .bool()
+         .view(batch_size, 1, seq_len, seq_len)
+     )
+
+     output = model(prompt_tokens, position_ids=None, attention_mask=attention_mask)
+
+     loss = output.sum()
+     loss.backward()
+
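+     # Nothing is frozen, so both the LoRA adapters and the base weight should receive gradients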
+     for name, module in model.named_modules():
+         if isinstance(module, LoRAModule):
+             if len(module._lora_adapters) == 0:
+                 continue
+             for adapter_name in module._lora_adapters:
+                 lora_a_module = module._lora_adapters[adapter_name]["lora_a"]
+                 lora_b_module = module._lora_adapters[adapter_name]["lora_b"]
+
+                 for param_name, param in lora_a_module.named_parameters():
+                     assert param.grad is not None, f"lora_a.{param_name} in {name} has no gradient"
+                     assert torch.any(param.grad != 0), (
+                         f"lora_a.{param_name} gradient is all zeros in {name}"
+                     )
+
+                 for param_name, param in lora_b_module.named_parameters():
+                     assert param.grad is not None, f"lora_b.{param_name} in {name} has no gradient"
+                     assert torch.any(param.grad != 0), (
+                         f"lora_b.{param_name} gradient is all zeros in {name}"
+                     )
+
+             assert module.weight.grad is not None
+             assert torch.any(module.weight.grad != 0), "weight gradient is all zeros"
+
+
+ @pytest.mark.parametrize("device_count", get_device_counts())
+ @pytest.mark.parametrize(
+     "lora_config",
+     [
+         LARGE_LORA_CFG_RANDOM_INIT_TEST,  # Use random init so gradients flow to both lora_a and lora_b
+     ],
+ )
+ def test_adapter_gradient_flow(device_count, lora_config, tmp_path):
+     spawn_multiprocess_job(
+         size=device_count,
+         job=partial(_test_adapter_gradient_flow, lora_config, str(tmp_path)),
+         backend="nccl",
+     )