
Commit a34da99

support apply wd to qk layernorm
1 parent 876a046 commit a34da99

File tree: 4 files changed, +121 -11 lines changed

megatron/core/optimizer/__init__.py

Lines changed: 36 additions & 11 deletions
@@ -88,9 +88,33 @@ def _matches(param: torch.nn.Parameter, param_name: str, param_key: ParamKey) ->
         return False
 
 
+def _get_no_wd_cond_fn(no_weight_decay_cond_type):
+    """Get the no weight decay condition function."""
+
+    if no_weight_decay_cond_type == 'apply_wd_to_qk_layernorm':
+
+        def no_wd_cond_fn(name, param):
+            if "q_layernorm" in name or "k_layernorm" in name:
+                # Apply weight decay to qk layernorm as a special case.
+                no_wd = False
+            else:
+                no_wd = name.endswith(".bias") or len(param.shape) == 1
+            return no_wd
+
+    elif no_weight_decay_cond_type is None:
+
+        def no_wd_cond_fn(name, param):
+            return name.endswith(".bias") or len(param.shape) == 1
+
+    else:
+        raise ValueError(f"Unknown no_weight_decay_cond_type: {no_weight_decay_cond_type}")
+
+    return no_wd_cond_fn
+
+
 def _get_param_groups(
     model_chunks: List[MegatronModule],
-    config: OptimizerConfig,
+    optimizer_config: OptimizerConfig,
     config_overrides: Optional[Dict[ParamKey, OptimizerConfig]],
 ) -> List[Dict]:
     """Create parameter groups for optimizer.
@@ -100,7 +124,7 @@ def _get_param_groups(
     Args:
         model_chunks (List[MegatronModule]): model chunks to create parameter
             groups for.
-        config (OptimizerConfig): optimizer configuration object.
+        optimizer_config (OptimizerConfig): optimizer configuration object.
         config_overrides (Optional[Dict[LayerKey, OptimizerConfig]): optimizer overrides,
             specified on a per-layer basis.
     Returns:
@@ -119,7 +143,7 @@ def _get_param_groups(
         uses_default_config = False
         # Get optimizer config for this parameter.
         if config_overrides is None:
-            config_for_param = config
+            config_for_param = optimizer_config
             uses_default_config = True
         else:
             config_for_param = None
@@ -129,15 +153,16 @@ def _get_param_groups(
                     break
             # Fall back to default config.
             if config_for_param is None:
-                config_for_param = config
+                config_for_param = optimizer_config
                 uses_default_config = True
 
         is_expert_parallel = not getattr(param, 'allreduce', True)
 
         # TODO: Make sure there is a way to support old no_weight_decay_func functionality
         # and default_skip_embedding_weight_decay:
         # or (default_skip_embedding_weight_decay and "embedding" in name)
-        no_wd = name.endswith(".bias") or len(param.shape) == 1
+        no_wd_cond_fn = _get_no_wd_cond_fn(optimizer_config.no_weight_decay_cond)
+        no_wd = no_wd_cond_fn(name, param)
         if not no_wd:
             wd_mult = 1.0
         else:
@@ -173,12 +198,12 @@ def _get_param_groups(
     for key in params_key:
         wd_mult, is_expert_parallel, _ = key
         params = params_map[key] if key in params_map else []
-        config, uses_default_config = None, True
+        param_config, uses_default_config = None, True
         if key not in configs_map:
             assert params == []
         else:
-            config, uses_default_config = configs_map[key]
-            assert config is not None
+            param_config, uses_default_config = configs_map[key]
+            assert param_config is not None
 
         # TODO: Remove "backwards compatible" fields below eventually.
         param_group = {
@@ -191,9 +216,9 @@ def _get_param_groups(
         }
 
         # Stick relevant fields into param_group from config object.
-        if config is not None:
-            param_group['max_lr'] = config.lr
-            param_group['min_lr'] = config.min_lr
+        if param_config is not None:
+            param_group['max_lr'] = param_config.lr
+            param_group['min_lr'] = param_config.min_lr
         # TODO: Add other relevant arguments (e.g., weight decay, optimizer)
         # here as well.
         param_groups.append(param_group)
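
To make the new selection rule concrete, here is a minimal, self-contained sketch of the condition logic added above, applied to a few illustrative Megatron-style parameter names. The names, shapes, and the standalone re-implementation are for illustration only; the actual function is _get_no_wd_cond_fn in this file.

    # Illustrative sketch only: mirrors the 'apply_wd_to_qk_layernorm' branch above.
    import torch

    def no_wd_cond_fn(name, param):
        if "q_layernorm" in name or "k_layernorm" in name:
            return False  # qk layernorm params keep weight decay under this option.
        return name.endswith(".bias") or len(param.shape) == 1

    # Hypothetical parameter names/shapes, loosely modeled on Megatron naming.
    examples = {
        "decoder.layers.0.self_attention.q_layernorm.weight": torch.ones(64),
        "decoder.layers.0.self_attention.k_layernorm.weight": torch.ones(64),
        "decoder.final_layernorm.weight": torch.ones(64),
        "decoder.layers.0.mlp.linear_fc1.weight": torch.ones(128, 64),
        "decoder.layers.0.mlp.linear_fc1.bias": torch.ones(128),
    }
    for name, param in examples.items():
        print(f"{name}: no_wd={no_wd_cond_fn(name, param)}")
    # The two qk layernorm weights and the 2D linear weight keep weight decay
    # (no_wd=False); the final layernorm weight and the bias skip it (no_wd=True).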

megatron/core/optimizer/optimizer_config.py

Lines changed: 9 additions & 0 deletions
@@ -41,6 +41,15 @@ class OptimizerConfig:
     weight_decay: float = 0.01
     """Weight decay coefficient for L2 regularization."""
 
+    no_weight_decay_cond: Optional[str] = None
+    """Condition for selecting which parameters should not get weight decay.
+    Supported conditions:
+    - None (default): skip weight decay for biases and other 1D parameters
+      (e.g., layernorm weights).
+    - "apply_wd_to_qk_layernorm": additionally apply weight decay to
+      qk layernorm parameters as a special case.
+    """
+
     ##############
     # Precision
     ##############
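
A minimal sketch of setting the new field when building the config programmatically. The import path and the other keyword values mirror the unit test added below; they are illustrative, not requirements of the field itself.

    from megatron.core.optimizer import OptimizerConfig

    # Enable weight decay on qk layernorm via the new config field.
    # Other kwargs are illustrative values mirroring the unit test below.
    optimizer_config = OptimizerConfig(
        optimizer='adam',
        lr=0.01,
        weight_decay=0.01,
        bf16=True,
        use_distributed_optimizer=False,
        no_weight_decay_cond='apply_wd_to_qk_layernorm',
    )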

megatron/training/arguments.py

Lines changed: 5 additions & 0 deletions
@@ -2000,6 +2000,11 @@ def _add_regularization_args(parser):
     group.add_argument('--weight-decay-incr-style', type=str, default='constant',
                        choices=['constant', 'linear', 'cosine'],
                        help='Weight decay increment function.')
+    group.add_argument('--no-weight-decay-cond-type', type=str, choices=['apply_wd_to_qk_layernorm'],
+                       help='Type of no weight decay condition. Choices: '
+                            'None (default): skip weight decay for biases and 1D parameters. '
+                            '"apply_wd_to_qk_layernorm": additionally apply weight decay to '
+                            'qk layernorm as a special case.')
     group.add_argument('--clip-grad', type=float, default=1.0,
                        help='Gradient clipping based on global L2 norm.')
     group.add_argument('--adam-beta1', type=float, default=0.9,
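
For reference, a standalone sketch of the argparse pattern the new flag follows. This is not the actual Megatron parser, and how the parsed value is wired into OptimizerConfig.no_weight_decay_cond is not shown in this diff.

    import argparse

    # Standalone sketch mirroring the new --no-weight-decay-cond-type flag.
    parser = argparse.ArgumentParser()
    group = parser.add_argument_group(title='regularization')
    group.add_argument('--no-weight-decay-cond-type', type=str,
                       choices=['apply_wd_to_qk_layernorm'],
                       help='Type of no weight decay condition.')

    args = parser.parse_args(['--no-weight-decay-cond-type', 'apply_wd_to_qk_layernorm'])
    print(args.no_weight_decay_cond_type)  # -> 'apply_wd_to_qk_layernorm'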

tests/unit_tests/test_optimizer.py

Lines changed: 71 additions & 0 deletions
@@ -598,3 +598,74 @@ def test_get_megatron_optimizer_custom_process_groups_validation():
             use_gloo_process_groups=True,  # Should be False when using custom groups
             pg_collection=pg_collection_complete,
         )
+
+
+class QKLayerNormModel(nn.Module):
+    """A model with q_layernorm, k_layernorm, a regular layernorm, and bias parameters,
+    used to test the 'apply_wd_to_qk_layernorm' no_weight_decay_cond option.
+    """
+
+    def __init__(self, hidden_size=64):
+        super().__init__()
+        # q_layernorm and k_layernorm should have wd_mult=1.0 when apply_wd_to_qk_layernorm is set.
+        self.q_layernorm = nn.LayerNorm(hidden_size, bias=True)
+        self.k_layernorm = nn.LayerNorm(hidden_size, bias=True)
+        # Regular layernorm should have wd_mult=0.0 (1D params).
+        self.regular_layernorm = nn.LayerNorm(hidden_size, bias=False)
+        # Linear layer: weight should have wd_mult=1.0, bias should have wd_mult=0.0.
+        self.linear = nn.Linear(hidden_size, hidden_size, bias=True)
+
+
+def test_no_weight_decay_cond_apply_wd_to_qk_layernorm():
+    """
+    Test that no_weight_decay_cond='apply_wd_to_qk_layernorm' correctly assigns
+    wd_mult=1.0 to q_layernorm and k_layernorm parameters while other 1D params
+    (biases, regular layernorm) get wd_mult=0.0.
+
+    This test uses get_megatron_optimizer to build an optimizer and then checks
+    the param_groups to verify the wd_mult assignment.
+    """
+    world = int(os.getenv('WORLD_SIZE', '1'))
+    rank = int(os.getenv('RANK', '0'))
+    _init_distributed(world, rank)
+    Utils.initialize_model_parallel()
+
+    # Create a model with q_layernorm, k_layernorm, and a regular layernorm.
+    model = QKLayerNormModel(hidden_size=64).bfloat16().cuda()
+    model.requires_grad_(True)
+
+    ddp_config = DistributedDataParallelConfig(use_distributed_optimizer=True)
+    model = DistributedDataParallel(
+        TransformerConfig(num_attention_heads=1, num_layers=1), ddp_config, model
+    )
+
+    # Create an optimizer config with no_weight_decay_cond='apply_wd_to_qk_layernorm'.
+    optimizer_config = OptimizerConfig(
+        optimizer='adam',
+        lr=0.01,
+        bf16=True,
+        use_distributed_optimizer=False,
+        no_weight_decay_cond='apply_wd_to_qk_layernorm',
+    )
+
+    # Build the optimizer.
+    optim = get_megatron_optimizer(optimizer_config, [model])
+
+    # Count params by wd_mult.
+    wd_mult_1_count = 0  # Params with weight decay.
+    wd_mult_0_count = 0  # Params without weight decay.
+
+    for group in optim.param_groups:
+        wd_mult = group['wd_mult']
+        num_params = len(group['params'])
+        if wd_mult == 1.0:
+            wd_mult_1_count += num_params
+        else:
+            wd_mult_0_count += num_params
+
+    # Expected:
+    # wd_mult=1.0: q_layernorm.weight, q_layernorm.bias, k_layernorm.weight,
+    #              k_layernorm.bias, linear.weight = 5 params
+    # wd_mult=0.0: regular_layernorm.weight, linear.bias = 2 params
+    assert wd_mult_1_count == 5, f"Expected 5 params with wd_mult=1.0, but got {wd_mult_1_count}"
+    assert wd_mult_0_count == 2, f"Expected 2 params with wd_mult=0.0, but got {wd_mult_0_count}"
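
The expected counts in the asserts can be sanity-checked without any distributed setup by applying the same condition logic directly to the toy model's parameters. A sketch, assuming only a recent PyTorch (one whose nn.LayerNorm accepts the bias argument) is available:

    import torch.nn as nn

    # Same toy model as the test, checked directly against the condition logic.
    class ToyQKLayerNormModel(nn.Module):
        def __init__(self, hidden_size=64):
            super().__init__()
            self.q_layernorm = nn.LayerNorm(hidden_size, bias=True)
            self.k_layernorm = nn.LayerNorm(hidden_size, bias=True)
            self.regular_layernorm = nn.LayerNorm(hidden_size, bias=False)
            self.linear = nn.Linear(hidden_size, hidden_size, bias=True)

    def no_wd(name, param):
        if "q_layernorm" in name or "k_layernorm" in name:
            return False
        return name.endswith(".bias") or len(param.shape) == 1

    model = ToyQKLayerNormModel()
    with_wd = [n for n, p in model.named_parameters() if not no_wd(n, p)]
    without_wd = [n for n, p in model.named_parameters() if no_wd(n, p)]
    assert len(with_wd) == 5     # q/k layernorm weight+bias, linear.weight
    assert len(without_wd) == 2  # regular_layernorm.weight, linear.bias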
