Skip to content

Commit 2575c6d

Browse files
support GDN packed sequence
1 parent 02ea26d commit 2575c6d

File tree

2 files changed

+216
-65
lines changed

2 files changed

+216
-65
lines changed

megatron/core/ssm/gated_delta_net.py

Lines changed: 152 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -99,8 +99,11 @@ def __init__(
9999
cp_comm_type: No use for GDN, just for compatibility with Attention class.
100100
"""
101101

102-
if not HAVE_FLA:
103-
raise ImportError("FLA is not installed. Please install it with `pip install fla`.")
102+
if not HAVE_FLA and not self.config.deterministic_mode:
103+
raise ImportError(
104+
"FLA is not installed. Please install it with "
105+
"`pip install fla` or use deterministic mode."
106+
)
104107

105108
super().__init__(config)
106109

@@ -304,28 +307,62 @@ def forward(
304307
raise NotImplementedError("GDN does not support inference for now.")
305308

306309
if packed_seq_params is not None:
307-
# TODO: support packed sequence
308-
raise NotImplementedError("GDN does not support packed sequence for now.")
310+
assert batch == 1, "Packed sequence expects batch dimension to be 1"
311+
# Prefer cu_seqlens_q_padded if available, otherwise use cu_seqlens_q
312+
cu_seqlens_q = packed_seq_params.cu_seqlens_q_padded or packed_seq_params.cu_seqlens_q
313+
# Prefer cu_seqlens_kv_padded if available, otherwise use cu_seqlens_kv
314+
cu_seqlens_kv = (
315+
packed_seq_params.cu_seqlens_kv_padded or packed_seq_params.cu_seqlens_kv
316+
)
317+
assert torch.equal(cu_seqlens_q, cu_seqlens_kv), (
318+
"Currently only support cu_seqlens_q equals to cu_seqlens_kv, "
319+
f"but got {cu_seqlens_q=} and {cu_seqlens_kv=}"
320+
)
321+
num_packed_seqs = cu_seqlens_q.shape[0] - 1
322+
assert num_packed_seqs > 0, (
323+
"Number of packed sequences must be greater than 0, "
324+
f"but got {cu_seqlens_q=} and {cu_seqlens_kv=}"
325+
)
309326

310327
# Input projection
311328
nvtx_range_push(suffix="in_proj")
312329
qkvzba, _ = self.in_proj(hidden_states)
313330
nvtx_range_pop(suffix="in_proj")
314331

315332
# CP All to All: CP to HP
316-
qkvzba = self.cp.tensor_a2a_cp2hp(
317-
qkvzba,
318-
seq_dim=0,
319-
head_dim=-1,
320-
split_size_or_sections=[
321-
self.qk_dim_local_tp,
322-
self.qk_dim_local_tp,
323-
self.v_dim_local_tp,
324-
self.v_dim_local_tp,
325-
self.num_value_heads // self.tp_size,
326-
self.num_value_heads // self.tp_size,
327-
],
328-
)
333+
if packed_seq_params is not None:
334+
unpacked_qkvzba = _unpack_sequence(qkvzba, cu_seqlens_q // self.cp_size, dim=0)
335+
outputs = []
336+
for qkvzba_i in unpacked_qkvzba:
337+
qkvzba_i = self.cp.tensor_a2a_cp2hp(
338+
qkvzba_i,
339+
seq_dim=0,
340+
head_dim=-1,
341+
split_size_or_sections=[
342+
self.qk_dim_local_tp,
343+
self.qk_dim_local_tp,
344+
self.v_dim_local_tp,
345+
self.v_dim_local_tp,
346+
self.num_value_heads // self.tp_size,
347+
self.num_value_heads // self.tp_size,
348+
],
349+
)
350+
outputs.append(qkvzba_i)
351+
qkvzba = torch.cat(outputs, dim=0)
352+
else:
353+
qkvzba = self.cp.tensor_a2a_cp2hp(
354+
qkvzba,
355+
seq_dim=0,
356+
head_dim=-1,
357+
split_size_or_sections=[
358+
self.qk_dim_local_tp,
359+
self.qk_dim_local_tp,
360+
self.v_dim_local_tp,
361+
self.v_dim_local_tp,
362+
self.num_value_heads // self.tp_size,
363+
self.num_value_heads // self.tp_size,
364+
],
365+
)
329366

330367
# Transpose: s b x --> b s x
331368
# From sbhd to bshd format
@@ -347,45 +384,18 @@ def forward(
347384
alpha = alpha.reshape(batch, seq_len, -1)
348385

349386
# Convolution on qkv
350-
qkv = qkv.transpose(1, 2).contiguous() # b, s, d -> b, d, s
351387
nvtx_range_push(suffix="conv1d")
352-
qkv_channels_split_sections = [
353-
self.qk_dim_local_tp,
354-
self.qk_dim_local_tp,
355-
self.v_dim_local_tp,
356-
]
357-
conv1d_weight = self.cp.get_parameter_local_cp(
358-
self.conv1d.weight, dim=0, split_size_or_sections=qkv_channels_split_sections
359-
)
360-
conv1d_bias = (
361-
self.cp.get_parameter_local_cp(
362-
self.conv1d.bias, dim=0, split_size_or_sections=qkv_channels_split_sections
363-
)
364-
if self.conv_bias
365-
else None
366-
)
367-
if (causal_conv1d_fn is None) or self.config.deterministic_mode:
368-
conv_out = F.conv1d(
369-
input=qkv,
370-
weight=conv1d_weight,
371-
bias=conv1d_bias,
372-
stride=self.conv1d.stride,
373-
padding=self.conv1d.padding,
374-
dilation=self.conv1d.dilation,
375-
groups=self.conv_dim_local_tp // self.cp_size,
376-
)
377-
qkv = self.act_fn(conv_out[..., :seq_len])
388+
if packed_seq_params is not None:
389+
unpacked_qkv = _unpack_sequence(qkv, cu_seqlens_q)
390+
outputs = []
391+
for qkv_i in unpacked_qkv:
392+
qkv_i = self._conv1d_on_qkv(qkv_i)
393+
outputs.append(qkv_i)
394+
qkv = torch.cat(outputs, dim=1)
378395
else:
379-
assert self.activation in ["silu", "swish"]
380-
qkv = causal_conv1d_fn(
381-
x=qkv,
382-
weight=conv1d_weight.squeeze(1), # d, 1, w -> d, w
383-
bias=conv1d_bias,
384-
activation=self.activation,
385-
)
396+
qkv = self._conv1d_on_qkv(qkv)
386397
nvtx_range_pop(suffix="conv1d")
387398
# Split qkv into query, key, and value
388-
qkv = qkv.transpose(1, 2) # b, d, s -> b, s, d
389399
query, key, value = torch.split(
390400
qkv,
391401
[
@@ -424,18 +434,36 @@ def forward(
424434

425435
nvtx_range_push(suffix="gated_delta_rule")
426436
if self.config.deterministic_mode:
427-
core_attn_out, last_recurrent_state = torch_chunk_gated_delta_rule(
428-
query,
429-
key,
430-
value,
431-
g=g,
432-
beta=beta,
433-
initial_state=None,
434-
output_final_state=False,
435-
use_qk_l2norm_in_kernel=False,
436-
)
437+
gated_delta_rule_fn = torch_chunk_gated_delta_rule
437438
else:
438-
core_attn_out, last_recurrent_state = chunk_gated_delta_rule(
439+
gated_delta_rule_fn = chunk_gated_delta_rule
440+
441+
if packed_seq_params is not None:
442+
# Packed sequence forward pass (THD format)
443+
query = _unpack_sequence(query, cu_seqlens_q)
444+
key = _unpack_sequence(key, cu_seqlens_kv)
445+
value = _unpack_sequence(value, cu_seqlens_kv)
446+
g = _unpack_sequence(g, cu_seqlens_q)
447+
beta = _unpack_sequence(beta, cu_seqlens_q)
448+
449+
outputs = []
450+
for i, (q_i, k_i, v_i, g_i, beta_i) in enumerate(zip(query, key, value, g, beta)):
451+
out_i, last_recurrent_state = gated_delta_rule_fn(
452+
q_i,
453+
k_i,
454+
v_i,
455+
g=g_i,
456+
beta=beta_i,
457+
initial_state=None,
458+
output_final_state=False,
459+
use_qk_l2norm_in_kernel=False,
460+
)
461+
outputs.append(out_i)
462+
463+
core_attn_out = torch.cat(outputs, dim=1)
464+
else:
465+
# Regular forward pass (BSHD format)
466+
core_attn_out, last_recurrent_state = gated_delta_rule_fn(
439467
query,
440468
key,
441469
value,
@@ -458,7 +486,15 @@ def forward(
458486
norm_out = norm_out.transpose(0, 1).contiguous()
459487

460488
# CP all to all: HP to CP
461-
norm_out = self.cp.tensor_a2a_hp2cp(norm_out, seq_dim=0, head_dim=-1)
489+
if packed_seq_params is not None:
490+
unpacked_norm_out = _unpack_sequence(norm_out, cu_seqlens_q, dim=0)
491+
outputs = []
492+
for norm_out_i in unpacked_norm_out:
493+
norm_out_i = self.cp.tensor_a2a_hp2cp(norm_out_i, seq_dim=0, head_dim=-1)
494+
outputs.append(norm_out_i)
495+
norm_out = torch.cat(outputs, dim=0)
496+
else:
497+
norm_out = self.cp.tensor_a2a_hp2cp(norm_out, seq_dim=0, head_dim=-1)
462498

463499
# Output projection
464500
nvtx_range_push(suffix="out_proj")
@@ -467,6 +503,47 @@ def forward(
467503

468504
return out, out_bias
469505

506+
def _conv1d_on_qkv(self, qkv):
    """Apply the depthwise causal convolution + activation to packed qkv.

    Args:
        qkv: Tensor in (batch, seq, channels) layout holding the
            concatenated query/key/value channels local to this TP rank.

    Returns:
        Tensor in (batch, seq, channels) layout after convolution and
        activation.
    """
    # conv1d expects channels-first, so work in (b, d, s) layout.
    x = qkv.transpose(1, 2).contiguous()
    original_seq_len = x.shape[2]

    # Slice the shared convolution parameters down to this CP rank's channels.
    split_sections = [
        self.qk_dim_local_tp,
        self.qk_dim_local_tp,
        self.v_dim_local_tp,
    ]
    weight = self.cp.get_parameter_local_cp(
        self.conv1d.weight, dim=0, split_size_or_sections=split_sections
    )
    if self.conv_bias:
        bias = self.cp.get_parameter_local_cp(
            self.conv1d.bias, dim=0, split_size_or_sections=split_sections
        )
    else:
        bias = None

    if (causal_conv1d_fn is None) or self.config.deterministic_mode:
        # Fallback path: plain F.conv1d, then crop the causal padding tail
        # back to the input length and apply the activation ourselves.
        conv_out = F.conv1d(
            input=x,
            weight=weight,
            bias=bias,
            stride=self.conv1d.stride,
            padding=self.conv1d.padding,
            dilation=self.conv1d.dilation,
            groups=self.conv_dim_local_tp // self.cp_size,
        )
        x = self.act_fn(conv_out[..., :original_seq_len])
    else:
        # Fused kernel path: the kernel applies the activation itself.
        assert self.activation in ["silu", "swish"]
        x = causal_conv1d_fn(
            x=x,
            weight=weight.squeeze(1),  # d, 1, w -> d, w
            bias=bias,
            activation=self.activation,
        )

    # Restore (b, s, d) layout for the caller.
    return x.transpose(1, 2)
546+
470547
@jit_fuser
471548
def _apply_gated_norm(self, x, gate):
472549
# Output Norm
@@ -564,6 +641,17 @@ def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=None, tp_gr
564641
return sharded_state_dict
565642

566643

644+
def _unpack_sequence(x, cu_seqlens, dim=1):
645+
unpacked_x = []
646+
num_seqs = cu_seqlens.shape[0] - 1
647+
for i in range(num_seqs):
648+
idx_start = cu_seqlens[i].item()
649+
idx_end = cu_seqlens[i + 1].item()
650+
chunked_index = [slice(None)] * dim + [slice(idx_start, idx_end)]
651+
unpacked_x.append(x[chunked_index])
652+
return unpacked_x
653+
654+
567655
def _split_tensor_factory(
568656
orig_sh_ten: ShardedTensor, split_sections: List[int], split_names: List[str], split_dim: int
569657
) -> ShardedTensorFactory:

tests/unit_tests/ssm/test_gated_delta_net.py

Lines changed: 64 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
)
3030
from tests.unit_tests.test_utilities import Utils
3131
from tests.unit_tests.transformer.test_attention import _test_parallel_attention_correctness
32+
from tests.unit_tests.transformer.test_multi_latent_attention import make_test_packed_seq_params
3233

3334
try:
3435
import fla
@@ -132,6 +133,68 @@ def test_gpu_forward(self):
132133
output.dtype == hidden_states.dtype
133134
), f"Output dtype {output.dtype=} mismatch with {hidden_states.dtype=}"
134135

136+
def test_gpu_forward_thd(self):
    """Smoke-test the packed-sequence (THD) forward path output shape."""
    seq_len = 32
    batch_size = 4
    cu_seqlens = [0, 32, 64, 96, 128]
    hidden_size = self.gdn.config.hidden_size
    # Per-rank slice of the sequence dimension (split across CP and SP).
    local_seq_len = seq_len // self.cp_size // self.sp_size

    # Build an sbhd input [s, b, h], then flatten it to thd [s*b, 1, h].
    hidden_sbhd = torch.rand((local_seq_len, batch_size, hidden_size))
    hidden_sbhd = hidden_sbhd.cuda().bfloat16()
    hidden_thd = (
        hidden_sbhd.transpose(0, 1).contiguous().view(-1, 1, hidden_size)
    )
    packed_seq_params = make_test_packed_seq_params(cu_seqlens=cu_seqlens)

    output, _ = self.gdn(hidden_thd, None, packed_seq_params=packed_seq_params)

    assert output.shape[0] == local_seq_len * batch_size
    assert output.shape[1] == 1
    assert output.shape[2] == hidden_size
158+
159+
def test_gpu_forward_thd_correctness(self):
    """Check the THD (packed) forward matches the SBHD forward numerically."""
    if self.sp_size > 1:
        pytest.skip("Sequence parallel is not supported for this test case.")

    atol = 3e-4
    rtol = 3e-4

    seq_len = 32
    batch_size = 4
    cu_seqlens = [0, 32, 64, 96, 128]
    hidden_size = self.gdn.config.hidden_size
    # Per-CP-rank slice of the sequence dimension.
    local_seq_len = seq_len // self.cp_size

    # sbhd input [s, b, h]; the thd input is the same data reshaped
    # to [s * b, 1, h] so both runs see identical values.
    hidden_sbhd = torch.rand((local_seq_len, batch_size, hidden_size))
    hidden_sbhd = hidden_sbhd.cuda().bfloat16()
    hidden_thd = (
        hidden_sbhd.transpose(0, 1).contiguous().view(-1, 1, hidden_size)
    )
    packed_seq_params = make_test_packed_seq_params(cu_seqlens=cu_seqlens)

    # Run the regular (SBHD) path and the packed (THD) path.
    out_sbhd, _ = self.gdn(hidden_sbhd, None)
    out_thd, _ = self.gdn(
        hidden_thd, None, packed_seq_params=packed_seq_params
    )

    # Bring the SBHD output into THD layout before comparing.
    out_sbhd_as_thd = out_sbhd.transpose(0, 1).contiguous().view(*out_thd.shape)
    rank = torch.distributed.get_rank()
    torch.testing.assert_close(
        out_sbhd_as_thd,
        out_thd,
        atol=atol,
        rtol=rtol,
        msg=lambda msg: f"Output mismatch ({rank=}): {msg}",
    )
197+
135198

136199
@pytest.mark.parametrize(
137200
("tp", "sp", "cp"),
@@ -146,7 +209,7 @@ def test_gpu_forward(self):
146209
@pytest.mark.skipif(not HAVE_FLA, reason="FLA is not installed.")
147210
def test_parallel_gated_delta_net_correctness(tmp_path_dist_ckpt, tp, sp, cp):
148211
transformer_config = TransformerConfig(
149-
hidden_size=hidden_size,
212+
hidden_size=128,
150213
linear_conv_kernel_dim=2,
151214
linear_key_head_dim=32,
152215
linear_value_head_dim=32,

0 commit comments

Comments
 (0)