# See the License for the specific language governing permissions and
# limitations under the License.

+import torch

from megatron.bridge.models.ministral3.ministral3_provider import (
    Ministral3ModelProvider,
@@ -172,3 +173,175 @@ def test_ministral3_14b_initialization(self):
        assert provider.ffn_hidden_size == 16384
        assert provider.num_layers == 40
        assert provider.rotary_base == 1000000000.0
+
+
+class TestGetLlama4AttnScale:
+    """Test cases for the _get_llama_4_attn_scale function used in MinistralTEDotProductAttention.
+
+    This function computes attention scaling based on Llama 4 attention parameters.
+    The key change in PR 1997 is that it now handles different query shapes for
+    packed (3D) vs unpacked (4D) tensors.
+    """
+
+    def _get_llama_4_attn_scale(
+        self, positions_ids: torch.Tensor, beta: float, max_position_embeddings: int, query_shape: tuple
+    ) -> torch.Tensor:
+        """Reimplementation of the function for testing."""
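+        # Scaling follows 1 + beta * log(1 + floor(position / max_position_embeddings)),
+        # so positions inside the first max_position_embeddings window get a scale of 1.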
+        scaling = 1 + beta * torch.log(1 + torch.floor(positions_ids / max_position_embeddings))
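+        # Append one trailing singleton dim per query dim beyond the leading sequence
+        # dim so the [seq_len] scaling broadcasts against the query tensor.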
+        num_dims_to_add = len(query_shape) - 1
+        for _ in range(num_dims_to_add):
+            scaling = scaling.unsqueeze(-1)
+        return scaling
+
+    def test_unpacked_4d_query_shape(self):
+        """Test attention scaling with unpacked 4D query shape [seq_len, batch, num_heads, head_dim]."""
+        seq_len = 8
+        batch_size = 2
+        num_heads = 4
+        head_dim = 64
+
+        positions_ids = torch.arange(seq_len)
+        beta = 0.1
+        max_position_embeddings = 16384
+        query_shape = (seq_len, batch_size, num_heads, head_dim)
+
+        scaling = self._get_llama_4_attn_scale(positions_ids, beta, max_position_embeddings, query_shape)
+
+        # Output should have shape [seq_len, 1, 1, 1] for broadcasting
+        assert scaling.shape == (seq_len, 1, 1, 1)
+
+        # First position should have scaling = 1 (since log(1 + 0) = 0)
+        expected_first = 1 + beta * torch.log(torch.tensor(1.0))
+        assert torch.isclose(scaling[0, 0, 0, 0], expected_first, atol=1e-6)
+
+    def test_packed_3d_query_shape(self):
+        """Test attention scaling with packed 3D query shape [seq_len, num_heads, head_dim]."""
+        seq_len = 16
+        num_heads = 8
+        head_dim = 32
+
+        positions_ids = torch.arange(seq_len)
+        beta = 0.2
+        max_position_embeddings = 8192
+        query_shape = (seq_len, num_heads, head_dim)
+
+        scaling = self._get_llama_4_attn_scale(positions_ids, beta, max_position_embeddings, query_shape)
+
+        # Output should have shape [seq_len, 1, 1] for broadcasting (3D - 1 = 2 dims added)
+        assert scaling.shape == (seq_len, 1, 1)
+
+        # Verify scaling values are computed correctly
+        expected = 1 + beta * torch.log(1 + torch.floor(positions_ids / max_position_embeddings))
+        assert torch.allclose(scaling.squeeze(), expected, atol=1e-6)
+
+    def test_scaling_formula_correctness(self):
+        """Test that the scaling formula matches expected Llama 4 attention scaling."""
+        positions_ids = torch.tensor([0, 1, 100, 1000, 16384, 32768])
+        beta = 0.15
+        max_position_embeddings = 16384
+        query_shape = (6, 1, 1, 1)
+
+        scaling = self._get_llama_4_attn_scale(positions_ids, beta, max_position_embeddings, query_shape)
+
+        # Manual computation of expected values
+        # For position 0: 1 + 0.15 * log(1 + 0) = 1
+        # For position 16384: 1 + 0.15 * log(1 + 1) = 1 + 0.15 * log(2)
+        # For position 32768: 1 + 0.15 * log(1 + 2) = 1 + 0.15 * log(3)
+
+        expected_0 = 1.0
+        expected_16384 = 1 + beta * torch.log(torch.tensor(2.0))
+        expected_32768 = 1 + beta * torch.log(torch.tensor(3.0))
+
+        assert torch.isclose(scaling[0].squeeze(), torch.tensor(expected_0), atol=1e-6)
+        assert torch.isclose(scaling[4].squeeze(), expected_16384, atol=1e-6)
+        assert torch.isclose(scaling[5].squeeze(), expected_32768, atol=1e-6)
+
+    def test_beta_zero_returns_ones(self):
+        """Test that beta=0 returns all ones (no scaling)."""
+        positions_ids = torch.arange(10)
+        beta = 0.0
+        max_position_embeddings = 4096
+        query_shape = (10, 4, 64)
+
+        scaling = self._get_llama_4_attn_scale(positions_ids, beta, max_position_embeddings, query_shape)
+
+        assert torch.allclose(scaling.squeeze(), torch.ones(10), atol=1e-6)
+
+    def test_different_query_shapes_get_correct_dims(self):
+        """Test that different query shapes result in correct number of dimensions added."""
+        positions_ids = torch.arange(4)
+        beta = 0.1
+        max_position_embeddings = 1000
+
+        # 2D query shape
+        query_shape_2d = (4, 32)
+        scaling_2d = self._get_llama_4_attn_scale(positions_ids, beta, max_position_embeddings, query_shape_2d)
+        assert scaling_2d.shape == (4, 1)  # 2-1 = 1 dim added
+
+        # 3D query shape (packed THD)
+        query_shape_3d = (4, 8, 32)
+        scaling_3d = self._get_llama_4_attn_scale(positions_ids, beta, max_position_embeddings, query_shape_3d)
+        assert scaling_3d.shape == (4, 1, 1)  # 3-1 = 2 dims added
+
+        # 4D query shape (unpacked SBHD, matching the seq-first layout used above)
+        query_shape_4d = (4, 2, 8, 32)
+        scaling_4d = self._get_llama_4_attn_scale(positions_ids, beta, max_position_embeddings, query_shape_4d)
+        assert scaling_4d.shape == (4, 1, 1, 1)  # 4-1 = 3 dims added
+
+    def test_broadcasting_compatibility(self):
+        """Test that scaling tensor is broadcastable to query tensor."""
+        seq_len = 8
+        num_heads = 4
+        head_dim = 64
+
+        positions_ids = torch.arange(seq_len)
+        beta = 0.1
+        max_position_embeddings = 16384
+
+        # Test for 3D packed format
+        query_3d = torch.randn(seq_len, num_heads, head_dim)
+        scaling_3d = self._get_llama_4_attn_scale(positions_ids, beta, max_position_embeddings, query_3d.shape)
+
+        # Broadcasting should work
+        result_3d = query_3d * scaling_3d.to(query_3d.dtype)
+        assert result_3d.shape == query_3d.shape
+
+        # Test for 4D unpacked format
+        batch = 2
+        query_4d = torch.randn(seq_len, batch, num_heads, head_dim)
+        scaling_4d = self._get_llama_4_attn_scale(positions_ids, beta, max_position_embeddings, query_4d.shape)
+
+        # Broadcasting should work
+        result_4d = query_4d * scaling_4d.to(query_4d.dtype)
+        assert result_4d.shape == query_4d.shape
+
+    def test_gpu_tensor_support(self):
+        """Test that the function works with GPU tensors if available."""
+        if not torch.cuda.is_available():
+            return  # Skip test if no GPU
+
+        positions_ids = torch.arange(8, device="cuda")
+        beta = 0.1
+        max_position_embeddings = 1024
+        query_shape = (8, 4, 32)
+
+        scaling = self._get_llama_4_attn_scale(positions_ids, beta, max_position_embeddings, query_shape)
+
+        assert scaling.device.type == "cuda"
+        assert scaling.shape == (8, 1, 1)
+
+    def test_dtype_preservation(self):
+        """Test that output dtype matches input positions_ids dtype."""
+        positions_ids_float32 = torch.arange(4, dtype=torch.float32)
+        positions_ids_float64 = torch.arange(4, dtype=torch.float64)
+        beta = 0.1
+        max_position_embeddings = 100
+        query_shape = (4, 2, 8)
+
+        scaling_32 = self._get_llama_4_attn_scale(positions_ids_float32, beta, max_position_embeddings, query_shape)
+        scaling_64 = self._get_llama_4_attn_scale(positions_ids_float64, beta, max_position_embeddings, query_shape)
+
+        # The floating-point dtype of positions_ids propagates through the division,
+        # floor, and log operations, so the scaling keeps the input dtype.
+        assert scaling_32.dtype == torch.float32
+        assert scaling_64.dtype == torch.float64