@@ -408,3 +408,391 @@ def test_init_weights(self, mock_trunc_normal):

        # Check that norm reset_parameters was called
        assert mock_norm.reset_parameters.call_count >= 2  # q_a_layernorm, kv_a_layernorm


class TestDeepseekV32IndexerInitWeights:
    def create_mock_config(self, **overrides):
        config = Mock(spec=DeepseekV32Config)
        config.num_attention_heads = 8
        config.hidden_size = 256
        config.q_lora_rank = 128
        config.index_n_heads = 4
        config.index_head_dim = 32
        config.index_topk = 16
        config.qk_rope_head_dim = 16

        for key, value in overrides.items():
            setattr(config, key, value)
        return config

    @patch("torch.nn.init.trunc_normal_")
    @patch("nemo_automodel.components.models.deepseek_v32.layers.initialize_linear_module")
    def test_indexer_init_weights(self, mock_init_linear, mock_trunc_normal):
        """Test Indexer weight initialization directly."""
        config = self.create_mock_config()
        backend = BackendConfig(attn="sdpa", linear="torch", rms_norm="torch")

        mock_linear = Mock()
        mock_linear.weight = torch.randn(64, 256)
        mock_init_linear.return_value = mock_linear

        indexer = DeepseekV32Indexer(config, backend)
        indexer.init_weights(init_std=0.02)

        # Should call trunc_normal_ for wq_b, wk, weights_proj (3 linear layers)
        assert mock_trunc_normal.call_count == 3


class TestDeepseekV32IndexerForward:
    def create_mock_config(self, **overrides):
        config = Mock(spec=DeepseekV32Config)
        config.num_attention_heads = 8
        config.hidden_size = 64
        config.q_lora_rank = 32
        config.index_n_heads = 4
        config.index_head_dim = 16
        config.index_topk = 8
        config.qk_rope_head_dim = 8

        for key, value in overrides.items():
            setattr(config, key, value)
        return config

    @skip_if_no_gpu
    def test_indexer_forward_bshd(self):
        """Test Indexer forward pass with bshd format."""
        config = self.create_mock_config()
        backend = BackendConfig(attn="sdpa", linear="torch", rms_norm="torch")

        indexer = DeepseekV32Indexer(config, backend).cuda().to(torch.bfloat16)

        bsz, seq_len = 2, 16
        x = torch.randn(bsz, seq_len, config.hidden_size, device="cuda", dtype=torch.bfloat16)
        q_resid = torch.randn(bsz, seq_len, config.q_lora_rank, device="cuda", dtype=torch.bfloat16)
        # Create complex freqs_cis for bshd format [B, T, D/2] as complex tensor
        angles = torch.randn(bsz, seq_len, config.qk_rope_head_dim // 2, device="cuda", dtype=torch.float32)
        freqs_cis = torch.polar(torch.ones_like(angles), angles)
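        # torch.polar(abs, angle) returns abs * exp(1j * angle), so this yields unit-magnitude rotary phases.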

        topk_indices = indexer(x, q_resid, freqs_cis)

        assert topk_indices.shape == (bsz, seq_len, config.index_topk)
        assert topk_indices.dtype == torch.int64

    @pytest.mark.skip(reason="thd format requires complex freqs_cis setup matching model runtime")
    @skip_if_no_gpu
    def test_indexer_forward_thd(self):
        """Test Indexer forward pass with thd format."""
        config = self.create_mock_config()
        backend = BackendConfig(attn="sdpa", linear="torch", rms_norm="torch")

        indexer = DeepseekV32Indexer(config, backend).cuda().to(torch.bfloat16)

        num_tokens = 32
        x = torch.randn(num_tokens, config.hidden_size, device="cuda", dtype=torch.bfloat16)
        q_resid = torch.randn(num_tokens, config.q_lora_rank, device="cuda", dtype=torch.bfloat16)
        # Create complex freqs_cis for thd format [T, D/2] as complex tensor
        angles = torch.randn(num_tokens, config.qk_rope_head_dim // 2, device="cuda", dtype=torch.float32)
        freqs_cis = torch.polar(torch.ones_like(angles), angles)

        topk_indices = indexer(x, q_resid, freqs_cis)

        assert topk_indices.shape == (num_tokens, config.index_topk)
        assert topk_indices.dtype == torch.int64

    @skip_if_no_gpu
    def test_indexer_forward_with_attention_mask_bshd(self):
        """Test Indexer forward pass with attention mask in bshd format."""
        config = self.create_mock_config()
        backend = BackendConfig(attn="sdpa", linear="torch", rms_norm="torch")

        indexer = DeepseekV32Indexer(config, backend).cuda().to(torch.bfloat16)

        bsz, seq_len = 2, 16
        x = torch.randn(bsz, seq_len, config.hidden_size, device="cuda", dtype=torch.bfloat16)
        q_resid = torch.randn(bsz, seq_len, config.q_lora_rank, device="cuda", dtype=torch.bfloat16)
        # Create complex freqs_cis for bshd format [B, T, D/2] as complex tensor
        angles = torch.randn(bsz, seq_len, config.qk_rope_head_dim // 2, device="cuda", dtype=torch.float32)
        freqs_cis = torch.polar(torch.ones_like(angles), angles)

        # Create causal mask
        attention_mask = torch.triu(
            torch.full((1, 1, seq_len, seq_len), float("-inf"), device="cuda"),
            diagonal=1,
        )
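        # With diagonal=1, torch.triu keeps -inf only strictly above the diagonal,
        # so future positions are masked while the diagonal and past positions stay 0.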

        topk_indices = indexer(x, q_resid, freqs_cis, attention_mask=attention_mask)

        assert topk_indices.shape == (bsz, seq_len, config.index_topk)

    @pytest.mark.skip(reason="thd format requires complex freqs_cis setup matching model runtime")
    @skip_if_no_gpu
    def test_indexer_forward_with_attention_mask_thd(self):
        """Test Indexer forward pass with attention mask in thd format."""
        config = self.create_mock_config()
        backend = BackendConfig(attn="sdpa", linear="torch", rms_norm="torch")

        indexer = DeepseekV32Indexer(config, backend).cuda().to(torch.bfloat16)

        num_tokens = 32
        x = torch.randn(num_tokens, config.hidden_size, device="cuda", dtype=torch.bfloat16)
        q_resid = torch.randn(num_tokens, config.q_lora_rank, device="cuda", dtype=torch.bfloat16)
        # Create complex freqs_cis for thd format [T, D/2] as complex tensor
        angles = torch.randn(num_tokens, config.qk_rope_head_dim // 2, device="cuda", dtype=torch.float32)
        freqs_cis = torch.polar(torch.ones_like(angles), angles)

        # Create causal mask for thd format
        attention_mask = torch.triu(
            torch.full((1, 1, num_tokens, num_tokens), float("-inf"), device="cuda"),
            diagonal=1,
        )

        topk_indices = indexer(x, q_resid, freqs_cis, attention_mask=attention_mask)

        assert topk_indices.shape == (num_tokens, config.index_topk)

    @skip_if_no_gpu
    def test_indexer_forward_topk_larger_than_seq(self):
        """Test Indexer forward when topk > seq_len."""
        config = self.create_mock_config(index_topk=64)  # larger than seq_len
        backend = BackendConfig(attn="sdpa", linear="torch", rms_norm="torch")

        indexer = DeepseekV32Indexer(config, backend).cuda().to(torch.bfloat16)

        bsz, seq_len = 2, 16  # seq_len < index_topk
        x = torch.randn(bsz, seq_len, config.hidden_size, device="cuda", dtype=torch.bfloat16)
        q_resid = torch.randn(bsz, seq_len, config.q_lora_rank, device="cuda", dtype=torch.bfloat16)
        # Create complex freqs_cis for bshd format [B, T, D/2] as complex tensor
        angles = torch.randn(bsz, seq_len, config.qk_rope_head_dim // 2, device="cuda", dtype=torch.float32)
        freqs_cis = torch.polar(torch.ones_like(angles), angles)

        topk_indices = indexer(x, q_resid, freqs_cis)

        # Should clamp to seq_len
        assert topk_indices.shape == (bsz, seq_len, seq_len)


class TestDeepseekV32MLAForward:
    def create_mock_config(self, **overrides):
        config = Mock(spec=DeepseekV32Config)
        config.num_attention_heads = 4
        config.hidden_size = 64
        config.q_lora_rank = 32
        config.kv_lora_rank = 32
        config.qk_nope_head_dim = 8
        config.qk_rope_head_dim = 8
        config.qk_head_dim = 16
        config.v_head_dim = 16
        config.rope_scaling = None
        config.max_position_embeddings = 4096
        config.index_n_heads = 4
        config.index_head_dim = 16
        config.index_topk = 8

        for key, value in overrides.items():
            setattr(config, key, value)
        return config

    @skip_if_no_gpu
    def test_mla_forward_bshd_sdpa(self):
        """Test MLA forward pass with bshd format and SDPA backend."""
        config = self.create_mock_config()
        backend = BackendConfig(attn="sdpa", linear="torch", rms_norm="torch")

        mla = DeepseekV32MLA(config, backend).cuda().to(torch.bfloat16)

        bsz, seq_len = 2, 16
        x = torch.randn(bsz, seq_len, config.hidden_size, device="cuda", dtype=torch.bfloat16)
        # Create complex freqs_cis for bshd format [B, T, D/2] as complex tensor
        angles = torch.randn(bsz, seq_len, config.qk_rope_head_dim // 2, device="cuda", dtype=torch.float32)
        freqs_cis = torch.polar(torch.ones_like(angles), angles)

        output = mla(x, freqs_cis)

        assert output.shape == (bsz, seq_len, config.hidden_size)
        assert output.dtype == torch.bfloat16

    @pytest.mark.skip(reason="thd format requires complex freqs_cis setup matching model runtime")
    @skip_if_no_gpu
    def test_mla_forward_thd_sdpa(self):
        """Test MLA forward pass with thd format and SDPA backend."""
        config = self.create_mock_config()
        backend = BackendConfig(attn="sdpa", linear="torch", rms_norm="torch")

        mla = DeepseekV32MLA(config, backend).cuda().to(torch.bfloat16)

        num_tokens = 32
        x = torch.randn(num_tokens, config.hidden_size, device="cuda", dtype=torch.bfloat16)
        # Create complex freqs_cis for thd format [T, D/2] as complex tensor
        angles = torch.randn(num_tokens, config.qk_rope_head_dim // 2, device="cuda", dtype=torch.float32)
        freqs_cis = torch.polar(torch.ones_like(angles), angles)

        output = mla(x, freqs_cis)

        assert output.shape == (num_tokens, config.hidden_size)
        assert output.dtype == torch.bfloat16

    @skip_if_no_gpu
    def test_mla_forward_with_attention_mask(self):
        """Test MLA forward pass with attention mask."""
        config = self.create_mock_config()
        backend = BackendConfig(attn="sdpa", linear="torch", rms_norm="torch")

        mla = DeepseekV32MLA(config, backend).cuda().to(torch.bfloat16)

        bsz, seq_len = 2, 16
        x = torch.randn(bsz, seq_len, config.hidden_size, device="cuda", dtype=torch.bfloat16)
        # Create complex freqs_cis for bshd format [B, T, D/2] as complex tensor
        angles = torch.randn(bsz, seq_len, config.qk_rope_head_dim // 2, device="cuda", dtype=torch.float32)
        freqs_cis = torch.polar(torch.ones_like(angles), angles)

        # Create causal mask
        attention_mask = torch.triu(
            torch.full((1, 1, seq_len, seq_len), float("-inf"), device="cuda"),
            diagonal=1,
        )

        output = mla(x, freqs_cis, attention_mask=attention_mask)

        assert output.shape == (bsz, seq_len, config.hidden_size)

    @skip_te
    @skip_if_no_gpu
    def test_mla_forward_bshd_te(self):
        """Test MLA forward pass with bshd format and TE backend."""
        config = self.create_mock_config()
        backend = BackendConfig(attn="te", linear="torch", rms_norm="torch")

        mla = DeepseekV32MLA(config, backend).cuda().to(torch.bfloat16)

        bsz, seq_len = 2, 16
        x = torch.randn(bsz, seq_len, config.hidden_size, device="cuda", dtype=torch.bfloat16)
        # Create complex freqs_cis for bshd format [B, T, D/2] as complex tensor
        angles = torch.randn(bsz, seq_len, config.qk_rope_head_dim // 2, device="cuda", dtype=torch.float32)
        freqs_cis = torch.polar(torch.ones_like(angles), angles)

        output = mla(x, freqs_cis)

        assert output.shape == (bsz, seq_len, config.hidden_size)


class TestBuildSparseMaskWithAttentionMask:
    def create_mock_config(self, **overrides):
        config = Mock(spec=DeepseekV32Config)
        config.num_attention_heads = 8
        config.hidden_size = 256
        config.q_lora_rank = 128
        config.kv_lora_rank = 64
        config.qk_nope_head_dim = 16
        config.qk_rope_head_dim = 16
        config.qk_head_dim = 32
        config.v_head_dim = 32
        config.rope_scaling = None
        config.max_position_embeddings = 4096
        config.index_n_heads = 4
        config.index_head_dim = 32
        config.index_topk = 16

        for key, value in overrides.items():
            setattr(config, key, value)
        return config

    @patch("nemo_automodel.components.models.deepseek_v32.layers.initialize_linear_module")
    @patch("nemo_automodel.components.models.deepseek_v32.layers.initialize_rms_norm_module")
    @patch("nemo_automodel.components.models.deepseek_v32.layers.initialize_attn_module_and_func")
    def test_build_sparse_mask_combines_with_attention_mask(self, mock_init_attn, mock_init_rms, mock_init_linear):
        """Test that sparse mask is combined with attention mask."""
        config = self.create_mock_config()
        backend = BackendConfig(attn="sdpa", linear="torch", rms_norm="torch")

        mock_init_linear.return_value = Mock()
        mock_init_rms.return_value = Mock()
        mock_init_attn.return_value = (Mock(), Mock())

        mla = DeepseekV32MLA(config, backend)

        bsz, seq_len, topk = 2, 32, 8
        topk_indices = torch.randint(0, seq_len, (bsz, seq_len, topk))

        # Create an attention mask
        attention_mask = torch.triu(
            torch.full((bsz, 1, seq_len, seq_len), float("-inf")),
            diagonal=1,
        )

        sparse_mask = mla._build_sparse_mask(
            topk_indices,
            seq_len,
            qkv_format="bshd",
            bsz=bsz,
            n_heads=1,
            dtype=torch.float32,
            attention_mask=attention_mask,
            union_across_batches=False,
        )

        # Result should combine both masks
        assert sparse_mask.shape == (bsz, 1, seq_len, seq_len)

        # Check that causal structure is preserved (upper triangle should be -inf)
        for b in range(bsz):
            for i in range(seq_len):
                for j in range(i + 1, seq_len):
                    assert sparse_mask[b, 0, i, j] == float("-inf")
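        # (A vectorized equivalent would build a strictly upper-triangular boolean mask with
        # torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool), diagonal=1) and assert
        # that sparse_mask is -inf everywhere that mask is True.)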


class TestHadamardTransformFallback:
    """Test the fallback hadamard_transform implementation when fast_hadamard_transform is not available."""

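    # For reference (an assumption about the fallback, not a copy of its code): a pure-torch
    # Walsh-Hadamard transform over a power-of-2 length n can be computed with log2(n)
    # butterfly passes that map each pair of halves (a, b) to (a + b, a - b), followed by a
    # final multiplication with `scale` (n ** -0.5 in these tests).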
    def test_hadamard_transform_torch_basic(self):
        """Test basic hadamard transform functionality."""
        # Import the torch fallback implementation directly
        from nemo_automodel.components.models.deepseek_v32 import layers

        # Check if we're using the fallback
        if not layers._FAST_HADAMARD_AVAILABLE:
            # Test the torch implementation
            batch_size = 4
            n = 64  # Must be power of 2
            x = torch.randn(batch_size, n)
            scale = n ** -0.5

            result = layers.hadamard_transform(x, scale)

            assert result.shape == x.shape
            assert result.dtype == x.dtype

    def test_hadamard_transform_torch_power_of_2(self):
        """Test that hadamard transform works with different power-of-2 sizes."""
        from nemo_automodel.components.models.deepseek_v32 import layers

        if not layers._FAST_HADAMARD_AVAILABLE:
            for n in [8, 16, 32, 64, 128]:
                batch_size = 2
                x = torch.randn(batch_size, n)
                scale = n ** -0.5

                result = layers.hadamard_transform(x, scale)
                assert result.shape == (batch_size, n)


class TestRotateActivationEdgeCases:
    """Test edge cases for _rotate_activation function."""

    @skip_if_no_gpu
    def test_rotate_activation_float16_converts(self):
        """Test that float16 input is converted to bfloat16."""
        x = torch.randn(2, 8, 64, device="cuda", dtype=torch.float16)
        result = _rotate_activation(x)
        assert result.dtype == torch.bfloat16
        assert result.shape == x.shape

    def test_rotate_activation_applies_scale(self):
        """Test that rotation applies the correct scale factor."""
        from nemo_automodel.components.models.deepseek_v32 import layers

        if not layers._FAST_HADAMARD_AVAILABLE:
            # With the fallback implementation we can exercise the scaled path directly;
            # the expected scale factor is hidden_size ** -0.5 = 64 ** -0.5 = 0.125.
            x = torch.randn(2, 64, dtype=torch.bfloat16)
            result = _rotate_activation(x)
            # For now we only sanity-check that the shape is preserved.
            assert result.shape == x.shape