Skip to content

Commit c4a1601

Browse files
committed
test: add comprehensive unit tests for nonuniform TP
- Moved test from nonuniform_tp.py to tests/unit_tests/distributed/
- Added TestNonuniformTPUtilities: tests for utility functions
  - compute_uniform_tp_spares_with_parity (3 test cases)
  - get_active_ranks_for_dp (2 test cases)
- Added TestNonuniformTPParameterResharding: tests for parameter resharding
  - ntp_map for no spares, healthy ranks, unhealthy ranks
  - ntp_init for layers with attention and MLP (4 test cases)
- Added TestNonuniformTPOptimizer: tests for optimizer wrapper
  - attribute delegation, prepare_grads, contiguity handling (5 test cases)
- Added TestNonuniformTPIntegration: integration tests
  - DDP initialization and backward hooks (2 test cases)
- Total: 17 test cases covering all major NTP functionality
1 parent f9bc507 commit c4a1601

File tree

2 files changed

+361
-38
lines changed

2 files changed

+361
-38
lines changed

megatron/core/distributed/nonuniform_tp.py

Lines changed: 0 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -697,41 +697,3 @@ def prepare_grads(self, *args, **kwargs):
697697
return result
698698

699699

700-
# ======================================================================================
701-
# Test Function
702-
# ======================================================================================
703-
704-
705-
def test_ntp():
706-
"""Test function for nonuniform TP initialization."""
707-
head_dim = 128
708-
ffn_exp = 4
709-
710-
class MockConfig:
711-
num_attention_heads = 24
712-
ffn_hidden_size = num_attention_heads * head_dim * ffn_exp
713-
714-
class MockModule:
715-
def __init__(self, out_features):
716-
self.weight = torch.nn.Parameter(torch.randn(out_features, 1, dtype=torch.half))
717-
self.weight.partition_dim = 1
718-
self.weight.tensor_model_parallel = True
719-
self.config = MockConfig()
720-
721-
def parameters(self):
722-
return [self.weight]
723-
724-
class MockLayer:
725-
def __init__(self):
726-
self.self_attention = MockModule(int(3 * 10248 / 8))
727-
self.mlp = MockModule(12288 // 8)
728-
729-
layer = MockLayer()
730-
ddp_config = DistributedDataParallelConfig(tp_base=8, tp_spares=2)
731-
ntp_init(layer, ddp_config)
732-
print("NTP initialization test passed!")
733-
return layer
734-
735-
736-
if __name__ == '__main__':
737-
layer = test_ntp()

0 commit comments

Comments
 (0)