@@ -99,7 +99,9 @@ def max_position_embeddings(self) -> int:
 
 
 all_scenarios = [
-    Scenario(batch=1, ctx_len=64),
+    # Scenario(batch=1, ctx_len=64),
+    # Scenario(batch=1, ctx_len=64),
+    Scenario(batch=1, ctx_len=128),
     Scenario(batch=1, ctx_len=512),
     Scenario(batch=1, ctx_len=1024),
     Scenario(batch=1, ctx_len=2048),
@@ -414,7 +416,7 @@ def rotate_half_inv(x):
     )
 
     mapping = Mapping(
-        world_size=world_size, rank=rank, cp_size=world_size, cp_config={"cp_type": CpType.HELIX}
+        world_size=world_size, rank=rank, cp_size=world_size, cp_config={"cp_type": CpType.HELIX, "tokens_per_block": 32}
     )
     # use cp_allgather here to broadcast from rank 0 to all other ranks
     ret_all = cp_allgather(ret, mapping=mapping, dim=0)
@@ -837,15 +839,15 @@ def _run_single_rank(func, *args, **kwargs):
         raise Exception(f"\n\nError occurred. Original traceback is\n{tb}\n")
 
 
-@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="needs 2 GPUs to run this test")
+@pytest.mark.skipif(torch.cuda.device_count() < 8, reason="needs 8 GPUs to run this test")
 @pytest.mark.parametrize("scenario", test_scenarios, ids=lambda x: f"scenario: {x}")
 def test_mla_helix_distributed(
     scenario: Scenario,
     gen_steps: Optional[int] = None,
     max_mismatch_ratio: float = 0.02,
     mismatch_ratios: Optional[List[float]] = None,
 ):
-    world_size = 2
+    world_size = 8
     gen_steps = scenario.ref_steps if gen_steps is None else gen_steps
     with MPIPoolExecutor(max_workers=world_size) as executor:
         results = executor.map(
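
The hunk above widens the test from 2 to 8 ranks, driven through mpi4py's MPIPoolExecutor. The snippet below is a minimal, self-contained sketch of that fan-out pattern, not part of the commit: _per_rank_work is a hypothetical stand-in for the real _run_single_rank body, and only MPIPoolExecutor, its map call, and the world_size of 8 are taken from the diff.

# Minimal sketch (assumption: illustrative only, not code from this commit)
# of the one-worker-per-rank pattern used by test_mla_helix_distributed.
from mpi4py.futures import MPIPoolExecutor


def _per_rank_work(rank: int, world_size: int) -> int:
    # Placeholder for the real per-rank body; the actual test runs the
    # Helix MLA forward pass on the GPU owned by this rank.
    return rank


if __name__ == "__main__":
    world_size = 8  # mirrors the 8-GPU requirement introduced by this commit
    with MPIPoolExecutor(max_workers=world_size) as executor:
        # map() fans the per-rank function out across world_size workers,
        # passing each worker its rank index and the shared world_size.
        results = list(
            executor.map(_per_rank_work, range(world_size), [world_size] * world_size)
        )
    assert results == list(range(world_size))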