Fix CUDA flaky tests for stochastic gates by using CPU-seeded RNG (#1802)

cyrjano · meta-codesync[bot] · commit 99aa09a33519 · 2026-03-24T10:07:04.000-07:00
Summary: Pull Request resolved: #1802

## Problem

CUDA RNG produces different random sequences on different GPU architectures (e.g. V100 vs A100 vs H100) even with the same seed set via `torch.manual_seed()`. This causes stochastic gate CUDA tests to be flaky in CI — the same test passes on one GPU type but fails on another because expected values were hardcoded for a specific architecture's RNG output.

Additionally, `test_p_norm_decay` uses exact `assert ==` for floating-point tensor comparison, which fails on GPU due to floating-point precision differences.

## Solution

**CPU-seeded RNG approach**: In CUDA test subclasses, patch `_sample_gate_values` to generate random noise on CPU (where `torch.manual_seed` is deterministic across all hardware) and then move the tensor to the GPU device. This keeps the full training codepath exercised (noise + mu → clamp → gather → multiply) while ensuring cross-architecture determinism.

For `LazyGaussianStochasticGates`, both `initialize_parameters` (mu initialization) and `_sample_gate_values` (noise sampling) happen on-device after `.to(cuda)`, so both are patched to use CPU RNG.

Since CUDA tests now produce identical values to CPU tests, the `if cpu / elif cuda` branches in base test files are removed, along with associated `pyre-fixme[61]` comments.

For `test_p_norm_decay`, exact `assert ==` is replaced with `assertTensorAlmostEqual` with `delta=0.01` tolerance.
## Files Changed

- `test_binary_concrete_stochastic_gates_cuda.py`: Patch `_sample_gate_values` with CPU-seeded `uniform_()` sampling + logit/sigmoid transform
- `test_gaussian_stochastic_gates_cuda.py`: Patch `_sample_gate_values` with CPU-seeded `normal_()` sampling
- `test_kuma_stochastic_gates_cuda.py`: Patch `_sample_gate_values` with CPU-seeded `uniform_()` sampling + Kumaraswamy transform
- `test_lazy_gaussian_stochastic_gates_cuda.py`: Patch both `initialize_parameters` and `_sample_gate_values`
- `test_binary_concrete_stochastic_gates.py`: Remove cpu/cuda branches (4 tests)
- `test_gaussian_stochastic_gates.py`: Remove cpu/cuda branches (4 tests)
- `test_kuma_stochastic_gates.py`: Remove cpu/cuda branches (4 tests)
- `test_lazy_gaussian_stochastic_gates.py`: Remove cpu/cuda branches (12 tests)
- `test_p_norm_decay.py`: Use `assertTensorAlmostEqual` instead of exact equality (2 tests)

Reviewed By: craymichael

Differential Revision: D97775614

fbshipit-source-id: 348ad6f317838fd5577fcc19a18bed256905c5b7
diff --git a/tests/module/test_binary_concrete_stochastic_gates.py b/tests/module/test_binary_concrete_stochastic_gates.py
@@ -32,12 +32,8 @@ def test_bcstg_1d_input(self) -> None:
         gated_input, reg = bcstg(input_tensor)
         expected_reg = 2.4947
 
-        if self.testing_device == "cpu":
-            expected_gated_input = [[0.0000, 0.0212, 0.1892], [0.1839, 0.3753, 0.4937]]
-        elif self.testing_device == "cuda":
-            expected_gated_input = [[0.0000, 0.0985, 0.1149], [0.2329, 0.0497, 0.5000]]
+        expected_gated_input = [[0.0000, 0.0212, 0.1892], [0.1839, 0.3753, 0.4937]]
 
-        # pyre-fixme[61]: `expected_gated_input` is undefined, or not always defined.
         assertTensorAlmostEqual(self, gated_input, expected_gated_input, mode="max")
         assertTensorAlmostEqual(self, reg, expected_reg)
 
@@ -110,12 +106,8 @@ def test_bcstg_1d_input_with_mask(self) -> None:
         gated_input, reg = bcstg(input_tensor)
         expected_reg = 1.6643
 
-        if self.testing_device == "cpu":
-            expected_gated_input = [[0.0000, 0.0000, 0.1679], [0.0000, 0.0000, 0.2223]]
-        elif self.testing_device == "cuda":
-            expected_gated_input = [[0.0000, 0.0000, 0.1971], [0.1737, 0.2317, 0.3888]]
+        expected_gated_input = [[0.0000, 0.0000, 0.1679], [0.0000, 0.0000, 0.2223]]
 
-        # pyre-fixme[61]: `expected_gated_input` is undefined, or not always defined.
         assertTensorAlmostEqual(self, gated_input, expected_gated_input, mode="max")
         assertTensorAlmostEqual(self, reg, expected_reg)
 
@@ -143,18 +135,10 @@ def test_bcstg_2d_input(self) -> None:
         gated_input, reg = bcstg(input_tensor)
 
         expected_reg = 4.9903
-        expected_gated_input = []
-
-        if self.testing_device == "cpu":
-            expected_gated_input = [
-                [[0.0000, 0.0990], [0.0261, 0.2431], [0.0551, 0.3863]],
-                [[0.0476, 0.6177], [0.5400, 0.1530], [0.0984, 0.8013]],
-            ]
-        elif self.testing_device == "cuda":
-            expected_gated_input = [
-                [[0.0000, 0.0985], [0.1149, 0.2331], [0.0486, 0.5000]],
-                [[0.1840, 0.1571], [0.4612, 0.7937], [0.2975, 0.7393]],
-            ]
+        expected_gated_input = [
+            [[0.0000, 0.0990], [0.0261, 0.2431], [0.0551, 0.3863]],
+            [[0.0476, 0.6177], [0.5400, 0.1530], [0.0984, 0.8013]],
+        ]
 
         assertTensorAlmostEqual(self, gated_input, expected_gated_input, mode="max")
         assertTensorAlmostEqual(self, reg, expected_reg)
@@ -207,18 +191,11 @@ def test_bcstg_2d_input_with_mask(self) -> None:
         gated_input, reg = bcstg(input_tensor)
         expected_reg = 2.4947
 
-        if self.testing_device == "cpu":
-            expected_gated_input = [
-                [[0.0000, 0.0212], [0.0424, 0.0636], [0.3191, 0.4730]],
-                [[0.3678, 0.6568], [0.7507, 0.8445], [0.6130, 1.0861]],
-            ]
-        elif self.testing_device == "cuda":
-            expected_gated_input = [
-                [[0.0000, 0.0985], [0.1971, 0.2956], [0.0000, 0.2872]],
-                [[0.4658, 0.0870], [0.0994, 0.1119], [0.7764, 1.1000]],
-            ]
+        expected_gated_input = [
+            [[0.0000, 0.0212], [0.0424, 0.0636], [0.3191, 0.4730]],
+            [[0.3678, 0.6568], [0.7507, 0.8445], [0.6130, 1.0861]],
+        ]
 
-        # pyre-fixme[61]: `expected_gated_input` is undefined, or not always defined.
         assertTensorAlmostEqual(self, gated_input, expected_gated_input, mode="max")
         assertTensorAlmostEqual(self, reg, expected_reg)
 
diff --git a/tests/module/test_binary_concrete_stochastic_gates_cuda.py b/tests/module/test_binary_concrete_stochastic_gates_cuda.py
@@ -3,8 +3,49 @@
 
 # pyre-strict
 
+import unittest
+from unittest.mock import patch
+
+import torch
+from captum.module.binary_concrete_stochastic_gates import BinaryConcreteStochasticGates
+from torch import Tensor
+
 from .test_binary_concrete_stochastic_gates import TestBinaryConcreteStochasticGates
 
 
-class TestBinaryConcreteStochasticGatesCUDA(TestBinaryConcreteStochasticGates):
+def _cpu_rng_sample(self: BinaryConcreteStochasticGates, batch_size: int) -> Tensor:
+    """CPU-seeded replacement for `_sample_gate_values` used by the CUDA tests.
+
+    CUDA RNG yields different sequences on different GPU architectures
+    (e.g. V100 vs A100 vs H100) even under the same seed, which makes the
+    tests flaky. The uniform noise is therefore drawn on CPU and only then
+    moved to the device of `log_alpha_param`.
+    """
+    span = self.upper_bound - self.lower_bound
+    if not self.training:
+        gates = torch.sigmoid(self.log_alpha_param).expand(batch_size, self.n_gates)
+        return gates * span + self.lower_bound
+    noise = torch.empty(batch_size, self.n_gates).uniform_(self.eps, 1 - self.eps)
+    noise = noise.to(self.log_alpha_param.device)
+    logits = (torch.logit(noise) + self.log_alpha_param) / self.temperature
+    return torch.sigmoid(logits) * span + self.lower_bound
+
+
+class TestBinaryConcreteStochasticGatesCUDA(
+    TestBinaryConcreteStochasticGates,
+):
     testing_device: str = "cuda"
+
+    def setUp(self) -> None:
+        """Skip without CUDA; otherwise patch in the CPU-seeded sampler."""
+        super().setUp()
+        if not torch.cuda.is_available():
+            raise unittest.SkipTest("Skipping GPU test since CUDA not available.")
+        # pyre-fixme[8]: Attribute has type
+        #  `BoundMethod[..., Tensor]`; used as `(...) -> Tensor`.
+        sampler_patch = patch.object(
+            BinaryConcreteStochasticGates, "_sample_gate_values", _cpu_rng_sample
+        )
+        sampler_patch.start()
+        # Registered cleanup restores the real sampler after every test.
+        self.addCleanup(sampler_patch.stop)
diff --git a/tests/module/test_gaussian_stochastic_gates.py b/tests/module/test_gaussian_stochastic_gates.py
@@ -33,13 +33,8 @@ def test_gstg_1d_input(self) -> None:
 
         gated_input, reg = gstg(input_tensor)
         expected_reg = 2.5213
+        expected_gated_input = [[0.0000, 0.0198, 0.1483], [0.1848, 0.3402, 0.1782]]
 
-        if self.testing_device == "cpu":
-            expected_gated_input = [[0.0000, 0.0198, 0.1483], [0.1848, 0.3402, 0.1782]]
-        elif self.testing_device == "cuda":
-            expected_gated_input = [[0.0000, 0.0788, 0.0470], [0.0134, 0.0000, 0.1884]]
-
-        # pyre-fixme[61]: `expected_gated_input` is undefined, or not always defined.
         assertTensorAlmostEqual(self, gated_input, expected_gated_input, mode="max")
         assertTensorAlmostEqual(self, reg, expected_reg)
 
@@ -90,13 +85,8 @@ def test_gstg_1d_input_with_mask(self) -> None:
 
         gated_input, reg = gstg(input_tensor)
         expected_reg = 1.6849
+        expected_gated_input = [[0.0000, 0.0000, 0.1225], [0.0583, 0.0777, 0.3779]]
 
-        if self.testing_device == "cpu":
-            expected_gated_input = [[0.0000, 0.0000, 0.1225], [0.0583, 0.0777, 0.3779]]
-        elif self.testing_device == "cuda":
-            expected_gated_input = [[0.0000, 0.0000, 0.1577], [0.0736, 0.0981, 0.0242]]
-
-        # pyre-fixme[61]: `expected_gated_input` is undefined, or not always defined.
         assertTensorAlmostEqual(self, gated_input, expected_gated_input, mode="max")
         assertTensorAlmostEqual(self, reg, expected_reg)
 
@@ -137,19 +127,11 @@ def test_gstg_2d_input(self) -> None:
 
         gated_input, reg = gstg(input_tensor)
         expected_reg = 5.0458
+        expected_gated_input = [
+            [[0.0000, 0.0851], [0.0713, 0.3000], [0.2180, 0.1878]],
+            [[0.2538, 0.0000], [0.3391, 0.8501], [0.3633, 0.8913]],
+        ]
 
-        if self.testing_device == "cpu":
-            expected_gated_input = [
-                [[0.0000, 0.0851], [0.0713, 0.3000], [0.2180, 0.1878]],
-                [[0.2538, 0.0000], [0.3391, 0.8501], [0.3633, 0.8913]],
-            ]
-        elif self.testing_device == "cuda":
-            expected_gated_input = [
-                [[0.0000, 0.0788], [0.0470, 0.0139], [0.0000, 0.1960]],
-                [[0.0000, 0.7000], [0.1052, 0.2120], [0.5978, 0.0166]],
-            ]
-
-        # pyre-fixme[61]: `expected_gated_input` is undefined, or not always defined.
         assertTensorAlmostEqual(self, gated_input, expected_gated_input, mode="max")
         assertTensorAlmostEqual(self, reg, expected_reg)
 
@@ -200,19 +182,11 @@ def test_gstg_2d_input_with_mask(self) -> None:
 
         gated_input, reg = gstg(input_tensor)
         expected_reg = 2.5213
+        expected_gated_input = [
+            [[0.0000, 0.0198], [0.0396, 0.0594], [0.2435, 0.3708]],
+            [[0.3696, 0.5954], [0.6805, 0.7655], [0.6159, 0.3921]],
+        ]
 
-        if self.testing_device == "cpu":
-            expected_gated_input = [
-                [[0.0000, 0.0198], [0.0396, 0.0594], [0.2435, 0.3708]],
-                [[0.3696, 0.5954], [0.6805, 0.7655], [0.6159, 0.3921]],
-            ]
-        elif self.testing_device == "cuda":
-            expected_gated_input = [
-                [[0.0000, 0.0788], [0.1577, 0.2365], [0.0000, 0.1174]],
-                [[0.0269, 0.0000], [0.0000, 0.0000], [0.0448, 0.4145]],
-            ]
-
-        # pyre-fixme[61]: `expected_gated_input` is undefined, or not always defined.
         assertTensorAlmostEqual(self, gated_input, expected_gated_input, mode="max")
         assertTensorAlmostEqual(self, reg, expected_reg)
 
diff --git a/tests/module/test_gaussian_stochastic_gates_cuda.py b/tests/module/test_gaussian_stochastic_gates_cuda.py
@@ -3,8 +3,42 @@
 
 # pyre-strict
 
+import unittest
+from unittest.mock import patch
+
+import torch
+from captum.module.gaussian_stochastic_gates import GaussianStochasticGates
+from torch import Tensor
+
 from .test_gaussian_stochastic_gates import TestGaussianStochasticGates
 
 
+def _cpu_rng_sample(self: GaussianStochasticGates, batch_size: int) -> Tensor:
+    """CPU-seeded stand-in for `_sample_gate_values` in the CUDA tests.
+
+    CUDA RNG sequences differ across GPU architectures (e.g. V100 vs A100 vs
+    H100) even with the same seed, so the Gaussian noise is drawn on CPU and
+    then moved to the device of `mu` to keep results consistent across GPUs.
+    """
+    if not self.training:
+        return self.mu.expand(batch_size, self.n_gates)
+    noise = torch.empty(batch_size, self.n_gates).normal_(mean=0, std=self.std)
+    return self.mu + noise.to(self.mu.device)
+
+
 class TestGaussianStochasticGatesCUDA(TestGaussianStochasticGates):
     testing_device: str = "cuda"
+
+    def setUp(self) -> None:
+        """Skip without CUDA; otherwise patch in the CPU-seeded sampler."""
+        super().setUp()
+        if not torch.cuda.is_available():
+            raise unittest.SkipTest("Skipping GPU test since CUDA not available.")
+        # pyre-fixme[8]: Attribute has type
+        #  `BoundMethod[..., Tensor]`; used as `(...) -> Tensor`.
+        sampler_patch = patch.object(
+            GaussianStochasticGates, "_sample_gate_values", _cpu_rng_sample
+        )
+        sampler_patch.start()
+        # Registered cleanup restores the real sampler after every test.
+        self.addCleanup(sampler_patch.stop)