Add tests for torch.compile stream synchronization fix

soumith · claude · soumith · commit 1c93ab28ea54 · 2025-07-08T10:54:17.000-04:00
Added comprehensive tests to verify the fix for GitHub issue pytorch/pytorch#157363:

1. test_compile_with_linear_layer:
   - Tests custom CUDA kernels with nn.Linear + torch.compile
   - Verifies correct behavior with various input sizes (1000, 5000, 10000)
   - Uses reduce-overhead mode to reproduce the original issue conditions

2. test_compile_custom_only:
   - Tests custom operations without linear layers
   - Ensures custom operations work correctly with torch.compile

These tests verify that custom CUDA kernels properly synchronize with PyTorch's CUDA stream when used with torch.compile, preventing race conditions that previously caused incorrect outputs.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
diff --git a/test/test_extension.py b/test/test_extension.py
@@ -6,6 +6,7 @@
 from torch import Tensor
 from typing import Tuple
 import torch.nn.functional as F
+import torch.nn as nn
 
 
 def reference_muladd(a, b, c):
@@ -119,5 +120,54 @@ def test_opcheck_cuda(self):
         self._opcheck("cuda")
 
 
+class TestTorchCompileStreamSync(TestCase):
+    """Regression tests for pytorch/pytorch#157363: stream synchronization under torch.compile."""
+
+    SIZES = (1000, 5000, 10000)  # input sizes that previously failed
+
+    def _assert_compiled_matches_eager(self, fn, x):
+        """Assert that fn(x) gives matching results eagerly and under torch.compile."""
+        # Clear torch.compile caches so every input is compiled from a clean
+        # state, independent of whatever earlier subtests compiled.
+        torch.compiler.reset()
+        with torch.no_grad():
+            expected = fn(x)
+            compiled_fn = torch.compile(fn, mode="reduce-overhead", fullgraph=True)
+            actual = compiled_fn(x)
+        torch.testing.assert_close(actual, expected, rtol=1e-5, atol=1e-5)
+
+    @unittest.skipIf(not torch.cuda.is_available(), "requires cuda")
+    def test_compile_with_linear_layer(self):
+        """Test custom CUDA kernels with nn.Linear + torch.compile (the original failing case)."""
+
+        class Model(nn.Module):
+            def __init__(self, size):
+                super().__init__()
+                self.linear = nn.Linear(size, size, device="cuda", dtype=torch.float32)
+
+            def forward(self, x):
+                return extension_cpp.ops.mymuladd(self.linear(x), self.linear(x), 0.0)
+
+        for size in self.SIZES:
+            with self.subTest(size=size):
+                torch.manual_seed(42)  # fixed seed keeps each run reproducible
+                model = Model(size)
+                x = torch.randn((1, size), device="cuda", dtype=torch.float32)
+                self._assert_compiled_matches_eager(model, x)
+
+    @unittest.skipIf(not torch.cuda.is_available(), "requires cuda")
+    def test_compile_custom_only(self):
+        """Test the custom operation alone (no linear layer) with torch.compile."""
+
+        def model(x):
+            return extension_cpp.ops.mymuladd(x, x, 1.0)
+
+        for size in self.SIZES:
+            with self.subTest(size=size):
+                torch.manual_seed(42)  # fixed seed keeps each run reproducible
+                x = torch.randn((size,), device="cuda", dtype=torch.float32)
+                self._assert_compiled_matches_eager(model, x)
+
+
 if __name__ == "__main__":
     unittest.main()