Commit efc4b46
Add cascade sum support for Inductor CPP backend (pytorch#156296)
Fixes pytorch#154703
Add cascade summation support for the Inductor CPP backend to improve precision for large summations.
Currently, the Inductor CPP backend performs a direct reduction for sum. As shown in pytorch#154703, when the reduction size is large and the degree of parallelism is small, direct reduction causes intolerable precision loss:
```
extern "C" void kernel(float* in_out_ptr0,
const float* in_ptr0)
{
auto out_ptr0 = in_out_ptr0;
{
{
float tmp_acc0 = 0;
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0);
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(3000000000L); x0+=static_cast<int64_t>(16L))
{
{
if(C10_LIKELY(x0 >= static_cast<int64_t>(0) && x0 < static_cast<int64_t>(3000000000L)))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16));
tmp_acc0_vec = tmp_acc0_vec + tmp0;
}
}
}
tmp_acc0 = tmp_acc0 + at::vec::vec_reduce_all<float, 1>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return x + y; }, tmp_acc0_vec);
out_ptr0[static_cast<int64_t>(0L)] = static_cast<float>(tmp_acc0);
}
}
{
{
{
auto tmp0 = out_ptr0[static_cast<int64_t>(0L)];
auto tmp1 = static_cast<float>(3000000000.0);
auto tmp2 = tmp0 / tmp1;
in_out_ptr0[static_cast<int64_t>(0L)] = tmp2;
}
}
}
}
```
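For intuition, here is a minimal standalone sketch (not part of the PR or the generated kernel) of the failure mode: with a single float32 accumulator, once the running total reaches 2^24 the remaining addends fall below one ULP of the accumulator and stop registering at all.

```
// Hypothetical demo, not from the PR: a single float32 accumulator
// "saturates" on a large sum of small values.
#include <cstdio>

int main() {
    float acc = 0.0f;
    for (long long i = 0; i < 100000000LL; ++i) {
        acc += 1.0f;  // stalls at 16777216.0 (2^24): from then on, 1.0f < ULP(acc)
    }
    std::printf("naive float sum of 1e8 ones: %.1f (expected 100000000.0)\n", acc);
    return 0;
}
```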
After adding cascade sum support:
```
extern "C" void kernel(float* in_out_ptr0,
const float* in_ptr0)
{
auto out_ptr0 = in_out_ptr0;
{
{
float tmp_acc0 = 0;
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0);
at::vec::Vectorized<float> masked_tmp_acc0_vec = at::vec::Vectorized<float>(0);
CascadeSumHelper<float, 65536> scalar_cascade_helper0(static_cast<int64_t>(3000000000L));
CascadeSumHelper<at::vec::Vectorized<float>, 65536> cascade_helper0(static_cast<int64_t>(187500000L));
CascadeSumHelper<at::vec::Vectorized<float>, 65536> masked_cascade_helper0(static_cast<int64_t>(0L));
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(3000000000L); x0+=static_cast<int64_t>(16L))
{
{
if(C10_LIKELY(x0 >= static_cast<int64_t>(0) && x0 < static_cast<int64_t>(3000000000L)))
{
auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16));
tmp_acc0_vec = cascade_sum_combine(tmp0, &cascade_helper0);
}
}
}
tmp_acc0 = cascade_sum_final(&scalar_cascade_helper0);
tmp_acc0_vec = cascade_sum_final(&cascade_helper0);
masked_tmp_acc0_vec = cascade_sum_final(&masked_cascade_helper0);
tmp_acc0 = tmp_acc0 + at::vec::vec_reduce_all<float, 1>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return x + y; }, tmp_acc0_vec + masked_tmp_acc0_vec);
out_ptr0[static_cast<int64_t>(0L)] = static_cast<float>(tmp_acc0);
}
}
{
{
{
auto tmp0 = out_ptr0[static_cast<int64_t>(0L)];
auto tmp1 = static_cast<float>(3000000000.0);
auto tmp2 = tmp0 / tmp1;
in_out_ptr0[static_cast<int64_t>(0L)] = tmp2;
}
}
}
}
```
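The generated kernel drives the helpers via cascade_sum_combine in the loop body and cascade_sum_final after it; note the vectorized helper is sized for 3000000000 / 16 = 187500000 vector additions. As a rough illustration of the underlying idea (the real CascadeSumHelper lives under torch/csrc/inductor and also covers vectorized and masked accumulators), here is a simplified, hypothetical two-level scalar sketch: values are summed into a bounded per-chunk accumulator, and only finished chunk sums are folded into the running total, so rounding error grows with the number of chunks rather than the number of elements.

```
// Simplified, hypothetical sketch of cascade summation; the names and the
// two-level structure are illustrative, not the PR's actual implementation.
#include <cstdint>
#include <cstdio>

template <typename T, int64_t kChunkSize = 65536>
struct CascadeSumSketch {
    T chunk_acc = T(0);    // sums at most kChunkSize values at a time
    T total_acc = T(0);    // sums one finished chunk result at a time
    int64_t in_chunk = 0;

    void combine(T value) {
        chunk_acc += value;
        if (++in_chunk == kChunkSize) {
            total_acc += chunk_acc;  // fold the finished chunk into the total
            chunk_acc = T(0);
            in_chunk = 0;
        }
    }

    T finalize() {
        T total = total_acc + chunk_acc;  // flush the partial last chunk
        chunk_acc = T(0);
        total_acc = T(0);
        in_chunk = 0;
        return total;
    }
};

int main() {
    CascadeSumSketch<float> helper;
    for (int64_t i = 0; i < 100000000LL; ++i) {
        helper.combine(1.0f);  // the same sum that saturates a naive accumulator
    }
    std::printf("cascade float sum of 1e8 ones: %.1f\n", helper.finalize());
    return 0;
}
```

Each accumulator only ever grows by steps that stay large relative to its own magnitude, which is why the generated kernel above can sum 3e9 float32 elements with acceptable error, at the cost of the extra helper bookkeeping noted below.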
Cascade summation keeps extra accumulators and does more bookkeeping per element, so enabling it inevitably costs some performance. For the case shown in pytorch#154703, performance drops by ~3%.
Pull Request resolved: pytorch#156296
Approved by: https://github.com/leslie-fang-intel, https://github.com/jansel
File tree: 3 files changed, +382 -91 lines changed
- test/inductor
- torch/_inductor/codegen
- torch/csrc/inductor