Skip to content

Commit 5215885

Browse files
ColinPeppler authored and pytorchmergebot committed
re-use FloorDiv for RShift (pytorch#145898)
I encountered this C++ compilation error. ``` 579 | int64_t var_6 = (static_cast<int64_t>(std::floor((1.0/2.0)*u0)) | static_cast<int64_t>(std::floor((1.0/4.0)*static_cast<int64_t>(std::floor((1.0/2.0)*u0))))) | std::floor((1.0/16.0)*(static_cast<int64_t>(std::floor((1.0/2.0)*u0)) | static_cast<int64_t>(std::floor((1.0/4.0)*static_cast<int64_t>(std::floor((1.0/2.0)*u0)))))); | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ^ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | | | | int64_t {aka long int} double ``` Then, I figured out where this std::floor came from with the help of Bob's guard provenance tool. It comes from RShift which is used in `triton.next_power_of_2`. --- Before, we used `std::floor` ``` int64_t var_6 = ( static_cast<int64_t>(std::floor((1.0/2.0)*u0)) | static_cast<int64_t>(std::floor((1.0/4.0)*static_cast<int64_t>(std::floor((1.0/2.0)*u0))))) | std::floor((1.0/16.0)*(static_cast<int64_t>(std::floor((1.0/2.0)*u0)) # no cast to int here. | static_cast<int64_t>(std::floor((1.0/4.0)*static_cast<int64_t>(std::floor((1.0/2.0)*u0)))))); ``` Now, we use `c10::div_floor_integer` instead ``` int64_t var_6 = ( (c10::div_floor_integer(static_cast<int64_t>(u0), static_cast<int64_t>(2L))) | (c10::div_floor_integer(static_cast<int64_t>(u0), static_cast<int64_t>(8L)))) | (c10::div_floor_integer(static_cast<int64_t>((c10::div_floor_integer(static_cast<int64_t>(u0), static_cast<int64_t>(2L))) | (c10::div_floor_integer(static_cast<int64_t>(u0), static_cast<int64_t>(8L)))), static_cast<int64_t>(16L))); ``` Pull Request resolved: pytorch#145898 Approved by: https://github.com/desertfire, https://github.com/bobrenjc93 ghstack dependencies: pytorch#145802
1 parent 3df961d commit 5215885

File tree

2 files changed

+28
-1
lines changed

2 files changed

+28
-1
lines changed

test/inductor/test_aot_inductor.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2176,6 +2176,33 @@ def forward(self, x):
21762176
)
21772177
self.check_model(model, example_inputs)
21782178

2179+
def test_triton_next_power_of_2(self):
    """Regression test: compile a model whose user-defined Triton kernel launch
    depends on ``triton.next_power_of_2`` of a data-dependent value.

    ``next_power_of_2`` lowers to RShift ops in the symbolic shape expressions;
    this exercises the C++ codegen path that must emit integer floor-division
    (``c10::div_floor_integer``) rather than ``std::floor`` on doubles.
    """
    # Kernel launch requires a real GPU backend.
    if self.device != GPU_TYPE:
        raise unittest.SkipTest("requires GPU")

    class Model(torch.nn.Module):
        def forward(self, a, b, lengths):
            n_elements = a.numel()
            out = torch.empty_like(a)
            # max_len is data-dependent (unbacked symint after .max()/int()).
            max_len = int(lengths.max())
            scaling_factor = triton.next_power_of_2(max_len)
            add_kernel_with_scaling[(n_elements,)](
                a,
                b,
                out,
                n_elements,
                scaling_factor,
                BLOCK_SIZE=16,
            )
            return out

    example_inputs = (
        torch.randn(2, device=self.device),
        torch.randn(2, device=self.device),
        torch.arange(end=4, device=self.device),
    )
    self.check_model(Model(), example_inputs)
2205+
21792206
@common_utils.parametrize("grid_type", [1, 2, 3])
21802207
@common_utils.parametrize("num_dims", [1, 2])
21812208
@common_utils.parametrize("dynamic", [False, True])

torch/utils/_sympy/functions.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -578,7 +578,7 @@ class RShift(sympy.Function):
578578
def eval(cls, base, shift):
579579
if shift < 0:
580580
raise ValueError("negative shift count")
581-
return base // 2**shift
581+
return FloorDiv(base, 2**shift)
582582

583583

584584
class MinMaxBase(Expr, LatticeOp): # type: ignore[misc]

0 commit comments

Comments
 (0)