 diff --git a/test/inductor/test_codegen_triton.py b/test/inductor/test_codegen_triton.py
- index 84264bf1b01..aa9a624d5ac 100644
+ index 84264bf1b0..54b6d028cf 100644
 --- a/test/inductor/test_codegen_triton.py
 +++ b/test/inductor/test_codegen_triton.py
- @@ -48,7 +48,7 @@ class TestCodegenTriton(InductorTestCase):
- return config.divisible_by_16
+ @@ -39,45 +39,31 @@ class TestCodegenTriton(InductorTestCase):
+ s0 = sympy.Symbol("s0", positive=True, integer=True)
+ s1 = sympy.Symbol("s1", positive=True, integer=True)

- self.assertEqual(
+ - def _check_divisibility(config):
+ - try:
+ - from triton.backends.compiler import AttrsDescriptor # noqa: F401
+ -
+ - return config.divisibility_16
+ - except ImportError:
+ - return config.divisible_by_16
+ -
+ - self.assertEqual(
 - (2,),
- + [(2,)],
- _check_divisibility(
- triton_utils.config_of(
- [
- @@ -63,7 +63,7 @@ class TestCodegenTriton(InductorTestCase):
+ - _check_divisibility(
+ - triton_utils.config_of(
+ - [
+ - SizeArg("A", two), # no
+ - SizeArg("B", eight), # no
+ - SizeArg("C", sixteen), # yes
+ - SizeArg("D", s0), # no
+ - SizeArg("E", s1), # no
+ - ]
+ - )
+ - ),
+ + config = triton_utils.config_of(
+ + [
+ + SizeArg("A", two), # no
+ + SizeArg("B", eight), # no
+ + SizeArg("C", sixteen), # yes
+ + SizeArg("D", s0), # no
+ + SizeArg("E", s1), # no
+ + ]
  )
-
- self.assertEqual(
+ -
+ - self.assertEqual(
 - (0, 2, 4, 5, 6),
- + [(0,), (2,), (4,), (5,), (6,)],
- _check_divisibility(
- triton_utils.config_of(
- [
+ - _check_divisibility(
+ - triton_utils.config_of(
+ - [
+ - SizeArg("A", two * eight), # 0: yes
+ - SizeArg("B", eight * s0), # 1: no
+ - SizeArg("C", two * eight * s0), # 2: yes
+ - SizeArg("D", s0 * s1), # 3: no
+ - SizeArg("E", sixteen * s0), # 4: yes
+ - SizeArg("F", sixteen * eight * s0 * s1), # 5: yes
+ - SizeArg("G", two * eight * s0 * s1), # 6: yes
+ - ]
+ - )
+ - ),
+ + # check for key
+ + config[((2,),)]
+ +
+ + config = triton_utils.config_of(
+ + [
+ + SizeArg("A", two * eight), # 0: yes
+ + SizeArg("B", eight * s0), # 1: no
+ + SizeArg("C", two * eight * s0), # 2: yes
+ + SizeArg("D", s0 * s1), # 3: no
+ + SizeArg("E", sixteen * s0), # 4: yes
+ + SizeArg("F", sixteen * eight * s0 * s1), # 5: yes
+ + SizeArg("G", two * eight * s0 * s1), # 6: yes
+ + ]
+ )
+ + # check for key
+ + config[((0,), (2,), (4,), (5,), (6,))]
+
+
+ if __name__ == "__main__":
 diff --git a/test/inductor/test_triton_kernels.py b/test/inductor/test_triton_kernels.py
- index 4d7a85029e3..f3d45ea5520 100644
+ index f674262ab6..fc2b8a6c27 100644
 --- a/test/inductor/test_triton_kernels.py
 +++ b/test/inductor/test_triton_kernels.py
- @@ -1268,9 +1268,9 @@ def forward(self, x_1, output_1):
- if dynamic:
- # when half_n_elements passed to the Triton kernel is
- # dynamic, equal_to_1 specializaiton can't be enforced
+ @@ -55,14 +55,6 @@ if HAS_GPU:
+ fast_dividef as my_fast_dividef,
+ )
+
+ - def _triton_get_ast_equal_to_str(params):
+ - try:
+ - from triton.backends.compiler import AttrsDescriptor # noqa: F401
+ -
+ - return f"'tt.equal_to': {params}"
+ - except ImportError:
+ - return f"equal_to_1={params}"
+ -
+ # Define shared triton constants here.
+ CONSTANT_C: tl.constexpr = 4
+ STRING_CONSTANT_C: tl.constexpr = "CONSTANT_C"
+ @@ -1266,12 +1258,6 @@ def forward(self, x_1, output_1):
+ torch.compile(f, dynamic=dynamic), x, y
+ )
+
+ - if dynamic:
+ - # when half_n_elements passed to the Triton kernel is
+ - # dynamic, equal_to_1 specializaiton can't be enforced
 - self.assertTrue(_triton_get_ast_equal_to_str(()) in sources[0])
- + self.assertTrue(_triton_get_ast_equal_to_str([]) in sources[0])
- else:
+ - else:
 - self.assertTrue(_triton_get_ast_equal_to_str((3,)) in sources[0])
- + self.assertTrue(_triton_get_ast_equal_to_str([(3,)]) in sources[0])
  self.assertEqual(compiled_out, eager_out)

  @requires_gpu
- @@ -1299,7 +1299,7 @@ def forward(self, x_1, output_1):
+ @@ -1298,9 +1284,6 @@ def forward(self, x_1, output_1):
+ torch.compile(f, dynamic=dynamic), x, y
+ )

- # float 1.0 (both literal or symbolic)
- # should not be added to equal_to_1
+ - # float 1.0 (both literal or symbolic)
+ - # should not be added to equal_to_1
 - self.assertTrue(_triton_get_ast_equal_to_str(()) in sources[0])
- + self.assertTrue(_triton_get_ast_equal_to_str([]) in sources[0])
  self.assertEqual(compiled_out, eager_out)

  @requires_gpu
 diff --git a/torch/_higher_order_ops/triton_kernel_wrap.py b/torch/_higher_order_ops/triton_kernel_wrap.py
- index c3f72bc5215..03aab72dca9 100644
+ index c3f72bc521..016e30790a 100644
 --- a/torch/_higher_order_ops/triton_kernel_wrap.py
 +++ b/torch/_higher_order_ops/triton_kernel_wrap.py
- @@ -239,7 +239,7 @@ def generate_ttir(
+ @@ -184,6 +184,7 @@ def generate_ttir(
+ from triton.compiler.compiler import ASTSource
+ from triton.runtime.autotuner import Autotuner
+ from triton.runtime.jit import JITFunction
+ + from triton._utils import find_paths_if, get_iterable_path
+
+ import torch._inductor.ir
+ from torch._subclasses.fake_tensor import FakeTensor
+ @@ -233,24 +234,40 @@ def generate_ttir(
+ name for name, arg in ordered_args.items() if isinstance(arg, Tensor)
+ ]
+
+ - def _get_specialization(args): # type: ignore[no-untyped-def]
+ + def _get_specialization(kernel, *args): # type: ignore[no-untyped-def]
+ try:
+ - from triton.backends.compiler import AttrsDescriptor # noqa: F401
+ + from triton.runtime.jit import create_function_from_signature

  target = triton.runtime.driver.active.get_current_target()
  backend = triton.compiler.compiler.make_backend(target)
 - return backend.get_attrs_descriptor(args, kernel.params)
- + return backend.get_attrs_descriptor(kernel.params, args)
+ + # from: binder = create_function_from_signature(self.signature, self.params, backend)
+ + specialization = []
+ + # signature
+ + for name, kp in zip(kernel.signature.parameters.keys(), kernel.params):
+ + if kp.is_constexpr:
+ + specialization.append(f'("constexpr", {name})')
+ + else:
+ + is_const = 'True' if kp.is_const else 'False'
+ + specialize = 'False' if kp.do_not_specialize else 'True'
+ + align = 'False' if kp.do_not_specialize_on_alignment else 'True'
+ + ret = f"specialize_impl({name}, specialize_extra, {is_const}, {specialize}, {align})"
+ + if kp.annotation_type:
+ + specialization.append(f'("{kp.annotation_type}",) + {ret}[1:]')
+ + else:
+ + specialization.append(f"{ret}")
+ + return specialization
  except ImportError:
  return kernel._get_config(*args)

- @@ -248,9 +248,10 @@ def generate_ttir(
+ - specialization = _get_specialization(ordered_args.values())
+ + specialization = _get_specialization(kernel, ordered_args.values())
+ constants = {
  name: arg for name, arg in ordered_args.items() if not isinstance(arg, Tensor)
  }

@@ -71,7 +172,7 @@ index c3f72bc5215..03aab72dca9 100644
  for i, (name, arg) in enumerate(ordered_args.items())
  if i not in kernel.constexprs
  }
- @@ -258,7 +259,18 @@ def generate_ttir(
+ @@ -258,7 +275,22 @@ def generate_ttir(
  triton._C.libtriton.ir.load_dialects(context)
  backend.load_dialects(context)

@@ -87,10 +188,105 @@ index c3f72bc5215..03aab72dca9 100644
 + new_signature[arg_name] = signature[arg_name]
 + return new_signature
 +
- + src = ASTSource(kernel, fixup_signature(kernel.arg_names, signature, constants), constants, specialization)
+ + attrvals = [x[1] for x in specialization]
+ + from triton._utils import find_paths_if, get_iterable_path
+ + attrs = find_paths_if(attrvals, lambda _, x: isinstance(x, str))
+ + attrs = {k: backend.parse_attr(get_iterable_path(attrvals, k)) for k in attrs}
+ + src = ASTSource(kernel, fixup_signature(kernel.arg_names, signature, constants), constants, attrs)

  # Triton changes ASTSource.make_ir to take 3/4 arguments. Handle
  # backward compatibility here.
+ diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py
+ index a5fe2d1119..5ea0018957 100644
+ --- a/torch/_inductor/ir.py
+ +++ b/torch/_inductor/ir.py
+ @@ -5743,52 +5743,6 @@ class UserDefinedTritonKernel(ExternKernel):
+ for idx, kwarg in enumerate(self.ordered_kwargs_for_cpp_kernel):
+ if kernel.arg_names.index(kwarg) in kernel.constexprs:
+ constexpr_indices.append(idx)
+ - """
+ - Filter out None args.
+ -
+ - see https://github.com/pytorch/pytorch/issues/115344
+ -
+ - Two cases for a None arg:
+ - 1. The arg is already tl.constexpr, so leave it in
+ - 2. The arg is not tl.constexpr so we have to remove it
+ - """
+ - constexpr_indices_set = OrderedSet(constexpr_indices)
+ - REMOVED = object()
+ - raw_args = [
+ - (
+ - (idx, arg)
+ - if (arg is not None) or (arg is None and idx in constexpr_indices_set)
+ - else (idx, REMOVED)
+ - )
+ - for idx, arg in enumerate(raw_args)
+ - ]
+ - removed_none_args = [idx for idx, val in raw_args if val == REMOVED]
+ - raw_args = [val for idx, val in raw_args if val != REMOVED]
+ -
+ - # We have to compute the constexpr indices for the new, filtered raw_args
+ - # We also have to adjust equal_to_1.
+ - if removed_none_args:
+ - eq1_indices_set = OrderedSet[int](triton_meta["configs"][0].equal_to_1)
+ - constexpr_indices = []
+ - equal_to_1 = []
+ - index_shift = 0
+ - for idx, kwarg in enumerate(self.ordered_kwargs_for_cpp_kernel):
+ - # every time we encounter an idx we removed, adjust by one to account for it
+ - # So for example if we had [None, const X]
+ - # iter 1:
+ - # None was removed, adjust=1
+ - # iter 2:
+ - # X is const at idx=1, but the adjusted idx is 0 now, because None was removed
+ - if idx in removed_none_args:
+ - index_shift += 1
+ - continue
+ - arg_index = kernel.arg_names.index(kwarg)
+ - if arg_index in kernel.constexprs:
+ - constexpr_indices.append(idx - index_shift)
+ - if arg_index in eq1_indices_set:
+ - equal_to_1.append(idx - index_shift)
+ -
+ - triton_meta["configs"][0].equal_to_1 = equal_to_1
+
+ # Call to kernel
+ self.codegen_comment(wrapper)
+ diff --git a/torch/_inductor/codegen/cpp_wrapper_gpu.py b/torch/_inductor/codegen/cpp_wrapper_gpu.py
+ index ccd69bf828..2c89659132 100644
+ --- a/torch/_inductor/codegen/cpp_wrapper_gpu.py
+ +++ b/torch/_inductor/codegen/cpp_wrapper_gpu.py
+ @@ -551,28 +551,8 @@ class CppWrapperGpu(CppWrapperCpu):
+ )
+ kernel_var_name = self.generate_load_kernel_once(kernel_name, V.graph)
+
+ - # args with value 1 are added into equal_to_1 and constants
+ - # in triton_meta (in the Python codegen) which makes them
+ - # inlined in the PTX and compiled CUBIN
+ - arg_signatures = []
+ - if (
+ - triton_meta is not None
+ - and triton_meta.get("configs")
+ - and triton_meta.get("signature")
+ - ):
+ - equal_to_1 = triton_meta["configs"][0].equal_to_1
+ - call_args = [
+ - arg for i, arg in enumerate(call_args) if i not in equal_to_1
+ - ]
+ - arg_types = [t for i, t in enumerate(arg_types) if i not in equal_to_1]
+ - # extract the arg signatures from triton_meta
+ - arg_signatures = triton_meta["signature"].values()
+ - arg_signatures = [
+ - v for i, v in enumerate(arg_signatures) if i not in equal_to_1
+ - ]
+ -
+ call_args_str = self.generate_args_decl(
+ - call_args, arg_types, arg_signatures
+ + call_args, arg_types, list(triton_meta["signature"].values())
+ )
+ kernel_args_var = f"kernel_args_var_{next(self.kernel_callsite_id)}"
+ self.writeline(f"void* {kernel_args_var}[] = {{{call_args_str}}};")
 diff --git a/torch/_inductor/codegen/triton.py b/torch/_inductor/codegen/triton.py
 index 00031a56b8d..59086d41b40 100644
 --- a/torch/_inductor/codegen/triton.py
@@ -173,7 +369,7 @@ index fa2a1334380..4d730fd45de 100644
  except ImportError:
  from triton.compiler.compiler import AttrsDescriptor
 diff --git a/torch/_inductor/runtime/triton_heuristics.py b/torch/_inductor/runtime/triton_heuristics.py
- index 281d0e78ba4..3b059a365c9 100644
+ index 281d0e78ba..8b3857bf2e 100644
 --- a/torch/_inductor/runtime/triton_heuristics.py
 +++ b/torch/_inductor/runtime/triton_heuristics.py
 @@ -414,10 +414,21 @@ class CachingAutotuner(KernelInterface):
@@ -213,19 +409,40 @@ index 281d0e78ba4..3b059a365c9 100644
  ]
  binary_shared = (
  binary.shared if hasattr(binary, "shared") else binary.metadata.shared
- @@ -952,6 +961,7 @@ class CachingAutotuner(KernelInterface):
+ @@ -646,8 +655,11 @@ class CachingAutotuner(KernelInterface):
+ )
+ # reset to zero before evaluating any config
+ self.reset_to_zero_args(*args, **kwargs)
+ + new_cloned_args = [*cloned_args]
+ + for arg_name, arg_value in launcher.config.kwargs.items():
+ + new_cloned_args.insert(self.fn.arg_names.index(arg_name), arg_value)
+ launcher(
+ - *cloned_args,
+ + *new_cloned_args,
+ **cloned_kwargs,
+ grid=grid,
+ stream=stream,
+ @@ -950,15 +962,21 @@ class CachingAutotuner(KernelInterface):
+ "stream": stream,
+ },
  ):
+ + new_args = [*args]
+ + for arg_name, arg_value in launcher.config.kwargs.items():
+ + new_args.insert(self.fn.arg_names.index(arg_name), arg_value)
  return launcher(
- *args,
- + **launcher.config.kwargs ,
+ - *args,
+ + *new_args ,
  **kwargs,
  grid=grid,
  stream=stream,
- @@ -959,6 +969,7 @@ class CachingAutotuner(KernelInterface):
+ )
  else:
+ + new_args = [*args]
+ + for arg_name, arg_value in launcher.config.kwargs.items():
+ + new_args.insert(self.fn.arg_names.index(arg_name), arg_value)
  return launcher(
- *args,
- + **launcher.config.kwargs ,
+ - *args,
+ + *new_args ,
  **kwargs,
  grid=grid,
  stream=stream,