
Commit fb2c750

desertfire authored and pytorchmergebot committed
[AOTI][refactor] Move convert_arrayref_tensor_to_tensor logic (pytorch#139030)
Summary: Move convert_arrayref_tensor_to_tensor codegen logic to cpp_wrapper_cpu_array_ref.py

Test Plan: CI

Differential Revision: D64904187

Pull Request resolved: pytorch#139030
Approved by: https://github.com/hl475
1 parent 949fdd2 commit fb2c750
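
In short, the commit moves per-argument convert_arrayref_tensor_to_tensor wrapping out of the generic CPU wrapper codegen and into the ArrayRef-specific code path. A minimal, self-contained Python sketch of that override pattern, using a hypothetical codegen_tensor_arg hook standing in for the several real methods touched below:

class CppWrapperCpu:
    # Generic CPU wrapper codegen: emit the tensor handle as-is.
    def codegen_tensor_arg(self, tensor):  # hypothetical hook name
        return tensor

class CppWrapperCpuArrayRef(CppWrapperCpu):
    # ArrayRef variant: ArrayRefTensor buffers must be converted to plain
    # AtenTensorHandles before being passed to C-shim functions.
    def codegen_tensor_arg(self, tensor):
        return f"convert_arrayref_tensor_to_tensor({tensor})"

print(CppWrapperCpu().codegen_tensor_arg("buf0"))
# buf0
print(CppWrapperCpuArrayRef().codegen_tensor_arg("buf0"))
# convert_arrayref_tensor_to_tensor(buf0)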

File tree

2 files changed: +74 -79 lines


torch/_inductor/codegen/cpp_wrapper_cpu.py

Lines changed: 18 additions & 65 deletions
@@ -822,20 +822,12 @@ def codegen_tensor_item(
         if dtype == torch.float16 or dtype == torch.bfloat16:
             scalar_tmp = f"{scalar}_tmp"
             writer.writeline(f"{DTYPE_TO_CPP[dtype]} {scalar_tmp};")
-
-            # need convert_arrayref_tensor_to_tensor for ArrayRefTensors
-            tensor = f"convert_arrayref_tensor_to_tensor({tensor})"
-
             writer.writeline(
                 f"AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_item_{dtype_str}({tensor}, &{scalar_tmp}));"
             )
             writer.writeline(f"float {scalar} = float({scalar_tmp});")
         else:
             writer.writeline(f"{DTYPE_TO_CPP[dtype]} {scalar};")
-
-            # need convert_arrayref_tensor_to_tensor for ArrayRefTensors
-            tensor = f"convert_arrayref_tensor_to_tensor({tensor})"
-
             writer.writeline(
                 f"AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_item_{dtype_str}({tensor}, &{scalar}));"
             )
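
This hunk drops the ArrayRef conversion from the generic codegen_tensor_item; the same method, conversion included, is re-added to the ArrayRef subclass later in this commit. A runnable sketch (a stand-in DTYPE_TO_CPP entry and illustrative buffer/scalar names) of the C++ lines each variant emits for a float16 item:

DTYPE_TO_CPP = {"float16": "half"}  # stand-in mapping for this sketch only

def emit_item_lines(tensor, scalar, wrap_arrayref):
    lines = [f"{DTYPE_TO_CPP['float16']} {scalar}_tmp;"]
    if wrap_arrayref:  # only the ArrayRef variant converts the handle first
        tensor = f"convert_arrayref_tensor_to_tensor({tensor})"
    lines.append(
        f"AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_item_float16({tensor}, &{scalar}_tmp));"
    )
    lines.append(f"float {scalar} = float({scalar}_tmp);")
    return lines

print(emit_item_lines("buf2", "s0", wrap_arrayref=False)[1])
# AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_item_float16(buf2, &s0_tmp));
print(emit_item_lines("buf2", "s0", wrap_arrayref=True)[1])
# AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_item_float16(convert_arrayref_tensor_to_tensor(buf2), &s0_tmp));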
@@ -939,6 +931,7 @@ def generate_end(self, result):
             f"""
             '''
             )
+
             inductor_entry = CppWrapperCodeCache.load_pybinding(
                 ["std::vector<AtenTensorHandle>"], cpp_wrapper_src, "{self.device}", {len(V.graph.graph_outputs)})
             """
@@ -1014,29 +1007,12 @@ def get_c_shim_func_name(self, kernel):
         return shim_fn

     def generate_c_shim_extern_kernel_call(self, kernel, args):
-        wrapped_args = []
         debug_printer_manager = V.graph.wrapper_code.debug_printer
-
-        for x in args:
-            pieces = x.split(", ")
-            for piece in pieces:
-                # We only really *need* convert_arrayref_tensor_to_tensor for
-                # ArrayRefTensors. The code flowing into here uses `0` for nullptr,
-                # which convert_arrayref_tensor_to_tensor would blindly coerce to int,
-                # so just avoid wrapping integers.
-                # Name matching is to find tensor is hacky, but fixing all the
-                # ArrayRefTensor issues is not a priority for now.
-                if isinstance(piece, str) and piece.startswith(
-                    ("buf", "arg", "wrap_with_raii_handle_if_needed")
-                ):
-                    piece = f"convert_arrayref_tensor_to_tensor({piece})"
-                wrapped_args.append(piece)
-
         debug_printer_manager.set_printer_args(args, kernel, None, None, "extern")
         with debug_printer_manager:
             shim_fn = self.get_c_shim_func_name(kernel)
             self.writeline(
-                f"AOTI_TORCH_ERROR_CODE_CHECK({shim_fn}({', '.join(wrapped_args)}));"
+                f"AOTI_TORCH_ERROR_CODE_CHECK({shim_fn}({', '.join(args)}));"
             )

     def generate_c_shim_extern_kernel_alloc(self, extern_kernel, args):
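
For reference, the loop deleted here split each argument string on ", " and wrapped only pieces that looked like tensor names, because extern-call args use `0` for nullptr and convert_arrayref_tensor_to_tensor would blindly coerce an integer. A runnable sketch of that heuristic (the isinstance check is dropped, since str.split always yields strings):

def wrap_extern_args(args):
    wrapped = []
    for x in args:
        for piece in x.split(", "):
            # Name-prefix matching: wrap likely tensor handles, skip `0`/ints.
            if piece.startswith(("buf", "arg", "wrap_with_raii_handle_if_needed")):
                piece = f"convert_arrayref_tensor_to_tensor({piece})"
            wrapped.append(piece)
    return wrapped

print(wrap_extern_args(["buf0, 0, arg1_1"]))
# ['convert_arrayref_tensor_to_tensor(buf0)', '0', 'convert_arrayref_tensor_to_tensor(arg1_1)']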
@@ -1121,15 +1097,8 @@ def generate_scatter_fallback(
         cpp_kernel_name = self.get_c_shim_func_name(cpp_kernel_name)
         # TODO: consider remove "_out" and add missing inplace variants to fallback_ops.py
         cpp_kernel_name = cpp_kernel_name.replace("__", "_") + "_out"
-        inputs_wrapped = [
-            (
-                f"convert_arrayref_tensor_to_tensor({x})"
-                if isinstance(x, str)
-                else str(x)
-            )
-            for x in inputs
-        ]
-        line = f"{cpp_kernel_name}(convert_arrayref_tensor_to_tensor({output}), {','.join(inputs_wrapped)}"
+        inputs_wrapped = [str(x) for x in inputs]
+        line = f"{cpp_kernel_name}({output}, {','.join(inputs_wrapped)}"

         if python_kernel_name.startswith("aten.scatter_reduce"):
             line += f", {','.join(kwargs)}"
@@ -1150,25 +1119,16 @@ def generate_index_put_fallback(self, kernel, x, indices, values, accumulate):
         # RAIIAtenTensorHandle(tmp_tensor_handle_2) in a tmp array can cause the correponding
         # tensor prematurely deallocated, thus this std::vector().data() trick here.
         indices_str = (
-            "std::vector<AtenTensorHandle>{"
-            + (
-                ", ".join(
-                    [f"convert_arrayref_tensor_to_tensor({ind})" for ind in indices]
-                )
-            )
-            + "}.data()"
+            "std::vector<AtenTensorHandle>{" + (", ".join(indices)) + "}.data()"
         )
         args = [
-            f"convert_arrayref_tensor_to_tensor({x})",
+            x,
             indices_str,
             str(len(indices)),
-            f"convert_arrayref_tensor_to_tensor({values})",
+            values,
             accumulate,
         ]
-        args.insert(
-            0, f"convert_arrayref_tensor_to_tensor({x})"
-        )  # set x as the output tensor, this fallback mutates x.
-
+        args.insert(0, x)  # set x as the output tensor, this fallback mutates x.
         self.writeline(self.wrap_kernel_call(kernel, args))

     def add_benchmark_harness(self, output):
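
The std::vector trick the surviving comment refers to: materializing the index handles inside a std::vector<AtenTensorHandle>{...} keeps the temporaries alive for the whole enclosing C++ expression, whereas RAIIAtenTensorHandle temporaries in a braced array could be destroyed before the kernel call. A sketch of the string the wrapper now builds (illustrative handle names):

indices = ["tmp_tensor_handle_0", "tmp_tensor_handle_1"]  # illustrative names
indices_str = "std::vector<AtenTensorHandle>{" + ", ".join(indices) + "}.data()"
print(indices_str)
# std::vector<AtenTensorHandle>{tmp_tensor_handle_0, tmp_tensor_handle_1}.data()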
@@ -1362,12 +1322,10 @@ def codegen_alloc_from_pool(self, name, offset, dtype, shape, stride) -> str:
         return f"RAIIAtenTensorHandle({tmp_name})"

     def codegen_reinterpret_view(
-        self, data, size_list, stride_list, offset, writer, dtype=None
+        self, data, size, stride, offset, writer, dtype=None
     ) -> str:
-        dim = str(len(size_list))
+        dim = str(len(size))
         original_offset = offset
-        size = self.codegen_shape_tuple(size_list)
-        stride = self.codegen_shape_tuple(stride_list)
         offset = self.codegen_sizevar(offset)
         call_strs = []
         final_tmp_name = None
@@ -1379,15 +1337,15 @@ def create_reinterpret_call() -> Tuple[str, str]:
                 f"{data.get_name()}",
                 dim,
                 self.codegen_int_array_var(
-                    size,
+                    self.codegen_shape_tuple(size),
                     writer,
-                    known_statically=self.is_statically_known_list_of_ints(size_list),
+                    known_statically=self.is_statically_known_list_of_ints(size),
                     graph=self.get_codegened_graph(),
                 ),
                 self.codegen_int_array_var(
-                    stride,
+                    self.codegen_shape_tuple(stride),
                     writer,
-                    known_statically=self.is_statically_known_list_of_ints(stride_list),
+                    known_statically=self.is_statically_known_list_of_ints(stride),
                     graph=self.get_codegened_graph(),
                 ),
                 offset,
@@ -1417,8 +1375,8 @@ def create_dtypeview_call(reinterpret_call: str) -> Tuple[str, List[str]]:
             return tmp_RAIIAtenTensorHandle, call_strs

         if (
-            size_list == data.layout.size
-            and stride_list == data.layout.stride
+            size == data.layout.size
+            and stride == data.layout.stride
             and original_offset == data.layout.offset
         ):
             # pure dtypeview
@@ -1428,7 +1386,7 @@ def create_dtypeview_call(reinterpret_call: str) -> Tuple[str, List[str]]:
                 final_tmp_name = tmp_output_name
                 final_tmp_name_is_RAIIAtenTensorHandle = True
             else:
-                return f"{data.get_name()}"
+                return data.get_name()
         else:
             # firstly create reinterpretview
             final_tmp_name, reinterpret_call = create_reinterpret_call()
@@ -2230,12 +2188,7 @@ def val_to_arg_str(self, val, type_=None) -> str:
             return self.val_to_arg_str_for_prim_type(val, type_)

     def create_tmp_raii_handle_var(self, base_handle):
-        if base_handle.startswith(
-            (
-                "convert_arrayref_tensor_to_tensor",
-                "wrap_with_raii_handle_if_needed",
-            )
-        ):
+        if base_handle.startswith(("wrap_with_raii_handle_if_needed",)):
             # wrap_with_raii_handle_if_needed creates a temp RAIIAtenTensorHandle, so we need to
             # explicitly store it. Otherwise, it will be destroyed before the fallback kernel call.
             tmp_var_name = f"var_{next(self.arg_var_id)}"
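
A side effect of this file's codegen_reinterpret_view change is worth spelling out: size and stride now stay raw lists throughout the method instead of being stringified up front, so checks like size == data.layout.size compare the actual lists, and codegen_shape_tuple is applied only where C++ text is emitted. A sketch with a stand-in codegen_shape_tuple:

def codegen_shape_tuple(shape):  # stand-in for the real helper
    return "{" + ", ".join(str(s) for s in shape) + "}"

size = [2, 3]                     # kept as a list for structural checks
print(size == [2, 3])             # True: list comparison still meaningful
print(codegen_shape_tuple(size))  # "{2, 3}": rendered only at the use site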

torch/_inductor/codegen/cpp_wrapper_cpu_array_ref.py

Lines changed: 56 additions & 14 deletions
@@ -775,12 +775,10 @@ def codegen_device_copy(self, src, dst, non_blocking: bool):
         )

     def codegen_reinterpret_view(
-        self, data, size_list, stride_list, offset, writer, dtype=None
+        self, data, size, stride, offset, writer, dtype=None
     ) -> str:
-        dim = str(len(size_list))
+        dim = str(len(size))
         original_offset = offset
-        size = self.codegen_shape_tuple(size_list)
-        stride = self.codegen_shape_tuple(stride_list)
         offset = self.codegen_sizevar(offset)
         call_strs = []
         final_tmp_name = None
@@ -792,15 +790,15 @@ def create_reinterpret_call() -> Tuple[str, str]:
                 f"{data.get_name()}",
                 dim,
                 self.codegen_int_array_var(
-                    size,
+                    self.codegen_shape_tuple(size),
                     writer,
-                    known_statically=self.is_statically_known_list_of_ints(size_list),
+                    known_statically=self.is_statically_known_list_of_ints(size),
                     graph=self.get_codegened_graph(),
                 ),
                 self.codegen_int_array_var(
-                    stride,
+                    self.codegen_shape_tuple(stride),
                     writer,
-                    known_statically=self.is_statically_known_list_of_ints(stride_list),
+                    known_statically=self.is_statically_known_list_of_ints(stride),
                     graph=self.get_codegened_graph(),
                 ),
                 offset,
@@ -830,8 +828,8 @@ def create_dtypeview_call(reinterpret_call: str) -> Tuple[str, List[str]]:
             return tmp_RAIIAtenTensorHandle, call_strs

         if (
-            size_list == data.layout.size
-            and stride_list == data.layout.stride
+            size == data.layout.size
+            and stride == data.layout.stride
             and original_offset == data.layout.offset
         ):
             # pure dtypeview
@@ -841,7 +839,7 @@ def create_dtypeview_call(reinterpret_call: str) -> Tuple[str, List[str]]:
                 final_tmp_name = tmp_output_name
                 final_tmp_name_is_RAIIAtenTensorHandle = True
             else:
-                return f"{data.get_name()}"
+                return data.get_name()
         else:
             # firstly create reinterpretview
             final_tmp_name, reinterpret_call = create_reinterpret_call()
@@ -861,9 +859,9 @@ def create_dtypeview_call(reinterpret_call: str) -> Tuple[str, List[str]]:

         if (
             self.can_stack_allocate_buffer(data)
-            and self.is_statically_known_list_of_ints(size_list)
-            and self.is_statically_known_list_of_ints(stride_list)
-            and ir.is_contiguous_strides_for_shape(stride_list, size_list)
+            and self.is_statically_known_list_of_ints(size)
+            and self.is_statically_known_list_of_ints(stride)
+            and ir.is_contiguous_strides_for_shape(stride, size)
         ):
             return final_tmp_name

@@ -986,3 +984,47 @@ def val_to_arg_str(self, val, type_=None) -> str:
             return f"{var_name}, {len(val)}"

         return self.val_to_arg_str_for_prim_type(val, type_)
+
+    def codegen_tensor_item(
+        self, dtype: torch.dtype, tensor: str, scalar: str, indented_buffer=None
+    ):
+        dtype_str = str(dtype).split(".")[-1]
+        writer = indented_buffer or self
+
+        if dtype == torch.float16 or dtype == torch.bfloat16:
+            scalar_tmp = f"{scalar}_tmp"
+            writer.writeline(f"{DTYPE_TO_CPP[dtype]} {scalar_tmp};")
+
+            # need convert_arrayref_tensor_to_tensor for ArrayRefTensors
+            tensor = f"convert_arrayref_tensor_to_tensor({tensor})"
+
+            writer.writeline(
+                f"AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_item_{dtype_str}({tensor}, &{scalar_tmp}));"
+            )
+            writer.writeline(f"float {scalar} = float({scalar_tmp});")
+        else:
+            writer.writeline(f"{DTYPE_TO_CPP[dtype]} {scalar};")
+
+            # need convert_arrayref_tensor_to_tensor for ArrayRefTensors
+            tensor = f"convert_arrayref_tensor_to_tensor({tensor})"
+
+            writer.writeline(
+                f"AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_item_{dtype_str}({tensor}, &{scalar}));"
+            )
+
+    def create_tmp_raii_handle_var(self, base_handle):
+        if base_handle.startswith(
+            (
+                "convert_arrayref_tensor_to_tensor",
+                "wrap_with_raii_handle_if_needed",
+            )
+        ):
+            # wrap_with_raii_handle_if_needed creates a temp RAIIAtenTensorHandle, so we need to
+            # explicitly store it. Otherwise, it will be destroyed before the fallback kernel call.
+            tmp_var_name = f"var_{next(self.arg_var_id)}"
+            return (
+                tmp_var_name,
+                f"RAIIAtenTensorHandle {tmp_var_name} = {base_handle};\n",
+            )
+        else:
+            return "", ""
