
Commit 763e5b7

Remove gpu_custom_call logic. (#9600)
This PR removes the implementation of `gpu_custom_call`, in line with the CUDA deprecation that started in release 2.8.

**Key Changes:**

- Delete both `ops/gpu_custom_call.cpp` and `ops/gpu_custom_call.h`
- (`tensor_methods.{h,cpp}`) Remove `tensor_methods::gpu_custom_call`
- (`ops/xla_ops.{h,cpp}`) Remove the `OpKindWrapper xla_gpu_custom_call` global variable
- (`init_python_bindings.cpp`) Remove the Python API function `_xla_gpu_custom_call`
- (`init_python_bindings.cpp`) Turn the `XlaCustomCall` function into the TPU-specific function `TpuCustomCall`

A brief usage sketch of the remaining TPU-only entry point follows below.
1 parent 004f19e commit 763e5b7
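For context, here is a minimal, illustrative Python sketch of the surviving TPU-only entry point. The signature is taken from the `_xla_tpu_custom_call` binding kept in `init_python_bindings.cpp`; the payload, shapes, and dtypes are placeholders, and a real call needs a TPU device plus a serialized kernel payload (e.g. produced by a Pallas/Mosaic compiler), so this is a sketch rather than a working program.

```python
import torch
import torch_xla
import torch_xla.core.xla_model as xm

# Placeholder only: a real payload is a serialized TPU kernel produced by a
# kernel compiler; this string will not execute.
PAYLOAD = "<serialized TPU custom-call payload>"

def run_tpu_custom_call():
    device = xm.xla_device()
    x = torch.randn(8, 128, device=device)

    # Binding kept by this commit:
    #   (inputs, payload, output_shapes, output_dtypes) -> list of Tensors
    outputs = torch_xla._XLAC._xla_tpu_custom_call(
        [x],              # inputs
        PAYLOAD,          # serialized kernel payload
        [[8, 128]],       # output_shapes
        [torch.float32],  # output_dtypes
    )
    return outputs

# The GPU counterpart, `_xla_gpu_custom_call`, is removed by this commit, so
# there is no longer an is_tpu-style dispatch on the Python side.
```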

9 files changed: +5 −157 lines


torch_xla/csrc/init_python_bindings.cpp

Lines changed: 4 additions & 18 deletions
@@ -38,7 +38,6 @@
 #include "pybind11/pytypes.h"
 #include "pybind11/stl.h"
 #include "pybind11/stl_bind.h"
-#include "status.h"
 #include "torch_xla/csrc/XLANativeFunctions.h"
 #include "torch_xla/csrc/aten_autograd_ops.h"
 #include "torch_xla/csrc/aten_fallback.h"
@@ -345,22 +344,18 @@ std::vector<std::vector<int64_t>> CreateReduceGroups(const py::list& groups) {
   return replica_groups;
 }
 
-std::vector<at::Tensor> XlaCustomCall(
+std::vector<at::Tensor> TpuCustomCall(
     const std::vector<at::Tensor>& inputs, const std::string& payload,
     const std::vector<std::vector<int64_t>>& output_shapes,
-    const std::vector<py::object>& output_dtypes, bool is_tpu) {
+    const std::vector<py::object>& output_dtypes) {
   std::vector<at::ScalarType> dtypes;
   dtypes.reserve(output_dtypes.size());
   for (auto& dtype : output_dtypes) {
     dtypes.push_back(reinterpret_cast<THPDtype*>(dtype.ptr())->scalar_type);
   }
   XLA_ASSIGN_OR_THROW(std::vector<absl_nonnull XLATensorPtr> xla_inputs,
                       bridge::GetXlaTensors(inputs));
-  if (is_tpu) {
-    return bridge::AtenFromXlaTensors(tensor_methods::tpu_custom_call(
-        xla_inputs, payload, output_shapes, dtypes));
-  }
-  return bridge::AtenFromXlaTensors(tensor_methods::gpu_custom_call(
+  return bridge::AtenFromXlaTensors(tensor_methods::tpu_custom_call(
       xla_inputs, payload, output_shapes, dtypes));
 }
 
@@ -3058,8 +3053,7 @@ void InitXlaModuleBindings(py::module m) {
              const std::vector<std::vector<int64_t>>& output_shapes,
              const std::vector<py::object>& output_dtypes)
               -> std::vector<at::Tensor> {
-           return XlaCustomCall(inputs, payload, output_shapes, output_dtypes,
-                                /*is_tpu=*/true);
+           return TpuCustomCall(inputs, payload, output_shapes, output_dtypes);
          })
      .def("_has_cuda_support",
           []() {
@@ -3069,14 +3063,6 @@ void InitXlaModuleBindings(py::module m) {
            return false;
 #endif
          })
-     .def("_xla_gpu_custom_call",
-          [](const std::vector<at::Tensor>& inputs, const std::string& payload,
-             const std::vector<std::vector<int64_t>>& output_shapes,
-             const std::vector<py::object>& output_dtypes)
-              -> std::vector<at::Tensor> {
-            return XlaCustomCall(inputs, payload, output_shapes, output_dtypes,
-                                 /*is_tpu=*/false);
-          })
      .def("_xla_register_custom_call_target",
           [](const std::string& fn_name, const py::capsule& function_ptr,
              const std::string& platform) {
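A hedged note for callers: with the `.def("_xla_gpu_custom_call", ...)` entry removed above, the attribute simply no longer exists on the native `torch_xla._XLAC` module, so code that needs to run across versions can feature-detect it as in this sketch (the variable names are illustrative).

```python
import torch_xla

# Only the TPU entry point remains after this commit; the GPU one is gone.
has_gpu_custom_call = hasattr(torch_xla._XLAC, "_xla_gpu_custom_call")
has_tpu_custom_call = hasattr(torch_xla._XLAC, "_xla_tpu_custom_call")

if not has_gpu_custom_call:
    # Builds containing this commit: use the TPU path, or fall back to a
    # different mechanism (e.g. the `_xla_register_custom_call_target`
    # binding kept above).
    pass
```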

torch_xla/csrc/ops/gpu_custom_call.cpp

Lines changed: 0 additions & 37 deletions
This file was deleted.

torch_xla/csrc/ops/gpu_custom_call.h

Lines changed: 0 additions & 25 deletions
This file was deleted.

torch_xla/csrc/ops/xla_ops.cpp

Lines changed: 0 additions & 1 deletion
@@ -39,6 +39,5 @@ const OpKindWrapper xla_unselect("xla::unselect");
 const OpKindWrapper xla_update_slice("xla::update_slice");
 const OpKindWrapper xla_custom_sharding("xla::custom_sharding");
 const OpKindWrapper xla_tpu_custom_call("xla::tpu_custom_call");
-const OpKindWrapper xla_gpu_custom_call("xla::gpu_custom_call");
 
 }  // namespace torch_xla

torch_xla/csrc/ops/xla_ops.h

Lines changed: 1 addition & 2 deletions
@@ -64,8 +64,7 @@ extern const OpKindWrapper xla_unselect;
 extern const OpKindWrapper xla_update_slice;
 extern const OpKindWrapper xla_custom_sharding;
 extern const OpKindWrapper xla_tpu_custom_call;
-extern const OpKindWrapper xla_gpu_custom_call;
 
 }  // namespace torch_xla
 
-#endif  // XLA_TORCH_XLA_CSRC_OPS_XLA_OPS_H_
+#endif  // XLA_TORCH_XLA_CSRC_OPS_XLA_OPS_H_

torch_xla/csrc/tensor_methods.cpp

Lines changed: 0 additions & 40 deletions
@@ -65,7 +65,6 @@
 #include "torch_xla/csrc/ops/generic.h"
 #include "torch_xla/csrc/ops/generic_slice.h"
 #include "torch_xla/csrc/ops/get_dimensions_size.h"
-#include "torch_xla/csrc/ops/gpu_custom_call.h"
 #include "torch_xla/csrc/ops/hardtanh_backward.h"
 #include "torch_xla/csrc/ops/index_ops.h"
 #include "torch_xla/csrc/ops/index_select.h"
@@ -767,45 +766,6 @@ void custom_sharding_(
   input->SetShardingSpec(*sharding_spec);
 }
 
-std::vector<XLATensorPtr> gpu_custom_call(
-    const std::vector<XLATensorPtr>& inputs, const std::string& payload,
-    const std::vector<std::vector<int64_t>>& output_shapes,
-    const std::vector<at::ScalarType>& output_dtypes) {
-  XLA_CHECK(inputs.size() > 0) << "inputs are empty";
-
-  std::vector<torch::lazy::Value> values;
-  values.reserve(inputs.size());
-  for (const auto& input : inputs) {
-    values.push_back(input->GetIrValue());
-  }
-
-  XLA_CHECK_EQ(output_shapes.size(), output_dtypes.size());
-  std::vector<xla::Shape> output_xla_shapes;
-  output_xla_shapes.reserve(output_shapes.size());
-  for (size_t i = 0; i < output_shapes.size(); ++i) {
-    output_xla_shapes.push_back(xla::ShapeUtil::MakeShape(
-        MakeXlaPrimitiveType(output_dtypes[i], &(inputs[0]->GetDevice())),
-        output_shapes[i]));
-  }
-
-  auto node = torch_xla::MakeNode<GpuCustomCall>(
-      values, xla::ShapeUtil::MakeTupleShape(output_xla_shapes), payload);
-
-  std::vector<XLATensorPtr> outputs;
-  outputs.reserve(output_shapes.size());
-  for (size_t i = 0; i < output_shapes.size(); ++i) {
-    outputs.push_back(inputs[0]->CreateFrom(torch::lazy::Value(node, i),
-                                            output_dtypes[i],
-                                            /*delay_eager_execution=*/true));
-  }
-  XLAGraphExecutor* graph_executor = XLAGraphExecutor::Get();
-  if (graph_executor->UseEagerMode()) {
-    // Execute the HLO that will run the `custom` and in one hlo
-    graph_executor->ApplyEagerSync(outputs);
-  }
-  return outputs;
-}
-
 std::vector<XLATensorPtr> tpu_custom_call(
     const std::vector<XLATensorPtr>& inputs, const std::string& payload,
     const std::vector<std::vector<int64_t>>& output_shapes,
torch_xla/csrc/tensor_methods.h

Lines changed: 0 additions & 5 deletions
@@ -103,11 +103,6 @@ void custom_sharding_(
     const std::shared_ptr<XLATensor::ShardingSpec>& spec,
     const CustomSharding::Type& type = CustomSharding::Type::kSharding);
 
-std::vector<XLATensorPtr> gpu_custom_call(
-    const std::vector<XLATensorPtr>& inputs, const std::string& payload,
-    const std::vector<std::vector<int64_t>>& output_shapes,
-    const std::vector<at::ScalarType>& output_dtypes);
-
 std::vector<XLATensorPtr> tpu_custom_call(
     const std::vector<XLATensorPtr>& inputs, const std::string& payload,
     const std::vector<std::vector<int64_t>>& output_shapes,

torch_xla/csrc/xla_lower_util.cpp

Lines changed: 0 additions & 25 deletions
@@ -1281,31 +1281,6 @@ xla::XlaOp BuildCustomSharding(const xla::XlaOp& input, const std::string& type,
                               output_shape);
 }
 
-std::vector<xla::XlaOp> BuildGpuCustomCall(
-    const std::vector<xla::XlaOp>& inputs, const xla::Shape& output_shape,
-    const std::string& payload) {
-  std::vector<xla::Shape> input_shapes;
-  input_shapes.reserve(inputs.size());
-  for (const auto& input : inputs) {
-    input_shapes.push_back(ShapeHelper::ShapeOfXlaOp(input));
-  }
-
-  XLA_CHECK(inputs.size() > 0) << "inputs are empty";
-  xla::XlaOp outputs = xla::CustomCallWithLayout(
-      inputs[0].builder(),
-      /*call_target_name=*/"triton_kernel_call", inputs, output_shape,
-      input_shapes, payload, false, {}, nullptr,
-      xla::CustomCallSchedule::SCHEDULE_NONE,
-      xla::CustomCallApiVersion::API_VERSION_STATUS_RETURNING);
-  std::vector<xla::XlaOp> result;
-  int num_outputs = output_shape.tuple_shapes_size();
-  result.reserve(num_outputs);
-  for (int i = 0; i < num_outputs; ++i) {
-    result.push_back(xla::GetTupleElement(outputs, i));
-  }
-  return result;
-}
-
 std::vector<xla::XlaOp> BuildTpuCustomCall(
     const std::vector<xla::XlaOp>& inputs, const xla::Shape& output_shape,
     const std::string& payload) {

torch_xla/csrc/xla_lower_util.h

Lines changed: 0 additions & 4 deletions
@@ -162,10 +162,6 @@ std::vector<xla::XlaOp> BuildTpuCustomCall(
 xla::XlaOp BuildNms(xla::XlaOp boxes, xla::XlaOp scores,
                     xla::XlaOp iou_threshold);
 
-std::vector<xla::XlaOp> BuildGpuCustomCall(
-    const std::vector<xla::XlaOp>& inputs, const xla::Shape& output_shape,
-    const std::string& payload);
-
 }  // namespace torch_xla
 
 #endif  // XLA_TORCH_XLA_CSRC_XLA_LOWER_UTIL_H_
