
Commit 004f19e

Remove CUDA specific logic from runtime. (#9598)
This PR removes CUDA-specific logic from the `torch_xla/csrc/runtime` directory, as well as uses of the deleted functions and environment variables from outside it. This is in line with the CUDA deprecation that started with release 2.8.

**Key Changes:**
- Removed the environment variable `ZERO_COPY_ENABLED`, which enabled DLPack-based moves of tensors from PyTorch CUDA to the PyTorch/XLA XLA:CUDA device without copying
- Removed the Python API function `_get_stream_for_cuda_device`, which was used in `dlpack.py` for DLPack logic on CUDA capsules
- Removed `ComputationClient::GetCudaStreamForDevice()`, which was used by the Python API function above
- Removed `PjRtComputationClient::RegisterCustomCall()`, since it only worked when `platform == "CUDA"`
- Removed `GetGpuAllocatorConfig()`
- Removed the `from_xla_cuda_to_cuda()` DLPack function
- Removed the CUDA branch from `InitializePjRt()`
1 parent e7b1159 commit 004f19e

12 files changed (+12 / -195 lines)
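
For readers unfamiliar with the removed zero-copy path: `ZERO_COPY_ENABLED` let the dynamo bridge hand a CUDA buffer to XLA:CUDA through DLPack instead of staging it on the host. The sketch below only illustrates the underlying DLPack exchange with stock PyTorch APIs (`torch.utils.dlpack`); it does not use the deleted torch_xla helpers and assumes a CUDA-enabled PyTorch build.

```python
import torch
from torch.utils.dlpack import to_dlpack, from_dlpack

# Illustration only: a DLPack capsule lets two consumers share one device
# buffer instead of copying it.
src = torch.arange(4, dtype=torch.float32, device="cuda")

capsule = to_dlpack(src)      # export the CUDA buffer as a DLPack capsule
alias = from_dlpack(capsule)  # re-import it; no device copy takes place

# Both tensors alias the same memory, which is the property the removed
# CUDA to XLA:CUDA zero-copy path relied on.
alias[0] = 42.0
assert src[0].item() == 42.0
```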

test/dynamo/test_dynamo.py

Lines changed: 0 additions & 6 deletions
```diff
@@ -157,9 +157,6 @@ def _choose_proper_device(self, initialize_on_cuda):
       self.skipTest(
           "Skip this test because it requires xr.device_type()=='CUDA' and torch.cuda.is_available()."
       )
-    os.environ.update({
-        xenv.ZERO_COPY_ENABLED: "1",
-    })
     return "cuda:0"
 
   @skipOnNeuron
@@ -205,9 +202,6 @@ def test_simple_model(self):
       "1",
   )
   def test_simple_model_automoves_tensors(self, zero_copy_enabled):
-    os.environ.update({
-        xenv.ZERO_COPY_ENABLED: zero_copy_enabled,
-    })
     x = torch.tensor(100.0, requires_grad=True, device="cuda:0")
     y = torch.tensor(200.0, requires_grad=True, device="cuda:0")
     original_device = x.device
```

torch_xla/_dynamo/dynamo_bridge.py

Lines changed: 4 additions & 13 deletions
```diff
@@ -148,19 +148,10 @@ def _maybe_move_tensors_to_device(tensors: tuple,
     if dynamo_debug:
       print("Moving Tensor {} to device {}".format(tensor, target_device))
 
-    zero_copy_enabled = xu.getenv_as(xenv.ZERO_COPY_ENABLED, bool, defval=False)
-    if zero_copy_enabled and tensor.device.type == 'cuda' and target_device.type == 'xla':
-      # If the input cuda tensor requires gradient, we need to call detach. Otherwise, we'd get the error "RuntimeError: Can't export tensors that require gradient, use tensor.detach()"
-      moved_tensor = torch_xla_dlpack.from_dlpack(tensor.detach())
-    elif zero_copy_enabled and tensor.device.type == 'xla' and target_device.type == 'cuda':
-      # `torch_xla.sync()` is need to make sure the pjrt buffer is valid.
-      torch_xla.sync()
-      moved_tensor = torch_xla_dlpack.from_xla_cuda_to_cuda(tensor)
-    else:
-      # Have to move to CPU before moving it to target device.
-      cpu_device: torch.device = torch.device("cpu")
-      moved_tensor = tensor.to(cpu_device)
-      moved_tensor = moved_tensor.to(target_device)
+    # Have to move to CPU before moving it to target device.
+    cpu_device: torch.device = torch.device("cpu")
+    moved_tensor = tensor.to(cpu_device)
+    moved_tensor = moved_tensor.to(target_device)
 
     # Explicitly have to copy requires_grad attribute because it's dropped
     # with torch.to(..)
```
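
With the DLPack branches gone, every cross-device move in the bridge goes through host memory. Below is a minimal standalone sketch of that remaining path; the helper name `move_via_cpu` is hypothetical, and the `requires_grad` restoration mirrors the comment kept in the context lines above.

```python
import torch

def move_via_cpu(tensor: torch.Tensor, target_device: torch.device) -> torch.Tensor:
  """Hypothetical helper mirroring the simplified bridge logic."""
  with torch.no_grad():
    # Have to move to CPU before moving it to the target device.
    moved = tensor.to(torch.device("cpu"))
    moved = moved.to(target_device)
  # Tensor.to(..) drops requires_grad, so copy it back explicitly.
  moved.requires_grad_(tensor.requires_grad)
  return moved

# Usage (requires a CUDA build plus an XLA device):
# x = torch.tensor(100.0, requires_grad=True, device="cuda:0")
# x_xla = move_via_cpu(x, torch.device("xla:0"))
```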

torch_xla/core/xla_env_vars.py

Lines changed: 0 additions & 1 deletion
```diff
@@ -30,4 +30,3 @@
 RANK = 'RANK'
 WORLD_SIZE = 'WORLD_SIZE'
 LOCAL_WORLD_SIZE = 'LOCAL_WORLD_SIZE'
-ZERO_COPY_ENABLED = 'ZERO_COPY_ENABLED'
```

torch_xla/csrc/init_python_bindings.cpp

Lines changed: 0 additions & 5 deletions
```diff
@@ -1771,11 +1771,6 @@ void InitXlaModuleBindings(py::module m) {
            []() {
              return runtime::GetComputationClientOrDie()->GetPlatformVersion();
            })
-      .def("_get_stream_for_cuda_device",
-           [](const int device_id) {
-             return runtime::GetComputationClientOrDie()->GetCudaStreamForDevice(
-                 device_id);
-           })
       .def("_xla_num_devices",
            []() -> int64_t {
              if (UseVirtualDevice()) {
```
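
Any Python code that still calls the deleted binding will now fail at attribute lookup on the extension module. A defensive sketch is below; it assumes the binding was exposed on `torch_xla._XLAC`, and the `None` fallback is purely illustrative.

```python
import torch_xla

# The binding was removed in this commit, so guard the lookup instead of
# calling torch_xla._XLAC._get_stream_for_cuda_device(device_id) directly.
get_stream = getattr(torch_xla._XLAC, "_get_stream_for_cuda_device", None)
cuda_stream = get_stream(0) if get_stream is not None else None
```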

torch_xla/csrc/runtime/computation_client.h

Lines changed: 0 additions & 2 deletions
```diff
@@ -375,8 +375,6 @@ class ComputationClient {
   virtual absl::StatusOr<xla::PjRtDevice*> LookupAddressableDevice(
       int local_device_id) const = 0;
 
-  virtual std::intptr_t GetCudaStreamForDevice(int local_device_id) const = 0;
-
   virtual size_t GetNumLocalDevices() const = 0;
 
   virtual size_t GetNumDevices() const = 0;
```

torch_xla/csrc/runtime/env_vars.h

Lines changed: 0 additions & 4 deletions
```diff
@@ -10,23 +10,19 @@ namespace env {
 inline constexpr char kEnvLocalWorker[] = "LOCAL_WORKER";
 inline constexpr char kEnvTpuConfig[] = "TPU_CONFIG";
 inline constexpr char kEnvNumTpu[] = "TPU_NUM_DEVICES";
-inline constexpr char kEnvNumGpu[] = "GPU_NUM_DEVICES";
 inline constexpr char kEnvNumCpu[] = "CPU_NUM_DEVICES";
 inline constexpr char kEnvTpuvmMode[] = "TPUVM_MODE";
 inline constexpr char kEnvPjRtDevice[] = "PJRT_DEVICE";
 inline constexpr char kEnvPjRtTpuMaxInflightComputations[] =
     "PJRT_TPU_MAX_INFLIGHT_COMPUTATIONS";
 inline constexpr char kEnvPjrtAsyncCpuClient[] = "PJRT_CPU_ASYNC_CLIENT";
-inline constexpr char kEnvPjrtAsyncGpuClient[] = "PJRT_GPU_ASYNC_CLIENT";
 inline constexpr char kEnvTpuLibraryPath[] = "TPU_LIBRARY_PATH";
 inline constexpr char kEnvInferredTpuLibraryPath[] = "PTXLA_TPU_LIBRARY_PATH";
 inline constexpr char kEnvXpuLibraryPath[] = "XPU_LIBRARY_PATH";
 inline constexpr char kEnvNeuronLibraryPath[] = "NEURON_LIBRARY_PATH";
 inline constexpr char kEnvPjrtDistServiceAddr[] = "PJRT_DIST_SERVICE_ADDR";
 inline constexpr char kEnvPjRtLocalProcessCount[] = "PJRT_LOCAL_PROCESS_COUNT";
 inline constexpr char kEnvPjRtLocalRank[] = "PJRT_LOCAL_PROCESS_RANK";
-inline constexpr char kEnvPjrtAllocatorCudaAsync[] =
-    "PJRT_ALLOCATOR_CUDA_ASYNC";
 inline constexpr char kEnvPjrtAllocatorPreallocate[] =
     "PJRT_ALLOCATOR_PREALLOCATE";
 inline constexpr char kEnvPjrtAllocatorFraction[] = "PJRT_ALLOCATOR_FRACTION";
```

torch_xla/csrc/runtime/ifrt_computation_client.cpp

Lines changed: 0 additions & 2 deletions
```diff
@@ -161,8 +161,6 @@ IfrtComputationClient::Create() {
 }
 
 IfrtComputationClient::~IfrtComputationClient() {
-  // In the GPU case, the PjRtClient depends on the DistributedRuntimeClient
-  // tracked in XlaCoordinator, so the PjRtClient must be destroyed first.
   client_ = nullptr;
   coordinator_ = nullptr;
 }
```

torch_xla/csrc/runtime/ifrt_computation_client.h

Lines changed: 0 additions & 4 deletions
```diff
@@ -110,10 +110,6 @@ class IfrtComputationClient : public ComputationClient {
     XLA_ERROR() << __FUNCTION__ << " not implemented";
   }
 
-  std::intptr_t GetCudaStreamForDevice(int local_device_id) const override {
-    XLA_ERROR() << __FUNCTION__ << " not implemented";
-  }
-
  std::vector<std::string> GetLocalDevices() const override;

  std::vector<std::string> GetAllDevices() const override;
```

torch_xla/csrc/runtime/pjrt_computation_client.cpp

Lines changed: 0 additions & 42 deletions
```diff
@@ -23,7 +23,6 @@
 #include "xla/hlo/builder/xla_builder.h"
 #include "xla/hlo/builder/xla_computation.h"
 #include "xla/literal.h"
-#include "xla/pjrt/c/pjrt_c_api_gpu_extension.h"
 #include "xla/pjrt/c/pjrt_c_api_wrapper_impl.h"
 #include "xla/pjrt/pjrt_api.h"
 #include "xla/pjrt/pjrt_c_api_client.h"
@@ -152,8 +151,6 @@ PjRtComputationClient::Create() {
 }
 
 PjRtComputationClient::~PjRtComputationClient() {
-  // In the GPU case, the PjRtClient depends on the DistributedRuntimeClient
-  // tracked in XlaCoordinator, so the PjRtClient must be destroyed first.
   client_ = nullptr;
   coordinator_ = nullptr;
 }
@@ -1038,45 +1035,6 @@ ComputationClient::MemoryInfo PjRtComputationClient::GetMemoryInfo(
   };
 }
 
-void PjRtComputationClient::RegisterCustomCall(const std::string& fn_name,
-                                               void* function_ptr,
-                                               const std::string& platform) {
-  if (platform != "CUDA") {
-    XLA_ERROR() << "Custom call targets can only be registered for "
-                   "PJRT CUDA runtime.";
-    return;
-  }
-
-  auto* c_api_client = dynamic_cast<xla::PjRtCApiClient*>(client_.get());
-  if (!c_api_client) {
-    XLA_REGISTER_CUSTOM_CALL_TARGET_WITH_SYM(fn_name, function_ptr, platform);
-    return;
-  }
-  const PJRT_Api* pjrt_api = c_api_client->pjrt_c_api();
-
-  // See openxla reference:
-  // https://github.com/openxla/xla/blob/b604c8d87df842002a7a8de79a434026329fbcb2/xla/pjrt/c/pjrt_c_api_gpu_test.cc#L414
-  const PJRT_Extension_Base* next =
-      reinterpret_cast<const PJRT_Extension_Base*>(pjrt_api->extension_start);
-  while (next != nullptr &&
-         next->type !=
-             PJRT_Extension_Type::PJRT_Extension_Type_Gpu_Custom_Call) {
-    next = next->next;
-  }
-  XLA_CHECK(next) << "Custom call extension not found";
-  PJRT_Gpu_Register_Custom_Call_Args args;
-  args.struct_size = PJRT_Gpu_Register_Custom_Call_Args_STRUCT_SIZE;
-  args.function_name = fn_name.c_str();
-  args.function_name_size = fn_name.size();
-  args.api_version = 0;
-  args.handler_execute = function_ptr;
-  PJRT_Error* error =
-      reinterpret_cast<const PJRT_Gpu_Custom_Call*>(next)->custom_call(&args);
-  if (error) {
-    XLA_ERROR() << error->status;
-  }
-}
-
 void PjRtComputationClient::OnReadyCallback(
     ComputationClient::DataPtr data, const std::function<void()>& callback) {
   std::shared_ptr<xla::PjRtBuffer> buffer;
```

torch_xla/csrc/runtime/pjrt_computation_client.h

Lines changed: 3 additions & 12 deletions
```diff
@@ -118,17 +118,6 @@ class PjRtComputationClient : public ComputationClient {
         xla::PjRtLocalDeviceId(local_device_id));
   }
 
-  std::intptr_t GetCudaStreamForDevice(int local_device_id) const override {
-    absl::StatusOr<xla::PjRtDevice*> pjrt_device =
-        client_->LookupAddressableDevice(
-            xla::PjRtLocalDeviceId(local_device_id));
-    XLA_CHECK(pjrt_device.ok()) << "Failed to get a PjRt device.";
-    absl::StatusOr<std::intptr_t> stream =
-        pjrt_device.value()->GetStreamForExternalReadyEvents();
-    XLA_CHECK(stream.ok()) << "Failed to get a stream.";
-    return stream.value();
-  }
-
   std::vector<std::string> GetLocalDevices() const override;
 
   std::vector<std::string> GetAllDevices() const override;
@@ -169,7 +158,9 @@ class PjRtComputationClient : public ComputationClient {
       absl::Span<xla::PjRtDevice* const> devices) const;
 
   void RegisterCustomCall(const std::string& fn_name, void* function_ptr,
-                          const std::string& platform) override;
+                          const std::string& platform) override {
+    XLA_ERROR() << __FUNCTION__ << " not implemented";
+  };
 
   void OnReadyCallback(DataPtr data,
                        const std::function<void()>& callback) override;
```
