
Commit c0eeb57

Remove CUDA specific path from internal Python packages. (#9606)
This PR removes CUDA-specific code from internal Python packages, such as `_dynamo`, files in `_internal`, and the main `__init__.py` file. This is in line with the CUDA deprecation that started in release 2.8.

**Key Changes:**

- (`torch_xla/__init__.py`) Removed the GPU-specific OpenXLA flag.
- (`torch_xla/_dynamo/dynamo_bridge.py`) Removed CUDA tensor movement.
  - As far as I know, this was mainly created for zero-overhead CUDA tensor movement.
1 parent 05d9cba commit c0eeb57

File tree

4 files changed (+1 −80 lines)

torch_xla/__init__.py

Lines changed: 0 additions & 2 deletions

```diff
@@ -31,8 +31,6 @@ def _set_missing_flags(flags, sets):
 def _setup_xla_flags():
   flags = os.environ.get('XLA_FLAGS', '').split(' ')
   flags = _set_missing_flags(flags, (('xla_cpu_enable_fast_math', 'false'),))
-  flags = _set_missing_flags(flags,
-                             (('xla_gpu_force_compilation_parallelism', '8'),))
   os.environ['XLA_FLAGS'] = ' '.join(flags)
```
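After this hunk, `_setup_xla_flags` only injects the CPU fast-math default into `XLA_FLAGS`; the GPU compilation-parallelism default is gone. A minimal sketch of the flag merging is below. The body of `_set_missing_flags` is not part of this diff, so the "append `--name=value` only when the flag is absent" behavior modeled here is an assumption.

```python
import os

def _set_missing_flags(flags, sets):
  # Assumed behavior (the helper's body is not shown in this diff): append
  # --name=value only when no existing XLA_FLAGS entry mentions that flag.
  for name, value in sets:
    if all(name not in flag for flag in flags):
      flags.append('--%s=%s' % (name, value))
  return flags

def _setup_xla_flags():
  flags = os.environ.get('XLA_FLAGS', '').split(' ')
  flags = _set_missing_flags(flags, (('xla_cpu_enable_fast_math', 'false'),))
  # Post-change: no xla_gpu_force_compilation_parallelism default is injected.
  os.environ['XLA_FLAGS'] = ' '.join(flags)

os.environ.pop('XLA_FLAGS', None)
_setup_xla_flags()
print(repr(os.environ['XLA_FLAGS']))  # ' --xla_cpu_enable_fast_math=false'
```

Workloads that still want the old GPU default can presumably set it themselves, e.g. `XLA_FLAGS=--xla_gpu_force_compilation_parallelism=8`, since flags already present in the environment are left untouched.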

torch_xla/_dynamo/dynamo_bridge.py

Lines changed: 0 additions & 58 deletions

```diff
@@ -119,48 +119,6 @@ def _get_input_arg_device(input_args: tuple) -> torch.device:
   return device
 
 
-# Returns True if all the input args are on a CUDA device.
-def _args_on_cuda(input_args: tuple) -> bool:
-  input_device: torch.device = _get_input_arg_device(input_args)
-  if input_device is None:
-    return False
-
-  return input_device.type == "cuda"
-
-
-# Given an input list, moves the tensors to the given target_device.
-# The output order will be the same as the input. Non tensors will also still
-# be in the list.
-def _maybe_move_tensors_to_device(tensors: tuple,
-                                  target_device: torch.device) -> tuple:
-  assert target_device, "Moving tensors to None device not supported"
-
-  moved_tensors = []
-  for tensor in tensors:
-    if not isinstance(tensor, torch.Tensor):
-      moved_tensors.append(tensor)
-      continue
-
-    if tensor.device == target_device:
-      moved_tensors.append(tensor)
-      continue
-
-    if dynamo_debug:
-      print("Moving Tensor {} to device {}".format(tensor, target_device))
-
-    # Have to move to CPU before moving it to target device.
-    cpu_device: torch.device = torch.device("cpu")
-    moved_tensor = tensor.to(cpu_device)
-    moved_tensor = moved_tensor.to(target_device)
-
-    # Explicitly have to copy requires_grad attribute because it's dropped
-    # with torch.to(..)
-    moved_tensor.requires_grad = tensor.requires_grad
-    moved_tensors.append(moved_tensor)
-
-  return tuple(moved_tensors)
-
-
 def _split_xla_args_tensor_sym_constant(args):
   tensors = deque(maxlen=len(args))
   constants = []
@@ -552,14 +510,6 @@ def optimized_mod(*args: tuple):
      special_return_handler, xla_args_need_update) = extract_graph_helper(
          xla_model, sym_constants_to_graph_vars)
 
-    original_device: torch.device = _get_input_arg_device(args)
-    is_cuda_args: bool = False
-    if original_device:
-      is_cuda_args = original_device.type == "cuda"
-
-    if is_cuda_args:
-      args = _maybe_move_tensors_to_device(args, torch_xla.device())
-
     if not config.skip_input_data_check:
       # `torch_xla.sync()` needs to be blocking since we want to access args's
       # XLADatas and they can't be placeholder.
@@ -610,11 +560,7 @@ def optimized_mod(*args: tuple):
 
     # First few elements might be xla_args that needs to be in place updated
     result = res[len(xla_args_need_update):]
-
     result = none_remover.add_nones(result)
-    if is_cuda_args:
-      result = _maybe_move_tensors_to_device(tuple(result), original_device)
-
     if len(result) == 1:
       return result[0]
     else:
@@ -802,10 +748,6 @@ def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
 
 
 def extract_compiled_graph_helper(xla_model: torch.fx.GraphModule, xla_args):
-  if _args_on_cuda(xla_args):
-    xla_args = tuple(
-        _maybe_move_tensors_to_device(xla_args, torch_xla.device()))
-
   # Synchronize xla_args, so that each FunctionalTensorWrapper argument updates its
   # value reference before actually computing it.
   for a in xla_args:
```
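The deleted `_args_on_cuda` / `_maybe_move_tensors_to_device` pair is what used to shuttle CUDA inputs onto the XLA device (and move results back) around a dynamo-compiled graph. For callers who relied on that behavior, here is a minimal sketch of doing the move explicitly, mirroring the deleted helper; `move_to_xla_device` is an illustrative name, not a torch_xla API.

```python
import torch
import torch_xla

def move_to_xla_device(tensors, target_device):
  # Hypothetical helper mirroring the deleted _maybe_move_tensors_to_device:
  # non-tensors pass through unchanged, tensors hop via CPU, and the
  # requires_grad flag is restored on the copy.
  moved = []
  with torch.no_grad():  # keep the copies as leaf tensors
    for t in tensors:
      if not isinstance(t, torch.Tensor) or t.device == target_device:
        moved.append(t)
        continue
      m = t.to("cpu").to(target_device)
      m.requires_grad = t.requires_grad
      moved.append(m)
  return tuple(moved)

# Usage sketch: inputs the bridge used to relocate automatically are now
# moved by the caller before invoking a torch.compile(backend="openxla") module.
xla_device = torch_xla.device()
args = move_to_xla_device((torch.randn(2, 2), 3), xla_device)
```

With CUDA support deprecated, the more natural path is presumably to create or load inputs on the XLA device directly rather than round-tripping through CUDA.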

torch_xla/_internal/gpu.py

Lines changed: 0 additions & 15 deletions
This file was deleted.

torch_xla/_internal/pjrt.py

Lines changed: 1 addition & 5 deletions

```diff
@@ -12,7 +12,7 @@
 import torch_xla.core.xla_env_vars as xenv
 import torch_xla.core.xla_model as xm
 import torch_xla.distributed.xla_backend
-from torch_xla._internal import tpu, gpu, neuron
+from torch_xla._internal import tpu, neuron
 from torch_xla import runtime
 import torch_xla.utils.utils as xu
 from torch_xla.experimental import plugins
@@ -149,8 +149,6 @@ def run_multiprocess(fn: Callable[..., R],
     num_processes = plugins.default().physical_chip_count()
   elif runtime.device_type() == 'TPU':
     num_processes = tpu.num_local_processes()
-  elif runtime.device_type() == 'CUDA':
-    num_processes = gpu.num_local_processes()
   elif runtime.device_type() == 'NEURON':
     num_processes = neuron.num_local_processes()
   else:
@@ -220,8 +218,6 @@ def _initialize_single_process(local_rank: int, local_world_size: int):
 
 def spawn_threads(fn: Callable, args: Tuple = ()) -> None:
   """Run function in one process with one thread per addressable device."""
-  assert runtime.device_type() not in (
-      'CUDA'), "spawn_threads does not support GPU device"
   spawn_fn = _SpawnFn(fn, *args)
   _run_thread_per_device(
       local_rank=0,
```
