Changes from all commits (134 commits)
ca91445
Add support for callable in torchax.interop.JittableModule.functional…
zmelumian972 Jul 18, 2025
86a99d7
Update README.md to reflect supported python versions (#9484)
bhavya01 Jul 18, 2025
f3c7907
Remove support for one-process-per-device style of distributed. (#9490)
qihqi Jul 18, 2025
95ba754
Allow mixed tensor type math if one of them is a scalar (#9453)
qihqi Jul 18, 2025
55b7d02
Fix nested stableHLO composite regions (#9385)
Carlomus Jul 20, 2025
26def0f
Misc fixes: (#9491)
qihqi Jul 20, 2025
e82631e
Fix python 3.11 cuda wheel link in the readme (#9493)
vfdev-5 Jul 21, 2025
31c4c2f
[Bugfix] fix ragged attention kernel auto-tuning table key (#9497)
yaochengji Jul 23, 2025
299a16b
Error Handling: refactor `ComputationClient::TransferFromDevice` to p…
ysiraichi Jul 24, 2025
ca47198
Implement XLAShardedTensor._spec and test (#9488)
aws-cph Jul 24, 2025
16b1202
Clean up quantized matmul condition code (#9506)
kyuyeunk Jul 24, 2025
0a1594a
Move mutable properties of env to thread local, misc changes (#9501)
qihqi Jul 24, 2025
29ae4c7
Optimize w8a8 kernel vmem limit (#9508)
kyuyeunk Jul 26, 2025
2820f7c
Error Handling: return status value when loading PjRt dynamic plugin.…
ysiraichi Jul 28, 2025
531c724
Add block sizes for Qwen/Qwen2.5-32B-Instruct (#9516)
vanbasten23 Jul 29, 2025
1ed6b46
Error Handling: propagate status for `ReleaseGilAndTransferData` and …
ysiraichi Jul 29, 2025
b0ffc49
Error Handling: refactor `ExecuteComputation` and `ExecuteReplicated`…
ysiraichi Jul 29, 2025
cd3bd91
Error Handling: refactor `GetXlaTensor` and related functions to use …
ysiraichi Jul 29, 2025
7aa466e
Dump C++ and Status propagation stacktraces. (#9492)
ysiraichi Jul 29, 2025
199a9bd
Add w8a8 kernel blocks for Qwen 2.5 7B (#9517)
kyuyeunk Jul 30, 2025
cb64f4c
Deduplicate `GetXlaTensors()` function. (#9518)
ysiraichi Jul 30, 2025
95bee8f
[XLA] Add placements property to XLAShardedTensor for DTensor compati…
Hoomaaan Jul 30, 2025
241cd47
Update artifacts_builds.tf for 2.8.0-rc2 (#9522)
bhavya01 Jul 31, 2025
c807ebc
Update artifacts_builds.tf for 2.8.0-rc3 wheel (#9527)
bhavya01 Jul 31, 2025
83d4253
make jax as an optional dependency (#9521)
qihqi Aug 1, 2025
d487007
Reorganize PyTorch/XLA Overview page (#9498)
melissawm Aug 1, 2025
7a48185
Support torch.nn.functional.one_hot (#9523)
vanbasten23 Aug 1, 2025
0ad39c2
Introduce PlatformVersion bindings (#9513)
rpsilva-aws Aug 1, 2025
ebefc8f
Update artifacts_builds.tf for 2.8.0-rc4 (#9532)
bhavya01 Aug 1, 2025
adf305f
Fix pip install torch_xla[pallas] (#9531)
bhavya01 Aug 1, 2025
d3d91a8
Remove cuda builds for release wheels (#9533)
bhavya01 Aug 1, 2025
9995e97
Optimize KV cache dequantization performance (#9528)
kyuyeunk Aug 1, 2025
2ccd5dc
Add gemini edited docstring
qihqi Aug 2, 2025
b6a5b82
add more files
qihqi Aug 4, 2025
2889f69
Revert 2 accidental commits that I made. (#9536)
qihqi Aug 4, 2025
43589c0
Implement XLAShardedTensor.redistribute and test (#9529)
aws-cph Aug 4, 2025
15496cd
Do not set `PJRT_DEVICE=CUDA` automatically on import. (#9540)
ysiraichi Aug 5, 2025
e5e75a8
Add triggers for release 2.8.0 (#9545)
bhavya01 Aug 6, 2025
30ad68a
Update torchbench pin location. (#9543)
ysiraichi Aug 7, 2025
6050927
Improve error message of functions related to `GetXlaTensor()`. (#9520)
ysiraichi Aug 7, 2025
41bfd62
Update artifacts_builds.tf for rc5
bhavya01 Aug 7, 2025
57cd41c
Refactor the status error message builder. (#9546)
ysiraichi Aug 8, 2025
8c1449f
Use `TORCH_CHECK()` instead of throwing `std::runtime_error` in `XLA_…
ysiraichi Aug 8, 2025
095faec
Error Handling: make `XLATensor::Create()` return status type. (#9544)
ysiraichi Aug 8, 2025
38e0f03
`cat`: improve error handling and error messages. (#9548)
ysiraichi Aug 11, 2025
23158fd
`div`: improve error handling and error messages. (#9549)
ysiraichi Aug 11, 2025
1f787f1
Bug fixes (#9554)
qihqi Aug 11, 2025
f400690
Run torchprime CI only when the pull requests have torchprimeci label…
bhavya01 Aug 12, 2025
c8c9776
[Documentation] Fixed typo in C++ debugging docs (#9559)
hinriksnaer Aug 13, 2025
40f58a6
Update README.md to mention 2.8 release (#9560)
bhavya01 Aug 13, 2025
d5b9a6d
`flip`: improve error handling and error messages. (#9550)
ysiraichi Aug 14, 2025
2c34318
Generalize crash message for non-ok status. (#9552)
ysiraichi Aug 14, 2025
4199865
Rename `MaybeThrow` to `OkOrThrow`. (#9561)
ysiraichi Aug 16, 2025
a1c6ee9
Add xla random generator. (#9539)
iwknow Aug 16, 2025
0f56dec
[EZ] Replace `pytorch-labs` with `meta-pytorch` (#9556)
ZainRizvi Aug 19, 2025
b84c83b
Added missing "#"s for the comments in triton.md (#9571)
SriRangaTarun Aug 21, 2025
6b6ef5c
Remove tests that are defined outside of this repo. (#9577)
qihqi Aug 22, 2025
748ac9b
Update XLA pin then fix up to make it compile (#9565)
qihqi Aug 22, 2025
f8b44e2
Create mapping for FP8 torch dtypes (#9573)
kyuyeunk Aug 22, 2025
b098be8
refactor: DTensor inheritance for XLAShardedTensor (#9576)
aws-cph Aug 23, 2025
147d2c2
`full`: improve error handling and error messages. (#9564)
ysiraichi Aug 23, 2025
8243a25
`gather`: improve error handling and error messages. (#9566)
ysiraichi Aug 23, 2025
49ac22a
`random_`: improve error handling and error messages. (#9567)
ysiraichi Aug 25, 2025
aada9fc
Remove `XLA_CUDA` and other CUDA build flags. (#9582)
ysiraichi Aug 25, 2025
e9a1c5f
Remove OpenXLA CUDA fallback and `_XLAC_cuda_functions.so` extension.…
ysiraichi Aug 25, 2025
abf18e4
Fix case when both device & dtype are given in .to (#9583)
qihqi Aug 25, 2025
5522c69
implement send and recv using collective_permute (#9373)
bfolie Aug 25, 2025
163193e
Set environment variables for tpu7x (#9586)
bhavya01 Aug 26, 2025
4c586bd
Create new macros for throwing status errors. (#9588)
ysiraichi Aug 27, 2025
d214faf
`test`: Use new macros for throwing exceptions. (#9590)
ysiraichi Aug 28, 2025
d9a9e44
`runtime`: Use new macros for throwing exceptions. (#9591)
ysiraichi Aug 28, 2025
8d20a86
`ops`: Use new macros for throwing exceptions. (#9592)
ysiraichi Aug 28, 2025
d55cc00
`init_python_bindings.cpp`: Use new macros for throwing exceptions. (…
ysiraichi Aug 28, 2025
90be04a
`aten_xla_type.cpp`: Use new macros for throwing exceptions. (#9596)
ysiraichi Aug 28, 2025
1bc7737
Remove CUDA plugin. (#9597)
ysiraichi Aug 28, 2025
d4cf42a
Remove triton. (#9601)
ysiraichi Aug 28, 2025
f5a2218
`torch_xla`: Use new macros for throwing exceptions (part 1). (#9593)
ysiraichi Aug 28, 2025
e7b1159
`torch_xla`: Use new macros for throwing exceptions (part 2). (#9594)
ysiraichi Aug 28, 2025
004f19e
Remove CUDA specific logic from runtime. (#9598)
ysiraichi Aug 29, 2025
763e5b7
Remove `gpu_custom_call` logic. (#9600)
ysiraichi Aug 29, 2025
8fb90c8
Remove functions that throw status error. (#9602)
ysiraichi Sep 2, 2025
05d9cba
Remove CUDA logic from C++ files in `torch_xla/csrc` directory. (#9603)
ysiraichi Sep 2, 2025
c0eeb57
Remove CUDA specific path from internal Python packages. (#9606)
ysiraichi Sep 2, 2025
89f929b
Move `_jax_forward` and `_jax_backward` inside `j2t_autograd` to avoi…
jialei777 Sep 2, 2025
647804c
Remove remaining GPU/CUDA mentions in `torch_xla` directory. (#9608)
ysiraichi Sep 2, 2025
94fdadc
Update version to 0.0.6 (#9611)
qihqi Sep 2, 2025
ddf75a1
Remove CUDA from PyTorch/XLA build. (#9609)
ysiraichi Sep 3, 2025
8ff2ee6
Remove CUDA from `benchmarks` directory. (#9610)
ysiraichi Sep 3, 2025
c48478a
Remove CUDA tests from distributed tests. (#9612)
ysiraichi Sep 3, 2025
e0de097
Make torch_xla package PEP 561 compliant (#9515)
wirthual Sep 3, 2025
342de86
Remove other CUDA usage from PyTorch/XLA repository. (#9618)
ysiraichi Sep 4, 2025
77d85fb
Remove CUDA from remaining tests. (#9613)
ysiraichi Sep 4, 2025
bd95382
Miscelanous cleanup (#9619)
qihqi Sep 4, 2025
f6ff30d
Do not skip fetching sources.
bhavya01 Sep 4, 2025
2518381
Update build_and_test.yml to match r2.8 and r2.8.1
bhavya01 Sep 5, 2025
6ee7627
Update build_and_test.yml
bhavya01 Sep 5, 2025
8274f94
Replace `GetComputationClientOrDie()` with `GetComputationClient()` (…
ysiraichi Sep 5, 2025
92dcabc
`mm`: improve error handling and error messages. (#9621)
ysiraichi Sep 5, 2025
6c5478f
Add triggers for v2.8.1 version
bhavya01 Sep 5, 2025
aba96d8
Replace `GetComputationClientOrDie()` with `GetComputationClient()` (…
ysiraichi Sep 9, 2025
49dec2e
Upgrade build infra to use debian-12 and gcc-11 (#9631)
bhavya01 Sep 10, 2025
caa809f
Remove libopenblas-dev from ansible dependencies (#9632)
bhavya01 Sep 10, 2025
7aba922
support load and save checkpoint in torchax (#9616)
junjieqian Sep 10, 2025
8efa568
Set `allow_broken_conditionals` configuration variable at `ansible.cf…
ysiraichi Sep 11, 2025
c77852e
Move torch ops error message tests into a new file. (#9622)
ysiraichi Sep 11, 2025
2329746
Fix `test_ops_error_message.py` and run it on CI. (#9640)
ysiraichi Sep 15, 2025
efe20ab
Do not warn on jax usage when workarounds are available (#9624)
bhavya01 Sep 15, 2025
a66cfc3
`roll`: improve error handling and error messages. (#9628)
ysiraichi Sep 16, 2025
6d755ee
`stack`: improve error handling and error messages. (#9629)
ysiraichi Sep 16, 2025
0c0ae2d
`expand`: improve error handling and error messages. (#9645)
ysiraichi Sep 17, 2025
0fc62aa
update gcc (#9650)
qihqi Sep 24, 2025
03d4dc0
Add default args for _aten_conv2d (#9623)
hsjts0u Sep 29, 2025
302c3f1
Pin `flax` and skip C++ test `SiLUBackward`. (#9660)
ysiraichi Sep 30, 2025
a511691
`trace`: improve error handling and error messages. (#9630)
ysiraichi Oct 1, 2025
3240166
Fix Terraform usage of `cuda_version`. (#9655)
ysiraichi Oct 1, 2025
3862b87
Create PyTorch commit pin. (#9654)
ysiraichi Oct 1, 2025
6ac4a7c
Accept conda channels' ToS when building the upstream docker image. (…
ysiraichi Oct 1, 2025
cc300f7
Revert "Fix Terraform usage of `cuda_version`. (#9655)" (#9664)
ysiraichi Oct 1, 2025
420adaa
Bump Python version of `ci-tpu-test-trigger` to 3.12. (#9665)
ysiraichi Oct 1, 2025
1348545
fix(xla): convert group-local to global ranks in broadcast (#9657)
Hoomaaan Oct 1, 2025
1ab6787
Accept conda channels' ToS with environment variable. (#9666)
ysiraichi Oct 1, 2025
2a9138a
mul: remove opmath cast sequence (#9663)
sshonTT Oct 3, 2025
d36ded2
[Experimental] Add initial implementation of GSPMD->Shardy pass withi…
hshahTT Jul 18, 2025
036321a
Create job to build torch-xla wheel and publish to tt-pypi
jazpurTT Jul 25, 2025
58da15c
Add permision from caller workflow to enable job (#4)
jazpurTT Jul 29, 2025
24bb34c
Add V2 sharding support and improve partition spec handling for multi…
sshonTT Aug 2, 2025
686cb76
feat: add support for custom compile options in torch_xla.compile and…
sshonTT Aug 11, 2025
5dfbb4d
Change V2 sharding spec algorithm + Fix tensor sharding spec visualiz…
hshahTT Sep 3, 2025
7bc474a
Uplift wheel python 3.10 to 3.11
ddilbazTT Sep 2, 2025
a2514dd
Update jax dependency to 0.7.1 to align with tt front ends (#8)
jazpurTT Sep 5, 2025
849fe9b
Merge branch 'master' into sshon/rebase-to-upstream
sshonTT Oct 3, 2025
86bac8b
Fix for API match
sshonTT Oct 6, 2025
27f7792
Torch build option change
sshonTT Oct 7, 2025
b1ebc54
Temporary adding checkout branch
sshonTT Oct 9, 2025
19 changes: 3 additions & 16 deletions .bazelrc
@@ -79,18 +79,6 @@ build:native_arch_posix --host_copt=-march=native

build:mkl_open_source_only --define=tensorflow_mkldnn_contraction_kernel=1

build:cuda --repo_env TF_NEED_CUDA=1
# "sm" means we emit only cubin, which is forward compatible within a GPU generation.
# "compute" means we emit both cubin and PTX, which is larger but also forward compatible to future GPU generations.
build:cuda --crosstool_top=@local_config_cuda//crosstool:toolchain
build:cuda --@local_config_cuda//:enable_cuda
build:cuda --define=xla_python_enable_gpu=true
build:cuda --cxxopt=-DXLA_CUDA=1

# Coverage with cuda/gcc/nvcc requires manually setting coverage flags.
coverage:cuda --per_file_copt=third_party/.*,torch_xla/.*@--coverage
coverage:cuda --linkopt=-lgcov

build:acl --define==build_with_acl=true

build:nonccl --define=no_nccl_support=true
@@ -103,21 +91,20 @@ build:short_logs --output_filter=DONT_MATCH_ANYTHING
#build:tpu --@xla//xla/python:enable_tpu=true
build:tpu --define=with_tpu_support=true

# Run tests serially with TPU and GPU (only 1 device is available).
# Run tests serially with TPU (only 1 device is available).
test:tpu --local_test_jobs=1
test:cuda --local_test_jobs=1

#########################################################################
# RBE config options below.
# Flag to enable remote config
common --experimental_repo_remote_exec

# Inherit environmental variables that are used in testing.
test --test_env=TPU_NUM_DEVICES --test_env=GPU_NUM_DEVICES --test_env=CPU_NUM_DEVICES --test_env=XRT_LOCAL_WORKER
test --test_env=TPU_NUM_DEVICES --test_env=CPU_NUM_DEVICES --test_env=XRT_LOCAL_WORKER
test --test_env=XRT_TPU_CONFIG --test_env=XRT_DEVICE_MAP --test_env=XRT_WORKERS --test_env=XRT_MESH_SERVICE_ADDRESS
test --test_env=XRT_SHARD_WORLD_SIZE --test_env=XRT_MULTI_PROCESSING_DEVICE --test_env=XRT_HOST_ORDINAL --test_env=XRT_SHARD_ORDINAL
test --test_env=XRT_START_LOCAL_SERVER --test_env=TPUVM_MODE --test_env=PJRT_DEVICE --test_env=PJRT_TPU_MAX_INFLIGHT_COMPUTATIONS
test --test_env=PJRT_CPU_ASYNC_CLIENT --test_env=PJRT_GPU_ASYNC_CLIENT --test_env=TPU_LIBRARY_PATH --test_env=PJRT_DIST_SERVICE_ADDR
test --test_env=PJRT_CPU_ASYNC_CLIENT --test_env=TPU_LIBRARY_PATH --test_env=PJRT_DIST_SERVICE_ADDR
test --test_env=PJRT_LOCAL_PROCESS_RANK

# This environmental variable is important for properly integrating with XLA.
1 change: 0 additions & 1 deletion .circleci/build.sh
@@ -50,7 +50,6 @@ source $XLA_DIR/xla_env
export GCLOUD_SERVICE_KEY_FILE="$XLA_DIR/default_credentials.json"
export SILO_NAME='cache-silo-ci-dev-3.8_cuda_12.1' # cache bucket for CI
export BUILD_CPP_TESTS='1'
export TF_CUDA_COMPUTE_CAPABILITIES="sm_50,sm_70,sm_75,compute_80,$TF_CUDA_COMPUTE_CAPABILITIES"
build_torch_xla $XLA_DIR

popd
27 changes: 5 additions & 22 deletions .circleci/common.sh
@@ -112,6 +112,8 @@ function build_torch_xla() {
# Need to uncomment the line below.
# Currently it fails upstream XLA CI.
# pip install plugins/cuda -v
pip install --pre torch_xla[pallas] --index-url https://us-python.pkg.dev/ml-oss-artifacts-published/jax/simple/ --find-links https://storage.googleapis.com/jax-releases/libtpu_releases.html

popd
}

@@ -156,26 +158,12 @@ function run_torch_xla_cpp_tests() {
fi

if [ "$USE_COVERAGE" != "0" ]; then
if [ -x "$(command -v nvidia-smi)" ]; then
PJRT_DEVICE=CUDA test/cpp/run_tests.sh $EXTRA_ARGS -L""
cp $XLA_DIR/bazel-out/_coverage/_coverage_report.dat /tmp/cov1.dat
PJRT_DEVICE=CUDA test/cpp/run_tests.sh -X early_sync -F AtenXlaTensorTest.TestEarlySyncLiveTensors -L"" $EXTRA_ARGS
cp $XLA_DIR/bazel-out/_coverage/_coverage_report.dat /tmp/cov2.dat
lcov --add-tracefile /tmp/cov1.dat -a /tmp/cov2.dat -o /tmp/merged.dat
else
PJRT_DEVICE=CPU test/cpp/run_tests.sh $EXTRA_ARGS -L""
cp $XLA_DIR/bazel-out/_coverage/_coverage_report.dat /tmp/merged.dat
fi
PJRT_DEVICE=CPU test/cpp/run_tests.sh $EXTRA_ARGS -L""
cp $XLA_DIR/bazel-out/_coverage/_coverage_report.dat /tmp/merged.dat
genhtml /tmp/merged.dat -o ~/htmlcov/cpp/cpp_lcov.info
mv /tmp/merged.dat ~/htmlcov/cpp_lcov.info
else
# Shard GPU testing
if [ -x "$(command -v nvidia-smi)" ]; then
PJRT_DEVICE=CUDA test/cpp/run_tests.sh $EXTRA_ARGS -L""
PJRT_DEVICE=CUDA test/cpp/run_tests.sh -X early_sync -F AtenXlaTensorTest.TestEarlySyncLiveTensors -L"" $EXTRA_ARGS
else
PJRT_DEVICE=CPU test/cpp/run_tests.sh $EXTRA_ARGS -L""
fi
PJRT_DEVICE=CPU test/cpp/run_tests.sh $EXTRA_ARGS -L""
fi
popd
}
@@ -194,11 +182,6 @@ function run_torch_xla_tests() {
RUN_CPP="${RUN_CPP_TESTS:0}"
RUN_PYTHON="${RUN_PYTHON_TESTS:0}"

if [ -x "$(command -v nvidia-smi)" ]; then
num_devices=$(nvidia-smi --list-gpus | wc -l)
echo "Found $num_devices GPU devices..."
export GPU_NUM_DEVICES=$num_devices
fi
export PYTORCH_TESTING_DEVICE_ONLY_FOR="xla"
export CXX_ABI=$(python -c "import torch;print(int(torch._C._GLIBCXX_USE_CXX11_ABI))")

30 changes: 0 additions & 30 deletions .devcontainer/gpu-internal/devcontainer.json

This file was deleted.

2 changes: 1 addition & 1 deletion .github/ISSUE_TEMPLATE.md
@@ -13,5 +13,5 @@ Error messages and stack traces are also helpful.

## System Info

- reproducible on XLA backend [CPU/TPU/CUDA]:
- reproducible on XLA backend [CPU/TPU]:
- torch_xla version:
2 changes: 1 addition & 1 deletion .github/ISSUE_TEMPLATE/bug-report.md
@@ -46,7 +46,7 @@ Steps to reproduce the behavior:

## Environment

- Reproducible on XLA backend [CPU/TPU/CUDA]:
- Reproducible on XLA backend [CPU/TPU]:
- torch_xla version:


87 changes: 40 additions & 47 deletions .github/ci.md
@@ -3,22 +3,22 @@
PyTorch and PyTorch/XLA use CI to lint, build, and test each PR that is
submitted. All CI tests should succeed before the PR is merged into master.
PyTorch CI pins PyTorch/XLA to a specific commit. On the other hand, PyTorch/XLA
CI pulls PyTorch from master unless a pin is manually provided. This README will
go through the reasons of these pins, how to pin a PyTorch/XLA PR to an upstream
PyTorch PR, and how to coordinate a merge for breaking PyTorch changes.
CI pulls PyTorch from `.torch_commit` unless a pin is manually provided. This
README will go through the reasons for these pins, how to pin a PyTorch/XLA PR
to an upstream PyTorch PR, and how to coordinate a merge for breaking PyTorch
changes.

## Usage

### Pinning PyTorch PR in PyTorch/XLA PR
### Temporarily Pinning PyTorch PR in PyTorch/XLA PR

Sometimes a PyTorch/XLA PR needs to be pinned to a specific PyTorch PR to test
new features, fix breaking changes, etc. Since PyTorch/XLA CI pulls from PyTorch
master by default, we need to manually provide a PyTorch pin. In a PyTorch/XLA
PR, PyTorch can be manually pinned by creating a `.torch_pin` file at the root
of the repository. The `.torch_pin` should have the corresponding PyTorch PR
number prefixed by "#". Take a look at [example
here](https://github.com/pytorch/xla/pull/7313). Before the PyTorch/XLA PR gets
merged, the `.torch_pin` must be deleted.
new features, fix breaking changes, etc. In a PyTorch/XLA PR, PyTorch can be
manually pinned by creating a `.torch_pin` file at the root of the repository.
The `.torch_pin` should have the corresponding PyTorch PR number prefixed by
"#". Take a look at [example here](https://github.com/pytorch/xla/pull/7313).
Before the PyTorch/XLA PR gets merged, the `.torch_pin` must be deleted and
`.torch_commit` updated.
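
For illustration, here is a minimal sketch of the pinning step; the PR number below is hypothetical:

```sh
# Pin this PyTorch/XLA PR to a hypothetical upstream PyTorch PR (#12345).
echo '#12345' > .torch_pin
git add .torch_pin
git commit -m "Pin to PyTorch PR #12345"
# Before merging: delete .torch_pin and update .torch_commit.
```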

### Coordinating merges for breaking PyTorch PRs

@@ -35,29 +35,42 @@ fail. Steps for fixing and merging such a breaking PyTorch change are as follows (see the sketch after the list):
PyTorch PR to pin the PyTorch/XLA to the commit hash created in step 1 by
updating `pytorch/.github/ci_commit_pins/xla.txt`.
1. Once CI tests are green on both ends, merge PyTorch PR.
1. Remove the `.torch_pin` in PyTorch/XLA PR and merge. To be noted, `git commit
--amend` should be avoided in this step as PyTorch CI will keep using the
commit hash created in step 1 until other PRs update that manually or the
nightly buildbot updates that automatically.
1. Remove the `.torch_pin` in the PyTorch/XLA PR and update `.torch_commit` to
the hash of the merged PyTorch PR. Note that `git commit --amend` should be
avoided in this step, as PyTorch CI will keep using the commit hash created
in step 1 until other PRs update it manually or the nightly buildbot updates
it automatically.
1. Finally, don't delete your branch until 2 days later. See step 4 for
explanations.
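
As referenced above, a sketch of the file updates involved in these steps; the commit hashes below are placeholders:

```sh
# In the PyTorch PR: pin PyTorch CI to the PyTorch/XLA commit from step 1.
echo '<pytorch-xla-commit-sha>' > .github/ci_commit_pins/xla.txt

# Back in the PyTorch/XLA PR, once the PyTorch PR has merged:
git rm .torch_pin
echo '<merged-pytorch-commit-sha>' > .torch_commit
git add .torch_commit
git commit -m "Drop .torch_pin and update .torch_commit"  # do not use `git commit --amend`
```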

### Running TPU tests on PRs

The `build_and_test.yml` workflow runs tests on the TPU in addition to CPU and
GPU. The set of tests run on the TPU is defined in `test/tpu/run_tests.sh`.
The `build_and_test.yml` workflow runs tests on the TPU in addition to CPU.
The set of tests run on the TPU is defined in `test/tpu/run_tests.sh`.

## Update the PyTorch Commit Pin

In order to reduce development burden of PyTorch/XLA, starting from #9654, we
started pinning PyTorch using the `.torch_commit` file. This should reduce the
number of times a PyTorch PR breaks our most recent commits. However, this also
requires maintenance, i.e. someone has to keep updating the PyTorch commit so
as to make sure it's always supporting (almost) the latest PyTorch versions.

Updating the PyTorch commit pin is, theoretically, simple: run the
`scripts/update_deps.py --pytorch` script and open a PR. In practice, you may
encounter a few compilation errors, or even segmentation faults.
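
A minimal sketch of that flow (assuming the script is run with Python; the branch name is illustrative):

```sh
git checkout -b update-pytorch-pin
python scripts/update_deps.py --pytorch  # rewrites .torch_commit to a newer PyTorch commit
git commit -am "Update PyTorch commit pin"
# Open a PR and watch CI for the compilation errors or crashes mentioned above.
```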

## CI Environment

Before the CI in this repository runs, we build a base dev image. These are the
same images we recommend in our VSCode `.devcontainer` setup and nightly build
to ensure consistency between environments. We produce variants with and without
CUDA, configured in `infra/ansible` (build config) and
`infra/tpu-pytorch-releases/dev_images.tf` (build triggers).
to ensure consistency between environments. We produce variants configured in
`infra/ansible` (build config) and `infra/tpu-pytorch-releases/dev_images.tf`
(build triggers).

The CI runs in two environments:

1. Organization self-hosted runners for CPU and GPU: used for almost every step
1. Organization self-hosted runners for CPU: used for almost every step
of the CI. These runners are managed by PyTorch and have access to the shared
ECR repository.
1. TPU self-hosted runners: these are managed by us and are only available in
@@ -68,48 +81,35 @@ The CI runs in two environments:

We have two build paths for each CI run:

- `torch_xla`: we build the main package to support both TPU and GPU[^1], along
- `torch_xla`: we build the main package to support TPU, along
with a CPU build of `torch` from HEAD. This build step exports the
`torch-xla-wheels` artifact for downstream use in tests.
- Some CI tests also require `torchvision`. To reduce flakiness, we compile
`torchvision` from [`torch`'s CI pin][pytorch-vision-pin].
- C++ tests are piggybacked onto the same build and uploaded in the
`cpp-test-bin` artifact.
- `torch_xla_cuda_plugin`: the XLA CUDA runtime can be built independently of
either `torch` or `torch_xla` -- it depends only on our pinned OpenXLA. Thus,
this build should be almost entirely cached, unless your PR changes the XLA
pin or adds a patch.

Both the main package build and plugin build are configured with ansible at
`infra/ansible`, although they run in separate stages (`stage=build_srcs` vs
`stage=build_plugin`). This is the same configuration we use for our nightly and
release builds.
The main package build is configured with ansible at `infra/ansible`. This is
the same configuration we use for our nightly and release builds.

The CPU and GPU test configs are defined in the same file, `_test.yml`. Since
The CPU test config is defined in the file `_test.yml`. Since
some of the tests come from the upstream PyTorch repository, we check out
PyTorch at the same git rev as the `build` step (taken from
`torch_xla.version.__torch_gitrev__`). The tests are split up into multiple
groups that run in parallel; the `matrix` section of `_test.yml` corresponds to
the test groups defined in `.github/scripts/run_tests.sh`.

CPU tests run immediately after the `torch_xla` build completes. This will
likely be the first test feedback on your commit. GPU tests will launch when
both the `torch_xla` and `torch_xla_cuda_plugin` complete. GPU compilation is
much slower due to the number of possible optimizations, and the GPU chips
themselves are quite outdated, so these tests will take longer to run than the
CPU tests.
likely be the first test feedback on your commit.

![CPU tests launch when `torch_xla` is
complete](../docs/assets/ci_test_dependency.png)

![GPU tests also depend on CUDA
plugin](../docs/assets/ci_test_dependency_gpu.png)

For the C++ test groups in either case, the test binaries are pre-built during
the build phase and packaged in `cpp-test-bin`. This will only be downloaded if
necessary.

[^1]: Note: both GPU and TPU support require their respective plugins to be
[^1]: Note: TPU support requires its respective plugin to be
installed. This package will _not_ work out of the box without it.

### TPU CI
@@ -165,13 +165,6 @@ good" commit to prevent accidental changes from PyTorch/XLA to break PyTorch CI
without warning. PyTorch has hundreds of commits each week, and this pin ensures
that PyTorch/XLA as a downstream package does not cause failures in PyTorch CI.

#### Why does PyTorch/XLA CI pull from PyTorch master?

[PyTorch/XLA CI pulls PyTorch from master][pull-pytorch-master] unless a PyTorch
pin is manually provided. PyTorch/XLA is a downstream package to PyTorch, and
pulling from master ensures that PyTorch/XLA will stay up-to-date and works with
the latest PyTorch changes.

#### TPU CI is broken

If the TPU CI won't run, try to debug using the following steps:
15 changes: 2 additions & 13 deletions .github/scripts/run_tests.sh
@@ -30,14 +30,7 @@ function run_torch_xla_cpp_tests() {

TORCH_DIR=$(python -c "import pkgutil; import os; print(os.path.dirname(pkgutil.get_loader('torch').get_filename()))")
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${TORCH_DIR}/lib
if [ -x "$(command -v nvidia-smi)" ]; then
CUDA_PLUGIN_DIR=$(python -c "import pkgutil; import os; print(os.path.dirname(pkgutil.get_loader('torch_xla_cuda_plugin').get_filename()))")
export PJRT_LIBRARY_PATH=$CUDA_PLUGIN_DIR/lib/pjrt_c_api_gpu_plugin.so
export PJRT_DEVICE=LIBRARY
export PJRT_DYNAMIC_PLUGINS=1
else
export PJRT_DEVICE=CPU
fi
export PJRT_DEVICE=CPU
export XLA_EXPERIMENTAL="nonzero:masked_select:nms"

test_names=("test_aten_xla_tensor_1"
@@ -55,6 +48,7 @@ function run_torch_xla_cpp_tests() {
"test_tensor"
# disable test_xla_backend_intf since it is flaky on upstream
#"test_xla_backend_intf"
"test_xla_generator"
"test_xla_sharding"
"test_runtime"
"test_status_dont_show_cpp_stacktraces"
@@ -83,11 +77,6 @@ PYTORCH_DIR=$1
XLA_DIR=$2
USE_COVERAGE="${3:-0}"

if [ -x "$(command -v nvidia-smi)" ]; then
num_devices=$(nvidia-smi --list-gpus | wc -l)
echo "Found $num_devices GPU devices..."
export GPU_NUM_DEVICES=$num_devices
fi
export PYTORCH_TESTING_DEVICE_ONLY_FOR="xla"
export CXX_ABI=$(python -c "import torch;print(int(torch._C._GLIBCXX_USE_CXX11_ABI))")

5 changes: 0 additions & 5 deletions .github/upstream/Dockerfile
@@ -15,11 +15,6 @@ ARG tpuvm=""
# Disable CUDA for PyTorch
ENV USE_CUDA "0"

# Enable CUDA for XLA
ENV XLA_CUDA "${cuda}"
ENV TF_CUDA_COMPUTE_CAPABILITIES "${cuda_compute}"
ENV TF_CUDA_PATHS "/usr/local/cuda,/usr/include,/usr"

# CUDA build guidance
ENV NVIDIA_VISIBLE_DEVICES all
ENV NVIDIA_DRIVER_CAPABILITIES compute,utility
4 changes: 4 additions & 0 deletions .github/upstream/install_conda.sh
@@ -27,6 +27,10 @@ function install_and_setup_conda() {
fi
export CMAKE_PREFIX_PATH="$(dirname $(which conda))/../"

# Accept Conda channels' ToS automatically.
# Ref: https://github.com/pytorch/pytorch/issues/158438#issuecomment-3084935777
export CONDA_PLUGINS_AUTO_ACCEPT_TOS="yes"

conda update -y -n base conda
conda install -y python=$PYTHON_VERSION
