Merged

694 commits
76a841f
Port OpSchema.__post_init__ and OpSchema._recompute_comparison_key to…
swolchok Sep 18, 2025
46c647d
[vllm hash update] update the pinned vllm hash (#163304)
pytorchupdatebot Sep 19, 2025
3016616
[BE] Update Python min version to 3.10 (#162310)
malfet Sep 19, 2025
c91f59b
Fix performance regression when indexing by Numpy arrays (#163280)
ezyang Sep 18, 2025
ce5637b
Fix invalid indices bug for max_unpool2d/3d on MPS (#163036)
can-gaa-hou Sep 19, 2025
5780478
Revert "[BE] Update Python min version to 3.10 (#162310)"
pytorchmergebot Sep 19, 2025
1708120
Revert "[CI] Move Windows build/tests to Python-3.10 (#162862)"
pytorchmergebot Sep 19, 2025
e0bcd58
[MTIA] Add MTIA dispatch for kernel foreach_maximum(Add D80022242 bac…
DoubleBiao Sep 19, 2025
1302637
Revert "[dynamo][guards] Do not construct entire framelocals dict for…
pytorchmergebot Sep 19, 2025
32ad29b
Revert "[dynamo][guards] Fail on an unknown framelocals to dict conve…
pytorchmergebot Sep 19, 2025
0815091
[CP][BE] Cosmetic refactors for CP code base (#163115)
fegin Sep 18, 2025
ab5086a
[WOQ] Add XPU kernel for _weight_int8pack_mm (#160938)
xiaowangintel Sep 19, 2025
33e6c5a
[Dependabot] Update(deps): Bump transformers from 4.54.0 to 4.56.0 in…
dependabot[bot] Sep 19, 2025
bee362c
[ROCm][SymmMem] Fix skip condition for PLATFORM_SUPPORTS_SYMM_MEM (#1…
pragupta Sep 19, 2025
264e7f6
[ROCm] Fix mx fp8 and fp4 code after scaling refactor changes. (#163127)
jagadish-amd Sep 19, 2025
f8f230a
[FP8][cuBLAS][H100] only test fp32 outputs for rowwise `_scaled_mm` o…
eqy Sep 19, 2025
e631d76
[Flex] Changing how bwd configs are setup and updating default b200 c…
drisspg Sep 19, 2025
4967ad8
[Graph Partition] improve custom op output alias (#163227)
BoyuanFeng Sep 19, 2025
3e663ce
[Inductor][Triton][FP8] Add a Blackwell-specific scaled persistent + …
jananisriram Sep 19, 2025
2984bfe
[ez][CI] Run vllm workflow on vllm pin updates (#163353)
clee2000 Sep 19, 2025
a3b68c7
Revert "Fix boxcox to return same result for same input in one batch …
pytorchmergebot Sep 19, 2025
607469b
Revert "[ROCm] Bump FBGEMM commit to avoid CK errors (#162590)"
pytorchmergebot Sep 19, 2025
a0d2d84
Handling overflow for long int overflow for the product of kernel_hei…
arkadip-maitra Sep 19, 2025
b8c5ec5
[CD] Simplify NVIDIA driver installation step (#163349)
malfet Sep 19, 2025
52dd7a8
Move ROCM trunk wheel builds to 3.10 (#163339)
malfet Sep 19, 2025
03f34fd
Add explicit typing to nn.Module.__init__() parameters (#157389)
dsashidh Sep 19, 2025
bc7b17a
Realize LazyVariableTracker before raising exception (#163350)
guilhermeleobas Sep 19, 2025
979e10f
[Bugfix] Match eager stride semantics for cloned tensors with preserv…
Lucaskabela Sep 19, 2025
a273475
[BE] Introduce `CONDA_ROOT_DIR` (#163341)
malfet Sep 19, 2025
4a160da
[CUDA] revert PR 130472 (#162950)
thenumberouscode Sep 19, 2025
2a308c7
Revert "Improve device info with new flops and bandwidth formula base…
pytorchmergebot Sep 19, 2025
f8fb437
[SymmMem] Barrier on team instead of world (#163298)
kwen2501 Sep 18, 2025
7130b17
[SymmMem] Fix memory allocation hold-up (#162680)
kwen2501 Sep 18, 2025
ba3c2c8
SDP Backend function fix (#161169)
ahkush Sep 19, 2025
466122b
[inductor] avoid creating LoopBody twice (#162101)
shunting314 Sep 11, 2025
e88460f
[Inductor] don't call sympy_str when not needed (#162126)
shunting314 Sep 11, 2025
248156e
[Inductor] do loop reordering in a separate final round (#162355)
shunting314 Sep 11, 2025
df9a482
Bugfix for doing negative padding (#161639)
skpark-rh Sep 19, 2025
9f8a311
[Inductor][Intel GPU] Save `threads_per_warp` from tirton compiled ke…
etaf Sep 19, 2025
fab8455
Don't use declarations in global namespace in stable headers (#163352)
mikaylagawarecki Sep 19, 2025
e6a9db5
Add analytics ID to cpp docs (#163370)
svekars Sep 19, 2025
9b5ec0f
Use computed buffer sizes of torch for cusparseLt metadata (#163125)
aartbik Sep 19, 2025
0098e56
[CI] Move Windows build/tests to Python-3.10 (#162862)
malfet Sep 19, 2025
ee7bdd8
[graph partition] Add way to register custom rule (#163310)
zou3519 Sep 19, 2025
093f064
[CP][BE] Correct an incorrect docstring (#163131)
fegin Sep 18, 2025
8225a26
[dynamo] Fix issue with namedtuple slicing (#163351)
jansel Sep 19, 2025
bfe9e60
Simplify PrecompileContext to no longer be a CacheArtifactManager (#1…
jamesjwu Sep 20, 2025
a1df0b4
Lazy import to avoid circular import issue for DebugMode (#163381)
SherlockNoMad Sep 20, 2025
a31acf3
Clean up obsoleted vLLM tests (#163383)
huydhn Sep 20, 2025
e56dd5d
[Inductor-FX] Support torch.cond (#163234)
blaine-rister Sep 20, 2025
a87aea0
Update RandomSampler docstring. data_source must be Sized not Dataset…
dsashidh Sep 20, 2025
0b5a99b
remove duplicate import for defaultdict (#160519)
parsshar-RH Sep 20, 2025
df5d6d5
[inductor][triton heuristics] move allow tf32 out of config params (#…
coconutruben Sep 20, 2025
0ee331b
[inductor][choices] move extra kwargs out of get_template_configs (#1…
coconutruben Sep 20, 2025
d55c9d5
[CP] Fix cuDNN CP LSE dimension bug (#163231)
fegin Sep 18, 2025
5050cfa
[Opitmus] fix fp8 activation quatization for duplicates forward outpu…
mengluy0125 Sep 20, 2025
eb11d17
[Caffe2] Improve SVE batch box cox by 2% (#163360)
Nicoshev Sep 20, 2025
f9074c7
[STABLE ABI] Add copy_ operation. (#161895)
pearu Sep 19, 2025
d70c0ba
minimize graph capture output (#162211)
avikchaudhuri Sep 20, 2025
3938175
[1/n] Support cpu_tensor.to("cuda:0") in FakeTensorMode on cuda-less …
SherlockNoMad Sep 20, 2025
9e3725e
make fullgraph_capture work on mod, args, kwargs (#162849)
avikchaudhuri Sep 20, 2025
8e3fd3d
[AI Codemod][DevmatePerfOptimizationVectorReallocation] fbcode/caffe2…
yfeldblum Sep 20, 2025
e37b600
[CUDA][cuBLAS][FP8] Forward-fix #162022 (#163354)
eqy Sep 21, 2025
2887f3f
[BE] Slight improvements to documentation in python_dispatch (#162963)
ezyang Sep 19, 2025
97eb7a2
torchdim Python port (#160236)
ezyang Sep 20, 2025
5b386ee
[vllm hash update] update the pinned vllm hash (#163392)
pytorchupdatebot Sep 21, 2025
1ca9445
[BE][Ez]: Prevent copies of std::vector in CUDA ForeachOps (#163416)
Skylion007 Sep 21, 2025
f591bb5
Remove data_source argument from Sampler (#163134)
cyyever Sep 21, 2025
4a96a6f
[Docs] Fix indentations in cond.md (#156147)
windsonsea Sep 21, 2025
1faf636
Delete functorch C extension entirely. (#163340)
ezyang Sep 21, 2025
9ba9180
Add api info for torch._C._nn.pyi (#162707)
orangeH25 Sep 21, 2025
d8cbbc0
[Easy][AMP] Refactor the AMP logic for getting dtype (#162796)
fffrog Sep 12, 2025
5d8a226
[SymmMem] Promote `@requires_nvshmem` instead of `enable_triton` (#16…
kwen2501 Sep 21, 2025
f34744d
[inductor] bugfix: keep WeakDeps (WAR deps) during fusion (#162316)
v0i0 Sep 19, 2025
51152ef
Remove autograd code for Python < 3.9 (#163313)
cyyever Sep 21, 2025
5599f48
Fully native DTensor.__new__ (#162508)
swolchok Sep 18, 2025
4d3d32f
Add torchfuzz initial impl. (#163417)
laithsakka Sep 20, 2025
8b14f43
[torch] DRY a couple of lines in unpickler (#163447)
yfeldblum Sep 21, 2025
6ac2b3a
[BE] Adding aliases for CUDA and XPU API documentation (#162984)
jiannanWang Sep 21, 2025
8a281d7
[submodule] Bump libfmt to 12.0.0 (#163441)
cyyever Sep 21, 2025
0b59492
[export] Fix wrap_with_set_grad_enabled retracing (#163295)
angelayi Sep 21, 2025
01f927e
Remove workarounds for Python 3.6 (#163440)
cyyever Sep 22, 2025
281bb56
Enable half precision types on test_conv_cudnn_nhwc_support (#163444)
cyyever Sep 22, 2025
3a7db34
Revert "[SymmMem] Promote `@requires_nvshmem` instead of `enable_trit…
pytorchmergebot Sep 22, 2025
f007894
Revert "[RELAND] Always build USE_DISTRIBUTED (#160449) and Make dist…
pytorchmergebot Sep 22, 2025
ae5be03
Revert "Delete functorch C extension entirely. (#163340)"
pytorchmergebot Sep 22, 2025
edafc90
Revert "[BE] Make PyObjectSlot use a global PyInterpreter (#162659)"
pytorchmergebot Sep 22, 2025
96a3afb
Simplify BFLOAT16_AVAILABLE (#163445)
cyyever Sep 22, 2025
60b4791
[MPS] Fix compile linalg inv (#163452)
Isalia20 Sep 22, 2025
9f5a644
[BE] Update Python min version to 3.10 (#162310)
malfet Sep 22, 2025
10adeb9
Revert "[BE] Update Python min version to 3.10 (#162310)"
pytorchmergebot Sep 22, 2025
509c4e8
Update cutlass version for fbcode (#163091)
henrylhtsang Sep 19, 2025
eaac218
[ROCm] Fix environment variable AOTRITON_INSTALLED_PREFIX (#163373)
xinyazhang Sep 22, 2025
e310cc5
Update fbgemm submodule (#163411)
cthi Sep 22, 2025
9ca183e
switch from stack based to graph based aproach (#163459)
laithsakka Sep 22, 2025
06fe5b9
[AOTI] fix TestAOTInductorPackage temp file locked handler. (#163499)
xuhancn Sep 22, 2025
5e7be98
[BE] Update Python min version to 3.10 (#162310)
malfet Sep 22, 2025
281f8f4
Combine strong and weak refcounts in intrusive_ptr in a single refcou…
mcfi Sep 22, 2025
d279a6a
ci: Add a way to lint all files in a PR from label (#163525)
seemethere Sep 22, 2025
bec967e
Remove C++ and test branches for CUDA<12 (#163443)
cyyever Sep 22, 2025
3be9c86
[opaque obj] Initial OpaqueObject (#162660)
angelayi Sep 22, 2025
dd30667
[opaque_obj] Add set_payload + docs (#163276)
angelayi Sep 22, 2025
4941719
Enable logging for absolute memory estimation (#158799)
basilwong Sep 22, 2025
7e97811
Fix lint (#163542)
angelayi Sep 22, 2025
1818c36
[Fix] Restrict stride normalization to 1D tensors on export (#163282)
Kathryn-cat Sep 22, 2025
eaa613b
Revert "[opaque_obj] Add set_payload + docs (#163276)"
pytorchmergebot Sep 22, 2025
bf28990
Add support for NestedTensor share_memory_ (#162272)
adabeyta Sep 22, 2025
d150484
[opaque_obj] Add set_payload + docs (#163276)
angelayi Sep 22, 2025
6f9aef5
[2/n] Support module.to("cuda:0") in FakeTensorMode on cuda-less mach…
SherlockNoMad Sep 22, 2025
d008670
[triton] update 3.5 pin to bbb06c0334a6772b92d24bde54956e675c8c6604 (…
davidberard98 Sep 19, 2025
fd785b1
Add NestedTensor dispatch for _is_any_true/_is_all_true (#162096)
adabeyta Sep 22, 2025
e065d35
[BE]: Add a few more missing move from return indices (#163456)
Skylion007 Sep 22, 2025
46e1b7d
remove allow-untyped-defs from ./torch/utils/data/datapipes/iter/file…
bobrenjc93 Sep 22, 2025
cf28ab2
remove allow-untyped-defs from ./torch/ao/quantization/pt2e/duplicate…
bobrenjc93 Sep 22, 2025
02da475
Triton template IMA reads on B200 (#163460)
drisspg Sep 22, 2025
8abc2af
[STABLE ABI] Add clone method to torch::stable::Tensor (#161896)
pearu Sep 22, 2025
8e62d01
Add dynamic shapes doc (#159428)
svekars Sep 22, 2025
4027e97
[BE] Delete `skipIfMPSOnMacOS13` (#163515)
malfet Sep 22, 2025
09cb34c
[RELAND] Always build USE_DISTRIBUTED (#160449) and Make distributed …
ezyang Sep 22, 2025
e558f7a
[vllm hash update] update the pinned vllm hash (#163463)
pytorchupdatebot Sep 22, 2025
da05aa7
[BE] Use `output_t` directly (#163518)
malfet Sep 22, 2025
0256f91
[BUG] MaxUnpool2d/3d should check output dim before accessing its ele…
can-gaa-hou Sep 22, 2025
2b03663
Allow add_persistent_r_block to scale up rblock up to a limit (#162296)
PaulZhang12 Sep 17, 2025
7ea8998
Better decomp for torch.eye (#163386)
jansel Sep 22, 2025
36c2a13
[inductor] Fix bug where viewed outputs get padded (#163398)
jansel Sep 22, 2025
a1bd924
[inductor] Fallback on strided complex add (#163387)
jansel Sep 22, 2025
c8fd2b4
[inductor] Skip test_baddmm on XPU (#163414)
jansel Sep 22, 2025
4fc271e
[inductor] Don't require_dense for grid_sampler_2d_backward (#163415)
jansel Sep 22, 2025
e0cbab4
[Inductor] avoid CUDA__equal when constant tensors are from different…
cp2923 Sep 22, 2025
b756b58
Improve fake tensor leakage detection in export by not relying on gc …
tugsbayasgalan Sep 22, 2025
60c2bde
Replace Literal[None] with None in typing (#163489)
cyyever Sep 22, 2025
33daaad
dynamo: Handle objects in graph that do not support weakref (#163168)
c00w Sep 17, 2025
fa15fb0
[EZ] Remove XLA from unstable.yml (#163564)
malfet Sep 22, 2025
8da0086
Remove outdated commented CMake code (#163442)
cyyever Sep 22, 2025
68e75be
Update pytorch_sphinx_theme2 to latest hash (#163269)
svekars Sep 22, 2025
539e84e
[precompile] Add option to disable guard check on aot-compiled functi…
zhxchen17 Sep 23, 2025
3ef1bef
[sdpa] make sure to recompile if alignment is different than before (…
ColinPeppler Sep 19, 2025
2c7959e
[ignore][codex-test] Add typing to simple library registry (#161367)
bobrenjc93 Sep 23, 2025
8f30a8d
[AOTInductor] Add grid information for Triton Kernels (#160131)
muchulee8 Sep 22, 2025
e9300b2
remove allow-untyped-defs from ./torch/onnx/_internal/torchscript_exp…
bobrenjc93 Sep 22, 2025
6a48f57
[1/N] Remove 'type: ignore' suppressions (#163468)
cyyever Sep 23, 2025
447b8fc
[2/N] Use filesystem in inductor (#163465)
cyyever Sep 23, 2025
27164b6
Add fake_impl for _native_multi_head_attention (#163167)
ydwu4 Sep 23, 2025
0b75a16
[torchfuzz] Encapsulate fuzzing and codegen logic into ops (#163547)
bobrenjc93 Sep 22, 2025
95ac7d7
Rename to _debug_mode.py to make it private (#163534)
SherlockNoMad Sep 23, 2025
fcd79d5
[vllm hash update] update the pinned vllm hash (#163590)
pytorchupdatebot Sep 23, 2025
0e12238
[torchfuzz] remove supports_variable_inputs for now (#163553)
bobrenjc93 Sep 22, 2025
bb5be56
[torch][cuda][device_limits] Library for querying device hardware lim…
valentinandrei Sep 23, 2025
e3b392b
[BC breaking] Remove deprecated imports for torch.utils.data.datapipe…
cyyever Sep 23, 2025
d3a1345
Use functools.cache on has_efa (#163439)
cyyever Sep 23, 2025
19b754d
Revert "Update cutlass version for fbcode (#163091)"
pytorchmergebot Sep 23, 2025
08c5efd
[torchfuzz] cache operators (#163554)
bobrenjc93 Sep 22, 2025
d5e51d3
[torchfuzz] decompose -> fuzz_inputs_specs (#163555)
bobrenjc93 Sep 22, 2025
1545bb1
[torchfuzz] shuffle compatible ops (#163556)
bobrenjc93 Sep 22, 2025
309fe03
[torchfuzz] remove unneeded try catch (#163557)
bobrenjc93 Sep 22, 2025
45d9dcc
Update Kineto Submodule (#162222)
sraikund16 Sep 23, 2025
375f3e3
[OpenReg][Docs] Correct docs about `openreg` usage example. (#163235)
KarhouTam Sep 23, 2025
b426ba1
[torchfuzz] introduce tensor and scalar pointwise ops (#163558)
bobrenjc93 Sep 22, 2025
8d81564
[pt2][cache] rework cache for true generic usage + better tests (#163…
nmacchioni Sep 23, 2025
5d749ce
Remove test conditions for CUDA<12 (#163495)
cyyever Sep 23, 2025
3c64b2a
CUDA 13.0 Warning update for supported architectures (#163585)
atalman Sep 23, 2025
bda9ab2
[inductor] fix as_strided lowering with .view(dtype) inputs (#163319)
xmfan Sep 22, 2025
1a42656
[Flex attention] Fix flex attention head broadcast (#163426)
Isalia20 Sep 23, 2025
aff76c0
Revert "Add fake_impl for _native_multi_head_attention (#163167)"
pytorchmergebot Sep 23, 2025
e05c9c0
[ROCm][CI] cudagraph trees ut fixes (#163592)
jeffdaily Sep 23, 2025
4264fd3
Add basic tests for torch.distributed.tensor._utils.compute_global_te…
swolchok Sep 18, 2025
518c320
[inductor] libdevice.sqrt => tl.sqrt_rn (#163419)
jansel Sep 23, 2025
ed84e80
[inductor] Freeze layouts in FlexAttention (#163434)
jansel Sep 23, 2025
9c4d9f9
[inductor] Support out_dtype arg to matmul (#163393)
jansel Sep 23, 2025
6ef7487
[dynamo] Fix TorchFunctionMode handling with get_rng_state (#163412)
jansel Sep 23, 2025
49e7b2f
[inductor] Fix error from custom CUDA allocators (#163422)
jansel Sep 23, 2025
720a7b2
[export] Remove .contiguous() when saving weights to raw bytes (#163587)
yiming0416 Sep 23, 2025
0f67407
Large tests failing on bfloat16 (#163537)
drisspg Sep 22, 2025
b3cf5c7
Skip on sm100 later since Tests are non determinisitic (#163552)
drisspg Sep 22, 2025
5f0c7cb
Add B200 smoke test (#159494)
drisspg Sep 22, 2025
ebddbe7
[ROCm][CI] skip test_sparse_triangular_solve (#163651)
jeffdaily Sep 23, 2025
6e5dddb
Use accelerator API in common_dtensor (#163498)
dilililiwhy Sep 23, 2025
221ac81
Revert "[precompile] Add option to disable guard check on aot-compile…
pytorchmergebot Sep 23, 2025
134dfbe
[DCP] DTensor slice dequantization with proper block alignment (#163532)
saumishr Sep 23, 2025
fde929c
[AOTI] Fix model_package_loader get_cpp_compile_command (#163561)
xuhancn Sep 23, 2025
2aadcea
[ROCm] Improve perf for elementwise broadcast with mixed dtype (#163562)
jerrymannil Sep 23, 2025
649ceda
[export] handling NamedTuple inputs (#162959)
Raman-RH Sep 23, 2025
ca35dc2
[EZ] Fix UP041 violations (#163648)
malfet Sep 23, 2025
0696a4b
[EZ] Perma-ignore UP038 (#163649)
malfet Sep 23, 2025
8e6b0c7
[Inductor] Remove `no_type_check` annotation on properties (#163570)
blaine-rister Sep 23, 2025
bcb893a
[ROCm] Build FBGEMM_GENAI for gfx942 only (#162648)
jithunnair-amd Sep 23, 2025
22c5e8c
Add num_store to inductor_meta and use it to scale persistent reducti…
PaulZhang12 Sep 22, 2025
2a9745d
[multi-kernel] shape-similarity kernel selection (#163090)
pianpwk Sep 23, 2025
fc84743
Implement CUDA stream protocol (#163614)
msaroufim Sep 23, 2025
e671dcc
Update tests to check for more robust pattern (#163107)
tugsbayasgalan Sep 23, 2025
5ca563e
symintify fill_diagonol_ (#163485)
bobrenjc93 Sep 23, 2025
b182365
[ez] use list initializer syntax in fill_diagonal_ (#163607)
bobrenjc93 Sep 23, 2025
8c8416b
Update pytorch.org links in docs/conf.py (#163682)
svekars Sep 23, 2025
29af258
Less aggressive persistent reduction when it could induce large maski…
eellison Sep 23, 2025
c3d9f08
[torchfuzz] introduce multi process fuzzer (#163560)
bobrenjc93 Sep 23, 2025
c63e417
use reduction hint for aggressive rblock (#163371)
eellison Sep 23, 2025
b879ef7
[ROCm][CI] skip TestCudaPrimaryCtx.test_set_device_0 (#163693)
jeffdaily Sep 23, 2025
2014908
[MPS] Compute `offset2bag/bag_size/max_indices` in `_embedding_bag` (…
kurtamohler Sep 19, 2025
6b5ad5f
[Kineto] Add list of string parsing for profiler (#163593)
muchulee8 Sep 23, 2025
f3f67ff
Fix warn message (#163578)
drisspg Sep 22, 2025
f9fa138
[BE] Delete all pre py-3.10 checks (#163653)
malfet Sep 23, 2025
ee75c3d
Support for amin, amax, and aminmax (#163669)
srsuryadev Sep 23, 2025
eb3fbf5
[inductor] in emulate_precision_casts, disable fma fusion in triton (…
v0i0 Sep 23, 2025
4535254
[3/N] Use std::filesystem in inductor (#163632)
cyyever Sep 24, 2025
dc93529
[Triton] [Inductor] Restrict subprocess autotuning to just Triton (#1…
njriasan Sep 24, 2025
1e754d5
docs and optional kwargs for full graph capture (#163550)
avikchaudhuri Sep 24, 2025
be6c127
[AOTI] Pass comments from metadata to the autotune block (#163600)
desertfire Sep 23, 2025
e2ce79e
[Flex] Fix silent correctness w/ backpropping grads (#163677)
drisspg Sep 23, 2025
c261c71
Simplify _compute_local_shape_and_global_offset and make it SPMD. (#1…
ezyang Sep 19, 2025
ca512af
[inductor] Fix issue with scalar arg handling (#163481)
jansel Sep 23, 2025
6fa9727
[inductor] Fix bugs in emulate_precision_casts (#163520)
jansel Sep 23, 2025
d746b98
[inductor] Fix divmod error in decomp (#163482)
jansel Sep 23, 2025
42e9902
cd: Move arm64 to linux.arm64.r7g.12xlarge.memory (#163681)
seemethere Sep 23, 2025
6f1d962
[vllm hash update] update the pinned vllm hash (#163711)
pytorchupdatebot Sep 24, 2025
20eeb54
Add api info for torch._C._nn.pyi (#162936)
orangeH25 Sep 24, 2025
124dd36
[hop] support local_map + SAC (#163322)
xmfan Sep 23, 2025
0390798
[Triton] [Inductor] Enable Epilogue Subtiling in the blackwell ws tem…
njriasan Sep 24, 2025
a8e9ed2
[inductor] turn on loaf (for oss) by default (#162030)
shunting314 Sep 22, 2025
f68de58
[Inductor-FX] Support symbol and dynamic scalar graph inputs and outp…
blaine-rister Sep 24, 2025
2c5a3d7
Delete functorch C extension entirely. (#163340)
ezyang Sep 24, 2025
dad54ca
Add mistral/gpt-oss to benchmarks (#163565)
angelayi Sep 24, 2025
11a231e
[c10d] P2P tensors must be dense (#163719)
kwen2501 Sep 24, 2025
bf0747c
[Code Clean] Remove deadcodes about Python3.9 [1/N] (#163626)
fffrog Sep 24, 2025
0bca779
[Code Clean] Remove deadcodes about Python3.9 [2/N] (#163627)
fffrog Sep 24, 2025
33aabdd
[Code Clean] Remove deadcodes about Python3.9 [3/N] (#163629)
fffrog Sep 24, 2025
ec0cd81
[Code Clean] Remove deadcodes about Python3.9 [4/N] (#163643)
fffrog Sep 24, 2025
6f34cc0
[Code Clean] Remove deadcodes about Python3.9 [5/N] (#163644)
fffrog Sep 24, 2025
a635505
[Code Clean] Remove deadcodes about Python3.9 [6/N] (#163645)
fffrog Sep 24, 2025
2390d34
[Code Clean] Remove deadcodes about Python3.9 [7/N] (#163646)
fffrog Sep 24, 2025
3e1b1a3
Revert "[inductor] Fix issue with scalar arg handling" (#163737)
jansel Sep 24, 2025
207f104
[Triton] [Inductor] Set default configs for Blackwell Matmul Template…
njriasan Sep 24, 2025
b66aa1a
[ARM] Add test_memory_profiler to aarch64 tests (#145260)
robert-hardwick Sep 24, 2025
141fc72
[CD] CUDA 13.0 fix preload logic to include nvidia/cu13/lib/ (#163661)
atalman Sep 24, 2025
3b73841
update test_quantization tests to run weekly (#163077)
liangel-02 Sep 24, 2025
9d0d98a
Use cuda nvrtc so file based on cuda version used by torch (#163642)
atalman Sep 24, 2025
5d0f639
Make `Tensor.__dlpack__(stream=None)` capture-safe during CUDA Graph …
eee4017 Sep 24, 2025
4c2c401
Record redistribute_local_tensor in DebugMode (#163704)
SherlockNoMad Sep 24, 2025
9341ede
Revert to old behaviour of not padding strides if shape or stride is …
nandesuka Sep 24, 2025
768361e
Add less warps config to inner reductions (#162447)
PaulZhang12 Sep 24, 2025
c414f75
[WOQ][Inductor] Enable CUDA coverage for _weight_int8pack_mm (#163461)
bbeckca Sep 24, 2025
0456b23
[AOTI] Add verbose error information for extract file (#163718)
xuhancn Sep 24, 2025
71eec6a
[dist] handle discontiguous allgather/reducescatter inputs (#163712)
ngimel Sep 24, 2025
0dce2af
[ROCm][CI] adjust tf32 tolerance for test_compile_kernel_advanced (#1…
jeffdaily Sep 24, 2025
90a2825
Add `inference_mode` hint message to use `eval` with inference. (#163…
zeshengzong Sep 24, 2025
1495b35
Remove Python 3.9 for Triton builds (#163778)
atalman Sep 24, 2025
b40191b
Merge remote-tracking branch 'upstream/main' into rocm7.1_internal_te…
github-actions[bot] Sep 24, 2025
f3e8213
Fix merge conflicts
pragupta Sep 24, 2025
0ad8381
Address review comments wrt triton_heuristics and install_rocm
pragupta Sep 30, 2025
63fcd9b
update related_commits
pragupta Sep 30, 2025
77f4534
Fix more conflicts with triton_heuristics.py
pragupta Sep 30, 2025
13 changes: 4 additions & 9 deletions .ci/aarch64_linux/aarch64_ci_build.sh
Original file line number Diff line number Diff line change
@@ -5,9 +5,9 @@ GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}

# Set CUDA architecture lists to match x86 build_cuda.sh
if [[ "$GPU_ARCH_VERSION" == *"12.6"* ]]; then
export TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;8.0;9.0"
export TORCH_CUDA_ARCH_LIST="8.0;9.0"
elif [[ "$GPU_ARCH_VERSION" == *"12.8"* ]]; then
export TORCH_CUDA_ARCH_LIST="7.0;8.0;9.0;10.0;12.0"
export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
elif [[ "$GPU_ARCH_VERSION" == *"13.0"* ]]; then
export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;11.0;12.0+PTX"
fi
@@ -31,8 +31,7 @@ pip install -r /pytorch/requirements.txt
pip install auditwheel==6.2.0 wheel
if [ "$DESIRED_CUDA" = "cpu" ]; then
echo "BASE_CUDA_VERSION is not set. Building cpu wheel."
#USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files
USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
else
echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA"
export USE_SYSTEM_NCCL=1
@@ -42,13 +41,9 @@ else
echo "Bundling CUDA libraries with wheel for aarch64."
else
echo "Using nvidia libs from pypi for aarch64."
# Fix platform constraints in PYTORCH_EXTRA_INSTALL_REQUIREMENTS for aarch64
# Replace 'platform_machine == "x86_64"' with 'platform_machine == "aarch64"'
export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${PYTORCH_EXTRA_INSTALL_REQUIREMENTS//platform_machine == \'x86_64\'/platform_machine == \'aarch64\'}"
echo "Updated PYTORCH_EXTRA_INSTALL_REQUIREMENTS for aarch64: $PYTORCH_EXTRA_INSTALL_REQUIREMENTS"
export USE_NVIDIA_PYPI_LIBS=1
fi

#USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files
USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
fi
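The arch-list selection at the top of this script is plain bash glob matching on `GPU_ARCH_VERSION`; a standalone sketch with the post-PR lists (the sample `GPU_ARCH_VERSION` value here is an assumption for illustration):

```shell
#!/bin/bash
# Pick TORCH_CUDA_ARCH_LIST by glob-matching the CUDA version embedded
# in GPU_ARCH_VERSION, mirroring aarch64_ci_build.sh after this PR.
GPU_ARCH_VERSION="cuda12.8"   # hypothetical value for illustration

if [[ "$GPU_ARCH_VERSION" == *"12.6"* ]]; then
    export TORCH_CUDA_ARCH_LIST="8.0;9.0"
elif [[ "$GPU_ARCH_VERSION" == *"12.8"* ]]; then
    export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
elif [[ "$GPU_ARCH_VERSION" == *"13.0"* ]]; then
    export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;11.0;12.0+PTX"
fi
echo "$TORCH_CUDA_ARCH_LIST"
```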
20 changes: 9 additions & 11 deletions .ci/aarch64_linux/aarch64_wheel_ci_build.py
@@ -138,6 +138,8 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
folder = os.path.dirname(wheel_path)
os.mkdir(f"{folder}/tmp")
os.system(f"unzip {wheel_path} -d {folder}/tmp")
# Delete original wheel since it will be repackaged
os.system(f"rm {wheel_path}")

# Check if we should use PyPI NVIDIA libraries or bundle system libraries
use_nvidia_pypi_libs = os.getenv("USE_NVIDIA_PYPI_LIBS", "0") == "1"
@@ -211,7 +213,8 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
]

# CUDA version-specific libraries
if "130" in desired_cuda:
if "13" in desired_cuda:
minor_version = desired_cuda[-1]
version_specific_libs = [
"/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.13",
"/usr/local/cuda/lib64/libcublas.so.13",
@@ -221,7 +224,7 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
"/usr/local/cuda/lib64/libcusolver.so.12",
"/usr/local/cuda/lib64/libnvJitLink.so.13",
"/usr/local/cuda/lib64/libnvrtc.so.13",
"/usr/local/cuda/lib64/libnvrtc-builtins.so.13.0",
f"/usr/local/cuda/lib64/libnvrtc-builtins.so.13.{minor_version}",
]
elif "12" in desired_cuda:
# Get the last character for libnvrtc-builtins version (e.g., "129" -> "9")
@@ -237,6 +240,8 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
"/usr/local/cuda/lib64/libnvrtc.so.12",
f"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.{minor_version}",
]
else:
raise ValueError(f"Unsupported CUDA version: {desired_cuda}.")

# Combine all libraries
libs_to_copy = common_libs + version_specific_libs
@@ -275,14 +280,7 @@ def complete_wheel(folder: str) -> str:
f"/{folder}/dist/{repaired_wheel_name}",
)
else:
repaired_wheel_name = wheel_name.replace(
"linux_aarch64", "manylinux_2_28_aarch64"
)
print(f"Renaming {wheel_name} wheel to {repaired_wheel_name}")
os.rename(
f"/{folder}/dist/{wheel_name}",
f"/{folder}/dist/{repaired_wheel_name}",
)
repaired_wheel_name = list_dir(f"/{folder}/dist")[0]

print(f"Copying {repaired_wheel_name} to artifacts")
shutil.copy2(
@@ -319,7 +317,7 @@ def parse_arguments():
).decode()

print("Building PyTorch wheel")
build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 "
build_vars = ""
# MAX_JOB=5 is not required for CPU backend (see commit 465d98b)
if enable_cuda:
build_vars += "MAX_JOBS=5 "
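The CUDA version-specific library selection in the hunks above keys off a compact version string and takes its last character as the minor version. A minimal sketch of the idea (the helper name is hypothetical; version strings like "129"/"130" match the comment in the diff):

```python
def nvrtc_builtins_lib(desired_cuda: str) -> str:
    """Pick the libnvrtc-builtins path for a compact CUDA version string.

    The last character of e.g. "129" or "130" is the minor version, so
    CUDA 13.0 maps to libnvrtc-builtins.so.13.0 and 12.9 to .so.12.9.
    """
    if "13" in desired_cuda:
        major = "13"
    elif "12" in desired_cuda:
        major = "12"
    else:
        # Mirrors the new fallthrough added in this PR.
        raise ValueError(f"Unsupported CUDA version: {desired_cuda}.")
    minor = desired_cuda[-1]
    return f"/usr/local/cuda/lib64/libnvrtc-builtins.so.{major}.{minor}"
```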
4 changes: 2 additions & 2 deletions .ci/aarch64_linux/build_aarch64_wheel.py
@@ -241,7 +241,7 @@ def wait_for_connection(addr, port, timeout=15, attempt_cnt=5):
try:
with socket.create_connection((addr, port), timeout=timeout):
return
except (ConnectionRefusedError, socket.timeout): # noqa: PERF203
except (ConnectionRefusedError, TimeoutError): # noqa: PERF203
if i == attempt_cnt - 1:
raise
time.sleep(timeout)
@@ -1004,7 +1004,7 @@ def parse_arguments():
install_condaforge_python(host, args.python_version)
sys.exit(0)

python_version = args.python_version if args.python_version is not None else "3.9"
python_version = args.python_version if args.python_version is not None else "3.10"

if args.use_torch_from_pypi:
configure_system(host, compiler=args.compiler, python_version=python_version)
12 changes: 4 additions & 8 deletions .ci/docker/build.sh
@@ -214,8 +214,7 @@ case "$tag" in
TRITON=yes
;;
pytorch-linux-jammy-py3-gcc11-inductor-benchmarks)
# TODO (huydhn): Upgrade this to Python >= 3.10
ANACONDA_PYTHON_VERSION=3.9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=11
VISION=yes
KATEX=yes
@@ -263,13 +262,10 @@ case "$tag" in
TRITON_CPU=yes
;;
pytorch-linux-jammy-linter)
# TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627.
# We will need to update mypy version eventually, but that's for another day. The task
# would be to upgrade mypy to 1.0.0 with Python 3.11
PYTHON_VERSION=3.9
PYTHON_VERSION=3.10
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter)
PYTHON_VERSION=3.9
pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter)
PYTHON_VERSION=3.10
CUDA_VERSION=12.8.1
;;
pytorch-linux-jammy-aarch64-py3.10-gcc11)
6 changes: 5 additions & 1 deletion .ci/docker/centos-rocm/Dockerfile
@@ -59,9 +59,13 @@ ENV INSTALLED_VISION ${VISION}

# Install rocm
ARG ROCM_VERSION
RUN mkdir ci_commit_pins
COPY ./common/common_utils.sh common_utils.sh
COPY ./ci_commit_pins/rocm-composable-kernel.txt ci_commit_pins/rocm-composable-kernel.txt
COPY ./common/install_rocm.sh install_rocm.sh
RUN bash ./install_rocm.sh
RUN rm install_rocm.sh
RUN rm install_rocm.sh common_utils.sh
RUN rm -r ci_commit_pins
COPY ./common/install_rocm_magma.sh install_rocm_magma.sh
RUN bash ./install_rocm_magma.sh ${ROCM_VERSION}
RUN rm install_rocm_magma.sh
2 changes: 1 addition & 1 deletion .ci/docker/ci_commit_pins/executorch.txt
@@ -1 +1 @@
56392aa978594cc155fa8af48cd949f5b5f1823a
e0dda9059d082537cee36be6c5e4fe3b18c880c0
2 changes: 1 addition & 1 deletion .ci/docker/ci_commit_pins/huggingface-requirements.txt
@@ -1,2 +1,2 @@
transformers==4.54.0
transformers==4.56.0
soxr==0.5.0
1 change: 1 addition & 0 deletions .ci/docker/ci_commit_pins/rocm-composable-kernel.txt
@@ -0,0 +1 @@
7fe50dc3da2069d6645d9deb8c017a876472a977
23 changes: 14 additions & 9 deletions .ci/docker/common/install_executorch.sh
@@ -42,22 +42,27 @@ install_pip_dependencies() {
# A workaround, ExecuTorch has moved to numpy 2.0 which is not compatible with the current
# numba and scipy version used in PyTorch CI
conda_run pip uninstall -y numba scipy
# Yaspin is needed for running CI test (get_benchmark_analysis_data.py)
pip_install yaspin==3.1.0

popd
}

setup_executorch() {
pushd executorch

export PYTHON_EXECUTABLE=python
export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"
export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON -DEXECUTORCH_BUILD_TESTS=ON"

as_jenkins .ci/scripts/setup-linux.sh --build-tool cmake || true
popd
}

clone_executorch
install_buck2
install_conda_dependencies
install_pip_dependencies
setup_executorch
if [ $# -eq 0 ]; then
clone_executorch
install_buck2
install_conda_dependencies
install_pip_dependencies
pushd executorch
setup_executorch
popd
else
"$@"
fi
14 changes: 10 additions & 4 deletions .ci/docker/common/install_rocm.sh
@@ -2,6 +2,11 @@

set -ex

# for pip_install function
source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"

ROCM_COMPOSABLE_KERNEL_VERSION="$(cat $(dirname $0)/../ci_commit_pins/rocm-composable-kernel.txt)"

ver() {
printf "%3d%03d%03d%03d" $(echo "$1" | tr '.' ' ');
}
@@ -109,8 +114,7 @@ EOF
rm -rf HIP clr
fi

# temporary hipblasLT dependency install
apt install libmsgpackc2
pip_install "git+https://github.com/rocm/composable_kernel@$ROCM_COMPOSABLE_KERNEL_VERSION"

# Cleanup
apt-get autoclean && apt-get clean
@@ -122,8 +126,8 @@ install_centos() {
yum update -y
yum install -y kmod
yum install -y wget
if [[ $OS_VERSION == 9 ]]; then

if [[ $OS_VERSION == 9 ]]; then
dnf install -y openblas-serial
dnf install -y dkms kernel-headers kernel-devel
else
@@ -195,6 +199,8 @@ install_centos() {
sqlite3 $kdb "PRAGMA journal_mode=off; PRAGMA VACUUM;"
done

pip_install "git+https://github.com/rocm/composable_kernel@$ROCM_COMPOSABLE_KERNEL_VERSION"

# Cleanup
yum clean all
rm -rf /var/cache/yum
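Both the Ubuntu and CentOS paths above now install composable_kernel from a commit pinned in `ci_commit_pins/rocm-composable-kernel.txt` instead of from a distro package. A hedged sketch of that pin-file pattern (the variable name and URL mirror the diff; the real `pip_install` helper comes from `common_utils.sh`, so a stand-in echo is used here):

```shell
#!/bin/bash
set -euo pipefail

# Stand-in for the pip_install helper sourced from common_utils.sh; the real
# helper wraps `pip install` with CI-specific flags. Replace the echo to
# actually install.
pip_install() {
  echo "pip install $*"
}

# A pin file is laid out like ci_commit_pins/rocm-composable-kernel.txt:
# a single commit hash on one line.
pin_file=$(mktemp)
echo "7fe50dc3da2069d6645d9deb8c017a876472a977" > "$pin_file"

# Read the pinned commit and build the VCS requirement, as install_rocm.sh does.
ROCM_COMPOSABLE_KERNEL_VERSION="$(cat "$pin_file")"
pip_install "git+https://github.com/rocm/composable_kernel@$ROCM_COMPOSABLE_KERNEL_VERSION"
rm -f "$pin_file"
```

Pinning to an exact commit keeps the dependency reproducible across image rebuilds, unlike tracking a branch.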
5 changes: 2 additions & 3 deletions .ci/docker/requirements-ci.txt
@@ -93,8 +93,9 @@ librosa==0.10.2 ; python_version == "3.12" and platform_machine != "s390x"
#Pinned versions:
#test that import:

mypy==1.16.0
mypy==1.16.0 ; platform_system != "Windows"
# Pin MyPy version because new errors are likely to appear with each release
# Skip on Windows as lots of type annotations are POSIX specific
#Description: linter
#Pinned versions: 1.16.0
#test that import: test_typing.py, test_type_hints.py
@@ -322,8 +323,6 @@ lxml==5.3.0 ; python_version <= "3.12"
lxml==6.0.0 ; python_version == "3.13"
#Description: This is a requirement of unittest-xml-reporting

# Python-3.9 binaries

PyGithub==2.3.0

sympy==1.13.3
2 changes: 1 addition & 1 deletion .ci/docker/requirements-docs.txt
@@ -1,7 +1,7 @@
sphinx==5.3.0
#Description: This is used to generate PyTorch docs
#Pinned versions: 5.3.0
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@1657ad2fc1acdc98aa719eebecbb0128a7c13ce4#egg=pytorch_sphinx_theme2
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@d53b0ffb9b1cda68260693ea98f3483823c88d8e#egg=pytorch_sphinx_theme2

# TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
# but it doesn't seem to work and hangs around idly. The initial thought that it is probably
6 changes: 5 additions & 1 deletion .ci/docker/ubuntu-rocm/Dockerfile
@@ -52,9 +52,13 @@ ENV INSTALLED_VISION ${VISION}

# Install rocm
ARG ROCM_VERSION
RUN mkdir ci_commit_pins
COPY ./common/common_utils.sh common_utils.sh
COPY ./ci_commit_pins/rocm-composable-kernel.txt ci_commit_pins/rocm-composable-kernel.txt
COPY ./common/install_rocm.sh install_rocm.sh
RUN bash ./install_rocm.sh
RUN rm install_rocm.sh
RUN rm install_rocm.sh common_utils.sh
RUN rm -r ci_commit_pins
COPY ./common/install_rocm_magma.sh install_rocm_magma.sh
RUN bash ./install_rocm_magma.sh ${ROCM_VERSION}
RUN rm install_rocm_magma.sh
2 changes: 1 addition & 1 deletion .ci/libtorch/build.sh
@@ -7,4 +7,4 @@ set -ex

SCRIPTPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

USE_NVSHMEM=0 USE_CUSPARSELT=0 BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.9" ${SCRIPTPATH}/../manywheel/build.sh
USE_NVSHMEM=0 USE_CUSPARSELT=0 BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.10" ${SCRIPTPATH}/../manywheel/build.sh
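The context line above uses the common `SCRIPTPATH` idiom to resolve the script's own directory, so the sibling `manywheel/build.sh` can be invoked by relative path no matter where the caller's working directory is. A minimal sketch of the idiom:

```shell
#!/bin/bash
set -e

# Resolve the directory containing this script, robust to being invoked from
# any working directory (same idiom as .ci/libtorch/build.sh).
SCRIPTPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

echo "$SCRIPTPATH"
# Per-invocation environment variables can then be prefixed onto the call,
# as the build script does, e.g.:
#   BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.10" "$SCRIPTPATH/../manywheel/build.sh"
```

The `cd ... && pwd` pair canonicalizes the path even when the script was invoked through a relative path.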
8 changes: 2 additions & 6 deletions .ci/lumen_cli/cli/lib/core/vllm/lib.py
@@ -41,7 +41,6 @@ def sample_vllm_test_library():
"pytest -v -s basic_correctness/test_cumem.py",
"pytest -v -s basic_correctness/test_basic_correctness.py",
"pytest -v -s basic_correctness/test_cpu_offload.py",
"VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py",
],
},
"vllm_basic_models_test": {
@@ -68,15 +67,12 @@ def sample_vllm_test_library():
"-v",
"-s",
"entrypoints/llm",
"--ignore=entrypoints/llm/test_lazy_outlines.py",
"--ignore=entrypoints/llm/test_generate.py",
"--ignore=entrypoints/llm/test_generate_multiple_loras.py",
"--ignore=entrypoints/llm/test_collective_rpc.py",
]
),
"pytest -v -s entrypoints/llm/test_lazy_outlines.py",
"pytest -v -s entrypoints/llm/test_generate.py ",
"VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode",
"pytest -v -s entrypoints/llm/test_generate.py",
"pytest -v -s entrypoints/offline_mode",
],
},
"vllm_regression_test": {
11 changes: 11 additions & 0 deletions .ci/lumen_cli/cli/lib/core/vllm/vllm_build.py
@@ -66,6 +66,11 @@ class VllmBuildParameters:
"DOCKERFILE_PATH", ".github/ci_configs/vllm/Dockerfile.tmp_vllm"
)

# the cleaning script to remove torch dependencies from pip
cleaning_script: Path = env_path_field(
"cleaning_script", ".github/ci_configs/vllm/use_existing_torch.py"
)

# OUTPUT_DIR: where docker buildx (local exporter) will write artifacts
output_dir: Path = env_path_field("OUTPUT_DIR", "external/vllm")

@@ -160,6 +165,7 @@ def run(self):
logger.info("Running vllm build with inputs: %s", inputs)
vllm_commit = clone_vllm()

self.cp_torch_cleaning_script(inputs)
self.cp_dockerfile_if_exist(inputs)
# cp torch wheels from root direct to vllm workspace if exist
self.cp_torch_whls_if_exist(inputs)
@@ -205,6 +211,11 @@ def cp_torch_whls_if_exist(self, inputs: VllmBuildParameters) -> str:
copy(inputs.torch_whls_path, tmp_dir)
return tmp_dir

def cp_torch_cleaning_script(self, inputs: VllmBuildParameters):
script = get_path(inputs.cleaning_script, resolve=True)
vllm_script = Path(f"./{self.work_directory}/use_existing_torch.py")
copy(script, vllm_script)

def cp_dockerfile_if_exist(self, inputs: VllmBuildParameters):
if not inputs.use_local_dockerfile:
logger.info("using vllm default dockerfile.torch_nightly for build")
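The new `cp_torch_cleaning_script` step copies vLLM's `use_existing_torch.py` into the build workspace so the image reuses the CI-built torch wheels rather than pulling torch from an index. As a hedged illustration only (the real script is Python and its exact behavior is not shown in this diff), removing torch-family pins from a requirements file can be sketched as:

```shell
#!/bin/bash
set -euo pipefail

# Illustrative only: drop torch-family pins from a requirements file so an
# externally provided torch wheel is used instead. The real
# use_existing_torch.py may differ in scope and in which files it touches.
req=$(mktemp)
printf '%s\n' "torch==2.5.0" "torchvision==0.20.0" "numpy==2.0.0" > "$req"

# Keep every requirement whose name is not torch/torchvision/torchaudio.
grep -vE '^(torch|torchvision|torchaudio)([=<>]|$)' "$req" > "$req.cleaned"
cat "$req.cleaned"
```

The anchored regex matches only the package-name position, so packages that merely contain "torch" in a version specifier or comment are left alone.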