Skip to content

Commit 189aa88

Browse files
committed
Merge upstream branch 'release/2.7' into release/2.7
2 parents d17e222 + e2d141d commit 189aa88

File tree

77 files changed

+988
-298
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the searchbox below for content that may be hidden.

77 files changed

+988
-298
lines changed

.ci/aarch64_linux/aarch64_wheel_ci_build.py

Lines changed: 33 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -55,9 +55,22 @@ def build_ArmComputeLibrary() -> None:
5555
shutil.copytree(f"{acl_checkout_dir}/{d}", f"{acl_install_dir}/{d}")
5656

5757

58-
def update_wheel(wheel_path, desired_cuda) -> None:
58+
def replace_tag(filename) -> None:
59+
with open(filename) as f:
60+
lines = f.readlines()
61+
for i, line in enumerate(lines):
62+
if line.startswith("Tag:"):
63+
lines[i] = line.replace("-linux_", "-manylinux_2_28_")
64+
print(f"Updated tag from {line} to {lines[i]}")
65+
break
66+
67+
with open(filename, "w") as f:
68+
f.writelines(lines)
69+
70+
71+
def package_cuda_wheel(wheel_path, desired_cuda) -> None:
5972
"""
60-
Update the cuda wheel libraries
73+
Package the cuda wheel libraries
6174
"""
6275
folder = os.path.dirname(wheel_path)
6376
wheelname = os.path.basename(wheel_path)
@@ -88,30 +101,19 @@ def update_wheel(wheel_path, desired_cuda) -> None:
88101
"/usr/lib64/libgfortran.so.5",
89102
"/acl/build/libarm_compute.so",
90103
"/acl/build/libarm_compute_graph.so",
104+
"/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
105+
"/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
106+
"/usr/local/lib/libnvpl_lapack_core.so.0",
107+
"/usr/local/lib/libnvpl_blas_core.so.0",
91108
]
92-
if enable_cuda:
93-
libs_to_copy += [
94-
"/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
95-
"/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
96-
"/usr/local/lib/libnvpl_lapack_core.so.0",
97-
"/usr/local/lib/libnvpl_blas_core.so.0",
98-
]
99-
if "126" in desired_cuda:
100-
libs_to_copy += [
101-
"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.6",
102-
"/usr/local/cuda/lib64/libcufile.so.0",
103-
"/usr/local/cuda/lib64/libcufile_rdma.so.1",
104-
]
105-
elif "128" in desired_cuda:
106-
libs_to_copy += [
107-
"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.8",
108-
"/usr/local/cuda/lib64/libcufile.so.0",
109-
"/usr/local/cuda/lib64/libcufile_rdma.so.1",
110-
]
111-
else:
109+
110+
if "128" in desired_cuda:
112111
libs_to_copy += [
113-
"/opt/OpenBLAS/lib/libopenblas.so.0",
112+
"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.8",
113+
"/usr/local/cuda/lib64/libcufile.so.0",
114+
"/usr/local/cuda/lib64/libcufile_rdma.so.1",
114115
]
116+
115117
# Copy libraries to unzipped_folder/a/lib
116118
for lib_path in libs_to_copy:
117119
lib_name = os.path.basename(lib_path)
@@ -120,6 +122,13 @@ def update_wheel(wheel_path, desired_cuda) -> None:
120122
f"cd {folder}/tmp/torch/lib/; "
121123
f"patchelf --set-rpath '$ORIGIN' --force-rpath {folder}/tmp/torch/lib/{lib_name}"
122124
)
125+
126+
# Make sure the wheel is tagged with manylinux_2_28
127+
for f in os.scandir(f"{folder}/tmp/"):
128+
if f.is_dir() and f.name.endswith(".dist-info"):
129+
replace_tag(f"{f.path}/WHEEL")
130+
break
131+
123132
os.mkdir(f"{folder}/cuda_wheel")
124133
os.system(f"cd {folder}/tmp/; zip -r {folder}/cuda_wheel/{wheelname} *")
125134
shutil.move(
@@ -242,6 +251,6 @@ def parse_arguments():
242251
print("Updating Cuda Dependency")
243252
filename = os.listdir("/pytorch/dist/")
244253
wheel_path = f"/pytorch/dist/{filename[0]}"
245-
update_wheel(wheel_path, desired_cuda)
254+
package_cuda_wheel(wheel_path, desired_cuda)
246255
pytorch_wheel_name = complete_wheel("/pytorch/")
247256
print(f"Build Complete. Created {pytorch_wheel_name}..")
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
01a22b6f16d117454b7d21ebdc691b0785b84a7f
1+
ebe8522378c3f9944aaaef44868f5ececdd845fc

.ci/docker/common/install_executorch.sh

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,7 @@ setup_executorch() {
5050
pushd executorch
5151

5252
export PYTHON_EXECUTABLE=python
53-
export EXECUTORCH_BUILD_PYBIND=ON
54-
export CMAKE_ARGS="-DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"
53+
export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"
5554

5655
as_jenkins .ci/scripts/setup-linux.sh --build-tool cmake || true
5756
popd

.ci/docker/common/install_halide.sh

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,9 @@ git clone https://github.com/halide/Halide.git
3535
pushd Halide
3636
git checkout ${COMMIT} && git submodule update --init --recursive
3737
pip_install -r requirements.txt
38-
cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -S . -B build
38+
# NOTE: pybind has a requirement for cmake > 3.5 so set the minimum cmake version here with a flag
39+
# Context: https://github.com/pytorch/pytorch/issues/150420
40+
cmake -G Ninja -DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DCMAKE_BUILD_TYPE=Release -S . -B build
3941
cmake --build build
4042
test -e ${CONDA_PREFIX}/lib/python3 || ln -s python${ANACONDA_PYTHON_VERSION} ${CONDA_PREFIX}/lib/python3
4143
cmake --install build --prefix ${CONDA_PREFIX}

.ci/docker/common/install_inductor_benchmark_deps.sh

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,6 @@ function install_timm() {
1414
local commit
1515
commit=$(get_pinned_commit timm)
1616

17-
# TODO (huydhn): There is no torchvision release on 3.13 when I write this, so
18-
# I'm using nightly here instead. We just need to package to be able to install
19-
# TIMM. Removing this once vision has a release on 3.13
20-
if [[ "${ANACONDA_PYTHON_VERSION}" == "3.13" ]]; then
21-
pip_install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu124
22-
fi
23-
2417
pip_install "git+https://github.com/huggingface/pytorch-image-models@${commit}"
2518
# Clean up
2619
conda_run pip uninstall -y cmake torch torchvision triton

.ci/docker/triton_version.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
3.3.0
1+
3.3.1

.ci/manywheel/build_common.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -333,8 +333,8 @@ for pkg in /$WHEELHOUSE_DIR/torch_no_python*.whl /$WHEELHOUSE_DIR/torch*linux*.w
333333
# ROCm workaround for roctracer dlopens
334334
if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then
335335
patchedpath=$(fname_without_so_number $destpath)
336-
# Keep the so number for XPU dependencies
337-
elif [[ "$DESIRED_CUDA" == *"xpu"* ]]; then
336+
# Keep the so number for XPU dependencies and libgomp.so.1 to avoid twice load
337+
elif [[ "$DESIRED_CUDA" == *"xpu"* || "$filename" == "libgomp.so.1" ]]; then
338338
patchedpath=$destpath
339339
else
340340
patchedpath=$(fname_with_sha256 $destpath)

.ci/pytorch/check_binary.sh

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -59,16 +59,6 @@ else
5959
export install_root="$(dirname $(which python))/../lib/python${py_dot}/site-packages/torch/"
6060
fi
6161

62-
###############################################################################
63-
# Setup XPU ENV
64-
###############################################################################
65-
if [[ "$DESIRED_CUDA" == 'xpu' ]]; then
66-
set +u
67-
# Refer https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html
68-
source /opt/intel/oneapi/compiler/latest/env/vars.sh
69-
source /opt/intel/oneapi/pti/latest/env/vars.sh
70-
fi
71-
7262
###############################################################################
7363
# Check GCC ABI
7464
###############################################################################

.ci/pytorch/install_cache_xla.sh

Lines changed: 35 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,31 +1,50 @@
11
#!/bin/bash
22

33
# Script for installing sccache on the xla build job, which uses xla's docker
4-
# image and doesn't have sccache installed on it. This is mostly copied from
5-
# .ci/docker/install_cache.sh. Changes are: removing checks that will always
6-
# return the same thing, ex checks for for rocm, CUDA, and changing the path
7-
# where sccache is installed, and not changing /etc/environment.
4+
# image, which has sccache installed but doesn't write the stubs. This is
5+
# mostly copied from .ci/docker/install_cache.sh. Changes are: removing checks
6+
# that will always return the same thing, ex checks for for rocm, CUDA, changing
7+
# the path where sccache is installed, not changing /etc/environment, and not
8+
# installing/downloading sccache as it is already in the docker image.
89

910
set -ex -o pipefail
1011

11-
install_binary() {
12-
echo "Downloading sccache binary from S3 repo"
13-
curl --retry 3 https://s3.amazonaws.com/ossci-linux/sccache -o /tmp/cache/bin/sccache
14-
}
15-
1612
mkdir -p /tmp/cache/bin
17-
mkdir -p /tmp/cache/lib
1813
export PATH="/tmp/cache/bin:$PATH"
1914

20-
install_binary
21-
chmod a+x /tmp/cache/bin/sccache
22-
2315
function write_sccache_stub() {
2416
# Unset LD_PRELOAD for ps because of asan + ps issues
2517
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90589
26-
# shellcheck disable=SC2086
27-
# shellcheck disable=SC2059
28-
printf "#!/bin/sh\nif [ \$(env -u LD_PRELOAD ps -p \$PPID -o comm=) != sccache ]; then\n exec sccache $(which $1) \"\$@\"\nelse\n exec $(which $1) \"\$@\"\nfi" > "/tmp/cache/bin/$1"
18+
if [ "$1" == "gcc" ]; then
19+
# Do not call sccache recursively when dumping preprocessor argument
20+
# For some reason it's very important for the first cached nvcc invocation
21+
cat >"/tmp/cache/bin/$1" <<EOF
22+
#!/bin/sh
23+
24+
# sccache does not support -E flag, so we need to call the original compiler directly in order to avoid calling this wrapper recursively
25+
for arg in "\$@"; do
26+
if [ "\$arg" = "-E" ]; then
27+
exec $(which "$1") "\$@"
28+
fi
29+
done
30+
31+
if [ \$(env -u LD_PRELOAD ps -p \$PPID -o comm=) != sccache ]; then
32+
exec sccache $(which "$1") "\$@"
33+
else
34+
exec $(which "$1") "\$@"
35+
fi
36+
EOF
37+
else
38+
cat >"/tmp/cache/bin/$1" <<EOF
39+
#!/bin/sh
40+
41+
if [ \$(env -u LD_PRELOAD ps -p \$PPID -o comm=) != sccache ]; then
42+
exec sccache $(which "$1") "\$@"
43+
else
44+
exec $(which "$1") "\$@"
45+
fi
46+
EOF
47+
fi
2948
chmod a+x "/tmp/cache/bin/$1"
3049
}
3150

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
import ctypes
2+
import os
3+
import sys
4+
from pathlib import Path
5+
6+
7+
def get_gomp_thread():
    """
    Retrieves the maximum number of OpenMP threads after loading the `libgomp.so.1` library
    and the `libtorch_cpu.so` library. It then queries the
    maximum number of threads available for OpenMP parallel regions using the
    `omp_get_max_threads` function.

    Returns:
        int: The maximum number of OpenMP threads available.

    Raises:
        OSError: If either shared library cannot be loaded.

    Notes:
        - The function assumes the default path for `libgomp.so.1` on AlmaLinux OS.
        - The path to `libtorch_cpu.so` is constructed based on the Python executable's
          installation directory.
        - This function is specific to environments where PyTorch and OpenMP are used
          together and may require adjustments for other setups.
    """
    python_path = Path(sys.executable).resolve()
    python_prefix = (
        python_path.parent.parent
    )  # Typically goes to the Python installation root

    # Get the additional ABI flags (if any); it may be an empty string.
    abiflags = getattr(sys, "abiflags", "")

    # Construct the Python directory name correctly (e.g., "python3.13t").
    python_version = (
        f"python{sys.version_info.major}.{sys.version_info.minor}{abiflags}"
    )

    libtorch_cpu_path = (
        python_prefix
        / "lib"
        / python_version
        / "site-packages"
        / "torch"
        / "lib"
        / "libtorch_cpu.so"
    )

    # use the default gomp path of AlmaLinux OS
    libgomp_path = "/usr/lib64/libgomp.so.1"

    # NOTE(review): presumably set so that gomp initializes thread affinity
    # deterministically before the libraries are loaded — confirm with caller.
    os.environ["GOMP_CPU_AFFINITY"] = "0-3"

    # Load libgomp first, then libtorch_cpu (which links against libgomp).
    # The OpenMP query is resolved through the libtorch_cpu handle, matching
    # the library torch itself would use at runtime.
    ctypes.CDLL(libgomp_path)
    # str() for pre-3.12 compatibility: ctypes.CDLL only accepts path-like
    # objects since Python 3.12.
    libtorch_cpu = ctypes.CDLL(str(libtorch_cpu_path))

    libtorch_cpu.omp_get_max_threads.restype = ctypes.c_int
    libtorch_cpu.omp_get_max_threads.argtypes = []

    omp_max_threads = libtorch_cpu.omp_get_max_threads()
    return omp_max_threads
60+
61+
62+
def main():
    """Print the OpenMP thread count and fail when libgomp appears double-loaded."""
    omp_max_threads = get_gomp_thread()
    print(
        f"omp_max_threads after loading libgomp.so and libtorch_cpu.so: {omp_max_threads}"
    )
    # Guard clause: anything other than a single thread means OpenMP is healthy.
    if omp_max_threads != 1:
        return
    raise RuntimeError(
        "omp_max_threads is 1. Check whether libgomp.so is loaded twice."
    )


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)