Skip to content

Commit 2b05fbd

Browse files
Merge remote-tracking branch 'upstream/main' into rocm7.1_internal_testing_IFU_2025-09-08
# Conflicts:
#   .ci/docker/ci_commit_pins/triton.txt
#   .ci/docker/requirements-ci.txt
#   aten/src/ATen/Context.cpp
#   aten/src/ATen/cuda/tunable/GemmHipblaslt.h
#   aten/src/ATen/native/ConvUtils.h
#   aten/src/ATen/native/Convolution.cpp
#   aten/src/ATen/native/Normalization.cpp
#   aten/src/ATen/native/cuda/Blas.cpp
#   aten/src/ATen/native/miopen/Conv_miopen.cpp
#   requirements.txt
#   test/distributed/_tools/test_fsdp2_mem_tracker.py
#   test/distributed/tensor/parallel/test_tp_examples.py
#   test/dynamo/test_activation_checkpointing.py
#   test/dynamo/test_structured_trace.py
#   test/inductor/test_aot_inductor.py
#   test/inductor/test_combo_kernels.py
#   test/test_matmul_cuda.py
#   test/test_sparse.py
#   torch/_higher_order_ops/triton_kernel_wrap.py
#   torch/_inductor/choices.py
#   torch/_inductor/codegen/triton.py
#   torch/testing/_internal/common_cuda.py
2 parents 681e60e + bc4176c commit 2b05fbd

File tree

2,118 files changed

+110178
-87227
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

2,118 files changed

+110178
-87227
lines changed

.bc-linter.yml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
version: 1
2+
paths:
3+
include:
4+
- "**/*.py"
5+
exclude:
6+
- ".*"
7+
- ".*/**"
8+
- "**/.*/**"
9+
- "**/.*"
10+
- "**/_*/**"
11+
- "**/_*.py"
12+
- "**/test/**"
13+
- "**/benchmarks/**"
14+
- "**/test_*.py"
15+
- "**/*_test.py"

.ci/aarch64_linux/aarch64_ci_build.sh

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,15 @@ if [[ "$GPU_ARCH_VERSION" == *"12.9"* ]]; then
77
export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
88
fi
99

10+
if [[ "$GPU_ARCH_VERSION" == *"13.0"* ]]; then
11+
export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;11.0;12.0"
12+
fi
13+
14+
# Compress the fatbin with -compress-mode=size for CUDA 13
15+
if [[ "$DESIRED_CUDA" == *"13"* ]]; then
16+
export TORCH_NVCC_FLAGS="-compress-mode=size"
17+
fi
18+
1019
SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
1120
source $SCRIPTPATH/aarch64_ci_setup.sh
1221

.ci/aarch64_linux/aarch64_wheel_ci_build.py

Lines changed: 46 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -77,44 +77,66 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
7777
wheelname = os.path.basename(wheel_path)
7878
os.mkdir(f"{folder}/tmp")
7979
os.system(f"unzip {wheel_path} -d {folder}/tmp")
80-
libs_to_copy = [
81-
"/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12",
80+
# Common libraries for all CUDA versions
81+
common_libs = [
82+
# Non-NVIDIA system libraries
83+
"/lib64/libgomp.so.1",
84+
"/usr/lib64/libgfortran.so.5",
85+
"/acl/build/libarm_compute.so",
86+
"/acl/build/libarm_compute_graph.so",
87+
# Common CUDA libraries (same for all versions)
88+
"/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
89+
"/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
90+
"/usr/local/lib/libnvpl_lapack_core.so.0",
91+
"/usr/local/lib/libnvpl_blas_core.so.0",
8292
"/usr/local/cuda/extras/CUPTI/lib64/libnvperf_host.so",
8393
"/usr/local/cuda/lib64/libcudnn.so.9",
84-
"/usr/local/cuda/lib64/libcublas.so.12",
85-
"/usr/local/cuda/lib64/libcublasLt.so.12",
86-
"/usr/local/cuda/lib64/libcudart.so.12",
87-
"/usr/local/cuda/lib64/libcufft.so.11",
88-
"/usr/local/cuda/lib64/libcusparse.so.12",
8994
"/usr/local/cuda/lib64/libcusparseLt.so.0",
90-
"/usr/local/cuda/lib64/libcusolver.so.11",
9195
"/usr/local/cuda/lib64/libcurand.so.10",
9296
"/usr/local/cuda/lib64/libnccl.so.2",
93-
"/usr/local/cuda/lib64/libnvJitLink.so.12",
94-
"/usr/local/cuda/lib64/libnvrtc.so.12",
97+
"/usr/local/cuda/lib64/libnvshmem_host.so.3",
9598
"/usr/local/cuda/lib64/libcudnn_adv.so.9",
9699
"/usr/local/cuda/lib64/libcudnn_cnn.so.9",
97100
"/usr/local/cuda/lib64/libcudnn_graph.so.9",
98101
"/usr/local/cuda/lib64/libcudnn_ops.so.9",
99102
"/usr/local/cuda/lib64/libcudnn_engines_runtime_compiled.so.9",
100103
"/usr/local/cuda/lib64/libcudnn_engines_precompiled.so.9",
101104
"/usr/local/cuda/lib64/libcudnn_heuristic.so.9",
102-
"/lib64/libgomp.so.1",
103-
"/usr/lib64/libgfortran.so.5",
104-
"/acl/build/libarm_compute.so",
105-
"/acl/build/libarm_compute_graph.so",
106-
"/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
107-
"/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
108-
"/usr/local/lib/libnvpl_lapack_core.so.0",
109-
"/usr/local/lib/libnvpl_blas_core.so.0",
105+
"/usr/local/cuda/lib64/libcufile.so.0",
106+
"/usr/local/cuda/lib64/libcufile_rdma.so.1",
107+
"/usr/local/cuda/lib64/libcusparse.so.12",
110108
]
111109

112-
if "129" in desired_cuda:
113-
libs_to_copy += [
114-
"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.9",
115-
"/usr/local/cuda/lib64/libcufile.so.0",
116-
"/usr/local/cuda/lib64/libcufile_rdma.so.1",
110+
# CUDA version-specific libraries
111+
if "130" in desired_cuda:
112+
version_specific_libs = [
113+
"/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.13",
114+
"/usr/local/cuda/lib64/libcublas.so.13",
115+
"/usr/local/cuda/lib64/libcublasLt.so.13",
116+
"/usr/local/cuda/lib64/libcudart.so.13",
117+
"/usr/local/cuda/lib64/libcufft.so.12",
118+
"/usr/local/cuda/lib64/libcusolver.so.12",
119+
"/usr/local/cuda/lib64/libnvJitLink.so.13",
120+
"/usr/local/cuda/lib64/libnvrtc.so.13",
121+
"/usr/local/cuda/lib64/libnvrtc-builtins.so.13.0",
117122
]
123+
elif "12" in desired_cuda:
124+
# Get the last character for libnvrtc-builtins version (e.g., "129" -> "9")
125+
minor_version = desired_cuda[-1]
126+
version_specific_libs = [
127+
"/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12",
128+
"/usr/local/cuda/lib64/libcublas.so.12",
129+
"/usr/local/cuda/lib64/libcublasLt.so.12",
130+
"/usr/local/cuda/lib64/libcudart.so.12",
131+
"/usr/local/cuda/lib64/libcufft.so.11",
132+
"/usr/local/cuda/lib64/libcusolver.so.11",
133+
"/usr/local/cuda/lib64/libnvJitLink.so.12",
134+
"/usr/local/cuda/lib64/libnvrtc.so.12",
135+
f"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.{minor_version}",
136+
]
137+
138+
# Combine all libraries
139+
libs_to_copy = common_libs + version_specific_libs
118140

119141
# Copy libraries to unzipped_folder/a/lib
120142
for lib_path in libs_to_copy:
@@ -208,7 +230,7 @@ def parse_arguments():
208230
build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 "
209231
# MAX_JOBS=5 is not required for the CPU backend (see commit 465d98b)
210232
if enable_cuda:
211-
build_vars = "MAX_JOBS=5 " + build_vars
233+
build_vars += "MAX_JOBS=5 "
212234

213235
override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION")
214236
desired_cuda = os.getenv("DESIRED_CUDA")

.ci/aarch64_linux/build_aarch64_wheel.py

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -438,9 +438,7 @@ def build_torchvision(
438438
)
439439
build_vars += f"BUILD_VERSION={version}.dev{build_date}"
440440
elif build_version is not None:
441-
build_vars += (
442-
f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-')[0]}"
443-
)
441+
build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
444442
if host.using_docker():
445443
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
446444

@@ -495,9 +493,7 @@ def build_torchdata(
495493
)
496494
build_vars += f"BUILD_VERSION={version}.dev{build_date}"
497495
elif build_version is not None:
498-
build_vars += (
499-
f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-')[0]}"
500-
)
496+
build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
501497
if host.using_docker():
502498
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
503499

@@ -553,9 +549,7 @@ def build_torchtext(
553549
)
554550
build_vars += f"BUILD_VERSION={version}.dev{build_date}"
555551
elif build_version is not None:
556-
build_vars += (
557-
f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-')[0]}"
558-
)
552+
build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
559553
if host.using_docker():
560554
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
561555

@@ -613,9 +607,7 @@ def build_torchaudio(
613607
)
614608
build_vars += f"BUILD_VERSION={version}.dev{build_date}"
615609
elif build_version is not None:
616-
build_vars += (
617-
f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-')[0]}"
618-
)
610+
build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
619611
if host.using_docker():
620612
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
621613

.ci/docker/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -120,8 +120,8 @@ If your new Docker image needs a library installed from a specific pinned commit
120120
If you're introducing a new argument to the Docker build, make sure to add it in the Docker build step in `.ci/docker/build.sh`:
121121
```bash
122122
docker build \
123-
....
124-
--build-arg "NEW_ARG_1=${NEW_ARG_1}"
123+
....
124+
--build-arg "NEW_ARG_1=${NEW_ARG_1}"
125125
```
126126

127127
3. **Update Dockerfile logic**:

.ci/docker/almalinux/Dockerfile

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,10 @@ FROM cuda as cuda12.9
6464
RUN bash ./install_cuda.sh 12.9
6565
ENV DESIRED_CUDA=12.9
6666

67+
FROM cuda as cuda13.0
68+
RUN bash ./install_cuda.sh 13.0
69+
ENV DESIRED_CUDA=13.0
70+
6771
FROM ${ROCM_IMAGE} as rocm
6872
ENV PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
6973
ADD ./common/install_mkl.sh install_mkl.sh
@@ -76,10 +80,10 @@ ADD ./common/install_mnist.sh install_mnist.sh
7680
RUN bash ./install_mnist.sh
7781

7882
FROM base as all_cuda
79-
COPY --from=cuda11.8 /usr/local/cuda-11.8 /usr/local/cuda-11.8
8083
COPY --from=cuda12.6 /usr/local/cuda-12.6 /usr/local/cuda-12.6
8184
COPY --from=cuda12.8 /usr/local/cuda-12.8 /usr/local/cuda-12.8
8285
COPY --from=cuda12.9 /usr/local/cuda-12.9 /usr/local/cuda-12.9
86+
COPY --from=cuda13.0 /usr/local/cuda-13.0 /usr/local/cuda-13.0
8387

8488
# Final step
8589
FROM ${BASE_TARGET} as final

0 commit comments

Comments
 (0)