
Commit 97f8ab5

Merge remote-tracking branch 'upstream/main' into rocm7.1_internal_testing

2 parents: d5c9823 + 625108e

File tree: 7,301 files changed (+354,796 / -149,457 lines)


.bazelrc

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@ build --cxxopt=--std=c++17
 build --copt=-I.
 # Bazel does not support including its cc_library targets as system
 # headers. We work around this for generated code
-# (e.g. c10/macros/cmake_macros.h) by making the generated directory a
+# (e.g. torch/headeronly/macros/cmake_macros.h) by making the generated directory a
 # system include path.
 build --copt=-isystem --copt bazel-out/k8-fastbuild/bin
 build --copt=-isystem --copt bazel-out/darwin-fastbuild/bin

.ci/aarch64_linux/aarch64_ci_build.sh

Lines changed: 3 additions & 4 deletions
@@ -3,10 +3,8 @@ set -eux -o pipefail
 
 GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}
 
-if [[ "$GPU_ARCH_VERSION" == *"12.6"* ]]; then
-    export TORCH_CUDA_ARCH_LIST="9.0"
-elif [[ "$GPU_ARCH_VERSION" == *"12.8"* ]]; then
-    export TORCH_CUDA_ARCH_LIST="9.0;10.0;12.0"
+if [[ "$GPU_ARCH_VERSION" == *"12.9"* ]]; then
+    export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
 fi
 
 SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
@@ -27,6 +25,7 @@ if [ "$DESIRED_CUDA" = "cpu" ]; then
     USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
 else
     echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA"
+    export USE_SYSTEM_NCCL=1
     #USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files
     USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
 fi
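With this change the aarch64 CI wrapper only special-cases CUDA 12.9 (covering SM 8.0 through 12.0) and forces the system NCCL for CUDA builds. As a rough illustration of how that branch behaves, the sketch below mimics the logic with a hard-coded GPU_ARCH_VERSION; the value is illustrative, not taken from any CI job definition:

```bash
#!/bin/bash
# Illustrative sketch only: reproduces the arch-list selection shown in the
# diff above. In CI, GPU_ARCH_VERSION is supplied by the job environment.
GPU_ARCH_VERSION="cuda12.9"

if [[ "$GPU_ARCH_VERSION" == *"12.9"* ]]; then
    export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
fi
export USE_SYSTEM_NCCL=1   # CUDA builds now link against the system NCCL

echo "TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST:-<unset>}"
```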

.ci/aarch64_linux/aarch64_wheel_ci_build.py

Lines changed: 57 additions & 44 deletions
@@ -31,40 +31,55 @@ def build_ArmComputeLibrary() -> None:
         "build=native",
     ]
     acl_install_dir = "/acl"
-    acl_checkout_dir = "ComputeLibrary"
-    os.makedirs(acl_install_dir)
-    check_call(
-        [
-            "git",
-            "clone",
-            "https://github.com/ARM-software/ComputeLibrary.git",
-            "-b",
-            "v25.02",
-            "--depth",
-            "1",
-            "--shallow-submodules",
-        ]
-    )
+    acl_checkout_dir = os.getenv("ACL_SOURCE_DIR", "ComputeLibrary")
+    if os.path.isdir(acl_install_dir):
+        shutil.rmtree(acl_install_dir)
+    if not os.path.isdir(acl_checkout_dir) or not len(os.listdir(acl_checkout_dir)):
+        check_call(
+            [
+                "git",
+                "clone",
+                "https://github.com/ARM-software/ComputeLibrary.git",
+                "-b",
+                "v25.02",
+                "--depth",
+                "1",
+                "--shallow-submodules",
+            ]
+        )
 
     check_call(
-        ["scons", "Werror=1", "-j8", f"build_dir=/{acl_install_dir}/build"]
-        + acl_build_flags,
+        ["scons", "Werror=1", f"-j{os.cpu_count()}"] + acl_build_flags,
         cwd=acl_checkout_dir,
     )
-    for d in ["arm_compute", "include", "utils", "support", "src"]:
+    for d in ["arm_compute", "include", "utils", "support", "src", "build"]:
         shutil.copytree(f"{acl_checkout_dir}/{d}", f"{acl_install_dir}/{d}")
 
 
-def update_wheel(wheel_path, desired_cuda) -> None:
+def replace_tag(filename) -> None:
+    with open(filename) as f:
+        lines = f.readlines()
+    for i, line in enumerate(lines):
+        if line.startswith("Tag:"):
+            lines[i] = line.replace("-linux_", "-manylinux_2_28_")
+            print(f"Updated tag from {line} to {lines[i]}")
+            break
+
+    with open(filename, "w") as f:
+        f.writelines(lines)
+
+
+def package_cuda_wheel(wheel_path, desired_cuda) -> None:
     """
-    Update the cuda wheel libraries
+    Package the cuda wheel libraries
     """
     folder = os.path.dirname(wheel_path)
     wheelname = os.path.basename(wheel_path)
     os.mkdir(f"{folder}/tmp")
     os.system(f"unzip {wheel_path} -d {folder}/tmp")
     libs_to_copy = [
         "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12",
+        "/usr/local/cuda/extras/CUPTI/lib64/libnvperf_host.so",
         "/usr/local/cuda/lib64/libcudnn.so.9",
         "/usr/local/cuda/lib64/libcublas.so.12",
         "/usr/local/cuda/lib64/libcublasLt.so.12",
@@ -74,7 +89,7 @@ def update_wheel(wheel_path, desired_cuda) -> None:
         "/usr/local/cuda/lib64/libcusparseLt.so.0",
         "/usr/local/cuda/lib64/libcusolver.so.11",
         "/usr/local/cuda/lib64/libcurand.so.10",
-        "/usr/local/cuda/lib64/libnvToolsExt.so.1",
+        "/usr/local/cuda/lib64/libnccl.so.2",
         "/usr/local/cuda/lib64/libnvJitLink.so.12",
         "/usr/local/cuda/lib64/libnvrtc.so.12",
         "/usr/local/cuda/lib64/libcudnn_adv.so.9",
@@ -88,30 +103,19 @@ def update_wheel(wheel_path, desired_cuda) -> None:
         "/usr/lib64/libgfortran.so.5",
         "/acl/build/libarm_compute.so",
         "/acl/build/libarm_compute_graph.so",
+        "/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
+        "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
+        "/usr/local/lib/libnvpl_lapack_core.so.0",
+        "/usr/local/lib/libnvpl_blas_core.so.0",
     ]
-    if enable_cuda:
-        libs_to_copy += [
-            "/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
-            "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
-            "/usr/local/lib/libnvpl_lapack_core.so.0",
-            "/usr/local/lib/libnvpl_blas_core.so.0",
-        ]
-        if "126" in desired_cuda:
-            libs_to_copy += [
-                "/usr/local/cuda/lib64/libnvrtc-builtins.so.12.6",
-                "/usr/local/cuda/lib64/libcufile.so.0",
-                "/usr/local/cuda/lib64/libcufile_rdma.so.1",
-            ]
-        elif "128" in desired_cuda:
-            libs_to_copy += [
-                "/usr/local/cuda/lib64/libnvrtc-builtins.so.12.8",
-                "/usr/local/cuda/lib64/libcufile.so.0",
-                "/usr/local/cuda/lib64/libcufile_rdma.so.1",
-            ]
-    else:
+
+    if "129" in desired_cuda:
         libs_to_copy += [
-            "/opt/OpenBLAS/lib/libopenblas.so.0",
+            "/usr/local/cuda/lib64/libnvrtc-builtins.so.12.9",
+            "/usr/local/cuda/lib64/libcufile.so.0",
+            "/usr/local/cuda/lib64/libcufile_rdma.so.1",
        ]
+
 
     # Copy libraries to unzipped_folder/a/lib
     for lib_path in libs_to_copy:
@@ -120,6 +124,13 @@ def update_wheel(wheel_path, desired_cuda) -> None:
             f"cd {folder}/tmp/torch/lib/; "
             f"patchelf --set-rpath '$ORIGIN' --force-rpath {folder}/tmp/torch/lib/{lib_name}"
         )
+
+    # Make sure the wheel is tagged with manylinux_2_28
+    for f in os.scandir(f"{folder}/tmp/"):
+        if f.is_dir() and f.name.endswith(".dist-info"):
+            replace_tag(f"{f.path}/WHEEL")
+            break
+
     os.mkdir(f"{folder}/cuda_wheel")
     os.system(f"cd {folder}/tmp/; zip -r {folder}/cuda_wheel/{wheelname} *")
     shutil.move(
@@ -194,8 +205,10 @@ def parse_arguments():
     ).decode()
 
     print("Building PyTorch wheel")
-    build_vars = "MAX_JOBS=5 CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 "
-    os.system("cd /pytorch; python setup.py clean")
+    build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 "
+    # MAX_JOB=5 is not required for CPU backend (see commit 465d98b)
+    if enable_cuda:
+        build_vars = "MAX_JOBS=5 " + build_vars
 
     override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION")
     desired_cuda = os.getenv("DESIRED_CUDA")
@@ -242,6 +255,6 @@ def parse_arguments():
         print("Updating Cuda Dependency")
         filename = os.listdir("/pytorch/dist/")
        wheel_path = f"/pytorch/dist/{filename[0]}"
-        update_wheel(wheel_path, desired_cuda)
+        package_cuda_wheel(wheel_path, desired_cuda)
     pytorch_wheel_name = complete_wheel("/pytorch/")
     print(f"Build Complete. Created {pytorch_wheel_name}..")

.ci/caffe2/README.md

Lines changed: 0 additions & 2 deletions
@@ -10,5 +10,3 @@ example: `py2-cuda9.0-cudnn7-ubuntu16.04`. The Docker images that are
 built on Jenkins and are used in triggered builds already have this
 environment variable set in their manifest. Also see
 `./docker/jenkins/*/Dockerfile` and search for `BUILD_ENVIRONMENT`.
-
-Our Jenkins installation is located at https://ci.pytorch.org/jenkins/.

.ci/caffe2/test.sh

Lines changed: 0 additions & 4 deletions
@@ -13,10 +13,6 @@ if [[ "${BUILD_ENVIRONMENT}" == *-android* ]]; then
   echo 'Skipping tests'
   exit 0
 fi
-if [[ "${BUILD_ENVIRONMENT}" == *-rocm* ]]; then
-  # temporary to locate some kernel issues on the CI nodes
-  export HSAKMT_DEBUG_LEVEL=4
-fi
 # These additional packages are needed for circleci ROCm builds.
 if [[ $BUILD_ENVIRONMENT == *rocm* ]]; then
   # Need networkx 2.0 because bellmand_ford was moved in 2.1 . Scikit-image by

.ci/docker/README.md

Lines changed: 102 additions & 1 deletion
@@ -34,5 +34,106 @@ See `build.sh` for valid build environments (it's the giant switch).
 ./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest
 
 # Set flags (see build.sh) and build image
-sudo bash -c 'PROTOBUF=1 ./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest
+sudo bash -c 'TRITON=1 ./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest
 ```
+
+## [Guidance] Adding a New Base Docker Image
+
+### Background
+
+The base Docker images in directory `.ci/docker/` are built by the `docker-builds.yml` workflow. Those images are used throughout the PyTorch CI/CD pipeline. You should only create or modify a base Docker image if you need specific environment changes or dependencies before building PyTorch on CI.
+
+1. **Automatic Rebuilding**:
+   - The Docker image building process is triggered automatically when changes are made to files in the `.ci/docker/*` directory
+   - This ensures all images stay up-to-date with the latest dependencies and configurations
+
+2. **Image Reuse in PyTorch Build Workflows** (example: linux-build):
+   - The images generated by `docker-builds.yml` are reused in `_linux-build.yml` through the `calculate-docker-image` step
+   - The `_linux-build.yml` workflow:
+     - Pulls the Docker image determined by the `calculate-docker-image` step
+     - Runs a Docker container with that image
+     - Executes `.ci/pytorch/build.sh` inside the container to build PyTorch
+
+3. **Usage in Test Workflows** (example: linux-test):
+   - The same Docker images are also used in `_linux-test.yml` for running tests
+   - The `_linux-test.yml` workflow follows a similar pattern:
+     - It uses the `calculate-docker-image` step to determine which Docker image to use
+     - It pulls the Docker image and runs a container with that image
+     - It installs the wheels from the artifacts generated by PyTorch build jobs
+     - It executes test scripts (like `.ci/pytorch/test.sh` or `.ci/pytorch/multigpu-test.sh`) inside the container
+
+### Understanding File Purposes
+
+#### `.ci/docker/build.sh` vs `.ci/pytorch/build.sh`
+- **`.ci/docker/build.sh`**:
+  - Used for building base Docker images
+  - Executed by the `docker-builds.yml` workflow to pre-build Docker images for CI
+  - Contains configurations for different Docker build environments
+
+- **`.ci/pytorch/build.sh`**:
+  - Used for building PyTorch inside a Docker container
+  - Called by workflows like `_linux-build.yml` after the Docker container is started
+  - Builds PyTorch wheels and other artifacts
+
+#### `.ci/docker/ci_commit_pins/` vs `.github/ci_commit_pins`
+- **`.ci/docker/ci_commit_pins/`**:
+  - Used for pinning dependency versions during base Docker image building
+  - Ensures consistent environments for building PyTorch
+  - Changes here trigger base Docker image rebuilds
+
+- **`.github/ci_commit_pins`**:
+  - Used for pinning dependency versions during PyTorch building and tests
+  - Ensures consistent dependencies for PyTorch across different builds
+  - Used by build scripts running inside Docker containers
+
+### Step-by-Step Guide for Adding a New Base Docker Image
+
+#### 1. Add Pinned Commits (If Applicable)
+
+We use pinned commits for build stability. The `nightly.yml` workflow checks and updates pinned commits for certain repository dependencies daily.
+
+If your new Docker image needs a library installed from a specific pinned commit or built from source:
+
+1. Add the repository you want to track in `nightly.yml` and `merge-rules.yml`
+2. Add the initial pinned commit in `.ci/docker/ci_commit_pins/`. The text filename should match the one defined in step 1
+
+#### 2. Configure the Base Docker Image
+1. **Add new Base Docker image configuration** (if applicable):
+
+   Add the configuration in `.ci/docker/build.sh`. For example:
+   ```bash
+   pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-new1)
+     CUDA_VERSION=12.8.1
+     ANACONDA_PYTHON_VERSION=3.12
+     GCC_VERSION=11
+     VISION=yes
+     KATEX=yes
+     UCX_COMMIT=${_UCX_COMMIT}
+     UCC_COMMIT=${_UCC_COMMIT}
+     TRITON=yes
+     NEW_ARG_1=yes
+     ;;
+   ```
+
+2. **Add build arguments to Docker build command**:
+
+   If you're introducing a new argument to the Docker build, make sure to add it in the Docker build step in `.ci/docker/build.sh`:
+   ```bash
+   docker build \
+     ....
+     --build-arg "NEW_ARG_1=${NEW_ARG_1}"
+   ```
+
+3. **Update Dockerfile logic**:
+
+   Update the Dockerfile to use the new argument. For example, in `ubuntu/Dockerfile`:
+   ```dockerfile
+   ARG NEW_ARG_1
+   # Set up environment for NEW_ARG_1
+   RUN if [ -n "${NEW_ARG_1}" ]; then bash ./do_something.sh; fi
+   ```
+
+4. **Add the Docker configuration** in `.github/workflows/docker-builds.yml`:
+
+   The `docker-builds.yml` workflow pre-builds the Docker images whenever changes occur in the `.ci/docker/` directory. This includes the pinned commit updates.
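Following the steps the new README section describes, a local smoke test of such a configuration might look like the sketch below; the image name, tag, and `NEW_ARG_1` flag are the placeholder values from the README's own example, not a real CI configuration:

```bash
#!/bin/bash
# Build the hypothetical new image configuration locally
# (placeholder names taken from the README example above).
cd .ci/docker
./build.sh pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-new1 -t myimage:latest

# Sanity-check that the image was produced.
docker image inspect myimage:latest --format 'Created: {{.Created}}'
```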
