Skip to content

Commit b4673d3

Browse files
fix a few ci issues (#3833)
1 parent 1b33503 commit b4673d3

File tree

9 files changed

+62
-31
lines changed

9 files changed

+62
-31
lines changed

.github/scripts/filter-matrix.py

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
jetpack_cuda_versions: List[str] = ["cu126"]
1616

1717
jetpack_container_image: str = "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
18-
sbsa_container_image: str = "quay.io/pypa/manylinux_2_34_aarch64"
18+
sbsa_container_image: str = "quay.io/pypa/manylinux_2_39_aarch64"
1919

2020

2121
def validate_matrix(matrix_dict: Dict[str, Any]) -> None:
@@ -41,19 +41,15 @@ def filter_matrix_item(
4141
# Skipping disabled CUDA version
4242
return False
4343
if is_jetpack:
44-
if limit_pr_builds:
45-
# pr build,matrix passed from test-infra is cu128, python 3.9, change to cu126, python 3.10
46-
item["desired_cuda"] = "cu126"
47-
item["python_version"] = "3.10"
44+
# pr build: matrix passed from test-infra is cu126, cu128 and cu130 with python 3.10; filter to cu126, python 3.10
45+
# nightly/main build: matrix passed from test-infra is cu126, cu128 and cu130 with all python versions; filter to cu126, python 3.10
46+
if (
47+
item["python_version"] in jetpack_python_versions
48+
and item["desired_cuda"] in jetpack_cuda_versions
49+
):
4850
item["container_image"] = jetpack_container_image
4951
return True
50-
else:
51-
# nightly/main build, matrix passed from test-infra is cu128, all python versions, change to cu126, python 3.10
52-
if item["python_version"] in jetpack_python_versions:
53-
item["desired_cuda"] = "cu126"
54-
item["container_image"] = jetpack_container_image
55-
return True
56-
return False
52+
return False
5753
else:
5854
if item["gpu_arch_type"] == "cuda-aarch64":
5955
# pytorch image:pytorch/manylinuxaarch64-builder:cuda12.8 comes with glibc2.28

.github/scripts/install-cuda-aarch64.sh

Lines changed: 30 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -8,23 +8,42 @@ install_cuda_aarch64() {
88
# CUDA_MAJOR_VERSION: cu128 --> 12
99
CUDA_MAJOR_VERSION=${CU_VERSION:2:2}
1010
dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo
11-
# nccl version must match libtorch_cuda.so was built with https://github.com/pytorch/pytorch/blob/main/.ci/docker/ci_commit_pins/nccl-cu12.txt
12-
dnf -y install cuda-compiler-${CU_VER}.aarch64 \
11+
12+
# nccl version must match the one libtorch_cuda.so was built with
13+
if [[ ${CU_VERSION:0:4} == "cu12" ]]; then
14+
# cu12: https://github.com/pytorch/pytorch/blob/main/.ci/docker/ci_commit_pins/nccl-cu12.txt
15+
if [[ ${CU_VERSION} == "cu128" ]]; then
16+
nccl_version="2.26.2-1"
17+
elif [[ ${CU_VERSION} == "cu126" ]]; then
18+
nccl_version="2.24.3-1"
19+
else
20+
# removed cu129 support from pytorch upstream
21+
echo "Unsupported CUDA version: ${CU_VERSION}"
22+
exit 1
23+
fi
24+
elif [[ ${CU_VERSION:0:4} == "cu13" ]]; then
25+
# cu13: https://github.com/pytorch/pytorch/blob/main/.ci/docker/ci_commit_pins/nccl-cu13.txt
26+
nccl_version="2.27.7-1"
27+
fi
28+
29+
dnf --nogpgcheck -y install cuda-compiler-${CU_VER}.aarch64 \
1330
cuda-libraries-${CU_VER}.aarch64 \
1431
cuda-libraries-devel-${CU_VER}.aarch64 \
15-
libnccl-2.27.3-1+cuda${CU_DOT_VER} libnccl-devel-2.27.3-1+cuda${CU_DOT_VER} libnccl-static-2.27.3-1+cuda${CU_DOT_VER}
32+
libnccl-${nccl_version}+cuda${CU_DOT_VER} libnccl-devel-${nccl_version}+cuda${CU_DOT_VER} libnccl-static-${nccl_version}+cuda${CU_DOT_VER}
1633
dnf clean all
17-
18-
nvshmem_version=3.3.9
34+
# nvshmem version is from https://github.com/pytorch/pytorch/blob/f9fa138a3910bd1de1e7acb95265fa040672a952/.ci/docker/common/install_cuda.sh#L67
35+
nvshmem_version=3.3.24
1936
nvshmem_path="https://developer.download.nvidia.com/compute/redist/nvshmem/${nvshmem_version}/builds/cuda${CUDA_MAJOR_VERSION}/txz/agnostic/aarch64"
20-
nvshmem_filename="libnvshmem_cuda12-linux-sbsa-${nvshmem_version}.tar.gz"
21-
curl -L ${nvshmem_path}/${nvshmem_filename} -o nvshmem.tar.gz
22-
tar -xzf nvshmem.tar.gz
23-
cp -a libnvshmem/lib/* /usr/local/cuda/lib64/
24-
cp -a libnvshmem/include/* /usr/local/cuda/include/
25-
rm -rf nvshmem.tar.gz nvshmem
37+
nvshmem_prefix="libnvshmem-linux-sbsa-${nvshmem_version}_cuda${CUDA_MAJOR_VERSION}-archive"
38+
nvshmem_tarname="${nvshmem_prefix}.tar.xz"
39+
curl -L ${nvshmem_path}/${nvshmem_tarname} -o nvshmem.tar.xz
40+
tar -xJf nvshmem.tar.xz
41+
cp -a ${nvshmem_prefix}/lib/* /usr/local/cuda/lib64/
42+
cp -a ${nvshmem_prefix}/include/* /usr/local/cuda/include/
43+
rm -rf nvshmem.tar.xz ${nvshmem_prefix}
2644
echo "nvshmem ${nvshmem_version} for cuda ${CUDA_MAJOR_VERSION} installed successfully"
2745

46+
export PATH=/usr/local/cuda/bin:$PATH
2847
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/include:/usr/lib64:$LD_LIBRARY_PATH
2948
ls -lart /usr/local/
3049
nvcc --version
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# for now we only need to install cuda_dss for jetpack
2+
install_cuda_dss_aarch64() {
3+
echo "install cuda_dss for ${CU_VERSION}"
4+
arch_path='sbsa'
5+
# version is from https://github.com/pytorch/pytorch/blob/22c5e8c17c7551c9dd2855589ae774c1e147343a/.ci/docker/common/install_cudss.sh
6+
CUDSS_NAME="libcudss-linux-${arch_path}-0.3.0.9_cuda12-archive"
7+
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cudss/redist/libcudss/linux-${arch_path}/${CUDSS_NAME}.tar.xz
8+
# only for cuda 12
9+
tar xf ${CUDSS_NAME}.tar.xz
10+
cp -a ${CUDSS_NAME}/include/* /usr/local/cuda/include/
11+
cp -a ${CUDSS_NAME}/lib/* /usr/local/cuda/lib64/
12+
}

.github/workflows/build_linux.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -348,8 +348,8 @@ jobs:
348348
source "${BUILD_ENV_FILE}"
349349
WHEEL_NAME=$(ls "${{ inputs.repository }}/dist/")
350350
echo "$WHEEL_NAME"
351-
if [[ ${{ inputs.is-jetpack }} == true ]]; then
352-
echo "Skipping smoke test for jetpack, since it is not the actual jetpack environment"
351+
if [[ ${{ inputs.architecture }} == "aarch64" ]]; then
352+
echo "Skipping smoke test for aarch64, since it is not an actual gpu runner"
353353
else
354354
${CONDA_RUN} pip install "${{ inputs.repository }}/dist/$WHEEL_NAME"
355355
# Checking that we have a pinned version of torch in our dependency tree

MODULE.bazel

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ new_local_repository(
5151
new_local_repository(
5252
name = "cuda_win",
5353
build_file = "@//third_party/cuda:BUILD",
54-
path = "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.9/",
54+
path = "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.0/",
5555
)
5656

5757
http_archive = use_repo_rule("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")

packaging/pre_build_script.sh

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,15 @@ if [[ $(uname -m) == "aarch64" ]]; then
1212
if [[ ${os_name} == "ubuntu" ]]; then
1313
IS_JETPACK=true
1414
apt-get update
15-
apt-get install -y ninja-build gettext curl libopenblas-dev zip unzip
15+
apt-get install -y ninja-build gettext curl libopenblas-dev zip unzip libfmt-dev
1616
else
1717
IS_SBSA=true
1818
yum install -y ninja-build gettext zip unzip
19+
yum install -y fmt-devel
1920
fi
2021
else
2122
BAZEL_PLATFORM="amd64"
23+
yum install -y fmt-devel
2224
fi
2325

2426

@@ -43,6 +45,8 @@ pip uninstall -y torch torchvision
4345

4446
if [[ ${IS_JETPACK} == true ]]; then
4547
# install torch 2.8 for jp6.2
48+
source .github/scripts/install-cuda-dss.sh
49+
install_cuda_dss_aarch64
4650
pip install torch==2.8.0 --index-url=https://pypi.jetson-ai-lab.io/jp6/cu126/
4751
else
4852
TORCH=$(grep "^torch>" py/requirements.txt)

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ def load_dep_info():
8585

8686
dir_path = os.path.join(str(get_root_dir()), "py")
8787

88-
IS_AARCH64 = platform.uname().processor == "aarch64"
88+
IS_AARCH64 = platform.machine() == "aarch64"
8989
IS_JETPACK = False
9090

9191
PY_ONLY = False

tests/py/dynamo/partitioning/test_hierarchical_partitioning.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -242,17 +242,17 @@ def test_hierarchical_adjacency_partition_with_two_backends_with_torch_executed_
242242
)
243243
from torch_tensorrt.dynamo.lowering import (
244244
get_decompositions,
245+
post_lowering,
245246
pre_export_lowering,
246247
)
247248

248249
model = self.SimpleModel().cuda().eval()
249250
example_input = torch.randn(1, 3, 224, 224).cuda()
250-
251251
exported_program = torch.export.export(model, (example_input,))
252252
exported_program = pre_export_lowering(exported_program)
253253
exported_program = exported_program.run_decompositions(get_decompositions())
254254
gm = exported_program.module()
255-
255+
gm = post_lowering(gm)
256256
partitioned_graph, _ = partitioning.hierarchical_adjacency_partition(
257257
gm,
258258
min_block_size=1,

toolchains/ci_workspaces/MODULE.bazel.tmpl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ new_local_repository(
4545
new_local_repository(
4646
name = "cuda_l4t",
4747
build_file = "@//third_party/cuda:BUILD",
48-
path = "/usr/local/cuda-12.9",
48+
path = "/usr/local/cuda-12.6",
4949
)
5050

5151
new_local_repository(

0 commit comments

Comments (0)