Skip to content

Commit 6f12be2

Browse files
pytorchbot, atalman, seemethere
authored
CUDA 13.0 builds fix on Amazon Linux 2023 (pytorch#164893)
CUDA 13.0 builds fix on Amazon Linux 2023 (pytorch#164870)

During 2.9 rc testing I am seeing an issue on Amazon Linux 2023 with CUDA 13.0 builds.

This is related to: pytorch#152756

Workflow: https://github.com/pytorch/test-infra/actions/runs/18324074610/job/52184079262

Error:
```
WARNING: There was an error checking the latest version of pip.
+ python3.11 .ci/pytorch/smoke_test/smoke_test.py --package torchonly
Traceback (most recent call last):
  File "/usr/local/lib64/python3.11/site-packages/torch/__init__.py", line 333, in _load_global_deps
    ctypes.CDLL(global_deps_lib_path, mode=ctypes.RTLD_GLOBAL)
  File "/usr/lib64/python3.11/ctypes/__init__.py", line 376, in __init__
    self._handle = _dlopen(self._name, mode)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^
OSError: libcudart.so.13: cannot open shared object file: No such file or directory

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/pytorch/pytorch/.ci/pytorch/smoke_test/smoke_test.py", line 12, in <module>
    import torch
  File "/usr/local/lib64/python3.11/site-packages/torch/__init__.py", line 425, in <module>
    _load_global_deps()
  File "/usr/local/lib64/python3.11/site-packages/torch/__init__.py", line 383, in _load_global_deps
    _preload_cuda_deps(lib_folder, lib_name)
  File "/usr/local/lib64/python3.11/site-packages/torch/__init__.py", line 317, in _preload_cuda_deps
    raise ValueError(f"{lib_name} not found in the system path {sys.path}")
Traceback (most recent call last):
ValueError: libnvToolsExt.so.*[0-9] not found in the system path ['/pytorch/pytorch/.ci/pytorch/smoke_test', '/usr/lib64/python311.zip', '/usr/lib64/python3.11', '/usr/lib64/python3.11/lib-dynload', '/usr/local/lib64/python3.11/site-packages', '/usr/local/lib/python3.11/site-packages', '/usr/lib64/python3.11/site-packages', '/usr/lib/python3.11/site-packages']
  File "/home/ec2-user/actions-runner/_work/test-infra/test-infra/test-infra/.github/scripts/run_with_env_secrets.py", line 102, in <module>
    main()
  File "/home/ec2-user/actions-runner/_work/test-infra/test-infra/test-infra/.github/scripts/run_with_env_secrets.py", line 98, in main
    run_cmd_or_die(f"docker exec -t {container_name} /exec")
  File "/home/ec2-user/actions-runner/_work/test-infra/test-infra/test-infra/.github/scripts/run_with_env_secrets.py", line 39, in run_cmd_or_die
    raise RuntimeError(f"Command {cmd} failed with exit code {exit_code}")
RuntimeError: Command docker exec -t 7d9c5bd403cac9a9ee824d63a1d6f6057ecce89a7daa94a81617dbf8eff0ff2e /exec failed with exit code 1
```

Pull Request resolved: pytorch#164870
Approved by: https://github.com/Camyll

(cherry picked from commit 483f4e0)

Co-authored-by: atalman <[email protected]>
Co-authored-by: Eli Uriegas <[email protected]>
1 parent 42f0c2c commit 6f12be2

File tree

1 file changed

+7
-6
lines changed

1 file changed

+7
-6
lines changed

torch/__init__.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -302,7 +302,7 @@ def _get_cuda_dep_paths(path: str, lib_folder: str, lib_name: str) -> list[str]:
302302
return nvidia_lib_paths + lib_paths
303303

304304

305-
def _preload_cuda_deps(lib_folder: str, lib_name: str) -> None:
305+
def _preload_cuda_deps(lib_folder: str, lib_name: str, required: bool = True) -> None: # type: ignore[valid-type]
306306
"""Preloads cuda deps if they could not be found otherwise."""
307307
# Should only be called on Linux if default path resolution have failed
308308
assert platform.system() == "Linux", "Should only be called on Linux"
@@ -313,9 +313,10 @@ def _preload_cuda_deps(lib_folder: str, lib_name: str) -> None:
313313
if candidate_lib_paths:
314314
lib_path = candidate_lib_paths[0]
315315
break
316-
if not lib_path:
316+
if not lib_path and required:
317317
raise ValueError(f"{lib_name} not found in the system path {sys.path}")
318-
ctypes.CDLL(lib_path)
318+
if lib_path:
319+
ctypes.CDLL(lib_path)
319320

320321

321322
# See Note [Global dependencies]
@@ -354,8 +355,6 @@ def _load_global_deps() -> None:
354355
except OSError as err:
355356
# Can only happen for wheel with cuda libs as PYPI deps
356357
# As PyTorch is not purelib, but nvidia-*-cu12 is
357-
from torch.version import cuda as cuda_version
358-
359358
cuda_libs: dict[str, str] = {
360359
"cublas": "libcublas.so.*[0-9]",
361360
"cudnn": "libcudnn.so.*[0-9]",
@@ -369,7 +368,6 @@ def _load_global_deps() -> None:
369368
"cusparselt": "libcusparseLt.so.*[0-9]",
370369
"cusolver": "libcusolver.so.*[0-9]",
371370
"nccl": "libnccl.so.*[0-9]",
372-
"nvtx": "libnvToolsExt.so.*[0-9]",
373371
"nvshmem": "libnvshmem_host.so.*[0-9]",
374372
"cufile": "libcufile.so.*[0-9]",
375373
}
@@ -381,6 +379,9 @@ def _load_global_deps() -> None:
381379
raise err
382380
for lib_folder, lib_name in cuda_libs.items():
383381
_preload_cuda_deps(lib_folder, lib_name)
382+
383+
# libnvToolsExt is Optional Dependency
384+
_preload_cuda_deps("nvtx", "libnvToolsExt.so.*[0-9]", required=False)
384385
ctypes.CDLL(global_deps_lib_path, mode=ctypes.RTLD_GLOBAL)
385386

386387

0 commit comments

Comments
 (0)