Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 110 additions & 0 deletions python/ray/_private/runtime_env/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,3 +66,113 @@

# Whether to use ray whl when `install_ray` is True in the container.
RAY_PODMAN_UES_WHL_PACKAGE = env_bool("RAY_PODMAN_UES_WHL_PACKAGE", False)

# GPU devices visible on this node, read from NVIDIA_VISIBLE_DEVICES.
# Falsy (None/empty) means the node is treated as CPU-only; a non-empty
# value switches container setup into GPU mode.
VGPU_DEVICES = os.getenv("NVIDIA_VISIBLE_DEVICES")

# Glob patterns for NVIDIA driver libraries, firmware, config files, and
# binaries that are discovered on the host and mounted (read-only) into a
# GPU container. Patterns are grouped by install location; unmatched
# patterns are simply skipped at discovery time.
NVIDIA_PATTERNS = [
    # x86_64 libraries
    "/usr/lib/x86_64-linux-gnu/libnvidia-*.so",
    "/usr/lib/x86_64-linux-gnu/libcuda.so",
    "/usr/lib/x86_64-linux-gnu/libvdpau_nvidia.so",
    "/usr/lib/x86_64-linux-gnu/libnvcuvid.so",
    "/usr/lib/x86_64-linux-gnu/libnvidia-*.so.*",
    "/usr/lib/x86_64-linux-gnu/libcuda.so.*",
    "/usr/lib/x86_64-linux-gnu/libvdpau_nvidia.so.*",
    "/usr/lib/x86_64-linux-gnu/libnvcuvid.so.*",
    "/usr/lib/x86_64-linux-gnu/libnvidia-ml.so*",
    "/usr/lib/x86_64-linux-gnu/libnvidia-cfg.so*",
    "/usr/lib/x86_64-linux-gnu/libcudadebugger.so*",
    "/usr/lib/x86_64-linux-gnu/libnvidia-opencl.so*",
    "/usr/lib/x86_64-linux-gnu/libnvidia-gpucomp.so*",
    "/usr/lib/x86_64-linux-gnu/libnvidia-ptxjitcompiler.so*",
    "/usr/lib/x86_64-linux-gnu/libnvidia-allocator.so*",
    "/usr/lib/x86_64-linux-gnu/libnvidia-pkcs11.so*",
    "/usr/lib/x86_64-linux-gnu/libnvidia-pkcs11-openssl3.so*",
    "/usr/lib/x86_64-linux-gnu/libnvidia-nvvm.so*",
    "/usr/lib/x86_64-linux-gnu/libnvidia-ngx.so*",
    "/usr/lib/x86_64-linux-gnu/libnvidia-encode.so*",
    "/usr/lib/x86_64-linux-gnu/libnvidia-opticalflow.so*",
    "/usr/lib/x86_64-linux-gnu/libnvidia-eglcore.so*",
    "/usr/lib/x86_64-linux-gnu/libnvidia-glcore.so*",
    "/usr/lib/x86_64-linux-gnu/libnvidia-tls.so*",
    "/usr/lib/x86_64-linux-gnu/libnvidia-glsi.so*",
    "/usr/lib/x86_64-linux-gnu/libnvidia-fbc.so*",
    "/usr/lib/x86_64-linux-gnu/libnvidia-rtcore.so*",
    "/usr/lib/x86_64-linux-gnu/libnvoptix.so*",
    "/usr/lib/x86_64-linux-gnu/libGLX_nvidia.so*",
    "/usr/lib/x86_64-linux-gnu/libEGL_nvidia.so*",
    "/usr/lib/x86_64-linux-gnu/libGLESv2_nvidia.so*",
    "/usr/lib/x86_64-linux-gnu/libGLESv1_CM_nvidia.so*",
    "/usr/lib/x86_64-linux-gnu/libnvidia-glvkspirv.so*",
    # i386 libraries
    "/usr/lib/i386-linux-gnu/libnvidia-*.so",
    "/usr/lib/i386-linux-gnu/libcuda.so",
    "/usr/lib/i386-linux-gnu/libvdpau_nvidia.so",
    "/usr/lib/i386-linux-gnu/libnvcuvid.so",
    "/usr/lib/i386-linux-gnu/libnvidia-*.so.*",
    "/usr/lib/i386-linux-gnu/libcuda.so.*",
    "/usr/lib/i386-linux-gnu/libvdpau_nvidia.so.*",
    "/usr/lib/i386-linux-gnu/libnvcuvid.so.*",
    "/usr/lib/i386-linux-gnu/libnvidia-ml.so*",
    "/usr/lib/i386-linux-gnu/libnvidia-opencl.so*",
    "/usr/lib/i386-linux-gnu/libnvidia-gpucomp.so*",
    "/usr/lib/i386-linux-gnu/libnvidia-ptxjitcompiler.so*",
    "/usr/lib/i386-linux-gnu/libnvidia-allocator.so*",
    "/usr/lib/i386-linux-gnu/libnvidia-nvvm.so*",
    "/usr/lib/i386-linux-gnu/libnvidia-encode.so*",
    "/usr/lib/i386-linux-gnu/libnvidia-opticalflow.so*",
    "/usr/lib/i386-linux-gnu/libnvidia-eglcore.so*",
    "/usr/lib/i386-linux-gnu/libnvidia-glcore.so*",
    "/usr/lib/i386-linux-gnu/libnvidia-tls.so*",
    "/usr/lib/i386-linux-gnu/libnvidia-glsi.so*",
    "/usr/lib/i386-linux-gnu/libnvidia-fbc.so*",
    "/usr/lib/i386-linux-gnu/libGLX_nvidia.so*",
    "/usr/lib/i386-linux-gnu/libEGL_nvidia.so*",
    "/usr/lib/i386-linux-gnu/libGLESv2_nvidia.so*",
    "/usr/lib/i386-linux-gnu/libGLESv1_CM_nvidia.so*",
    "/usr/lib/i386-linux-gnu/libnvidia-glvkspirv.so*",
    # lib64 libraries
    "/usr/lib64/libnvidia-*.so",
    "/usr/lib64/libcuda.so",
    "/usr/lib64/libGLX_nvidia.so",
    "/usr/lib64/libEGL_nvidia.so",
    "/usr/lib64/libGLESv*_nvidia.so",
    "/usr/lib64/libnvidia-*.so.*",
    "/usr/lib64/libcuda.so.*",
    "/usr/lib64/libGLX_nvidia.so.*",
    "/usr/lib64/libEGL_nvidia.so.*",
    "/usr/lib64/libGLESv*_nvidia.so.*",
    "/usr/lib64/libnvidia-egl-gbm.so*",
    "/usr/lib64/libnvidia-egl-wayland.so*",
    # lib libraries
    "/usr/lib/libnvidia-*.so",
    "/usr/lib/libcuda.so",
    "/usr/lib/libvdpau_nvidia.so",
    "/usr/lib/libnvcuvid.so",
    "/usr/lib/libnvidia-*.so.*",
    "/usr/lib/libcuda.so.*",
    "/usr/lib/libvdpau_nvidia.so.*",
    "/usr/lib/libnvcuvid.so.*",
    # firmware files
    "/usr/lib/firmware/nvidia/*/gsp_*.bin",
    "/usr/share/nvidia/nvoptix.bin",
    # Vulkan and EGL config files
    "/etc/vulkan/icd.d/nvidia*.json",
    "/etc/vulkan/implicit_layer.d/nvidia*.json",
    "/usr/share/egl/egl_external_platform.d/*nvidia*.json",
    "/usr/share/glvnd/egl_vendor.d/*nvidia*.json",
    # NVIDIA binaries
    "/usr/bin/nvidia-*",
    "/usr/bin/nv-fabricmanager",
    "/usr/bin/nvidia-smi",
    "/usr/bin/nvidia-debugdump",
    "/usr/bin/nvidia-persistenced",
    "/usr/bin/nvidia-cuda-mps-control",
    "/usr/bin/nvidia-cuda-mps-server",
    # Xorg modules
    "/usr/lib64/xorg/modules/drivers/nvidia_drv.so*",
    "/usr/lib64/xorg/modules/extensions/libglxserver_nvidia.so*",
    # Additional system libraries
    "/usr/lib64/libsysconf-alipay.so*",
    "/usr/local/cuda/compat/",
]
126 changes: 118 additions & 8 deletions python/ray/_private/runtime_env/image_uri.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,16 @@
import os
import tempfile
import shlex
from typing import List, Optional
import glob
from typing import List, Optional, Set
import ray._private.runtime_env.constants as runtime_env_constants
from ray._private.runtime_env.context import RuntimeEnvContext
from ray._private.runtime_env.plugin import RuntimeEnvPlugin
from ray._private.ray_constants import RAY_NODE_LOGS_DIR

from ray._private.utils import (
get_ray_site_packages_path,
discover_files_by_patterns,
get_ray_python_path,
get_pyenv_path,
try_parse_default_mount_points,
try_parse_container_run_options,
Expand Down Expand Up @@ -208,7 +210,6 @@ def _modify_container_context_impl(
container_command.append(context.working_dir)
else:
container_command.append(os.getcwd())
container_command.append("--cap-add=AUDIT_WRITE")

redirected_pyenv_folder = None
if container_install_ray or container_pip_packages:
Expand All @@ -219,26 +220,28 @@ def _modify_container_context_impl(
container_to_host_mount_dict[get_ray_whl_dir()] = get_ray_whl_dir()

if not container_install_ray:
# mount ray package site path
host_site_packages_path = get_ray_site_packages_path()
# Mount only the ray package path.
# Do not overwrite the podman container's site-packages because they may include necessary packages.
host_site_packages_path = get_ray_python_path()

# If the user specifies a `py_executable` in the container
# and it starts with the ${PYENV_ROOT} environment variable (indicating a PYENV-managed executable),
# we define `redirected_pyenv_folder` as `ray/.pyenv`.
# This ensures that all .pyenv-related paths are redirected
# to avoid overwriting the container's internal PYENV environment
# (which defaults to `/home/admin/.pyenv`).
if py_executable and py_executable.startswith(get_pyenv_path()):
host_pyenv_path = get_pyenv_path()
if py_executable and host_pyenv_path and py_executable.startswith(host_pyenv_path):
redirected_pyenv_folder = "ray/.pyenv"

host_pyenv_path = get_pyenv_path()
container_pyenv_path = host_pyenv_path
if redirected_pyenv_folder:
container_pyenv_path = host_pyenv_path.replace(
".pyenv", redirected_pyenv_folder
)
context.container[redirected_pyenv_folder] = redirected_pyenv_folder
container_to_host_mount_dict[container_pyenv_path] = host_pyenv_path
if container_pyenv_path:
container_to_host_mount_dict[container_pyenv_path] = host_pyenv_path
container_to_host_mount_dict[host_site_packages_path] = host_site_packages_path

# For loop `run options` and append each item to the command line of podman
Expand Down Expand Up @@ -273,6 +276,113 @@ def _modify_container_context_impl(
# `try_update_container_command` function, so direct inclusion of `--env` flags here
# is avoided to ensure proper ordering and dynamic updates.
container_command.append(runtime_env_constants.CONTAINER_ENV_PLACEHOLDER)


if not runtime_env_constants.VGPU_DEVICES:
# CPU mode. Only needs AUDIT_WRITE
container_command.append("--cap-add=AUDIT_WRITE")
else:
# GPU mode
# Use glob patterns to discover and mount NVIDIA libraries dynamically
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use glob patterns to discover and mount NVIDIA libraries dynamically.

logger.info("Mounting gpu devices and drivers")
Comment on lines +286 to +287

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

This large block of code for GPU support contains many hardcoded paths and glob patterns for NVIDIA drivers and libraries. This approach can be brittle, as it depends on a specific driver installation layout which might vary across different Linux distributions or driver versions. Have you considered leveraging existing tools like nvidia-container-toolkit or nvidia-container-cli to get the required mount paths and devices? This would make the implementation more robust and less dependent on hardcoded paths.


# Use sets to store unique mount destinations
volume_mounts: Set[str] = set()
device_mounts: Set[str] = set()
mount_commands = ["--privileged"]

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

security-critical critical

Using --privileged grants the container almost all the capabilities of the host machine, which is a significant security risk. Since you are already explicitly mounting necessary devices and libraries, could the --privileged flag be avoided? It's recommended to use more granular permissions, like specific --cap-add flags, instead of giving full privileges.

Suggested change
mount_commands = ["--privileged"]
mount_commands = []


# NVIDIA device mounts - validate existence
nvidia_devices = [
"/dev/nvidiactl",
"/dev/nvidia-uvm",
"/dev/nvidia-uvm-tools",
"/dev/nvidia-modeset"
]

for device_path in nvidia_devices:
if os.path.exists(device_path):
volume_mounts.add(f"{device_path}:{device_path}")

# Discover NVIDIA GPU devices dynamically using glob
gpu_device_pattern = "/dev/nvidia[0-9]*"
for device_path in glob.glob(gpu_device_pattern):
if os.path.exists(device_path):
device_mounts.add(device_path)


# Discover NCCL installations dynamically
nccl_patterns = [
"/usr/local/nccl*"
]

for pattern in nccl_patterns:
for nccl_path in glob.glob(pattern):
if os.path.exists(nccl_path) and os.path.isdir(nccl_path):
volume_mounts.add(f"{nccl_path}:{nccl_path}:ro")

# Vulkan and EGL mounts - validate existence
vulkan_egl_paths = [
"/etc/vulkan",
"/usr/share/egl",
"/usr/share/glvnd"
]

for path in vulkan_egl_paths:
if os.path.exists(path) and os.path.isdir(path):
volume_mounts.add(f"{path}:{path}:ro")

# RDMA/InfiniBand mounts - validate existence
rdma_paths = [
"/usr/lib64/libibverbs",
"/usr/lib64/librdmacm",
"/sys/class/infiniband",
"/dev/infiniband"
]

for path in rdma_paths:
if os.path.exists(path):
if os.path.isdir(path):
volume_mounts.add(f"{path}:{path}:ro")
else: # device file
volume_mounts.add(f"{path}:{path}")

# Shared memory for NCCL - validate existence
if os.path.exists("/dev/shm"):
volume_mounts.add("/dev/shm:/dev/shm")

# NVIDIA runtime sockets - validate existence
socket_paths = [
"/run/nvidia-persistenced/socket",
"/run/nvidia-fabricmanager/socket"
]

for socket_path in socket_paths:
if os.path.exists(socket_path):
volume_mounts.add(f"{socket_path}:{socket_path}")

# Additional system mounts - validate existence
system_paths = [
"/etc/ld.so.conf.d",
]

for path in system_paths:
if os.path.exists(path):
volume_mounts.add(f"{path}:{path}:ro")

# Define all NVIDIA library and file patterns
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Only support NVIDIA?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This actually supports Ant's VGPU. This is an inner feature

discover_files_by_patterns(runtime_env_constants.NVIDIA_PATTERNS, volume_mounts, mount_mode="ro", file_only=False)

# Add unique volume mounts to container command
for mount in volume_mounts:
mount_commands.extend(["-v", mount])

# Add unique device mounts to container command
for device in device_mounts:
mount_commands.extend(["--device", device])

container_command.extend(mount_commands)


container_command.append("--entrypoint")
# Some docker image use conda to run python, it depend on ~/.bashrc.
# So we need to use bash as container entrypoint.
Expand Down
67 changes: 64 additions & 3 deletions python/ray/_private/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,10 @@
Sequence,
Tuple,
Union,
Set,
Collection
)
import glob

from google.protobuf import json_format

Expand Down Expand Up @@ -1798,19 +1801,77 @@ def try_update_container_command(
) > 0 else None
return container_command


def get_ray_site_packages_path():
    """Return the site-packages directory that contains the ``ray`` package."""
    # The parent of the ray package directory is the site-packages root.
    return str(Path(ray.__path__[0]).parent.absolute())

def get_ray_python_path():
    """Return the path of the ``ray`` package directory itself (not its parent)."""
    # Unlike get_ray_site_packages_path, this returns only the ray package dir.
    return str(Path(ray.__path__[0]))

def get_pyenv_path():
    """Return the pyenv root directory from ``PYENV_ROOT``, or None if unset.

    The value is read from the environment instead of being hard-coded;
    callers are expected to handle a None/empty result (no pyenv installed).
    """
    # NOTE: the stale fallback to a hard-coded default pyenv root was removed;
    # an unreachable duplicate return statement is dropped here.
    return os.environ.get("PYENV_ROOT")


def discover_files_by_patterns(
    patterns: Collection[str],
    volume_mounts: Set[str],
    mount_mode: str = "ro",
    file_only: bool = False,
    logger: Optional[logging.Logger] = None,
) -> None:
    """Discover files using glob patterns and add them to volume mounts.

    Utility to reduce duplication in GPU support detection. For each glob
    pattern it finds matching paths, validates their existence, and adds a
    ``host:container[:mode]`` mount spec (host path == container path) to
    *volume_mounts*.

    Args:
        patterns: Collection of glob patterns to search for.
        volume_mounts: Set that discovered mount specs are added to
            (mutated in place).
        mount_mode: Mount mode suffix ("ro" for read-only, "" for
            read-write with no suffix appended).
        file_only: If True, only regular files are added; directories and
            other path types are skipped.
        logger: Logger for debug/warning messages. Defaults to this
            module's logger when not supplied.
    """
    log = logger if logger is not None else logging.getLogger(__name__)
    for pattern in patterns:
        try:
            matches = glob.glob(pattern)
            if not matches:
                # Lazy %-style args: the message is only built when debug
                # logging is actually enabled.
                log.debug("No matches found for pattern: %s", pattern)

            for path in matches:
                # glob can return broken symlinks; skip anything that does
                # not resolve to an existing path.
                if not os.path.exists(path):
                    log.debug("Path does not exist: %s", path)
                    continue

                if file_only and not os.path.isfile(path):
                    log.debug("Skipping non-file: %s", path)
                    continue

                mount_spec = f"{path}:{path}"
                if mount_mode:
                    mount_spec += f":{mount_mode}"

                volume_mounts.add(mount_spec)
                log.debug("Added mount: %s", mount_spec)

        except Exception as e:
            # Best-effort discovery: a failing pattern must not abort
            # processing of the remaining patterns.
            log.warning("Error processing pattern '%s': %s", pattern, e)

def get_current_python_info():
"""
Expand Down
Loading
Loading