Skip to content

Commit c950381

Browse files
committed
Merge remote-tracking branch 'upstream/main'
2 parents fe5e457 + 22f07c2 commit c950381

File tree

8 files changed

+28
-39
lines changed

8 files changed

+28
-39
lines changed

examples/hpo-raytune/notebook/raytune-oai-demo-mlmd.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@
5454
" min_memory=4,\n",
5555
" max_memory=4,\n",
5656
" num_gpus=0,\n",
57-
" image=\"quay.io/modh/ray:2.35.0-py39-cu121\", \n",
57+
" image=\"quay.io/modh/ray:2.35.0-py311-cu121\", \n",
5858
"))"
5959
]
6060
},

examples/hpo-raytune/notebook/raytune-oai-demo.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@
5555
" min_memory=4,\n",
5656
" max_memory=4,\n",
5757
" num_gpus=0,\n",
58-
" image=\"quay.io/modh/ray:2.35.0-py39-cu121\"\n",
58+
" image=\"quay.io/modh/ray:2.35.0-py311-cu121\"\n",
5959
"))"
6060
]
6161
},

examples/ray-finetune-llm-deepspeed/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ This example has been validated on the following configurations:
129129
head_memory_limits=96,
130130
head_extended_resource_requests={'amd.com/gpu':1},
131131
worker_extended_resource_requests={'amd.com/gpu':1},
132-
image="quay.io/rhoai/ray:2.35.0-py39-rocm61-torch24-fa26",
132+
image="quay.io/rhoai/ray:2.35.0-py311-rocm61-torch24-fa26",
133133
)
134134
```
135135
* Ray job:

examples/ray-finetune-llm-deepspeed/ray_finetune_llm_deepspeed.ipynb

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,11 +70,12 @@
7070
" head_memory_requests=128,\n",
7171
" head_memory_limits=256,\n",
7272
" # Use the following parameters with NVIDIA GPUs\n",
73-
" image=\"quay.io/rhoai/ray:2.35.0-py39-cu121-torch24-fa26\",\n",
73+
" # Ensure the Python version in the notebook image matches the version used in the Ray cluster to avoid compatibility issues\n",
74+
" image=\"quay.io/rhoai/ray:2.35.0-py311-cu121-torch24-fa26\",\n",
7475
" head_extended_resource_requests={'nvidia.com/gpu':1},\n",
7576
" worker_extended_resource_requests={'nvidia.com/gpu':1},\n",
7677
" # Or replace them with these parameters for AMD GPUs\n",
77-
" # image=\"quay.io/rhoai/ray:2.35.0-py39-rocm61-torch24-fa26\",\n",
78+
" # image=\"quay.io/rhoai/ray:2.35.0-py311-rocm61-torch24-fa26\",\n",
7879
" # head_extended_resource_requests={'amd.com/gpu':1},\n",
7980
" # worker_extended_resource_requests={'amd.com/gpu':1},\n",
8081
"))"

examples/stable-diffusion-dreambooth/notebook/00 Intro.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@
6161
" worker_memory_limits=8, \n",
6262
" head_extended_resource_requests={'nvidia.com/gpu':0},\n",
6363
" worker_extended_resource_requests={'nvidia.com/gpu':0},\n",
64-
" image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n",
64+
" image=\"quay.io/modh/ray:2.35.0-py311-cu121\",\n",
6565
" write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources \n",
6666
" local_queue=\"local-queue\"\n",
6767
" ))"

examples/stable-diffusion-dreambooth/notebook/raycluster.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ spec:
2020
- env:
2121
- name: NVIDIA_VISIBLE_DEVICES
2222
value: "void"
23-
image: 'quay.io/project-codeflare/ray:latest-py39-cu118'
23+
image: 'quay.io/modh/ray:2.35.0-py311-cu121'
2424
imagePullPolicy: Always
2525
lifecycle:
2626
preStop:
@@ -60,7 +60,7 @@ spec:
6060
template:
6161
spec:
6262
containers:
63-
- image: 'quay.io/project-codeflare/ray:latest-py39-cu118'
63+
- image: 'quay.io/modh/ray:2.35.0-py311-cu121'
6464
lifecycle:
6565
preStop:
6666
exec:

images/runtime/training/cuda/Dockerfile

Lines changed: 13 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
## Global Args ######################################################
2-
ARG BASE_UBI_IMAGE_TAG=latest
3-
ARG PYTHON_VERSION=3.11
2+
ARG IMAGE_TAG=1-77.1729776556
3+
ARG PYTHON_VERSION=311
44

55
# use UBI9 latest
6-
FROM registry.access.redhat.com/ubi9/ubi:${BASE_UBI_IMAGE_TAG} AS base
6+
FROM registry.access.redhat.com/ubi9/python-${PYTHON_VERSION}:${IMAGE_TAG} AS base
77

88
LABEL name="training:py311-cuda121-torch241" \
99
summary="CUDA 12.1 Python 3.11 PyTorch 2.4.1 image based on UBI9 for Training" \
@@ -16,24 +16,18 @@ LABEL name="training:py311-cuda121-torch241" \
1616
COPY LICENSE.md /licenses/cuda-license.md
1717

1818
# Set the working directory in the container
19+
USER 0
1920
WORKDIR /app
2021

21-
# remove subscription-manager and install python3.11
22-
RUN dnf remove -y --disableplugin=subscription-manager \
23-
subscription-manager \
24-
&& dnf install -y python${PYTHON_VERSION} procps g++ python${PYTHON_VERSION}-devel \
25-
&& python -m ensurepip --upgrade \
26-
&& python -m pip install --upgrade pip \
27-
&& python -m pip install --upgrade setuptools \
28-
&& dnf update -y \
29-
&& dnf clean all
30-
31-
RUN dnf remove -y python3-requests && \
32-
pip install --no-cache-dir requests==2.32.3
22+
# upgrade requests package
23+
RUN pip install --no-cache-dir --upgrade requests==2.32.3
3324

3425
## CUDA Base ###################################################################
3526
FROM base AS cuda-base
3627

28+
# Install CUDA
29+
WORKDIR /opt/app-root/bin
30+
3731
# Ref: https://docs.nvidia.com/cuda/archive/12.1.0/cuda-toolkit-release-notes/
3832
ENV CUDA_VERSION=12.1.0 \
3933
NV_CUDA_LIB_VERSION=12.1.0-1 \
@@ -106,3 +100,7 @@ RUN pip install --no-cache-dir -U "micropipenv[toml]"
106100
COPY Pipfile.lock ./
107101

108102
RUN micropipenv install && rm -f ./Pipfile.lock
103+
104+
# Restore user workspace
105+
USER 1001
106+
WORKDIR /opt/app-root/src

images/runtime/training/rocm/Dockerfile

Lines changed: 6 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
## Global Args ######################################################
2-
ARG BASE_UBI_IMAGE_TAG=latest
3-
ARG PYTHON_VERSION=3.11
2+
ARG IMAGE_TAG=1-77.1729776556
3+
ARG PYTHON_VERSION=311
44

55
# use UBI9 latest
6-
FROM registry.access.redhat.com/ubi9/ubi:${BASE_UBI_IMAGE_TAG} AS base
6+
FROM registry.access.redhat.com/ubi9/python-${PYTHON_VERSION}:${IMAGE_TAG} AS base
77

88
LABEL name="training:py311-rocm61-torch241" \
99
summary="ROCm 6.1 Python 3.11 PyTorch 2.4.1 image based on UBI9 for Training" \
@@ -16,23 +16,13 @@ LABEL name="training:py311-rocm61-torch241" \
1616
COPY LICENSE.md /licenses/rocm-license.md
1717

1818
# Set the working directory in the container
19+
USER 0
1920
WORKDIR /app
2021

21-
# remove subscription-manager and install python3.11
22-
RUN dnf remove -y --disableplugin=subscription-manager \
23-
subscription-manager \
24-
&& dnf install -y python${PYTHON_VERSION} procps g++ python${PYTHON_VERSION}-devel \
25-
&& python -m ensurepip --upgrade \
26-
&& python -m pip install --upgrade pip \
27-
&& python -m pip install --upgrade setuptools \
28-
&& dnf update -y \
29-
&& dnf clean all
30-
31-
RUN dnf remove -y python3-requests && \
32-
pip install --no-cache-dir requests==2.32.3
22+
# upgrade requests package
23+
RUN pip install --no-cache-dir --upgrade requests==2.32.3
3324

3425
# Install ROCm
35-
USER 0
3626
WORKDIR /opt/app-root/bin
3727

3828
ARG ROCM_VERSION=6.1.2

0 commit comments

Comments
 (0)