Skip to content

Commit 62c8665

Browse files
committed
Merge remote-tracking branch 'upstream/main'
2 parents c132a8a + bac8162 commit 62c8665

File tree

7 files changed

+464
-216
lines changed

7 files changed

+464
-216
lines changed

images/runtime/training/cuda/Pipfile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ torch = "==2.4.1"
1313
sentencepiece = "<0.3,>=0.1.99"
1414
tokenizers = "<1.0,>=0.13.3"
1515
tqdm = "<5.0,>=4.66.2"
16-
trl = ">=0.15.1"
16+
trl = ">=0.15.2"
1717
protobuf = "<6.0.0,>=5.28.0"
1818
simpleeval = "<1.0,>=0.9.13"
1919
safetensors = "*"
@@ -27,6 +27,7 @@ pydantic = ">=2.7.0"
2727
deepspeed = ">=0.14.3"
2828
aiofiles = ">=23.2.1"
2929
async-timeout = "==4.0.3"
30+
tensorboard = "2.19.0"
3031

3132
[dev-packages]
3233

images/runtime/training/cuda/Pipfile.lock

Lines changed: 212 additions & 102 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

images/runtime/training/rocm/Dockerfile

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,19 @@ RUN micropipenv install && \
6464
chmod -R g+w /opt/app-root/lib/python3.11/site-packages && \
6565
fix-permissions /opt/app-root -P
6666

67+
# Install Flash Attention
68+
ENV GPU_ARCHS=gfx90a;gfx941;gfx942
69+
70+
RUN pip install wheel ninja
71+
72+
RUN export TMP_DIR=$(mktemp -d) \
73+
&& cd $TMP_DIR \
74+
&& git clone --depth 1 --branch v2.7.4 https://github.com/Dao-AILab/flash-attention.git \
75+
&& cd flash-attention \
76+
&& git submodule update --init \
77+
&& MAX_JOBS="16" python3 setup.py install --verbose \
78+
&& rm -rf $TMP_DIR
79+
6780
# Restore user workspace
6881
USER 1001
6982

images/runtime/training/rocm/Pipfile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ pytorch-triton-rocm = {version = "*", index = "pytorch"}
1919
sentencepiece = "<0.3,>=0.1.99"
2020
tokenizers = "<1.0,>=0.13.3"
2121
tqdm = "<5.0,>=4.66.2"
22-
trl = ">=0.15.1"
22+
trl = ">=0.15.2"
2323
protobuf = "<6.0.0,>=5.28.0"
2424
simpleeval = "<1.0,>=0.9.13"
2525
safetensors = "*"
@@ -33,6 +33,7 @@ pydantic = ">=2.7.0"
3333
deepspeed = ">=0.14.3"
3434
aiofiles = ">=23.2.1"
3535
async-timeout = "==4.0.3"
36+
tensorboard = "2.19.0"
3637

3738
[dev-packages]
3839

images/runtime/training/rocm/Pipfile.lock

Lines changed: 212 additions & 102 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tests/kfto/kfto_mnist_sdk_test.go

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ package kfto
1919
import (
2020
"strings"
2121
"testing"
22+
"time"
2223

2324
. "github.com/onsi/gomega"
2425
. "github.com/project-codeflare/codeflare-common/support"
@@ -41,10 +42,11 @@ func TestMnistSDK(t *testing.T) {
4142
CreateUserRoleBindingWithClusterRole(test, userName, namespace.Name, "admin")
4243

4344
requiredChangesInNotebook := map[string]string{
44-
"${api_url}": GetOpenShiftApiUrl(test),
45-
"${password}": userToken,
46-
"${num_gpus}": "0",
47-
"${namespace}": namespace.Name,
45+
"${api_url}": GetOpenShiftApiUrl(test),
46+
"${password}": userToken,
47+
"${num_gpus}": "0",
48+
"${namespace}": namespace.Name,
49+
"${training_image}": GetCudaTrainingImage(),
4850
}
4951

5052
jupyterNotebook := string(readFile(test, "resources/mnist_kfto.ipynb"))
@@ -81,7 +83,7 @@ func TestMnistSDK(t *testing.T) {
8183
Should(WithTransform(PyTorchJobConditionRunning, Equal(v1.ConditionTrue)))
8284

8385
// Make sure that the job eventually succeeds
84-
test.Eventually(PyTorchJob(test, namespace.Name, "pytorch-ddp")).
86+
test.Eventually(PyTorchJob(test, namespace.Name, "pytorch-ddp"), TestTimeoutLong, 1*time.Second).
8587
Should(WithTransform(PyTorchJobConditionSucceeded, Equal(v1.ConditionTrue)))
8688

8789
// TODO: write torch job logs?

tests/kfto/resources/mnist_kfto.ipynb

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,15 @@
11
{
22
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"id": "ebdb3af3",
7+
"metadata": {},
8+
"outputs": [],
9+
"source": [
10+
"%pip install -U kubeflow-training"
11+
]
12+
},
313
{
414
"cell_type": "code",
515
"execution_count": 6,
@@ -19,7 +29,7 @@
1929
},
2030
{
2131
"cell_type": "code",
22-
"execution_count": 7,
32+
"execution_count": null,
2333
"id": "72dd1751",
2434
"metadata": {},
2535
"outputs": [],
@@ -28,7 +38,8 @@
2838
"num_gpus = \"${num_gpus}\"\n",
2939
"openshift_api_url = \"${api_url}\"\n",
3040
"namespace = \"${namespace}\"\n",
31-
"token = \"${password}\""
41+
"token = \"${password}\"\n",
42+
"training_image= \"${training_image}\""
3243
]
3344
},
3445
{
@@ -57,9 +68,9 @@
5768
" train_func=train_func,\n",
5869
" num_workers=1,\n",
5970
" resources_per_worker={\"gpu\": num_gpus},\n",
60-
" base_image=\"quay.io/kpostlet/torch-train:with-minivision\",\n",
61-
" # packages_to_install=[\"torchvision==0.19.0\", \"--target=/tmp/lib\"],\n",
62-
" # env_vars={\"PYTHONPATH\": \"/tmp/lib:$PYTHONPATH\", \"NCCL_DEBUG\": \"INFO\", \"TORCH_DISTRIBUTED_DEBUG\": \"DETAIL\"}\n",
71+
" base_image=training_image,\n",
72+
" packages_to_install=[\"torchvision==0.19.0\",\"minio==7.2.13\", \"--target=/tmp/lib\"],\n",
73+
" env_vars={\"PYTHONPATH\": \"/tmp/lib:$PYTHONPATH\", \"NCCL_DEBUG\": \"INFO\", \"TORCH_DISTRIBUTED_DEBUG\": \"DETAIL\"}\n",
6374
")"
6475
]
6576
},

0 commit comments

Comments
 (0)