2 changes: 1 addition & 1 deletion .tekton/training-runtime-py312-cuda128-torch290-pull-request.yaml
@@ -9,7 +9,7 @@ metadata:
pipelinesascode.tekton.dev/cancel-in-progress: "true"
pipelinesascode.tekton.dev/max-keep-runs: "3"
pipelinesascode.tekton.dev/on-cel-expression: event == "pull_request" && target_branch == "main" &&
("images/runtime/training/py312-cuda128-torch290/Dockerfile".pathChanged() || ".tekton/training-runtime-py312-cuda128-torch290-pull-request.yaml".pathChanged())
("images/runtime/training/py312-cuda128-torch290/Dockerfile".pathChanged() || "images/runtime/training/py312-cuda128-torch290/Pipfile.lock".pathChanged() || ".tekton/training-runtime-py312-cuda128-torch290-pull-request.yaml".pathChanged())
creationTimestamp: null
labels:
appstudio.openshift.io/application: runtime-training
2 changes: 1 addition & 1 deletion .tekton/training-runtime-py312-cuda128-torch290-push.yaml
@@ -8,7 +8,7 @@ metadata:
pipelinesascode.tekton.dev/cancel-in-progress: "false"
pipelinesascode.tekton.dev/max-keep-runs: "3"
pipelinesascode.tekton.dev/on-cel-expression: event == "push" && target_branch == "main" &&
("images/runtime/training/py312-cuda128-torch290/Dockerfile".pathChanged() || ".tekton/training-runtime-py312-cuda128-torch290-push.yaml".pathChanged())
("images/runtime/training/py312-cuda128-torch290/Dockerfile".pathChanged() || "images/runtime/training/py312-cuda128-torch290/Pipfile.lock".pathChanged() || ".tekton/training-runtime-py312-cuda128-torch290-push.yaml".pathChanged())
creationTimestamp: null
labels:
appstudio.openshift.io/application: runtime-training
2 changes: 1 addition & 1 deletion .tekton/training-runtime-py312-rocm64-torch290-pull-request.yaml
@@ -9,7 +9,7 @@ metadata:
pipelinesascode.tekton.dev/cancel-in-progress: "true"
pipelinesascode.tekton.dev/max-keep-runs: "3"
pipelinesascode.tekton.dev/on-cel-expression: event == "pull_request" && target_branch == "main" &&
("images/runtime/training/py312-rocm64-torch290/Dockerfile".pathChanged() || ".tekton/training-runtime-py312-rocm64-torch290-pull-request.yaml".pathChanged())
("images/runtime/training/py312-rocm64-torch290/Dockerfile".pathChanged() || "images/runtime/training/py312-rocm64-torch290/Pipfile.lock".pathChanged() || ".tekton/training-runtime-py312-rocm64-torch290-pull-request.yaml".pathChanged())
creationTimestamp: null
labels:
appstudio.openshift.io/application: runtime-training
2 changes: 1 addition & 1 deletion .tekton/training-runtime-py312-rocm64-torch290-push.yaml
@@ -8,7 +8,7 @@ metadata:
pipelinesascode.tekton.dev/cancel-in-progress: "false"
pipelinesascode.tekton.dev/max-keep-runs: "3"
pipelinesascode.tekton.dev/on-cel-expression: event == "push" && target_branch == "main" &&
("images/runtime/training/py312-rocm64-torch290/Dockerfile".pathChanged() || ".tekton/training-runtime-py312-rocm64-torch290-push.yaml".pathChanged())
("images/runtime/training/py312-rocm64-torch290/Dockerfile".pathChanged() || "images/runtime/training/py312-rocm64-torch290/Pipfile.lock".pathChanged() || ".tekton/training-runtime-py312-rocm64-torch290-push.yaml".pathChanged())
creationTimestamp: null
labels:
appstudio.openshift.io/application: runtime-training
2 changes: 1 addition & 1 deletion images/runtime/training/py312-cuda128-torch290/Pipfile
@@ -11,7 +11,7 @@ name = "pytorch-cu128"
[packages]
torch = {version = "==2.9.0", index = "pytorch-cu128"}
accelerate = "==1.12.0"
-transformers = "==4.57.2"
+transformers = "==4.57.1"
peft = "==0.18.1"
datasets = "==4.3.0"
tqdm = "==4.67.1"
439 changes: 231 additions & 208 deletions images/runtime/training/py312-cuda128-torch290/Pipfile.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion images/runtime/training/py312-rocm64-torch290/Pipfile
@@ -13,7 +13,7 @@ torch = {version = "==2.9.0+rocm6.4", index = "pytorch-rocm64"}
torchvision = {version = "==0.24.0+rocm6.4", index = "pytorch-rocm64"}
pytorch-triton-rocm = {version = "==3.5.0", index = "pytorch-rocm64"}
accelerate = "==1.12.0"
-transformers = "==4.57.2"
+transformers = "==4.57.1"
peft = "==0.18.1"
datasets = "==4.3.0"
tqdm = "==4.67.1"
288 changes: 133 additions & 155 deletions images/runtime/training/py312-rocm64-torch290/Pipfile.lock

Large diffs are not rendered by default.

28 changes: 6 additions & 22 deletions tests/trainer/cluster_training_runtimes_test.go
@@ -27,25 +27,9 @@ import (

. "github.com/opendatahub-io/distributed-workloads/tests/common"
. "github.com/opendatahub-io/distributed-workloads/tests/common/support"
trainerutils "github.com/opendatahub-io/distributed-workloads/tests/trainer/utils"
)

-type ClusterTrainingRuntime struct {
-Name string
-RHOAIImage string
-}
-
-var expectedRuntimes = []ClusterTrainingRuntime{
-{Name: "torch-distributed", RHOAIImage: "odh-training-cuda128-torch29-py312-rhel9"},
-{Name: "torch-distributed-rocm", RHOAIImage: "odh-training-rocm64-torch29-py312-rhel9"},
-{Name: "torch-distributed-cuda128-torch29-py312", RHOAIImage: "odh-training-cuda128-torch29-py312-rhel9"},
-{Name: "torch-distributed-rocm64-torch29-py312", RHOAIImage: "odh-training-rocm64-torch29-py312-rhel9"},
-{Name: "training-hub", RHOAIImage: "odh-training-cuda128-torch29-py312-rhel9"},
-{Name: "training-hub-th05-cuda128-torch29-py312", RHOAIImage: "odh-training-cuda128-torch29-py312-rhel9"},
-}
-
-// defaultClusterTrainingRuntime is used across integration tests
-var defaultClusterTrainingRuntime = expectedRuntimes[0].Name

func TestDefaultClusterTrainingRuntimes(t *testing.T) {
Tags(t, Smoke)
test := With(t)
@@ -54,8 +38,8 @@ func TestDefaultClusterTrainingRuntimes(t *testing.T) {
registryName := GetExpectedRegistry(test)

// Build a map of expected runtimes for quick lookup
-expectedRuntimeMap := make(map[string]ClusterTrainingRuntime)
-for _, runtime := range expectedRuntimes {
+expectedRuntimeMap := make(map[string]trainerutils.ClusterTrainingRuntime)
+for _, runtime := range trainerutils.ExpectedRuntimes {
expectedRuntimeMap[runtime.Name] = runtime
}

@@ -108,7 +92,7 @@ func TestDefaultClusterTrainingRuntimes(t *testing.T) {

// Verify all expected runtimes are present
var missingRuntimes []string
-for _, expected := range expectedRuntimes {
+for _, expected := range trainerutils.ExpectedRuntimes {
if !foundRuntimes[expected.Name] {
missingRuntimes = append(missingRuntimes, expected.Name)
}
@@ -129,7 +113,7 @@ func TestRunTrainJobWithDefaultClusterTrainingRuntimes(t *testing.T) {
Tags(t, Sanity)
test := With(t)

-for _, runtime := range expectedRuntimes {
+for _, runtime := range trainerutils.ExpectedRuntimes {
test.T().Logf("Running TrainJob with ClusterTrainingRuntime: %s", runtime.Name)

// Create a namespace
Expand All @@ -139,7 +123,7 @@ func TestRunTrainJobWithDefaultClusterTrainingRuntimes(t *testing.T) {
trainJob := createTrainJob(test, namespace, runtime.Name)

// Wait for TrainJob completion
-test.Eventually(TrainJob(test, namespace, trainJob.Name), TestTimeoutLong).
+test.Eventually(TrainJob(test, namespace, trainJob.Name), TestTimeoutDouble).
Should(WithTransform(TrainJobConditionComplete, Equal(metav1.ConditionTrue)))

test.T().Logf("TrainJob with ClusterTrainingRuntime '%s' completed successfully", runtime.Name)
7 changes: 5 additions & 2 deletions tests/trainer/resources/mnist.ipynb
@@ -425,11 +425,14 @@
"metadata": {},
"outputs": [],
"source": [
"training_runtime_name = os.getenv(\"TRAINING_RUNTIME\")\n",
"if not training_runtime_name:\n",
" raise RuntimeError(\"TRAINING_RUNTIME environment variable is required\")\n",
"\n",
"try:\n",
" torch_runtime = client.get_runtime(\"torch-distributed\")\n",
" torch_runtime = client.get_runtime(training_runtime_name)\n",
"except Exception as e:\n",
" raise RuntimeError(\"Runtime 'torch-distributed' not found or not accessible\") from e"
" raise RuntimeError(f\"Runtime '{training_runtime_name}' not found or not accessible\") from e"
]
},
{
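The notebook edits above all land on the same pattern: the runtime name is injected through the TRAINING_RUNTIME environment variable and the notebook fails fast when it is missing, instead of hard-coding "torch-distributed". A minimal self-contained sketch of that pattern, assuming the notebooks' `client` is a Kubeflow TrainerClient (the import path below is an assumption; the diff only shows the client object):

import os

from kubeflow.trainer import TrainerClient  # assumed import; only `client` appears in the diff

# Fail fast: the harness must inject the runtime name, e.g. the Go SDK tests
# export TRAINING_RUNTIME before running the notebook via papermill.
training_runtime_name = os.getenv("TRAINING_RUNTIME")
if not training_runtime_name:
    raise RuntimeError("TRAINING_RUNTIME environment variable is required")

client = TrainerClient()
try:
    torch_runtime = client.get_runtime(training_runtime_name)
except Exception as e:
    raise RuntimeError(f"Runtime '{training_runtime_name}' not found or not accessible") from e
print(f"Got runtime: {torch_runtime.name}")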
13 changes: 11 additions & 2 deletions tests/trainer/resources/osft.ipynb
@@ -400,15 +400,19 @@
"metadata": {},
"outputs": [],
"source": [
"training_runtime_name = os.getenv(\"TRAINING_RUNTIME\")\n",
"if not training_runtime_name:\n",
" raise RuntimeError(\"TRAINING_RUNTIME environment variable is required\")\n",
"\n",
"th_runtime = None\n",
"for runtime in client.list_runtimes():\n",
" if runtime.name == \"training-hub03-cuda128-torch28-py312\":\n",
" if runtime.name == training_runtime_name:\n",
" th_runtime = runtime\n",
" print(\"Found runtime: \" + str(th_runtime))\n",
" break\n",
"\n",
"if th_runtime is None:\n",
" raise RuntimeError(\"Required runtime 'training-hub03-cuda128-torch28-py312' not found\")"
" raise RuntimeError(f\"Required runtime '{training_runtime_name}' not found\")"
]
},
{
@@ -438,6 +442,11 @@
" \"XDG_CACHE_HOME\": \"/opt/app-root/src/.cache\",\n",
" \"NCCL_DEBUG\": \"INFO\",\n",
" },\n",
" resources_per_node={\n",
" \"cpu\": 4,\n",
" \"memory\": \"32Gi\",\n",
" \"nvidia.com/gpu\": 1\n",
" },\n",
" ),\n",
" options=[\n",
" PodTemplateOverrides(\n",
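Both training-hub notebooks now request explicit per-node resources instead of relying on the runtime's defaults, so each TrainJob node gets guaranteed CPU, memory, and one NVIDIA GPU. A hedged sketch of where that dictionary sits in a submission, assuming the CustomTrainer/train() interface these notebooks appear to use (the import path, train_func, and num_nodes below are hypothetical; only env, resources_per_node, and PodTemplateOverrides are visible in the diff):

import os

from kubeflow.trainer import CustomTrainer, TrainerClient  # assumed import path


def train_func():
    # Hypothetical training entry point; the notebooks define their own.
    pass


client = TrainerClient()
th_runtime = client.get_runtime(os.environ["TRAINING_RUNTIME"])

job_name = client.train(
    runtime=th_runtime,
    trainer=CustomTrainer(
        func=train_func,
        num_nodes=2,  # hypothetical node count
        env={"NCCL_DEBUG": "INFO"},
        # Per-node requests, matching the values the notebooks now pin:
        resources_per_node={
            "cpu": 4,
            "memory": "32Gi",
            "nvidia.com/gpu": 1,
        },
    ),
)
print(f"Submitted TrainJob: {job_name}")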
7 changes: 6 additions & 1 deletion tests/trainer/resources/rhai_features.ipynb
@@ -192,8 +192,13 @@
"metadata": {},
"outputs": [],
"source": [
"training_runtime_name = os.getenv(\"TRAINING_RUNTIME\", \"torch-distributed\")\n",
"training_runtime_name = os.getenv(\"TRAINING_RUNTIME\")\n",
"if not training_runtime_name:\n",
" raise RuntimeError(\"TRAINING_RUNTIME environment variable is required\")\n",
"\n",
"torch_runtime = trainer_client.get_runtime(training_runtime_name)\n",
"if torch_runtime is None:\n",
" raise RuntimeError(f\"Required runtime '{training_runtime_name}' not found\")\n",
"print(f\"Got runtime: {torch_runtime.name}\")"
]
},
13 changes: 11 additions & 2 deletions tests/trainer/resources/sft.ipynb
@@ -474,15 +474,19 @@
"outputs": [],
"source": [
"# Find the TrainingHub runtime\n",
"training_runtime_name = os.getenv(\"TRAINING_RUNTIME\")\n",
"if not training_runtime_name:\n",
" raise RuntimeError(\"TRAINING_RUNTIME environment variable is required\")\n",
"\n",
"th_runtime = None\n",
"for runtime in client.list_runtimes():\n",
" if runtime.name == \"training-hub03-cuda128-torch28-py312\":\n",
" if runtime.name == training_runtime_name:\n",
" th_runtime = runtime\n",
" print(\"Found runtime: \" + str(th_runtime))\n",
" break\n",
"\n",
"if th_runtime is None:\n",
" raise RuntimeError(\"Required runtime 'training-hub03-cuda128-torch28-py312' not found\")"
" raise RuntimeError(f\"Required runtime '{training_runtime_name}' not found\")"
]
},
{
@@ -512,6 +516,11 @@
" \"XDG_CACHE_HOME\": \"/opt/app-root/src/.cache\",\n",
" \"NCCL_DEBUG\": \"INFO\",\n",
" },\n",
" resources_per_node={\n",
" \"cpu\": 4,\n",
" \"memory\": \"32Gi\",\n",
" \"nvidia.com/gpu\": 1\n",
" },\n",
" ),\n",
" options=[\n",
" PodTemplateOverrides(\n",
2 changes: 2 additions & 0 deletions tests/trainer/sdk_tests/fashion_mnist_tests.go
@@ -89,11 +89,13 @@ func RunFashionMnistCpuDistributedTraining(t *testing.T) {
"export AWS_DEFAULT_ENDPOINT='%s'; export AWS_ACCESS_KEY_ID='%s'; "+
"export AWS_SECRET_ACCESS_KEY='%s'; export AWS_STORAGE_BUCKET='%s'; "+
"export AWS_STORAGE_BUCKET_MNIST_DIR='%s'; "+
"export TRAINING_RUNTIME='%s'; "+
"python -m pip install --quiet --no-cache-dir ipykernel papermill boto3==1.34.162 && "+
"if python -m papermill -k python3 /opt/app-root/notebooks/%s /opt/app-root/src/out.ipynb --log-output; "+
"then echo 'NOTEBOOK_STATUS: SUCCESS'; else echo 'NOTEBOOK_STATUS: FAILURE'; fi; sleep infinity",
support.GetOpenShiftApiUrl(test), userToken, namespace.Name, rwxPvc.Name,
endpoint, accessKey, secretKey, bucket, prefix,
+trainerutils.DefaultClusterTrainingRuntime,
notebookName,
)
command := []string{"/bin/sh", "-c", shellCmd}
2 changes: 2 additions & 0 deletions tests/trainer/sdk_tests/osft_traininghub_tests.go
@@ -91,11 +91,13 @@ func RunOsftTrainingHubMultiGpuDistributedTraining(t *testing.T) {
"export AWS_SECRET_ACCESS_KEY='%s'; "+
"export AWS_STORAGE_BUCKET='%s'; "+
"export AWS_STORAGE_BUCKET_OSFT_DIR='%s'; "+
"export TRAINING_RUNTIME='%s'; "+
"python -m pip install --quiet --no-cache-dir --break-system-packages ipykernel papermill boto3==1.34.162 && "+
"if python -m papermill -k python3 /opt/app-root/notebooks/%s /opt/app-root/src/out.ipynb --log-output; "+
"then echo 'NOTEBOOK_STATUS: SUCCESS'; else echo 'NOTEBOOK_STATUS: FAILURE'; fi; sleep infinity",
support.GetOpenShiftApiUrl(test), userToken, namespace.Name, rwxPvc.Name,
endpoint, accessKey, secretKey, bucket, prefix,
+trainerutils.DefaultTrainingHubRuntime,
osftNotebookName,
)
command := []string{"/bin/sh", "-c", shellCmd}
4 changes: 2 additions & 2 deletions tests/trainer/sdk_tests/rhai_features_tests.go
@@ -144,11 +144,11 @@ func runRhaiFeaturesTestWithConfig(t *testing.T, config RhaiFeatureConfig) {

// Determine GPU resource label (empty for CPU) and training runtime
gpuResourceLabel := ""
trainingRuntime := "torch-distributed" // Default for CPU and NVIDIA
trainingRuntime := trainerutils.DefaultClusterTrainingRuntime // Default for CPU and NVIDIA
if config.Accelerator.IsGpu() {
gpuResourceLabel = config.Accelerator.ResourceLabel
if config.Accelerator == AMD {
trainingRuntime = "torch-distributed-rocm"
trainingRuntime = trainerutils.DefaultClusterTrainingRuntimeROCm
}
}

2 changes: 2 additions & 0 deletions tests/trainer/sdk_tests/sft_traininghub_tests.go
@@ -91,11 +91,13 @@ func RunSftTrainingHubMultiGpuDistributedTraining(t *testing.T) {
"export AWS_SECRET_ACCESS_KEY='%s'; "+
"export AWS_STORAGE_BUCKET='%s'; "+
"export AWS_STORAGE_BUCKET_SFT_DIR='%s'; "+
"export TRAINING_RUNTIME='%s'; "+
"python -m pip install --quiet --no-cache-dir --break-system-packages ipykernel papermill boto3==1.34.162 && "+
"if python -m papermill -k python3 /opt/app-root/notebooks/%s /opt/app-root/src/out.ipynb --log-output; "+
"then echo 'NOTEBOOK_STATUS: SUCCESS'; else echo 'NOTEBOOK_STATUS: FAILURE'; fi; sleep infinity",
support.GetOpenShiftApiUrl(test), userToken, namespace.Name, rwxPvc.Name,
endpoint, accessKey, secretKey, bucket, prefix,
+trainerutils.DefaultTrainingHubRuntime,
sftNotebookName,
)
command := []string{"/bin/sh", "-c", shellCmd}
7 changes: 4 additions & 3 deletions tests/trainer/trainer_kueue_integration_test.go
@@ -34,6 +34,7 @@ import (

. "github.com/opendatahub-io/distributed-workloads/tests/common"
. "github.com/opendatahub-io/distributed-workloads/tests/common/support"
trainerutils "github.com/opendatahub-io/distributed-workloads/tests/trainer/utils"
)

const (
@@ -145,7 +146,7 @@ func TestKueueDefaultLocalQueueLabelInjection(t *testing.T) {
},
Spec: trainerv1alpha1.TrainJobSpec{
RuntimeRef: trainerv1alpha1.RuntimeRef{
-Name: defaultClusterTrainingRuntime,
+Name: trainerutils.DefaultClusterTrainingRuntime,
},
Trainer: &trainerv1alpha1.Trainer{
Command: []string{"echo", "test"},
@@ -223,7 +224,7 @@ func TestKueueWorkloadPreemptionSuspendsTrainJob(t *testing.T) {
},
Spec: trainerv1alpha1.TrainJobSpec{
RuntimeRef: trainerv1alpha1.RuntimeRef{
-Name: defaultClusterTrainingRuntime,
+Name: trainerutils.DefaultClusterTrainingRuntime,
},
Trainer: &trainerv1alpha1.Trainer{
Command: []string{"sleep", "120"},
@@ -305,7 +306,7 @@ func TestKueueWorkloadInadmissibleWithNonExistentLocalQueue(t *testing.T) {
},
Spec: trainerv1alpha1.TrainJobSpec{
RuntimeRef: trainerv1alpha1.RuntimeRef{
-Name: defaultClusterTrainingRuntime,
+Name: trainerutils.DefaultClusterTrainingRuntime,
},
Trainer: &trainerv1alpha1.Trainer{
Command: []string{"echo", "test"},
44 changes: 44 additions & 0 deletions tests/trainer/utils/utils_runtimes.go
@@ -0,0 +1,44 @@
/*
Copyright 2025.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package trainer

// ClusterTrainingRuntime represents a ClusterTrainingRuntime with its expected RHOAI image
type ClusterTrainingRuntime struct {
Name string
RHOAIImage string
}

const (
// DefaultClusterTrainingRuntime is the default runtime for CUDA accelerators
DefaultClusterTrainingRuntime = "torch-distributed"

// DefaultClusterTrainingRuntimeROCm is the default runtime for AMD/ROCm accelerators
DefaultClusterTrainingRuntimeROCm = "torch-distributed-rocm"

// DefaultTrainingHubRuntime is the default runtime for SFT/OSFT workloads
DefaultTrainingHubRuntime = "training-hub"
)

// ExpectedRuntimes is the list of expected ClusterTrainingRuntimes on the cluster
var ExpectedRuntimes = []ClusterTrainingRuntime{
{Name: DefaultClusterTrainingRuntime, RHOAIImage: "odh-training-cuda128-torch29-py312-rhel9"},
{Name: DefaultClusterTrainingRuntimeROCm, RHOAIImage: "odh-training-rocm64-torch29-py312-rhel9"},
{Name: "torch-distributed-cuda128-torch29-py312", RHOAIImage: "odh-training-cuda128-torch29-py312-rhel9"},
{Name: "torch-distributed-rocm64-torch29-py312", RHOAIImage: "odh-training-rocm64-torch29-py312-rhel9"},
{Name: DefaultTrainingHubRuntime, RHOAIImage: "odh-training-cuda128-torch29-py312-rhel9"},
{Name: "training-hub-th05-cuda128-torch29-py312", RHOAIImage: "odh-training-cuda128-torch29-py312-rhel9"},
}
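These shared constants give every Go test a single source of truth for runtime names and their expected RHOAI images. For an equivalent check from the notebook side, a hedged Python sketch that verifies the same names exist on the cluster using the list_runtimes() call the notebooks already rely on (the import path and client construction are assumptions; the names mirror ExpectedRuntimes above):

from kubeflow.trainer import TrainerClient  # assumed import path

client = TrainerClient()

# Expected ClusterTrainingRuntime names, mirroring trainerutils.ExpectedRuntimes.
expected = {
    "torch-distributed",
    "torch-distributed-rocm",
    "torch-distributed-cuda128-torch29-py312",
    "torch-distributed-rocm64-torch29-py312",
    "training-hub",
    "training-hub-th05-cuda128-torch29-py312",
}
found = {runtime.name for runtime in client.list_runtimes()}
missing = sorted(expected - found)
if missing:
    raise RuntimeError(f"Missing expected runtimes: {missing}")
print("All expected ClusterTrainingRuntimes are present")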