2 changes: 1 addition & 1 deletion .tekton/training-runtime-py312-cuda128-torch290-pull-request.yaml
@@ -9,7 +9,7 @@ metadata:
pipelinesascode.tekton.dev/cancel-in-progress: "true"
pipelinesascode.tekton.dev/max-keep-runs: "3"
pipelinesascode.tekton.dev/on-cel-expression: event == "pull_request" && target_branch == "main" &&
("images/runtime/training/py312-cuda128-torch290/Dockerfile".pathChanged() || ".tekton/training-runtime-py312-cuda128-torch290-pull-request.yaml".pathChanged())
("images/runtime/training/py312-cuda128-torch290/Dockerfile".pathChanged() || "images/runtime/training/py312-cuda128-torch290/Pipfile.lock".pathChanged() || ".tekton/training-runtime-py312-cuda128-torch290-pull-request.yaml".pathChanged())
creationTimestamp: null
labels:
appstudio.openshift.io/application: runtime-training
2 changes: 1 addition & 1 deletion .tekton/training-runtime-py312-cuda128-torch290-push.yaml
@@ -8,7 +8,7 @@ metadata:
pipelinesascode.tekton.dev/cancel-in-progress: "false"
pipelinesascode.tekton.dev/max-keep-runs: "3"
pipelinesascode.tekton.dev/on-cel-expression: event == "push" && target_branch == "main" &&
("images/runtime/training/py312-cuda128-torch290/Dockerfile".pathChanged() || ".tekton/training-runtime-py312-cuda128-torch290-push.yaml".pathChanged())
("images/runtime/training/py312-cuda128-torch290/Dockerfile".pathChanged() || "images/runtime/training/py312-cuda128-torch290/Pipfile.lock".pathChanged() || ".tekton/training-runtime-py312-cuda128-torch290-push.yaml".pathChanged())
creationTimestamp: null
labels:
appstudio.openshift.io/application: runtime-training
2 changes: 1 addition & 1 deletion .tekton/training-runtime-py312-rocm64-torch290-pull-request.yaml
@@ -9,7 +9,7 @@ metadata:
pipelinesascode.tekton.dev/cancel-in-progress: "true"
pipelinesascode.tekton.dev/max-keep-runs: "3"
pipelinesascode.tekton.dev/on-cel-expression: event == "pull_request" && target_branch == "main" &&
("images/runtime/training/py312-rocm64-torch290/Dockerfile".pathChanged() || ".tekton/training-runtime-py312-rocm64-torch290-pull-request.yaml".pathChanged())
("images/runtime/training/py312-rocm64-torch290/Dockerfile".pathChanged() || "images/runtime/training/py312-rocm64-torch290/Pipfile.lock".pathChanged() || ".tekton/training-runtime-py312-rocm64-torch290-pull-request.yaml".pathChanged())
creationTimestamp: null
labels:
appstudio.openshift.io/application: runtime-training
2 changes: 1 addition & 1 deletion .tekton/training-runtime-py312-rocm64-torch290-push.yaml
@@ -8,7 +8,7 @@ metadata:
pipelinesascode.tekton.dev/cancel-in-progress: "false"
pipelinesascode.tekton.dev/max-keep-runs: "3"
pipelinesascode.tekton.dev/on-cel-expression: event == "push" && target_branch == "main" &&
("images/runtime/training/py312-rocm64-torch290/Dockerfile".pathChanged() || ".tekton/training-runtime-py312-rocm64-torch290-push.yaml".pathChanged())
("images/runtime/training/py312-rocm64-torch290/Dockerfile".pathChanged() || "images/runtime/training/py312-rocm64-torch290/Pipfile.lock".pathChanged() || ".tekton/training-runtime-py312-rocm64-torch290-push.yaml".pathChanged())
creationTimestamp: null
labels:
appstudio.openshift.io/application: runtime-training
2 changes: 1 addition & 1 deletion images/runtime/training/py312-cuda128-torch290/Pipfile
@@ -11,7 +11,7 @@ name = "pytorch-cu128"
[packages]
torch = {version = "==2.9.0", index = "pytorch-cu128"}
accelerate = "==1.12.0"
-transformers = "==4.57.2"
+transformers = "==4.57.1"
peft = "==0.18.1"
datasets = "==4.3.0"
tqdm = "==4.67.1"
439 changes: 231 additions & 208 deletions images/runtime/training/py312-cuda128-torch290/Pipfile.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion images/runtime/training/py312-rocm64-torch290/Pipfile
@@ -13,7 +13,7 @@ torch = {version = "==2.9.0+rocm6.4", index = "pytorch-rocm64"}
torchvision = {version = "==0.24.0+rocm6.4", index = "pytorch-rocm64"}
pytorch-triton-rocm = {version = "==3.5.0", index = "pytorch-rocm64"}
accelerate = "==1.12.0"
-transformers = "==4.57.2"
+transformers = "==4.57.1"
peft = "==0.18.1"
datasets = "==4.3.0"
tqdm = "==4.67.1"
288 changes: 133 additions & 155 deletions images/runtime/training/py312-rocm64-torch290/Pipfile.lock

Large diffs are not rendered by default.

28 changes: 6 additions & 22 deletions tests/trainer/cluster_training_runtimes_test.go
@@ -27,25 +27,9 @@ import (

. "github.com/opendatahub-io/distributed-workloads/tests/common"
. "github.com/opendatahub-io/distributed-workloads/tests/common/support"
trainerutils "github.com/opendatahub-io/distributed-workloads/tests/trainer/utils"
)

-type ClusterTrainingRuntime struct {
-Name string
-RHOAIImage string
-}
-
-var expectedRuntimes = []ClusterTrainingRuntime{
-{Name: "torch-distributed", RHOAIImage: "odh-training-cuda128-torch29-py312-rhel9"},
-{Name: "torch-distributed-rocm", RHOAIImage: "odh-training-rocm64-torch29-py312-rhel9"},
-{Name: "torch-distributed-cuda128-torch29-py312", RHOAIImage: "odh-training-cuda128-torch29-py312-rhel9"},
-{Name: "torch-distributed-rocm64-torch29-py312", RHOAIImage: "odh-training-rocm64-torch29-py312-rhel9"},
-{Name: "training-hub", RHOAIImage: "odh-training-cuda128-torch29-py312-rhel9"},
-{Name: "training-hub-th05-cuda128-torch29-py312", RHOAIImage: "odh-training-cuda128-torch29-py312-rhel9"},
-}
-
-// defaultClusterTrainingRuntime is used across integration tests
-var defaultClusterTrainingRuntime = expectedRuntimes[0].Name

func TestDefaultClusterTrainingRuntimes(t *testing.T) {
Tags(t, Smoke)
test := With(t)
@@ -54,8 +38,8 @@ func TestDefaultClusterTrainingRuntimes(t *testing.T) {
registryName := GetExpectedRegistry(test)

// Build a map of expected runtimes for quick lookup
-expectedRuntimeMap := make(map[string]ClusterTrainingRuntime)
-for _, runtime := range expectedRuntimes {
+expectedRuntimeMap := make(map[string]trainerutils.ClusterTrainingRuntime)
+for _, runtime := range trainerutils.ExpectedRuntimes {
expectedRuntimeMap[runtime.Name] = runtime
}

@@ -108,7 +92,7 @@ func TestDefaultClusterTrainingRuntimes(t *testing.T) {

// Verify all expected runtimes are present
var missingRuntimes []string
-for _, expected := range expectedRuntimes {
+for _, expected := range trainerutils.ExpectedRuntimes {
if !foundRuntimes[expected.Name] {
missingRuntimes = append(missingRuntimes, expected.Name)
}
@@ -129,7 +113,7 @@ func TestRunTrainJobWithDefaultClusterTrainingRuntimes(t *testing.T) {
Tags(t, Sanity)
test := With(t)

-for _, runtime := range expectedRuntimes {
+for _, runtime := range trainerutils.ExpectedRuntimes {
test.T().Logf("Running TrainJob with ClusterTrainingRuntime: %s", runtime.Name)

// Create a namespace
Expand All @@ -139,7 +123,7 @@ func TestRunTrainJobWithDefaultClusterTrainingRuntimes(t *testing.T) {
trainJob := createTrainJob(test, namespace, runtime.Name)

// Wait for TrainJob completion
-test.Eventually(TrainJob(test, namespace, trainJob.Name), TestTimeoutLong).
+test.Eventually(TrainJob(test, namespace, trainJob.Name), TestTimeoutDouble).
Should(WithTransform(TrainJobConditionComplete, Equal(metav1.ConditionTrue)))

test.T().Logf("TrainJob with ClusterTrainingRuntime '%s' completed successfully", runtime.Name)
7 changes: 5 additions & 2 deletions tests/trainer/resources/mnist.ipynb
@@ -425,11 +425,14 @@
"metadata": {},
"outputs": [],
"source": [
"training_runtime_name = os.getenv(\"TRAINING_RUNTIME\")\n",
"if not training_runtime_name:\n",
" raise RuntimeError(\"TRAINING_RUNTIME environment variable is required\")\n",
"\n",
"try:\n",
" torch_runtime = client.get_runtime(\"torch-distributed\")\n",
" torch_runtime = client.get_runtime(training_runtime_name)\n",
"except Exception as e:\n",
" raise RuntimeError(\"Runtime 'torch-distributed' not found or not accessible\") from e"
" raise RuntimeError(f\"Runtime '{training_runtime_name}' not found or not accessible\") from e"
]
},
{
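The notebook edits above all land on the same pattern: the runtime name is injected through the TRAINING_RUNTIME environment variable and the notebook fails fast when it is missing, instead of hard-coding "torch-distributed". A minimal self-contained sketch of that pattern, assuming the notebooks' `client` is a Kubeflow TrainerClient (the import path below is an assumption; the diff only shows the client object):

import os

from kubeflow.trainer import TrainerClient  # assumed import; only `client` appears in the diff

# Fail fast: the harness must inject the runtime name, e.g. the Go SDK tests
# export TRAINING_RUNTIME before running the notebook via papermill.
training_runtime_name = os.getenv("TRAINING_RUNTIME")
if not training_runtime_name:
    raise RuntimeError("TRAINING_RUNTIME environment variable is required")

client = TrainerClient()
try:
    torch_runtime = client.get_runtime(training_runtime_name)
except Exception as e:
    raise RuntimeError(f"Runtime '{training_runtime_name}' not found or not accessible") from e
print(f"Got runtime: {torch_runtime.name}")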
13 changes: 11 additions & 2 deletions tests/trainer/resources/osft.ipynb
@@ -400,15 +400,19 @@
"metadata": {},
"outputs": [],
"source": [
"training_runtime_name = os.getenv(\"TRAINING_RUNTIME\")\n",
"if not training_runtime_name:\n",
" raise RuntimeError(\"TRAINING_RUNTIME environment variable is required\")\n",
"\n",
"th_runtime = None\n",
"for runtime in client.list_runtimes():\n",
" if runtime.name == \"training-hub03-cuda128-torch28-py312\":\n",
" if runtime.name == training_runtime_name:\n",
" th_runtime = runtime\n",
" print(\"Found runtime: \" + str(th_runtime))\n",
" break\n",
"\n",
"if th_runtime is None:\n",
" raise RuntimeError(\"Required runtime 'training-hub03-cuda128-torch28-py312' not found\")"
" raise RuntimeError(f\"Required runtime '{training_runtime_name}' not found\")"
]
},
{
@@ -438,6 +442,11 @@
" \"XDG_CACHE_HOME\": \"/opt/app-root/src/.cache\",\n",
" \"NCCL_DEBUG\": \"INFO\",\n",
" },\n",
" resources_per_node={\n",
" \"cpu\": 4,\n",
" \"memory\": \"32Gi\",\n",
" \"nvidia.com/gpu\": 1\n",
" },\n",
" ),\n",
" options=[\n",
" PodTemplateOverrides(\n",
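Both training-hub notebooks now request explicit per-node resources instead of relying on the runtime's defaults, so each TrainJob node gets guaranteed CPU, memory, and one NVIDIA GPU. A hedged sketch of where that dictionary sits in a submission, assuming the CustomTrainer/train() interface these notebooks appear to use (the import path, train_func, and num_nodes below are hypothetical; only env, resources_per_node, and PodTemplateOverrides are visible in the diff):

import os

from kubeflow.trainer import CustomTrainer, TrainerClient  # assumed import path


def train_func():
    # Hypothetical training entry point; the notebooks define their own.
    pass


client = TrainerClient()
th_runtime = client.get_runtime(os.environ["TRAINING_RUNTIME"])

job_name = client.train(
    runtime=th_runtime,
    trainer=CustomTrainer(
        func=train_func,
        num_nodes=2,  # hypothetical node count
        env={"NCCL_DEBUG": "INFO"},
        # Per-node requests, matching the values the notebooks now pin:
        resources_per_node={
            "cpu": 4,
            "memory": "32Gi",
            "nvidia.com/gpu": 1,
        },
    ),
)
print(f"Submitted TrainJob: {job_name}")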
7 changes: 6 additions & 1 deletion tests/trainer/resources/rhai_features.ipynb
@@ -192,8 +192,13 @@
"metadata": {},
"outputs": [],
"source": [
"training_runtime_name = os.getenv(\"TRAINING_RUNTIME\", \"torch-distributed\")\n",
"training_runtime_name = os.getenv(\"TRAINING_RUNTIME\")\n",
"if not training_runtime_name:\n",
" raise RuntimeError(\"TRAINING_RUNTIME environment variable is required\")\n",
"\n",
"torch_runtime = trainer_client.get_runtime(training_runtime_name)\n",
"if torch_runtime is None:\n",
" raise RuntimeError(f\"Required runtime '{training_runtime_name}' not found\")\n",
"print(f\"Got runtime: {torch_runtime.name}\")"
]
},
13 changes: 11 additions & 2 deletions tests/trainer/resources/sft.ipynb
@@ -474,15 +474,19 @@
"outputs": [],
"source": [
"# Find the TrainingHub runtime\n",
"training_runtime_name = os.getenv(\"TRAINING_RUNTIME\")\n",
"if not training_runtime_name:\n",
" raise RuntimeError(\"TRAINING_RUNTIME environment variable is required\")\n",
"\n",
"th_runtime = None\n",
"for runtime in client.list_runtimes():\n",
" if runtime.name == \"training-hub03-cuda128-torch28-py312\":\n",
" if runtime.name == training_runtime_name:\n",
" th_runtime = runtime\n",
" print(\"Found runtime: \" + str(th_runtime))\n",
" break\n",
"\n",
"if th_runtime is None:\n",
" raise RuntimeError(\"Required runtime 'training-hub03-cuda128-torch28-py312' not found\")"
" raise RuntimeError(f\"Required runtime '{training_runtime_name}' not found\")"
]
},
{
@@ -512,6 +516,11 @@
" \"XDG_CACHE_HOME\": \"/opt/app-root/src/.cache\",\n",
" \"NCCL_DEBUG\": \"INFO\",\n",
" },\n",
" resources_per_node={\n",
" \"cpu\": 4,\n",
" \"memory\": \"32Gi\",\n",
" \"nvidia.com/gpu\": 1\n",
" },\n",
" ),\n",
" options=[\n",
" PodTemplateOverrides(\n",
2 changes: 2 additions & 0 deletions tests/trainer/sdk_tests/fashion_mnist_tests.go
@@ -89,11 +89,13 @@ func RunFashionMnistCpuDistributedTraining(t *testing.T) {
"export AWS_DEFAULT_ENDPOINT='%s'; export AWS_ACCESS_KEY_ID='%s'; "+
"export AWS_SECRET_ACCESS_KEY='%s'; export AWS_STORAGE_BUCKET='%s'; "+
"export AWS_STORAGE_BUCKET_MNIST_DIR='%s'; "+
"export TRAINING_RUNTIME='%s'; "+
"python -m pip install --quiet --no-cache-dir ipykernel papermill boto3==1.34.162 && "+
"if python -m papermill -k python3 /opt/app-root/notebooks/%s /opt/app-root/src/out.ipynb --log-output; "+
"then echo 'NOTEBOOK_STATUS: SUCCESS'; else echo 'NOTEBOOK_STATUS: FAILURE'; fi; sleep infinity",
support.GetOpenShiftApiUrl(test), userToken, namespace.Name, rwxPvc.Name,
endpoint, accessKey, secretKey, bucket, prefix,
+trainerutils.DefaultClusterTrainingRuntime,
notebookName,
)
command := []string{"/bin/sh", "-c", shellCmd}
2 changes: 2 additions & 0 deletions tests/trainer/sdk_tests/osft_traininghub_tests.go
@@ -91,11 +91,13 @@ func RunOsftTrainingHubMultiGpuDistributedTraining(t *testing.T) {
"export AWS_SECRET_ACCESS_KEY='%s'; "+
"export AWS_STORAGE_BUCKET='%s'; "+
"export AWS_STORAGE_BUCKET_OSFT_DIR='%s'; "+
"export TRAINING_RUNTIME='%s'; "+
"python -m pip install --quiet --no-cache-dir --break-system-packages ipykernel papermill boto3==1.34.162 && "+
"if python -m papermill -k python3 /opt/app-root/notebooks/%s /opt/app-root/src/out.ipynb --log-output; "+
"then echo 'NOTEBOOK_STATUS: SUCCESS'; else echo 'NOTEBOOK_STATUS: FAILURE'; fi; sleep infinity",
support.GetOpenShiftApiUrl(test), userToken, namespace.Name, rwxPvc.Name,
endpoint, accessKey, secretKey, bucket, prefix,
+trainerutils.DefaultTrainingHubRuntime,
osftNotebookName,
)
command := []string{"/bin/sh", "-c", shellCmd}
4 changes: 2 additions & 2 deletions tests/trainer/sdk_tests/rhai_features_tests.go
@@ -144,11 +144,11 @@ func runRhaiFeaturesTestWithConfig(t *testing.T, config RhaiFeatureConfig) {

// Determine GPU resource label (empty for CPU) and training runtime
gpuResourceLabel := ""
trainingRuntime := "torch-distributed" // Default for CPU and NVIDIA
trainingRuntime := trainerutils.DefaultClusterTrainingRuntime // Default for CPU and NVIDIA
if config.Accelerator.IsGpu() {
gpuResourceLabel = config.Accelerator.ResourceLabel
if config.Accelerator == AMD {
trainingRuntime = "torch-distributed-rocm"
trainingRuntime = trainerutils.DefaultClusterTrainingRuntimeROCm
}
}

2 changes: 2 additions & 0 deletions tests/trainer/sdk_tests/sft_traininghub_tests.go
@@ -91,11 +91,13 @@ func RunSftTrainingHubMultiGpuDistributedTraining(t *testing.T) {
"export AWS_SECRET_ACCESS_KEY='%s'; "+
"export AWS_STORAGE_BUCKET='%s'; "+
"export AWS_STORAGE_BUCKET_SFT_DIR='%s'; "+
"export TRAINING_RUNTIME='%s'; "+
"python -m pip install --quiet --no-cache-dir --break-system-packages ipykernel papermill boto3==1.34.162 && "+
"if python -m papermill -k python3 /opt/app-root/notebooks/%s /opt/app-root/src/out.ipynb --log-output; "+
"then echo 'NOTEBOOK_STATUS: SUCCESS'; else echo 'NOTEBOOK_STATUS: FAILURE'; fi; sleep infinity",
support.GetOpenShiftApiUrl(test), userToken, namespace.Name, rwxPvc.Name,
endpoint, accessKey, secretKey, bucket, prefix,
+trainerutils.DefaultTrainingHubRuntime,
sftNotebookName,
)
command := []string{"/bin/sh", "-c", shellCmd}
7 changes: 4 additions & 3 deletions tests/trainer/trainer_kueue_integration_test.go
@@ -34,6 +34,7 @@ import (

. "github.com/opendatahub-io/distributed-workloads/tests/common"
. "github.com/opendatahub-io/distributed-workloads/tests/common/support"
trainerutils "github.com/opendatahub-io/distributed-workloads/tests/trainer/utils"
)

const (
@@ -145,7 +146,7 @@ func TestKueueDefaultLocalQueueLabelInjection(t *testing.T) {
},
Spec: trainerv1alpha1.TrainJobSpec{
RuntimeRef: trainerv1alpha1.RuntimeRef{
-Name: defaultClusterTrainingRuntime,
+Name: trainerutils.DefaultClusterTrainingRuntime,
},
Trainer: &trainerv1alpha1.Trainer{
Command: []string{"echo", "test"},
@@ -223,7 +224,7 @@ func TestKueueWorkloadPreemptionSuspendsTrainJob(t *testing.T) {
},
Spec: trainerv1alpha1.TrainJobSpec{
RuntimeRef: trainerv1alpha1.RuntimeRef{
-Name: defaultClusterTrainingRuntime,
+Name: trainerutils.DefaultClusterTrainingRuntime,
},
Trainer: &trainerv1alpha1.Trainer{
Command: []string{"sleep", "120"},
@@ -305,7 +306,7 @@ func TestKueueWorkloadInadmissibleWithNonExistentLocalQueue(t *testing.T) {
},
Spec: trainerv1alpha1.TrainJobSpec{
RuntimeRef: trainerv1alpha1.RuntimeRef{
-Name: defaultClusterTrainingRuntime,
+Name: trainerutils.DefaultClusterTrainingRuntime,
},
Trainer: &trainerv1alpha1.Trainer{
Command: []string{"echo", "test"},
44 changes: 44 additions & 0 deletions tests/trainer/utils/utils_runtimes.go
@@ -0,0 +1,44 @@
/*
Copyright 2025.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package trainer

// ClusterTrainingRuntime represents a ClusterTrainingRuntime with its expected RHOAI image
type ClusterTrainingRuntime struct {
Name string
RHOAIImage string
}

const (
// DefaultClusterTrainingRuntime is the default runtime for CUDA accelerators
DefaultClusterTrainingRuntime = "torch-distributed"

// DefaultClusterTrainingRuntimeROCm is the default runtime for AMD/ROCm accelerators
DefaultClusterTrainingRuntimeROCm = "torch-distributed-rocm"

// DefaultTrainingHubRuntime is the default runtime for SFT/OSFT workloads
DefaultTrainingHubRuntime = "training-hub"
)

// ExpectedRuntimes is the list of expected ClusterTrainingRuntimes on the cluster
var ExpectedRuntimes = []ClusterTrainingRuntime{
{Name: DefaultClusterTrainingRuntime, RHOAIImage: "odh-training-cuda128-torch29-py312-rhel9"},
{Name: DefaultClusterTrainingRuntimeROCm, RHOAIImage: "odh-training-rocm64-torch29-py312-rhel9"},
{Name: "torch-distributed-cuda128-torch29-py312", RHOAIImage: "odh-training-cuda128-torch29-py312-rhel9"},
{Name: "torch-distributed-rocm64-torch29-py312", RHOAIImage: "odh-training-rocm64-torch29-py312-rhel9"},
{Name: DefaultTrainingHubRuntime, RHOAIImage: "odh-training-cuda128-torch29-py312-rhel9"},
{Name: "training-hub-th05-cuda128-torch29-py312", RHOAIImage: "odh-training-cuda128-torch29-py312-rhel9"},
}
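These shared constants give every Go test a single source of truth for runtime names and their expected RHOAI images. For an equivalent check from the notebook side, a hedged Python sketch that verifies the same names exist on the cluster using the list_runtimes() call the notebooks already rely on (the import path and client construction are assumptions; the names mirror ExpectedRuntimes above):

from kubeflow.trainer import TrainerClient  # assumed import path

client = TrainerClient()

# Expected ClusterTrainingRuntime names, mirroring trainerutils.ExpectedRuntimes.
expected = {
    "torch-distributed",
    "torch-distributed-rocm",
    "torch-distributed-cuda128-torch29-py312",
    "torch-distributed-rocm64-torch29-py312",
    "training-hub",
    "training-hub-th05-cuda128-torch29-py312",
}
found = {runtime.name for runtime in client.list_runtimes()}
missing = sorted(expected - found)
if missing:
    raise RuntimeError(f"Missing expected runtimes: {missing}")
print("All expected ClusterTrainingRuntimes are present")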