
Commit c117324

Merge remote-tracking branch 'upstream/main'
2 parents e948802 + 494f9f2


6 files changed (+302 −19 lines)


tests/kfto/kfto_mnist_sdk_test.go

Lines changed: 41 additions & 1 deletion
@@ -17,13 +17,18 @@ limitations under the License.
 package kfto
 
 import (
+	"fmt"
 	"strings"
 	"testing"
 	"time"
 
 	. "github.com/onsi/gomega"
 
+	corev1 "k8s.io/api/core/v1"
 	v1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"sigs.k8s.io/kueue/apis/kueue/v1beta1"
 
 	. "github.com/opendatahub-io/distributed-workloads/tests/common"
 	. "github.com/opendatahub-io/distributed-workloads/tests/common/support"
@@ -42,6 +47,41 @@ func TestMnistSDK(t *testing.T) {
 	// Create role binding with Namespace specific admin cluster role
 	CreateUserRoleBindingWithClusterRole(test, userName, namespace.Name, "admin")
 
+	// Create Kueue resources
+	resourceFlavor := CreateKueueResourceFlavor(test, v1beta1.ResourceFlavorSpec{})
+	defer test.Client().Kueue().KueueV1beta1().ResourceFlavors().Delete(test.Ctx(), resourceFlavor.Name, metav1.DeleteOptions{})
+	cqSpec := v1beta1.ClusterQueueSpec{
+		NamespaceSelector: &metav1.LabelSelector{},
+		ResourceGroups: []v1beta1.ResourceGroup{
+			{
+				CoveredResources: []corev1.ResourceName{corev1.ResourceCPU, corev1.ResourceMemory, corev1.ResourceName(NVIDIA.ResourceLabel)},
+				Flavors: []v1beta1.FlavorQuotas{
+					{
+						Name: v1beta1.ResourceFlavorReference(resourceFlavor.Name),
+						Resources: []v1beta1.ResourceQuota{
+							{
+								Name:         corev1.ResourceCPU,
+								NominalQuota: resource.MustParse("1"),
+							},
+							{
+								Name:         corev1.ResourceMemory,
+								NominalQuota: resource.MustParse("4Gi"),
+							},
+							{
+								Name:         corev1.ResourceName(NVIDIA.ResourceLabel),
+								NominalQuota: resource.MustParse(fmt.Sprint(0)),
+							},
+						},
+					},
+				},
+			},
+		},
+	}
+
+	clusterQueue := CreateKueueClusterQueue(test, cqSpec)
+	defer test.Client().Kueue().KueueV1beta1().ClusterQueues().Delete(test.Ctx(), clusterQueue.Name, metav1.DeleteOptions{})
+	CreateKueueLocalQueue(test, namespace.Name, clusterQueue.Name, AsDefaultQueue)
+
 	requiredChangesInNotebook := map[string]string{
 		"${api_url}":  GetOpenShiftApiUrl(test),
 		"${password}": userToken,
@@ -80,7 +120,7 @@ func TestMnistSDK(t *testing.T) {
 	}()
 
 	// Make sure pytorch job is created
-	test.Eventually(PyTorchJob(test, namespace.Name, "pytorch-ddp")).
+	test.Eventually(PyTorchJob(test, namespace.Name, "pytorch-ddp"), TestTimeoutDouble).
 		Should(WithTransform(PyTorchJobConditionRunning, Equal(v1.ConditionTrue)))
 
 	// Make sure that the job eventually succeeds
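
Note on the pattern: the Kueue wiring added above repeats across every test file in this commit — a ResourceFlavor, a ClusterQueue holding nominal quotas for that flavor, and a namespaced LocalQueue that workloads reference by label. The following self-contained sketch shows that object graph using the upstream sigs.k8s.io/kueue v1beta1 types; the names ("default-flavor", "test-cq", "test-lq") and quota values are illustrative, not taken from the tests.

	package kueuesetup

	import (
		corev1 "k8s.io/api/core/v1"
		"k8s.io/apimachinery/pkg/api/resource"
		metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
		kueuev1beta1 "sigs.k8s.io/kueue/apis/kueue/v1beta1"
	)

	// buildQueueObjects wires together the three objects the tests create:
	// flavor <- cluster queue <- local queue. All names are illustrative.
	func buildQueueObjects(namespace string) (*kueuev1beta1.ResourceFlavor, *kueuev1beta1.ClusterQueue, *kueuev1beta1.LocalQueue) {
		flavor := &kueuev1beta1.ResourceFlavor{
			ObjectMeta: metav1.ObjectMeta{Name: "default-flavor"},
			Spec:       kueuev1beta1.ResourceFlavorSpec{}, // empty spec: matches all nodes
		}

		clusterQueue := &kueuev1beta1.ClusterQueue{
			ObjectMeta: metav1.ObjectMeta{Name: "test-cq"},
			Spec: kueuev1beta1.ClusterQueueSpec{
				NamespaceSelector: &metav1.LabelSelector{}, // admit workloads from any namespace
				ResourceGroups: []kueuev1beta1.ResourceGroup{{
					CoveredResources: []corev1.ResourceName{corev1.ResourceCPU, corev1.ResourceMemory},
					Flavors: []kueuev1beta1.FlavorQuotas{{
						Name: kueuev1beta1.ResourceFlavorReference(flavor.Name),
						Resources: []kueuev1beta1.ResourceQuota{
							{Name: corev1.ResourceCPU, NominalQuota: resource.MustParse("8")},
							{Name: corev1.ResourceMemory, NominalQuota: resource.MustParse("16Gi")},
						},
					}},
				}},
			},
		}

		localQueue := &kueuev1beta1.LocalQueue{
			ObjectMeta: metav1.ObjectMeta{Name: "test-lq", Namespace: namespace},
			Spec: kueuev1beta1.LocalQueueSpec{
				ClusterQueue: kueuev1beta1.ClusterQueueReference(clusterQueue.Name),
			},
		}

		return flavor, clusterQueue, localQueue
	}

A job opts in by carrying the kueue.x-k8s.io/queue-name label with the LocalQueue's name, as the PyTorchJob builders in the files below do; Kueue then holds the pods until the workload's requests fit inside the ClusterQueue's nominal quota.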

tests/kfto/kfto_mnist_training_test.go

Lines changed: 65 additions & 6 deletions
@@ -27,6 +27,7 @@ import (
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"sigs.k8s.io/kueue/apis/kueue/v1beta1"
 
 	. "github.com/opendatahub-io/distributed-workloads/tests/common"
 	. "github.com/opendatahub-io/distributed-workloads/tests/common/support"
@@ -82,10 +83,65 @@ func runKFTOPyTorchMnistJob(t *testing.T, accelerator Accelerator, image string,
 		"requirements.txt": requirementsFileName,
 	})
 
+	// Create Kueue resources
+	resourceFlavor := CreateKueueResourceFlavor(test, v1beta1.ResourceFlavorSpec{})
+	defer test.Client().Kueue().KueueV1beta1().ResourceFlavors().Delete(test.Ctx(), resourceFlavor.Name, metav1.DeleteOptions{})
+	cqSpec := v1beta1.ClusterQueueSpec{
+		NamespaceSelector: &metav1.LabelSelector{},
+		ResourceGroups: []v1beta1.ResourceGroup{
+			{
+				CoveredResources: []corev1.ResourceName{corev1.ResourceName("cpu"), corev1.ResourceName("memory")},
+				Flavors: []v1beta1.FlavorQuotas{
+					{
+						Name: v1beta1.ResourceFlavorReference(resourceFlavor.Name),
+						Resources: []v1beta1.ResourceQuota{
+							{
+								Name:         corev1.ResourceCPU,
+								NominalQuota: resource.MustParse("8"),
+							},
+							{
+								Name:         corev1.ResourceMemory,
+								NominalQuota: resource.MustParse("18Gi"),
+							},
+						},
+					},
+				},
+			},
+		},
+	}
+
+	if accelerator.IsGpu() {
+		numGpus := (workerReplicas + 1) * numProcPerNode
+		cqSpec.ResourceGroups[0].CoveredResources = append(
+			cqSpec.ResourceGroups[0].CoveredResources,
+			corev1.ResourceName(accelerator.ResourceLabel),
+		)
+		cqSpec.ResourceGroups[0].Flavors[0].Resources = append(
+			cqSpec.ResourceGroups[0].Flavors[0].Resources,
+			v1beta1.ResourceQuota{
+				Name:         corev1.ResourceName(accelerator.ResourceLabel),
+				NominalQuota: resource.MustParse(fmt.Sprint(numGpus)),
+			},
+		)
+	}
+
+	clusterQueue := CreateKueueClusterQueue(test, cqSpec)
+	defer test.Client().Kueue().KueueV1beta1().ClusterQueues().Delete(test.Ctx(), clusterQueue.Name, metav1.DeleteOptions{})
+	localQueue := CreateKueueLocalQueue(test, namespace.Name, clusterQueue.Name, AsDefaultQueue)
+
 	// Create training PyTorch job
-	tuningJob := createKFTOPyTorchMnistJob(test, namespace.Name, *config, accelerator, workerReplicas, numProcPerNode, image)
+	tuningJob := createKFTOPyTorchMnistJob(test, namespace.Name, *config, accelerator, workerReplicas, numProcPerNode, image, localQueue)
 	defer test.Client().Kubeflow().KubeflowV1().PyTorchJobs(namespace.Name).Delete(test.Ctx(), tuningJob.Name, *metav1.NewDeleteOptions(0))
 
+	// Make sure the Workload is created and running
+	test.Eventually(GetKueueWorkloads(test, namespace.Name), TestTimeoutMedium).
+		Should(
+			And(
+				HaveLen(1),
+				ContainElement(WithTransform(KueueWorkloadAdmitted, BeTrueBecause("Workload failed to be admitted"))),
+			),
+		)
+
 	// Make sure the PyTorch job is running
 	test.Eventually(PyTorchJob(test, namespace.Name, tuningJob.Name), TestTimeoutDouble).
 		Should(WithTransform(PyTorchJobConditionRunning, Equal(corev1.ConditionTrue)))
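
Quota sizing in the hunk above: the ClusterQueue needs GPU quota for every training process, so the test computes (workerReplicas + 1) × numProcPerNode — the master plus each worker replica, times the processes per node. With illustrative values workerReplicas = 1 and numProcPerNode = 2, that yields (1 + 1) × 2 = 4 GPUs. The new assertion then waits for Kueue to mark the generated Workload as admitted, i.e. as fitting inside that quota. As a rough sketch of what the KueueWorkloadAdmitted helper presumably checks (an assumption, not the repository's actual implementation), using the upstream Kueue API:

	package kueuecheck

	import (
		apimeta "k8s.io/apimachinery/pkg/api/meta"
		kueuev1beta1 "sigs.k8s.io/kueue/apis/kueue/v1beta1"
	)

	// workloadAdmitted reports whether Kueue has admitted the Workload, by
	// reading its Admitted status condition. Sketch only — the test helper's
	// real implementation may differ.
	func workloadAdmitted(wl *kueuev1beta1.Workload) bool {
		return apimeta.IsStatusConditionTrue(wl.Status.Conditions, kueuev1beta1.WorkloadAdmitted)
	}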
@@ -96,7 +152,7 @@ func runKFTOPyTorchMnistJob(t *testing.T, accelerator Accelerator, image string,
 
 }
 
-func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.ConfigMap, accelerator Accelerator, workerReplicas int, numProcPerNode int, baseImage string) *kftov1.PyTorchJob {
+func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.ConfigMap, accelerator Accelerator, workerReplicas int, numProcPerNode int, baseImage string, localQueue *v1beta1.LocalQueue) *kftov1.PyTorchJob {
 	var backend string
 	if accelerator.IsGpu() {
 		backend = "nccl"
@@ -117,6 +173,9 @@ func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.Config
 		},
 		ObjectMeta: metav1.ObjectMeta{
 			GenerateName: "kfto-mnist-",
+			Labels: map[string]string{
+				"kueue.x-k8s.io/queue-name": localQueue.Name,
+			},
 		},
 		Spec: kftov1.PyTorchJobSpec{
 			PyTorchReplicaSpecs: map[kftov1.ReplicaType]*kftov1.ReplicaSpec{
@@ -177,11 +236,11 @@ func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.Config
 					Resources: corev1.ResourceRequirements{
 						Requests: corev1.ResourceList{
 							corev1.ResourceCPU:    resource.MustParse(fmt.Sprintf("%d", numProcPerNode)),
-							corev1.ResourceMemory: resource.MustParse("6Gi"),
+							corev1.ResourceMemory: resource.MustParse("4Gi"),
 						},
 						Limits: corev1.ResourceList{
 							corev1.ResourceCPU:    resource.MustParse(fmt.Sprintf("%d", numProcPerNode)),
-							corev1.ResourceMemory: resource.MustParse("6Gi"),
+							corev1.ResourceMemory: resource.MustParse("4Gi"),
 						},
 					},
 				},
@@ -273,11 +332,11 @@ func createKFTOPyTorchMnistJob(test Test, namespace string, config corev1.Config
 					Resources: corev1.ResourceRequirements{
 						Requests: corev1.ResourceList{
 							corev1.ResourceCPU:    resource.MustParse(fmt.Sprintf("%d", numProcPerNode)),
-							corev1.ResourceMemory: resource.MustParse("6Gi"),
+							corev1.ResourceMemory: resource.MustParse("4Gi"),
 						},
 						Limits: corev1.ResourceList{
 							corev1.ResourceCPU:    resource.MustParse(fmt.Sprintf("%d", numProcPerNode)),
-							corev1.ResourceMemory: resource.MustParse("6Gi"),
+							corev1.ResourceMemory: resource.MustParse("4Gi"),
 						},
 					},
 				},

tests/kfto/kfto_pytorchjob_failed_test.go

Lines changed: 37 additions & 2 deletions
@@ -9,6 +9,7 @@ import (
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"sigs.k8s.io/kueue/apis/kueue/v1beta1"
 
 	. "github.com/opendatahub-io/distributed-workloads/tests/common"
 	. "github.com/opendatahub-io/distributed-workloads/tests/common/support"
@@ -30,22 +31,56 @@ func runFailedPyTorchJobTest(t *testing.T, image string) {
 	// Create a namespace
 	namespace := test.NewTestNamespace()
 
+	// Create Kueue resources
+	resourceFlavor := CreateKueueResourceFlavor(test, v1beta1.ResourceFlavorSpec{})
+	defer test.Client().Kueue().KueueV1beta1().ResourceFlavors().Delete(test.Ctx(), resourceFlavor.Name, metav1.DeleteOptions{})
+	cqSpec := v1beta1.ClusterQueueSpec{
+		NamespaceSelector: &metav1.LabelSelector{},
+		ResourceGroups: []v1beta1.ResourceGroup{
+			{
+				CoveredResources: []corev1.ResourceName{corev1.ResourceName("cpu"), corev1.ResourceName("memory")},
+				Flavors: []v1beta1.FlavorQuotas{
+					{
+						Name: v1beta1.ResourceFlavorReference(resourceFlavor.Name),
+						Resources: []v1beta1.ResourceQuota{
+							{
+								Name:         corev1.ResourceCPU,
+								NominalQuota: resource.MustParse("8"),
+							},
+							{
+								Name:         corev1.ResourceMemory,
+								NominalQuota: resource.MustParse("18Gi"),
+							},
+						},
+					},
+				},
+			},
+		},
+	}
+
+	clusterQueue := CreateKueueClusterQueue(test, cqSpec)
+	defer test.Client().Kueue().KueueV1beta1().ClusterQueues().Delete(test.Ctx(), clusterQueue.Name, metav1.DeleteOptions{})
+	localQueue := CreateKueueLocalQueue(test, namespace.Name, clusterQueue.Name, AsDefaultQueue)
+
 	// Create training PyTorch job
-	tuningJob := createFailedPyTorchJob(test, namespace.Name, image)
+	tuningJob := createFailedPyTorchJob(test, namespace.Name, image, localQueue)
 
 	// Make sure the PyTorch job is failed
 	test.Eventually(PyTorchJob(test, namespace.Name, tuningJob.Name), TestTimeoutDouble).
 		Should(WithTransform(PyTorchJobConditionFailed, Equal(corev1.ConditionTrue)))
 }
 
-func createFailedPyTorchJob(test Test, namespace string, baseImage string) *kftov1.PyTorchJob {
+func createFailedPyTorchJob(test Test, namespace string, baseImage string, localQueue *v1beta1.LocalQueue) *kftov1.PyTorchJob {
 	tuningJob := &kftov1.PyTorchJob{
 		TypeMeta: metav1.TypeMeta{
 			APIVersion: corev1.SchemeGroupVersion.String(),
 			Kind:       "PyTorchJob",
 		},
 		ObjectMeta: metav1.ObjectMeta{
 			GenerateName: "kfto-sft-",
+			Labels: map[string]string{
+				"kueue.x-k8s.io/queue-name": localQueue.Name,
+			},
 		},
 		Spec: kftov1.PyTorchJobSpec{
 			PyTorchReplicaSpecs: map[kftov1.ReplicaType]*kftov1.ReplicaSpec{

tests/kfto/kfto_training_test.go

Lines changed: 53 additions & 2 deletions
@@ -27,6 +27,7 @@ import (
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"sigs.k8s.io/kueue/apis/kueue/v1beta1"
 
 	. "github.com/opendatahub-io/distributed-workloads/tests/common"
 	. "github.com/opendatahub-io/distributed-workloads/tests/common/support"
@@ -78,6 +79,53 @@ func runKFTOPyTorchJob(t *testing.T, image string, gpu Accelerator, numGpus, num
 	// Create a namespace
 	namespace := test.CreateOrGetTestNamespace().Name
 
+	// Create Kueue resources
+	resourceFlavor := CreateKueueResourceFlavor(test, v1beta1.ResourceFlavorSpec{})
+	fmt.Sprintln(gpu.ResourceLabel)
+	defer test.Client().Kueue().KueueV1beta1().ResourceFlavors().Delete(test.Ctx(), resourceFlavor.Name, metav1.DeleteOptions{})
+	cqSpec := v1beta1.ClusterQueueSpec{
+		NamespaceSelector: &metav1.LabelSelector{},
+		ResourceGroups: []v1beta1.ResourceGroup{
+			{
+				CoveredResources: []corev1.ResourceName{corev1.ResourceName("cpu"), corev1.ResourceName("memory")},
+				Flavors: []v1beta1.FlavorQuotas{
+					{
+						Name: v1beta1.ResourceFlavorReference(resourceFlavor.Name),
+						Resources: []v1beta1.ResourceQuota{
+							{
+								Name:         corev1.ResourceCPU,
+								NominalQuota: resource.MustParse("8"),
+							},
+							{
+								Name:         corev1.ResourceMemory,
+								NominalQuota: resource.MustParse("32Gi"),
+							},
+						},
+					},
+				},
+			},
+		},
+	}
+
+	if gpu.IsGpu() {
+		numberOfGpus := (numberOfWorkerNodes + 1) * numGpus
+		cqSpec.ResourceGroups[0].CoveredResources = append(
+			cqSpec.ResourceGroups[0].CoveredResources,
+			corev1.ResourceName(gpu.ResourceLabel),
+		)
+		cqSpec.ResourceGroups[0].Flavors[0].Resources = append(
+			cqSpec.ResourceGroups[0].Flavors[0].Resources,
+			v1beta1.ResourceQuota{
+				Name:         corev1.ResourceName(gpu.ResourceLabel),
+				NominalQuota: resource.MustParse(fmt.Sprint(numberOfGpus)),
+			},
+		)
+	}
+
+	clusterQueue := CreateKueueClusterQueue(test, cqSpec)
+	defer test.Client().Kueue().KueueV1beta1().ClusterQueues().Delete(test.Ctx(), clusterQueue.Name, metav1.DeleteOptions{})
+	localQueue := CreateKueueLocalQueue(test, namespace, clusterQueue.Name, AsDefaultQueue)
+
 	// Create a ConfigMap with training script
 	configData := map[string][]byte{
 		"hf_llm_training.py": readFile(test, "resources/hf_llm_training.py"),
@@ -89,7 +137,7 @@ func runKFTOPyTorchJob(t *testing.T, image string, gpu Accelerator, numGpus, num
 	defer test.Client().Core().CoreV1().PersistentVolumeClaims(namespace).Delete(test.Ctx(), outputPvc.Name, metav1.DeleteOptions{})
 
 	// Create training PyTorch job
-	tuningJob := createKFTOPyTorchJob(test, namespace, *config, gpu, numGpus, numberOfWorkerNodes, outputPvc.Name, image)
+	tuningJob := createKFTOPyTorchJob(test, namespace, *config, gpu, numGpus, numberOfWorkerNodes, outputPvc.Name, image, localQueue)
 	defer test.Client().Kubeflow().KubeflowV1().PyTorchJobs(namespace).Delete(test.Ctx(), tuningJob.Name, *metav1.NewDeleteOptions(0))
 
 	// Make sure the PyTorch job is running
@@ -122,14 +170,17 @@ func runKFTOPyTorchJob(t *testing.T, image string, gpu Accelerator, numGpus, num
 	test.T().Logf("PytorchJob %s/%s ran successfully", tuningJob.Namespace, tuningJob.Name)
 }
 
-func createKFTOPyTorchJob(test Test, namespace string, config corev1.ConfigMap, gpu Accelerator, numGpus, numberOfWorkerNodes int, outputPvcName string, baseImage string) *kftov1.PyTorchJob {
+func createKFTOPyTorchJob(test Test, namespace string, config corev1.ConfigMap, gpu Accelerator, numGpus, numberOfWorkerNodes int, outputPvcName string, baseImage string, localQueue *v1beta1.LocalQueue) *kftov1.PyTorchJob {
 	tuningJob := &kftov1.PyTorchJob{
 		TypeMeta: metav1.TypeMeta{
 			APIVersion: corev1.SchemeGroupVersion.String(),
 			Kind:       "PyTorchJob",
 		},
 		ObjectMeta: metav1.ObjectMeta{
 			GenerateName: "kfto-llm-",
+			Labels: map[string]string{
+				"kueue.x-k8s.io/default-queue": localQueue.Name,
+			},
 		},
 		Spec: kftov1.PyTorchJobSpec{
 			PyTorchReplicaSpecs: map[kftov1.ReplicaType]*kftov1.ReplicaSpec{
