Skip to content

Commit 22ecc73

Browse files
authored
Improvements in e2e testing (#39)
1 parent 30054a8 commit 22ecc73

File tree

10 files changed

+255
-27
lines changed

10 files changed

+255
-27
lines changed

hack/e2e-util.sh

Lines changed: 29 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,19 @@
1515
export LOG_LEVEL=${TEST_LOG_LEVEL:-2}
1616
export CLEANUP_CLUSTER=${CLEANUP_CLUSTER:-"true"}
1717
export CLUSTER_CONTEXT="--name test"
18-
export IMAGE_ECHOSERVER="quay.io/project-codeflare/echo-server:1.0"
19-
export IMAGE_BUSY_BOX_LATEST="quay.io/project-codeflare/busybox:latest"
2018
export KIND_OPT=${KIND_OPT:=" --config ${ROOT_DIR}/hack/kind-config.yaml"}
2119
export KA_BIN=_output/bin
2220
export WAIT_TIME="20s"
2321
export KUTTL_VERSION=0.15.0
22+
export KUBEFLOW_VERSION=v1.7.0
23+
export CERTMANAGER_VERSION=v1.13.3
2424
DUMP_LOGS="true"
2525

26+
# These are images used by the e2e tests.
27+
# Pull and kind load to avoid long delays during testing
28+
export IMAGE_ECHOSERVER="quay.io/project-codeflare/echo-server:1.0"
29+
export IMAGE_BUSY_BOX_LATEST="quay.io/project-codeflare/busybox:latest"
30+
2631
function update_test_host {
2732

2833
local arch="$(go env GOARCH)"
@@ -109,19 +114,15 @@ function check_prerequisites {
109114
}
110115

111116
function pull_images {
112-
docker pull ${IMAGE_ECHOSERVER}
113-
if [ $? -ne 0 ]
114-
then
115-
echo "Failed to pull ${IMAGE_ECHOSERVER}"
116-
exit 1
117-
fi
118-
119-
docker pull ${IMAGE_BUSY_BOX_LATEST}
120-
if [ $? -ne 0 ]
121-
then
122-
echo "Failed to pull ${IMAGE_BUSY_BOX_LATEST}"
123-
exit 1
124-
fi
117+
for image in ${IMAGE_ECHOSERVER} ${IMAGE_BUSY_BOX_LATEST}
118+
do
119+
docker pull $image
120+
if [ $? -ne 0 ]
121+
then
122+
echo "Failed to pull $image"
123+
exit 1
124+
fi
125+
done
125126

126127
docker images
127128
}
@@ -149,14 +150,26 @@ function kind_up_cluster {
149150

150151
function configure_cluster {
151152
echo "Installing cert-manager"
152-
kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.13.3/cert-manager.yaml
153+
kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/$CERTMANAGER_VERSION/cert-manager.yaml
153154

154155
# sleep to ensure cert-manager is fully functional
155156
echo "Waiting for pod in the cert-manager namespace to become ready"
156157
while [[ $(kubectl get pods -n cert-manager -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}' | tr ' ' '\n' | sort -u) != "True" ]]
157158
do
158159
echo -n "." && sleep 1;
159160
done
161+
echo ""
162+
163+
echo "Installing Kubeflow operator version $KUBEFLOW_VERSION"
164+
kubectl apply -k "github.com/kubeflow/training-operator/manifests/overlays/standalone?ref=$KUBEFLOW_VERSION"
165+
166+
# Sleep until the kubeflow operator is running
167+
echo "Waiting for pods in the kubeflow namespace to become ready"
168+
while [[ $(kubectl get pods -n kubeflow -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}' | tr ' ' '\n' | sort -u) != "True" ]]
169+
do
170+
echo -n "." && sleep 1;
171+
done
172+
echo ""
160173
}
161174

162175
function wait_for_appwrapper_controller {

hack/kueue/kueue-manifests-v0.6.0.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11063,7 +11063,7 @@ data:
1106311063
#pprofBindAddress: :8082
1106411064
#waitForPodsReady:
1106511065
# enable: true
11066-
manageJobsWithoutQueueName: true
11066+
# manageJobsWithoutQueueName: true
1106711067
#internalCertManagement:
1106811068
# enable: false
1106911069
# webhookServiceName: ""

internal/controller/appwrapper_controller.go

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ import (
3434
"sigs.k8s.io/controller-runtime/pkg/handler"
3535
"sigs.k8s.io/controller-runtime/pkg/log"
3636
"sigs.k8s.io/controller-runtime/pkg/reconcile"
37+
"sigs.k8s.io/kueue/pkg/controller/constants"
38+
"sigs.k8s.io/kueue/pkg/controller/jobframework"
3739

3840
workloadv1beta2 "github.com/project-codeflare/appwrapper/api/v1beta2"
3941
)
@@ -315,16 +317,29 @@ func (r *AppWrapperReconciler) createComponents(ctx context.Context, aw *workloa
315317
if err != nil {
316318
return err, true // fatal
317319
}
320+
321+
ref := &metav1.OwnerReference{APIVersion: GVK.GroupVersion().String(), Kind: GVK.Kind, Name: aw.Name, UID: aw.UID}
322+
myWorkloadName, err := jobframework.GetWorkloadNameForOwnerRef(ref)
323+
if err != nil {
324+
return err, true
325+
}
326+
318327
for _, obj := range objects {
328+
annotations := obj.GetAnnotations()
329+
if annotations == nil {
330+
annotations = make(map[string]string)
331+
}
332+
annotations[constants.ParentWorkloadAnnotation] = myWorkloadName
333+
obj.SetAnnotations(annotations)
334+
if err := controllerutil.SetControllerReference(aw, obj, r.Scheme); err != nil {
335+
return err, true
336+
}
319337
if err := r.Create(ctx, obj); err != nil {
320338
if apierrors.IsAlreadyExists(err) {
321339
continue // ignore existing component
322340
}
323341
return err, meta.IsNoMatchError(err) || apierrors.IsInvalid(err) // fatal
324342
}
325-
if err := controllerutil.SetControllerReference(aw, obj, r.Scheme); err != nil {
326-
return err, true
327-
}
328343
}
329344
return nil, false
330345
}

samples/wrapped-deployment.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
apiVersion: workload.codeflare.dev/v1beta2
22
kind: AppWrapper
33
metadata:
4-
name: sample2
4+
name: sample-deployment
55
annotations:
66
kueue.x-k8s.io/queue-name: user-queue
77
spec:
@@ -14,7 +14,7 @@ spec:
1414
apiVersion: apps/v1
1515
kind: Deployment
1616
metadata:
17-
name: test
17+
name: sample-deployment-deployment
1818
labels:
1919
app: test
2020
spec:

samples/wrapped-job.yaml

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
apiVersion: workload.codeflare.dev/v1beta2
2+
kind: AppWrapper
3+
metadata:
4+
name: sample-job
5+
annotations:
6+
kueue.x-k8s.io/queue-name: user-queue
7+
spec:
8+
suspend: true
9+
components:
10+
- podSets:
11+
- replicas: 1
12+
path: template.spec.template
13+
template:
14+
apiVersion: batch/v1
15+
kind: Job
16+
metadata:
17+
name: sample-job-job
18+
spec:
19+
template:
20+
spec:
21+
restartPolicy: Never
22+
containers:
23+
- name: busybox
24+
image: quay.io/project-codeflare/busybox:1.36
25+
command: ["sh", "-c", "sleep 600"]
26+
resources:
27+
requests:
28+
cpu: 1

samples/wrapped-pod.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
apiVersion: workload.codeflare.dev/v1beta2
22
kind: AppWrapper
33
metadata:
4-
name: sample
4+
name: sample-pod
55
annotations:
66
kueue.x-k8s.io/queue-name: user-queue
77
spec:
@@ -14,7 +14,7 @@ spec:
1414
apiVersion: v1
1515
kind: Pod
1616
metadata:
17-
name: sample
17+
name: sample-pod-pod
1818
spec:
1919
restartPolicy: Never
2020
initContainers:

samples/wrapped-pytorch-job.yaml

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
apiVersion: workload.codeflare.dev/v1beta2
2+
kind: AppWrapper
3+
metadata:
4+
name: sample-pytorch-job
5+
annotations:
6+
kueue.x-k8s.io/queue-name: user-queue
7+
spec:
8+
suspend: true
9+
components:
10+
- podSets:
11+
- replicas: 1
12+
path: template.spec.pytorchReplicaSpecs.Master.template
13+
- replicas: 1
14+
path: template.spec.pytorchReplicaSpecs.Worker.template
15+
template:
16+
apiVersion: "kubeflow.org/v1"
17+
kind: PyTorchJob
18+
metadata:
19+
name: pytorch-simple
20+
spec:
21+
pytorchReplicaSpecs:
22+
Master:
23+
replicas: 1
24+
restartPolicy: OnFailure
25+
template:
26+
spec:
27+
containers:
28+
- name: pytorch
29+
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v1beta1-fc858d1
30+
command:
31+
- "python3"
32+
- "/opt/pytorch-mnist/mnist.py"
33+
- "--epochs=1"
34+
resources:
35+
requests:
36+
cpu: 1
37+
Worker:
38+
replicas: 1
39+
restartPolicy: OnFailure
40+
template:
41+
spec:
42+
containers:
43+
- name: pytorch
44+
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v1beta1-fc858d1
45+
command:
46+
- "python3"
47+
- "/opt/pytorch-mnist/mnist.py"
48+
- "--epochs=1"
49+
resources:
50+
requests:
51+
cpu: 1

test/e2e/appwrapper_test.go

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,12 +53,28 @@ var _ = Describe("AppWrapper E2E Test", func() {
5353
appwrappers = append(appwrappers, aw)
5454
Expect(waitAWPodsReady(ctx, aw)).Should(Succeed())
5555
})
56-
// TODO: Batch v1.Jobs
56+
It("Batch Jobs", func() {
57+
aw := createAppWrapper(ctx, batchjob(250))
58+
appwrappers = append(appwrappers, aw)
59+
Expect(waitAWPodsReady(ctx, aw)).Should(Succeed())
60+
})
61+
5762
It("Mixed Basic Resources", func() {
58-
aw := createAppWrapper(ctx, pod(100), deployment(2, 100), statefulset(2, 100), service())
63+
aw := createAppWrapper(ctx, pod(100), deployment(2, 100), statefulset(2, 100), service(), batchjob(100))
64+
appwrappers = append(appwrappers, aw)
65+
Expect(waitAWPodsReady(ctx, aw)).Should(Succeed())
66+
})
67+
})
68+
69+
Describe("Creation of Kubeflow Training Operator GVKs", func() {
70+
It("PyTorch Jobs", func() {
71+
aw := createAppWrapper(ctx, pytorchjob(1, 100, 2, 250))
5972
appwrappers = append(appwrappers, aw)
6073
Expect(waitAWPodsReady(ctx, aw)).Should(Succeed())
6174
})
75+
76+
// TODO: Additional Kubeflow Training Operator GVKs of interest
77+
6278
})
6379

6480
Describe("Error Handling for Invalid Resources", func() {
@@ -90,6 +106,15 @@ var _ = Describe("AppWrapper E2E Test", func() {
90106

91107
})
92108

109+
Describe("Recognition of Child Jobs", func() {
110+
// TODO: Test scenarios where the AW "just fits" in the quota and
111+
// contains components that Kueue might try to queue
112+
// but should not in this case because they are using the parent workload's quota
113+
// 1. batch v1 jobs
114+
// 2. pytorch jobs (which themselves contain child Jobs)
115+
116+
})
117+
93118
Describe("Detection of Completion Status", func() {
94119

95120
})

0 commit comments

Comments
 (0)