Skip to content

Commit 2c6a14f

Browse files
committed
Example with Jobs and with Workloads
1 parent a291ccd commit 2c6a14f

File tree

8 files changed

+304
-0
lines changed

8 files changed

+304
-0
lines changed
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
---
# Catch-all ResourceFlavor: the empty spec attaches no node labels,
# taints or tolerations. ClusterQueues refer to it by name.
apiVersion: kueue.x-k8s.io/v1beta1
kind: ResourceFlavor
metadata:
  name: default-flavor
spec: {}
7+
---
# ResourceFlavor "gpu": associates quota with nodes carrying the label
# `instance-type: spot`.
# In kueue.x-k8s.io/v1beta1 the node selector is `spec.nodeLabels`; there is
# no `labels` field on the spec, so only `nodeLabels` is set here.
apiVersion: kueue.x-k8s.io/v1beta1
kind: ResourceFlavor
metadata:
  name: gpu
spec:
  nodeLabels:
    instance-type: spot
17+
18+
---
# Highest of the three Kueue workload priorities defined in this example.
# WorkloadPriorityClass carries only `value` and `description`; the pod-level
# PriorityClass settings `preemptionPolicy` and `globalDefault` do not exist
# on this kind. Preemption is configured per ClusterQueue (`spec.preemption`).
apiVersion: kueue.x-k8s.io/v1beta1
kind: WorkloadPriorityClass
metadata:
  name: human-critical
value: 1000000  # 1M; higher value means higher priority
description: >-
  Use for human-critical workloads like research on disease
  or natural disaster avoidance
28+
---
# Middle of the three Kueue workload priorities defined in this example.
# WorkloadPriorityClass carries only `value` and `description`; the pod-level
# PriorityClass settings `preemptionPolicy` and `globalDefault` do not exist
# on this kind. Preemption is configured per ClusterQueue (`spec.preemption`).
apiVersion: kueue.x-k8s.io/v1beta1
kind: WorkloadPriorityClass
metadata:
  name: business-impacting
value: 1000  # 1K; higher value means higher priority
description: "Use for business-critical workloads"
38+
39+
---
# Lowest of the three Kueue workload priorities defined in this example.
# WorkloadPriorityClass carries only `value` and `description`; the pod-level
# PriorityClass settings `preemptionPolicy` and `globalDefault` do not exist
# on this kind. Preemption is configured per ClusterQueue (`spec.preemption`).
apiVersion: kueue.x-k8s.io/v1beta1
kind: WorkloadPriorityClass
metadata:
  name: long-term-research
value: 1  # higher value means higher priority
description: "Use for long term research processes like extraterrestrial research"
49+
50+
51+
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
---
# Cluster queue for the emergency training jobs
# (Climate Change, Alzheimer, Cancer).
# The description is kept as a comment because the v1beta1 ClusterQueue spec
# has no `description` field.
apiVersion: kueue.x-k8s.io/v1beta1
kind: ClusterQueue
metadata:
  name: emergency-cluster-queue
spec:
  cohort: ai-for-humanity
  namespaceSelector: {}  # match all.
  flavorFungibility:
    whenCanBorrow: Borrow
    whenCanPreempt: Preempt
  preemption:
    # Reclaim quota lent to the cohort from any workload, whatever its priority.
    reclaimWithinCohort: Any
    # While borrowing, only lower-priority workloads may be preempted.
    borrowWithinCohort:
      policy: LowerPriority
    withinClusterQueue: LowerPriority
  resourceGroups:
    - coveredResources: ["cpu", "memory"]
      flavors:
        - name: default-flavor
          resources:
            - name: cpu
              nominalQuota: 1
            - name: memory
              nominalQuota: 2000Mi
              borrowingLimit: 500Mi
28+
---
# Cluster queue for LLM model workloads.
# The description is kept as a comment because the v1beta1 ClusterQueue spec
# has no `description` field. `namespaceSelector` is declared exactly once;
# duplicate mapping keys are invalid YAML and most parsers silently keep
# only the last one.
apiVersion: kueue.x-k8s.io/v1beta1
kind: ClusterQueue
metadata:
  name: llm-cluster-queue
spec:
  cohort: ai-for-humanity
  namespaceSelector: {}  # match all.
  flavorFungibility:
    whenCanBorrow: Borrow
    whenCanPreempt: TryNextFlavor
  preemption:
    # Only preempt Workloads in the cohort that have lower priority than the
    # pending Workload.
    reclaimWithinCohort: LowerPriority
  resourceGroups:
    - coveredResources:
        - cpu
        - memory
      flavors:
        - name: default-flavor
          resources:
            - name: cpu
              nominalQuota: 500m
            - name: memory
              nominalQuota: 500Mi
              borrowingLimit: 500Mi
56+
---
# Cluster queue for GAI model workloads.
# The description is kept as a comment because the v1beta1 ClusterQueue spec
# has no `description` field.
# NOTE(review): the covered resource is literally named "gpu" with a quota of
# 48Gi. Accelerators are normally exposed as an extended resource such as
# `nvidia.com/gpu` and counted in whole devices - confirm the intended name
# and unit.
apiVersion: kueue.x-k8s.io/v1beta1
kind: ClusterQueue
metadata:
  name: gai-cluster-queue
spec:
  cohort: ai-against-humanity
  namespaceSelector: {}
  preemption:
    reclaimWithinCohort: Never  # do not preempt Workloads in the cohort.
  flavorFungibility:
    whenCanBorrow: Borrow  # this is the default, made explicit here
    whenCanPreempt: Preempt  # ensures that accelerators aren't hit with compute workloads
  resourceGroups:
    - coveredResources:
        - gpu
      flavors:
        - name: gpu
          resources:
            - name: gpu
              nominalQuota: 48Gi
79+
80+
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
---
# Queue for the emergency training jobs (Climate Change, Alzheimer, Cancer).
# The description is kept as a comment because the v1beta1 LocalQueue spec
# has no `description` field.
apiVersion: kueue.x-k8s.io/v1beta1
kind: LocalQueue
metadata:
  name: emergency-queue
spec:
  clusterQueue: emergency-cluster-queue
9+
---
# Queue for the LLM model's training jobs.
# The description is kept as a comment because the v1beta1 LocalQueue spec
# has no `description` field.
apiVersion: kueue.x-k8s.io/v1beta1
kind: LocalQueue
metadata:
  name: llm-queue
spec:
  clusterQueue: llm-cluster-queue
18+
19+
---
# Queue for the GAI (General Artificial Intelligence) model's training jobs.
# The description is kept as a comment because the v1beta1 LocalQueue spec
# has no `description` field.
apiVersion: kueue.x-k8s.io/v1beta1
kind: LocalQueue
metadata:
  name: gai-queue
spec:
  clusterQueue: gai-cluster-queue
28+
29+
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
---
# Batch Job submitted through the `gai-queue` LocalQueue with the
# `long-term-research` workload priority class. It is created suspended and
# runs three pods in parallel, each starting the sleep image with "300s".
# NOTE(review): this Job requests cpu and memory, while gai-cluster-queue
# only lists "gpu" under coveredResources - confirm it can be admitted.
apiVersion: batch/v1
kind: Job
metadata:
  generateName: gai-  # server-generated name, so submit with `create`
  labels:
    kueue.x-k8s.io/queue-name: gai-queue
    kueue.x-k8s.io/priority-class: long-term-research
spec:
  suspend: true
  parallelism: 3
  completions: 3
  template:
    spec:
      restartPolicy: Never
      containers:
        - name: gai-training-brain
          image: gcr.io/k8s-staging-perf-tests/sleep:v0.1.0
          args:
            - "300s"
          resources:
            requests:
              cpu: 1
              memory: 1Gi
24+
25+
26+
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
---
# Batch Job submitted through the `llm-queue` LocalQueue with the
# `business-impacting` workload priority class. It is created suspended and
# runs three pods in parallel, each starting the sleep image with "300s".
apiVersion: batch/v1
kind: Job
metadata:
  generateName: llm-  # server-generated name, so submit with `create`
  labels:
    kueue.x-k8s.io/queue-name: llm-queue
    kueue.x-k8s.io/priority-class: business-impacting
spec:
  suspend: true
  parallelism: 3
  completions: 3
  template:
    spec:
      restartPolicy: Never
      containers:
        - name: dummy-job
          image: gcr.io/k8s-staging-perf-tests/sleep:v0.1.0
          args:
            - "300s"
          resources:
            requests:
              cpu: 50m
              memory: 50Mi
24+
25+
26+
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
---
# Batch Job submitted through the `emergency-queue` LocalQueue with the
# `human-critical` workload priority class. It is created suspended and
# runs three pods in parallel, each starting the sleep image with "60s".
apiVersion: batch/v1
kind: Job
metadata:
  generateName: cure-cancer-  # server-generated name, so submit with `create`
  labels:
    kueue.x-k8s.io/queue-name: emergency-queue
    kueue.x-k8s.io/priority-class: human-critical
spec:
  suspend: true
  parallelism: 3
  completions: 3
  template:
    spec:
      restartPolicy: Never
      containers:
        - name: cancer-treatment-model-update
          image: gcr.io/k8s-staging-perf-tests/sleep:v0.1.0
          args:
            - "60s"
          resources:
            requests:
              cpu: 50m
              memory: 50Mi
24+
25+
26+
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
---
# Highest of the three Kueue workload priorities defined in this file.
# WorkloadPriorityClass carries only `value` and `description`; the pod-level
# PriorityClass settings `preemptionPolicy` and `globalDefault` do not exist
# on this kind. Preemption is configured per ClusterQueue (`spec.preemption`).
apiVersion: kueue.x-k8s.io/v1beta1
kind: WorkloadPriorityClass
metadata:
  name: human-critical
value: 1000000  # 1M; higher value means higher priority
description: >-
  Use for human-critical workloads like research on disease
  or natural disaster avoidance
10+
---
# Middle of the three Kueue workload priorities defined in this file.
# WorkloadPriorityClass carries only `value` and `description`; the pod-level
# PriorityClass settings `preemptionPolicy` and `globalDefault` do not exist
# on this kind. Preemption is configured per ClusterQueue (`spec.preemption`).
apiVersion: kueue.x-k8s.io/v1beta1
kind: WorkloadPriorityClass
metadata:
  name: business-impacting
value: 1000  # 1K; higher value means higher priority
description: "Use for business-critical workloads"
20+
21+
---
# Lowest of the three Kueue workload priorities defined in this file.
# WorkloadPriorityClass carries only `value` and `description`; the pod-level
# PriorityClass settings `preemptionPolicy` and `globalDefault` do not exist
# on this kind. Preemption is configured per ClusterQueue (`spec.preemption`).
apiVersion: kueue.x-k8s.io/v1beta1
kind: WorkloadPriorityClass
metadata:
  name: long-term-research
value: 1  # higher value means higher priority
description: "Use for long term research processes like extraterrestrial research"
31+
32+
33+
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# Kueue Jobs Example
2+
3+
Here is a set of resources that can be used to test `kueue` features like preemption.
4+
5+
## Install
6+
Install the base resources
7+
```
8+
for i in 00-common.yaml 01-cluster-queues.yaml 02-local-queues.yaml; do
9+
oc create -f $i
10+
done
11+
```
12+
13+
## Run
14+
Run the workloads from jobs
15+
16+
```
17+
for i in $(seq 1 7); do oc create -f 04-llm-job.yaml; done
18+
for i in $(seq 5); do oc create -f 05-cancer-cure-research.yaml; done
19+
```
20+
21+
## Observe
22+
23+
Observe that `workloads` get created:
24+
```
25+
oc get wl
26+
```
27+
Observe that `workloads` get preempted using `oc describe wl <some not admitted workload>`
28+
29+
30+
31+
32+
33+

0 commit comments

Comments
 (0)