Skip to content

Commit 448655e

Browse files
authored
wip: add support for workflow events (#27)
* wip: add support for workflow events This will add support for ending the workflow early based on a count of successes, a count of failures, or a job duration metric. Next we need to add the ability to grow or shrink (we need to think about how to do that, since we want a cloud-agnostic solution) and then decide how to handle application-specific metrics. Signed-off-by: vsoch <[email protected]> * feat: add support for minicluster If we really want to test scaling (shrink and grow) of a job and have it work with the cluster autoscaler, plus collect metrics from an HPC app, we can most easily do that with the flux operator. This feature adds support for specifying a minicluster property to convert the previous indexed job into a MiniCluster. The flux operator needs to be installed. Signed-off-by: vsoch <[email protected]> * feat: shrink with flux minicluster example working. Signed-off-by: vsoch <[email protected]> * save state Signed-off-by: vsoch <[email protected]> * feat: support for custom metrics In this example, the user is allowed to provide a custom script that will be run against the log, and it needs to return a dictionary of values (the custom metrics). These are passed back to the manager from the state machine step and can influence workflow behavior (e.g., stop early, grow, or shrink). Signed-off-by: vsoch <[email protected]> --------- Signed-off-by: vsoch <[email protected]> Co-authored-by: vsoch <[email protected]>
1 parent 60c82ed commit 448655e

File tree

30 files changed

+1389
-103
lines changed

30 files changed

+1389
-103
lines changed

api/v1alpha1/statemachine_types.go

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,27 @@ type Workflow struct {
7070
// Prefix for jobs (e.g., structure_ for mummi)
7171
Prefix string `json:"prefix,omitempty"`
7272

73-
// TODO add a failure condition?
73+
// Custom events -> actions to take
74+
Events []WorkflowEvent `json:"events,omitempty"`
75+
}
76+
77+
type WorkflowEvent struct {
78+
79+
// Name of metric, indexed into model lookup (e.g., count.job_b.failed)
80+
Metric string `json:"metric"`
81+
82+
// Conditional to check. If not set, checks if nonzero or nonempty
83+
When string `json:"when,omitempty"`
84+
85+
// Action to take (e.g., finish-workflow)
86+
Action string `json:"action"`
87+
88+
// Backoff and repetitions to respond to event
89+
Backoff int32 `json:"backoff,omitempty"`
90+
Repetitions int32 `json:"repetitions,omitempty"`
91+
MinCompletions int32 `json:"minCompletions,omitempty"`
92+
MaxSize int32 `json:"maxSize,omitempty"`
93+
MinSize int32 `json:"minSize,omitempty"`
7494
}
7595

7696
// A JobSequence is a list of JobSteps
@@ -79,6 +99,12 @@ type Workflow struct {
7999
// to the job (not knowing the structure in advance)
80100
type JobSequence []JobStep
81101

102+
type JobEvents struct {
103+
104+
// Custom parsing script
105+
Script string `json:"script,omitempty"`
106+
}
107+
82108
type JobStep struct {
83109

84110
// Name is the name of the job (required)
@@ -92,6 +118,10 @@ type JobStep struct {
92118
// +optional
93119
Registry RegistryConfig `json:"registry,omitempty"`
94120

121+
// Event for a job
122+
// +optional
123+
Events JobEvents `json:"events,omitempty"`
124+
95125
// Architecture (arm64 or amd64)
96126
// +kubebuilder:default="amd64"
97127
// +default="amd64"
@@ -215,6 +245,10 @@ type Manager struct {
215245
// +omitempty
216246
NodeSelector string `json:"nodeSelector,omitempty"`
217247

248+
// Run in more verbose mode
249+
// +optional
250+
Verbose bool `json:"verbose"`
251+
218252
// Image pull policy (e.g., Always, Never, etc.)
219253
// +kubebuilder:default="IfNotPresent"
220254
// +default="IfNotPresent"

config/crd/bases/state-machine.converged-computing.org_statemachines.yaml

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,13 @@ spec:
105105
description: Walltime (in string format) for the job
106106
type: string
107107
type: object
108+
events:
109+
description: Event for a job
110+
properties:
111+
script:
112+
description: Custom parsing script
113+
type: string
114+
type: object
108115
image:
109116
description: |-
110117
Namespace is inherited from StateMachine Spec
@@ -185,6 +192,9 @@ spec:
185192
default: r
186193
description: Subdomain to use
187194
type: string
195+
verbose:
196+
description: Run in more verbose mode
197+
type: boolean
188198
type: object
189199
registry:
190200
description: |-
@@ -239,6 +249,42 @@ spec:
239249
description: Number of state machine sequences required for completion
240250
format: int32
241251
type: integer
252+
events:
253+
description: Custom events -> actions to take
254+
items:
255+
properties:
256+
action:
257+
description: Action to take (e.g., finish-workflow)
258+
type: string
259+
backoff:
260+
description: Backoff and repetitions to respond to event
261+
format: int32
262+
type: integer
263+
maxSize:
264+
format: int32
265+
type: integer
266+
metric:
267+
description: Name of metric, indexed into model lookup (e.g.,
268+
count.job_b.failed)
269+
type: string
270+
minCompletions:
271+
format: int32
272+
type: integer
273+
minSize:
274+
format: int32
275+
type: integer
276+
repetitions:
277+
format: int32
278+
type: integer
279+
when:
280+
description: Conditional to check. If not set, checks if
281+
nonzero or nonempty
282+
type: string
283+
required:
284+
- action
285+
- metric
286+
type: object
287+
type: array
242288
prefix:
243289
description: Prefix for jobs (e.g., structure_ for mummi)
244290
type: string

config/rbac/role.yaml

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,10 @@ rules:
1010
- events
1111
verbs:
1212
- create
13+
- delete
14+
- get
15+
- list
16+
- patch
1317
- update
1418
- watch
1519
- apiGroups:
@@ -25,6 +29,42 @@ rules:
2529
- patch
2630
- update
2731
- watch
32+
- apiGroups:
33+
- ""
34+
resources:
35+
- miniclusters
36+
verbs:
37+
- create
38+
- delete
39+
- get
40+
- list
41+
- patch
42+
- update
43+
- watch
44+
- apiGroups:
45+
- ""
46+
resources:
47+
- miniclusters/finalizers
48+
verbs:
49+
- create
50+
- delete
51+
- get
52+
- list
53+
- patch
54+
- update
55+
- watch
56+
- apiGroups:
57+
- ""
58+
resources:
59+
- miniclusters/status
60+
verbs:
61+
- create
62+
- delete
63+
- get
64+
- list
65+
- patch
66+
- update
67+
- watch
2868
- apiGroups:
2969
- apps
3070
resources:
@@ -292,6 +332,54 @@ rules:
292332
- patch
293333
- update
294334
- watch
335+
- apiGroups:
336+
- events.k8s.io
337+
resources:
338+
- events
339+
verbs:
340+
- create
341+
- delete
342+
- get
343+
- list
344+
- patch
345+
- update
346+
- watch
347+
- apiGroups:
348+
- flux-framework.org
349+
resources:
350+
- miniclusters
351+
verbs:
352+
- create
353+
- delete
354+
- get
355+
- list
356+
- patch
357+
- update
358+
- watch
359+
- apiGroups:
360+
- flux-framework.org
361+
resources:
362+
- miniclusters/finalizers
363+
verbs:
364+
- create
365+
- delete
366+
- get
367+
- list
368+
- patch
369+
- update
370+
- watch
371+
- apiGroups:
372+
- flux-framework.org
373+
resources:
374+
- miniclusters/status
375+
verbs:
376+
- create
377+
- delete
378+
- get
379+
- list
380+
- patch
381+
- update
382+
- watch
295383
- apiGroups:
296384
- networking.k8s.io
297385
resources:

0 commit comments

Comments
 (0)