1+ namespace TemporalioSamples . SafeMessageHandlers ;
2+
3+ using Microsoft . Extensions . Logging ;
4+ using Temporalio . Exceptions ;
5+ using Temporalio . Workflows ;
6+
/// <summary>
/// Workflow that tracks a set of cluster nodes and which job (if any) each node is assigned to.
/// The cluster is started and shut down via signals, nodes are allocated to and released from
/// jobs via updates, and the run loop periodically health-checks the assigned nodes.
/// </summary>
/// <remarks>
/// Every handler that awaits an activity while reading or mutating <see cref="State.Nodes"/>
/// does so while holding <c>nodesLock</c>, because an await yields control and would otherwise
/// allow handlers to interleave.
/// </remarks>
[Workflow]
public class ClusterManagerWorkflow
{
    /// <summary>
    /// Mutable workflow state. It is exposed through the <see cref="CurrentState"/> query and
    /// handed to the next run on continue-as-new.
    /// </summary>
    public record State
    {
        /// <summary>Gets or sets a value indicating whether the start signal was received.</summary>
        public bool ClusterStarted { get; set; }

        /// <summary>Gets or sets a value indicating whether the shutdown signal was processed.</summary>
        public bool ClusterShutdown { get; set; }

        /// <summary>
        /// Gets the node map. The key is the node name; the value is <c>null</c> when the node
        /// is unassigned, the job name when it is assigned, or <c>"BAD!"</c> when a health check
        /// flagged the node.
        /// </summary>
        public IDictionary<string, string?> Nodes { get; init; } = new Dictionary<string, string?>();

        /// <summary>Gets or sets the highest number of assigned nodes recorded after an allocation.</summary>
        public int MaxAssignedNodes { get; set; }
    }

    /// <summary>Workflow input.</summary>
    public record Input
    {
        /// <summary>Gets the state to start from (carried over on continue-as-new).</summary>
        public State State { get; init; } = new();

        /// <summary>
        /// Gets a value indicating whether to use a short health-check interval and a low
        /// history-length threshold so continue-as-new can be exercised in tests.
        /// </summary>
        public bool TestContinueAsNew { get; init; }
    }

    /// <summary>Workflow result returned once the cluster has been shut down.</summary>
    /// <param name="MaxAssignedNodes">Highest number of assigned nodes recorded.</param>
    /// <param name="NumAssignedNodes">Number of nodes assigned to a job when the workflow finished.</param>
    public record Result(
        int MaxAssignedNodes,
        int NumAssignedNodes);

    /// <summary>Value stored against a node that a health check reported as bad.</summary>
    private const string BadNodeMarker = "BAD!";

    /// <summary>Number of nodes created when the cluster is started.</summary>
    private const int InitialNodeCount = 25;

    // Single-permit semaphore used as a lock around every read-await-write sequence on the node map.
    private readonly Semaphore nodesLock = new(1);
    private readonly int maxHistoryLength;
    private readonly TimeSpan sleepInterval;

    /// <summary>
    /// Initializes a new instance of the <see cref="ClusterManagerWorkflow"/> class.
    /// </summary>
    /// <param name="input">Workflow input providing the initial state and test settings.</param>
    [WorkflowInit]
    public ClusterManagerWorkflow(Input input)
    {
        CurrentState = input.State;
        maxHistoryLength = input.TestContinueAsNew ? 120 : int.MaxValue;
        sleepInterval = TimeSpan.FromSeconds(input.TestContinueAsNew ? 1 : 600);
    }

    /// <summary>Gets the current cluster state (query).</summary>
    [WorkflowQuery]
    public State CurrentState { get; init; }

    /// <summary>
    /// Waits for the cluster to be started, then alternates health checks with a wait of up to
    /// the configured sleep interval until the cluster is shut down or the workflow continues
    /// as new.
    /// </summary>
    /// <param name="input">Workflow input; its test flag is forwarded on continue-as-new.</param>
    /// <returns>Assignment statistics at the time the cluster was shut down.</returns>
    [WorkflowRun]
    public async Task<Result> RunAsync(Input input)
    {
        await Workflow.WaitConditionAsync(() => CurrentState.ClusterStarted);

        // Perform health checks at intervals
        do
        {
            await PerformHealthChecksAsync();

            // Wake up early on shutdown or when continue-as-new becomes possible; otherwise
            // wait out the interval before the next health check.
            await Workflow.WaitConditionAsync(
                () => CurrentState.ClusterShutdown || ShouldContinueAsNew,
                sleepInterval);

            // Continue as new if needed, carrying the current state into the next run
            if (ShouldContinueAsNew)
            {
                Workflow.Logger.LogInformation("Continuing as new");
                throw Workflow.CreateContinueAsNewException((ClusterManagerWorkflow wf) => wf.RunAsync(new()
                {
                    State = CurrentState,
                    TestContinueAsNew = input.TestContinueAsNew,
                }));
            }
        }
        while (!CurrentState.ClusterShutdown);
        return new(CurrentState.MaxAssignedNodes, NumAssignedNodes);
    }

    /// <summary>
    /// Signal that marks the cluster as started and registers the initial set of unassigned
    /// nodes, named "0" through "24".
    /// </summary>
    /// <returns>A completed task; the handler does no asynchronous work.</returns>
    [WorkflowSignal]
    public Task StartClusterAsync()
    {
        CurrentState.ClusterStarted = true;
        for (var node = 0; node < InitialNodeCount; node++)
        {
            CurrentState.Nodes[$"{node}"] = null;
        }
        Workflow.Logger.LogInformation("Cluster started");
        return Task.CompletedTask;
    }

    /// <summary>
    /// Signal that marks the cluster as shut down. If the cluster has not been started yet the
    /// handler waits until it has.
    /// </summary>
    /// <returns>A task that completes once the shutdown flag is set.</returns>
    [WorkflowSignal]
    public async Task ShutdownClusterAsync()
    {
        await Workflow.WaitConditionAsync(() => CurrentState.ClusterStarted);
        CurrentState.ClusterShutdown = true;
        Workflow.Logger.LogInformation("Cluster shut down");
    }

    /// <summary>Input for <see cref="AllocateNodesToJobAsync"/>.</summary>
    /// <param name="NumNodes">Number of nodes to allocate.</param>
    /// <param name="JobName">Name of the job the nodes are allocated to.</param>
    public record AllocateNodesToJobInput(int NumNodes, string JobName);

    /// <summary>
    /// Update that assigns the requested number of currently unassigned nodes to a job.
    /// </summary>
    /// <param name="input">Number of nodes wanted and the job to assign them to.</param>
    /// <returns>The names of the nodes that were assigned.</returns>
    /// <exception cref="ApplicationFailureException">
    /// The cluster is already shut down, or fewer unassigned nodes are available than requested.
    /// </exception>
    [WorkflowUpdate]
    public async Task<List<string>> AllocateNodesToJobAsync(AllocateNodesToJobInput input)
    {
        await Workflow.WaitConditionAsync(() => CurrentState.ClusterStarted);
        if (CurrentState.ClusterShutdown)
        {
            throw new ApplicationFailureException(
                "Cannot allocate nodes to a job, cluster is already shut down");
        }
        await nodesLock.WaitAsync();
        try
        {
            var unassignedNodes = CurrentState.Nodes.
                Where(kvp => kvp.Value is null).
                Select(kvp => kvp.Key).
                ToList();
            if (unassignedNodes.Count < input.NumNodes)
            {
                throw new ApplicationFailureException(
                    $"Cannot allocate {input.NumNodes} nodes, have only {unassignedNodes.Count} available");
            }
            var assignedNodes = unassignedNodes[..input.NumNodes];

            // This await would be dangerous without nodesLock because it yields control and allows
            // interleaving
            await Workflow.ExecuteActivityAsync(
                (ClusterManagerActivities acts) => acts.AllocateNodesToJobAsync(new(assignedNodes, input.JobName)),
                new() { StartToCloseTimeout = TimeSpan.FromSeconds(10) });

            // Only record the assignment once the activity has succeeded
            foreach (var node in assignedNodes)
            {
                CurrentState.Nodes[node] = input.JobName;
            }
            CurrentState.MaxAssignedNodes = int.Max(CurrentState.MaxAssignedNodes, NumAssignedNodes);
            return assignedNodes;
        }
        finally
        {
            nodesLock.Release();
        }
    }

    /// <summary>Input for <see cref="DeleteJobAsync"/>.</summary>
    /// <param name="JobName">Name of the job whose nodes should be released.</param>
    public record DeleteJobInput(string JobName);

    /// <summary>
    /// Update that releases every node currently assigned to the given job.
    /// </summary>
    /// <param name="input">The job whose nodes should be released.</param>
    /// <returns>A task that completes once the nodes are marked unassigned.</returns>
    /// <exception cref="ApplicationFailureException">The cluster is already shut down.</exception>
    [WorkflowUpdate]
    public async Task DeleteJobAsync(DeleteJobInput input)
    {
        await Workflow.WaitConditionAsync(() => CurrentState.ClusterStarted);
        if (CurrentState.ClusterShutdown)
        {
            throw new ApplicationFailureException(
                "Cannot delete job, cluster is already shut down");
        }
        await nodesLock.WaitAsync();
        try
        {
            var toUnassign = CurrentState.Nodes.
                Where(kvp => kvp.Value == input.JobName).
                Select(kvp => kvp.Key).
                ToList();

            // This await would be dangerous without nodesLock because it yields control and allows
            // interleaving
            await Workflow.ExecuteActivityAsync(
                (ClusterManagerActivities acts) => acts.DeallocateNodesFromJobAsync(new(toUnassign, input.JobName)),
                new() { StartToCloseTimeout = TimeSpan.FromSeconds(10) });

            // Only record the release once the activity has succeeded
            foreach (var node in toUnassign)
            {
                CurrentState.Nodes[node] = null;
            }
        }
        finally
        {
            nodesLock.Release();
        }
    }

    /// <summary>Gets the number of nodes that are assigned to a job and not flagged as bad.</summary>
    private int NumAssignedNodes =>
        CurrentState.Nodes.Count(kvp => IsAssignedToJob(kvp.Value));

    /// <summary>
    /// Gets a value indicating whether the workflow should continue as new right now.
    /// </summary>
    private bool ShouldContinueAsNew =>
        // Don't continue as new while update running
        nodesLock.CurrentCount > 0 &&
        // Continue if suggested or, for ease of testing, max history reached
        (Workflow.ContinueAsNewSuggested || Workflow.CurrentHistoryLength > maxHistoryLength);

    /// <summary>
    /// Tells whether a node-map value denotes a node that is assigned to a job, i.e. it is
    /// neither unassigned (<c>null</c>) nor flagged as bad.
    /// </summary>
    /// <param name="nodeValue">The value stored for a node in <see cref="State.Nodes"/>.</param>
    /// <returns><c>true</c> if the value is a job name.</returns>
    private static bool IsAssignedToJob(string? nodeValue) =>
        nodeValue is not null and not BadNodeMarker;

    /// <summary>
    /// Runs the health-check activity over the nodes currently assigned to jobs and flags every
    /// node it reports as bad. A failed health check is logged and otherwise ignored.
    /// </summary>
    /// <returns>A task that completes when the health check has been processed.</returns>
    private async Task PerformHealthChecksAsync()
    {
        await nodesLock.WaitAsync();
        try
        {
            // Find bad nodes from the set of non-bad ones. The activity must receive node names
            // (the dictionary keys), not the job names stored as values, because the names it
            // returns are used as keys into the node map below.
            var assignedNodes = CurrentState.Nodes.
                Where(kvp => IsAssignedToJob(kvp.Value)).
                Select(kvp => kvp.Key).
                ToList();

            // This await would be dangerous without nodesLock because it yields control and
            // allows interleaving.
            var badNodes = await Workflow.ExecuteActivityAsync(
                (ClusterManagerActivities acts) => acts.FindBadNodesAsync(new(assignedNodes)),
                new()
                {
                    StartToCloseTimeout = TimeSpan.FromSeconds(10),
                    // This health check is optional, and our lock would block the whole workflow if
                    // we let it retry forever
                    RetryPolicy = new() { MaximumAttempts = 1 },
                });
            foreach (var node in badNodes)
            {
                CurrentState.Nodes[node] = BadNodeMarker;
            }
        }
        catch (ActivityFailureException e) when (!TemporalException.IsCanceledException(e))
        {
            // The health check is best-effort: a failed attempt must not fail the workflow.
            // Cancellation is deliberately not swallowed here.
            Workflow.Logger.LogWarning(e, "Health check failed");
        }
        finally
        {
            nodesLock.Release();
        }
    }
}
0 commit comments