@@ -77,31 +77,24 @@ public async Task StartAsync(CancellationToken appStopToken)
7777 var isMaster = await this . clusterNodes . SelfElectToMasterNodeAsync ( ) ;
7878 if ( isMaster )
7979 {
80- // Reload all simulations to have fresh status and discover new simulations
81- IList < Simulation > simulations = ( await this . simulations . GetListAsync ( ) ) ;
82-
83- IList < Simulation > activeSimulations = simulations
84- . Where ( x => x . IsActiveNow ) . ToList ( ) ;
85- this . log . Debug ( "Active simulations loaded" , ( ) => new { activeSimulations . Count } ) ;
86-
87- IList < Simulation > deletionRequiredSimulations = simulations
88- . Where ( x => x . DeviceDeletionRequired ) . ToList ( ) ;
89- this . log . Debug ( "InActive simulations loaded" , ( ) => new { deletionRequiredSimulations . Count } ) ;
90-
9180 await this . clusterNodes . RemoveStaleNodesAsync ( ) ;
9281
93- // Scale nodes in Vmss
94- await this . ScaleVmssNodes ( activeSimulations ) ;
82+ var ( success , activeSimulations , deletionRequiredSimulations ) = await this . GetSimulations ( ) ;
83+ if ( success )
84+ {
85+ // Scale nodes in Vmss
86+ await this . ScaleVmssNodes ( activeSimulations ) ;
9587
96- // Create IoTHub devices for all the active simulations
97- await this . CreateDevicesAsync ( activeSimulations ) ;
88+ // Create IoTHub devices for all the active simulations
89+ await this . CreateDevicesAsync ( activeSimulations ) ;
9890
99- // Delete IoTHub devices for inactive simulations
100- await this . DeleteDevicesAsync ( deletionRequiredSimulations ) ;
91+ // Delete IoTHub devices for inactive simulations
92+ await this . DeleteDevicesAsync ( deletionRequiredSimulations ) ;
10193
102- // Create and delete partitions
103- await this . CreatePartitionsAsync ( activeSimulations ) ;
104- await this . DeletePartitionsAsync ( activeSimulations ) ;
94+ // Create and delete partitions
95+ await this . CreatePartitionsAsync ( activeSimulations ) ;
96+ await this . DeletePartitionsAsync ( activeSimulations ) ;
97+ }
10598 }
10699
107100 // Sleep some seconds before checking for new simulations (by default 15 seconds)
@@ -113,69 +106,116 @@ public void Stop()
113106 {
114107 this . running = false ;
115108 }
116-
117- private async Task ScaleVmssNodes ( IList < Simulation > activeSimulations )
118- {
119- // Default node count is 1
120- var nodeCount = DEFAULT_NODE_COUNT ;
121- var maxDevicesPerNode = this . clusteringConfig . MaxDevicesPerNode ;
122109
123- if ( activeSimulations . Count > 0 )
110+ private async
111+ Task < ( bool success , IList < Simulation > activeSimulations , IList < Simulation > deletionRequiredSimulations ) >
112+ GetSimulations ( )
113+ {
114+ try
124115 {
125- var models = new List < Simulation . DeviceModelRef > ( ) ;
126- var customDevices = 0 ;
116+ // Reload all simulations to have fresh status and discover new simulations
117+ IList < Simulation > list = ( await this . simulations . GetListAsync ( ) ) ;
127118
128- foreach ( var simulation in activeSimulations )
129- {
130- // Loop through all the device models used in the simulation
131- models = ( from model in simulation . DeviceModels where model . Count > 0 select model ) . ToList ( ) ;
119+ IList < Simulation > activeSimulations = list
120+ . Where ( x => x . IsActiveNow ) . ToList ( ) ;
121+ this . log . Debug ( "Active simulations loaded" , ( ) => new { activeSimulations . Count } ) ;
132122
133- // Count total custom devices
134- customDevices += simulation . CustomDevices . Count ;
135- }
136-
137- // Calculate the total number of devices
138- var totalDevices = models . Sum ( model => model . Count ) + customDevices ;
123+ IList < Simulation > deletionRequiredSimulations = list
124+ . Where ( x => x . DeviceDeletionRequired ) . ToList ( ) ;
125+ this . log . Debug ( "Inactive simulations loaded" , ( ) => new { deletionRequiredSimulations . Count } ) ;
139126
140- // Calculate number of nodes required
141- nodeCount = maxDevicesPerNode > 0 ? ( int ) Math . Ceiling ( ( double ) totalDevices / maxDevicesPerNode ) : DEFAULT_NODE_COUNT ;
127+ return ( true , activeSimulations , deletionRequiredSimulations ) ;
142128 }
143-
144- if ( this . currentNodeCount != nodeCount )
129+ catch ( Exception e )
145130 {
146- // Send a request to update vmss auto scale settings to create vm instances
147- // TODO: when devices are added or removed, the number of VMs might need an update
148- await this . azureManagementAdapter . CreateOrUpdateVmssAutoscaleSettingsAsync ( nodeCount ) ;
149-
150- this . currentNodeCount = nodeCount ;
131+ this . log . Error ( "An unexpected error occurred in the master node while loading the list of simulations" , e ) ;
132+ return ( false , null , null ) ;
151133 }
152134 }
153135
154- private async Task DeleteDevicesAsync ( IList < Simulation > deletionRequiredSimulations )
136+ private async Task ScaleVmssNodes ( IList < Simulation > activeSimulations )
155137 {
156- if ( deletionRequiredSimulations . Count == 0 ) return ;
138+ try
139+ {
140+ // Default node count is 1
141+ var nodeCount = DEFAULT_NODE_COUNT ;
142+ var maxDevicesPerNode = this . clusteringConfig . MaxDevicesPerNode ;
143+
144+ if ( activeSimulations . Count > 0 )
145+ {
146+ var models = new List < Simulation . DeviceModelRef > ( ) ;
147+ var customDevices = 0 ;
148+
149+ foreach ( var simulation in activeSimulations )
150+ {
151+ // Loop through all the device models used in the simulation
152+ models = ( from model in simulation . DeviceModels where model . Count > 0 select model ) . ToList ( ) ;
153+
154+ // Count total custom devices
155+ customDevices += simulation . CustomDevices . Count ;
156+ }
157157
158- foreach ( var simulation in deletionRequiredSimulations )
158+ // Calculate the total number of devices
159+ var totalDevices = models . Sum ( model => model . Count ) + customDevices ;
160+
161+ // Calculate number of nodes required
162+ nodeCount = maxDevicesPerNode > 0 ? ( int ) Math . Ceiling ( ( double ) totalDevices / maxDevicesPerNode ) : DEFAULT_NODE_COUNT ;
163+ }
164+
165+ if ( this . currentNodeCount != nodeCount )
166+ {
167+ // Send a request to update vmss auto scale settings to create vm instances
168+ // TODO: when devices are added or removed, the number of VMs might need an update
169+ await this . azureManagementAdapter . CreateOrUpdateVmssAutoscaleSettingsAsync ( nodeCount ) ;
170+
171+ this . currentNodeCount = nodeCount ;
172+ }
173+ }
174+ catch ( Exception e )
159175 {
160- await this . DeleteIoTHubDevicesAsync ( simulation ) ;
176+ this . log . Error ( "Unexpected error while scaling the deployment" , e ) ;
161177 }
162178 }
163179
164180 private async Task CreateDevicesAsync ( IList < Simulation > activeSimulations )
165181 {
166- if ( activeSimulations . Count == 0 ) return ;
182+ try
183+ {
184+ if ( activeSimulations . Count == 0 ) return ;
167185
168- var simulationsWithDevicesToCreate = activeSimulations . Where ( x => x . DeviceCreationRequired ) . ToList ( ) ;
186+ var simulationsWithDevicesToCreate = activeSimulations . Where ( x => x . DeviceCreationRequired ) . ToList ( ) ;
169187
170- if ( simulationsWithDevicesToCreate . Count == 0 )
188+ if ( simulationsWithDevicesToCreate . Count == 0 )
189+ {
190+ this . log . Debug ( "No simulations require device creation" ) ;
191+ return ;
192+ }
193+
194+ foreach ( var simulation in simulationsWithDevicesToCreate )
195+ {
196+ await this . CreateIoTHubDevicesAsync ( simulation ) ;
197+ }
198+ }
199+ catch ( Exception e )
171200 {
172- this . log . Debug ( "No simulations require device creation" ) ;
173- return ;
201+ this . log . Error ( "Unexpected error while creating devices" , e ) ;
174202 }
203+ }
175204
176- foreach ( var simulation in simulationsWithDevicesToCreate )
205+ private async Task DeleteDevicesAsync ( IList < Simulation > deletionRequiredSimulations )
206+ {
207+ try
177208 {
178- await this . CreateIoTHubDevicesAsync ( simulation ) ;
209+ if ( deletionRequiredSimulations . Count == 0 ) return ;
210+
211+ foreach ( var simulation in deletionRequiredSimulations )
212+ {
213+ await this . DeleteIoTHubDevicesAsync ( simulation ) ;
214+ }
215+ }
216+ catch ( Exception e )
217+ {
218+ this . log . Error ( "Unexpected error while deleting devices" , e ) ;
179219 }
180220 }
181221
@@ -213,6 +253,8 @@ private async Task DeleteIoTHubDevicesAsync(Simulation simulation)
213253 : "Device deletion is still in progress" ,
214254 ( ) => new { SimulationId = simulation . Id } ) ;
215255 }
256+
257+ deviceService . Dispose ( ) ;
216258 }
217259
218260 // Start the job to delete the devices
@@ -233,6 +275,8 @@ private async Task DeleteIoTHubDevicesAsync(Simulation simulation)
233275 {
234276 this . log . Warn ( "Failed to start device deletion, will retry later" ) ;
235277 }
278+
279+ deviceService . Dispose ( ) ;
236280 }
237281 }
238282
@@ -252,7 +296,11 @@ private async Task CreateIoTHubDevicesAsync(Simulation simulation)
252296
253297 if ( await deviceService . IsJobCompleteAsync ( simulation . DeviceCreationJobId , ( ) => { creationFailed = true ; } ) )
254298 {
255- this . log . Info ( "All devices have been created, updating the simulation record" , ( ) => new { SimulationId = simulation . Id } ) ;
299+ // Note: at this point we don't know if all devices have been created, quota can cause some errors,
300+ // see job log in the storage account
301+ this . log . Info ( "Device creation job complete, updating the simulation record. All devices should have been created. " +
302+ "If any error occurred, the 'importErrors.log' file in the storage account contains the details." ,
303+ ( ) => new { SimulationId = simulation . Id } ) ;
256304
257305 if ( await this . simulations . TryToSetDeviceCreationCompleteAsync ( simulation . Id ) )
258306 {
@@ -270,6 +318,8 @@ private async Task CreateIoTHubDevicesAsync(Simulation simulation)
270318 : "Device creation is still in progress" ,
271319 ( ) => new { SimulationId = simulation . Id } ) ;
272320 }
321+
322+ deviceService . Dispose ( ) ;
273323 }
274324
275325 // Start the job to import the devices
@@ -290,53 +340,69 @@ private async Task CreateIoTHubDevicesAsync(Simulation simulation)
290340 {
291341 this . log . Warn ( "Failed to start device creation, will retry later" ) ;
292342 }
343+
344+ deviceService . Dispose ( ) ;
293345 }
294346 }
295347
296348 private async Task CreatePartitionsAsync ( IList < Simulation > activeSimulations )
297349 {
298- if ( activeSimulations . Count == 0 ) return ;
350+ try
351+ {
352+ if ( activeSimulations . Count == 0 ) return ;
299353
300- var simulationsToPartition = activeSimulations . Where ( x => x . PartitioningRequired ) . ToList ( ) ;
354+ var simulationsToPartition = activeSimulations . Where ( x => x . PartitioningRequired ) . ToList ( ) ;
301355
302- if ( simulationsToPartition . Count == 0 )
303- {
304- this . log . Debug ( "No simulations to be partitioned" ) ;
305- return ;
306- }
356+ if ( simulationsToPartition . Count == 0 )
357+ {
358+ this . log . Debug ( "No simulations to be partitioned" ) ;
359+ return ;
360+ }
307361
308- foreach ( Simulation sim in simulationsToPartition )
362+ foreach ( Simulation sim in simulationsToPartition )
363+ {
364+ await this . partitions . CreateAsync ( sim . Id ) ;
365+ }
366+ }
367+ catch ( Exception e )
309368 {
310- await this . partitions . CreateAsync ( sim . Id ) ;
369+ this . log . Error ( "Unexpected error while creating partitions" , e ) ;
311370 }
312371 }
313372
314373 private async Task DeletePartitionsAsync ( IList < Simulation > activeSimulations )
315374 {
316- if ( activeSimulations . Count == 0 ) return ;
375+ try
376+ {
377+ if ( activeSimulations . Count == 0 ) return ;
317378
318- this . log . Debug ( "Searching partitions to delete..." ) ;
379+ this . log . Debug ( "Searching partitions to delete..." ) ;
319380
320- var allPartitions = await this . partitions . GetAllAsync ( ) ;
321- var simulationIds = new HashSet < string > ( activeSimulations . Select ( x => x . Id ) ) ;
322- var partitionIds = new List < string > ( ) ;
323- foreach ( var partition in allPartitions )
324- {
325- if ( ! simulationIds . Contains ( partition . SimulationId ) )
381+ var allPartitions = await this . partitions . GetAllAsync ( ) ;
382+ var simulationIds = new HashSet < string > ( activeSimulations . Select ( x => x . Id ) ) ;
383+ var partitionIds = new List < string > ( ) ;
384+ foreach ( var partition in allPartitions )
326385 {
327- partitionIds . Add ( partition . Id ) ;
386+ if ( ! simulationIds . Contains ( partition . SimulationId ) )
387+ {
388+ partitionIds . Add ( partition . Id ) ;
389+ }
390+ }
391+
392+ if ( partitionIds . Count == 0 )
393+ {
394+ this . log . Debug ( "No partitions to delete" ) ;
395+ return ;
328396 }
329- }
330397
331- if ( partitionIds . Count == 0 )
398+ // TODO: partitions should be deleted only after its actors are down
399+ this . log . Debug ( "Deleting partitions..." , ( ) => new { partitionIds . Count } ) ;
400+ await this . partitions . DeleteListAsync ( partitionIds ) ;
401+ }
402+ catch ( Exception e )
332403 {
333- this . log . Debug ( "No partitions to delete" ) ;
334- return ;
404+ this . log . Error ( "Unexpected error while deleting partitions" , e ) ;
335405 }
336-
337- // TODO: partitions should be deleted only after its actors are down
338- this . log . Debug ( "Deleting partitions..." , ( ) => new { partitionIds . Count } ) ;
339- await this . partitions . DeleteListAsync ( partitionIds ) ;
340406 }
341407 }
342408}
0 commit comments