Skip to content
This repository was archived by the owner on Oct 11, 2023. It is now read-only.

Commit 8fa180f

Browse files
authored
Reliability improvements
**Major:**

1. Backoff in case of d2c messaging throttling, stopping telemetry and twin operations
2. Improve perf reducing load on garbage collector and disposing IoT SDK resources
3. Improve device twin management, e.g. try to avoid twin writes when data is not changed, handle more error scenarios
4. Add "development" feature flag to enable/disable expensive runtime checks
5. Change partitions size from 1000 to 5000 devices to reduce the load on storage

**Minor:**

1. Backoff in case of device count throttling (e.g. when reaching 8000 devices in the free SKU)
2. Remove unused daily counter for telemetry
3. Improve perf reducing the number of no-op async tasks
4. Update IoT SDK and other dependencies
5. Clean up logging code, reimplement log filtering (removed in past PRs)
6. Print SDK version at startup
7. Add some scripts for development, see /scripts/development (create/delete simulation, start storage adapter)

**Bug fixes:**

1. Handle and recover from exceptions in the partitioning agent
2. Fix some swallowed errors/exceptions and unnecessary try/catch
3. Share script interpreter between methods and state to ensure device state consistency
4. Fix logged throughput in case of no traffic, i.e. show 0.0 msg/sec, and round value to 3 decimals
5. Fix logging from ConfigData
6. Change the dev endpoint used to delete simulations: don't delete devices (the endpoint was not working) - no user impact, dev only
1 parent 4cc7936 commit 8fa180f

File tree

99 files changed

+1982
-948
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

99 files changed

+1982
-948
lines changed

PartitioningAgent.Test/PartitioningAgent.Test.csproj

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -5,15 +5,15 @@
55
<IsPackable>false</IsPackable>
66
</PropertyGroup>
77
<ItemGroup>
8-
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="15.8.0"/>
9-
<PackageReference Include="Moq" Version="4.10.0"/>
10-
<PackageReference Include="xunit" Version="2.4.0"/>
11-
<PackageReference Include="xunit.assert" Version="2.4.0"/>
12-
<PackageReference Include="xunit.runner.console" Version="2.4.0"/>
13-
<PackageReference Include="xunit.runner.visualstudio" Version="2.4.0"/>
8+
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="15.9.0" />
9+
<PackageReference Include="Moq" Version="4.10.0" />
10+
<PackageReference Include="xunit" Version="2.4.1" />
11+
<PackageReference Include="xunit.assert" Version="2.4.1" />
12+
<PackageReference Include="xunit.runner.console" Version="2.4.1" />
13+
<PackageReference Include="xunit.runner.visualstudio" Version="2.4.1" />
1414
</ItemGroup>
1515
<ItemGroup>
16-
<ProjectReference Include="..\PartitioningAgent\PartitioningAgent.csproj"/>
17-
<ProjectReference Include="..\Services\Services.csproj"/>
16+
<ProjectReference Include="..\PartitioningAgent\PartitioningAgent.csproj" />
17+
<ProjectReference Include="..\Services\Services.csproj" />
1818
</ItemGroup>
1919
</Project>

PartitioningAgent/Agent.cs

Lines changed: 152 additions & 86 deletions
Original file line number | Diff line number | Diff line change
@@ -77,31 +77,24 @@ public async Task StartAsync(CancellationToken appStopToken)
7777
var isMaster = await this.clusterNodes.SelfElectToMasterNodeAsync();
7878
if (isMaster)
7979
{
80-
// Reload all simulations to have fresh status and discover new simulations
81-
IList<Simulation> simulations = (await this.simulations.GetListAsync());
82-
83-
IList<Simulation> activeSimulations = simulations
84-
.Where(x => x.IsActiveNow).ToList();
85-
this.log.Debug("Active simulations loaded", () => new { activeSimulations.Count });
86-
87-
IList<Simulation> deletionRequiredSimulations = simulations
88-
.Where(x => x.DeviceDeletionRequired).ToList();
89-
this.log.Debug("InActive simulations loaded", () => new { deletionRequiredSimulations.Count });
90-
9180
await this.clusterNodes.RemoveStaleNodesAsync();
9281

93-
// Scale nodes in Vmss
94-
await this.ScaleVmssNodes(activeSimulations);
82+
var (success, activeSimulations, deletionRequiredSimulations) = await this.GetSimulations();
83+
if (success)
84+
{
85+
// Scale nodes in Vmss
86+
await this.ScaleVmssNodes(activeSimulations);
9587

96-
// Create IoTHub devices for all the active simulations
97-
await this.CreateDevicesAsync(activeSimulations);
88+
// Create IoTHub devices for all the active simulations
89+
await this.CreateDevicesAsync(activeSimulations);
9890

99-
// Delete IoTHub devices for inactive simulations
100-
await this.DeleteDevicesAsync(deletionRequiredSimulations);
91+
// Delete IoTHub devices for inactive simulations
92+
await this.DeleteDevicesAsync(deletionRequiredSimulations);
10193

102-
// Create and delete partitions
103-
await this.CreatePartitionsAsync(activeSimulations);
104-
await this.DeletePartitionsAsync(activeSimulations);
94+
// Create and delete partitions
95+
await this.CreatePartitionsAsync(activeSimulations);
96+
await this.DeletePartitionsAsync(activeSimulations);
97+
}
10598
}
10699

107100
// Sleep some seconds before checking for new simulations (by default 15 seconds)
@@ -113,69 +106,116 @@ public void Stop()
113106
{
114107
this.running = false;
115108
}
116-
117-
private async Task ScaleVmssNodes(IList<Simulation> activeSimulations)
118-
{
119-
// Default node count is 1
120-
var nodeCount = DEFAULT_NODE_COUNT;
121-
var maxDevicesPerNode = this.clusteringConfig.MaxDevicesPerNode;
122109

123-
if (activeSimulations.Count > 0)
110+
private async
111+
Task<(bool success, IList<Simulation> activeSimulations, IList<Simulation> deletionRequiredSimulations)>
112+
GetSimulations()
113+
{
114+
try
124115
{
125-
var models = new List<Simulation.DeviceModelRef>();
126-
var customDevices = 0;
116+
// Reload all simulations to have fresh status and discover new simulations
117+
IList<Simulation> list = (await this.simulations.GetListAsync());
127118

128-
foreach (var simulation in activeSimulations)
129-
{
130-
// Loop through all the device models used in the simulation
131-
models = (from model in simulation.DeviceModels where model.Count > 0 select model).ToList();
119+
IList<Simulation> activeSimulations = list
120+
.Where(x => x.IsActiveNow).ToList();
121+
this.log.Debug("Active simulations loaded", () => new { activeSimulations.Count });
132122

133-
// Count total custom devices
134-
customDevices += simulation.CustomDevices.Count;
135-
}
136-
137-
// Calculate the total number of devices
138-
var totalDevices = models.Sum(model => model.Count) + customDevices;
123+
IList<Simulation> deletionRequiredSimulations = list
124+
.Where(x => x.DeviceDeletionRequired).ToList();
125+
this.log.Debug("Inactive simulations loaded", () => new { deletionRequiredSimulations.Count });
139126

140-
// Calculate number of nodes required
141-
nodeCount = maxDevicesPerNode > 0 ? (int)Math.Ceiling((double)totalDevices / maxDevicesPerNode) : DEFAULT_NODE_COUNT;
127+
return (true, activeSimulations, deletionRequiredSimulations);
142128
}
143-
144-
if (this.currentNodeCount != nodeCount)
129+
catch (Exception e)
145130
{
146-
// Send a request to update vmss auto scale settings to create vm instances
147-
// TODO: when devices are added or removed, the number of VMs might need an update
148-
await this.azureManagementAdapter.CreateOrUpdateVmssAutoscaleSettingsAsync(nodeCount);
149-
150-
this.currentNodeCount = nodeCount;
131+
this.log.Error("An unexpected error occurred in the master node while loading the list of simulations", e);
132+
return (false, null, null);
151133
}
152134
}
153135

154-
private async Task DeleteDevicesAsync(IList<Simulation> deletionRequiredSimulations)
136+
private async Task ScaleVmssNodes(IList<Simulation> activeSimulations)
155137
{
156-
if (deletionRequiredSimulations.Count == 0) return;
138+
try
139+
{
140+
// Default node count is 1
141+
var nodeCount = DEFAULT_NODE_COUNT;
142+
var maxDevicesPerNode = this.clusteringConfig.MaxDevicesPerNode;
143+
144+
if (activeSimulations.Count > 0)
145+
{
146+
var models = new List<Simulation.DeviceModelRef>();
147+
var customDevices = 0;
148+
149+
foreach (var simulation in activeSimulations)
150+
{
151+
// Loop through all the device models used in the simulation
152+
models = (from model in simulation.DeviceModels where model.Count > 0 select model).ToList();
153+
154+
// Count total custom devices
155+
customDevices += simulation.CustomDevices.Count;
156+
}
157157

158-
foreach (var simulation in deletionRequiredSimulations)
158+
// Calculate the total number of devices
159+
var totalDevices = models.Sum(model => model.Count) + customDevices;
160+
161+
// Calculate number of nodes required
162+
nodeCount = maxDevicesPerNode > 0 ? (int) Math.Ceiling((double) totalDevices / maxDevicesPerNode) : DEFAULT_NODE_COUNT;
163+
}
164+
165+
if (this.currentNodeCount != nodeCount)
166+
{
167+
// Send a request to update vmss auto scale settings to create vm instances
168+
// TODO: when devices are added or removed, the number of VMs might need an update
169+
await this.azureManagementAdapter.CreateOrUpdateVmssAutoscaleSettingsAsync(nodeCount);
170+
171+
this.currentNodeCount = nodeCount;
172+
}
173+
}
174+
catch (Exception e)
159175
{
160-
await this.DeleteIoTHubDevicesAsync(simulation);
176+
this.log.Error("Unexpected error while scaling the deployment", e);
161177
}
162178
}
163179

164180
private async Task CreateDevicesAsync(IList<Simulation> activeSimulations)
165181
{
166-
if (activeSimulations.Count == 0) return;
182+
try
183+
{
184+
if (activeSimulations.Count == 0) return;
167185

168-
var simulationsWithDevicesToCreate = activeSimulations.Where(x => x.DeviceCreationRequired).ToList();
186+
var simulationsWithDevicesToCreate = activeSimulations.Where(x => x.DeviceCreationRequired).ToList();
169187

170-
if (simulationsWithDevicesToCreate.Count == 0)
188+
if (simulationsWithDevicesToCreate.Count == 0)
189+
{
190+
this.log.Debug("No simulations require device creation");
191+
return;
192+
}
193+
194+
foreach (var simulation in simulationsWithDevicesToCreate)
195+
{
196+
await this.CreateIoTHubDevicesAsync(simulation);
197+
}
198+
}
199+
catch (Exception e)
171200
{
172-
this.log.Debug("No simulations require device creation");
173-
return;
201+
this.log.Error("Unexpected error while creating devices", e);
174202
}
203+
}
175204

176-
foreach (var simulation in simulationsWithDevicesToCreate)
205+
private async Task DeleteDevicesAsync(IList<Simulation> deletionRequiredSimulations)
206+
{
207+
try
177208
{
178-
await this.CreateIoTHubDevicesAsync(simulation);
209+
if (deletionRequiredSimulations.Count == 0) return;
210+
211+
foreach (var simulation in deletionRequiredSimulations)
212+
{
213+
await this.DeleteIoTHubDevicesAsync(simulation);
214+
}
215+
}
216+
catch (Exception e)
217+
{
218+
this.log.Error("Unexpected error while deleting devices", e);
179219
}
180220
}
181221

@@ -213,6 +253,8 @@ private async Task DeleteIoTHubDevicesAsync(Simulation simulation)
213253
: "Device deletion is still in progress",
214254
() => new { SimulationId = simulation.Id });
215255
}
256+
257+
deviceService.Dispose();
216258
}
217259

218260
// Start the job to delete the devices
@@ -233,6 +275,8 @@ private async Task DeleteIoTHubDevicesAsync(Simulation simulation)
233275
{
234276
this.log.Warn("Failed to start device deletion, will retry later");
235277
}
278+
279+
deviceService.Dispose();
236280
}
237281
}
238282

@@ -252,7 +296,11 @@ private async Task CreateIoTHubDevicesAsync(Simulation simulation)
252296

253297
if (await deviceService.IsJobCompleteAsync(simulation.DeviceCreationJobId, () => { creationFailed = true; }))
254298
{
255-
this.log.Info("All devices have been created, updating the simulation record", () => new { SimulationId = simulation.Id });
299+
// Note: at this point we don't know if all devices have been created, quota can cause some errors,
300+
// see job log in the storage account
301+
this.log.Info("Device creation job complete, updating the simulation record. All devices should have been created. " +
302+
"If any error occurred, the 'importErrors.log' file in the storage account contains the details.",
303+
() => new { SimulationId = simulation.Id });
256304

257305
if (await this.simulations.TryToSetDeviceCreationCompleteAsync(simulation.Id))
258306
{
@@ -270,6 +318,8 @@ private async Task CreateIoTHubDevicesAsync(Simulation simulation)
270318
: "Device creation is still in progress",
271319
() => new { SimulationId = simulation.Id });
272320
}
321+
322+
deviceService.Dispose();
273323
}
274324

275325
// Start the job to import the devices
@@ -290,53 +340,69 @@ private async Task CreateIoTHubDevicesAsync(Simulation simulation)
290340
{
291341
this.log.Warn("Failed to start device creation, will retry later");
292342
}
343+
344+
deviceService.Dispose();
293345
}
294346
}
295347

296348
private async Task CreatePartitionsAsync(IList<Simulation> activeSimulations)
297349
{
298-
if (activeSimulations.Count == 0) return;
350+
try
351+
{
352+
if (activeSimulations.Count == 0) return;
299353

300-
var simulationsToPartition = activeSimulations.Where(x => x.PartitioningRequired).ToList();
354+
var simulationsToPartition = activeSimulations.Where(x => x.PartitioningRequired).ToList();
301355

302-
if (simulationsToPartition.Count == 0)
303-
{
304-
this.log.Debug("No simulations to be partitioned");
305-
return;
306-
}
356+
if (simulationsToPartition.Count == 0)
357+
{
358+
this.log.Debug("No simulations to be partitioned");
359+
return;
360+
}
307361

308-
foreach (Simulation sim in simulationsToPartition)
362+
foreach (Simulation sim in simulationsToPartition)
363+
{
364+
await this.partitions.CreateAsync(sim.Id);
365+
}
366+
}
367+
catch (Exception e)
309368
{
310-
await this.partitions.CreateAsync(sim.Id);
369+
this.log.Error("Unexpected error while creating partitions", e);
311370
}
312371
}
313372

314373
private async Task DeletePartitionsAsync(IList<Simulation> activeSimulations)
315374
{
316-
if (activeSimulations.Count == 0) return;
375+
try
376+
{
377+
if (activeSimulations.Count == 0) return;
317378

318-
this.log.Debug("Searching partitions to delete...");
379+
this.log.Debug("Searching partitions to delete...");
319380

320-
var allPartitions = await this.partitions.GetAllAsync();
321-
var simulationIds = new HashSet<string>(activeSimulations.Select(x => x.Id));
322-
var partitionIds = new List<string>();
323-
foreach (var partition in allPartitions)
324-
{
325-
if (!simulationIds.Contains(partition.SimulationId))
381+
var allPartitions = await this.partitions.GetAllAsync();
382+
var simulationIds = new HashSet<string>(activeSimulations.Select(x => x.Id));
383+
var partitionIds = new List<string>();
384+
foreach (var partition in allPartitions)
326385
{
327-
partitionIds.Add(partition.Id);
386+
if (!simulationIds.Contains(partition.SimulationId))
387+
{
388+
partitionIds.Add(partition.Id);
389+
}
390+
}
391+
392+
if (partitionIds.Count == 0)
393+
{
394+
this.log.Debug("No partitions to delete");
395+
return;
328396
}
329-
}
330397

331-
if (partitionIds.Count == 0)
398+
// TODO: partitions should be deleted only after its actors are down
399+
this.log.Debug("Deleting partitions...", () => new { partitionIds.Count });
400+
await this.partitions.DeleteListAsync(partitionIds);
401+
}
402+
catch (Exception e)
332403
{
333-
this.log.Debug("No partitions to delete");
334-
return;
404+
this.log.Error("Unexpected error while deleting partitions", e);
335405
}
336-
337-
// TODO: partitions should be deleted only after its actors are down
338-
this.log.Debug("Deleting partitions...", () => new { partitionIds.Count });
339-
await this.partitions.DeleteListAsync(partitionIds);
340406
}
341407
}
342408
}

0 commit comments

Comments
 (0)