diff --git a/src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/nvidia-smi/query-nvlink.txt b/src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/nvidia-smi/query-nvlink.txt new file mode 100644 index 0000000000..8152c7c72e --- /dev/null +++ b/src/VirtualClient/VirtualClient.Monitors.UnitTests/Examples/nvidia-smi/query-nvlink.txt @@ -0,0 +1,200 @@ +GPU 0: NVIDIA A100-SXM4-40GB (UUID: GPU-547e49e9-c77f-15a6-b5da-15b9eb0207d3) + Link 0: Data Tx: 1040 KiB + Link 0: Data Rx: 200.3 KiB + Link 1: Data Tx: 1500 KiB + Link 1: Data Rx: 1500 KiB + Link 2: Data Tx: 1500 KiB + Link 2: Data Rx: 1500 KiB + Link 3: Data Tx: 1500 KiB + Link 3: Data Rx: 1500 KiB + Link 4: Data Tx: 1500 KiB + Link 4: Data Rx: 1500 KiB + Link 5: Data Tx: 1500 KiB + Link 5: Data Rx: 1500 KiB + Link 6: Data Tx: 1500 KiB + Link 6: Data Rx: 1500 KiB + Link 7: Data Tx: 1500 KiB + Link 7: Data Rx: 1500 KiB + Link 8: Data Tx: 1500 KiB + Link 8: Data Rx: 1500 KiB + Link 9: Data Tx: 1500 KiB + Link 9: Data Rx: 1500 KiB + Link 10: Data Tx: 1500 KiB + Link 10: Data Rx: 1500 KiB + Link 11: Data Tx: 1500 KiB + Link 11: Data Rx: 1500 KiB +GPU 1: NVIDIA A100-SXM4-40GB (UUID: GPU-dc450a09-2c60-40a5-34fc-77ac7faf167e) + Link 0: Data Tx: 800 KiB + Link 0: Data Rx: 800 KiB + Link 1: Data Tx: 800 KiB + Link 1: Data Rx: 800 KiB + Link 2: Data Tx: 800 KiB + Link 2: Data Rx: 800 KiB + Link 3: Data Tx: 800 KiB + Link 3: Data Rx: 800 KiB + Link 4: Data Tx: 800 KiB + Link 4: Data Rx: 800 KiB + Link 5: Data Tx: 800 KiB + Link 5: Data Rx: 800 KiB + Link 6: Data Tx: 800 KiB + Link 6: Data Rx: 800 KiB + Link 7: Data Tx: 800 KiB + Link 7: Data Rx: 800 KiB + Link 8: Data Tx: 800 KiB + Link 8: Data Rx: 800 KiB + Link 9: Data Tx: 800 KiB + Link 9: Data Rx: 800 KiB + Link 10: Data Tx: 800 KiB + Link 10: Data Rx: 800 KiB + Link 11: Data Tx: 800 KiB + Link 11: Data Rx: 800 KiB +GPU 2: NVIDIA A100-SXM4-40GB (UUID: GPU-38eef22b-8b56-ae96-1f1f-25575b9fc7e7) + Link 0: Data Tx: 500 KiB + Link 0: Data Rx: 500 KiB + Link 1: Data Tx: 500 KiB + Link 1: Data Rx: 500 KiB + Link 2: Data Tx: 500 KiB + Link 2: Data Rx: 500 KiB + Link 3: Data Tx: 500 KiB + Link 3: Data Rx: 500 KiB + Link 4: Data Tx: 500 KiB + Link 4: Data Rx: 500 KiB + Link 5: Data Tx: 500 KiB + Link 5: Data Rx: 500 KiB + Link 6: Data Tx: 500 KiB + Link 6: Data Rx: 500 KiB + Link 7: Data Tx: 500 KiB + Link 7: Data Rx: 500 KiB + Link 8: Data Tx: 500 KiB + Link 8: Data Rx: 500 KiB + Link 9: Data Tx: 500 KiB + Link 9: Data Rx: 500 KiB + Link 10: Data Tx: 500 KiB + Link 10: Data Rx: 500 KiB + Link 11: Data Tx: 500 KiB + Link 11: Data Rx: 500 KiB +GPU 3: NVIDIA A100-SXM4-40GB (UUID: GPU-bb58bf68-496a-a909-f7a6-eb6e8bff5892) + Link 0: Data Tx: 1200 KiB + Link 0: Data Rx: 1200 KiB + Link 1: Data Tx: 1200 KiB + Link 1: Data Rx: 1200 KiB + Link 2: Data Tx: 1200 KiB + Link 2: Data Rx: 1200 KiB + Link 3: Data Tx: 1200 KiB + Link 3: Data Rx: 1200 KiB + Link 4: Data Tx: 1200 KiB + Link 4: Data Rx: 1200 KiB + Link 5: Data Tx: 1200 KiB + Link 5: Data Rx: 1200 KiB + Link 6: Data Tx: 1200 KiB + Link 6: Data Rx: 1200 KiB + Link 7: Data Tx: 1200 KiB + Link 7: Data Rx: 1200 KiB + Link 8: Data Tx: 1200 KiB + Link 8: Data Rx: 1200 KiB + Link 9: Data Tx: 1200 KiB + Link 9: Data Rx: 1200 KiB + Link 10: Data Tx: 1200 KiB + Link 10: Data Rx: 1200 KiB + Link 11: Data Tx: 1200 KiB + Link 11: Data Rx: 1200 KiB +GPU 4: NVIDIA A100-SXM4-40GB (UUID: GPU-e7900065-8d18-a01c-7d45-9ef032d7d1ed) + Link 0: Data Tx: 2000 KiB + Link 0: Data Rx: 2000 KiB + Link 1: Data Tx: 2000 KiB + Link 1: Data Rx: 2000 KiB + Link 2: Data Tx: 2000 KiB + Link 2: Data Rx: 2000 KiB + Link 3: Data Tx: 2000 KiB + Link 3: Data Rx: 2000 KiB + Link 4: Data Tx: 2000 KiB + Link 4: Data Rx: 2000 KiB + Link 5: Data Tx: 2000 KiB + Link 5: Data Rx: 2000 KiB + Link 6: Data Tx: 2000 KiB + Link 6: Data Rx: 2000 KiB + Link 7: Data Tx: 2000 KiB + Link 7: Data Rx: 2000 KiB + Link 8: Data Tx: 2000 KiB + Link 8: Data Rx: 2000 KiB + Link 9: Data Tx: 2000 KiB + Link 9: Data Rx: 2000 KiB + Link 10: Data Tx: 2000 KiB + Link 10: Data Rx: 2000 KiB + Link 11: Data Tx: 2000 KiB + Link 11: Data Rx: 2000 KiB +GPU 5: NVIDIA A100-SXM4-40GB (UUID: GPU-8e000139-4a61-ec47-798b-374ae1cbf96a) + Link 0: Data Tx: 400 KiB + Link 0: Data Rx: 400 KiB + Link 1: Data Tx: 400 KiB + Link 1: Data Rx: 400 KiB + Link 2: Data Tx: 400 KiB + Link 2: Data Rx: 400 KiB + Link 3: Data Tx: 400 KiB + Link 3: Data Rx: 400 KiB + Link 4: Data Tx: 400 KiB + Link 4: Data Rx: 400 KiB + Link 5: Data Tx: 400 KiB + Link 5: Data Rx: 400 KiB + Link 6: Data Tx: 400 KiB + Link 6: Data Rx: 400 KiB + Link 7: Data Tx: 400 KiB + Link 7: Data Rx: 400 KiB + Link 8: Data Tx: 400 KiB + Link 8: Data Rx: 400 KiB + Link 9: Data Tx: 400 KiB + Link 9: Data Rx: 400 KiB + Link 10: Data Tx: 400 KiB + Link 10: Data Rx: 400 KiB + Link 11: Data Tx: 400 KiB + Link 11: Data Rx: 400 KiB +GPU 6: NVIDIA A100-SXM4-40GB (UUID: GPU-53bbb70c-10a4-f0b3-9e6a-0bfc103ed298) + Link 0: Data Tx: 750 KiB + Link 0: Data Rx: 750 KiB + Link 1: Data Tx: 750 KiB + Link 1: Data Rx: 750 KiB + Link 2: Data Tx: 750 KiB + Link 2: Data Rx: 750 KiB + Link 3: Data Tx: 750 KiB + Link 3: Data Rx: 750 KiB + Link 4: Data Tx: 750 KiB + Link 4: Data Rx: 750 KiB + Link 5: Data Tx: 750 KiB + Link 5: Data Rx: 750 KiB + Link 6: Data Tx: 750 KiB + Link 6: Data Rx: 750 KiB + Link 7: Data Tx: 750 KiB + Link 7: Data Rx: 750 KiB + Link 8: Data Tx: 750 KiB + Link 8: Data Rx: 750 KiB + Link 9: Data Tx: 750 KiB + Link 9: Data Rx: 750 KiB + Link 10: Data Tx: 750 KiB + Link 10: Data Rx: 750 KiB + Link 11: Data Tx: 750 KiB + Link 11: Data Rx: 750 KiB +GPU 7: NVIDIA A100-SXM4-40GB (UUID: GPU-f6babbbb-c44f-416a-79ec-8d28350c2ad2) + Link 0: Data Tx: 600 KiB + Link 0: Data Rx: 600 KiB + Link 1: Data Tx: 600 KiB + Link 1: Data Rx: 600 KiB + Link 2: Data Tx: 600 KiB + Link 2: Data Rx: 600 KiB + Link 3: Data Tx: 600 KiB + Link 3: Data Rx: 600 KiB + Link 4: Data Tx: 600 KiB + Link 4: Data Rx: 600 KiB + Link 5: Data Tx: 600 KiB + Link 5: Data Rx: 600 KiB + Link 6: Data Tx: 600 KiB + Link 6: Data Rx: 600 KiB + Link 7: Data Tx: 600 KiB + Link 7: Data Rx: 600 KiB + Link 8: Data Tx: 600 KiB + Link 8: Data Rx: 600 KiB + Link 9: Data Tx: 600 KiB + Link 9: Data Rx: 600 KiB + Link 10: Data Tx: 600 KiB + Link 10: Data Rx: 600 KiB + Link 11: Data Tx: 600 KiB + Link 11: Data Rx: 600 KiB \ No newline at end of file diff --git a/src/VirtualClient/VirtualClient.Monitors.UnitTests/NvidiaSmiQueryC2CParserUnitTest.cs b/src/VirtualClient/VirtualClient.Monitors.UnitTests/NvidiaSmi/NvidiaSmiQueryC2CParserUnitTest.cs similarity index 93% rename from src/VirtualClient/VirtualClient.Monitors.UnitTests/NvidiaSmiQueryC2CParserUnitTest.cs rename to src/VirtualClient/VirtualClient.Monitors.UnitTests/NvidiaSmi/NvidiaSmiQueryC2CParserUnitTest.cs index 60525fff5d..6e2d78cc45 100644 --- a/src/VirtualClient/VirtualClient.Monitors.UnitTests/NvidiaSmiQueryC2CParserUnitTest.cs +++ b/src/VirtualClient/VirtualClient.Monitors.UnitTests/NvidiaSmi/NvidiaSmiQueryC2CParserUnitTest.cs @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. -namespace VirtualClient.Monitors.UnitTests +namespace VirtualClient.Monitors { using NUnit.Framework; using System; @@ -27,8 +27,8 @@ public void NvidiaSmiC2CParserParsesMetricsCorrectly() NvidiaSmiC2CParser testParser = new NvidiaSmiC2CParser(rawText); IList metrics = testParser.Parse(); - Assert.AreEqual(10, metrics.Count); - MetricAssert.Exists(metrics, "GPU 0: C2C Link 0 Speed", 44.712, "GB/s"); + Assert.AreEqual(10, metrics.Count); + MetricAssert.Exists(metrics, "GPU 0: C2C Link 0 Speed", 44.712, "GB/s"); MetricAssert.Exists(metrics, "GPU 0: C2C Link 1 Speed", 44.712, "GB/s"); MetricAssert.Exists(metrics, "GPU 0: C2C Link 2 Speed", 44.712, "GB/s"); MetricAssert.Exists(metrics, "GPU 0: C2C Link 3 Speed", 44.712, "GB/s"); diff --git a/src/VirtualClient/VirtualClient.Monitors.UnitTests/NvidiaSmiQueryGpuParserUnitTests.cs b/src/VirtualClient/VirtualClient.Monitors.UnitTests/NvidiaSmi/NvidiaSmiQueryGpuParserUnitTests.cs similarity index 100% rename from src/VirtualClient/VirtualClient.Monitors.UnitTests/NvidiaSmiQueryGpuParserUnitTests.cs rename to src/VirtualClient/VirtualClient.Monitors.UnitTests/NvidiaSmi/NvidiaSmiQueryGpuParserUnitTests.cs diff --git a/src/VirtualClient/VirtualClient.Monitors.UnitTests/NvidiaSmi/NvidiaSmiQueryNvLinkParserUnitTests.cs b/src/VirtualClient/VirtualClient.Monitors.UnitTests/NvidiaSmi/NvidiaSmiQueryNvLinkParserUnitTests.cs new file mode 100644 index 0000000000..018a066fea --- /dev/null +++ b/src/VirtualClient/VirtualClient.Monitors.UnitTests/NvidiaSmi/NvidiaSmiQueryNvLinkParserUnitTests.cs @@ -0,0 +1,41 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +namespace VirtualClient.Monitors +{ + using NUnit.Framework; + using System; + using System.Collections.Generic; + using System.IO; + using System.Linq; + using System.Reflection; + using System.Text; + using System.Threading.Tasks; + using VirtualClient.Contracts; + + [TestFixture] + [Category("Unit")] + public class NvidiaSmiQueryNvLinkParserUnitTest + { + [Test] + public void NvidiaSmiNvLinkParserParsesMetricsCorrectly() + { + string workingDirectory = Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location); + string outputPath = Path.Combine(workingDirectory, "Examples", "nvidia-smi", "query-nvlink.txt"); + string rawText = File.ReadAllText(outputPath); + + NvidiaSmiQueryNvLinkParser testParser = new NvidiaSmiQueryNvLinkParser(rawText); + IList metrics = testParser.Parse(); + + Assert.AreEqual(192, metrics.Count); + MetricAssert.Exists(metrics, "GPU 0: NvLink Rx 0 Throughput", 200.3, "KiB"); + MetricAssert.Exists(metrics, "GPU 1: NvLink Tx 11 Throughput", 800, "KiB"); + MetricAssert.Exists(metrics, "GPU 2: NvLink Rx 9 Throughput", 500, "KiB"); + MetricAssert.Exists(metrics, "GPU 3: NvLink Tx 5 Throughput", 1200, "KiB"); + MetricAssert.Exists(metrics, "GPU 4: NvLink Rx 1 Throughput", 2000, "KiB"); + MetricAssert.Exists(metrics, "GPU 5: NvLink Tx 3 Throughput", 400, "KiB"); + MetricAssert.Exists(metrics, "GPU 6: NvLink Rx 2 Throughput", 750, "KiB"); + MetricAssert.Exists(metrics, "GPU 7: NvLink Tx 10 Throughput", 600, "KiB"); + } + } +} diff --git a/src/VirtualClient/VirtualClient.Monitors/Nvidia-Smi/NvidiaSmiMonitor.cs b/src/VirtualClient/VirtualClient.Monitors/Nvidia-Smi/NvidiaSmiMonitor.cs index 04741a6caa..c900ba8884 100644 --- a/src/VirtualClient/VirtualClient.Monitors/Nvidia-Smi/NvidiaSmiMonitor.cs +++ b/src/VirtualClient/VirtualClient.Monitors/Nvidia-Smi/NvidiaSmiMonitor.cs @@ -22,6 +22,8 @@ namespace VirtualClient.Monitors /// public class NvidiaSmiMonitor : VirtualClientIntervalBasedMonitor { + private const string NvidiaSmiCommand = "nvidia-smi"; + /// /// Initializes a new instance of the class. /// @@ -43,10 +45,26 @@ protected override async Task ExecuteAsync(EventContext telemetryContext, Cancel break; case PlatformID.Unix: - await this.QueryC2CAsync(telemetryContext, cancellationToken) - .ConfigureAwait(false); - await this.QueryGpuAsync(telemetryContext, cancellationToken) + + await Task.Delay(this.MonitorWarmupPeriod, cancellationToken) .ConfigureAwait(false); + + while (!cancellationToken.IsCancellationRequested) + { + DateTime nextIteration = DateTime.UtcNow; + await this.WaitAsync(nextIteration, cancellationToken); + nextIteration = DateTime.UtcNow.Add(this.MonitorFrequency); + + await this.QueryC2CAsync(telemetryContext, cancellationToken) + .ConfigureAwait(false); + + await this.QueryNvLinkBandwidthAsync(telemetryContext, cancellationToken) + .ConfigureAwait(false); + + await this.QueryGpuAsync(telemetryContext, cancellationToken) + .ConfigureAwait(false); + } + break; } } @@ -64,21 +82,53 @@ protected override void Validate() } } - private async Task QueryC2CAsync(EventContext telemetryContext, CancellationToken cancellationToken) + private async Task QueryNvLinkBandwidthAsync(EventContext telemetryContext, CancellationToken cancellationToken) { - ISystemManagement systemManagement = this.Dependencies.GetService(); + string arguments = "nvlink -gt d"; + + try + { + DateTime startTime = DateTime.UtcNow; + IProcessProxy process = await this.ExecuteCommandAsync(NvidiaSmiCommand, arguments, Environment.CurrentDirectory, telemetryContext, cancellationToken, runElevated: true); + DateTime endTime = DateTime.UtcNow; + + if (!cancellationToken.IsCancellationRequested) + { + await this.LogProcessDetailsAsync(process, telemetryContext, "Nvidia-Smi-NvLink", logToFile: true); + process.ThrowIfErrored(errorReason: ErrorReason.MonitorFailed); + + if (process.StandardOutput.Length > 0) + { + NvidiaSmiQueryNvLinkParser parser = new NvidiaSmiQueryNvLinkParser(process.StandardOutput.ToString()); + IList metrics = parser.Parse(); + + if (metrics?.Any() == true) + { + this.Logger.LogPerformanceCounters("nvidia", metrics, startTime, endTime, telemetryContext); + } + } + } + } + catch (OperationCanceledException) + { + // Expected whenever ctrl-C is used. + } + catch (Exception exc) + { + // This would be expected on new VM while nvidia-smi is still being installed. + this.Logger.LogErrorMessage(exc, telemetryContext, LogLevel.Warning); + } + } + private async Task QueryC2CAsync(EventContext telemetryContext, CancellationToken cancellationToken) + { // This is the Nvidia smi c2c command - string command = "nvidia-smi"; string c2cCommandArguments = "c2c -s"; - await Task.Delay(this.MonitorWarmupPeriod, cancellationToken) - .ConfigureAwait(false); - try { DateTime startTime = DateTime.UtcNow; - IProcessProxy process = await this.ExecuteCommandAsync(command, c2cCommandArguments, Environment.CurrentDirectory, telemetryContext, cancellationToken, runElevated: true); + IProcessProxy process = await this.ExecuteCommandAsync(NvidiaSmiCommand, c2cCommandArguments, Environment.CurrentDirectory, telemetryContext, cancellationToken, runElevated: true); DateTime endTime = DateTime.UtcNow; if (!cancellationToken.IsCancellationRequested) @@ -109,7 +159,7 @@ await Task.Delay(this.MonitorWarmupPeriod, cancellationToken) this.Logger.LogErrorMessage(exc, telemetryContext, LogLevel.Warning); } } - + private async Task QueryGpuAsync(EventContext telemetryContext, CancellationToken cancellationToken) { ISystemManagement systemManagement = this.Dependencies.GetService(); @@ -125,8 +175,7 @@ private async Task QueryGpuAsync(EventContext telemetryContext, CancellationToke // ecc.errors.uncorrected.volatile.total,ecc.errors.uncorrected.aggregate.device_memory,ecc.errors.uncorrected.aggregate.dram,ecc.errors.uncorrected.aggregate.sram, // ecc.errors.uncorrected.aggregate.total // --format=csv,nounits - int totalSamples = (int)this.MonitorFrequency.TotalSeconds; - string command = "nvidia-smi"; + string commandArguments = "--query-gpu=timestamp,name,pci.bus_id,driver_version,pstate,pcie.link.gen.max,pcie.link.gen.current,utilization.gpu,utilization.memory,temperature.gpu,temperature.memory," + "power.draw.average,clocks.gr,clocks.sm,clocks.video,clocks.mem,memory.total,memory.free,memory.used,power.draw.instant,pcie.link.gen.gpucurrent," + "pcie.link.width.current,ecc.errors.corrected.volatile.device_memory,ecc.errors.corrected.volatile.dram,ecc.errors.corrected.volatile.sram," + @@ -136,56 +185,45 @@ private async Task QueryGpuAsync(EventContext telemetryContext, CancellationToke "ecc.errors.uncorrected.aggregate.total " + "--format=csv,nounits"; - await Task.Delay(this.MonitorWarmupPeriod, cancellationToken) - .ConfigureAwait(false); - - DateTime nextIteration = DateTime.UtcNow; - - while (!cancellationToken.IsCancellationRequested) + try { - try + using (IProcessProxy process = systemManagement.ProcessManager.CreateElevatedProcess(this.Platform, NvidiaSmiCommand, $"{commandArguments}", Environment.CurrentDirectory)) { - await this.WaitAsync(nextIteration, cancellationToken); - nextIteration = DateTime.UtcNow.Add(this.MonitorFrequency); + this.CleanupTasks.Add(() => process.SafeKill()); - using (IProcessProxy process = systemManagement.ProcessManager.CreateElevatedProcess(this.Platform, command, $"{commandArguments}", Environment.CurrentDirectory)) - { - this.CleanupTasks.Add(() => process.SafeKill()); + DateTime startTime = DateTime.UtcNow; + await process.StartAndWaitAsync(cancellationToken) + .ConfigureAwait(false); - DateTime startTime = DateTime.UtcNow; - await process.StartAndWaitAsync(cancellationToken) - .ConfigureAwait(false); + DateTime endTime = DateTime.UtcNow; - DateTime endTime = DateTime.UtcNow; + if (!cancellationToken.IsCancellationRequested) + { + // We cannot log the process details here. The output is too large. + await this.LogProcessDetailsAsync(process, telemetryContext, "Nvidia-Smi-gpu", logToFile: true); + process.ThrowIfErrored(errorReason: ErrorReason.MonitorFailed); - if (!cancellationToken.IsCancellationRequested) + if (process.StandardOutput.Length > 0) { - // We cannot log the process details here. The output is too large. - await this.LogProcessDetailsAsync(process, telemetryContext, "Nvidia-Smi-gpu", logToFile: true); - process.ThrowIfErrored(errorReason: ErrorReason.MonitorFailed); + NvidiaSmiQueryGpuParser parser = new NvidiaSmiQueryGpuParser(process.StandardOutput.ToString()); + IList metrics = parser.Parse(); - if (process.StandardOutput.Length > 0) + if (metrics?.Any() == true) { - NvidiaSmiQueryGpuParser parser = new NvidiaSmiQueryGpuParser(process.StandardOutput.ToString()); - IList metrics = parser.Parse(); - - if (metrics?.Any() == true) - { - this.Logger.LogPerformanceCounters("nvidia", metrics, startTime, endTime, telemetryContext); - } + this.Logger.LogPerformanceCounters("nvidia", metrics, startTime, endTime, telemetryContext); } } } } - catch (OperationCanceledException) - { - // Expected whenever ctrl-C is used. - } - catch (Exception exc) - { - // This would be expected on new VM while nvidia-smi is still being installed. - this.Logger.LogErrorMessage(exc, telemetryContext, LogLevel.Warning); - } + } + catch (OperationCanceledException) + { + // Expected whenever ctrl-C is used. + } + catch (Exception exc) + { + // This would be expected on new VM while nvidia-smi is still being installed. + this.Logger.LogErrorMessage(exc, telemetryContext, LogLevel.Warning); } } } diff --git a/src/VirtualClient/VirtualClient.Monitors/Nvidia-Smi/NvidiaSmiQueryNvLinkParser.cs b/src/VirtualClient/VirtualClient.Monitors/Nvidia-Smi/NvidiaSmiQueryNvLinkParser.cs new file mode 100644 index 0000000000..6af058029f Binary files /dev/null and b/src/VirtualClient/VirtualClient.Monitors/Nvidia-Smi/NvidiaSmiQueryNvLinkParser.cs differ