
Commit 84af897

Adding win-x64 support for nvidia-smi (#546)
* Adding win-x64 support for nvidia-smi
* Adding win-x64 support in class
* Adding documentation edits
* Bumping VC Version

Co-authored-by: Deepanshu Vaid <[email protected]>
1 parent 71df05b commit 84af897


4 files changed (+26, -28 lines)


VERSION

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-2.1.1
+2.1.2

src/VirtualClient/VirtualClient.Main/profiles/MONITORS-GPU-NVIDIA.json

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 {
     "Description": "Default Monitors for Nvidia GPU systems.",
     "Metadata": {
-        "SupportedPlatforms": "linux-arm64,linux-x64",
+        "SupportedPlatforms": "linux-arm64,linux-x64,win-x64",
         "SupportedOperatingSystems": "CBL-Mariner,CentOS,Debian,RedHat,Suse,Ubuntu,Windows"
     },
     "Parameters": {

src/VirtualClient/VirtualClient.Monitors/Nvidia-Smi/NvidiaSmiMonitor.cs

Lines changed: 22 additions & 25 deletions
@@ -18,7 +18,7 @@ namespace VirtualClient.Monitors
     /// <summary>
     /// The Performance Counter Monitor for Virtual Client
     /// </summary>
-    [SupportedPlatforms("linux-arm64,linux-x64")]
+    [SupportedPlatforms("linux-arm64,linux-x64,win-x64")]
     public class NvidiaSmiMonitor : VirtualClientIntervalBasedMonitor
     {
         /// <summary>
@@ -39,34 +39,31 @@ protected override Task ExecuteAsync(EventContext telemetryContext, Cancellation
         {
             try
             {
-                if (this.Platform == PlatformID.Unix)
+                // Check that nvidia-smi is installed. If not, we exit the monitor.
+                bool toolsetInstalled = await this.VerifyToolsetInstalledAsync(telemetryContext, cancellationToken);
+
+                if (toolsetInstalled)
                 {
-                    // Check that nvidia-smi is installed. If not, we exit the monitor.
-                    bool toolsetInstalled = await this.VerifyToolsetInstalledAsync(telemetryContext, cancellationToken);
+                    await this.WaitAsync(this.MonitorWarmupPeriod, cancellationToken);
 
-                    if (toolsetInstalled)
+                    int iterations = 0;
+                    while (!cancellationToken.IsCancellationRequested)
                     {
-                        await this.WaitAsync(this.MonitorWarmupPeriod, cancellationToken);
-
-                        int iterations = 0;
-                        while (!cancellationToken.IsCancellationRequested)
+                        try
                         {
-                            try
-                            {
-                                iterations++;
-                                if (this.IsIterationComplete(iterations))
-                                {
-                                    break;
-                                }
-
-                                await this.QueryC2CAsync(telemetryContext, cancellationToken);
-                                await this.QueryGpuAsync(telemetryContext, cancellationToken);
-                                await this.WaitAsync(this.MonitorFrequency, cancellationToken);
-                            }
-                            catch (Exception exc)
+                            iterations++;
+                            if (this.IsIterationComplete(iterations))
                             {
-                                this.Logger.LogErrorMessage(exc, telemetryContext, LogLevel.Warning);
+                                break;
                             }
+
+                            await this.QueryC2CAsync(telemetryContext, cancellationToken);
+                            await this.QueryGpuAsync(telemetryContext, cancellationToken);
+                            await this.WaitAsync(this.MonitorFrequency, cancellationToken);
+                        }
+                        catch (Exception exc)
+                        {
+                            this.Logger.LogErrorMessage(exc, telemetryContext, LogLevel.Warning);
                         }
                     }
                 }
@@ -140,7 +137,7 @@ private async Task QueryC2CAsync(EventContext telemetryContext, CancellationToke
                 this.Logger.LogErrorMessage(exc, telemetryContext, LogLevel.Warning);
             }
         }
-
+
         private async Task QueryGpuAsync(EventContext telemetryContext, CancellationToken cancellationToken)
         {
             // This is the Nvidia smi query gpu command
@@ -161,7 +158,7 @@ private async Task QueryGpuAsync(EventContext telemetryContext, CancellationToke
                 "ecc.errors.corrected.volatile.total,ecc.errors.corrected.aggregate.device_memory,ecc.errors.corrected.aggregate.dram,ecc.errors.corrected.aggregate.sram," +
                 "ecc.errors.corrected.aggregate.total,ecc.errors.uncorrected.volatile.device_memory,ecc.errors.uncorrected.volatile.dram,ecc.errors.uncorrected.volatile.sram," +
                 "ecc.errors.uncorrected.volatile.total,ecc.errors.uncorrected.aggregate.device_memory,ecc.errors.uncorrected.aggregate.dram,ecc.errors.uncorrected.aggregate.sram," +
-                "ecc.errors.uncorrected.aggregate.total " +
+                "ecc.errors.uncorrected.aggregate.total " +
                 "--format=csv,nounits";
 
             DateTime nextIteration = DateTime.UtcNow;

website/docs/monitors/0300-nvidia-smi.md

Lines changed: 2 additions & 1 deletion
@@ -1,7 +1,7 @@
 # Nvidia SMI
 The NVIDIA System Management Interface (nvidia-smi) is a command line utility, based on top of the NVIDIA Management Library (NVML), intended to aid in the management and monitoring of NVIDIA GPU devices.
 
-This utility allows administrators to query GPU device state and with the appropriate privileges, permits administrators to modify GPU device state. It is targeted at the TeslaTM, GRIDTM, QuadroTM and Titan X product, though limited support is also available on other NVIDIA GPUs.
+This utility allows administrators to query GPU device state and with the appropriate privileges, permits administrators to modify GPU device state. It is targeted at the Blackwell, Hopper, Ampere, TeslaTM, GRIDTM, QuadroTM and Titan X product, though limited support is also available on other NVIDIA GPUs.
 
 NVIDIA-smi ships with NVIDIA GPU display drivers on Linux, and with 64bit Windows Server 2008 R2 and Windows 7. Nvidia-smi can report query information as XML or human readable plain text to either standard output or a file. For more details, please refer to the nvidia-smi documentation.
 
@@ -14,6 +14,7 @@ This monitor has dependency on nvidia-smi. Please use [Nvidia Driver Installatio
 ## Supported Platforms
 * linux-x64
 * linux-arm64
+* win-x64
 
 ## Supported Query
 Right now the query supported are --query-gpu and --query-c2c. Please create a feature request if you need other queries.
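
As a quick illustration of the --query-gpu path the documentation refers to, the sketch below (standalone C#, not VirtualClient code) runs nvidia-smi with a small, assumed subset of standard query fields and prints the returned CSV rows; the monitor's actual field list is only partially visible in the NvidiaSmiMonitor.cs hunk above.

// Illustrative only: issue a --query-gpu request and print the CSV rows. The field
// subset below (name, utilization.gpu, temperature.gpu, memory.used) is an assumption
// for the example, not the monitor's full query string.
using System;
using System.Diagnostics;

var startInfo = new ProcessStartInfo
{
    FileName = "nvidia-smi",
    Arguments = "--query-gpu=name,utilization.gpu,temperature.gpu,memory.used --format=csv,nounits",
    RedirectStandardOutput = true,
    UseShellExecute = false
};

using var process = Process.Start(startInfo)
    ?? throw new InvalidOperationException("nvidia-smi could not be started.");

string csv = process.StandardOutput.ReadToEnd();    // header row, then one row per GPU
process.WaitForExit();

foreach (string row in csv.Split('\n', StringSplitOptions.RemoveEmptyEntries))
{
    Console.WriteLine(row.Trim());
}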
