Skip to content

Commit 167278f

Browse files
nv-hwoomc-nvnv-braf
authored
Update DCGM version to 3.2.6 (#785)
* Update DCGM version (#692) * Update library version * Added new DCGM files * Added new DCGM files * Update package references * Attempt at DCGM fix * Fixing uuid and name * Making PCI ID an int * Printing device types * Next attempt * Fixing copyright issues * Update DCGM version * Ignore pre-commit hooks for DCGM * Fix pre-commit --------- Co-authored-by: Misha Chornyi <[email protected]> Co-authored-by: Misha Chornyi <[email protected]> Co-authored-by: Brian Raf <[email protected]>
1 parent f15427e commit 167278f

28 files changed

+7424
-2376
lines changed

.pre-commit-config.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
2525
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2626

27+
exclude: monitor/dcgm/
2728
repos:
2829
- repo: https://github.com/timothycrosley/isort
2930
rev: 5.12.0

Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ ARG BASE_IMAGE
2727
ARG TRITONSDK_BASE_IMAGE
2828

2929
# DCGM version to install for Model Analyzer
30-
ENV DCGM_VERSION=2.4.7
30+
ENV DCGM_VERSION=3.2.6
3131

3232
# Ensure apt-get won't prompt for selecting options
3333
ENV DEBIAN_FRONTEND=noninteractive

model_analyzer/device/gpu_device_factory.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -66,9 +66,10 @@ def init_all_devices(self, dcgmPath=None):
6666
device_atrributes = dcgm_agent.dcgmGetDeviceAttributes(
6767
dcgm_handle, device_id
6868
).identifiers
69-
pci_bus_id = device_atrributes.pciBusId.decode("utf-8").upper()
70-
device_uuid = str(device_atrributes.uuid, encoding="utf-8")
71-
device_name = str(device_atrributes.deviceName, encoding="utf-8")
69+
pci_bus_id = device_atrributes.pciBusId
70+
device_uuid = device_atrributes.uuid
71+
device_name = device_atrributes.deviceName
72+
7273
gpu_device = GPUDevice(device_name, device_id, pci_bus_id, device_uuid)
7374

7475
self._devices.append(gpu_device)
Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,191 @@
1+
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs
16+
import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent
17+
18+
19+
class DcgmDiag:
    '''
    Builds a dcgmRunDiag request structure and runs it through dcgm_agent.

    The structure being filled in is exposed by GetStruct(). The Add*/Set*
    methods populate individual fields and raise ValueError when a value
    does not fit the limits declared in dcgm_structs.
    '''

    # Maps version codes to simple version values for range comparisons
    _versionMap = {dcgm_structs.dcgmRunDiag_version: 5}

    def __init__(self,
                 gpuIds=None,
                 testNamesStr='',
                 paramsStr='',
                 verbose=True,
                 version=dcgm_structs.dcgmRunDiag_version):
        '''
        gpuIds       - Optional iterable of GPU ids; stored comma-separated in gpuList
        testNamesStr - '' or '1'..'4' selects a validation level ('' means level 1);
                       any other non-numeric value is split on ',' into test names
        paramsStr    - Optional ';'-separated list of test parameters
        verbose      - Forwarded to SetVerbose()
        version      - dcgmRunDiag version code; must be a key of _versionMap

        Raises ValueError for an unknown version, a numeric test name other
        than 1-4, or too many / too long test names or parameters.
        '''
        # Make sure version is valid
        if version not in DcgmDiag._versionMap:
            raise ValueError("'%s' is not a valid version for dcgmRunDiag." %
                             version)
        self.version = version

        if self.version == dcgm_structs.dcgmRunDiag_version7:
            self.runDiagInfo = dcgm_structs.c_dcgmRunDiag_v7()
        else:
            self.runDiagInfo = dcgm_structs.c_dcgmRunDiag_t()

        # Next free slot in runDiagInfo.testNames / runDiagInfo.testParms
        self.numTests = 0
        self.numParams = 0
        self.SetVerbose(verbose)

        if testNamesStr == '':
            # default to a level 1 test
            self.runDiagInfo.validate = 1
        elif testNamesStr in ('1', '2', '3', '4'):
            self.runDiagInfo.validate = int(testNamesStr)
        else:
            # Make sure no number other than 1-4 was submitted
            if testNamesStr.isdigit():
                raise ValueError("'%s' is not a valid test name." %
                                 testNamesStr)

            # Copy to the testNames portion of the object
            names = testNamesStr.split(',')
            if len(names) > dcgm_structs.DCGM_MAX_TEST_NAMES:
                err = 'DcgmDiag cannot initialize: %d test names were specified exceeding the limit of %d.' %\
                      (len(names), dcgm_structs.DCGM_MAX_TEST_NAMES)
                raise ValueError(err)

            for testName in names:
                self.AddTest(testName)

        if paramsStr != '':
            params = paramsStr.split(';')
            if len(params) >= dcgm_structs.DCGM_MAX_TEST_PARMS:
                err = 'DcgmDiag cannot initialize: %d parameters were specified, exceeding the limit of %d.' %\
                      (len(params), dcgm_structs.DCGM_MAX_TEST_PARMS)
                raise ValueError(err)

            for param in params:
                self.AddParameter(param)

        if gpuIds:
            # Build the whole list in Python and assign it once, rather than
            # reading the struct field back on every iteration.
            self.runDiagInfo.gpuList = ','.join(str(gpu) for gpu in gpuIds)

    def SetVerbose(self, val):
        '''Set the verbose run flag when val == True, clear it otherwise.'''
        # Explicit comparison kept on purpose: only True (or 1) enables the flag.
        if val == True:
            self.runDiagInfo.flags |= dcgm_structs.DCGM_RUN_FLAGS_VERBOSE
        else:
            self.runDiagInfo.flags &= ~dcgm_structs.DCGM_RUN_FLAGS_VERBOSE

    def UseFakeGpus(self):
        '''Copy the current gpuList into fakeGpuList.'''
        self.runDiagInfo.fakeGpuList = self.runDiagInfo.gpuList

    def GetStruct(self):
        '''Return the underlying run-diag structure.'''
        return self.runDiagInfo

    def AddParameter(self, parameterStr):
        '''
        Store parameterStr in the next free testParms slot, one character code per element.

        Raises ValueError when the string is DCGM_MAX_TEST_PARMS_LEN characters or longer.
        '''
        if len(parameterStr) >= dcgm_structs.DCGM_MAX_TEST_PARMS_LEN:
            err = 'DcgmDiag cannot add parameter \'%s\' because it exceeds max length %d.' % \
                  (parameterStr, dcgm_structs.DCGM_MAX_TEST_PARMS_LEN)
            raise ValueError(err)

        slot = self.runDiagInfo.testParms[self.numParams]
        for index, c in enumerate(parameterStr):
            slot[index] = ord(c)

        self.numParams += 1

    def AddTest(self, testNameStr):
        '''
        Store testNameStr in the next free testNames slot, one character code per element.

        Raises ValueError when the name is DCGM_MAX_TEST_NAMES_LEN characters or longer.
        '''
        if len(testNameStr) >= dcgm_structs.DCGM_MAX_TEST_NAMES_LEN:
            err = 'DcgmDiag cannot add test name \'%s\' because it exceeds max length %d.' % \
                  (testNameStr, dcgm_structs.DCGM_MAX_TEST_NAMES_LEN)
            raise ValueError(err)

        slot = self.runDiagInfo.testNames[self.numTests]
        for index, c in enumerate(testNameStr):
            slot[index] = ord(c)

        self.numTests += 1

    def SetStatsOnFail(self, val):
        '''Set the stats-on-fail run flag when val == True.'''
        # There is no branch that clears the flag: any other value is a no-op.
        if val == True:
            self.runDiagInfo.flags |= dcgm_structs.DCGM_RUN_FLAGS_STATSONFAIL

    def SetThrottleMask(self, value):
        '''
        Store str(value) as the throttle mask.

        Raises ValueError when the struct version is below 3, or when value is
        a string of DCGM_THROTTLE_MASK_LEN characters or more.
        '''
        if DcgmDiag._versionMap[self.version] < 3:
            raise ValueError(
                "Throttle mask requires minimum version 3 for dcgmRunDiag.")
        if isinstance(
                value,
                str) and len(value) >= dcgm_structs.DCGM_THROTTLE_MASK_LEN:
            raise ValueError("Throttle mask value '%s' exceeds max length %d." %
                             (value, dcgm_structs.DCGM_THROTTLE_MASK_LEN - 1))

        self.runDiagInfo.throttleMask = str(value)

    def SetFailEarly(self, enable=True, checkInterval=5):
        '''
        Enable or disable the fail-early run flag.

        checkInterval is written to failCheckInterval only when enabling.
        Raises ValueError when the struct version is below 5 or when
        checkInterval is not an int.
        '''
        if DcgmDiag._versionMap[self.version] < 5:
            raise ValueError(
                "Fail early requires minimum version 5 for dcgmRunDiag.")
        if not isinstance(checkInterval, int):
            raise ValueError("Invalid checkInterval value: %s" % checkInterval)

        if enable:
            self.runDiagInfo.flags |= dcgm_structs.DCGM_RUN_FLAGS_FAIL_EARLY
            self.runDiagInfo.failCheckInterval = checkInterval
        else:
            self.runDiagInfo.flags &= ~dcgm_structs.DCGM_RUN_FLAGS_FAIL_EARLY

    def Execute(self, handle):
        '''Run the request via dcgm_agent.dcgmActionValidate_v2 and return its result.'''
        return dcgm_agent.dcgmActionValidate_v2(handle, self.runDiagInfo,
                                                self.version)

    def SetStatsPath(self, statsPath):
        '''Set statsPath; raises ValueError when it is DCGM_PATH_LEN characters or longer.'''
        if len(statsPath) >= dcgm_structs.DCGM_PATH_LEN:
            err = "DcgmDiag cannot set statsPath '%s' because it exceeds max length %d." % \
                  (statsPath, dcgm_structs.DCGM_PATH_LEN)
            raise ValueError(err)

        self.runDiagInfo.statsPath = statsPath

    def SetConfigFileContents(self, configFileContents):
        '''
        Set configFileContents; raises ValueError when it is
        DCGM_MAX_CONFIG_FILE_LEN characters or longer.
        '''
        if len(configFileContents) >= dcgm_structs.DCGM_MAX_CONFIG_FILE_LEN:
            err = "Dcgm Diag cannot set config file contents to '%s' because it exceeds max length %d." \
                  % (configFileContents, dcgm_structs.DCGM_MAX_CONFIG_FILE_LEN)
            raise ValueError(err)

        self.runDiagInfo.configFileContents = configFileContents

    def SetDebugLogFile(self, logFileName):
        '''Set debugLogFile; raises ValueError when it is DCGM_FILE_LEN characters or longer.'''
        if len(logFileName) >= dcgm_structs.DCGM_FILE_LEN:
            raise ValueError("Cannot set debug file to '%s' because it exceeds max length %d."\
                % (logFileName, dcgm_structs.DCGM_FILE_LEN))

        self.runDiagInfo.debugLogFile = logFileName

    def SetDebugLevel(self, debugLevel):
        '''Set debugLevel; raises ValueError unless 0 <= debugLevel <= 5.'''
        if debugLevel < 0 or debugLevel > 5:
            # The rejected value is interpolated so the message no longer
            # shows a literal "%d".
            raise ValueError(
                "Cannot set debug level to %d. Debug Level must be a value from 0-5 inclusive."
                % debugLevel)

        self.runDiagInfo.debugLevel = debugLevel
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent
16+
import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs
17+
'''
18+
Class for managing a group of field IDs in the host engine.
19+
'''
20+
21+
22+
class DcgmFieldGroup:
    '''
    Wrapper around one field group in the host engine.

    dcgmHandle   - DcgmHandle() instance to use for communicating with the host engine
    name         - Name of the field group to use within DCGM. This must be unique
    fieldIds     - Fields that are part of this group
    fieldGroupId - If provided, the object is initialized from this existing
                   field group ID instead of creating a new group
    '''

    def __init__(self, dcgmHandle, name="", fieldIds=None, fieldGroupId=None):
        fieldIds = fieldIds or []
        self.name = name
        self.fieldIds = fieldIds
        self._dcgmHandle = dcgmHandle
        # True only when this object created the group itself (see Delete).
        self.wasCreated = False

        if fieldGroupId is None:
            # Define the attribute before the call so the destructor does not
            # fail if dcgmFieldGroupCreate raises.
            self.fieldGroupId = None
            self.fieldGroupId = dcgm_agent.dcgmFieldGroupCreate(
                self._dcgmHandle.handle, fieldIds, name)
            self.wasCreated = True
        else:
            # The caller passed an ID, so the group already exists: fetch its
            # live name and field IDs from the host engine.
            self.fieldGroupId = fieldGroupId
            liveInfo = dcgm_agent.dcgmFieldGroupGetInfo(
                self._dcgmHandle.handle, self.fieldGroupId)
            self.name = liveInfo.fieldGroupName
            self.fieldIds = liveInfo.fieldIds

    def Delete(self):
        '''
        Remove this field group from DCGM. This object can no longer be passed to other APIs after this call.

        Only a group that this object created (wasCreated) is destroyed.
        '''
        # NOTE(review): handler nesting and the position of the two resets at
        # the end were reconstructed from a listing without indentation - confirm.
        if not self.wasCreated or self.fieldGroupId is None:
            return

        try:
            try:
                dcgm_agent.dcgmFieldGroupDestroy(self._dcgmHandle.handle,
                                                 self.fieldGroupId)
            except dcgm_structs.dcgmExceptionClass(
                    dcgm_structs.DCGM_ST_NO_DATA):
                # someone may have deleted the group under us. That's ok.
                pass
            except dcgm_structs.dcgmExceptionClass(
                    dcgm_structs.DCGM_ST_CONNECTION_NOT_VALID):
                # We lost our connection, but we're destructing this object anyway.
                pass
        except (AttributeError, TypeError):
            # When we're cleaning up at the end, dcgm_agent and dcgm_structs have been unloaded and we'll get
            # an AttributeError: "'NoneType' object has no 'dcgmExceptionClass'" or
            # a TypeError: "'NoneType' object is not callable'". Ignore both.
            pass

        self.fieldGroupId = None
        self._dcgmHandle = None

    #Destructor
    def __del__(self):
        self.Delete()

0 commit comments

Comments
 (0)