|
| 1 | +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. |
| 2 | +# |
| 3 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | +# you may not use this file except in compliance with the License. |
| 5 | +# You may obtain a copy of the License at |
| 6 | +# |
| 7 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | +# |
| 9 | +# Unless required by applicable law or agreed to in writing, software |
| 10 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | +# See the License for the specific language governing permissions and |
| 13 | +# limitations under the License. |
| 14 | + |
| 15 | +import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs |
| 16 | +import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent |
| 17 | + |
| 18 | + |
class DcgmDiag:
    """Builds a dcgmRunDiag request structure and submits it to DCGM.

    The constructor fills ``self.runDiagInfo`` (a ``dcgm_structs`` run-diag
    structure) from the supplied arguments; the ``Set*``/``Add*`` methods
    adjust individual fields afterwards, and ``Execute`` hands the structure
    to ``dcgm_agent.dcgmActionValidate_v2``.
    """

    # Maps version codes to simple version values for range comparisons
    _versionMap = {dcgm_structs.dcgmRunDiag_version: 5}

    # Values of testNamesStr that select a numeric run level instead of
    # naming individual tests.
    _runLevels = ('1', '2', '3', '4')

    def __init__(self,
                 gpuIds=None,
                 testNamesStr='',
                 paramsStr='',
                 verbose=True,
                 version=dcgm_structs.dcgmRunDiag_version):
        """Populate the run-diag structure.

        Args:
            gpuIds: Optional iterable of GPU ids; stored in the structure as
                a comma-separated string. Left untouched when empty/None.
            testNamesStr: '' or '1'..'4' to select a run level (empty means
                level 1), otherwise a comma-separated list of test names.
            paramsStr: Semicolon-separated test parameters; '' for none.
            verbose: Passed to SetVerbose().
            version: dcgmRunDiag version code; must be a key of _versionMap.

        Raises:
            ValueError: If the version is unknown, testNamesStr is a number
                other than 1-4, or too many / too long test names or
                parameters are supplied.
        """
        # Make sure version is valid
        if version not in DcgmDiag._versionMap:
            raise ValueError("'%s' is not a valid version for dcgmRunDiag." %
                             version)
        self.version = version

        if self.version == dcgm_structs.dcgmRunDiag_version7:
            self.runDiagInfo = dcgm_structs.c_dcgmRunDiag_v7()
        else:
            self.runDiagInfo = dcgm_structs.c_dcgmRunDiag_t()

        # Number of testNames / testParms slots filled so far.
        self.numTests = 0
        self.numParams = 0
        self.SetVerbose(verbose)

        if testNamesStr == '':
            # default to a level 1 test
            self.runDiagInfo.validate = 1
        elif testNamesStr in DcgmDiag._runLevels:
            self.runDiagInfo.validate = int(testNamesStr)
        else:
            # Make sure no number other than 1-4 was submitted
            if testNamesStr.isdigit():
                raise ValueError("'%s' is not a valid test name." %
                                 testNamesStr)

            # Copy to the testNames portion of the object
            names = testNamesStr.split(',')
            if len(names) > dcgm_structs.DCGM_MAX_TEST_NAMES:
                err = 'DcgmDiag cannot initialize: %d test names were specified exceeding the limit of %d.' %\
                    (len(names), dcgm_structs.DCGM_MAX_TEST_NAMES)
                raise ValueError(err)

            for testName in names:
                self.AddTest(testName)

        if paramsStr != '':
            params = paramsStr.split(';')
            # NOTE(review): this uses '>=' whereas the test-name check above
            # uses '>'; kept as-is -- confirm against the struct layout
            # whether one parameter slot is intentionally reserved.
            if len(params) >= dcgm_structs.DCGM_MAX_TEST_PARMS:
                err = 'DcgmDiag cannot initialize: %d parameters were specified, exceeding the limit of %d.' %\
                    (len(params), dcgm_structs.DCGM_MAX_TEST_PARMS)
                raise ValueError(err)

            for param in params:
                self.AddParameter(param)

        if gpuIds:
            # Comma-separated list of ids, e.g. "0,1,2".
            self.runDiagInfo.gpuList = ','.join(str(gpu) for gpu in gpuIds)

    def SetVerbose(self, val):
        """Set DCGM_RUN_FLAGS_VERBOSE when val equals True, else clear it."""
        # Equality (not truthiness) is kept deliberately so that existing
        # callers passing values such as 2 or 'yes' keep clearing the flag.
        if val == True:
            self.runDiagInfo.flags |= dcgm_structs.DCGM_RUN_FLAGS_VERBOSE
        else:
            self.runDiagInfo.flags &= ~dcgm_structs.DCGM_RUN_FLAGS_VERBOSE

    def UseFakeGpus(self):
        """Copy the current gpuList into the structure's fakeGpuList field."""
        self.runDiagInfo.fakeGpuList = self.runDiagInfo.gpuList

    def GetStruct(self):
        """Return the underlying run-diag structure."""
        return self.runDiagInfo

    def AddParameter(self, parameterStr):
        """Append one parameter string to the next free testParms slot.

        Raises:
            ValueError: If parameterStr is DCGM_MAX_TEST_PARMS_LEN characters
                or longer.
        """
        if len(parameterStr) >= dcgm_structs.DCGM_MAX_TEST_PARMS_LEN:
            err = 'DcgmDiag cannot add parameter \'%s\' because it exceeds max length %d.' % \
                (parameterStr, dcgm_structs.DCGM_MAX_TEST_PARMS_LEN)
            raise ValueError(err)

        # The slot is written one character code at a time.
        for index, c in enumerate(parameterStr):
            self.runDiagInfo.testParms[self.numParams][index] = ord(c)

        self.numParams += 1

    def AddTest(self, testNameStr):
        """Append one test name to the next free testNames slot.

        Raises:
            ValueError: If testNameStr is DCGM_MAX_TEST_NAMES_LEN characters
                or longer.
        """
        if len(testNameStr) >= dcgm_structs.DCGM_MAX_TEST_NAMES_LEN:
            err = 'DcgmDiag cannot add test name \'%s\' because it exceeds max length %d.' % \
                (testNameStr, dcgm_structs.DCGM_MAX_TEST_NAMES_LEN)
            raise ValueError(err)

        # The slot is written one character code at a time.
        for index, c in enumerate(testNameStr):
            self.runDiagInfo.testNames[self.numTests][index] = ord(c)

        self.numTests += 1

    def SetStatsOnFail(self, val):
        """Set DCGM_RUN_FLAGS_STATSONFAIL when val equals True, else clear it.

        Clearing on a non-True value mirrors SetVerbose(); previously the
        flag could be switched on but never off again.
        """
        if val == True:
            self.runDiagInfo.flags |= dcgm_structs.DCGM_RUN_FLAGS_STATSONFAIL
        else:
            self.runDiagInfo.flags &= ~dcgm_structs.DCGM_RUN_FLAGS_STATSONFAIL

    def SetThrottleMask(self, value):
        """Store str(value) as the structure's throttleMask.

        Raises:
            ValueError: If the structure version is below 3, or value is a
                string of DCGM_THROTTLE_MASK_LEN characters or longer.
        """
        if DcgmDiag._versionMap[self.version] < 3:
            raise ValueError(
                "Throttle mask requires minimum version 3 for dcgmRunDiag.")
        # Only string inputs are length-checked; other values are converted
        # with str() below without a check, as before.
        if isinstance(
                value,
                str) and len(value) >= dcgm_structs.DCGM_THROTTLE_MASK_LEN:
            raise ValueError("Throttle mask value '%s' exceeds max length %d." %
                             (value, dcgm_structs.DCGM_THROTTLE_MASK_LEN - 1))

        self.runDiagInfo.throttleMask = str(value)

    def SetFailEarly(self, enable=True, checkInterval=5):
        """Enable or disable the fail-early flag.

        Args:
            enable: When truthy, set DCGM_RUN_FLAGS_FAIL_EARLY and store
                checkInterval; otherwise clear the flag.
            checkInterval: Integer written to failCheckInterval when
                enabling (units are not visible in this file).

        Raises:
            ValueError: If the structure version is below 5 or checkInterval
                is not an int.
        """
        if DcgmDiag._versionMap[self.version] < 5:
            raise ValueError(
                "Fail early requires minimum version 5 for dcgmRunDiag.")
        if not isinstance(checkInterval, int):
            raise ValueError("Invalid checkInterval value: %s" % checkInterval)

        if enable:
            self.runDiagInfo.flags |= dcgm_structs.DCGM_RUN_FLAGS_FAIL_EARLY
            self.runDiagInfo.failCheckInterval = checkInterval
        else:
            self.runDiagInfo.flags &= ~dcgm_structs.DCGM_RUN_FLAGS_FAIL_EARLY

    def Execute(self, handle):
        """Submit the structure via dcgm_agent.dcgmActionValidate_v2.

        Returns whatever dcgmActionValidate_v2 returns for the given handle,
        structure and version.
        """
        return dcgm_agent.dcgmActionValidate_v2(handle, self.runDiagInfo,
                                                self.version)

    def SetStatsPath(self, statsPath):
        """Store statsPath in the structure.

        Raises:
            ValueError: If statsPath is DCGM_PATH_LEN characters or longer.
        """
        if len(statsPath) >= dcgm_structs.DCGM_PATH_LEN:
            err = "DcgmDiag cannot set statsPath '%s' because it exceeds max length %d." % \
                (statsPath, dcgm_structs.DCGM_PATH_LEN)
            raise ValueError(err)

        self.runDiagInfo.statsPath = statsPath

    def SetConfigFileContents(self, configFileContents):
        """Store configFileContents in the structure.

        Raises:
            ValueError: If configFileContents is DCGM_MAX_CONFIG_FILE_LEN
                characters or longer.
        """
        if len(configFileContents) >= dcgm_structs.DCGM_MAX_CONFIG_FILE_LEN:
            err = "Dcgm Diag cannot set config file contents to '%s' because it exceeds max length %d." \
                % (configFileContents, dcgm_structs.DCGM_MAX_CONFIG_FILE_LEN)
            raise ValueError(err)

        self.runDiagInfo.configFileContents = configFileContents

    def SetDebugLogFile(self, logFileName):
        """Store logFileName as the structure's debugLogFile.

        Raises:
            ValueError: If logFileName is DCGM_FILE_LEN characters or longer.
        """
        if len(logFileName) >= dcgm_structs.DCGM_FILE_LEN:
            raise ValueError("Cannot set debug file to '%s' because it exceeds max length %d."\
                % (logFileName, dcgm_structs.DCGM_FILE_LEN))

        self.runDiagInfo.debugLogFile = logFileName

    def SetDebugLevel(self, debugLevel):
        """Store debugLevel in the structure.

        Raises:
            ValueError: If debugLevel is outside the inclusive range 0-5.
        """
        if debugLevel < 0 or debugLevel > 5:
            # The message is now interpolated; it used to be raised with a
            # literal, unformatted '%d'.
            raise ValueError(
                "Cannot set debug level to %d. Debug Level must be a value from 0-5 inclusive."
                % debugLevel)

        self.runDiagInfo.debugLevel = debugLevel
0 commit comments