Commit b531c8f

Select GPUs with AvailableGPU class
1 parent ce8f5ff commit b531c8f

4 files changed: +101 additions, -27 deletions

Configuration/PyReleaseValidation/python/MatrixRunner.py

Lines changed: 6 additions & 23 deletions
@@ -1,41 +1,21 @@
 import os, sys, time
-import subprocess
 
 from collections import Counter
 
-from Configuration.PyReleaseValidation.WorkFlow import WorkFlow
 from Configuration.PyReleaseValidation.WorkFlowRunner import WorkFlowRunner
 from Configuration.PyReleaseValidation.MatrixUtil import check_dups
 # ================================================================================
 
 class MatrixRunner(object):
 
-    def __init__(self, wfIn=None, nThrMax=4, nThreads=1, gpu=False):
+    def __init__(self, wfIn=None, nThrMax=4, nThreads=1, gpus=None):
 
         self.workFlows = wfIn
 
         self.threadList = []
         self.maxThreads = nThrMax
         self.nThreads = nThreads
-        self.gpus = ()
-
-        if gpu:
-            print("> Running with --gpu option. Checking the GPUs available.")
-            cuda = subprocess.check_output("cudaComputeCapabilities", shell=True, executable="/bin/bash").decode('utf8')
-            # Building on top of the {cuda|rocm}ComputeCapabilities
-            # output in case of no {NVIDIA|AMD} GPU:
-            # 'no XXX-capable device is detecte'
-            if "capable device is detected" in cuda:
-                cuda = 0
-            else:
-                print(cuda.split("\n"))
-            rocm = subprocess.check_output("rocmComputeCapabilities", shell=True, executable="/bin/bash").decode('utf8')
-            if "capable device is detected" in rocm:
-                rocm = 0
-            else:
-                print(cuda.split("\n"))
-            print("Checks for GPU")
-        pass
+        self.gpus = gpus
 
         #the directories in which it happened
         self.runDirs={}

@@ -87,7 +67,10 @@ def runTests(self, opt):
 
                     print('\nPreparing to run %s %s' % (wf.numId, item))
                     sys.stdout.flush()
-                    current = WorkFlowRunner(wf,opt,noRun,dryRun,cafVeto,njob)
+                    gpu_cmd = None
+                    if self.gpus is not None:
+                        gpu_cmd = next(self.gpus).gpuBind()
+                    current = WorkFlowRunner(wf,opt,noRun,dryRun,cafVeto,njob,gpu_cmd)
                     self.threadList.append(current)
                     current.start()
                     if not dryRun:
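
The hunks above turn GPU handling into a simple consumer pattern: MatrixRunner no longer probes the hardware itself, it just calls next() on whatever gpus iterable it was given and hands the returned binding string to the job. A minimal sketch of that round-robin consumption, using a hypothetical stand-in class instead of the real AvailableGPU and made-up workflow labels:

from itertools import cycle

# Hypothetical stand-in for AvailableGPU: only the gpuBind() contract matters here.
class FakeGPU:
    def __init__(self, dev_id):
        self.id = dev_id
    def gpuBind(self):
        # Same shape of output as AvailableGPU.gpuBind(): an env-var prefix for the job command.
        return 'CUDA_VISIBLE_DEVICES=' + str(self.id) + ' '

gpus = cycle([FakeGPU(0), FakeGPU(1)])          # what runTheMatrix.py builds with itertools.cycle

for wf in ['wf_A', 'wf_B', 'wf_C']:             # made-up workflow labels
    gpu_cmd = next(gpus).gpuBind()              # device 0, 1, 0, ...
    print(gpu_cmd + 'cmsRun step.py  # ' + wf)  # hypothetical job command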

Configuration/PyReleaseValidation/python/MatrixUtil.py

Lines changed: 57 additions & 0 deletions
@@ -1,4 +1,6 @@
 import os
+import subprocess
+
 class Matrix(dict):
     def __setitem__(self,key,value):
         if key in self:

@@ -281,3 +283,58 @@ def check_dups(input):
     dups = set(x for x in input if x in seen or seen.add(x))
 
     return dups
+
+class AvailableGPU():
+
+    def __init__(self, make, counter, id, capability, name):
+        self.make = make
+        self.counter = counter
+        self.id = id
+        self.capability = capability
+        self.name = name
+
+    def __str__(self):
+        return "> GPU no.{0}: {1} - {2} - {3} - {4}".format(self.counter,self.make,self.id,self.capability,self.name)
+
+    def isCUDA(self):
+        return self.make == 'CUDA'
+    def isROCM(self):
+        return self.make == 'ROCM'
+
+    def gpuBind(self):
+
+        cmd = ''
+        if self.make == 'CUDA':
+            cmd = 'CUDA_VISIBLE_DEVICES=' + str(self.id) + " HIP_VISIBLE_DEVICES= "
+        elif self.make == 'ROCM':
+            cmd = 'CUDA_VISIBLE_DEVICES= HIP_VISIBLE_DEVICES=' + str(self.id) + " "
+
+        return cmd
+
+def cleanComputeCapabilities(make, offset = 0):
+
+    # Building on top of {cuda|rocm}ComputeCapabilities
+    # with output:
+    # ID computeCapability Architecture Model Info
+
+    out = subprocess.run(make + "ComputeCapabilities", capture_output = True, text = True)
+
+    if out.returncode > 0:
+        return []
+
+    gpus = []
+    for f in out.stdout.split("\n"):
+
+        if not len(f)>0:
+            continue
+
+        if "unsupported" in f:
+            print("> Warning! Unsupported GPU:")
+            print(" > " + " ".join(f))
+            continue
+
+        gpus.append(f.split())
+
+    gpus = [AvailableGPU(make.upper(), i + offset, int(f[0]),f[1]," ".join(f[2:])) for i,f in enumerate(gpus)]
+
+    return gpus
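
For illustration, a short sketch of how the new class is meant to be used (it assumes a CMSSW environment where Configuration.PyReleaseValidation.MatrixUtil is importable; the device names and capabilities below are invented). gpuBind() returns an environment-variable prefix that exposes exactly one device and blanks the other vendor's selector, while cleanComputeCapabilities() builds such objects from the {cuda|rocm}ComputeCapabilities output, the offset keeping counters unique when the CUDA and ROCm lists are concatenated.

from Configuration.PyReleaseValidation.MatrixUtil import AvailableGPU

# Constructor arguments: make, counter, id, capability, name (invented values below).
nvidia = AvailableGPU('CUDA', 0, 0, '8.6', 'NVIDIA A10')
amd    = AvailableGPU('ROCM', 1, 0, 'gfx90a', 'AMD Instinct MI250')

print(nvidia)            # > GPU no.0: CUDA - 0 - 8.6 - NVIDIA A10
print(nvidia.gpuBind())  # CUDA_VISIBLE_DEVICES=0 HIP_VISIBLE_DEVICES=
print(amd.gpuBind())     # CUDA_VISIBLE_DEVICES= HIP_VISIBLE_DEVICES=0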

Configuration/PyReleaseValidation/python/WorkFlowRunner.py

Lines changed: 7 additions & 2 deletions
@@ -7,7 +7,7 @@
 from datetime import datetime
 
 class WorkFlowRunner(Thread):
-    def __init__(self, wf, opt, noRun=False, dryRun=False, cafVeto=True, jobNumber=None):
+    def __init__(self, wf, opt, noRun=False, dryRun=False, cafVeto=True, jobNumber=None, gpu = None):
         Thread.__init__(self)
         self.wf = wf
 

@@ -18,6 +18,8 @@ def __init__(self, wf, opt, noRun=False, dryRun=False, cafVeto=True, jobNumber=N
         self.noRun = noRun
         self.dryRun = dryRun
         self.cafVeto = cafVeto
+        self.gpu = gpu
+
         self.dasOptions = opt.dasOptions
         self.jobReport = opt.jobReports
         self.nThreads = opt.nThreads

@@ -31,7 +33,7 @@ def __init__(self, wf, opt, noRun=False, dryRun=False, cafVeto=True, jobNumber=N
         self.wfDir=str(self.wf.numId)+'_'+self.wf.nameId
         if jobNumber is not None:
             self.wfDir = self.wfDir + '_job' + str(jobNumber)
-        print(self.wfDir)
+
         return
 
     def doCmd(self, cmd):

@@ -154,6 +156,9 @@ def closeCmd(i,ID):
 
             else:
                 #chaining IO , which should be done in WF object already and not using stepX.root but <stepName>.root
+                if self.gpu is not None:
+                    cmd = cmd + self.gpu
+
                 cmd += com
 
             if self.startFrom:
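
The prefix only has an effect because it is concatenated in front of the step command and the whole string is executed through a shell, where leading NAME=value pairs scope the variables to that single command. A small sketch of the string composition (the step command text is invented):

gpu = 'CUDA_VISIBLE_DEVICES=1 HIP_VISIBLE_DEVICES= '   # what AvailableGPU.gpuBind() would return
com = 'cmsRun step2_RAW2DIGI_RECO.py'                  # invented step command

cmd = ''
if gpu is not None:
    cmd = cmd + gpu   # same concatenation as in the hunk above
cmd += com

print(cmd)  # CUDA_VISIBLE_DEVICES=1 HIP_VISIBLE_DEVICES= cmsRun step2_RAW2DIGI_RECO.py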

Configuration/PyReleaseValidation/scripts/runTheMatrix.py

Lines changed: 31 additions & 2 deletions
@@ -1,10 +1,12 @@
 #!/usr/bin/env python3
 import sys, os
 
+from itertools import cycle
+
 from Configuration.PyReleaseValidation.MatrixReader import MatrixReader
 from Configuration.PyReleaseValidation.MatrixRunner import MatrixRunner
 from Configuration.PyReleaseValidation.MatrixInjector import MatrixInjector,performInjectionOptionTest
-
+from Configuration.PyReleaseValidation.MatrixUtil import cleanComputeCapabilities
 # ================================================================================
 
 def showRaw(opt):

@@ -34,7 +36,7 @@ def runSelected(opt):
         mrd.show(opt.testList, opt.extended, opt.cafVeto)
         if opt.testList : print('selected items:', opt.testList)
     else:
-        mRunnerHi = MatrixRunner(mrd.workFlows, opt.nProcs, opt.nThreads)
+        mRunnerHi = MatrixRunner(mrd.workFlows, opt.nProcs, opt.nThreads, opt.selected_gpus)
         ret = mRunnerHi.runTests(opt)
 
     if opt.wmcontrol:

@@ -448,6 +450,33 @@ def runSelected(opt):
                        default='')
 
     opt = parser.parse_args()
+
+    opt.selected_gpus = None
+    if opt.gpu:
+
+        print(">> Running with --gpu option. Checking the available and supported GPUs.")
+        gpus = cleanComputeCapabilities("cuda")
+        gpus = gpus + cleanComputeCapabilities("rocm", len(gpus))
+        available_gpus = gpus
+
+        print("> GPUs Available:")
+        [print(f) for f in available_gpus]
+
+        # Filtering ONLY CUDA GPUs on capability
+        gpus = [g for g in gpus if not g.isCUDA() or (g.isCUDA() and g.capability in opt.CUDACapabilities)]
+
+        # Filtering by name (if parsed)
+        if len(opt.GPUName) > 0:
+            gpus = [g for g in gpus if g.name == opt.GPUName]
+
+        if available_gpus != gpus:
+            print(">> Selected:")
+            [print(f) for f in gpus]
+        else:
+            print(">> All selected!")
+
+        opt.selected_gpus = cycle(gpus)
+
     if opt.command: opt.command = ' '.join(opt.command)
     os.environ["CMSSW_DAS_QUERY_SITES"]=opt.dasSites
     if opt.failed_from:
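
The selection block above only constrains CUDA devices by compute capability (ROCm devices always pass that cut), optionally narrows by exact model name, and finally wraps the survivors in itertools.cycle so MatrixRunner can keep drawing from them indefinitely. A condensed sketch of that behaviour with invented inputs standing in for opt.CUDACapabilities, opt.GPUName and the AvailableGPU list:

from itertools import cycle
from collections import namedtuple

GPU = namedtuple('GPU', 'make capability name')            # stand-in for AvailableGPU
gpus = [GPU('CUDA', '7.5', 'NVIDIA T4'),
        GPU('CUDA', '8.6', 'NVIDIA A10'),
        GPU('ROCM', 'gfx90a', 'AMD Instinct MI250')]

CUDACapabilities = ['7.5', '8.0']                          # invented opt.CUDACapabilities
GPUName = ''                                               # invented opt.GPUName (empty: no name cut)

selected = [g for g in gpus if g.make != 'CUDA' or g.capability in CUDACapabilities]
if len(GPUName) > 0:
    selected = [g for g in selected if g.name == GPUName]

print([g.name for g in selected])         # ['NVIDIA T4', 'AMD Instinct MI250']; the 8.6 A10 is cut

rr = cycle(selected)
print([next(rr).name for _ in range(4)])  # round-robin: T4, MI250, T4, MI250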
