Commit b531c8f

Select GPUs with AvailableGPU class
1 parent ce8f5ff commit b531c8f

4 files changed: +101 additions, -27 deletions

Configuration/PyReleaseValidation/python/MatrixRunner.py

Lines changed: 6 additions & 23 deletions
@@ -1,41 +1,21 @@
 import os, sys, time
-import subprocess
 
 from collections import Counter
 
-from Configuration.PyReleaseValidation.WorkFlow import WorkFlow
 from Configuration.PyReleaseValidation.WorkFlowRunner import WorkFlowRunner
 from Configuration.PyReleaseValidation.MatrixUtil import check_dups
 # ================================================================================
 
 class MatrixRunner(object):
 
-    def __init__(self, wfIn=None, nThrMax=4, nThreads=1, gpu=False):
+    def __init__(self, wfIn=None, nThrMax=4, nThreads=1, gpus=None):
 
         self.workFlows = wfIn
 
         self.threadList = []
         self.maxThreads = nThrMax
         self.nThreads = nThreads
-        self.gpus = ()
-
-        if gpu:
-            print("> Running with --gpu option. Checking the GPUs available.")
-            cuda = subprocess.check_output("cudaComputeCapabilities", shell=True, executable="/bin/bash").decode('utf8')
-            # Building on top of the {cuda|rocm}ComputeCapabilities
-            # output in case of no {NVIDIA|AMD} GPU:
-            # 'no XXX-capable device is detecte'
-            if "capable device is detected" in cuda:
-                cuda = 0
-            else:
-                print(cuda.split("\n"))
-            rocm = subprocess.check_output("rocmComputeCapabilities", shell=True, executable="/bin/bash").decode('utf8')
-            if "capable device is detected" in rocm:
-                rocm = 0
-            else:
-                print(cuda.split("\n"))
-            print("Checks for GPU")
-        pass
+        self.gpus = gpus
 
         #the directories in which it happened
         self.runDirs={}

@@ -87,7 +67,10 @@ def runTests(self, opt):
 
                     print('\nPreparing to run %s %s' % (wf.numId, item))
                     sys.stdout.flush()
-                    current = WorkFlowRunner(wf,opt,noRun,dryRun,cafVeto,njob)
+                    gpu_cmd = None
+                    if self.gpus is not None:
+                        gpu_cmd = next(self.gpus).gpuBind()
+                    current = WorkFlowRunner(wf,opt,noRun,dryRun,cafVeto,njob,gpu_cmd)
                     self.threadList.append(current)
                     current.start()
                     if not dryRun:
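
The hunks above turn GPU handling into a simple consumer pattern: MatrixRunner no longer probes the hardware itself, it just calls next() on whatever gpus iterable it was given and hands the returned binding string to the job. A minimal sketch of that round-robin consumption, using a hypothetical stand-in class instead of the real AvailableGPU and made-up workflow labels:

from itertools import cycle

# Hypothetical stand-in for AvailableGPU: only the gpuBind() contract matters here.
class FakeGPU:
    def __init__(self, dev_id):
        self.id = dev_id
    def gpuBind(self):
        # Same shape of output as AvailableGPU.gpuBind(): an env-var prefix for the job command.
        return 'CUDA_VISIBLE_DEVICES=' + str(self.id) + ' '

gpus = cycle([FakeGPU(0), FakeGPU(1)])          # what runTheMatrix.py builds with itertools.cycle

for wf in ['wf_A', 'wf_B', 'wf_C']:             # made-up workflow labels
    gpu_cmd = next(gpus).gpuBind()              # device 0, 1, 0, ...
    print(gpu_cmd + 'cmsRun step.py  # ' + wf)  # hypothetical job command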

Configuration/PyReleaseValidation/python/MatrixUtil.py

Lines changed: 57 additions & 0 deletions
@@ -1,4 +1,6 @@
 import os
+import subprocess
+
 class Matrix(dict):
     def __setitem__(self,key,value):
         if key in self:

@@ -281,3 +283,58 @@ def check_dups(input):
     dups = set(x for x in input if x in seen or seen.add(x))
 
     return dups
+
+class AvailableGPU():
+
+    def __init__(self, make, counter, id, capability, name):
+        self.make = make
+        self.counter = counter
+        self.id = id
+        self.capability = capability
+        self.name = name
+
+    def __str__(self):
+        return "> GPU no.{0}: {1} - {2} - {3} - {4}".format(self.counter,self.make,self.id,self.capability,self.name)
+
+    def isCUDA(self):
+        return self.make == 'CUDA'
+    def isROCM(self):
+        return self.make == 'ROCM'
+
+    def gpuBind(self):
+
+        cmd = ''
+        if self.make == 'CUDA':
+            cmd = 'CUDA_VISIBLE_DEVICES=' + str(self.id) + " HIP_VISIBLE_DEVICES= "
+        elif self.make == 'ROCM':
+            cmd = 'CUDA_VISIBLE_DEVICES= HIP_VISIBLE_DEVICES=' + str(self.id) + " "
+
+        return cmd
+
+def cleanComputeCapabilities(make, offset = 0):
+
+    # Building on top of {cuda|rocm}ComputeCapabilities
+    # with output:
+    # ID computeCapability Architecture Model Info
+
+    out = subprocess.run(make + "ComputeCapabilities", capture_output = True, text = True)
+
+    if out.returncode > 0:
+        return []
+
+    gpus = []
+    for f in out.stdout.split("\n"):
+
+        if not len(f)>0:
+            continue
+
+        if "unsupported" in f:
+            print("> Warning! Unsupported GPU:")
+            print(" > " + " ".join(f))
+            continue
+
+        gpus.append(f.split())
+
+    gpus = [AvailableGPU(make.upper(), i + offset, int(f[0]),f[1]," ".join(f[2:])) for i,f in enumerate(gpus)]
+
+    return gpus
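
For illustration, a short sketch of how the new class is meant to be used (it assumes a CMSSW environment where Configuration.PyReleaseValidation.MatrixUtil is importable; the device names and capabilities below are invented). gpuBind() returns an environment-variable prefix that exposes exactly one device and blanks the other vendor's selector, while cleanComputeCapabilities() builds such objects from the {cuda|rocm}ComputeCapabilities output, the offset keeping counters unique when the CUDA and ROCm lists are concatenated.

from Configuration.PyReleaseValidation.MatrixUtil import AvailableGPU

# Constructor arguments: make, counter, id, capability, name (invented values below).
nvidia = AvailableGPU('CUDA', 0, 0, '8.6', 'NVIDIA A10')
amd    = AvailableGPU('ROCM', 1, 0, 'gfx90a', 'AMD Instinct MI250')

print(nvidia)            # > GPU no.0: CUDA - 0 - 8.6 - NVIDIA A10
print(nvidia.gpuBind())  # CUDA_VISIBLE_DEVICES=0 HIP_VISIBLE_DEVICES=
print(amd.gpuBind())     # CUDA_VISIBLE_DEVICES= HIP_VISIBLE_DEVICES=0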

Configuration/PyReleaseValidation/python/WorkFlowRunner.py

Lines changed: 7 additions & 2 deletions
@@ -7,7 +7,7 @@
 from datetime import datetime
 
 class WorkFlowRunner(Thread):
-    def __init__(self, wf, opt, noRun=False, dryRun=False, cafVeto=True, jobNumber=None):
+    def __init__(self, wf, opt, noRun=False, dryRun=False, cafVeto=True, jobNumber=None, gpu = None):
         Thread.__init__(self)
         self.wf = wf
 

@@ -18,6 +18,8 @@ def __init__(self, wf, opt, noRun=False, dryRun=False, cafVeto=True, jobNumber=N
         self.noRun = noRun
         self.dryRun = dryRun
         self.cafVeto = cafVeto
+        self.gpu = gpu
+
         self.dasOptions = opt.dasOptions
         self.jobReport = opt.jobReports
         self.nThreads = opt.nThreads

@@ -31,7 +33,7 @@ def __init__(self, wf, opt, noRun=False, dryRun=False, cafVeto=True, jobNumber=N
         self.wfDir=str(self.wf.numId)+'_'+self.wf.nameId
         if jobNumber is not None:
             self.wfDir = self.wfDir + '_job' + str(jobNumber)
-        print(self.wfDir)
+
         return
 
     def doCmd(self, cmd):

@@ -154,6 +156,9 @@ def closeCmd(i,ID):
 
             else:
                 #chaining IO , which should be done in WF object already and not using stepX.root but <stepName>.root
+                if self.gpu is not None:
+                    cmd = cmd + self.gpu
+
                 cmd += com
 
             if self.startFrom:
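
The prefix only has an effect because it is concatenated in front of the step command and the whole string is executed through a shell, where leading NAME=value pairs scope the variables to that single command. A small sketch of the string composition (the step command text is invented):

gpu = 'CUDA_VISIBLE_DEVICES=1 HIP_VISIBLE_DEVICES= '   # what AvailableGPU.gpuBind() would return
com = 'cmsRun step2_RAW2DIGI_RECO.py'                  # invented step command

cmd = ''
if gpu is not None:
    cmd = cmd + gpu   # same concatenation as in the hunk above
cmd += com

print(cmd)  # CUDA_VISIBLE_DEVICES=1 HIP_VISIBLE_DEVICES= cmsRun step2_RAW2DIGI_RECO.py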

Configuration/PyReleaseValidation/scripts/runTheMatrix.py

Lines changed: 31 additions & 2 deletions
@@ -1,10 +1,12 @@
 #!/usr/bin/env python3
 import sys, os
 
+from itertools import cycle
+
 from Configuration.PyReleaseValidation.MatrixReader import MatrixReader
 from Configuration.PyReleaseValidation.MatrixRunner import MatrixRunner
 from Configuration.PyReleaseValidation.MatrixInjector import MatrixInjector,performInjectionOptionTest
-
+from Configuration.PyReleaseValidation.MatrixUtil import cleanComputeCapabilities
 # ================================================================================
 
 def showRaw(opt):

@@ -34,7 +36,7 @@ def runSelected(opt):
         mrd.show(opt.testList, opt.extended, opt.cafVeto)
         if opt.testList : print('selected items:', opt.testList)
     else:
-        mRunnerHi = MatrixRunner(mrd.workFlows, opt.nProcs, opt.nThreads)
+        mRunnerHi = MatrixRunner(mrd.workFlows, opt.nProcs, opt.nThreads, opt.selected_gpus)
         ret = mRunnerHi.runTests(opt)
 
     if opt.wmcontrol:

@@ -448,6 +450,33 @@ def runSelected(opt):
                        default='')
 
     opt = parser.parse_args()
+
+    opt.selected_gpus = None
+    if opt.gpu:
+
+        print(">> Running with --gpu option. Checking the available and supported GPUs.")
+        gpus = cleanComputeCapabilities("cuda")
+        gpus = gpus + cleanComputeCapabilities("rocm", len(gpus))
+        available_gpus = gpus
+
+        print("> GPUs Available:")
+        [print(f) for f in available_gpus]
+
+        # Filtering ONLY CUDA GPUs on capability
+        gpus = [g for g in gpus if not g.isCUDA() or (g.isCUDA() and g.capability in opt.CUDACapabilities)]
+
+        # Filtering by name (if parsed)
+        if len(opt.GPUName) > 0:
+            gpus = [g for g in gpus if g.name == opt.GPUName]
+
+        if available_gpus != gpus:
+            print(">> Selected:")
+            [print(f) for f in gpus]
+        else:
+            print(">> All selected!")
+
+        opt.selected_gpus = cycle(gpus)
+
     if opt.command: opt.command = ' '.join(opt.command)
     os.environ["CMSSW_DAS_QUERY_SITES"]=opt.dasSites
     if opt.failed_from:
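
The selection block above only constrains CUDA devices by compute capability (ROCm devices always pass that cut), optionally narrows by exact model name, and finally wraps the survivors in itertools.cycle so MatrixRunner can keep drawing from them indefinitely. A condensed sketch of that behaviour with invented inputs standing in for opt.CUDACapabilities, opt.GPUName and the AvailableGPU list:

from itertools import cycle
from collections import namedtuple

GPU = namedtuple('GPU', 'make capability name')            # stand-in for AvailableGPU
gpus = [GPU('CUDA', '7.5', 'NVIDIA T4'),
        GPU('CUDA', '8.6', 'NVIDIA A10'),
        GPU('ROCM', 'gfx90a', 'AMD Instinct MI250')]

CUDACapabilities = ['7.5', '8.0']                          # invented opt.CUDACapabilities
GPUName = ''                                               # invented opt.GPUName (empty: no name cut)

selected = [g for g in gpus if g.make != 'CUDA' or g.capability in CUDACapabilities]
if len(GPUName) > 0:
    selected = [g for g in selected if g.name == GPUName]

print([g.name for g in selected])         # ['NVIDIA T4', 'AMD Instinct MI250']; the 8.6 A10 is cut

rr = cycle(selected)
print([next(rr).name for _ in range(4)])  # round-robin: T4, MI250, T4, MI250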
