1414
1515from .. import envs
1616from ..detector import (
17- Devices ,
18- ManufacturerEnum ,
17+ Topology ,
1918 detect_devices ,
19+ get_devices_topologies ,
20+ group_devices_by_manufacturer ,
2021 manufacturer_to_backend ,
2122)
2223from .__utils__ import (
@@ -1279,6 +1280,17 @@ class Deployer(ABC):
12791280 "AMD_VISIBLE_DEVICES": ["0", "1"]
12801281 }.
12811282 """
1283+ _visible_devices_topologies : dict [str , Topology ] | None = None
1284+ """
1285+ Recorded visible devices topologies,
1286+ the key is the runtime visible devices env name,
1287+ the value is the corresponding topology.
1288+ For example:
1289+ {
1290+ "NVIDIA_VISIBLE_DEVICES": Topology(...),
1291+ "AMD_VISIBLE_DEVICES": Topology(...)
1292+ }.
1293+ """
12821294 _backend_visible_devices_values_alignment : dict [str , dict [str , str ]] | None = None
12831295 """
12841296 Recorded backend visible devices values alignment,
@@ -1326,25 +1338,27 @@ def __enter__(self):
def __exit__(self, exc_type, exc_value, traceback):
    """
    Leave the context manager by closing this instance.

    The exception details are ignored and nothing is returned,
    so an exception raised inside the `with` body is not suppressed.
    """
    self.close()
13281340
1329- def _fetch_visible_devices_env_values (self ):
1341+ def _prepare (self ):
13301342 """
1331- Fetch the visible devices environment variables and values.
1343+ Detect devices once, and construct critical elements for post processing, including:
1344+ - Prepare visible devices environment variables mapping.
1345+ - Prepare visible devices values mapping.
1346+ - Prepare topology.
13321347 """
13331348 if self ._visible_devices_env :
13341349 return
13351350
13361351 self ._visible_devices_env = {}
13371352 self ._visible_devices_values = {}
1353+ self ._visible_devices_topologies = {}
13381354 self ._backend_visible_devices_values_alignment = {}
13391355
1340- devices : dict [ManufacturerEnum , Devices ] = {}
1341- for dev in detect_devices (fast = False ):
1342- if dev .manufacturer not in devices :
1343- devices [dev .manufacturer ] = []
1344- devices [dev .manufacturer ].append (dev )
1356+ group_devices = group_devices_by_manufacturer (
1357+ detect_devices (fast = False ),
1358+ )
13451359
1346- if devices :
1347- for manu , devs in devices .items ():
1360+ if group_devices :
1361+ for manu , devs in group_devices .items ():
13481362 backend = manufacturer_to_backend (manu )
13491363 rk = envs .GPUSTACK_RUNTIME_DETECT_BACKEND_MAP_RESOURCE_KEY .get (backend )
13501364 ren = envs .GPUSTACK_RUNTIME_DEPLOY_RESOURCE_KEY_MAP_RUNTIME_VISIBLE_DEVICES .get (
@@ -1377,6 +1391,13 @@ def _fetch_visible_devices_env_values(self):
13771391 self ._backend_visible_devices_values_alignment [ben_item ] = (
13781392 dev_indexes_alignment
13791393 )
1394+ if (
1395+ envs .GPUSTACK_RUNTIME_DEPLOY_CPU_AFFINITY
1396+ or envs .GPUSTACK_RUNTIME_DEPLOY_NUMA_AFFINITY
1397+ ):
1398+ topos = get_devices_topologies (devices = devs )
1399+ if topos :
1400+ self ._visible_devices_topologies [ren ] = topos [0 ]
13801401
13811402 if self ._visible_devices_env :
13821403 return
@@ -1385,7 +1406,7 @@ def _fetch_visible_devices_env_values(self):
13851406 self ._visible_devices_env ["UNKNOWN_RUNTIME_VISIBLE_DEVICES" ] = []
13861407 self ._visible_devices_values ["UNKNOWN_RUNTIME_VISIBLE_DEVICES" ] = ["all" ]
13871408
1388- def visible_devices_env_values (
1409+ def get_visible_devices_env_values (
13891410 self ,
13901411 ) -> (dict [str , list [str ]], dict [str , list [str ]]):
13911412 """
@@ -1410,9 +1431,44 @@ def visible_devices_env_values(
14101431 to lists of device indexes or UUIDs.
14111432
14121433 """
1413- self ._fetch_visible_devices_env_values ()
1434+ self ._prepare ()
14141435 return self ._visible_devices_env , self ._visible_devices_values
14151436
def get_visible_devices_affinities(
    self,
    runtime_env: list[str],
    resource_value: str,
) -> tuple[str, str]:
    """
    Get the CPU and NUMA affinities for the given runtime environment and resource value.

    Args:
        runtime_env:
            The list of runtime visible devices environment variable names.
        resource_value:
            The resource value, which can be "all" or a comma-separated list of device indexes.

    Returns:
        A tuple containing:
        - A comma-separated string of CPU affinities.
        - A comma-separated string of NUMA affinities.
        Both strings are deduplicated and keep the order in which the values
        were first reported; both are empty when no topology is recorded
        for any of the given environment variable names.

    Raises:
        ValueError:
            If the resource value is not "all" and contains an item
            that cannot be parsed as an integer.

    """
    # Topologies are recorded by _prepare(); without this call the attribute
    # may still be None when this is the first accessor used on the instance.
    self._prepare()
    topologies = self._visible_devices_topologies or {}

    # An empty index list is what gets passed on for "all".
    dev_indexes: list[int] = []
    if resource_value != "all":
        dev_indexes = [int(v.strip()) for v in resource_value.split(",")]

    cpus: list[str] = []
    numas: list[str] = []
    for re_ in runtime_env:
        topo = topologies.get(re_)
        if not topo:
            continue
        cs, ns = topo.get_affinities(dev_indexes, deduplicate=False)
        cpus.extend(cs)
        numas.extend(ns)

    # dict.fromkeys deduplicates while preserving first-seen order, so the
    # result is deterministic (a plain set has arbitrary iteration order).
    return ",".join(dict.fromkeys(cpus)), ",".join(dict.fromkeys(numas))
1471+
14161472 def align_backend_visible_devices_env_values (
14171473 self ,
14181474 backend_visible_devices_env : str ,
@@ -1440,7 +1496,7 @@ def align_backend_visible_devices_env_values(
14401496 not in envs .GPUSTACK_RUNTIME_DEPLOY_BACKEND_VISIBLE_DEVICES_VALUE_ALIGNMENT
14411497 ):
14421498 return resource_key_values
1443- self ._fetch_visible_devices_env_values ()
1499+ self ._prepare ()
14441500 alignments = self ._backend_visible_devices_values_alignment .get (
14451501 backend_visible_devices_env ,
14461502 )
0 commit comments