@@ -108,6 +108,10 @@ def build_graph(self, element, parent_gp=None):
108108 def discover_devices (self ):
109109 """
110110 Discover different kinds of GPU and NIC.
111+
112+ TODO: a better design would be to discover devices, then annotate the graph
113+ nodes with device type. Then when we search for gpu or nic, we can just
114+ search by that attribute. Right now we search for PCIDev, and then filter.
111115 """
112116 # Don't assume only one vendor of GPU.
113117 for vendor , command in {"NVIDIA" : commands .nvidia_smi , "AMD" : commands .rocm_smi }.items ():
@@ -180,7 +184,7 @@ def add_latency_edges(matrix, indexes):
180184 weight = matrix [i ][j ], # The weight is the latency value from the matrix.
181185 )
182186
183- # --- First, try to parse the modern hwloc v2.x format ---
187+ # First, try to parse newer hwloc v2.x format
184188 for dist_el in root .iter ("distances2" ):
185189 try :
186190 # nbobjs is how many objects are in the matrix (e.g., 2 for 2 NUMA nodes).
@@ -275,6 +279,9 @@ def match_resources(self, jobspec, allocated_gps=None):
275279 return final_allocation
276280
277281 def sort_by_affinity (self , candidates , affinity , allocated ):
282+ """
283+ Sort list of candidates by affinity so we get closest one.
284+ """
278285 target_type = self .translate_type (affinity .get ("type" ))
279286 if not target_type :
280287 log .warning ("Affinity spec missing 'type'." )
@@ -323,6 +330,48 @@ def translate_type(self, requested_type: str):
323330 # capitalizing the word (e.g., 'l3cache' -> 'L3cache').
324331 return mapping .get (requested_type .lower (), requested_type .capitalize ())
325332
333+ def find_bindable_leaves (self , total_allocation , bind_level ):
334+ """
335+ Given an allocation, find bindable leaf nodes.
336+
337+ This function handles three cases for each allocated resource:
338+ 1. Identity: The resource is already the desired bindable type.
339+ 2. Descendants: The resource is a container (e.g., Package) of the desired type.
340+ 3. Ancestors: The resource is a child (e.g., PU) of the desired type.
341+ """
342+ leaf_nodes = []
343+ log .debug (f"Transforming { len (total_allocation )} allocated objects to bind_level '{ bind_level } '..." )
344+
345+ # Concrete hwloc type string for our target bind level.
346+ # Hey spack, we can concretize too!
347+ bind_type_concrete = self .translate_type (bind_level )
348+
349+ for gp , data in total_allocation :
350+
351+ # Case 1: The allocated object IS the type we want to bind to.
352+ if data .get ('type' ) == bind_type_concrete :
353+ leaf_nodes .append ((gp , data ))
354+ continue
355+
356+ # Case 2: The allocated object is a container of the type we want to bind to.
357+ descendants = self .get_descendants (gp , type = bind_type_concrete )
358+ if descendants :
359+ leaf_nodes .extend (descendants )
360+ continue
361+
362+ # Case 3: The allocated object is a child of the type we want to bind to.
363+ # Example: total_allocation=[PU:0], bind_level='core'
364+ ancestor = self .get_ancestor_of_type (gp , bind_type_concrete )
365+ if ancestor :
366+ leaf_nodes .append (ancestor )
367+
368+ # De-duplicate the final list (e.g., if two PUs map to the same Core)
369+ # and sort for deterministic assignment.
370+ leaf_nodes = list ({gp : (gp , data ) for gp , data in leaf_nodes }.values ())
371+ leaf_nodes .sort (key = self .get_sort_key_for_node )
372+ log .debug (f"Transformation resulted in { len (leaf_nodes )} unique bindable leaf nodes." )
373+ return leaf_nodes
374+
326375 def summarize (self , nodes ):
327376 """
328377 Given a set of nodes in the graph (a set of resources) print a textual visual.
@@ -393,22 +442,7 @@ def get_available_children(self, parent_gp, child_type, allocated):
393442
394443 parent_type = parent_node .get ("type" )
395444 child_type_lower = child_type .lower ()
396-
397- candidate_gp_set = set ()
398- if child_type_lower == "gpu" :
399- for gpu_info in self .gpus :
400- if (pci_id := gpu_info .get ("pci_bus_id" )) and (
401- matches := self .find_objects (type = "PCIDev" , pci_busid = pci_id )
402- ):
403- candidate_gp_set .add (matches [0 ][0 ])
404- elif child_type_lower == "nic" :
405- for nic_gp , _ in self .nics :
406- candidate_gp_set .add (nic_gp )
407- else :
408- for gp , _ in self .find_objects (type = child_type ):
409- candidate_gp_set .add (gp )
410-
411- all_candidates = [(gp , self .graph .nodes [gp ]) for gp in candidate_gp_set ]
445+ all_candidates = self .find_objects (type = child_type )
412446 log .debug (
413447 f" - -> Found { len (all_candidates )} total unique system-wide candidates for '{ child_type } '."
414448 )
@@ -422,14 +456,14 @@ def get_available_children(self, parent_gp, child_type, allocated):
422456
423457 # Rule 1: Relationship to NUMA node is through shared PACKAGE parent (for Cores) or LOCALITY (for Devices)
424458 if parent_type == "NUMANode" :
425- if child_type_lower in ["gpu" , "nic" ]:
426- if data .get ("numa_os_index" ) == parent_node .get ("os_index" ):
427- is_valid_child = True
428- elif child_type_lower in ["core" , "pu" ]:
459+ if child_type_lower in ["core" , "pu" ]:
429460 package_of_numa = self .get_ancestor_of_type (parent_gp , "Package" )
430461 if package_of_numa and nx .has_path (self .hierarchy_view , package_of_numa [0 ], gp ):
431462 is_valid_child = True
432-
463+ elif child_type_lower == "pcidev" :
464+ if data .get ("numa_os_index" ) == parent_node .get ("os_index" ):
465+ is_valid_child = True
466+
433467 # Rule 2 (NEW): Relationship of a Core/PU to a Device is through shared NUMA LOCALITY
434468 elif parent_type == "PCIDev" and child_type_lower in ["core" , "pu" ]:
435469 parent_numa_idx = parent_node .get ("numa_os_index" )
@@ -449,8 +483,9 @@ def get_available_children(self, parent_gp, child_type, allocated):
449483 return available
450484
451485 def find_objects (self , ** attributes ):
452- # STOPED HERE, trace this and see if gp is globally identifier, add comments.
453- # then go back to making distances/distances2 function and add comments.
486+ """
487+ Search nodes in the graph for a specific attribute (or more than one)
488+ """
454489 return [
455490 (gp , data )
456491 for gp , data in self .graph .nodes (data = True )
@@ -468,9 +503,23 @@ def find_assignment_recursive(self, request, domain_gp, allocated, depth=0):
468503 domain_info = f"{ domain_node .get ('type' )} :{ domain_node .get ('os_index' , domain_node .get ('pci_busid' , domain_gp ))} "
469504 log .debug (f"{ indent } [ENTER] find_assignment(req={ request } , domain={ domain_info } )" )
470505
471- req_type , count = self .translate_type (request ["type" ]), request ["count" ]
506+ # This can also be gpu/nic
507+ raw_request_type = request ['type' ]
508+ req_type , count = self .translate_type (raw_request_type ), request ["count" ]
472509
510+ # If the req_type is gpu or nic, this isn't an actual type in the graph - it is PCIDev.
473511 candidates = self .get_available_children (domain_gp , req_type , allocated )
512+
513+ # Now we handle the type of the pcidev request and filter candidates to those devices.
514+ if raw_request_type .lower () == 'gpu' :
515+ gpu_bus_ids = {g ['pci_bus_id' ] for g in self .gpus }
516+ candidates = [node for node in candidates if node [1 ].get ('pci_busid' ) in gpu_bus_ids ]
517+ log .debug (f"{ indent } -> Filtered for 'gpu', { len (candidates )} candidates remain." )
518+ elif raw_request_type .lower () == 'nic' :
519+ nic_bus_ids = {n [1 ]['pci_busid' ] for n in self .nics }
520+ candidates = [node for node in candidates if node [1 ].get ('pci_busid' ) in nic_bus_ids ]
521+ log .debug (f"{ indent } -> Filtered for 'nic', { len (candidates )} candidates remain." )
522+
474523 log .debug (f"{ indent } -> Found { len (candidates )} initial candidates for '{ req_type } '." )
475524
476525 affinity_spec = request .get ("affinity" )
@@ -550,6 +599,9 @@ def find_assignment_recursive(self, request, domain_gp, allocated, depth=0):
550599 return None
551600
552601 def get_descendants (self , gp_index , ** filters ):
602+ """
603+ Given a global position index, return descendents that match a filter.
604+ """
553605 if gp_index not in self .graph :
554606 return []
555607 desc = list (nx .descendants (self .hierarchy_view , gp_index ))
@@ -648,14 +700,17 @@ def create_ordered_gpus(self):
648700 precompute_numa_affinities are done.
649701 """
650702 ordered_gpus = []
703+
704+ # The gpus we found were discovered with nvidia/rocm-smi and we need
705+ # to map to things in the graph.
651706 for gpu_info in self .gpus :
652707 pci_id = gpu_info .get ("pci_bus_id" )
653708 if not pci_id :
654709 continue
655710
656711 # Find the corresponding PCIDev object in our graph
657712 # Note: We now store and search for types in lowercase
658- matches = self .find_objects (type = "pcidev " , pci_busid = pci_id )
713+ matches = self .find_objects (type = "PCIDev " , pci_busid = pci_id )
659714 if not matches :
660715 log .warning (
661716 f"Could not find a graph object for discovered GPU with PCI ID: { pci_id } "
0 commit comments