@@ -598,18 +598,18 @@ def _describe_sacct(self, app_id: str) -> Optional[DescribeAppResponse]:
598598 for row in reader :
599599 # Handle both "+" (heterogeneous) and "." (regular) job ID formats
600600 job_id_full = row ["JobID" ]
601-
601+
602602 # Split on both "+" and "." to handle different SLURM configurations
603603 if "+" in job_id_full :
604604 job_id , * parts = job_id_full .split ("+" )
605605 is_subjob = len (parts ) > 0 and "." in parts [0 ]
606606 else :
607607 job_id , * parts = job_id_full .split ("." )
608608 is_subjob = len (parts ) > 0
609-
609+
610610 if job_id != app_id :
611611 continue
612-
612+
613613 if is_subjob :
614614 # we only care about the main job not the child jobs (.batch, .0, etc.)
615615 continue
@@ -717,13 +717,17 @@ def _describe_squeue(self, app_id: str) -> Optional[DescribeAppResponse]:
717717 nodes_data = job_resources .get ("nodes" , {})
718718
719719 # SLURM 24.11+ changed from allocated_nodes to nodes.allocation structure
720- if "allocation" in nodes_data and isinstance (nodes_data ["allocation" ], list ):
720+ if "allocation" in nodes_data and isinstance (
721+ nodes_data ["allocation" ], list
722+ ):
721723 # SLURM 24.11+ format: nodes.allocation is a list
722724 for node_info in nodes_data ["allocation" ]:
723725 hostname = node_info ["name" ]
724726 cpu = int (node_info ["cpus" ]["used" ])
725- memMB = int (node_info ["memory" ]["allocated" ]) // 1024 # Convert to MB
726-
727+ memMB = (
728+ int (node_info ["memory" ]["allocated" ]) // 1024
729+ ) # Convert to MB
730+
727731 role .resource = Resource (cpu = cpu , memMB = memMB , gpu = - 1 )
728732 role .num_replicas += 1
729733 role_status .replicas .append (
@@ -734,7 +738,9 @@ def _describe_squeue(self, app_id: str) -> Optional[DescribeAppResponse]:
734738 hostname = hostname ,
735739 )
736740 )
737- elif "allocated_nodes" in job_resources and isinstance (job_resources ["allocated_nodes" ], list ):
741+ elif "allocated_nodes" in job_resources and isinstance (
742+ job_resources ["allocated_nodes" ], list
743+ ):
738744 # Legacy format: allocated_nodes is a list
739745 for node_info in job_resources ["allocated_nodes" ]:
740746 # NOTE: we expect resource specs for all the nodes to be the same
@@ -772,7 +778,6 @@ def _describe_squeue(self, app_id: str) -> Optional[DescribeAppResponse]:
772778 )
773779 )
774780
775-
776781 return DescribeAppResponse (
777782 app_id = app_id ,
778783 roles = list (roles .values ()),
0 commit comments