Skip to content

Commit 2c91835

Browse files
committed
Fix test failure: handle job_resources.nodes given as a plain hostname string in _describe_squeue
1 parent 4bb027f commit 2c91835

File tree

2 files changed

+35
-1
lines changed

2 files changed

+35
-1
lines changed

torchx/schedulers/slurm_scheduler.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -767,7 +767,15 @@ def _describe_squeue(self, app_id: str) -> Optional[DescribeAppResponse]:
767767
)
768768
else:
769769
# Fallback: use hostname from nodes.list
770-
hostname = nodes_data.get("list", "")
770+
if isinstance(nodes_data, str):
771+
hostname = nodes_data
772+
else:
773+
hostname = (
774+
nodes_data.get("list", "")
775+
if isinstance(nodes_data, dict)
776+
else ""
777+
)
778+
771779
role.num_replicas += 1
772780
role_status.replicas.append(
773781
ReplicaStatus(

torchx/schedulers/test/slurm_scheduler_test.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1102,3 +1102,29 @@ def test_describe_sacct_handles_dot_separated_job_ids(self) -> None:
11021102
assert len(result.roles) == 1
11031103
assert result.roles[0].name == "mesh0"
11041104
assert result.roles[0].num_replicas == 1
1105+
1106+
def test_describe_squeue_nodes_as_string(self) -> None:
    """Check that a plain-string ``job_resources.nodes`` becomes the hostname.

    The mocked squeue JSON gives ``nodes`` as a bare hostname string rather
    than a dict and has no ``allocated_nodes`` entry; the first replica
    reported by ``_describe_squeue`` must carry that string as its hostname.
    """
    # "nodes" is deliberately a str (not a dict); "allocated_nodes" is absent.
    job_resources = {"nodes": "compute-node-123"}
    job_entry = {
        "name": "test-job-0",
        "job_state": ["RUNNING"],
        "job_resources": job_resources,
        "command": "/bin/echo",
        "current_working_directory": "/tmp",
    }
    squeue_json = json.dumps({"jobs": [job_entry]})

    # Stub the subprocess call so it returns the canned JSON payload.
    with patch("subprocess.check_output", return_value=squeue_json):
        result = SlurmScheduler("test")._describe_squeue("123")

    assert result is not None
    first_replica = result.roles_statuses[0].replicas[0]
    assert first_replica.hostname == "compute-node-123"

0 commit comments

Comments
 (0)