Skip to content

Commit fcac383

Browse files
committed
more scheduler tests
Signed-off-by: oliver könig <[email protected]>
1 parent 7ecba1e commit fcac383

File tree

2 files changed

+84
-13
lines changed

2 files changed

+84
-13
lines changed

nemo_run/run/torchx_backend/schedulers/docker.py

Lines changed: 17 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,23 @@ def describe(self, app_id: str) -> Optional[DescribeAppResponse]:
170170
)
171171
)
172172
states.append(_state)
173+
else:
174+
status_file = os.path.join(req.executor.job_dir, f"status_{role}.out")
175+
if os.path.exists(status_file):
176+
with open(status_file, "r") as f:
177+
status = json.load(f)
178+
roles_statuses[role].replicas.append(
179+
ReplicaStatus(
180+
id=0,
181+
role=role,
182+
state=int(status["exit_code"]),
183+
hostname=container.name,
184+
)
185+
)
186+
state = (
187+
AppState.FAILED if int(status["exit_code"]) != 0 else AppState.SUCCEEDED
188+
)
189+
states.append(state)
173190

174191
state = AppState.UNKNOWN
175192
if any(is_terminal(state) for state in states):
@@ -179,19 +196,6 @@ def describe(self, app_id: str) -> Optional[DescribeAppResponse]:
179196
state = AppState.FAILED
180197
elif len(states) > 0:
181198
state = next(state for state in states if not is_terminal(state))
182-
else:
183-
status_file = os.path.join(req.executor.job_dir, f"status_{role}.out")
184-
if os.path.exists(status_file):
185-
with open(status_file, "r") as f:
186-
status = json.load(f)
187-
roles_statuses[role].replicas.append(
188-
ReplicaStatus(
189-
id=0, role=role, state=int(status["exit_code"]), hostname=container.name
190-
)
191-
)
192-
state = AppState.FAILED if int(status["exit_code"]) != 0 else AppState.SUCCEEDED
193-
else:
194-
state = AppState.UNKNOWN
195199

196200
return DescribeAppResponse(
197201
app_id=app_id,

test/run/torchx_backend/schedulers/test_docker.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
1515

16+
import json
17+
import os
1618
import tempfile
1719
from unittest import mock
1820

@@ -153,6 +155,71 @@ def test_describe_running(docker_scheduler, docker_executor):
153155
assert len(response.roles) == 1
154156

155157

158+
def test_describe_failed(docker_scheduler, docker_executor):
# Checks that describe() reports a FAILED app state when the role's status
# file on disk records a non-zero exit code.
159+
with (
160+
mock.patch.object(DockerJobRequest, "load") as mock_load,
161+
mock.patch.object(DockerContainer, "get_container") as mock_get_container,
162+
mock.patch.object(PersistentDockerScheduler, "_get_app_state") as mock_get_app_state,
163+
):
164+
container = DockerContainer(
165+
name="test_role",
166+
command=["test"],
167+
executor=docker_executor,
168+
extra_env={},
169+
)
170+
req = DockerJobRequest(
171+
id="test_session___test_role___test_container_id",
172+
executor=docker_executor,
173+
containers=[container],
174+
)
175+
mock_load.return_value = req
176+
mock_get_container.return_value = container
177+
# No app state is returned by the patched helper; presumably this is what
# routes describe() into its status-file fallback -- verify against describe().
mock_get_app_state.return_value = None
178+
# Same naming scheme describe() uses for the file: status_<role>.out in job_dir.
status_file = os.path.join(req.executor.job_dir, f"status_{req.containers[0].name}.out")
179+
180+
# Record exit code 1 for the role.
# NOTE(review): this test does not remove the file afterwards; cleanup
# presumably relies on the docker_executor fixture -- confirm.
with open(status_file, "w") as f:
181+
f.write(json.dumps({"exit_code": 1}))
182+
183+
response = docker_scheduler.describe(req.id)
184+
assert response is not None
185+
assert response.app_id == req.id
186+
# The state is compared via its string form rather than against the enum member.
assert "FAILED" in str(response.state)
187+
assert len(response.roles) == 1
188+
189+
190+
@pytest.mark.xfail
# Expected-failure counterpart of test_describe_failed: the setup is identical
# (status file with exit code 1) but the assertion below claims "SUCCEEDED",
# so the test is expected to fail while describe() detects the failure.
# NOTE(review): the marker has no reason= and no strict=; unless the project
# sets xfail_strict, an unexpected pass would not fail the suite -- confirm.
191+
def test_describe_failure_not_detected(docker_scheduler, docker_executor):
192+
with (
193+
mock.patch.object(DockerJobRequest, "load") as mock_load,
194+
mock.patch.object(DockerContainer, "get_container") as mock_get_container,
195+
mock.patch.object(PersistentDockerScheduler, "_get_app_state") as mock_get_app_state,
196+
):
197+
container = DockerContainer(
198+
name="test_role",
199+
command=["test"],
200+
executor=docker_executor,
201+
extra_env={},
202+
)
203+
req = DockerJobRequest(
204+
id="test_session___test_role___test_container_id",
205+
executor=docker_executor,
206+
containers=[container],
207+
)
208+
mock_load.return_value = req
209+
mock_get_container.return_value = container
210+
# No app state is returned by the patched helper; presumably this is what
# routes describe() into its status-file fallback -- verify against describe().
mock_get_app_state.return_value = None
211+
status_file = os.path.join(req.executor.job_dir, f"status_{req.containers[0].name}.out")
212+
213+
# Record exit code 1 for the role (a failing run).
with open(status_file, "w") as f:
214+
f.write(json.dumps({"exit_code": 1}))
215+
216+
response = docker_scheduler.describe(req.id)
217+
assert response is not None
218+
assert response.app_id == req.id
219+
# Deliberately the inverse of test_describe_failed: asserting SUCCEEDED for a
# non-zero exit code is the condition the xfail marker covers.
assert "SUCCEEDED" in str(response.state)
220+
assert len(response.roles) == 1
221+
222+
156223
def test_save_and_get_job_dirs():
157224
with tempfile.TemporaryDirectory() as temp_dir:
158225
from nemo_run.config import set_nemorun_home

0 commit comments

Comments
 (0)