Skip to content

Commit 0b8306b

Browse files
ko3n1g and claude authored
fix: Remove fail-fast (-x) and guard distributed teardown against deadlock (#4139)
Signed-off-by: oliver könig <okoenig@nvidia.com>
Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 10e7b74 commit 0b8306b

File tree

4 files changed

+13
-5
lines changed

4 files changed

+13
-5
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -234,7 +234,7 @@ exclude = '''
234234
'''
235235

236236
[tool.pytest.ini_options]
237-
addopts = "--durations=15 -s -rA -x"
237+
addopts = "--durations=15 -s -rA"
238238
testpaths = ["tests"]
239239
python_files = "test_*.py"
240240
markers = [

tests/unit_tests/conftest.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22

33
import os
4+
from datetime import timedelta
45
from pathlib import Path
56

67
import pytest
@@ -42,7 +43,10 @@ def pytest_sessionfinish(session, exitstatus):
4243
def cleanup():
4344
yield
4445
if torch.distributed.is_initialized():
45-
torch.distributed.barrier()
46+
try:
47+
torch.distributed.barrier(timeout=timedelta(seconds=30))
48+
except Exception:
49+
return
4650
torch.distributed.destroy_process_group()
4751

4852

tests/unit_tests/run_ci_test.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -148,14 +148,14 @@ for i in $(seq $UNIT_TEST_REPEAT); do
148148
--data-file=.coverage.unit_tests \
149149
--source=megatron/core \
150150
-m pytest \
151-
-xvs \
151+
-vs \
152152
${IGNORE_ARGS[@]} \
153153
-m "'not experimental and ${MARKER_ARG}'" $(echo "$BUCKET" | sed 's|/\*\*/\*\.py$||'))
154154
eval "$CMD"
155155

156156
if [[ "$TAG" == "latest" ]]; then
157157
CMD=$(echo uv run --no-sync python -m torch.distributed.run ${DISTRIBUTED_ARGS[@]} -m pytest \
158-
-xvs \
158+
-vs \
159159
--experimental \
160160
${IGNORE_ARGS[@]} \
161161
-m "'experimental and ${MARKER_ARG}'" $(echo "$BUCKET" | sed 's|/\*\*/\*\.py$||'))

tests/unit_tests/test_utilities.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,11 @@ def destroy_model_parallel():
9191
os.environ.pop('NVTE_UNFUSED_ATTN', None)
9292
if not Utils.inited:
9393
return
94-
torch.distributed.barrier()
94+
try:
95+
torch.distributed.barrier(timeout=timedelta(seconds=30))
96+
except Exception:
97+
Utils.inited = False
98+
return
9599
ps.destroy_model_parallel()
96100
Utils.inited = False
97101

0 commit comments

Comments
 (0)