Skip to content

Commit 77a7b6c

Browse files
authored
[release/2.7][ROCm] update state check for test_trace_while_active* (… (#2202)
#2110) When timing is enabled, the ROCR runtime used to sleep for a small amount of time, which ensured that the application saw the correct state. However, this sleep was removed for performance reasons, so the state is no longer guaranteed to be "started". I therefore updated the test's state check to accept either "started" or "scheduled". Fixes https://ontrack-internal.amd.com/browse/SWDEV-525883 Upstream PR: pytorch#153545 (cherry picked from commit 8a1ad2c) Fixes #ISSUE_NUMBER
1 parent a9d0da9 commit 77a7b6c

File tree

1 file changed

+12
-4
lines changed

1 file changed

+12
-4
lines changed

test/distributed/test_c10d_nccl.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@
3939
from torch import nn
4040
from torch._C._distributed_c10d import ErrorType, OpType, WorkResult
4141
from torch.nn.parallel import DistributedDataParallel
42-
from torch.testing._internal.common_cuda import TEST_MULTIGPU
42+
from torch.testing._internal.common_cuda import TEST_MULTIGPU, _get_torch_rocm_version
4343
from torch.testing._internal.common_distributed import (
4444
get_timeout,
4545
init_multigpu_helper,
@@ -4634,9 +4634,17 @@ def test_trace_while_active(self, timing_enabled, only_active):
46344634
else:
46354635
self.assertEqual(t[-1]["profiling_name"], "nccl:all_reduce")
46364636
self.assertEqual(t[-1]["collective_seq_id"], 2)
4637-
self.assertEqual(
4638-
t[-1]["state"], self.started_or_scheduled(timing_enabled)
4639-
)
4637+
4638+
# The ROCm runtime used to call uSleep(20 µs) inside the default-signal busy-wait loop.
4639+
# That sleep has been removed, which lets the host thread spin continuously.
4640+
# Therefore, the state can be either "scheduled" or "started" by the time the test dumps the trace.
4641+
if torch.version.hip and _get_torch_rocm_version() >= (6,4) and timing_enabled:
4642+
assert(
4643+
t[-1]["state"] in ("scheduled", "started"))
4644+
else:
4645+
self.assertEqual(
4646+
t[-1]["state"], self.started_or_scheduled(timing_enabled)
4647+
)
46404648

46414649
self.parent.send("next")
46424650
self.assertEqual("next", self.parent.recv())

0 commit comments

Comments
 (0)