Skip to content

Commit 852b99e

Browse files
Revert "[c10d] Separate monitoring thread into a class in PGNCCL (pytorch#153977)"
This reverts commit 0db9c64. Reverted pytorch#153977 on behalf of https://github.com/izaitsevfb due to breaks lots of jobs internally, safer to revert, see D75628917 ([comment](pytorch#153977 (comment)))
1 parent 20ee5f9 commit 852b99e

File tree

3 files changed

+163
-257
lines changed

3 files changed

+163
-257
lines changed

test/cpp/c10d/ProcessGroupNCCLErrorsTest.cpp

Lines changed: 11 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -179,12 +179,7 @@ class ProcessGroupNCCLNoHeartbeatCaught
179179
int rank,
180180
int size,
181181
c10::intrusive_ptr<c10d::ProcessGroupNCCL::Options> opts)
182-
: ProcessGroupNCCLTimedOutErrors(store, rank, size, std::move(opts)) {
183-
// Override the heartbeat monitor function to make sure that we capture
184-
// the exception in the monitor thread because we cannot try-catch it in
185-
// the main thread and we set a flag for the main thread to check.
186-
heartbeatMonitor_ = std::make_unique<TestHeartbeatMonitor>(this);
187-
}
182+
: ProcessGroupNCCLTimedOutErrors(store, rank, size, std::move(opts)) {}
188183

189184
std::mutex& getWatchdogMutex() {
190185
return workMetaListMutex_;
@@ -200,22 +195,18 @@ class ProcessGroupNCCLNoHeartbeatCaught
200195
asyncDebugDump.wait();
201196
}
202197

203-
class TestHeartbeatMonitor : public c10d::ProcessGroupNCCL::HeartbeatMonitor {
204-
public:
205-
using HeartbeatMonitor::HeartbeatMonitor;
206-
207-
void runLoop() override {
208-
try {
209-
c10d::ProcessGroupNCCL::HeartbeatMonitor::runLoop();
210-
} catch (std::runtime_error& e) {
211-
// Safe cast because we know it's a ProcessGroupNCCLNoHeartbeatCaught
212-
auto* pg = static_cast<ProcessGroupNCCLNoHeartbeatCaught*>(pg_);
213-
pg->hasMonitorThreadCaughtError_ = true;
214-
}
198+
protected:
199+
// Override the heartbeat monitor function to make sure that we capture
200+
// the exception in the monitor thread because we cannot try-catch it in
201+
// the main thread and we set a flag for the main thread to check.
202+
void heartbeatMonitor() override {
203+
try {
204+
c10d::ProcessGroupNCCL::heartbeatMonitor();
205+
} catch (std::runtime_error& e) {
206+
hasMonitorThreadCaughtError_ = true;
215207
}
216-
};
208+
}
217209

218-
protected:
219210
// It's really hard to unit test std::abort. So we override it instead.
220211
// Commented this override, we do see process aborted with core dump without
221212
// this override.

0 commit comments

Comments
 (0)