@@ -179,12 +179,7 @@ class ProcessGroupNCCLNoHeartbeatCaught
179179 int rank,
180180 int size,
181181 c10::intrusive_ptr<c10d::ProcessGroupNCCL::Options> opts)
182- : ProcessGroupNCCLTimedOutErrors(store, rank, size, std::move(opts)) {
183- // Override the heartbeat monitor function to make sure that we capture
184- // the exception in the monitor thread because we cannot try-catch it in
185- // the main thread and we set a flag for the main thread to check.
186- heartbeatMonitor_ = std::make_unique<TestHeartbeatMonitor>(this );
187- }
182+ : ProcessGroupNCCLTimedOutErrors(store, rank, size, std::move(opts)) {}
188183
189184 std::mutex& getWatchdogMutex () {
190185 return workMetaListMutex_;
@@ -200,22 +195,18 @@ class ProcessGroupNCCLNoHeartbeatCaught
200195 asyncDebugDump.wait ();
201196 }
202197
203- class TestHeartbeatMonitor : public c10d ::ProcessGroupNCCL::HeartbeatMonitor {
204- public:
205- using HeartbeatMonitor::HeartbeatMonitor;
206-
207- void runLoop () override {
208- try {
209- c10d::ProcessGroupNCCL::HeartbeatMonitor::runLoop ();
210- } catch (std::runtime_error& e) {
211- // Safe cast because we know it's a ProcessGroupNCCLNoHeartbeatCaught
212- auto * pg = static_cast <ProcessGroupNCCLNoHeartbeatCaught*>(pg_);
213- pg->hasMonitorThreadCaughtError_ = true ;
214- }
198+ protected:
199+ // Override the heartbeat monitor function to make sure that we capture
200+ // the exception in the monitor thread because we cannot try-catch it in
201+ // the main thread and we set a flag for the main thread to check.
202+ void heartbeatMonitor () override {
203+ try {
204+ c10d::ProcessGroupNCCL::heartbeatMonitor ();
205+ } catch (std::runtime_error& e) {
206+ hasMonitorThreadCaughtError_ = true ;
215207 }
216- };
208+ }
217209
218- protected:
219210 // It's really hard to unit test std::abort. So we override it instead.
220211 // Commented this override, we do see process aborted with core dump without
221212 // this override.
0 commit comments