Skip to content

Commit 03da441

Browse files
pavanbalajimeta-codesync[bot]
authored and committed
Remove completed_work_queue_
Summary: The completed_work_queue_ is no longer needed. It is OK to free work objects and the corresponding tensors from the timeout thread as well.

Reviewed By: tanquer

Differential Revision: D85455174

fbshipit-source-id: e74fab99702515a1295533c79fec83f0e2c50adf
1 parent 1c24ac7 commit 03da441

File tree

6 files changed

+25
-44
lines changed

6 files changed

+25
-44
lines changed

comms/torchcomms/ncclx/TorchCommNCCLX.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -297,7 +297,7 @@ class TorchCommNCCLX : public TorchCommBackend,
297297
void timeoutWatchdog() noexcept;
298298
void checkInitialized() const;
299299
void checkAndAbortIfTimedOutOrError();
300-
void checkWorkQueue(bool isMainThread);
300+
void checkWorkQueue();
301301
void enqueueWork(std::shared_ptr<TorchWorkNCCLX> work, cudaStream_t stream);
302302
bool getGraphCaptureMode();
303303
cudaStream_t getOperationStream(bool async_op);

comms/torchcomms/ncclx/TorchCommNCCLXUtils.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -157,8 +157,8 @@ NcclxWindowCmpOp TorchCommNCCLX::getNcclSignalCmpOp(SignalCmpOp op) {
157157
#endif
158158
}
159159

160-
void TorchCommNCCLX::checkWorkQueue(bool isMainThread) {
161-
TorchWorkNCCLX::WorkStatus status = workq_.garbageCollect(isMainThread);
160+
void TorchCommNCCLX::checkWorkQueue() {
161+
TorchWorkNCCLX::WorkStatus status = workq_.garbageCollect();
162162

163163
switch (status) {
164164
case TorchWorkNCCLX::WorkStatus::TIMEDOUT:
@@ -192,7 +192,7 @@ void TorchCommNCCLX::timeoutWatchdog() noexcept {
192192
}
193193

194194
// Check work objects for completion or timeout
195-
checkWorkQueue(false);
195+
checkWorkQueue();
196196
if (comm_state_ != CommState::NORMAL &&
197197
options_.abort_process_on_timeout_or_error) {
198198
// Log the error and abort the process. We cannot abort the NCCL
@@ -226,7 +226,7 @@ void TorchCommNCCLX::checkAndAbortIfTimedOutOrError() {
226226
}
227227

228228
// First, check work queue status
229-
checkWorkQueue(true);
229+
checkWorkQueue();
230230

231231
if (comm_state_ == CommState::TIMEOUT) {
232232
abortNcclComm();

comms/torchcomms/ncclx/TorchWorkNCCLX.cpp

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -121,9 +121,6 @@ TorchWorkNCCLX::WorkStatus TorchWorkNCCLX::checkStatus() {
121121
if (end_status == cudaSuccess) {
122122
// End event has completed, mark the work as completed
123123
state_ = WorkStatus::COMPLETED;
124-
125-
// Release the input tensors
126-
inputTensors_.clear();
127124
} else if (end_status == cudaErrorNotReady) {
128125
// End event has not completed yet, check for timeout
129126
auto current_time = std::chrono::steady_clock::now();

comms/torchcomms/ncclx/TorchWorkNCCLX.hpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -95,15 +95,14 @@ class TorchWorkNCCLXQueue {
9595
TorchWorkNCCLXQueue() = default;
9696
~TorchWorkNCCLXQueue() = default;
9797

98-
TorchWorkNCCLX::WorkStatus garbageCollect(bool isMainThread);
98+
TorchWorkNCCLX::WorkStatus garbageCollect();
9999
// Finalize function can only be called from the main thread
100100
TorchWorkNCCLX::WorkStatus finalize();
101101
void enqueueWork(std::shared_ptr<TorchWorkNCCLX> work, cudaStream_t stream);
102102

103103
private:
104104
std::unordered_map<cudaStream_t, std::queue<std::shared_ptr<TorchWorkNCCLX>>>
105105
stream_work_queues_;
106-
std::vector<std::shared_ptr<TorchWorkNCCLX>> completed_work_queue_;
107106
std::recursive_mutex work_queues_mutex_;
108107

109108
friend class TorchWorkNCCLXQueueCommTest;

comms/torchcomms/ncclx/TorchWorkNCCLXQueue.cpp

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,7 @@
55
namespace torch {
66
namespace comms {
77

8-
TorchWorkNCCLX::WorkStatus TorchWorkNCCLXQueue::garbageCollect(
9-
bool isMainThread) {
8+
TorchWorkNCCLX::WorkStatus TorchWorkNCCLXQueue::garbageCollect() {
109
std::lock_guard<std::recursive_mutex> lock(work_queues_mutex_);
1110

1211
TorchWorkNCCLX::WorkStatus last_status =
@@ -30,7 +29,6 @@ TorchWorkNCCLX::WorkStatus TorchWorkNCCLXQueue::garbageCollect(
3029
if (status == TorchWorkNCCLX::WorkStatus::COMPLETED) {
3130
// Work is completed, remove it from the work queue
3231
work_queue.pop();
33-
completed_work_queue_.push_back(work);
3432
// Continue to the next element in the queue
3533
} else if (
3634
status == TorchWorkNCCLX::WorkStatus::TIMEDOUT ||
@@ -51,11 +49,6 @@ TorchWorkNCCLX::WorkStatus TorchWorkNCCLXQueue::garbageCollect(
5149
}
5250
}
5351

54-
if (isMainThread) {
55-
// If we are the main thread, clear the completed work queues
56-
completed_work_queue_.clear();
57-
}
58-
5952
return last_status;
6053
}
6154

@@ -71,7 +64,7 @@ TorchWorkNCCLX::WorkStatus TorchWorkNCCLXQueue::finalize() {
7164
// empty
7265
TorchWorkNCCLX::WorkStatus status = TorchWorkNCCLX::WorkStatus::COMPLETED;
7366
while (!stream_work_queues_.empty()) {
74-
status = garbageCollect(true);
67+
status = garbageCollect();
7568
if (status == TorchWorkNCCLX::WorkStatus::ERROR ||
7669
status == TorchWorkNCCLX::WorkStatus::TIMEDOUT ||
7770
status == TorchWorkNCCLX::WorkStatus::COMPLETED) {
@@ -84,7 +77,6 @@ TorchWorkNCCLX::WorkStatus TorchWorkNCCLXQueue::finalize() {
8477
// NOTE: finalize MUST return without holding references to any work object,
8578
// otherwise it may leak object and cause side effects.
8679
stream_work_queues_.clear();
87-
completed_work_queue_.clear();
8880

8981
return status;
9082
}

comms/torchcomms/ncclx/tests/unit/cpp/TorchWorkNCCLXQueueTest.cpp

Lines changed: 17 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -146,18 +146,14 @@ class TorchWorkNCCLXQueueCommTest : public ::testing::Test {
146146
EXPECT_CALL(*mock_hook_, clear()).Times(times_clear);
147147
}
148148

149-
void checkWorkQueue(bool isMainThread) {
150-
comm_->checkWorkQueue(isMainThread);
149+
void checkWorkQueue() {
150+
comm_->checkWorkQueue();
151151
}
152152

153153
const auto& getStreamWorkQueues() {
154154
return comm_->workq_.stream_work_queues_;
155155
}
156156

157-
const auto& getCompletedWorkQueue() {
158-
return comm_->workq_.completed_work_queue_;
159-
}
160-
161157
cudaEvent_t getAsyncDependencyEvent() {
162158
return comm_->dependency_event_;
163159
}
@@ -180,7 +176,7 @@ class TorchWorkNCCLXQueueCommTest : public ::testing::Test {
180176

181177
TEST_F(TorchWorkNCCLXQueueTest, GarbageCollectEmptyQueue) {
182178
// Test garbage collection on empty queue
183-
auto status = queue_->garbageCollect(false);
179+
auto status = queue_->garbageCollect();
184180
EXPECT_EQ(status, TorchWorkNCCLX::WorkStatus::COMPLETED);
185181
}
186182

@@ -191,9 +187,9 @@ TEST_F(TorchWorkNCCLXQueueTest, FinalizeEmptyQueue) {
191187

192188
TEST_F(TorchWorkNCCLXQueueTest, MultipleGarbageCollectCalls) {
193189
// Multiple garbage collect calls on empty queue should be safe
194-
auto status1 = queue_->garbageCollect(false);
195-
auto status2 = queue_->garbageCollect(false);
196-
auto status3 = queue_->garbageCollect(true);
190+
auto status1 = queue_->garbageCollect();
191+
auto status2 = queue_->garbageCollect();
192+
auto status3 = queue_->garbageCollect();
197193

198194
EXPECT_EQ(status1, TorchWorkNCCLX::WorkStatus::COMPLETED);
199195
EXPECT_EQ(status2, TorchWorkNCCLX::WorkStatus::COMPLETED);
@@ -202,7 +198,7 @@ TEST_F(TorchWorkNCCLXQueueTest, MultipleGarbageCollectCalls) {
202198

203199
TEST_F(TorchWorkNCCLXQueueTest, MultipleFinalizeCallsAfterGarbageCollect) {
204200
// Garbage collect first
205-
auto gc_status = queue_->garbageCollect(false);
201+
auto gc_status = queue_->garbageCollect();
206202
EXPECT_EQ(gc_status, TorchWorkNCCLX::WorkStatus::COMPLETED);
207203

208204
// Multiple finalize calls should be safe
@@ -215,8 +211,8 @@ TEST_F(TorchWorkNCCLXQueueTest, MultipleFinalizeCallsAfterGarbageCollect) {
215211

216212
TEST_F(TorchWorkNCCLXQueueTest, GarbageCollectMainThreadFlag) {
217213
// Test that the isMainThread flag doesn't cause issues on empty queue
218-
auto status1 = queue_->garbageCollect(false);
219-
auto status2 = queue_->garbageCollect(true);
214+
auto status1 = queue_->garbageCollect();
215+
auto status2 = queue_->garbageCollect();
220216

221217
EXPECT_EQ(status1, TorchWorkNCCLX::WorkStatus::COMPLETED);
222218
EXPECT_EQ(status2, TorchWorkNCCLX::WorkStatus::COMPLETED);
@@ -232,17 +228,16 @@ TEST_F(TorchWorkNCCLXQueueTest, ConcurrentGarbageCollectCalls) {
232228
// mutex-protected operations work correctly with multiple calls
233229

234230
for (int i = 0; i < 10; ++i) {
235-
auto status =
236-
queue_->garbageCollect(i % 2 == 0); // Alternate main thread flag
231+
auto status = queue_->garbageCollect();
237232
EXPECT_EQ(status, TorchWorkNCCLX::WorkStatus::COMPLETED);
238233
}
239234
}
240235

241236
TEST_F(TorchWorkNCCLXQueueTest, ConcurrentFinalizeAndGarbageCollect) {
242237
// Test that finalize and garbage collect can be called in sequence safely
243-
auto gc_status = queue_->garbageCollect(false);
238+
auto gc_status = queue_->garbageCollect();
244239
auto finalize_status = queue_->finalize();
245-
auto gc_status2 = queue_->garbageCollect(true);
240+
auto gc_status2 = queue_->garbageCollect();
246241

247242
EXPECT_EQ(gc_status, TorchWorkNCCLX::WorkStatus::COMPLETED);
248243
EXPECT_EQ(finalize_status, TorchWorkNCCLX::WorkStatus::COMPLETED);
@@ -278,7 +273,7 @@ TEST_F(TorchWorkNCCLXQueueTest, QueueCreationAndDestruction) {
278273
EXPECT_NE(queue2, nullptr);
279274

280275
// Test basic operations on new queue
281-
auto status = queue2->garbageCollect(false);
276+
auto status = queue2->garbageCollect();
282277
EXPECT_EQ(status, TorchWorkNCCLX::WorkStatus::COMPLETED);
283278

284279
status = queue2->finalize();
@@ -294,8 +289,8 @@ TEST_F(TorchWorkNCCLXQueueTest, MultipleQueuesIndependent) {
294289
auto queue3 = std::make_unique<TorchWorkNCCLXQueue>();
295290

296291
// Operations on different queues should not interfere
297-
auto status1 = queue_->garbageCollect(false);
298-
auto status2 = queue2->garbageCollect(true);
292+
auto status1 = queue_->garbageCollect();
293+
auto status2 = queue2->garbageCollect();
299294
auto status3 = queue3->finalize();
300295

301296
EXPECT_EQ(status1, TorchWorkNCCLX::WorkStatus::COMPLETED);
@@ -322,12 +317,11 @@ TEST_F(TorchWorkNCCLXQueueCommTest, NoLeakedObjectsAfterFinalize) {
322317
auto work = comm_->send(tensor, 1, true); // async send
323318

324319
// Simulate the timeout thread calling checkWorkQueue
325-
checkWorkQueue(/*isMainThread=*/false);
320+
checkWorkQueue();
326321
// Comm finalize will call the work queue finalize().
327322
comm_->finalize();
328323

329324
EXPECT_EQ(getStreamWorkQueues().size(), 0);
330-
EXPECT_EQ(getCompletedWorkQueue().size(), 0);
331325
}
332326

333327
TEST_F(TorchWorkNCCLXQueueCommTest, NoFailureUnderCudaGraphMode) {
@@ -370,12 +364,11 @@ TEST_F(TorchWorkNCCLXQueueCommTest, NoFailureUnderCudaGraphMode) {
370364
auto work = comm_->send(tensor, 1, true); // async send
371365

372366
// Simulate the timeout thread calling checkWorkQueue
373-
checkWorkQueue(/*isMainThread=*/false);
367+
checkWorkQueue();
374368
// Comm finalize will call the work queue finalize().
375369
comm_->finalize();
376370

377371
EXPECT_EQ(getStreamWorkQueues().size(), 0);
378-
EXPECT_EQ(getCompletedWorkQueue().size(), 0);
379372
}
380373

381374
} // namespace comms

0 commit comments

Comments (0)