Skip to content

Commit c4ef409

Browse files
keith2018 authored and pytorchmergebot committed
Fix segfault on exit in CachingHostAllocator by signaling background thread to exit (pytorch#154117)
Fixes pytorch#152008. This PR fixes a segmentation fault that occurred when exiting the program due to improper background thread management in CachingHostAllocator. Previously, the background thread continued running and called process_events() even after the allocator object was destroyed, leading to a crash on exit. https://github.com/pytorch/pytorch/blob/f12d8d60b19083123d810ebda1eb1591dbe3dd3d/aten/src/ATen/core/CachingHostAllocator.h#L218 ```cpp // Launch the background thread and process events in a loop. static bool background_thread_flag [[maybe_unused]] = [this] { getBackgroundThreadPool()->run([&]() { while (true) { process_events(); // <-- This line may cause segfault on exit std::this_thread::sleep_for(std::chrono::microseconds(100)); } }); return true; }(); ``` The fix adds a mechanism to signal the background thread to exit before the object is destructed, ensuring the thread stops safely. Pull Request resolved: pytorch#154117 Approved by: https://github.com/ngimel, https://github.com/cyyever
1 parent 9d922b5 commit c4ef409

File tree

2 files changed

+24
-2
lines changed

2 files changed

+24
-2
lines changed

aten/src/ATen/core/CachingHostAllocator.h

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -177,7 +177,12 @@ template <
177177
typename E,
178178
typename B = HostBlock<S>>
179179
struct CachingHostAllocatorImpl {
180-
virtual ~CachingHostAllocatorImpl() = default;
180+
virtual ~CachingHostAllocatorImpl() {
181+
active_ = false;
182+
if (pinned_use_background_threads()) {
183+
getBackgroundThreadPool()->waitWorkComplete();
184+
}
185+
}
181186

182187
public:
183188
// return data_ptr and block pair.
@@ -214,7 +219,7 @@ struct CachingHostAllocatorImpl {
214219
// Launch the background thread and process events in a loop.
215220
static bool background_thread_flag [[maybe_unused]] = [this] {
216221
getBackgroundThreadPool()->run([&]() {
217-
while (true) {
222+
while (active_) {
218223
process_events();
219224
std::this_thread::sleep_for(std::chrono::microseconds(100));
220225
}
@@ -620,6 +625,10 @@ struct CachingHostAllocatorImpl {
620625

621626
alignas(64) std::mutex events_mutex_;
622627
std::deque<std::pair<E, B*>> events_; // event queue paired with block
628+
629+
// Indicates whether the object is active.
630+
// Set to false in the destructor to signal background threads to stop.
631+
std::atomic<bool> active_{true};
623632
protected:
624633
alignas(64) HostStatsStaged stats_;
625634
};

test/test_cuda.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -328,6 +328,19 @@ def test_pinned_memory_empty_cache(self):
328328
"pinned_use_cuda_host_register:False"
329329
)
330330

331+
def test_pinned_memory_use_background_threads(self):
332+
script = """
333+
import torch
334+
335+
torch.cuda.memory._set_allocator_settings(
336+
f"pinned_use_background_threads:True"
337+
)
338+
t = torch.ones(1024 * 1024, pin_memory=True)
339+
print(t.is_pinned())
340+
"""
341+
proc = subprocess.run([sys.executable, "-c", script], capture_output=True)
342+
self.assertEqual(proc.returncode, 0)
343+
331344
def test_cudart_register(self):
332345
t = torch.ones(20)
333346
self.assertFalse(t.is_pinned())

0 commit comments

Comments
 (0)