[GPU Snapshot] Add Clear History Flag (pytorch#149352)

sraikund16 · amathewc · commit f7db51c40fb8 · 2025-04-17T07:03:12.000+03:00
Summary: Oftentimes, users complain that a bunch of extra events are prepended to their desired GPU snapshot. This is because they usually attach an OOM logger without realizing it, and when they go to collect the actual snapshot, it includes all of the OOM logger's contents. Since OOM and regular snapshots use the same backend, we currently don't have the infra in place to split these snapshots. As a solution, we add a flag to the snapshot frontend to clear out the history when starting the auto-trace record memory history. A more thorough solution would be to have the user pass in a handle and to keep snapshots per handle to separate the events. However, this would likely be complicated and more work than it is worth, as we would have to change the callbacks in the caching allocator and pass these objects between Python and C++. Test Plan: See diff below Differential Revision: D71159720 Pull Request resolved: pytorch#149352 Approved by: https://github.com/eqy, https://github.com/aaronenyeshi
diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp
@@ -1119,14 +1119,15 @@ class DeviceCachingAllocator {
       bool enabled,
       CreateContextFn context_recorder,
       size_t alloc_buffer_max_entries,
-      RecordContext when) {
+      RecordContext when,
+      bool clearHistory) {
     std::unique_lock<std::recursive_mutex> lock(mutex);
     TORCH_CHECK(when == RecordContext::NEVER || context_recorder);
     record_history = enabled;
     context_recorder_.store(record_history ? context_recorder : nullptr);
     alloc_buffer.setMaxEntries(alloc_buffer_max_entries);
     record_context_ = enabled ? when : RecordContext::NEVER;
-    if (!enabled) {
+    if (!enabled || clearHistory) {
       alloc_buffer.clear();
     }
   }
@@ -3441,13 +3442,18 @@ class NativeCachingAllocator : public CUDAAllocator {
       bool enabled,
       CreateContextFn context_recorder,
       size_t alloc_buffer_max_entries,
-      RecordContext when) override {
+      RecordContext when,
+      bool clearHistory) override {
     record_history = enabled;
     annotation_buffer.setMaxEntries(alloc_buffer_max_entries);
     annotation_buffer.clear();
     for (auto& allocator : device_allocator) {
       allocator->recordHistory(
-          enabled, context_recorder, alloc_buffer_max_entries, when);
+          enabled,
+          context_recorder,
+          alloc_buffer_max_entries,
+          when,
+          clearHistory);
     }
   }
 
diff --git a/c10/cuda/CUDACachingAllocator.h b/c10/cuda/CUDACachingAllocator.h
@@ -264,7 +264,8 @@ class CUDAAllocator : public Allocator {
       bool enabled,
       CreateContextFn context_recorder,
       size_t alloc_trace_max_entries,
-      RecordContext when) = 0;
+      RecordContext when,
+      bool clearHistory) = 0;
   virtual void recordAnnotation(
       const std::vector<std::pair<std::string, std::string>>& md) {}
   virtual void attachOutOfMemoryObserver(OutOfMemoryObserver observer) = 0;
@@ -414,9 +415,10 @@ inline void recordHistory(
     bool enabled,
     CreateContextFn context_recorder,
     size_t alloc_trace_max_entries,
-    RecordContext when) {
+    RecordContext when,
+    bool clearHistory) {
   return get()->recordHistory(
-      enabled, context_recorder, alloc_trace_max_entries, when);
+      enabled, context_recorder, alloc_trace_max_entries, when, clearHistory);
 }
 
 inline void recordAnnotation(
diff --git a/c10/cuda/CUDAMallocAsyncAllocator.cpp b/c10/cuda/CUDAMallocAsyncAllocator.cpp
@@ -648,7 +648,8 @@ struct CudaMallocAsyncAllocator : public CUDAAllocator {
       bool enabled,
       CreateContextFn context_recorder,
       size_t alloc_trace_max_entries,
-      RecordContext when) override {
+      RecordContext when,
+      bool clearHistory) override {
     TORCH_CHECK(
         false,
         "cudaMallocAsync does not yet support recordHistory. "
diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in
@@ -1917,12 +1917,14 @@ def _cuda_record_memory_history_legacy(
     record_context_cpp: _bool,
     alloc_trace_max_entries: _int,
     alloc_trace_record_context: _bool,
+    clear_history: _bool,
 ) -> None: ...
 def _cuda_record_memory_history(
     enabled: Optional[str],
     context: Optional[str],
     stacks: str,
-    max_entries
+    max_entries: _int,
+    clear_history: _bool,
 ) -> None: ...
 def _cuda_isHistoryEnabled() -> _bool: ...
 
diff --git a/torch/csrc/cuda/CUDAPluggableAllocator.cpp b/torch/csrc/cuda/CUDAPluggableAllocator.cpp
@@ -290,7 +290,8 @@ void CUDAPluggableAllocator::recordHistory(
     bool enabled,
     c10::cuda::CUDACachingAllocator::CreateContextFn context_recorder,
     size_t alloc_trace_max_entries,
-    c10::cuda::CUDACachingAllocator::RecordContext when) {
+    c10::cuda::CUDACachingAllocator::RecordContext when,
+    bool clearHistory) {
   TORCH_CHECK(
       false,
       "CUDAPluggableAllocator does not yet support recordHistory. "
diff --git a/torch/csrc/cuda/CUDAPluggableAllocator.h b/torch/csrc/cuda/CUDAPluggableAllocator.h
@@ -145,7 +145,8 @@ struct TORCH_CUDA_CPP_API CUDAPluggableAllocator
       bool enabled,
       c10::cuda::CUDACachingAllocator::CreateContextFn context_recorder,
       size_t alloc_trace_max_entries,
-      c10::cuda::CUDACachingAllocator::RecordContext when) override;
+      c10::cuda::CUDACachingAllocator::RecordContext when,
+      bool clearHistory) override;
   void attachOutOfMemoryObserver(
       c10::cuda::CUDACachingAllocator::OutOfMemoryObserver observer) override;
   void attachAllocatorTraceTracker(
diff --git a/torch/csrc/cuda/Module.cpp b/torch/csrc/cuda/Module.cpp
@@ -1128,7 +1128,7 @@ static void registerCudaDeviceProperties(PyObject* module) {
 
   m.def(
       "_cuda_record_memory_history_legacy",
-      static_cast<void (*)(bool, bool, int64_t, bool, bool)>(
+      static_cast<void (*)(bool, bool, int64_t, bool, bool, bool)>(
           torch::cuda::_record_memory_history));
 
   m.def(
@@ -1137,7 +1137,8 @@ static void registerCudaDeviceProperties(PyObject* module) {
           std::optional<std::string>,
           std::optional<std::string>,
           const std::string&,
-          size_t)>(torch::cuda::_record_memory_history));
+          size_t,
+          bool)>(torch::cuda::_record_memory_history));
 
   m.def("_cuda_isHistoryEnabled", []() {
     return c10::cuda::CUDACachingAllocator::isHistoryEnabled();
diff --git a/torch/csrc/cuda/memory_snapshot.cpp b/torch/csrc/cuda/memory_snapshot.cpp
@@ -124,7 +124,8 @@ void _record_memory_history(
     bool record_context,
     int64_t trace_alloc_max_entries,
     bool trace_alloc_record_context,
-    bool record_cpp_context) {
+    bool record_cpp_context,
+    bool clearHistory) {
   c10::cuda::CUDACachingAllocator::CreateContextFn recorder = gather;
   if (enabled && record_cpp_context &&
       (trace_alloc_record_context || record_context)) {
@@ -141,7 +142,7 @@ void _record_memory_history(
   at::globalContext().lazyInitDevice(c10::DeviceType::CUDA);
   _initRecordAnnotations();
   c10::cuda::CUDACachingAllocator::recordHistory(
-      enabled, recorder, trace_alloc_max_entries, when);
+      enabled, recorder, trace_alloc_max_entries, when, clearHistory);
 }
 
 static void checkOptionIn(
@@ -156,7 +157,8 @@ void _record_memory_history(
     std::optional<std::string> enabled,
     std::optional<std::string> context,
     const std::string& stacks,
-    size_t max_entries) {
+    size_t max_entries,
+    bool clearHistory) {
   if (enabled) {
     checkOptionIn(
         *enabled,
@@ -192,7 +194,7 @@ void _record_memory_history(
   at::globalContext().lazyInitDevice(c10::DeviceType::CUDA);
   _initRecordAnnotations();
   c10::cuda::CUDACachingAllocator::recordHistory(
-      enabled.has_value(), recorder, max_entries, when);
+      enabled.has_value(), recorder, max_entries, when, clearHistory);
 }
 
 std::string _memory_snapshot_pickled() {
diff --git a/torch/csrc/cuda/memory_snapshot.h b/torch/csrc/cuda/memory_snapshot.h
@@ -14,13 +14,15 @@ TORCH_CUDA_CU_API void _record_memory_history(
     bool record_context = true,
     int64_t trace_alloc_max_entries = 1,
     bool trace_alloc_record_context = false,
-    bool record_cpp_context = false);
+    bool record_cpp_context = false,
+    bool clearHistory = false);
 
 TORCH_CUDA_CU_API void _record_memory_history(
     std::optional<std::string> enabled = "all",
     std::optional<std::string> context = "all",
     const std::string& stacks = "all",
-    size_t max_entries = SIZE_MAX);
+    size_t max_entries = SIZE_MAX,
+    bool clearHistory = false);
 
 TORCH_CUDA_CU_API std::string _memory_snapshot_pickled();
 
diff --git a/torch/cuda/memory.py b/torch/cuda/memory.py
@@ -843,13 +843,15 @@ def _record_memory_history_legacy(
     trace_alloc_record_context=False,
     device: Union[Device, int] = None,
     record_context_cpp=False,
+    clear_history=False,
 ):
     _C._cuda_record_memory_history_legacy(
         enabled,
         record_context,
         trace_alloc_max_entries,
         trace_alloc_record_context,
         record_context_cpp,
+        clear_history,
     )
 
 
@@ -904,8 +906,9 @@ def _record_memory_history_impl(
     stacks: str = "all",
     max_entries: int = sys.maxsize,
     device: Union[Device, int] = None,
+    clear_history: bool = False,
 ):
-    _C._cuda_record_memory_history(enabled, context, stacks, max_entries)
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
 
 
 _record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]