Skip to content

Commit 6842b90

Browse files
Vedant2005goyal authored and vgvassilev committed
Added a multilayer storage feature to the tape: when RAM gets full, the data can now be offloaded to disk, similar to an LRU cache.
1 parent 1be087a commit 6842b90

File tree

3 files changed

+384
-90
lines changed

3 files changed

+384
-90
lines changed

benchmark/MemoryComplexity.cpp

Lines changed: 79 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
#include "benchmark/benchmark.h"
22

33
#include "clad/Differentiator/Differentiator.h"
4-
4+
#include "clad/Differentiator/Tape.h"
5+
#include <cstddef>
6+
#include <cstdint>
57
namespace {
68
struct MemoryManager : public benchmark::MemoryManager {
79
size_t cur_num_allocs = 0;
@@ -60,21 +62,29 @@ void operator delete(void* p) noexcept {
6062
free(p);
6163
}
6264

63-
template <typename T, std::size_t SBO_SIZE = 64, std::size_t SLAB_SIZE = 1024>
64-
void func(clad::tape<T, SBO_SIZE, SLAB_SIZE>& t, T x, int n) {
65+
template <typename T, std::size_t SBO_SIZE = 64, std::size_t SLAB_SIZE = 1024,
66+
bool DiskOffload = false>
67+
void func(clad::tape_impl<T, SBO_SIZE, SLAB_SIZE, /*is_Multithread=*/false,
68+
DiskOffload>& t,
69+
T x, int n) {
6570
for (int i = 0; i < n; i++)
66-
clad::push<T, SBO_SIZE, SLAB_SIZE>(t, x);
71+
clad::push(t, x);
6772

68-
for (int i = 0; i < n; i++)
69-
benchmark::DoNotOptimize(clad::pop<T, SBO_SIZE, SLAB_SIZE>(t));
73+
for (int i = 0; i < n; i++) {
74+
benchmark::DoNotOptimize(t.back());
75+
t.pop_back();
76+
}
7077
}
7178

7279
static void BM_TapeMemory(benchmark::State& state) {
7380
int block = state.range(0);
7481
AddBMCounterRAII MemCounters(*mm.get(), state);
7582
for (auto _ : state) {
76-
clad::tape<double> t;
77-
func<double>(t, 1, block * 2 + 1);
83+
// Explicitly using false for DiskOffload to test baseline
84+
clad::tape_impl<double, 64, 1024, /*is_Multithread=*/false,
85+
/*DiskOffload=*/false>
86+
t;
87+
func<double, 64, 1024, /*DiskOffload=*/false>(t, 1, block * 2 + 1);
7888
}
7989
}
8090
BENCHMARK(BM_TapeMemory)->RangeMultiplier(2)->Range(0, 4096);
@@ -84,8 +94,11 @@ static void BM_TapeMemory_Templated(benchmark::State& state) {
8494
int block = state.range(0);
8595
AddBMCounterRAII MemCounters(*mm.get(), state);
8696
for (auto _ : state) {
87-
clad::tape<double, SBO_SIZE, SLAB_SIZE> t;
88-
func<double, SBO_SIZE, SLAB_SIZE>(t, 1, block * 2 + 1);
97+
clad::tape_impl<double, SBO_SIZE, SLAB_SIZE, /*is_Multithread=*/false,
98+
/*DiskOffload=*/false>
99+
t;
100+
func<double, SBO_SIZE, SLAB_SIZE, /*DiskOffload=*/false>(t, 1,
101+
block * 2 + 1);
89102
}
90103
}
91104

@@ -98,8 +111,27 @@ static void BM_TapeMemory_Templated(benchmark::State& state) {
98111
REGISTER_TAPE_BENCHMARK(64, 1024);
99112
REGISTER_TAPE_BENCHMARK(32, 512);
100113

101-
#include "BenchmarkedFunctions.h"
114+
// This explicitly tests the case where DiskOffload = true
115+
template <std::size_t SBO_SIZE, std::size_t SLAB_SIZE>
116+
static void BM_Multilayer_Storage(benchmark::State& state) {
117+
int64_t block = state.range(0);
118+
AddBMCounterRAII MemCounters(*mm, state);
119+
for (auto _ : state) {
120+
// Set DiskOffload = true here
121+
clad::tape_impl<double, SBO_SIZE, SLAB_SIZE, /*is_Multithread=*/false,
122+
/*DiskOffload=*/true>
123+
t;
124+
func<double, SBO_SIZE, SLAB_SIZE, /*DiskOffload=*/true>(t, 1,
125+
block * 2 + 1);
126+
}
127+
}
128+
129+
BENCHMARK_TEMPLATE(BM_Multilayer_Storage, 64, 1024)
130+
->RangeMultiplier(2)
131+
->Range(0, 4096)
132+
->Name("BM_Multilayer_Storage/SBO_64_SLAB_1024_DISK");
102133

134+
#include "BenchmarkedFunctions.h"
103135
static void BM_ReverseGausMemoryP(benchmark::State& state) {
104136
auto dfdp_grad = clad::gradient(gaus, "p");
105137
unsigned dim = state.range(0);
@@ -118,5 +150,39 @@ static void BM_ReverseGausMemoryP(benchmark::State& state) {
118150
}
119151
BENCHMARK(BM_ReverseGausMemoryP)->RangeMultiplier(2)->Range(0, 4096);
120152

121-
// Define our main.
122-
BENCHMARK_MAIN();
153+
const size_t TARGET_ELEMENTS = 20000;
154+
155+
static void BM_CrashTest_OS_Paging(benchmark::State& state) {
156+
AddBMCounterRAII MemCounters(*mm, state);
157+
for (auto _ : state) {
158+
clad::tape_impl<double, 64, 1024, /*is_Multithread=*/false,
159+
/*DiskOffload=*/false>
160+
t;
161+
162+
for (size_t i = 0; i < TARGET_ELEMENTS; ++i) {
163+
try {
164+
clad::push(t, 1.0);
165+
} catch (std::bad_alloc& e) {
166+
state.SkipWithError("OS ran out of memory!");
167+
break;
168+
}
169+
}
170+
}
171+
}
172+
173+
BENCHMARK(BM_CrashTest_OS_Paging)->Iterations(1);
174+
175+
static void BM_CrashTest_Clad_Offload(benchmark::State& state) {
176+
AddBMCounterRAII MemCounters(*mm, state);
177+
for (auto _ : state) {
178+
clad::tape_impl<double, 64, 1310720, /*is_Multithread=*/false,
179+
/*DiskOffload=*/true>
180+
t;
181+
182+
for (size_t i = 0; i < TARGET_ELEMENTS; ++i)
183+
clad::push(t, 1.0);
184+
}
185+
}
186+
BENCHMARK(BM_CrashTest_Clad_Offload)->Iterations(1);
187+
188+
BENCHMARK_MAIN();

include/clad/Differentiator/Differentiator.h

Lines changed: 72 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -59,90 +59,107 @@ inline CUDA_HOST_DEVICE unsigned int GetLength(const char* code) {
5959

6060
/// Tape type used for storing values in reverse-mode AD inside loops.
6161
template <typename T, std::size_t SBO_SIZE = 64, std::size_t SLAB_SIZE = 1024,
62-
bool is_multithread = false>
63-
using tape = tape_impl<T, SBO_SIZE, SLAB_SIZE, is_multithread>;
62+
bool is_multithread = false, bool DiskOffload = false>
63+
using tape = tape_impl<T, SBO_SIZE, SLAB_SIZE, is_multithread, DiskOffload>;
6464

6565
/// Add value to the end of the tape, return the same value.
6666
template <typename T, std::size_t SBO_SIZE = 64, std::size_t SLAB_SIZE = 1024,
67-
typename... ArgsT>
68-
CUDA_HOST_DEVICE T& push(tape<T, SBO_SIZE, SLAB_SIZE>& to, ArgsT... val) {
67+
bool DiskOffload = false, typename... ArgsT>
68+
CUDA_HOST_DEVICE T&
69+
push(tape<T, SBO_SIZE, SLAB_SIZE, /*is_multithread=*/false, DiskOffload>& to,
70+
ArgsT... val) {
6971
to.emplace_back(std::forward<ArgsT>(val)...);
7072
return to.back();
7173
}
7274

7375
/// A specialization for C arrays
7476
template <typename T, typename U, size_t N, std::size_t SBO_SIZE = 64,
75-
std::size_t SLAB_SIZE = 1024>
76-
CUDA_HOST_DEVICE void push(tape<T[N], SBO_SIZE, SLAB_SIZE>& to, const U& val) {
77+
std::size_t SLAB_SIZE = 1024, bool DiskOffload = false>
78+
CUDA_HOST_DEVICE void
79+
push(tape<T[N], SBO_SIZE, SLAB_SIZE, /*is_multithread=*/false, DiskOffload>& to,
80+
const U& val) {
7781
to.emplace_back();
7882
std::copy(std::begin(val), std::end(val), std::begin(to.back()));
7983
}
8084

8185
/// Remove the last value from the tape, return it.
82-
template <typename T, std::size_t SBO_SIZE = 64, std::size_t SLAB_SIZE = 1024>
83-
CUDA_HOST_DEVICE T pop(tape<T, SBO_SIZE, SLAB_SIZE>& to) {
84-
T val = std::move(to.back());
85-
to.pop_back();
86-
return val;
87-
}
86+
template <typename T, std::size_t SBO_SIZE = 64, std::size_t SLAB_SIZE = 1024,
87+
bool DiskOffload = false>
88+
CUDA_HOST_DEVICE T
89+
pop(tape<T, SBO_SIZE, SLAB_SIZE, /*is_multithread=*/false, DiskOffload>& to) {
90+
T val = std::move(to.back());
91+
to.pop_back();
92+
return val;
93+
}
8894

8995
/// A specialization for C arrays
90-
template <typename T, std::size_t N, std::size_t SBO_SIZE = 64,
91-
std::size_t SLAB_SIZE = 1024>
92-
CUDA_HOST_DEVICE void pop(tape<T[N], SBO_SIZE, SLAB_SIZE>& to) {
93-
to.pop_back();
94-
}
96+
template <typename T, std::size_t N, std::size_t SBO_SIZE = 64,
97+
std::size_t SLAB_SIZE = 1024, bool DiskOffload = false>
98+
CUDA_HOST_DEVICE void pop(tape<T[N], SBO_SIZE, SLAB_SIZE,
99+
/*is_multithread=*/false, DiskOffload>& to) {
100+
to.pop_back();
101+
}
95102

96103
/// Access return the last value in the tape.
97-
template <typename T> CUDA_HOST_DEVICE T& back(tape<T>& of) {
98-
return of.back();
99-
}
104+
template <typename T, std::size_t SBO_SIZE = 64, std::size_t SLAB_SIZE = 1024,
105+
bool DiskOffload = false>
106+
CUDA_HOST_DEVICE T&
107+
back(tape<T, SBO_SIZE, SLAB_SIZE, /*is_multithread=*/false, DiskOffload>& of) {
108+
return of.back();
109+
}
100110

101111
/// Thread safe tape access functions with mutex locking mechanism
112+
/// Thread safe tape access functions with mutex locking mechanism
102113
#ifndef __CUDACC__
103-
/// Add value to the end of the tape, return the same value.
104-
template <typename T, std::size_t SBO_SIZE = 64, std::size_t SLAB_SIZE = 1024,
105-
typename... ArgsT>
106-
T push(tape<T, SBO_SIZE, SLAB_SIZE, /*is_multithreaded=*/true>& to,
107-
ArgsT&&... val) {
108-
std::lock_guard<std::mutex> lock(to.mutex());
109-
to.emplace_back(std::forward<ArgsT>(val)...);
110-
return to.back();
111-
}
114+
/// Add value to the end of the tape, return the same value.
115+
template <typename T, std::size_t SBO_SIZE = 64, std::size_t SLAB_SIZE = 1024,
116+
bool DiskOffload = false, typename... ArgsT>
117+
T push(tape<T, SBO_SIZE, SLAB_SIZE, /*is_multithreaded=*/true, DiskOffload>& to,
118+
ArgsT&&... val) {
119+
std::lock_guard<std::mutex> lock(to.mutex());
120+
to.emplace_back(std::forward<ArgsT>(val)...);
121+
return to.back();
122+
}
112123

113124
/// A specialization for C arrays
114-
template <typename T, typename U, size_t N, std::size_t SBO_SIZE = 64,
115-
std::size_t SLAB_SIZE = 1024>
116-
void push(tape<T[N], SBO_SIZE, SLAB_SIZE, /*is_multithreaded=*/true>& to,
117-
const U& val) {
118-
std::lock_guard<std::mutex> lock(to.mutex());
119-
to.emplace_back();
120-
std::copy(std::begin(val), std::end(val), std::begin(to.back()));
121-
}
125+
template <typename T, typename U, size_t N, std::size_t SBO_SIZE = 64,
126+
std::size_t SLAB_SIZE = 1024, bool DiskOffload = false>
127+
void push(
128+
tape<T[N], SBO_SIZE, SLAB_SIZE, /*is_multithreaded=*/true, DiskOffload>& to,
129+
const U& val) {
130+
std::lock_guard<std::mutex> lock(to.mutex());
131+
to.emplace_back();
132+
std::copy(std::begin(val), std::end(val), std::begin(to.back()));
133+
}
122134

123135
/// Remove the last value from the tape, return it.
124-
template <typename T, std::size_t SBO_SIZE = 64, std::size_t SLAB_SIZE = 1024>
125-
T pop(tape<T, SBO_SIZE, SLAB_SIZE, /*is_multithreaded=*/true>& to) {
126-
std::lock_guard<std::mutex> lock(to.mutex());
127-
T val = std::move(to.back());
128-
to.pop_back();
129-
return val;
130-
}
136+
template <typename T, std::size_t SBO_SIZE = 64, std::size_t SLAB_SIZE = 1024,
137+
bool DiskOffload = false>
138+
T pop(
139+
tape<T, SBO_SIZE, SLAB_SIZE, /*is_multithreaded=*/true, DiskOffload>& to) {
140+
std::lock_guard<std::mutex> lock(to.mutex());
141+
T val = std::move(to.back());
142+
to.pop_back();
143+
return val;
144+
}
131145

132146
/// A specialization for C arrays
133-
template <typename T, std::size_t N, std::size_t SBO_SIZE = 64,
134-
std::size_t SLAB_SIZE = 1024>
135-
void pop(tape<T[N], SBO_SIZE, SLAB_SIZE, /*is_multithreaded=*/true>& to) {
136-
std::lock_guard<std::mutex> lock(to.mutex());
137-
to.pop_back();
138-
}
147+
template <typename T, std::size_t N, std::size_t SBO_SIZE = 64,
148+
std::size_t SLAB_SIZE = 1024, bool DiskOffload = false>
149+
void pop(tape<T[N], SBO_SIZE, SLAB_SIZE, /*is_multithreaded=*/true,
150+
DiskOffload>& to) {
151+
std::lock_guard<std::mutex> lock(to.mutex());
152+
to.pop_back();
153+
}
139154

140155
/// Access return the last value in the tape.
141-
template <typename T, std::size_t SBO_SIZE = 64, std::size_t SLAB_SIZE = 1024>
142-
T& back(tape<T, SBO_SIZE, SLAB_SIZE, /*is_multithreaded=*/true>& of) {
143-
std::lock_guard<std::mutex> lock(of.mutex());
144-
return of.back();
145-
}
156+
template <typename T, std::size_t SBO_SIZE = 64, std::size_t SLAB_SIZE = 1024,
157+
bool DiskOffload = false>
158+
T& back(
159+
tape<T, SBO_SIZE, SLAB_SIZE, /*is_multithreaded=*/true, DiskOffload>& of) {
160+
std::lock_guard<std::mutex> lock(of.mutex());
161+
return of.back();
162+
}
146163
#endif
147164

148165
/// The purpose of this function is to initialize adjoints

0 commit comments

Comments (0)