Skip to content

Commit 6842b90

Browse files
Vedant2005goyal authored and vgvassilev committed
Added a multilayer storage feature to the tape: when RAM gets full, the data can now be offloaded to disk, similar to an LRU cache.
1 parent 1be087a commit 6842b90

File tree

3 files changed

+384
-90
lines changed

3 files changed

+384
-90
lines changed

benchmark/MemoryComplexity.cpp

Lines changed: 79 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
#include "benchmark/benchmark.h"
22

33
#include "clad/Differentiator/Differentiator.h"
4-
4+
#include "clad/Differentiator/Tape.h"
5+
#include <cstddef>
6+
#include <cstdint>
57
namespace {
68
struct MemoryManager : public benchmark::MemoryManager {
79
size_t cur_num_allocs = 0;
@@ -60,21 +62,29 @@ void operator delete(void* p) noexcept {
6062
free(p);
6163
}
6264

63-
template <typename T, std::size_t SBO_SIZE = 64, std::size_t SLAB_SIZE = 1024>
64-
void func(clad::tape<T, SBO_SIZE, SLAB_SIZE>& t, T x, int n) {
65+
template <typename T, std::size_t SBO_SIZE = 64, std::size_t SLAB_SIZE = 1024,
66+
bool DiskOffload = false>
67+
void func(clad::tape_impl<T, SBO_SIZE, SLAB_SIZE, /*is_Multithread=*/false,
68+
DiskOffload>& t,
69+
T x, int n) {
6570
for (int i = 0; i < n; i++)
66-
clad::push<T, SBO_SIZE, SLAB_SIZE>(t, x);
71+
clad::push(t, x);
6772

68-
for (int i = 0; i < n; i++)
69-
benchmark::DoNotOptimize(clad::pop<T, SBO_SIZE, SLAB_SIZE>(t));
73+
for (int i = 0; i < n; i++) {
74+
benchmark::DoNotOptimize(t.back());
75+
t.pop_back();
76+
}
7077
}
7178

7279
static void BM_TapeMemory(benchmark::State& state) {
7380
int block = state.range(0);
7481
AddBMCounterRAII MemCounters(*mm.get(), state);
7582
for (auto _ : state) {
76-
clad::tape<double> t;
77-
func<double>(t, 1, block * 2 + 1);
83+
// Explicitly using false for DiskOffload to test baseline
84+
clad::tape_impl<double, 64, 1024, /*is_Multithread=*/false,
85+
/*DiskOffload=*/false>
86+
t;
87+
func<double, 64, 1024, /*DiskOffload=*/false>(t, 1, block * 2 + 1);
7888
}
7989
}
8090
BENCHMARK(BM_TapeMemory)->RangeMultiplier(2)->Range(0, 4096);
@@ -84,8 +94,11 @@ static void BM_TapeMemory_Templated(benchmark::State& state) {
8494
int block = state.range(0);
8595
AddBMCounterRAII MemCounters(*mm.get(), state);
8696
for (auto _ : state) {
87-
clad::tape<double, SBO_SIZE, SLAB_SIZE> t;
88-
func<double, SBO_SIZE, SLAB_SIZE>(t, 1, block * 2 + 1);
97+
clad::tape_impl<double, SBO_SIZE, SLAB_SIZE, /*is_Multithread=*/false,
98+
/*DiskOffload=*/false>
99+
t;
100+
func<double, SBO_SIZE, SLAB_SIZE, /*DiskOffload=*/false>(t, 1,
101+
block * 2 + 1);
89102
}
90103
}
91104

@@ -98,8 +111,27 @@ static void BM_TapeMemory_Templated(benchmark::State& state) {
98111
REGISTER_TAPE_BENCHMARK(64, 1024);
99112
REGISTER_TAPE_BENCHMARK(32, 512);
100113

101-
#include "BenchmarkedFunctions.h"
114+
// This explicitly tests the case where DiskOffload = true
115+
template <std::size_t SBO_SIZE, std::size_t SLAB_SIZE>
116+
static void BM_Multilayer_Storage(benchmark::State& state) {
117+
int64_t block = state.range(0);
118+
AddBMCounterRAII MemCounters(*mm, state);
119+
for (auto _ : state) {
120+
// Set DiskOffload = true here
121+
clad::tape_impl<double, SBO_SIZE, SLAB_SIZE, /*is_Multithread=*/false,
122+
/*DiskOffload=*/true>
123+
t;
124+
func<double, SBO_SIZE, SLAB_SIZE, /*DiskOffload=*/true>(t, 1,
125+
block * 2 + 1);
126+
}
127+
}
128+
129+
BENCHMARK_TEMPLATE(BM_Multilayer_Storage, 64, 1024)
130+
->RangeMultiplier(2)
131+
->Range(0, 4096)
132+
->Name("BM_Multilayer_Storage/SBO_64_SLAB_1024_DISK");
102133

134+
#include "BenchmarkedFunctions.h"
103135
static void BM_ReverseGausMemoryP(benchmark::State& state) {
104136
auto dfdp_grad = clad::gradient(gaus, "p");
105137
unsigned dim = state.range(0);
@@ -118,5 +150,39 @@ static void BM_ReverseGausMemoryP(benchmark::State& state) {
118150
}
119151
BENCHMARK(BM_ReverseGausMemoryP)->RangeMultiplier(2)->Range(0, 4096);
120152

121-
// Define our main.
122-
BENCHMARK_MAIN();
153+
const size_t TARGET_ELEMENTS = 20000;
154+
155+
static void BM_CrashTest_OS_Paging(benchmark::State& state) {
156+
AddBMCounterRAII MemCounters(*mm, state);
157+
for (auto _ : state) {
158+
clad::tape_impl<double, 64, 1024, /*is_Multithread=*/false,
159+
/*DiskOffload=*/false>
160+
t;
161+
162+
for (size_t i = 0; i < TARGET_ELEMENTS; ++i) {
163+
try {
164+
clad::push(t, 1.0);
165+
} catch (std::bad_alloc& e) {
166+
state.SkipWithError("OS ran out of memory!");
167+
break;
168+
}
169+
}
170+
}
171+
}
172+
173+
BENCHMARK(BM_CrashTest_OS_Paging)->Iterations(1);
174+
175+
static void BM_CrashTest_Clad_Offload(benchmark::State& state) {
176+
AddBMCounterRAII MemCounters(*mm, state);
177+
for (auto _ : state) {
178+
clad::tape_impl<double, 64, 1310720, /*is_Multithread=*/false,
179+
/*DiskOffload=*/true>
180+
t;
181+
182+
for (size_t i = 0; i < TARGET_ELEMENTS; ++i)
183+
clad::push(t, 1.0);
184+
}
185+
}
186+
BENCHMARK(BM_CrashTest_Clad_Offload)->Iterations(1);
187+
188+
BENCHMARK_MAIN();

include/clad/Differentiator/Differentiator.h

Lines changed: 72 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -59,90 +59,107 @@ inline CUDA_HOST_DEVICE unsigned int GetLength(const char* code) {
5959

6060
/// Tape type used for storing values in reverse-mode AD inside loops.
6161
template <typename T, std::size_t SBO_SIZE = 64, std::size_t SLAB_SIZE = 1024,
62-
bool is_multithread = false>
63-
using tape = tape_impl<T, SBO_SIZE, SLAB_SIZE, is_multithread>;
62+
bool is_multithread = false, bool DiskOffload = false>
63+
using tape = tape_impl<T, SBO_SIZE, SLAB_SIZE, is_multithread, DiskOffload>;
6464

6565
/// Add value to the end of the tape, return the same value.
6666
template <typename T, std::size_t SBO_SIZE = 64, std::size_t SLAB_SIZE = 1024,
67-
typename... ArgsT>
68-
CUDA_HOST_DEVICE T& push(tape<T, SBO_SIZE, SLAB_SIZE>& to, ArgsT... val) {
67+
bool DiskOffload = false, typename... ArgsT>
68+
CUDA_HOST_DEVICE T&
69+
push(tape<T, SBO_SIZE, SLAB_SIZE, /*is_multithread=*/false, DiskOffload>& to,
70+
ArgsT... val) {
6971
to.emplace_back(std::forward<ArgsT>(val)...);
7072
return to.back();
7173
}
7274

7375
/// A specialization for C arrays
7476
template <typename T, typename U, size_t N, std::size_t SBO_SIZE = 64,
75-
std::size_t SLAB_SIZE = 1024>
76-
CUDA_HOST_DEVICE void push(tape<T[N], SBO_SIZE, SLAB_SIZE>& to, const U& val) {
77+
std::size_t SLAB_SIZE = 1024, bool DiskOffload = false>
78+
CUDA_HOST_DEVICE void
79+
push(tape<T[N], SBO_SIZE, SLAB_SIZE, /*is_multithread=*/false, DiskOffload>& to,
80+
const U& val) {
7781
to.emplace_back();
7882
std::copy(std::begin(val), std::end(val), std::begin(to.back()));
7983
}
8084

8185
/// Remove the last value from the tape, return it.
82-
template <typename T, std::size_t SBO_SIZE = 64, std::size_t SLAB_SIZE = 1024>
83-
CUDA_HOST_DEVICE T pop(tape<T, SBO_SIZE, SLAB_SIZE>& to) {
84-
T val = std::move(to.back());
85-
to.pop_back();
86-
return val;
87-
}
86+
template <typename T, std::size_t SBO_SIZE = 64, std::size_t SLAB_SIZE = 1024,
87+
bool DiskOffload = false>
88+
CUDA_HOST_DEVICE T
89+
pop(tape<T, SBO_SIZE, SLAB_SIZE, /*is_multithread=*/false, DiskOffload>& to) {
90+
T val = std::move(to.back());
91+
to.pop_back();
92+
return val;
93+
}
8894

8995
/// A specialization for C arrays
90-
template <typename T, std::size_t N, std::size_t SBO_SIZE = 64,
91-
std::size_t SLAB_SIZE = 1024>
92-
CUDA_HOST_DEVICE void pop(tape<T[N], SBO_SIZE, SLAB_SIZE>& to) {
93-
to.pop_back();
94-
}
96+
template <typename T, std::size_t N, std::size_t SBO_SIZE = 64,
97+
std::size_t SLAB_SIZE = 1024, bool DiskOffload = false>
98+
CUDA_HOST_DEVICE void pop(tape<T[N], SBO_SIZE, SLAB_SIZE,
99+
/*is_multithread=*/false, DiskOffload>& to) {
100+
to.pop_back();
101+
}
95102

96103
/// Access return the last value in the tape.
97-
template <typename T> CUDA_HOST_DEVICE T& back(tape<T>& of) {
98-
return of.back();
99-
}
104+
template <typename T, std::size_t SBO_SIZE = 64, std::size_t SLAB_SIZE = 1024,
105+
bool DiskOffload = false>
106+
CUDA_HOST_DEVICE T&
107+
back(tape<T, SBO_SIZE, SLAB_SIZE, /*is_multithread=*/false, DiskOffload>& of) {
108+
return of.back();
109+
}
100110

101111
/// Thread safe tape access functions with mutex locking mechanism
112+
/// Thread safe tape access functions with mutex locking mechanism
102113
#ifndef __CUDACC__
103-
/// Add value to the end of the tape, return the same value.
104-
template <typename T, std::size_t SBO_SIZE = 64, std::size_t SLAB_SIZE = 1024,
105-
typename... ArgsT>
106-
T push(tape<T, SBO_SIZE, SLAB_SIZE, /*is_multithreaded=*/true>& to,
107-
ArgsT&&... val) {
108-
std::lock_guard<std::mutex> lock(to.mutex());
109-
to.emplace_back(std::forward<ArgsT>(val)...);
110-
return to.back();
111-
}
114+
/// Add value to the end of the tape, return the same value.
115+
template <typename T, std::size_t SBO_SIZE = 64, std::size_t SLAB_SIZE = 1024,
116+
bool DiskOffload = false, typename... ArgsT>
117+
T push(tape<T, SBO_SIZE, SLAB_SIZE, /*is_multithreaded=*/true, DiskOffload>& to,
118+
ArgsT&&... val) {
119+
std::lock_guard<std::mutex> lock(to.mutex());
120+
to.emplace_back(std::forward<ArgsT>(val)...);
121+
return to.back();
122+
}
112123

113124
/// A specialization for C arrays
114-
template <typename T, typename U, size_t N, std::size_t SBO_SIZE = 64,
115-
std::size_t SLAB_SIZE = 1024>
116-
void push(tape<T[N], SBO_SIZE, SLAB_SIZE, /*is_multithreaded=*/true>& to,
117-
const U& val) {
118-
std::lock_guard<std::mutex> lock(to.mutex());
119-
to.emplace_back();
120-
std::copy(std::begin(val), std::end(val), std::begin(to.back()));
121-
}
125+
template <typename T, typename U, size_t N, std::size_t SBO_SIZE = 64,
126+
std::size_t SLAB_SIZE = 1024, bool DiskOffload = false>
127+
void push(
128+
tape<T[N], SBO_SIZE, SLAB_SIZE, /*is_multithreaded=*/true, DiskOffload>& to,
129+
const U& val) {
130+
std::lock_guard<std::mutex> lock(to.mutex());
131+
to.emplace_back();
132+
std::copy(std::begin(val), std::end(val), std::begin(to.back()));
133+
}
122134

123135
/// Remove the last value from the tape, return it.
124-
template <typename T, std::size_t SBO_SIZE = 64, std::size_t SLAB_SIZE = 1024>
125-
T pop(tape<T, SBO_SIZE, SLAB_SIZE, /*is_multithreaded=*/true>& to) {
126-
std::lock_guard<std::mutex> lock(to.mutex());
127-
T val = std::move(to.back());
128-
to.pop_back();
129-
return val;
130-
}
136+
template <typename T, std::size_t SBO_SIZE = 64, std::size_t SLAB_SIZE = 1024,
137+
bool DiskOffload = false>
138+
T pop(
139+
tape<T, SBO_SIZE, SLAB_SIZE, /*is_multithreaded=*/true, DiskOffload>& to) {
140+
std::lock_guard<std::mutex> lock(to.mutex());
141+
T val = std::move(to.back());
142+
to.pop_back();
143+
return val;
144+
}
131145

132146
/// A specialization for C arrays
133-
template <typename T, std::size_t N, std::size_t SBO_SIZE = 64,
134-
std::size_t SLAB_SIZE = 1024>
135-
void pop(tape<T[N], SBO_SIZE, SLAB_SIZE, /*is_multithreaded=*/true>& to) {
136-
std::lock_guard<std::mutex> lock(to.mutex());
137-
to.pop_back();
138-
}
147+
template <typename T, std::size_t N, std::size_t SBO_SIZE = 64,
148+
std::size_t SLAB_SIZE = 1024, bool DiskOffload = false>
149+
void pop(tape<T[N], SBO_SIZE, SLAB_SIZE, /*is_multithreaded=*/true,
150+
DiskOffload>& to) {
151+
std::lock_guard<std::mutex> lock(to.mutex());
152+
to.pop_back();
153+
}
139154

140155
/// Access return the last value in the tape.
141-
template <typename T, std::size_t SBO_SIZE = 64, std::size_t SLAB_SIZE = 1024>
142-
T& back(tape<T, SBO_SIZE, SLAB_SIZE, /*is_multithreaded=*/true>& of) {
143-
std::lock_guard<std::mutex> lock(of.mutex());
144-
return of.back();
145-
}
156+
template <typename T, std::size_t SBO_SIZE = 64, std::size_t SLAB_SIZE = 1024,
157+
bool DiskOffload = false>
158+
T& back(
159+
tape<T, SBO_SIZE, SLAB_SIZE, /*is_multithreaded=*/true, DiskOffload>& of) {
160+
std::lock_guard<std::mutex> lock(of.mutex());
161+
return of.back();
162+
}
146163
#endif
147164

148165
/// The purpose of this function is to initialize adjoints

0 commit comments

Comments (0)