Skip to content

Commit 4b43915

Browse files
authored
Add helper functions to clear MatX caches and allocations (#1092)
* Add helper functions to clear MatX caches and allocations MatX caches (e.g., cuFFT plans or cuBLAS handles) and allocations (user or internal allocations via matxAlloc) are stored in static data structures. These data structures are destroyed during program exit using the corresponding destructors. However, due to ordering of static destructors and atexit handlers, it is possible for resources to be freed after the CUDA context or some other dependent resource has been destroyed. This can result in a segmentation fault during program exit. The helper function matx::ClearCachesAndAllocations() can be called prior to program exit to free resources allocated by MatX. This will prevent conflicts with other static destructors and atexit handlers and thus allow clean shutdown. The function may also be useful at other times that the user wishes to free resources allocated via MatX. There are two other helpers, matx::ClearCaches() and matx::FreeAllocations(), to deallocate data associated with the caches (plans, handles, workspaces, etc.) and allocations made via matxAlloc(), respectively. Signed-off-by: Thomas Benson <[email protected]> * Add new test case for clearing the cache/allocations Signed-off-by: Thomas Benson <[email protected]> --------- Signed-off-by: Thomas Benson <[email protected]>
1 parent f4117f3 commit 4b43915

File tree

6 files changed

+207
-11
lines changed

6 files changed

+207
-11
lines changed

examples/channelize_poly_bench.cu

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,5 +139,7 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
139139
// printf("Benchmarking complex<double> -> complex<double>\n");
140140
// ChannelizePolyBench<cuda::std::complex<double>,cuda::std::complex<double>>(channel_start, channel_stop);
141141

142+
matx::ClearCachesAndAllocations();
143+
142144
MATX_EXIT_HANDLER();
143145
}

include/matx/core/allocator.h

Lines changed: 31 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -103,11 +103,11 @@ struct MemTracker {
103103
iter->second.stream = stream;
104104
}
105105

106+
// deallocate_internal assumes that the caller has already acquired the memory_mtx mutex.
106107
template <typename StreamType>
107108
auto deallocate_internal(void *ptr, [[maybe_unused]] StreamType st) {
108109
MATX_NVTX_START("", matx::MATX_NVTX_LOG_INTERNAL)
109110

110-
[[maybe_unused]] std::unique_lock lck(memory_mtx);
111111
auto iter = allocationMap.find(ptr);
112112

113113
if (iter == allocationMap.end()) {
@@ -159,10 +159,12 @@ struct MemTracker {
159159
struct valid_stream_t { cudaStream_t stream; };
160160

161161
auto deallocate(void *ptr) {
162+
[[maybe_unused]] std::unique_lock lck(memory_mtx);
162163
deallocate_internal(ptr, no_stream_t{});
163164
}
164165

165166
auto deallocate(void *ptr, cudaStream_t stream) {
167+
[[maybe_unused]] std::unique_lock lck(memory_mtx);
166168
deallocate_internal(ptr, valid_stream_t{stream});
167169
}
168170

@@ -256,11 +258,23 @@ struct MemTracker {
256258
return MATX_INVALID_MEMORY;
257259
}
258260

259-
~MemTracker() {
260-
while (allocationMap.size()) {
261-
deallocate(allocationMap.begin()->first);
261+
void free_all() {
262+
[[maybe_unused]] std::unique_lock lck(memory_mtx);
263+
while (! allocationMap.empty()) {
264+
auto it = allocationMap.begin();
265+
const auto ptr = it->first;
266+
deallocate_internal(ptr, no_stream_t{});
267+
if (allocationMap.find(ptr) != allocationMap.end()) {
268+
// deallocate_internal may have erased the pointer from the map
269+
// If not, erase it here to avoid an infinite loop.
270+
allocationMap.erase(ptr);
271+
}
262272
}
263273
}
274+
275+
~MemTracker() {
276+
free_all();
277+
}
264278
};
265279

266280

@@ -271,6 +285,19 @@ __MATX_INLINE__ MemTracker &GetAllocMap() {
271285
return tracker;
272286
}
273287

288+
// Helper function to free all MatX allocations. This function frees all allocations
289+
// made with matxAlloc. These allocations may have been made directly by the user or they
290+
// may have been made by MatX internally for workspaces. This function does not free the
291+
// caches (i.e., allocations made for FFT plans, cuBLAS handles, and other state required
292+
// for MatX transforms). To free those caches, use matx::ClearCaches(). It is not safe to
293+
// call matxFree() on user-managed pointers after calling this function. This function should
294+
// be called after the user application has called matxFree() on any pointers for which it
295+
// will call matxFree().
296+
__attribute__ ((visibility ("default")))
297+
__MATX_INLINE__ void FreeAllocations() {
298+
GetAllocMap().free_all();
299+
}
300+
274301
/**
275302
* @brief Determine if a pointer is printable by the host
276303
*

include/matx/core/cache.h

Lines changed: 78 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,9 @@ struct LTOIRData {
9393

9494
static constexpr size_t MAX_CUDA_DEVICES_PER_SYSTEM = 16;
9595
using CacheId = uint64_t;
96+
// Type-erased deleter for one cache type: given the std::any holding that
// type's cache map, destroys all of its entries. One helper is registered
// per CacheId (see GetCacheIdFromType / CacheRegistry).
struct CacheFreeHelper {
  void (*free)(std::any&);
};
9699

97100
// Common cache parameters that every cache entry needs
98101
struct CacheCommonParamsKey {
@@ -118,13 +121,34 @@ __attribute__ ((visibility ("default")))
118121
inline cuda::std::atomic<CacheId> CacheIdCounter{0};
119122
inline std::recursive_mutex cache_mtx; ///< Mutex protecting updates from map
120123
inline std::recursive_mutex ltoir_mutex; ///< Mutex protecting LTOIR cache operations
124+
inline std::recursive_mutex stream_alloc_mutex; ///< Mutex protecting stream allocation cache operations
125+
126+
inline auto& CacheRegistry() {
127+
// Protected by cache_mtx
128+
static std::unordered_map<CacheId, CacheFreeHelper> registry;
129+
return registry;
130+
}
121131

122132
template<typename CacheType>
123133
__attribute__ ((visibility ("default")))
124134
CacheId GetCacheIdFromType()
125135
{
126136
static CacheId id = CacheIdCounter.fetch_add(1);
127-
137+
[[maybe_unused]] std::lock_guard<std::recursive_mutex> lock(cache_mtx);
138+
auto &registry = CacheRegistry();
139+
if (registry.find(id) != registry.end()) {
140+
// Registry already contains this ID, so no need to insert it again
141+
// with its CacheFreHelper.
142+
return id;
143+
}
144+
registry.emplace(id, CacheFreeHelper{
145+
.free = [](std::any& any) -> void {
146+
using CacheMap = std::unordered_map<CacheCommonParamsKey, CacheType, CacheCommonParamsKeyHash>;
147+
// This clear is the unordered_map's clear, which will ultimately call the
148+
// destructors of the cache entries.
149+
std::any_cast<CacheMap&>(any).clear();
150+
},
151+
});
128152
return id;
129153
}
130154

@@ -144,10 +168,7 @@ class matxCache_t {
144168
public:
145169
matxCache_t() {}
146170
~matxCache_t() {
147-
// Destroy all outstanding objects in the cache to free memory
148-
for (auto &[k, v]: cache) {
149-
v.reset();
150-
}
171+
ClearAll();
151172
}
152173

153174
/**
@@ -165,6 +186,38 @@ class matxCache_t {
165186
std::any_cast<CacheMap&>(el->second).clear();
166187
}
167188

189+
void ClearAll() {
190+
// Clear all cache entries for all cache types
191+
{
192+
[[maybe_unused]] std::lock_guard<std::recursive_mutex> lock(cache_mtx);
193+
for (auto &[id, v]: cache) {
194+
auto entry = CacheRegistry().find(id);
195+
if (entry == CacheRegistry().end()) {
196+
continue;
197+
}
198+
auto &info = entry->second;
199+
info.free(v);
200+
}
201+
cache.clear();
202+
}
203+
{
204+
[[maybe_unused]] std::lock_guard<std::recursive_mutex> lock(stream_alloc_mutex);
205+
for (auto &[outer_key, inner_map]: stream_alloc_cache) {
206+
for (auto &[inner_key, value]: inner_map) {
207+
if (value.ptr) {
208+
matxFree(value.ptr);
209+
}
210+
}
211+
inner_map.clear();
212+
}
213+
stream_alloc_cache.clear();
214+
}
215+
{
216+
[[maybe_unused]] std::lock_guard<std::recursive_mutex> lock(ltoir_mutex);
217+
ltoir_cache.clear();
218+
}
219+
}
220+
168221
template <typename CacheType, typename InParams, typename MakeFun, typename ExecFun, typename Executor>
169222
void LookupAndExec(const CacheId &id, const InParams &params, const MakeFun &mfun, const ExecFun &efun, [[maybe_unused]] const Executor &exec) {
170223
// This mutex should eventually be finer-grained so each transform doesn't get blocked by others
@@ -211,6 +264,8 @@ class matxCache_t {
211264
key.thread_id = std::this_thread::get_id();
212265
cudaGetDevice(&key.device_id);
213266

267+
[[maybe_unused]] std::lock_guard<std::recursive_mutex> lock(stream_alloc_mutex);
268+
214269
auto &common_params_cache = stream_alloc_cache[key];
215270
auto el = common_params_cache.find(stream);
216271
if (el == common_params_cache.end()) {
@@ -689,8 +744,25 @@ __MATX_INLINE__ matxCache_t &GetCache() {
689744
return InitCache();
690745
}
691746

747+
} // namespace detail
692748

749+
// Helper function to free all MatX caches. This function frees caches created for
750+
// FFT plans, cuBLAS handles, and other state required for MatX transforms. This
751+
// function does not clear the allocator cache (i.e., allocations made with matxAlloc
752+
// other than those created to support transforms).
753+
// To free the allocator cache, use matx::FreeAllocations().
754+
__attribute__ ((visibility ("default")))
755+
__MATX_INLINE__ void ClearCaches() {
756+
detail::GetCache().ClearAll();
757+
}
693758

759+
// Helper function to clear both MatX caches and allocations. This provides a single
760+
// function that can be called prior to program exit to support clean shutdown
761+
// (i.e., to avoid issues with the order of destruction of static objects and CUDA contexts).
762+
__attribute__ ((visibility ("default")))
763+
__MATX_INLINE__ void ClearCachesAndAllocations() {
764+
ClearCaches();
765+
FreeAllocations();
766+
}
694767

695-
} // namespace detail
696768
}; // namespace matx

test/00_misc/ClearCacheTests.cu

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
////////////////////////////////////////////////////////////////////////////////
2+
// BSD 3-Clause License
3+
//
4+
// Copyright (c) 2021, NVIDIA Corporation
5+
// All rights reserved.
6+
//
7+
// Redistribution and use in source and binary forms, with or without
8+
// modification, are permitted provided that the following conditions are met:
9+
//
10+
// 1. Redistributions of source code must retain the above copyright notice, this
11+
// list of conditions and the following disclaimer.
12+
//
13+
// 2. Redistributions in binary form must reproduce the above copyright notice,
14+
// this list of conditions and the following disclaimer in the documentation
15+
// and/or other materials provided with the distribution.
16+
//
17+
// 3. Neither the name of the copyright holder nor the names of its
18+
// contributors may be used to endorse or promote products derived from
19+
// this software without specific prior written permission.
20+
//
21+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22+
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23+
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24+
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
25+
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26+
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27+
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28+
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29+
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30+
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31+
/////////////////////////////////////////////////////////////////////////////////
32+
33+
#include "assert.h"
34+
#include "matx.h"
35+
#include "test_types.h"
36+
#include "utilities.h"
37+
#include "gtest/gtest.h"
38+
#include <iostream>
39+
#include <vector>
40+
#include <unordered_map>
41+
42+
using namespace matx;
43+
44+
// Verifies that matx::ClearCachesAndAllocations() returns device memory:
// free memory after clearing must exceed free memory after the matmul
// warm-up plus a manual 4 MiB matxAlloc.
TEST(ClearCacheTests, TestCase) {
  MATX_ENTER_HANDLER();

  // Baseline free device memory before any MatX work.
  size_t initial_free_mem = 0;
  size_t total_mem = 0;
  cudaError_t err = cudaMemGetInfo(&initial_free_mem, &total_mem);
  ASSERT_EQ(err, cudaSuccess);

  // The cuBLAS handle will allocate an associated workspace of 4 MiB on pre-Hopper and
  // 32 MiB on Hopper+.
  {
    auto c = matx::make_tensor<float, 2>({1024, 1024});
    auto a = matx::make_tensor<float, 2>({1024, 1024});
    auto b = matx::make_tensor<float, 2>({1024, 1024});
    (c = matx::matmul(a, b)).run();
    // Check the sync result rather than discarding it.
    ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess);
  }

  // Manually allocate 4 MiB so FreeAllocations() has a user allocation to free.
  const size_t four_MiB = 4 * 1024 * 1024;
  void *ptr = nullptr;
  matxAlloc(&ptr, four_MiB, MATX_DEVICE_MEMORY);
  ASSERT_NE(ptr, nullptr);

  size_t post_alloc_free_mem = 0;
  err = cudaMemGetInfo(&post_alloc_free_mem, &total_mem);
  ASSERT_EQ(err, cudaSuccess);

  matx::ClearCachesAndAllocations();

  size_t post_clear_free_mem = 0;
  err = cudaMemGetInfo(&post_clear_free_mem, &total_mem);
  ASSERT_EQ(err, cudaSuccess);

  const ssize_t allocated = static_cast<ssize_t>(initial_free_mem) - static_cast<ssize_t>(post_alloc_free_mem);
  const ssize_t freed = static_cast<ssize_t>(post_clear_free_mem) - static_cast<ssize_t>(post_alloc_free_mem);

  // The cuBLAS cache and allocator data structure should have allocated at
  // least 8 MiB in total, so at least 8 MiB should be freed by the clear.
  // Cast the bound to ssize_t to avoid a signed/unsigned comparison.
  const ssize_t min_bytes = static_cast<ssize_t>(2 * four_MiB);
  ASSERT_GE(allocated, min_bytes);
  ASSERT_GE(freed, min_bytes);

  MATX_EXIT_HANDLER();
}

test/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ list(TRANSFORM OPERATOR_TEST_FILES PREPEND "00_operators/")
55

66
set (test_sources
77
00_misc/AllocatorTests.cu
8+
00_misc/ClearCacheTests.cu
89
00_misc/ProfilingTests.cu
910
00_tensor/BasicTensorTests.cu
1011
00_tensor/CUBTests.cu
@@ -141,6 +142,8 @@ endforeach()
141142
# Number of test jobs to run in parallel
142143
set(CTEST_PARALLEL_JOBS 4)
143144

145+
set_tests_properties(test_00_misc_ClearCacheTests PROPERTIES RUN_SERIAL TRUE)
146+
144147
# Create a legacy matx_test script for CI compatibility
145148
configure_file(
146149
${CMAKE_CURRENT_SOURCE_DIR}/matx_test.sh

test/main.cu

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,10 +33,16 @@
3333
#include "gtest/gtest.h"
3434
#include <pybind11/embed.h>
3535

36+
#include "matx.h"
37+
3638
int main(int argc, char **argv)
3739
{
3840
printf("Running MatX unit tests. Press Ctrl+\\ (SIGQUIT) to kill tests\n");
3941

4042
::testing::InitGoogleTest(&argc, argv);
41-
return RUN_ALL_TESTS();
43+
const int result = RUN_ALL_TESTS();
44+
45+
matx::ClearCachesAndAllocations();
46+
47+
return result;
4248
}

0 commit comments

Comments
 (0)