Skip to content

Commit 890fe37

Browse files
authored
Add gpu recorder (#5471)
1 parent a2fdb95 commit 890fe37

File tree

3 files changed

+258
-17
lines changed

3 files changed

+258
-17
lines changed

source/module_base/memory.cpp

Lines changed: 201 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,19 @@ int Memory::n_memory = 1000;
2525
int Memory::n_now = 0;
2626
bool Memory::init_flag = false;
2727

28+
#if defined(__CUDA) || defined(__ROCM)
29+
30+
// State of the GPU memory recorder.
// The three tables below are allocated on the first record_gpu() call
// (n_memory entries each) and released with delete[] in finish().
bool Memory::init_flag_gpu = false; // true once the tables have been allocated
int Memory::n_now_gpu = 0;          // number of table entries currently in use
double Memory::total_gpu = 0.0;     // running GPU total, accumulated in MB by record_gpu()

std::string *Memory::class_name_gpu = nullptr; // class name stored with each record
std::string *Memory::name_gpu = nullptr;       // record name, used as the lookup key
double *Memory::consume_gpu = nullptr;         // amount recorded for each entry
37+
38+
#endif
39+
40+
2841
std::string *Memory::name;
2942
std::string *Memory::class_name;
3043
double *Memory::consume;
@@ -208,6 +221,126 @@ void Memory::record
208221
return;
209222
}
210223

224+
#if defined(__CUDA) || defined(__ROCM)
225+
226+
double Memory::record_gpu
227+
(
228+
const std::string &class_name_in,
229+
const std::string &name_in,
230+
const long &n_in,
231+
const std::string &type,
232+
const bool accumulate
233+
)
234+
{
235+
if(!Memory::init_flag_gpu)
236+
{
237+
name_gpu = new std::string[n_memory];
238+
class_name_gpu = new std::string[n_memory];
239+
consume_gpu = new double[n_memory];
240+
for(int i=0;i<n_memory;i++)
241+
{
242+
consume_gpu[i] = 0.0;
243+
}
244+
Memory::init_flag_gpu = true;
245+
}
246+
247+
int find = 0;
248+
for(find = 0; find < n_now_gpu; find++)
249+
{
250+
if( name_in == name_gpu[find] )
251+
{
252+
break;
253+
}
254+
}
255+
256+
// find == n_now : found a new record.
257+
if(find == n_now_gpu)
258+
{
259+
n_now_gpu++;
260+
name_gpu[find] = name_in;
261+
class_name_gpu[find] = class_name_in;
262+
}
263+
if(n_now_gpu >= n_memory)
264+
{
265+
std::cout<<" Error! Too many gpu memories required.";
266+
return 0.0;
267+
}
268+
269+
consume_gpu[find] = Memory::calculate_mem(n_in,type);
270+
271+
if(consume_gpu[find] > 5)
272+
{
273+
print(find);
274+
}
275+
return consume_gpu[find];
276+
}
277+
278+
void Memory::record_gpu
279+
(
280+
const std::string &name_in,
281+
const size_t &n_in,
282+
const bool accumulate
283+
)
284+
{
285+
if(!Memory::init_flag_gpu)
286+
{
287+
name_gpu = new std::string[n_memory];
288+
class_name_gpu = new std::string[n_memory];
289+
consume_gpu = new double[n_memory];
290+
for(int i=0;i<n_memory;i++)
291+
{
292+
consume_gpu[i] = 0.0;
293+
}
294+
Memory::init_flag_gpu = true;
295+
}
296+
297+
int find = 0;
298+
for(find = 0; find < n_now_gpu; find++)
299+
{
300+
if( name_in == name_gpu[find] )
301+
{
302+
break;
303+
}
304+
}
305+
306+
// find == n_now : found a new record.
307+
if(find == n_now_gpu)
308+
{
309+
n_now_gpu++;
310+
name_gpu[find] = name_in;
311+
class_name_gpu[find] = "";
312+
}
313+
if(n_now_gpu >= n_memory)
314+
{
315+
std::cout<<" Error! Too many gpu memories has been recorded.";
316+
return;
317+
}
318+
319+
const double factor = 1.0/1024.0/1024.0;
320+
double size_mb = n_in * factor;
321+
322+
if(accumulate)
323+
{
324+
consume_gpu[find] += size_mb;
325+
Memory::total_gpu += size_mb;
326+
}
327+
else
328+
{
329+
if(consume_gpu[find] < size_mb)
330+
{
331+
Memory::total_gpu += size_mb - consume_gpu[find];
332+
consume_gpu[find] = size_mb;
333+
if(consume_gpu[find] > 5)
334+
{
335+
print(find);
336+
}
337+
}
338+
}
339+
return;
340+
}
341+
342+
#endif
343+
211344
void Memory::print(const int find)
212345
{
213346
GlobalV::ofs_running <<"\n Warning_Memory_Consuming allocated: "
@@ -226,19 +359,34 @@ void Memory::finish(std::ofstream &ofs)
226359
delete[] consume;
227360
init_flag = false;
228361
}
362+
#if defined(__CUDA) || defined(__ROCM)
363+
if(init_flag_gpu)
364+
{
365+
delete[] name_gpu;
366+
delete[] class_name_gpu;
367+
delete[] consume_gpu;
368+
}
369+
#endif
229370
return;
230371
}
231372

232373
void Memory::print_all(std::ofstream &ofs)
233374
{
234-
if(!init_flag)
375+
if(!init_flag
376+
#if defined(__CUDA) || defined(__ROCM)
377+
&& !init_flag_gpu
378+
#endif
379+
)
235380
{
236381
return;
237382
}
238383

239384
const double small = 1.0; // unit is MB
240385
#ifdef __MPI
241386
Parallel_Reduce::reduce_all(Memory::total);
387+
#if defined(__CUDA) || defined(__ROCM)
388+
Parallel_Reduce::reduce_all(Memory::total_gpu);
389+
#endif
242390
#endif
243391
ofs <<"\n NAME-------------------------|MEMORY(MB)--------" << std::endl;
244392
ofs <<std::setw(30)<< "total" << std::setw(15) <<std::setprecision(4)<< Memory::total << std::endl;
@@ -254,23 +402,7 @@ void Memory::print_all(std::ofstream &ofs)
254402

255403
for (int i=0; i<n_memory; i++)
256404
{
257-
// int k = 0;
258-
// double tmp = -1.0;
259-
// for(int j=0; j<n_memory; j++)
260-
// {
261-
// if(print_flag[j])
262-
// {
263-
// continue;
264-
// }
265-
// else if(tmp < consume[j])
266-
// {
267-
// k = j;
268-
// tmp = consume[j];
269-
// }
270-
// }
271-
// print_flag[k] = true;
272405
#ifdef __MPI
273-
// Parallel_Reduce::reduce_all(consume[k]);
274406
Parallel_Reduce::reduce_all(consume[i]);
275407
#endif
276408
}
@@ -304,6 +436,58 @@ void Memory::print_all(std::ofstream &ofs)
304436

305437
}
306438

439+
#if defined(__CUDA) || defined(__ROCM)
440+
ofs <<"\n NAME-------------------------|GPU MEMORY(MB)----" << std::endl;
441+
ofs <<std::setw(30)<< "total" << std::setw(15) <<std::setprecision(4)<< Memory::total_gpu << std::endl;
442+
443+
assert(n_memory>0);
444+
445+
bool *print_flag_gpu = new bool[n_memory];
446+
447+
for(int i=0; i<n_memory; i++)
448+
{
449+
print_flag_gpu[i] = false;
450+
}
451+
452+
for (int i=0; i<n_memory; i++)
453+
{
454+
#ifdef __MPI
455+
Parallel_Reduce::reduce_all(consume_gpu[i]);
456+
#endif
457+
}
458+
459+
for (int i=0; i<n_memory; i++) // Xiaoyang fix memory record sum bug 2023/10/25
460+
{
461+
int k = 0;
462+
double tmp = -1.0;
463+
for(int j=0; j<n_memory; j++)
464+
{
465+
if(print_flag_gpu[j])
466+
{
467+
continue;
468+
}
469+
else if(tmp < consume_gpu[j])
470+
{
471+
k = j;
472+
tmp = consume_gpu[j];
473+
}
474+
}
475+
print_flag_gpu[k] = true;
476+
if ( consume_gpu[k] < small )
477+
{
478+
continue;
479+
}
480+
else
481+
{
482+
ofs << std::setw(30) << name_gpu[k]
483+
<< std::setw(15) << consume_gpu[k] << std::endl;
484+
}
485+
486+
}
487+
488+
delete[] print_flag_gpu;
489+
#endif
490+
307491
ofs<<" ------------- < 1.0 MB has been ignored ----------------"<<std::endl;
308492
ofs<<" ----------------------------------------------------------"<<std::endl;
309493

source/module_base/memory.h

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,39 @@ class Memory
4949
const bool accumulate = false
5050
);
5151

52+
#if defined(__CUDA) || defined(__ROCM)
53+
54+
/**
55+
* @brief Record memory consumed on gpu during computation
56+
*
57+
* @param class_name The name of a class
58+
* @param name The name of a quantity
59+
* @param n The number of the quantity
60+
* @param type The type of data
61+
* @param accumulate Useless, always set false
62+
* @return double
63+
*/
64+
static double record_gpu(const std::string &class_name,
65+
const std::string &name,
66+
const long &n,
67+
const std::string &type,
68+
const bool accumulate = false);
69+
70+
/**
71+
* @brief Record memory consumed on gpu during computation
72+
*
73+
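* @param name_in The name of the quantity, used as the lookup key
74+
* @param n_in The size of the quantity in bytes (reported in MB)
75+
* @param accumulate If true, add the size to the existing record and to the GPU total; if false (default), keep only the largest size recorded so far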
76+
*/
77+
static void record_gpu(
78+
const std::string &name_in,
79+
const size_t &n_in,
80+
const bool accumulate = false
81+
);
82+
83+
#endif
84+
5285
static double &get_total(void)
5386
{
5487
return total;
@@ -84,6 +117,15 @@ class Memory
84117
static int n_now;
85118
static bool init_flag;
86119

120+
#if defined(__CUDA) || defined(__ROCM)
121+
static double total_gpu;
122+
static std::string *name_gpu;
123+
static std::string *class_name_gpu;
124+
static double *consume_gpu;
125+
static int n_now_gpu;
126+
static bool init_flag_gpu;
127+
#endif
128+
87129
static int complex_matrix_memory; //(16 Byte)
88130
static int double_memory; //(8 Byte)
89131
static int int_memory; //(4 Byte)

source/module_base/module_device/cuda/memory_op.cu

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#include "module_base/module_device/memory_op.h"
2+
#include "module_base/memory.h"
23

34
#include <base/macros/macros.h>
45
#include <cuda_runtime.h>
@@ -61,6 +62,20 @@ void resize_memory_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_de
6162
delete_memory_op<FPTYPE, base_device::DEVICE_GPU>()(dev, arr);
6263
}
6364
cudaErrcheck(cudaMalloc((void**)&arr, sizeof(FPTYPE) * size));
65+
std::string record_string;
66+
if (record_in != nullptr)
67+
{
68+
record_string = record_in;
69+
}
70+
else
71+
{
72+
record_string = "no_record";
73+
}
74+
75+
if (record_string != "no_record")
76+
{
77+
ModuleBase::Memory::record_gpu(record_string, sizeof(FPTYPE) * size);
78+
}
6479
}
6580

6681
template <typename FPTYPE>

0 commit comments

Comments
 (0)