@@ -25,6 +25,19 @@ int Memory::n_memory = 1000;
2525int Memory::n_now = 0 ;
2626bool Memory::init_flag = false ;
2727
28+ #if defined(__CUDA) || defined(__ROCM)
29+
30+ double Memory::total_gpu = 0.0 ; // running sum of recorded GPU memory; the byte-count record_gpu overload adds sizes converted to MB
31+ int Memory::n_now_gpu = 0 ; // number of GPU records currently in use
32+ bool Memory::init_flag_gpu = false ; // set to true once the GPU tables below have been allocated
33+
34+ std::string *Memory::name_gpu; // record names; allocated with n_memory entries on the first record_gpu call
35+ std::string *Memory::class_name_gpu; // class name supplied with each record
36+ double *Memory::consume_gpu; // recorded memory size for each record
37+
38+ #endif
39+
40+
2841std::string *Memory::name;
2942std::string *Memory::class_name;
3043double *Memory::consume;
@@ -208,6 +221,126 @@ void Memory::record
208221 return ;
209222}
210223
224+ #if defined(__CUDA) || defined(__ROCM)
225+
226+ double Memory::record_gpu
227+ (
228+ const std::string &class_name_in,
229+ const std::string &name_in,
230+ const long &n_in,
231+ const std::string &type,
232+ const bool accumulate
233+ )
234+ {
235+ if (!Memory::init_flag_gpu)
236+ {
237+ name_gpu = new std::string[n_memory];
238+ class_name_gpu = new std::string[n_memory];
239+ consume_gpu = new double [n_memory];
240+ for (int i=0 ;i<n_memory;i++)
241+ {
242+ consume_gpu[i] = 0.0 ;
243+ }
244+ Memory::init_flag_gpu = true ;
245+ }
246+
247+ int find = 0 ;
248+ for (find = 0 ; find < n_now_gpu; find++)
249+ {
250+ if ( name_in == name_gpu[find] )
251+ {
252+ break ;
253+ }
254+ }
255+
256+ // find == n_now : found a new record.
257+ if (find == n_now_gpu)
258+ {
259+ n_now_gpu++;
260+ name_gpu[find] = name_in;
261+ class_name_gpu[find] = class_name_in;
262+ }
263+ if (n_now_gpu >= n_memory)
264+ {
265+ std::cout<<" Error! Too many gpu memories required." ;
266+ return 0.0 ;
267+ }
268+
269+ consume_gpu[find] = Memory::calculate_mem (n_in,type);
270+
271+ if (consume_gpu[find] > 5 )
272+ {
273+ print (find);
274+ }
275+ return consume_gpu[find];
276+ }
277+
278+ void Memory::record_gpu
279+ (
280+ const std::string &name_in,
281+ const size_t &n_in,
282+ const bool accumulate
283+ )
284+ {
285+ if (!Memory::init_flag_gpu)
286+ {
287+ name_gpu = new std::string[n_memory];
288+ class_name_gpu = new std::string[n_memory];
289+ consume_gpu = new double [n_memory];
290+ for (int i=0 ;i<n_memory;i++)
291+ {
292+ consume_gpu[i] = 0.0 ;
293+ }
294+ Memory::init_flag_gpu = true ;
295+ }
296+
297+ int find = 0 ;
298+ for (find = 0 ; find < n_now_gpu; find++)
299+ {
300+ if ( name_in == name_gpu[find] )
301+ {
302+ break ;
303+ }
304+ }
305+
306+ // find == n_now : found a new record.
307+ if (find == n_now_gpu)
308+ {
309+ n_now_gpu++;
310+ name_gpu[find] = name_in;
311+ class_name_gpu[find] = " " ;
312+ }
313+ if (n_now_gpu >= n_memory)
314+ {
315+ std::cout<<" Error! Too many gpu memories has been recorded." ;
316+ return ;
317+ }
318+
319+ const double factor = 1.0 /1024.0 /1024.0 ;
320+ double size_mb = n_in * factor;
321+
322+ if (accumulate)
323+ {
324+ consume_gpu[find] += size_mb;
325+ Memory::total_gpu += size_mb;
326+ }
327+ else
328+ {
329+ if (consume_gpu[find] < size_mb)
330+ {
331+ Memory::total_gpu += size_mb - consume_gpu[find];
332+ consume_gpu[find] = size_mb;
333+ if (consume_gpu[find] > 5 )
334+ {
335+ print (find);
336+ }
337+ }
338+ }
339+ return ;
340+ }
341+
342+ #endif
343+
211344void Memory::print (const int find)
212345{
213346 GlobalV::ofs_running <<" \n Warning_Memory_Consuming allocated: "
@@ -226,19 +359,34 @@ void Memory::finish(std::ofstream &ofs)
226359 delete[] consume;
227360 init_flag = false ;
228361 }
362+ #if defined(__CUDA) || defined(__ROCM)
363+ if (init_flag_gpu)
364+ {
365+ delete[] name_gpu;
366+ delete[] class_name_gpu;
367+ delete[] consume_gpu;
368+ }
369+ #endif
229370 return ;
230371}
231372
232373void Memory::print_all (std::ofstream &ofs)
233374{
234- if (!init_flag)
375+ if (!init_flag
376+ #if defined(__CUDA) || defined(__ROCM)
377+ && !init_flag_gpu
378+ #endif
379+ )
235380 {
236381 return ;
237382 }
238383
239384 const double small = 1.0 ; // unit is MB
240385#ifdef __MPI
241386 Parallel_Reduce::reduce_all (Memory::total);
387+ #if defined(__CUDA) || defined(__ROCM)
388+ Parallel_Reduce::reduce_all (Memory::total_gpu);
389+ #endif
242390#endif
243391 ofs <<" \n NAME-------------------------|MEMORY(MB)--------" << std::endl;
244392 ofs <<std::setw (30 )<< " total" << std::setw (15 ) <<std::setprecision (4 )<< Memory::total << std::endl;
@@ -254,23 +402,7 @@ void Memory::print_all(std::ofstream &ofs)
254402
255403 for (int i=0 ; i<n_memory; i++)
256404 {
257- // int k = 0;
258- // double tmp = -1.0;
259- // for(int j=0; j<n_memory; j++)
260- // {
261- // if(print_flag[j])
262- // {
263- // continue;
264- // }
265- // else if(tmp < consume[j])
266- // {
267- // k = j;
268- // tmp = consume[j];
269- // }
270- // }
271- // print_flag[k] = true;
272405#ifdef __MPI
273- // Parallel_Reduce::reduce_all(consume[k]);
274406 Parallel_Reduce::reduce_all (consume[i]);
275407#endif
276408 }
@@ -304,6 +436,58 @@ void Memory::print_all(std::ofstream &ofs)
304436
305437 }
306438
439+ #if defined(__CUDA) || defined(__ROCM)
440+ ofs <<" \n NAME-------------------------|GPU MEMORY(MB)----" << std::endl;
441+ ofs <<std::setw (30 )<< " total" << std::setw (15 ) <<std::setprecision (4 )<< Memory::total_gpu << std::endl;
442+
443+ assert (n_memory>0 );
444+
445+ bool *print_flag_gpu = new bool [n_memory];
446+
447+ for (int i=0 ; i<n_memory; i++)
448+ {
449+ print_flag_gpu[i] = false ;
450+ }
451+
452+ for (int i=0 ; i<n_memory; i++)
453+ {
454+ #ifdef __MPI
455+ Parallel_Reduce::reduce_all (consume_gpu[i]);
456+ #endif
457+ }
458+
459+ for (int i=0 ; i<n_memory; i++) // Xiaoyang fix memory record sum bug 2023/10/25
460+ {
461+ int k = 0 ;
462+ double tmp = -1.0 ;
463+ for (int j=0 ; j<n_memory; j++)
464+ {
465+ if (print_flag_gpu[j])
466+ {
467+ continue ;
468+ }
469+ else if (tmp < consume_gpu[j])
470+ {
471+ k = j;
472+ tmp = consume_gpu[j];
473+ }
474+ }
475+ print_flag_gpu[k] = true ;
476+ if ( consume_gpu[k] < small )
477+ {
478+ continue ;
479+ }
480+ else
481+ {
482+ ofs << std::setw (30 ) << name_gpu[k]
483+ << std::setw (15 ) << consume_gpu[k] << std::endl;
484+ }
485+
486+ }
487+
488+ delete[] print_flag_gpu;
489+ #endif
490+
307491 ofs<<" ------------- < 1.0 MB has been ignored ----------------" <<std::endl;
308492 ofs<<" ----------------------------------------------------------" <<std::endl;
309493
0 commit comments