diff --git a/source/module_base/memory.cpp b/source/module_base/memory.cpp index 931d17f360..bc4cd642df 100644 --- a/source/module_base/memory.cpp +++ b/source/module_base/memory.cpp @@ -25,6 +25,19 @@ int Memory::n_memory = 1000; int Memory::n_now = 0; bool Memory::init_flag = false; +#if defined(__CUDA) || defined(__ROCM) + +double Memory::total_gpu = 0.0; +int Memory::n_now_gpu = 0; +bool Memory::init_flag_gpu = false; + +std::string *Memory::name_gpu; +std::string *Memory::class_name_gpu; +double *Memory::consume_gpu; + +#endif + + std::string *Memory::name; std::string *Memory::class_name; double *Memory::consume; @@ -208,6 +221,126 @@ void Memory::record return; } +#if defined(__CUDA) || defined(__ROCM) + +double Memory::record_gpu +( + const std::string &class_name_in, + const std::string &name_in, + const long &n_in, + const std::string &type, + const bool accumulate +) +{ + if(!Memory::init_flag_gpu) + { + name_gpu = new std::string[n_memory]; + class_name_gpu = new std::string[n_memory]; + consume_gpu = new double[n_memory]; + for(int i=0;i= n_memory) + { + std::cout<<" Error! Too many gpu memories required."; + return 0.0; + } + + consume_gpu[find] = Memory::calculate_mem(n_in,type); + + if(consume_gpu[find] > 5) + { + print(find); + } + return consume_gpu[find]; +} + +void Memory::record_gpu +( + const std::string &name_in, + const size_t &n_in, + const bool accumulate +) +{ + if(!Memory::init_flag_gpu) + { + name_gpu = new std::string[n_memory]; + class_name_gpu = new std::string[n_memory]; + consume_gpu = new double[n_memory]; + for(int i=0;i= n_memory) + { + std::cout<<" Error! Too many gpu memories has been recorded."; + return; + } + + const double factor = 1.0/1024.0/1024.0; + double size_mb = n_in * factor; + + if(accumulate) + { + consume_gpu[find] += size_mb; + Memory::total_gpu += size_mb; + } + else + { + if(consume_gpu[find] < size_mb) + { + Memory::total_gpu += size_mb - consume_gpu[find]; + consume_gpu[find] = size_mb; + if(consume_gpu[find] > 5) + { + print(find); + } + } + } + return; +} + +#endif + void Memory::print(const int find) { GlobalV::ofs_running <<"\n Warning_Memory_Consuming allocated: " @@ -226,12 +359,24 @@ void Memory::finish(std::ofstream &ofs) delete[] consume; init_flag = false; } +#if defined(__CUDA) || defined(__ROCM) + if(init_flag_gpu) + { + delete[] name_gpu; + delete[] class_name_gpu; + delete[] consume_gpu; + } +#endif return; } void Memory::print_all(std::ofstream &ofs) { - if(!init_flag) + if(!init_flag +#if defined(__CUDA) || defined(__ROCM) + && !init_flag_gpu +#endif + ) { return; } @@ -239,6 +384,9 @@ void Memory::print_all(std::ofstream &ofs) const double small = 1.0; // unit is MB #ifdef __MPI Parallel_Reduce::reduce_all(Memory::total); +#if defined(__CUDA) || defined(__ROCM) + Parallel_Reduce::reduce_all(Memory::total_gpu); +#endif #endif ofs <<"\n NAME-------------------------|MEMORY(MB)--------" << std::endl; ofs <0); + + bool *print_flag_gpu = new bool[n_memory]; + + for(int i=0; i #include @@ -61,6 +62,20 @@ void resize_memory_op::operator()(const base_de delete_memory_op()(dev, arr); } cudaErrcheck(cudaMalloc((void**)&arr, sizeof(FPTYPE) * size)); + std::string record_string; + if (record_in != nullptr) + { + record_string = record_in; + } + else + { + record_string = "no_record"; + } + + if (record_string != "no_record") + { + ModuleBase::Memory::record_gpu(record_string, sizeof(FPTYPE) * size); + } } template