diff --git a/CMakeLists.txt b/CMakeLists.txt index 277f1924ec..c0c7b83bf8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -287,7 +287,7 @@ if (USE_SW) set(SW ON) include_directories(${SW_MATH}/include) include_directories(${SW_FFT}/include) - + target_link_libraries(${ABACUS_BIN_NAME} ${SW_FFT}/lib/libfftw3.a) target_link_libraries(${ABACUS_BIN_NAME} ${SW_MATH}/libswfft.a) target_link_libraries(${ABACUS_BIN_NAME} ${SW_MATH}/libswscalapack.a) @@ -373,6 +373,7 @@ if(USE_CUDA) if(USE_CUDA) add_compile_definitions(__CUDA) add_compile_definitions(__UT_USE_CUDA) + target_compile_definitions(${ABACUS_BIN_NAME} PRIVATE __USE_NVTX) if (CMAKE_BUILD_TYPE STREQUAL "Debug") set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G" CACHE STRING "CUDA flags for debug build" FORCE) endif() @@ -520,7 +521,7 @@ if(ENABLE_MLALGO) include_directories(${libnpy_INCLUDE_DIR}) endif() include_directories(${libnpy_SOURCE_DIR}/include) - + add_compile_definitions(__MLALGO) endif() @@ -560,7 +561,7 @@ if (ENABLE_CNPY) include_directories(${cnpy_INCLUDE_DIR}) endif() include_directories(${cnpy_SOURCE_DIR}) - + # find ZLIB and link find_package(ZLIB REQUIRED) target_link_libraries(${ABACUS_BIN_NAME} cnpy ZLIB::ZLIB) diff --git a/docs/advanced/input_files/input-main.md b/docs/advanced/input_files/input-main.md index db4ff3b4f4..a2024dec66 100644 --- a/docs/advanced/input_files/input-main.md +++ b/docs/advanced/input_files/input-main.md @@ -22,6 +22,7 @@ - [min\_dist\_coef](#min_dist_coef) - [device](#device) - [precision](#precision) + - [timer\_enable\_nvtx](#timer_enable_nvtx) - [nb2d](#nb2d) - [Input Files](#variables-related-to-input-files) - [stru\_file](#stru_file) @@ -706,6 +707,15 @@ If only one value is set (such as `kspacing 0.5`), then kspacing values of a/b/c - double: double precision - **Default**: double +### timer_enable_nvtx + +- **Type**: Boolean +- **Description**: Controls whether NVTX profiling labels are emitted by the timer. 
This feature is only effective on CUDA platforms. + + - True: Enable NVTX profiling labels in the timer. + - False: Disable NVTX profiling labels in the timer. +- **Default**: False + ### nb2d - **Type**: Integer diff --git a/source/source_base/timer.cpp b/source/source_base/timer.cpp index f7a4be636d..e7f22df70d 100644 --- a/source/source_base/timer.cpp +++ b/source/source_base/timer.cpp @@ -14,6 +14,11 @@ #include "chrono" #include "source_base/formatter.h" +#if defined(__CUDA) && defined(__USE_NVTX) +#include <nvToolsExt.h> +#include "source_io/module_parameter/parameter.h" +#endif + namespace ModuleBase { @@ -93,6 +98,12 @@ void timer::tick(const std::string &class_name,const std::string &name) #endif ++timer_one.calls; timer_one.start_flag = false; +#if defined(__CUDA) && defined(__USE_NVTX) + if (PARAM.inp.timer_enable_nvtx){ + std::string label = class_name + ":" + name; + nvtxRangePushA(label.data()); + } +#endif } else { @@ -107,6 +118,11 @@ void timer::tick(const std::string &class_name,const std::string &name) timer_one.cpu_second += (cpu_time() - timer_one.cpu_start); #endif timer_one.start_flag = true; +#if defined(__CUDA) && defined(__USE_NVTX) + if (PARAM.inp.timer_enable_nvtx){ + nvtxRangePop(); + } +#endif } } // end if(!omp_get_thread_num()) } @@ -128,7 +144,7 @@ void timer::write_to_json(std::string file_name) int is_initialized = 0; MPI_Initialized(&is_initialized); if (!is_initialized) { - return; + return; } int my_rank = 0; MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); @@ -195,12 +211,12 @@ void timer::write_to_json(std::string file_name) const Timer_One timer_one = timer_pool_B.second; ofs << indent << indent << indent << indent << "{\n"; ofs << indent << indent << indent << indent << "\"name\": \"" << name << "\",\n"; - ofs << indent << indent << indent << indent << "\"cpu_second\": " + ofs << indent << indent << indent << indent << "\"cpu_second\": " << std::setprecision(15) << timer_one.cpu_second << ",\n"; ofs << indent << indent << indent << indent << 
"\"calls\": " << timer_one.calls << ",\n"; - ofs << indent << indent << indent << indent << "\"cpu_second_per_call\": " + ofs << indent << indent << indent << indent << "\"cpu_second_per_call\": " << double_to_string(timer_one.cpu_second/timer_one.calls) << ",\n"; - ofs << indent << indent << indent << indent << "\"cpu_second_per_total\": " + ofs << indent << indent << indent << indent << "\"cpu_second_per_total\": " << double_to_string(timer_one.cpu_second/timer_pool[""]["total"].cpu_second) << "\n"; if (order_b == timer_pool_A.second.size()) @@ -283,11 +299,11 @@ void timer::print_all(std::ofstream &ofs) // if the total time is too small, we do not calculate the percentage - if (timer_pool_order[0].second.cpu_second < 1e-9) + if (timer_pool_order[0].second.cpu_second < 1e-9) { pers.push_back(0); - } - else + } + else { pers.push_back(percentage); } @@ -300,10 +316,10 @@ void timer::print_all(std::ofstream &ofs) std::vector<std::string> titles = {"CLASS_NAME", "NAME", "TIME/s", "CALLS", "AVG/s", "PER/%"}; std::vector<std::string> formats = {"%-10s", "%-10s", "%6.2f", "%8d", "%6.2f", "%6.2f"}; - FmtTable time_statistics(/*titles=*/titles, - /*nrows=*/pers.size(), - /*formats=*/formats, - /*indent=*/0, + FmtTable time_statistics(/*titles=*/titles, + /*nrows=*/pers.size(), + /*formats=*/formats, + /*indent=*/0, /*align=*/{/*value*/FmtTable::Align::LEFT, /*title*/FmtTable::Align::CENTER}); time_statistics << class_names << names << times << calls << avgs << pers; const std::string table = "\nTIME STATISTICS\n" + time_statistics.str(); diff --git a/source/source_io/module_parameter/input_parameter.h b/source/source_io/module_parameter/input_parameter.h index 65a82ac2ac..db2b5d5a72 100644 --- a/source/source_io/module_parameter/input_parameter.h +++ b/source/source_io/module_parameter/input_parameter.h @@ -67,6 +67,7 @@ struct Input_para std::string device = "auto"; std::string precision = "double"; + bool timer_enable_nvtx = false; // ============== #Parameters (2.Electronic structure) 
=========================== std::string ks_solver = "default"; ///< xiaohui add 2013-09-01 @@ -375,7 +376,7 @@ struct Input_para bool out_proj_band = false; ///< projected band structure calculation jiyy add 2022-05-11 std::string out_level = "ie"; ///< control the output information. std::vector<int> out_dmr = {0, 8}; ///< output density matrix in real space DM(R) - std::vector<int> out_dmk = {0, 8}; ///< output density matrix in reciprocal space DM(k) + std::vector<int> out_dmk = {0, 8}; ///< output density matrix in reciprocal space DM(k) bool out_bandgap = false; ///< QO added for bandgap printing std::vector<int> out_mat_hs = {0, 8}; ///< output H matrix and S matrix in local basis. std::vector<int> out_mat_tk = {0, 8}; ///< output T(k) matrix in local basis. @@ -659,29 +660,29 @@ struct Input_para * the following two sets of parameters are for the XC parameterization. * The first element should be the LibXC id, to assign the analytical * form of the eXchange and Correlation part of the functional. - * + * * Starting from the second parameter, the parameters are the coefficients * of the functional. For example the M06-L functional, one should refer * to the source file (source code of LibXC) - * + * * src/mgga_x_m06l.c - * + * * the implementation can be found in the file - * + * * src/maple2c/mgga_exc/mgga_x_m06l.c. - * + * * There are 18 parameters for the exchange part, so the whole length of * the xc_exch_ext should be 19. (MGGA_X_M06L, id = 203) - * + * * Likewise, the correlation part can be found in corresponding files. - * + * * PBE functional is used as the default functional for XCPNet. 
*/ // src/gga_x_pbe.c std::vector<double> xc_exch_ext = { - 101, 0.8040, 0.2195149727645171}; + 101, 0.8040, 0.2195149727645171}; // src/gga_c_pbe.c std::vector<double> xc_corr_ext = { - 130, 0.06672455060314922, 0.031090690869654895034, 1.00000}; + 130, 0.06672455060314922, 0.031090690869654895034, 1.00000}; }; #endif diff --git a/source/source_io/read_input_item_system.cpp b/source/source_io/read_input_item_system.cpp index 2eb2234d9f..c9854c9ece 100644 --- a/source/source_io/read_input_item_system.cpp +++ b/source/source_io/read_input_item_system.cpp @@ -830,6 +830,12 @@ void ReadInput::item_system() }; this->add_item(item); } + { + Input_Item item("timer_enable_nvtx"); + item.annotation = "enable NVTX labeling for profiling or not"; + read_sync_bool(input.timer_enable_nvtx); + this->add_item(item); + } } } // namespace ModuleIO