Skip to content

Commit ec3f0a6

Browse files
Feature: support NVTX profiling via timer_enable_nvtx flag (#6495)
* Feature: support NVTX profiling via timer_enable_nvtx flag. Signed-off-by: Tianxiang Wang <[email protected]>, Contributed under MetaX Integrated Circuits (Shanghai) Co., Ltd.
* Add timer_enable_nvtx section in markdown. Signed-off-by: Tianxiang Wang <[email protected]>, Contributed under MetaX Integrated Circuits (Shanghai) Co., Ltd.
* Fix: Use __USE_NVTX macro to avoid NVTX linking errors in tests. Clarify in docs that the timer_enable_nvtx parameter only takes effect on CUDA platforms. Signed-off-by: Tianxiang Wang <[email protected]>, Contributed under MetaX Integrated Circuits (Shanghai) Co., Ltd.
1 parent b6c4c75 commit ec3f0a6

File tree

5 files changed

+58
-24
lines changed

5 files changed

+58
-24
lines changed

CMakeLists.txt

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -287,7 +287,7 @@ if (USE_SW)
287287
set(SW ON)
288288
include_directories(${SW_MATH}/include)
289289
include_directories(${SW_FFT}/include)
290-
290+
291291
target_link_libraries(${ABACUS_BIN_NAME} ${SW_FFT}/lib/libfftw3.a)
292292
target_link_libraries(${ABACUS_BIN_NAME} ${SW_MATH}/libswfft.a)
293293
target_link_libraries(${ABACUS_BIN_NAME} ${SW_MATH}/libswscalapack.a)
@@ -373,6 +373,7 @@ if(USE_CUDA)
373373
if(USE_CUDA)
374374
add_compile_definitions(__CUDA)
375375
add_compile_definitions(__UT_USE_CUDA)
376+
target_compile_definitions(${ABACUS_BIN_NAME} PRIVATE __USE_NVTX)
376377
if (CMAKE_BUILD_TYPE STREQUAL "Debug")
377378
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G" CACHE STRING "CUDA flags for debug build" FORCE)
378379
endif()
@@ -520,7 +521,7 @@ if(ENABLE_MLALGO)
520521
include_directories(${libnpy_INCLUDE_DIR})
521522
endif()
522523
include_directories(${libnpy_SOURCE_DIR}/include)
523-
524+
524525
add_compile_definitions(__MLALGO)
525526
endif()
526527

@@ -560,7 +561,7 @@ if (ENABLE_CNPY)
560561
include_directories(${cnpy_INCLUDE_DIR})
561562
endif()
562563
include_directories(${cnpy_SOURCE_DIR})
563-
564+
564565
# find ZLIB and link
565566
find_package(ZLIB REQUIRED)
566567
target_link_libraries(${ABACUS_BIN_NAME} cnpy ZLIB::ZLIB)

docs/advanced/input_files/input-main.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
- [min\_dist\_coef](#min_dist_coef)
2323
- [device](#device)
2424
- [precision](#precision)
25+
- [timer\_enable\_nvtx](#timer_enable_nvtx)
2526
- [nb2d](#nb2d)
2627
- [Input Files](#variables-related-to-input-files)
2728
- [stru\_file](#stru_file)
@@ -706,6 +707,15 @@ If only one value is set (such as `kspacing 0.5`), then kspacing values of a/b/c
706707
- double: double precision
707708
- **Default**: double
708709

710+
### timer_enable_nvtx
711+
712+
- **Type**: Boolean
713+
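- **Description**: Controls whether the timer emits NVTX range labels (one `class_name:name` range per timed section) for use with NVTX-aware profilers. This parameter only takes effect in builds compiled with CUDA support; in all other builds it is accepted but ignored.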
714+
715+
- True: Enable NVTX profiling labels in the timer.
716+
- False: Disable NVTX profiling labels in the timer.
717+
- **Default**: False
718+
709719
### nb2d
710720

711721
- **Type**: Integer

source/source_base/timer.cpp

Lines changed: 27 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,11 @@
1414
#include "chrono"
1515
#include "source_base/formatter.h"
1616

17+
#if defined(__CUDA) && defined(__USE_NVTX)
18+
#include <nvToolsExt.h>
19+
#include "source_io/module_parameter/parameter.h"
20+
#endif
21+
1722
namespace ModuleBase
1823
{
1924

@@ -93,6 +98,12 @@ void timer::tick(const std::string &class_name,const std::string &name)
9398
#endif
9499
++timer_one.calls;
95100
timer_one.start_flag = false;
101+
#if defined(__CUDA) && defined(__USE_NVTX)
102+
if (PARAM.inp.timer_enable_nvtx){
103+
std::string label = class_name + ":" + name;
104+
nvtxRangePushA(label.data());
105+
}
106+
#endif
96107
}
97108
else
98109
{
@@ -107,6 +118,11 @@ void timer::tick(const std::string &class_name,const std::string &name)
107118
timer_one.cpu_second += (cpu_time() - timer_one.cpu_start);
108119
#endif
109120
timer_one.start_flag = true;
121+
#if defined(__CUDA) && defined(__USE_NVTX)
122+
if (PARAM.inp.timer_enable_nvtx){
123+
nvtxRangePop();
124+
}
125+
#endif
110126
}
111127
} // end if(!omp_get_thread_num())
112128
}
@@ -128,7 +144,7 @@ void timer::write_to_json(std::string file_name)
128144
int is_initialized = 0;
129145
MPI_Initialized(&is_initialized);
130146
if (!is_initialized) {
131-
return;
147+
return;
132148
}
133149
int my_rank = 0;
134150
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
@@ -195,12 +211,12 @@ void timer::write_to_json(std::string file_name)
195211
const Timer_One timer_one = timer_pool_B.second;
196212
ofs << indent << indent << indent << indent << "{\n";
197213
ofs << indent << indent << indent << indent << "\"name\": \"" << name << "\",\n";
198-
ofs << indent << indent << indent << indent << "\"cpu_second\": "
214+
ofs << indent << indent << indent << indent << "\"cpu_second\": "
199215
<< std::setprecision(15) << timer_one.cpu_second << ",\n";
200216
ofs << indent << indent << indent << indent << "\"calls\": " << timer_one.calls << ",\n";
201-
ofs << indent << indent << indent << indent << "\"cpu_second_per_call\": "
217+
ofs << indent << indent << indent << indent << "\"cpu_second_per_call\": "
202218
<< double_to_string(timer_one.cpu_second/timer_one.calls) << ",\n";
203-
ofs << indent << indent << indent << indent << "\"cpu_second_per_total\": "
219+
ofs << indent << indent << indent << indent << "\"cpu_second_per_total\": "
204220
<< double_to_string(timer_one.cpu_second/timer_pool[""]["total"].cpu_second) << "\n";
205221

206222
if (order_b == timer_pool_A.second.size())
@@ -283,11 +299,11 @@ void timer::print_all(std::ofstream &ofs)
283299

284300

285301
// if the total time is too small, we do not calculate the percentage
286-
if (timer_pool_order[0].second.cpu_second < 1e-9)
302+
if (timer_pool_order[0].second.cpu_second < 1e-9)
287303
{
288304
pers.push_back(0);
289-
}
290-
else
305+
}
306+
else
291307
{
292308
pers.push_back(percentage);
293309
}
@@ -300,10 +316,10 @@ void timer::print_all(std::ofstream &ofs)
300316

301317
std::vector<std::string> titles = {"CLASS_NAME", "NAME", "TIME/s", "CALLS", "AVG/s", "PER/%"};
302318
std::vector<std::string> formats = {"%-10s", "%-10s", "%6.2f", "%8d", "%6.2f", "%6.2f"};
303-
FmtTable time_statistics(/*titles=*/titles,
304-
/*nrows=*/pers.size(),
305-
/*formats=*/formats,
306-
/*indent=*/0,
319+
FmtTable time_statistics(/*titles=*/titles,
320+
/*nrows=*/pers.size(),
321+
/*formats=*/formats,
322+
/*indent=*/0,
307323
/*align=*/{/*value*/FmtTable::Align::LEFT, /*title*/FmtTable::Align::CENTER});
308324
time_statistics << class_names << names << times << calls << avgs << pers;
309325
const std::string table = "\nTIME STATISTICS\n" + time_statistics.str();

source/source_io/module_parameter/input_parameter.h

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ struct Input_para
6767

6868
std::string device = "auto";
6969
std::string precision = "double";
70+
bool timer_enable_nvtx = false;
7071

7172
// ============== #Parameters (2.Electronic structure) ===========================
7273
std::string ks_solver = "default"; ///< xiaohui add 2013-09-01
@@ -375,7 +376,7 @@ struct Input_para
375376
bool out_proj_band = false; ///< projected band structure calculation jiyy add 2022-05-11
376377
std::string out_level = "ie"; ///< control the output information.
377378
std::vector<int> out_dmr = {0, 8}; ///< output density matrix in real space DM(R)
378-
std::vector<int> out_dmk = {0, 8}; ///< output density matrix in reciprocal space DM(k)
379+
std::vector<int> out_dmk = {0, 8}; ///< output density matrix in reciprocal space DM(k)
379380
bool out_bandgap = false; ///< QO added for bandgap printing
380381
std::vector<int> out_mat_hs = {0, 8}; ///< output H matrix and S matrix in local basis.
381382
std::vector<int> out_mat_tk = {0, 8}; ///< output T(k) matrix in local basis.
@@ -659,29 +660,29 @@ struct Input_para
659660
* the following two sets of parameters are for the XC parameterization.
660661
* The first element should be the LibXC id, to assign the analytical
661662
* form of the eXchange and Correlation part of the functional.
662-
*
663+
*
663664
* Starting from the second parameter, the parameters are the coefficients
664665
* of the functional. For example the M06-L functional, one should refer
665666
* to the source file (source code of LibXC)
666-
*
667+
*
667668
* src/mgga_x_m06l.c
668-
*
669+
*
669670
* the implementation can be found in the file
670-
*
671+
*
671672
* src/maple2c/mgga_exc/mgga_x_m06l.c.
672-
*
673+
*
673674
* There are 18 parameters for the exchange part, so the whole length of
674675
* the xc_exch_ext should be 19. (MGGA_X_M06L, id = 203)
675-
*
676+
*
676677
* Likewise, the correlation part can be found in corresponding files.
677-
*
678+
*
678679
* PBE functional is used as the default functional for XCPNet.
679680
*/
680681
// src/gga_x_pbe.c
681682
std::vector<double> xc_exch_ext = {
682-
101, 0.8040, 0.2195149727645171};
683+
101, 0.8040, 0.2195149727645171};
683684
// src/gga_c_pbe.c
684685
std::vector<double> xc_corr_ext = {
685-
130, 0.06672455060314922, 0.031090690869654895034, 1.00000};
686+
130, 0.06672455060314922, 0.031090690869654895034, 1.00000};
686687
};
687688
#endif

source/source_io/read_input_item_system.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -830,6 +830,12 @@ void ReadInput::item_system()
830830
};
831831
this->add_item(item);
832832
}
833+
{
834+
Input_Item item("timer_enable_nvtx");
835+
item.annotation = "enable NVTX labeling for profiling or not";
836+
read_sync_bool(input.timer_enable_nvtx);
837+
this->add_item(item);
838+
}
833839
}
834840

835841
} // namespace ModuleIO

0 commit comments

Comments
 (0)