deepmodeling · mohanchen · Sep 9, 2025 · Sep 8, 2025 · Sep 8, 2025 · Sep 9, 2025
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -287,7 +287,7 @@ if (USE_SW)
   set(SW ON)
   include_directories(${SW_MATH}/include)
   include_directories(${SW_FFT}/include)
-  
+
   target_link_libraries(${ABACUS_BIN_NAME} ${SW_FFT}/lib/libfftw3.a)
   target_link_libraries(${ABACUS_BIN_NAME} ${SW_MATH}/libswfft.a)
   target_link_libraries(${ABACUS_BIN_NAME} ${SW_MATH}/libswscalapack.a)
@@ -373,6 +373,7 @@ if(USE_CUDA)
   if(USE_CUDA)
     add_compile_definitions(__CUDA)
     add_compile_definitions(__UT_USE_CUDA)
+    target_compile_definitions(${ABACUS_BIN_NAME} PRIVATE __USE_NVTX)
     if (CMAKE_BUILD_TYPE STREQUAL "Debug")
       set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G" CACHE STRING "CUDA flags for debug build" FORCE)
     endif()
@@ -520,7 +521,7 @@ if(ENABLE_MLALGO)
     include_directories(${libnpy_INCLUDE_DIR})
   endif()
   include_directories(${libnpy_SOURCE_DIR}/include)
-  
+
   add_compile_definitions(__MLALGO)
 endif()
 
@@ -560,7 +561,7 @@ if (ENABLE_CNPY)
     include_directories(${cnpy_INCLUDE_DIR})
   endif()
   include_directories(${cnpy_SOURCE_DIR})
-  
+
   # find ZLIB and link
   find_package(ZLIB REQUIRED)
   target_link_libraries(${ABACUS_BIN_NAME} cnpy ZLIB::ZLIB)

diff --git a/docs/advanced/input_files/input-main.md b/docs/advanced/input_files/input-main.md
@@ -22,6 +22,7 @@
     - [min\_dist\_coef](#min_dist_coef)
     - [device](#device)
     - [precision](#precision)
+    - [timer\_enable\_nvtx](#timer_enable_nvtx)
     - [nb2d](#nb2d)
   - [Input Files](#variables-related-to-input-files)
     - [stru\_file](#stru_file)
@@ -706,6 +707,15 @@ If only one value is set (such as `kspacing 0.5`), then kspacing values of a/b/c
   - double: double precision
 - **Default**: double
 
+### timer_enable_nvtx
+
+- **Type**: Boolean
+- **Description**: Controls whether the timer emits NVTX range labels (named `class_name:name`) for profiling. This option only takes effect when ABACUS is built with CUDA support; otherwise it is ignored.
+
+  - True: Enable NVTX profiling labels in the timer.
+  - False: Disable NVTX profiling labels in the timer.
+- **Default**: False
+
 ### nb2d
 
 - **Type**: Integer

diff --git a/source/source_base/timer.cpp b/source/source_base/timer.cpp
@@ -14,6 +14,11 @@
 #include "chrono"
 #include "source_base/formatter.h"
 
+#if defined(__CUDA) && defined(__USE_NVTX)
+#include <nvToolsExt.h>
+#include "source_io/module_parameter/parameter.h"
+#endif
+
 namespace ModuleBase
 {
 
@@ -93,6 +98,12 @@ void timer::tick(const std::string &class_name,const std::string &name)
 #endif
 			++timer_one.calls;
 			timer_one.start_flag = false;
+#if defined(__CUDA) && defined(__USE_NVTX)
+            if (PARAM.inp.timer_enable_nvtx){
+                std::string label = class_name + ":" + name;
+                nvtxRangePushA(label.data());
+            }
+#endif
 		}
 		else
 		{
@@ -107,6 +118,11 @@ void timer::tick(const std::string &class_name,const std::string &name)
 			timer_one.cpu_second += (cpu_time() - timer_one.cpu_start);
 #endif
 			timer_one.start_flag = true;
+#if defined(__CUDA) && defined(__USE_NVTX)
+            if (PARAM.inp.timer_enable_nvtx){
+                nvtxRangePop();
+            }
+#endif
 		}
 	} // end if(!omp_get_thread_num())
 }
@@ -128,7 +144,7 @@ void timer::write_to_json(std::string file_name)
 	int is_initialized = 0;
     MPI_Initialized(&is_initialized);
 	if (!is_initialized) {
-		return;	
+		return;
 }
 	int my_rank = 0;
 	MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
@@ -195,12 +211,12 @@ void timer::write_to_json(std::string file_name)
 			const Timer_One timer_one = timer_pool_B.second;
 			ofs << indent << indent << indent << indent << "{\n";
 			ofs << indent << indent << indent << indent << "\"name\": \"" << name << "\",\n";
-			ofs << indent << indent << indent << indent << "\"cpu_second\": " 
+			ofs << indent << indent << indent << indent << "\"cpu_second\": "
 				<< std::setprecision(15) << timer_one.cpu_second << ",\n";
 			ofs << indent << indent << indent << indent << "\"calls\": " << timer_one.calls << ",\n";
-			ofs << indent << indent << indent << indent << "\"cpu_second_per_call\": " 
+			ofs << indent << indent << indent << indent << "\"cpu_second_per_call\": "
 				<< double_to_string(timer_one.cpu_second/timer_one.calls) << ",\n";
-			ofs << indent << indent << indent << indent << "\"cpu_second_per_total\": " 
+			ofs << indent << indent << indent << indent << "\"cpu_second_per_total\": "
 				<< double_to_string(timer_one.cpu_second/timer_pool[""]["total"].cpu_second) << "\n";
 
 			if (order_b == timer_pool_A.second.size())
@@ -283,11 +299,11 @@ void timer::print_all(std::ofstream &ofs)
 
 
 		// if the total time is too small, we do not calculate the percentage
-		if (timer_pool_order[0].second.cpu_second < 1e-9) 
+		if (timer_pool_order[0].second.cpu_second < 1e-9)
 		{
 			pers.push_back(0);
-		} 
-		else 
+		}
+		else
 		{
 			pers.push_back(percentage);
 		}
@@ -300,10 +316,10 @@ void timer::print_all(std::ofstream &ofs)
 
 	std::vector<std::string> titles = {"CLASS_NAME", "NAME", "TIME/s", "CALLS", "AVG/s", "PER/%"};
 	std::vector<std::string> formats = {"%-10s", "%-10s", "%6.2f", "%8d", "%6.2f", "%6.2f"};
-	FmtTable time_statistics(/*titles=*/titles, 
-							 /*nrows=*/pers.size(), 
-							 /*formats=*/formats, 
-							 /*indent=*/0, 
+	FmtTable time_statistics(/*titles=*/titles,
+							 /*nrows=*/pers.size(),
+							 /*formats=*/formats,
+							 /*indent=*/0,
 							 /*align=*/{/*value*/FmtTable::Align::LEFT, /*title*/FmtTable::Align::CENTER});
 	time_statistics << class_names << names << times << calls << avgs << pers;
 	const std::string table = "\nTIME STATISTICS\n" + time_statistics.str();

diff --git a/source/source_io/module_parameter/input_parameter.h b/source/source_io/module_parameter/input_parameter.h
@@ -67,6 +67,7 @@ struct Input_para
 
     std::string device = "auto";
     std::string precision = "double";
+    bool timer_enable_nvtx = false;
 
     // ==============   #Parameters (2.Electronic structure) ===========================
     std::string ks_solver = "default"; ///< xiaohui add 2013-09-01
@@ -375,7 +376,7 @@ struct Input_para
     bool out_proj_band = false;           ///< projected band structure calculation jiyy add 2022-05-11
     std::string out_level = "ie";         ///< control the output information.
     std::vector<int> out_dmr = {0, 8};    ///< output density matrix in real space DM(R)
-    std::vector<int> out_dmk = {0, 8};    ///< output density matrix in reciprocal space DM(k)   
+    std::vector<int> out_dmk = {0, 8};    ///< output density matrix in reciprocal space DM(k)
     bool out_bandgap = false;             ///< QO added for bandgap printing
     std::vector<int> out_mat_hs = {0, 8}; ///< output H matrix and S matrix in local basis.
     std::vector<int> out_mat_tk = {0, 8}; ///< output T(k) matrix in local basis.
@@ -659,29 +660,29 @@ struct Input_para
      * the following two sets of parameters are for the XC parameterization.
      * The first element should be the LibXC id, to assign the analytical
      * form of the eXchange and Correlation part of the functional.
-     * 
+     *
      * Starting from the second parameter, the parameters are the coefficients
      * of the functional. For example the M06-L functional, one should refer
      * to the source file (source code of LibXC)
-     * 
+     *
      * src/mgga_x_m06l.c
-     * 
+     *
      * the implementation can be found in the file
-     * 
+     *
      * src/maple2c/mgga_exc/mgga_x_m06l.c.
-     * 
+     *
      * There are 18 parameters for the exchange part, so the whole length of
      * the xc_exch_ext should be 19. (MGGA_X_M06L, id = 203)
-     * 
+     *
      * Likewise, the correlation part can be found in corresponding files.
-     * 
+     *
      * PBE functional is used as the default functional for XCPNet.
      */
     // src/gga_x_pbe.c
     std::vector<double> xc_exch_ext = {
-        101, 0.8040, 0.2195149727645171}; 
+        101, 0.8040, 0.2195149727645171};
     // src/gga_c_pbe.c
     std::vector<double> xc_corr_ext = {
-        130, 0.06672455060314922, 0.031090690869654895034, 1.00000}; 
+        130, 0.06672455060314922, 0.031090690869654895034, 1.00000};
 };
 #endif
diff --git a/source/source_io/read_input_item_system.cpp b/source/source_io/read_input_item_system.cpp
@@ -830,6 +830,12 @@ void ReadInput::item_system()
         };
         this->add_item(item);
     }
+    {
+        Input_Item item("timer_enable_nvtx");
+        item.annotation = "enable NVTX labeling for profiling or not";
+        read_sync_bool(input.timer_enable_nvtx);
+        this->add_item(item);
+    }
 }
 
 } // namespace ModuleIO