diff --git a/util/tracer_nvbit/others/spinlock_tool/spinlock_tool.cu b/util/tracer_nvbit/others/spinlock_tool/spinlock_tool.cu index 01a5625f5..4fb29e5b6 100644 --- a/util/tracer_nvbit/others/spinlock_tool/spinlock_tool.cu +++ b/util/tracer_nvbit/others/spinlock_tool/spinlock_tool.cu @@ -57,6 +57,7 @@ #include #include #include +#include /* every tool needs to include this once */ #include "nvbit_tool.h" @@ -138,6 +139,117 @@ std::string spinlock_run_dir = "./"; int spinlock_keep_intermediate_files = 0; void spinlock_check();
+/* Kernel range filter */
+// Maybe move these to a util lib for all tracer tools?
+std::string kernel_ranges = "";
+
+// One DYNAMIC_KERNEL_RANGE entry: a kernel-ID interval plus name regexes.
+struct KernelRange {
+  uint64_t start;
+  uint64_t end;  // 0 (not UINT64_MAX) marks an open-ended range
+  std::vector<std::regex> kernel_name_regexes;  // matching any one suffices
+};
+std::vector<KernelRange> g_kernel_ranges;
+// Largest explicit range end seen; open-ended entries (end == 0) never raise it.
+uint64_t g_max_kernel_id = 0;
+
+// Parses an all-digit decimal kernel ID into value. Returns false instead of
+// throwing (as a bare std::stoull would) when text is empty, signed, only
+// partly numeric, or too large for unsigned long long.
+static bool parse_kernel_id(const std::string &text, uint64_t &value) {
+  if (text.empty() ||
+      text.find_first_not_of("0123456789") != std::string::npos) {
+    return false;
+  }
+  try {
+    value = std::stoull(text);
+  } catch (const std::exception &) {  // std::out_of_range on overflow
+    return false;
+  }
+  return true;
+}
+
+// Rebuilds g_kernel_ranges and g_max_kernel_id from DYNAMIC_KERNEL_RANGE:
+// whitespace-separated entries "[range][@regex[,regex...]]" with range "N",
+// "N-M" or "N-"; unset or empty selects every kernel. Entries with a bad
+// range or no valid regex are reported on stderr and skipped, so a typo no
+// longer aborts the traced application via an uncaught std::stoull exception.
+void parse_kernel_ranges_from_env() {
+  g_kernel_ranges.clear();
+  g_max_kernel_id = 0;
+
+  const char *env_var = std::getenv("DYNAMIC_KERNEL_RANGE");
+  if (!env_var || *env_var == '\0') {
+    g_kernel_ranges.push_back({0, 0, {std::regex(".*")}});  // 0 end = trace all
+    return;
+  }
+  std::istringstream stream{std::string(env_var)};
+  std::string token;
+  while (stream >> token) {
+    // Split "range@regexes"; without '@' the whole token is the range.
+    const size_t at_pos = token.find('@');
+    const std::string range_part = token.substr(0, at_pos);
+    const std::string regex_part =
+        (at_pos == std::string::npos) ? "" : token.substr(at_pos + 1);
+
+    // Parse the range: empty keeps 0/0 (all IDs), "N-" keeps end 0 (open end).
+    uint64_t start = 0, end = 0;
+    bool range_ok = true;
+    if (!range_part.empty()) {
+      const size_t dash_pos = range_part.find('-');
+      if (dash_pos == std::string::npos) {
+        range_ok = parse_kernel_id(range_part, start);
+        end = start;
+      } else {
+        const std::string end_str = range_part.substr(dash_pos + 1);
+        range_ok = parse_kernel_id(range_part.substr(0, dash_pos), start) &&
+                   (end_str.empty() || parse_kernel_id(end_str, end));
+      }
+    }
+    if (!range_ok) {
+      std::cerr << "Invalid kernel range: " << token << '\n';
+      continue;
+    }
+
+    // Parse the regexes; no pattern list means every kernel name matches.
+    std::vector<std::regex> regexes;
+    if (regex_part.empty()) {
+      regexes.emplace_back(".*");
+    } else {
+      std::istringstream regex_stream(regex_part);
+      std::string pattern;
+      while (std::getline(regex_stream, pattern, ',')) {
+        try {
+          regexes.emplace_back(pattern);
+        } catch (const std::regex_error &e) {
+          std::cerr << "Invalid regex: " << pattern << ": " << e.what() << '\n';
+        }
+      }
+      if (regexes.empty()) {  // would otherwise silently match nothing
+        std::cerr << "No valid regex in: " << token << '\n';
+        continue;
+      }
+    }
+
+    g_kernel_ranges.push_back({start, end, regexes});
+    if (end > g_max_kernel_id) g_max_kernel_id = end;
+  }
+}
+
+// Returns true when kernel_id falls inside at least one configured range and
+// kernel_name fully matches (std::regex_match) one of that range's regexes.
+bool should_trace_kernel(uint64_t kernel_id, const std::string &kernel_name) {
+  for (const auto &range : g_kernel_ranges) {
+    const bool in_range = kernel_id >= range.start &&
+                          (range.end == 0 || kernel_id <= range.end);
+    if (!in_range) continue;
+    for (const auto &regex : range.kernel_name_regexes) {
+      if (std::regex_match(kernel_name, regex)) return true;
+    }
+  }
+  return false;
+}
+
void* recv_thread_fun(void* args); void nvbit_at_init() { @@ -152,6 +264,16 @@ void nvbit_at_init() { GET_VAR_INT(spinlock_phase, "SPINLOCK_PHASE", 0, "Spinlock phase"); GET_VAR_STR(spinlock_run_dir, "TRACES_FOLDER", "Spinlock detection base directory, use the same as the traces folder"); GET_VAR_INT(spinlock_keep_intermediate_files, "SPINLOCK_KEEP_INTERMEDIATE_FILES", 0, "Keep intermediate files"); + GET_VAR_STR( + kernel_ranges, "DYNAMIC_KERNEL_RANGE", + "Specify kernel IDs or ranges to trace. 
Format:\n" + " - Single ID: \"2\" traces only kernel 2.\n" + " - Range: \"5-8\" traces kernels 5 through 8 (inclusive).\n" + " - Open-ended: \"10-\" traces from kernel 10 onward.\n" + " - Multiple ranges: \"2 5-8 10-\" (space-separated).\n" + " - With regex: \"5-8@kernel_a.*,kernel_b.*\" traces kernels 5-8 " + "with matching names.\n" + "If unset or empty, all kernels will be traced from the beginning."); std::string pad(100, '-'); printf("%s\n", pad.c_str()); @@ -167,6 +289,9 @@ void nvbit_at_init() { if (!spinlock_run_dir.empty()) { spinlock_run_dir += "/"; } + + // Parse the kernel ranges + parse_kernel_ranges_from_env(); } /** @@ -179,6 +304,7 @@ void nvbit_at_init() { void nvbit_at_term() { // Read the spinlock_run_PHASE dir under ctx_ and for each unique kernel name, // we will have a vector of kernel histograms + printf("Spinlock: Start to merge histograms from %s\n", spinlock_run_dir.c_str()); using HistogramMapByName = std::map>; HistogramMapByName map; @@ -192,6 +318,7 @@ void nvbit_at_term() { // Now we iterate the spinlock_run_PHASE dir under ctx_ folder std::string context_run_dir = folder.path().string() + "/spinlock_run_" + std::to_string(spinlock_phase); + DPRINTF("Spinlock: Read saved histograms from %s\n", context_run_dir.c_str()); // Build this histogram vector for this context for (auto& file : std::filesystem::directory_iterator(context_run_dir)) { @@ -201,11 +328,15 @@ void nvbit_at_term() { map[histogram->name].push_back(histogram); } } + + DPRINTF("Spinlock: Read %zu kernels from %s\n", map.size(), context_run_dir.c_str()); + } // Now, we merge all the histograms for each kernel name std::vector merged_histograms; size_t id = 0; + DPRINTF("Spinlock: Start to merge histograms\n"); for (auto& [kernel_name, histograms] : map) { KernelInstructionHistogram* merged_histogram = new KernelInstructionHistogram(); // Set the name to the kernel name @@ -218,6 +349,7 @@ void nvbit_at_term() { } merged_histograms.push_back(merged_histogram); } + 
DPRINTF("Spinlock: Merged %zu kernels\n", merged_histograms.size()); // For each merged histogram, save under spinlock_run_PHASE_merged dir std::string merged_run_dir = spinlock_run_dir + "spinlock_detection/spinlock_run_" + std::to_string(spinlock_phase) + "_merged"; @@ -228,6 +360,7 @@ void nvbit_at_term() { assert(false); } + DPRINTF("Spinlock: Start to save merged histograms to %s\n", merged_run_dir.c_str()); for (auto& histogram : merged_histograms) { histogram->saveToFile(merged_run_dir + "/kernel-" + std::to_string(histogram->id) + ".histogram"); } @@ -244,6 +377,7 @@ void nvbit_at_term() { // Check for spinlock if (spinlock_phase == SPINLOCK_PHASE_CHECK) { + DPRINTF("Spinlock: Start to check for spinlock\n"); spinlock_check(); } } @@ -346,16 +480,25 @@ static void enter_kernel_launch(CUcontext ctx, CUfunction func, assert(cudaGetLastError() == cudaSuccess); } + // Plus 1 since tracer_tool use 1-based kernel id + uint64_t kernel_id = grid_launch_id + 1; + std::string mangled_func_name = std::string(nvbit_get_func_name(ctx, func, true)); + // Initialize kernel instruction histogram map if (ctx_state->instr_histogram == nullptr) { - ctx_state->instr_histogram = new KernelInstructionHistogram(grid_launch_id, nvbit_get_func_name(ctx, func, true)); + ctx_state->instr_histogram = new KernelInstructionHistogram(kernel_id, mangled_func_name); } else { - ctx_state->instr_histogram->reinit(grid_launch_id, nvbit_get_func_name(ctx, func, true)); + ctx_state->instr_histogram->reinit(kernel_id, mangled_func_name); } /* instrument */ instrument_function_if_needed(ctx, func); + /* Determine if need to enable instrumentation */ + // Plus 1 since tracer_tool use 1-based kernel id + bool enable_instrumentation = should_trace_kernel(kernel_id, mangled_func_name); + bool disable_print = !enable_instrumentation; + int nregs = 0; CUDA_SAFECALL( cuFuncGetAttribute(&nregs, CU_FUNC_ATTRIBUTE_NUM_REGS, func)); @@ -379,29 +522,33 @@ static void enter_kernel_launch(CUcontext ctx, 
CUfunction func, if (cbid == API_CUDA_cuLaunchKernelEx_ptsz || cbid == API_CUDA_cuLaunchKernelEx) { cuLaunchKernelEx_params* p = (cuLaunchKernelEx_params*)params; - printf( - "Spinlock: CTX 0x%016lx - LAUNCH - Kernel pc 0x%016lx - " - "Kernel name %s - grid launch id %ld - grid size %d,%d,%d " - "- block size %d,%d,%d - nregs %d - shmem %d - cuda stream " - "id %ld\n", - (uint64_t)ctx, pc, func_name, grid_launch_id, - p->config->gridDimX, p->config->gridDimY, - p->config->gridDimZ, p->config->blockDimX, - p->config->blockDimY, p->config->blockDimZ, nregs, - shmem_static_nbytes + p->config->sharedMemBytes, - (uint64_t)p->config->hStream); + if (!disable_print) { + printf( + "Spinlock: CTX 0x%016lx - LAUNCH - Kernel pc 0x%016lx - " + "Kernel name %s - grid launch id %ld - grid size %d,%d,%d " + "- block size %d,%d,%d - nregs %d - shmem %d - cuda stream " + "id %ld\n", + (uint64_t)ctx, pc, func_name, grid_launch_id, + p->config->gridDimX, p->config->gridDimY, + p->config->gridDimZ, p->config->blockDimX, + p->config->blockDimY, p->config->blockDimZ, nregs, + shmem_static_nbytes + p->config->sharedMemBytes, + (uint64_t)p->config->hStream); + } } else { cuLaunchKernel_params* p = (cuLaunchKernel_params*)params; - printf( - "Spinlock: CTX 0x%016lx - LAUNCH - Kernel pc 0x%016lx - " - "Kernel name %s - grid launch id %ld - grid size %d,%d,%d " - "- block size %d,%d,%d - nregs %d - shmem %d - cuda stream " - "id %ld\n", - (uint64_t)ctx, pc, func_name, grid_launch_id, p->gridDimX, - p->gridDimY, p->gridDimZ, p->blockDimX, p->blockDimY, - p->blockDimZ, nregs, - shmem_static_nbytes + p->sharedMemBytes, - (uint64_t)p->hStream); + if (!disable_print) { + printf( + "Spinlock: CTX 0x%016lx - LAUNCH - Kernel pc 0x%016lx - " + "Kernel name %s - grid launch id %ld - grid size %d,%d,%d " + "- block size %d,%d,%d - nregs %d - shmem %d - cuda stream " + "id %ld\n", + (uint64_t)ctx, pc, func_name, grid_launch_id, p->gridDimX, + p->gridDimY, p->gridDimZ, p->blockDimX, p->blockDimY, + 
p->blockDimZ, nregs, + shmem_static_nbytes + p->sharedMemBytes, + (uint64_t)p->hStream); + } } // increment grid launch id for next launch @@ -410,8 +557,7 @@ static void enter_kernel_launch(CUcontext ctx, CUfunction func, grid_launch_id++; } - /* enable instrumented code to run */ - nvbit_enable_instrumented(ctx, func, true); + nvbit_enable_instrumented(ctx, func, enable_instrumentation); // Reset the kernel receiving done flag for new kernel launch ctx_state->kernel_receiving_done = false; @@ -450,8 +596,13 @@ static void leave_kernel_launch(CTXstate *ctx_state, uint64_t &grid_launch_id) { } // Save the histogram to file in form of kernel-.histogram - bool success = ctx_state->instr_histogram->saveToFile( folder_name + "/" + "kernel-" + std::to_string(ctx_state->instr_histogram->id) + ".histogram"); - assert(success); + // if we have specified to trace this kernel + uint64_t kernel_id = ctx_state->instr_histogram->id; + bool enable_save = should_trace_kernel(kernel_id, ctx_state->instr_histogram->name); + if (enable_save) { + bool success = ctx_state->instr_histogram->saveToFile( folder_name + "/" + "kernel-" + std::to_string(kernel_id) + ".histogram"); + assert(success); + } } void nvbit_at_cuda_event(CUcontext ctx, int is_exit, nvbit_api_cuda_t cbid,