diff --git a/util/tracer_nvbit/.gitignore b/util/tracer_nvbit/.gitignore index c1adccf67..5545e9e6d 100644 --- a/util/tracer_nvbit/.gitignore +++ b/util/tracer_nvbit/.gitignore @@ -5,3 +5,5 @@ tracer_tool/tracer_tool.o tracer_tool/tracer_tool.so tracer_tool/inject_funcs.o tracer_tool/traces-processing/post-traces-processing +traceDsm +traceAsm \ No newline at end of file diff --git a/util/tracer_nvbit/tracer_tool/Makefile b/util/tracer_nvbit/tracer_tool/Makefile index 950639750..7096821c9 100644 --- a/util/tracer_nvbit/tracer_tool/Makefile +++ b/util/tracer_nvbit/tracer_tool/Makefile @@ -43,16 +43,18 @@ current_dir := $(notdir $(patsubst %/,%,$(dir $(mkfile_path)))) NVBIT_TOOL=$(current_dir).so -all: $(NVBIT_TOOL) +all: $(NVBIT_TOOL) traceDsm $(NVBIT_TOOL): $(OBJECTS) $(NVBIT_PATH)/libnvbit.a $(NVCC) -arch=$(ARCH) -O3 $(OBJECTS) $(LIBS) $(NVCC_PATH) -lcuda -lcudart_static -shared -o $@ %.o: %.cu common.h - $(NVCC) -dc -c -std=c++11 $(INCLUDES) -Xptxas -cloning=no -Xcompiler -Wall -arch=$(ARCH) -O3 -Xcompiler -fPIC $< -o $@ + $(NVCC) -dc -c -std=c++11 $(INCLUDES) -Xptxas -cloning=no -Xcompiler -Wall -Xcompiler -Wno-unused-result -arch=$(ARCH) -O3 -Xcompiler -fPIC $< -o $@ inject_funcs.o: inject_funcs.cu common.h - $(NVCC) $(INCLUDES) $(MAXRREGCOUNT_FLAG) -Xptxas -astoolspatch --keep-device-functions -arch=$(ARCH) -Xcompiler -Wall -Xcompiler -fPIC -c $< -o $@ + $(NVCC) $(INCLUDES) $(MAXRREGCOUNT_FLAG) -Xptxas -astoolspatch --keep-device-functions -arch=$(ARCH) -Xcompiler -Wall -Xcompiler -Wno-unused-result -Xcompiler -fPIC -c $< -o $@ +traceDsm: traceDsm.cpp common.h + $(CXX) -std=c++17 -O3 -g -Wno-unused-result -o $@ $^ clean: rm -f *.so *.o diff --git a/util/tracer_nvbit/tracer_tool/common.h b/util/tracer_nvbit/tracer_tool/common.h index 7eff0feb7..300d7e70f 100644 --- a/util/tracer_nvbit/tracer_tool/common.h +++ b/util/tracer_nvbit/tracer_tool/common.h @@ -1,26 +1,51 @@ /* Author1: Mahmoud Khairy, abdallm@purdue.com - 2019 */ /* Author2: Jason Shen, 
shen203@purdue.edu - 2019 */ -#include +#ifndef COMMON_H +#define COMMON_H -static __managed__ uint64_t total_dynamic_instr_counter = 0; -static __managed__ uint64_t reported_dynamic_instr_counter = 0; -static __managed__ bool stop_report = false; +#include +#include +#include +#include +#include +#include /* information collected in the instrumentation function and passed * on the channel from the GPU to the CPU */ #define MAX_SRC 5 +#define MAX_OPCODE_LENGTH 32 + +typedef enum : uint8_t { + INST_BASE = 0, + INST_FLAT, + INST_DELTA, + INST_STRIDE, +} inst_type_t; typedef struct { - int cta_id_x; - int cta_id_y; - int cta_id_z; - int warpid_tb; - int warpid_sm; - int sm_id; - int opcode_id; - uint64_t addrs[32]; - uint32_t line_num; + unsigned kernel_id; + unsigned grid_dim_x; + unsigned grid_dim_y; + unsigned grid_dim_z; + unsigned block_dim_x; + unsigned block_dim_y; + unsigned block_dim_z; + unsigned shared_mem_bytes; + unsigned shmem; + unsigned nregs; + unsigned binary_version; + uint64_t cuda_stream_id; + uint64_t shmem_base_addr; + uint64_t local_mem_base_addr; + char nvbit_version[10]; + char accelsim_tracer_version[10]; + bool enable_lineinfo; +} kernel_header; + +typedef struct { + char opcode[MAX_OPCODE_LENGTH]; + uint32_t vpc; bool is_mem; int32_t GPRDst; @@ -30,4 +55,168 @@ typedef struct { uint32_t active_mask; uint32_t predicate_mask; uint64_t imm; +} sim_inst_trace_t; + +typedef struct { + sim_inst_trace_t base; + uint64_t addrs[32]; +} sim_inst_trace_flat_t; + +typedef struct { + sim_inst_trace_t base; + + uint64_t base_addr; + int32_t delta[32]; +} sim_inst_trace_delta_t; + +typedef struct { + sim_inst_trace_t base; + uint64_t base_addr; + int32_t stride; +} sim_inst_trace_stride_t; + +typedef struct { + sim_inst_trace_t base; + + int cta_id_x; + int cta_id_y; + int cta_id_z; + int warpid_tb; + int warpid_sm; + int sm_id; + int opcode_id; + uint32_t line_num; + uint64_t addrs[32]; } inst_trace_t; + +typedef union { + sim_inst_trace_t 
sim_inst_base; + sim_inst_trace_delta_t sim_inst_delta; + sim_inst_trace_flat_t sim_inst_flat; + sim_inst_trace_stride_t sim_inst_stride; + +} sim_inst_u; + +// unsigned get_inst_size(inst_type_t type) { +// switch (type) { +// case INST_BASE: +// return sizeof(sim_inst_trace_t); +// case INST_FLAT: +// return sizeof(sim_inst_trace_flat_t); +// case INST_DELTA: +// return sizeof(sim_inst_trace_delta_t); +// case INST_STRIDE: +// return sizeof(sim_inst_trace_stride_t); +// default: +// assert(0); +// exit(1); +// } +// } + +inline bool hasEnding(const std::string &fullString, + const std::string &ending) { + if (fullString.length() >= ending.length()) { + return (0 == fullString.compare(fullString.length() - ending.length(), + ending.length(), ending)); + } + return false; +} + +/** + * Opens a file for reading, automatically handling .xz decompression + * @param filepath Path to the file to open; must end in .xz, .trace or .traceg + * @return FILE pointer to the opened file, or to a popen() pipe for .xz input; never nullptr + * @throws std::runtime_error if file type is unsupported or opening fails + */ +inline FILE *openFileForReading(const std::string &filepath) { + FILE *file = nullptr; + + if (hasEnding(filepath, ".xz")) { + // Use xz command to decompress .xz files (filepath reaches the shell unquoted) + std::string command = "xz -dc " + filepath; + file = popen(command.c_str(), "r"); + } else if (hasEnding(filepath, ".trace") || hasEnding(filepath, ".traceg")) { + // Use fopen for regular trace files + file = fopen(filepath.c_str(), "rb"); + } else { + throw std::runtime_error("Unsupported file type: " + filepath); + } + + if (!file) { + throw std::runtime_error("Failed to open file for reading: " + filepath); + } + + return file; +} + +/** + * Opens a file for writing, automatically handling .xz compression + * @param filepath Output path; ".xz" is appended when use_xz_compression is true + * @param use_xz_compression Whether to compress with xz + * @return FILE pointer to the opened file, or to a popen() pipe when compressing; never nullptr + * @throws std::runtime_error if opening fails + */ +inline FILE 
*openFileForWriting(const std::string &filepath, + bool use_xz_compression = false) { + FILE *file = nullptr; + + if (use_xz_compression) { + std::string command = "xz -1 -T0 > " + filepath + ".xz"; + file = popen(command.c_str(), "w"); + if (!file) { + throw std::runtime_error( + "Failed to open xz compression pipe for: " + filepath + ".xz"); + } + } else { + file = fopen(filepath.c_str(), "wb"); + if (!file) { + throw std::runtime_error("Failed to open file for writing: " + filepath); + } + } + + return file; +} + +/** + * Opens a file for writing with xz compression using a command buffer + * @param base_filepath Output path without the ".xz" suffix (it is appended here) + * @param cmd_buffer Buffer that receives the formatted xz shell command + * @param buffer_size Size of the command buffer in bytes + * @return FILE pointer to the opened popen() pipe; never nullptr (failures throw) + * @throws std::runtime_error if buffer is too small or opening fails + */ +inline FILE *openFileForWritingXz(const std::string &base_filepath, + char *cmd_buffer, size_t buffer_size) { + if (snprintf(cmd_buffer, buffer_size, "xz -1 -T0 > %s.xz", + base_filepath.c_str()) >= (int)buffer_size) { + throw std::runtime_error("Command buffer too small for xz compression"); + } + + FILE *file = popen(cmd_buffer, "w"); + if (!file) { + throw std::runtime_error( + "Failed to open xz compression pipe for: " + base_filepath + ".xz"); + } + + return file; +} + +/** + * Generates output filepath based on input filepath and desired extension + * @param input_filepath Input file path + * @param new_extension Text to append; a dot is inserted only when the input does not end in ".xz" + * @return Input minus ".xz" plus new_extension, or input + "." + new_extension otherwise + */ +inline std::string generateOutputFilepath(const std::string &input_filepath, + const std::string &new_extension) { + if (hasEnding(input_filepath, ".xz")) { + // Remove .xz and append new_extension directly (no dot is added) + return input_filepath.substr(0, input_filepath.find_last_of(".")) + + new_extension; + } else { + // Append "." and new_extension to the existing path + return input_filepath + "." 
+ new_extension; + } +} + +#endif \ No newline at end of file diff --git a/util/tracer_nvbit/tracer_tool/inject_funcs.cu b/util/tracer_nvbit/tracer_tool/inject_funcs.cu index dfc3f8769..b697acaef 100644 --- a/util/tracer_nvbit/tracer_tool/inject_funcs.cu +++ b/util/tracer_nvbit/tracer_tool/inject_funcs.cu @@ -38,17 +38,17 @@ instrument_inst(int pred, int opcode_id, int32_t vpc, bool is_mem, } } - inst_trace_t ma; + inst_trace_t ma = {0}; if (is_mem) { /* collect memory address information */ for (int i = 0; i < 32; i++) { ma.addrs[i] = __shfl_sync(active_mask, addr, i); } - ma.width = width; - ma.is_mem = true; + ma.base.width = width; + ma.base.is_mem = true; } else { - ma.is_mem = false; + ma.base.is_mem = false; } int4 cta = get_ctaid(); @@ -62,17 +62,17 @@ instrument_inst(int pred, int opcode_id, int32_t vpc, bool is_mem, ma.cta_id_z = cta.z; ma.warpid_sm = get_warpid(); ma.opcode_id = opcode_id; - ma.vpc = vpc; - ma.GPRDst = desReg; - ma.GPRSrcs[0] = srcReg1; - ma.GPRSrcs[1] = srcReg2; - ma.GPRSrcs[2] = srcReg3; - ma.GPRSrcs[3] = srcReg4; - ma.GPRSrcs[4] = srcReg5; - ma.numSrcs = srcNum; - ma.imm = immediate; - ma.active_mask = active_mask; - ma.predicate_mask = predicate_mask; + ma.base.vpc = vpc; + ma.base.GPRDst = desReg; + ma.base.GPRSrcs[0] = srcReg1; + ma.base.GPRSrcs[1] = srcReg2; + ma.base.GPRSrcs[2] = srcReg3; + ma.base.GPRSrcs[3] = srcReg4; + ma.base.GPRSrcs[4] = srcReg5; + ma.base.numSrcs = srcNum; + ma.base.imm = immediate; + ma.base.active_mask = active_mask; + ma.base.predicate_mask = predicate_mask; ma.sm_id = get_smid(); /* first active lane pushes information on the channel */ diff --git a/util/tracer_nvbit/tracer_tool/traceDsm.cpp b/util/tracer_nvbit/tracer_tool/traceDsm.cpp new file mode 100644 index 000000000..7e98a8f21 --- /dev/null +++ b/util/tracer_nvbit/tracer_tool/traceDsm.cpp @@ -0,0 +1,195 @@ +#include "common.h" +#include +#include +#include +#include + +int main(int argc, char *argv[]) { + if (argc != 2) { + printf("Usage: %s \n", 
argv[0]); + return 1; + } + + std::string filepath = argv[1]; + FILE *file; + FILE *output_file; + + try { + // Use utility function to open input file + file = openFileForReading(filepath); + + // Open output file + std::string output_filepath = filepath + ".txt"; + output_file = openFileForWriting(output_filepath, false); + } catch (const std::runtime_error &e) { + fprintf(stderr, "Error: %s\n", e.what()); + return 1; + } + + // Read the kernel name: a uint64_t byte count followed by that many characters + std::string kernel_name; + uint64_t name_size; + fread(&name_size, sizeof(uint64_t), 1, file); + + kernel_name.resize(name_size); + fread(kernel_name.data(), name_size, 1, file); + + // Read the fixed-size kernel_header record + kernel_header header; + fread(&header, sizeof(kernel_header), 1, file); + + fprintf(output_file, "-kernel name = %s\n", kernel_name.c_str()); + fprintf(output_file, "-kernel id = %d\n", header.kernel_id); + fprintf(output_file, "-grid dim = (%d,%d,%d)\n", header.grid_dim_x, + header.grid_dim_y, header.grid_dim_z); + fprintf(output_file, "-block dim = (%d,%d,%d)\n", header.block_dim_x, + header.block_dim_y, header.block_dim_z); + fprintf(output_file, "-shmem = %d\n", header.shared_mem_bytes); + fprintf(output_file, "-nregs = %d\n", header.nregs); + fprintf(output_file, "-binary version = %d\n", header.binary_version); + fprintf(output_file, "-cuda stream id = %lu\n", header.cuda_stream_id); + fprintf(output_file, "-shmem base_addr = 0x%016lx\n", header.shmem_base_addr); + fprintf(output_file, "-local mem base_addr = 0x%016lx\n", + header.local_mem_base_addr); + fprintf(output_file, "-nvbit version = %s\n", header.nvbit_version); + fprintf(output_file, "-accelsim tracer version = %s\n", + header.accelsim_tracer_version); + fprintf(output_file, "-enable lineinfo = %d\n", header.enable_lineinfo); + + // Read the traces: each thread block record starts with its warp count + unsigned tot_warp_in_id; + unsigned tb_count = 0; + while (fread(&tot_warp_in_id, sizeof(unsigned), 1, file) == 1) { + // TB + fprintf(output_file, "#BEGIN_TB\n"); + fprintf(output_file, 
"thread block = %d,%d,%d\n", tb_count, 0, 0); + + unsigned read_tb = 0; + while (read_tb < tot_warp_in_id) { + // WARP: an instruction count precedes the warp's records + unsigned num_insts; + fread(&num_insts, sizeof(unsigned), 1, file); + fprintf(output_file, "warp = %d\n", read_tb); + fprintf(output_file, "insts = %d\n", num_insts); + + unsigned read_inst = 0; + while (read_inst < num_insts) { + // INST: a type tag selects which record layout is read next + sim_inst_u full_inst; + inst_type_t inst_type; + fread(&inst_type, sizeof(inst_type), 1, file); + unsigned size; + switch (inst_type) { + case INST_BASE: + size = sizeof(sim_inst_trace_t); + fread(&full_inst.sim_inst_base, size, 1, file); + break; + case INST_FLAT: + size = sizeof(sim_inst_trace_flat_t); + fread(&full_inst.sim_inst_flat, size, 1, file); + break; + case INST_DELTA: + size = sizeof(sim_inst_trace_delta_t); + fread(&full_inst.sim_inst_delta, size, 1, file); + break; + case INST_STRIDE: + size = sizeof(sim_inst_trace_stride_t); + fread(&full_inst.sim_inst_stride, size, 1, file); + break; + default: + assert(0); + exit(1); + } + + sim_inst_trace_t inst = full_inst.sim_inst_base; + // print VPC + fprintf(output_file, "%04x ", inst.vpc); + + // print active mask ANDed with the predicate mask + fprintf(output_file, "%08x ", inst.active_mask & inst.predicate_mask); + + // print GPRDst + if (inst.GPRDst >= 0) { + fprintf(output_file, "1 R%d ", inst.GPRDst); + } else { + fprintf(output_file, "0 "); + } + + // print opcode + fprintf(output_file, "%s ", inst.opcode); + + // print src count + fprintf(output_file, "%d ", inst.numSrcs); + + // print GPRSrcs (negative entries are skipped) + for (int i = 0; i < MAX_SRC; i++) { + if (inst.GPRSrcs[i] >= 0) { + fprintf(output_file, "R%d ", inst.GPRSrcs[i]); + } + } + + // print width + fprintf(output_file, "%d ", inst.width); + + // print memory addresses (memory instructions only) + if (inst.is_mem) { + std::bitset<32> mask(inst.active_mask & inst.predicate_mask); + if (inst_type == INST_FLAT) { + // uncompressed record: tag 0, then one address per active lane + fprintf(output_file, "0 "); + + for (int i = 0; i < 32; i++) { + if (mask[i]) { + fprintf(output_file, "0x%llx ", + (unsigned 
long long)full_inst.sim_inst_flat.addrs[i]); + } + } + } else if (inst_type == INST_DELTA) { + // base+delta record: tag 2, base address, then base+delta[i] per active lane + fprintf(output_file, "2 "); + fprintf(output_file, "0x%llx ", + (unsigned long long)full_inst.sim_inst_delta.base_addr); + + for (int i = 0; i < 32; i++) { + if (mask[i]) { + fprintf(output_file, "%llx ", + (unsigned long long)full_inst.sim_inst_delta.base_addr + + full_inst.sim_inst_delta.delta[i]); + } + } + } else if (inst_type == INST_STRIDE) { + // base+stride record: tag 1, base address, then base+stride*i per active lane + fprintf(output_file, "1 "); + fprintf(output_file, "0x%llx ", + (unsigned long long)full_inst.sim_inst_stride.base_addr); + + for (int i = 0; i < 32; i++) { + if (mask[i]) { + fprintf( + output_file, "%llx ", + (unsigned long long)full_inst.sim_inst_stride.base_addr + + full_inst.sim_inst_stride.stride * i); + } + } + } + } + + // print imm (NOTE(review): imm is uint64_t but is formatted with %ld) + fprintf(output_file, "%ld ", inst.imm); + + // newline + fprintf(output_file, "\n"); + + read_inst++; + } + read_tb++; + } + fprintf(output_file, "#END_TB\n"); + tb_count++; + } + + fclose(file); + fclose(output_file); + + return 0; +} \ No newline at end of file diff --git a/util/tracer_nvbit/tracer_tool/tracer_tool.cu b/util/tracer_nvbit/tracer_tool/tracer_tool.cu index f4649c5ee..73995eabf 100644 --- a/util/tracer_nvbit/tracer_tool/tracer_tool.cu +++ b/util/tracer_nvbit/tracer_tool/tracer_tool.cu @@ -2,6 +2,8 @@ /* Author2: Jason Shen, shen203@purdue.edu - 2019 */ #include +#include +#include #include #include #include @@ -33,6 +35,10 @@ #define TRACER_VERSION "5" +static __managed__ uint64_t total_dynamic_instr_counter = 0; +static __managed__ uint64_t reported_dynamic_instr_counter = 0; +static __managed__ bool stop_report = false; + /* Channel used to communicate from GPU to CPU receiving thread */ #define CHANNEL_SIZE (1l << 20) static __managed__ ChannelDev channel_dev; @@ -83,6 +89,9 @@ std::unordered_map ctx_resultsFile; std::string kernel_ranges = ""; +std::unordered_map>> + tb_warp_inst_ct; + struct KernelRange { uint64_t 
start; uint64_t end; // UINT64_MAX means open-ended @@ -502,45 +511,57 @@ static void enter_kernel_launch(CUcontext ctx, CUfunction func, ctx_kernelid[ctx], ctx); if (!stop_report) { - if (!xz_compress_trace) { - ctx_resultsFile[ctx] = fopen(buffer, "w"); - printf("Writing results to %s\n", buffer); - } else { - char cmd_buffer[1039]; - sprintf(cmd_buffer, "xz -1 -T0 > %s.xz", buffer); - ctx_resultsFile[ctx] = popen(cmd_buffer, "w"); - printf("Writing results to %s.xz\n", buffer); + try { + if (!xz_compress_trace) { + ctx_resultsFile[ctx] = openFileForWriting(buffer, false); + printf("Writing results to %s\n", buffer); + } else { + char cmd_buffer[1039]; + ctx_resultsFile[ctx] = + openFileForWritingXz(buffer, cmd_buffer, sizeof(cmd_buffer)); + printf("Writing results to %s.xz\n", buffer); + } + } catch (const std::runtime_error &e) { + perror("Failed to open results file"); + fprintf(stderr, "Error: %s\n", e.what()); + exit(1); } - // Writing header information - fprintf(ctx_resultsFile[ctx], "-kernel name = %s\n", - nvbit_get_func_name(ctx, func, true)); - fprintf(ctx_resultsFile[ctx], "-kernel id = %d\n", ctx_kernelid[ctx]); - fprintf(ctx_resultsFile[ctx], "-grid dim = (%d,%d,%d)\n", gridDimX, - gridDimY, gridDimZ); - fprintf(ctx_resultsFile[ctx], "-block dim = (%d,%d,%d)\n", blockDimX, - blockDimY, blockDimZ); - fprintf(ctx_resultsFile[ctx], "-shmem = %d\n", - shmem_static_nbytes + sharedMemBytes); - fprintf(ctx_resultsFile[ctx], "-nregs = %d\n", nregs); - fprintf(ctx_resultsFile[ctx], "-binary version = %d\n", binary_version); - fprintf(ctx_resultsFile[ctx], "-cuda stream id = %lu\n", (uint64_t)hStream); - fprintf(ctx_resultsFile[ctx], "-shmem base_addr = 0x%016lx\n", - (uint64_t)nvbit_get_shmem_base_addr(ctx)); - fprintf(ctx_resultsFile[ctx], "-local mem base_addr = 0x%016lx\n", - (uint64_t)nvbit_get_local_mem_base_addr(ctx)); - fprintf(ctx_resultsFile[ctx], "-nvbit version = %s\n", NVBIT_VERSION); - fprintf(ctx_resultsFile[ctx], "-accelsim tracer version = 
%s\n", - TRACER_VERSION); - fprintf(ctx_resultsFile[ctx], "-enable lineinfo = %d\n", lineinfo); - fprintf(ctx_resultsFile[ctx], "\n"); - - fprintf(ctx_resultsFile[ctx], - "#traces format = [line_num] PC mask dest_num [reg_dests] " - "opcode src_num " - "[reg_srcs] mem_width [adrrescompress?] [mem_addresses] " - "immediate\n"); - fprintf(ctx_resultsFile[ctx], "\n"); + kernel_header header; + header.kernel_id = ctx_kernelid[ctx]; + header.grid_dim_x = gridDimX; + header.grid_dim_y = gridDimY; + header.grid_dim_z = gridDimZ; + header.block_dim_x = blockDimX; + header.block_dim_y = blockDimY; + header.block_dim_z = blockDimZ; + header.shared_mem_bytes = shmem_static_nbytes + sharedMemBytes; + header.nregs = nregs; + header.binary_version = binary_version; + header.cuda_stream_id = (uint64_t)hStream; + header.shmem_base_addr = (uint64_t)nvbit_get_shmem_base_addr(ctx); + header.local_mem_base_addr = (uint64_t)nvbit_get_local_mem_base_addr(ctx); + strcpy(header.nvbit_version, NVBIT_VERSION); + strcpy(header.accelsim_tracer_version, TRACER_VERSION); + header.enable_lineinfo = lineinfo; + + // write kernel name + std::string kernel_name = std::string(nvbit_get_func_name(ctx, func, true)); + // write name size + uint64_t name_size = kernel_name.size(); + fwrite(&name_size, sizeof(uint64_t), 1, ctx_resultsFile[ctx]); + fwrite(kernel_name.c_str(), kernel_name.size(), 1, ctx_resultsFile[ctx]); + printf("Kernel name: %s\n", kernel_name.c_str()); + fwrite(&header, sizeof(kernel_header), 1, ctx_resultsFile[ctx]); + + unsigned tot_tb = header.grid_dim_x * header.grid_dim_y * header.grid_dim_z; + unsigned tb_size = + header.block_dim_x * header.block_dim_y * header.block_dim_z; + tb_warp_inst_ct[ctx] = std::vector>(); + tb_warp_inst_ct[ctx].resize(tot_tb, std::vector(tb_size, 0)); + + unsigned tot_warp = tot_tb * tb_size; + unsigned print_0 = 0; } kernelsFile = fopen(ctx_kernelslist[ctx].c_str(), "a"); @@ -833,61 +854,6 @@ bool check_opcode_contain(const std::vector &opcode, return 
false; } -bool base_stride_compress(const uint64_t *addrs, const std::bitset<32> &mask, - uint64_t &base_addr, int &stride) { - // calulcate the difference between addresses - // write cosnsctive addresses with constant stride in a more - // compressed way (i.e. start adress and stride) - bool const_stride = true; - bool first_bit1_found = false; - bool last_bit1_found = false; - - for (int s = 0; s < 32; s++) { - if (mask.test(s) && !first_bit1_found) { - first_bit1_found = true; - base_addr = addrs[s]; - if (s < 31 && mask.test(s + 1)) - stride = addrs[s + 1] - addrs[s]; - else { - const_stride = false; - break; - } - } else if (first_bit1_found && !last_bit1_found) { - if (mask.test(s)) { - if (stride != addrs[s] - addrs[s - 1]) { - const_stride = false; - break; - } - } else - last_bit1_found = true; - } else if (last_bit1_found) { - if (mask.test(s)) { - const_stride = false; - break; - } - } - } - - return const_stride; -} - -void base_delta_compress(const uint64_t *addrs, const std::bitset<32> &mask, - uint64_t &base_addr, std::vector &deltas) { - // save the delta from the previous address - bool first_bit1_found = false; - uint64_t last_address = 0; - for (int s = 0; s < 32; s++) { - if (mask.test(s) && !first_bit1_found) { - base_addr = addrs[s]; - first_bit1_found = true; - last_address = addrs[s]; - } else if (mask.test(s) && first_bit1_found) { - deltas.push_back(addrs[s] - last_address); - last_address = addrs[s]; - } - } -} - void *recv_thread_fun(void *args) { CUcontext ctx = (CUcontext)args; char *recv_buffer = (char *)malloc(CHANNEL_SIZE); @@ -898,6 +864,9 @@ void *recv_thread_fun(void *args) { uint32_t num_processed_bytes = 0; while (num_processed_bytes < num_recv_bytes) { inst_trace_t *ma = (inst_trace_t *)&recv_buffer[num_processed_bytes]; + std::string opcode = id_to_opcode_map[ma->opcode_id]; + assert(opcode.size() <= MAX_OPCODE_LENGTH); + strcpy(ma->base.opcode, opcode.c_str()); /* when we get this cta_id_x it means the kernel has completed 
*/ @@ -906,94 +875,110 @@ void *recv_thread_fun(void *args) { break; } - fprintf(ctx_resultsFile[ctx], "%d ", ma->cta_id_x); - fprintf(ctx_resultsFile[ctx], "%d ", ma->cta_id_y); - fprintf(ctx_resultsFile[ctx], "%d ", ma->cta_id_z); - fprintf(ctx_resultsFile[ctx], "%d ", ma->warpid_tb); - if (print_core_id) { - fprintf(ctx_resultsFile[ctx], "%d ", ma->sm_id); - fprintf(ctx_resultsFile[ctx], "%d ", ma->warpid_sm); - } - if (lineinfo) { - fprintf(ctx_resultsFile[ctx], "%d ", ma->line_num); - } - fprintf(ctx_resultsFile[ctx], "%04x ", ma->vpc); // Print the virtual PC - fprintf(ctx_resultsFile[ctx], "%08x ", - ma->active_mask & ma->predicate_mask); - if (ma->GPRDst >= 0) { - fprintf(ctx_resultsFile[ctx], "1 "); - fprintf(ctx_resultsFile[ctx], "R%d ", ma->GPRDst); - } else - fprintf(ctx_resultsFile[ctx], "0 "); - - // Print the opcode. - fprintf(ctx_resultsFile[ctx], "%s ", - id_to_opcode_map[ma->opcode_id].c_str()); - unsigned src_count = 0; - for (int s = 0; s < MAX_SRC; s++) // GPR srcs count. - if (ma->GPRSrcs[s] >= 0) - src_count++; - fprintf(ctx_resultsFile[ctx], "%d ", src_count); - - for (int s = 0; s < MAX_SRC; s++) // GPR srcs. 
- if (ma->GPRSrcs[s] >= 0) - fprintf(ctx_resultsFile[ctx], "R%d ", ma->GPRSrcs[s]); - - // print addresses - std::bitset<32> mask(ma->active_mask & ma->predicate_mask); - if (ma->is_mem) { - std::istringstream iss(id_to_opcode_map[ma->opcode_id]); - std::vector tokens; - std::string token; - while (std::getline(iss, token, '.')) { - if (!token.empty()) - tokens.push_back(token); - } - fprintf(ctx_resultsFile[ctx], "%d ", - get_datawidth_from_opcode(tokens)); - - bool base_stride_success = false; - uint64_t base_addr = 0; - int stride = 0; - std::vector deltas; - - if (enable_compress) { - // try base+stride format - base_stride_success = - base_stride_compress(ma->addrs, mask, base_addr, stride); - if (!base_stride_success) { - // if base+stride fails, try base+delta format - base_delta_compress(ma->addrs, mask, base_addr, deltas); - } - } - - if (base_stride_success && enable_compress) { - // base + stride format - fprintf(ctx_resultsFile[ctx], "%u 0x%llx %d ", - address_format::base_stride, base_addr, stride); - } else if (!base_stride_success && enable_compress) { - // base + delta format - fprintf(ctx_resultsFile[ctx], "%u 0x%llx ", - address_format::base_delta, base_addr); - for (int s = 0; s < deltas.size(); s++) { - fprintf(ctx_resultsFile[ctx], "%lld ", deltas[s]); - } - } else { - // list all the addresses - fprintf(ctx_resultsFile[ctx], "%u ", address_format::list_all); - for (int s = 0; s < 32; s++) { - if (mask.test(s)) - fprintf(ctx_resultsFile[ctx], "0x%016lx ", ma->addrs[s]); - } - } - } else { - fprintf(ctx_resultsFile[ctx], "0 "); + // Write the inst_trace_t structure as binary data to the file + unsigned size = sizeof(inst_trace_t); + if (!ma->base.is_mem) { + // write only the part without addrs + size = offsetof(inst_trace_t, addrs); } - - // Print the immediate - fprintf(ctx_resultsFile[ctx], "%d ", ma->imm); - - fprintf(ctx_resultsFile[ctx], "\n"); + fwrite(&size, sizeof(unsigned), 1, ctx_resultsFile[ctx]); + fwrite(ma, size, 1, 
ctx_resultsFile[ctx]); + // unsigned tb_id_x = ma->cta_id_x; + // unsigned tb_id_y = ma->cta_id_y; + // unsigned tb_id_z = ma->cta_id_z; + // unsigned tb_id = tb_id_z * header.grid_dim_y * header.grid_dim_x + + // tb_id_y * header.grid_dim_x + tb_id_x; + // unsigned warp_id = ma->warpid_tb; + // tb_warp_inst_ct[ctx][tb_id][warp_id]++; + + // fprintf(ctx_resultsFile[ctx], "%d ", ma->cta_id_x); + // fprintf(ctx_resultsFile[ctx], "%d ", ma->cta_id_y); + // fprintf(ctx_resultsFile[ctx], "%d ", ma->cta_id_z); + // fprintf(ctx_resultsFile[ctx], "%d ", ma->warpid_tb); + // if (print_core_id) { + // fprintf(ctx_resultsFile[ctx], "%d ", ma->sm_id); + // fprintf(ctx_resultsFile[ctx], "%d ", ma->warpid_sm); + // } + // if (lineinfo) { + // fprintf(ctx_resultsFile[ctx], "%d ", ma->line_num); + // } + // fprintf(ctx_resultsFile[ctx], "%04x ", ma->vpc); // Print the + // virtual PC fprintf(ctx_resultsFile[ctx], "%08x ", + // ma->active_mask & ma->predicate_mask); + // if (ma->GPRDst >= 0) { + // fprintf(ctx_resultsFile[ctx], "1 "); + // fprintf(ctx_resultsFile[ctx], "R%d ", ma->GPRDst); + // } else + // fprintf(ctx_resultsFile[ctx], "0 "); + + // // Print the opcode. + // fprintf(ctx_resultsFile[ctx], "%s ", + // id_to_opcode_map[ma->opcode_id].c_str()); + // unsigned src_count = 0; + // for (int s = 0; s < MAX_SRC; s++) // GPR srcs count. + // if (ma->GPRSrcs[s] >= 0) + // src_count++; + // fprintf(ctx_resultsFile[ctx], "%d ", src_count); + + // for (int s = 0; s < MAX_SRC; s++) // GPR srcs. 
+ // if (ma->GPRSrcs[s] >= 0) + // fprintf(ctx_resultsFile[ctx], "R%d ", ma->GPRSrcs[s]); + + // // print addresses + // std::bitset<32> mask(ma->active_mask & ma->predicate_mask); + // if (ma->is_mem) { + // std::istringstream iss(id_to_opcode_map[ma->opcode_id]); + // std::vector tokens; + // std::string token; + // while (std::getline(iss, token, '.')) { + // if (!token.empty()) + // tokens.push_back(token); + // } + // fprintf(ctx_resultsFile[ctx], "%d ", + // get_datawidth_from_opcode(tokens)); + + // bool base_stride_success = false; + // uint64_t base_addr = 0; + // int stride = 0; + // std::vector deltas; + + // if (enable_compress) { + // // try base+stride format + // base_stride_success = + // base_stride_compress(ma->addrs, mask, base_addr, stride); + // if (!base_stride_success) { + // // if base+stride fails, try base+delta format + // base_delta_compress(ma->addrs, mask, base_addr, deltas); + // } + // } + + // if (base_stride_success && enable_compress) { + // // base + stride format + // fprintf(ctx_resultsFile[ctx], "%u 0x%llx %d ", + // address_format::base_stride, base_addr, stride); + // } else if (!base_stride_success && enable_compress) { + // // base + delta format + // fprintf(ctx_resultsFile[ctx], "%u 0x%llx ", + // address_format::base_delta, base_addr); + // for (int s = 0; s < deltas.size(); s++) { + // fprintf(ctx_resultsFile[ctx], "%lld ", deltas[s]); + // } + // } else { + // // list all the addresses + // fprintf(ctx_resultsFile[ctx], "%u ", + // address_format::list_all); for (int s = 0; s < 32; s++) { + // if (mask.test(s)) + // fprintf(ctx_resultsFile[ctx], "0x%016lx ", ma->addrs[s]); + // } + // } + // } else { + // fprintf(ctx_resultsFile[ctx], "0 "); + // } + + // // Print the immediate + // fprintf(ctx_resultsFile[ctx], "%d ", ma->imm); + + // fprintf(ctx_resultsFile[ctx], "\n"); num_processed_bytes += sizeof(inst_trace_t); } diff --git a/util/tracer_nvbit/tracer_tool/traces-processing/Makefile 
b/util/tracer_nvbit/tracer_tool/traces-processing/Makefile index 899a95329..acdcd54d5 100755 --- a/util/tracer_nvbit/tracer_tool/traces-processing/Makefile +++ b/util/tracer_nvbit/tracer_tool/traces-processing/Makefile @@ -1,7 +1,7 @@ TARGET := post-traces-processing $(TARGET): post-traces-processing.cpp - g++ -std=c++17 -O3 -g -o $@ $^ + g++ -std=c++17 -O3 -g -Wno-unused-result -o $@ $^ run: $(TARGET) ./$(TARGET) diff --git a/util/tracer_nvbit/tracer_tool/traces-processing/post-traces-processing.cpp b/util/tracer_nvbit/tracer_tool/traces-processing/post-traces-processing.cpp index cb71f5d45..ed1ade8a7 100644 --- a/util/tracer_nvbit/tracer_tool/traces-processing/post-traces-processing.cpp +++ b/util/tracer_nvbit/tracer_tool/traces-processing/post-traces-processing.cpp @@ -1,3 +1,5 @@ +#include +#include #include #include #include @@ -9,6 +11,7 @@ #include #include +#include "../common.h" #include #include #include @@ -18,13 +21,7 @@ using namespace std; struct threadblock_info { - bool initialized; - unsigned tb_id_x, tb_id_y, tb_id_z; - vector> warp_insts_array; - threadblock_info() { - initialized = false; - tb_id_x = tb_id_y = tb_id_z = 0; - } + vector> warp_insts_array; }; /// @brief There exist significant repetition in the trace. The WarpInstLUT @@ -81,13 +78,78 @@ struct WarpInstLUT { void group_per_block(const char *filepath); void group_per_core(const char *filepath); -// This program works by redirecting the stdin/stdout to child processes. The -// stdin is piped to a process that reads from disk the input trace file. The -// stdout is piped to a process that writes to disk the post-process trace. We -// should preserve the original file descriptors for stdin/stdout before doing -// redirections. 
-int preserved_stdin_fileno; -int preserved_stdout_fileno; +bool base_stride_compress(const uint64_t *addrs, const std::bitset<32> &mask, + uint64_t &base_addr, int &stride) { + // calculate the difference between addresses + // write consecutive addresses with constant stride in a more + // compressed way (i.e. start address and stride); needs one contiguous run of 2+ active lanes + bool const_stride = true; + bool first_bit1_found = false; + bool last_bit1_found = false; + + for (int s = 0; s < 32; s++) { + if (mask.test(s) && !first_bit1_found) { + first_bit1_found = true; + base_addr = addrs[s]; + if (s < 31 && mask.test(s + 1)) + stride = addrs[s + 1] - addrs[s]; + else { + const_stride = false; + break; + } + } else if (first_bit1_found && !last_bit1_found) { + if (mask.test(s)) { + if (stride != addrs[s] - addrs[s - 1]) { + const_stride = false; + break; + } + } else + last_bit1_found = true; + } else if (last_bit1_found) { + if (mask.test(s)) { + const_stride = false; + break; + } + } + } + + return const_stride; +} + +bool base_delta_compress(const uint64_t *addrs, const std::bitset<32> &mask, + uint64_t &base_addr, std::vector &deltas) { + // save the delta from the previous active address; deltas may be partially filled when false is returned + bool first_bit1_found = false; + uint64_t last_address = 0; + for (int s = 0; s < 32; s++) { + if (mask.test(s) && !first_bit1_found) { + base_addr = addrs[s]; + first_bit1_found = true; + last_address = addrs[s]; + } else if (mask.test(s) && first_bit1_found) { + // Check if delta can fit into int32_t + uint64_t delta; + if (addrs[s] >= last_address) { + delta = addrs[s] - last_address; + if (delta > INT32_MAX) { + // Overflow detected - return false + return false; + } + } else { + delta = last_address - addrs[s]; + if (delta > (uint64_t)INT32_MAX) { + // Overflow detected - return false + return false; + } + } + + // Delta fits in int32_t, add it to the vector + deltas.push_back(addrs[s] - last_address); + last_address = addrs[s]; + } + } + return true; // Success +} std::vector kernelslist_list; 
////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -184,122 +246,173 @@ int main(int argc, char **argv) { // stderr stream. The io redirection will be restored by the time the function // returns. void group_per_block(const char *filepath) { - preserved_stdin_fileno = dup(STDIN_FILENO); - preserved_stdout_fileno = dup(STDOUT_FILENO); - - string filepath_str{filepath}; - WarpInstLUT warp_inst_lut; - - pid_t sink_process_pid = 0; - string trace_sink_cmd; - int sink_pipe_fd[2]; - - pid_t source_process_pid = 0; - string trace_source_cmd; - int source_pipe_fd[2]; string output_filepath; - - bool input_file_is_xz = false; - int _l = filepath_str.length(); - if (_l > 3 && filepath_str.substr(_l - 3, 3) == ".xz") { - // kernel-1.trace.xz --(xz -dc)--> f --(xz -1 -T0)--> kernel-1.traceg.xz - input_file_is_xz = true; - output_filepath = filepath_str.substr(0, _l - 3) + "g.xz"; - trace_source_cmd = "xz -dc " + filepath_str; - trace_sink_cmd = "xz -1 -T0 > " + output_filepath; - } else if (_l > 6 && filepath_str.substr(_l - 6, 6) == ".trace") { - // kernel-2.trace --(cat)--> f --(cat)--> kernel-2.traceg - input_file_is_xz = false; - output_filepath = filepath_str + "g"; - trace_source_cmd = "cat " + filepath_str; - trace_sink_cmd = "cat > " + output_filepath; - } else { - cerr << "Only support xz or raw text format. 
Unable to process - and " - "skipping - trace file " - << filepath_str << endl; - close(preserved_stdin_fileno); - close(preserved_stdout_fileno); - return; + // Open the pipe + FILE *pipe; + FILE *kernel_out; + + try { + // Use utility function to open input file + pipe = openFileForReading(filepath); + + // Generate output filepath and open output file + if (hasEnding(filepath, ".xz")) { + output_filepath = generateOutputFilepath(filepath, "g"); + kernel_out = + openFileForWriting(output_filepath, true); // Use xz compression + } else if (hasEnding(filepath, ".trace")) { + output_filepath = string(filepath) + "g"; + kernel_out = openFileForWriting(output_filepath, false); // No compression + } else { + throw std::runtime_error("Unsupported file type!"); + } + } catch (const std::runtime_error &e) { + throw std::runtime_error("Failed to open files: " + string(e.what())); } - // cerr << "source cmd is "< 0) { - // parent process - the trace post processor - // stdin is now redirected to the read end of the source_pipe - close(source_pipe_fd[1]); - int r = dup2(source_pipe_fd[0], STDIN_FILENO); - } else { - cerr << "Failed to fork data source process\n"; - perror("fork"); - exit(1); + vector insts; + + // Read the kernel header + std::string kernel_name; + uint64_t name_size; + fread(&name_size, sizeof(name_size), 1, pipe); + kernel_name.resize(name_size); + fread(kernel_name.data(), name_size, 1, pipe); + + // Read the kernel header + kernel_header header; + fread(&header, sizeof(header), 1, pipe); + + insts.resize(header.grid_dim_x * header.grid_dim_y * header.grid_dim_z); + vector> ldgsts_flags(header.grid_dim_x * header.grid_dim_y * + header.grid_dim_z); + + for (unsigned tb = 0; tb < insts.size(); ++tb) { + insts[tb].warp_insts_array.resize(ceil( + float(header.block_dim_x * header.block_dim_y * header.block_dim_z) / + 32)); + + ldgsts_flags[tb].resize(ceil( + float(header.block_dim_x * header.block_dim_y * header.block_dim_z) / + 32)); + for (unsigned j = 0; j 
< ldgsts_flags[tb].size(); j++) { + ldgsts_flags[tb][j] = true; + } } - // fork a child process as the trace sink - if (pipe(sink_pipe_fd) != 0) { - cerr << "Failed to create pipe\n"; - perror("pipe"); - exit(1); + unsigned size; + while (fread(&size, sizeof(unsigned), 1, pipe)) { + inst_trace_t inst = {0}; + fread(&inst, size, 1, pipe); + + unsigned tb_id_x = inst.cta_id_x; + unsigned tb_id_y = inst.cta_id_y; + unsigned tb_id_z = inst.cta_id_z; + unsigned tb_id = tb_id_z * header.grid_dim_y * header.grid_dim_x + + tb_id_y * header.grid_dim_x + tb_id_x; + unsigned warp_id = inst.warpid_tb; + + std::string opcode = inst.base.opcode; + if (opcode.find("LDGSTS") != string::npos) { + if (!ldgsts_flags[tb_id][warp_id]) { + insts[tb_id].warp_insts_array[warp_id].push_back(inst); + } + ldgsts_flags[tb_id][warp_id] = !ldgsts_flags[tb_id][warp_id]; + } else { + insts[tb_id].warp_insts_array[warp_id].push_back(inst); + } } - sink_process_pid = fork(); - if (sink_process_pid == 0) { - // child process - close(sink_pipe_fd[1]); - dup2(sink_pipe_fd[0], STDIN_FILENO); - signal(SIGINT, SIG_IGN); // ignore SIGINT - execle("/bin/sh", "sh", "-c", trace_sink_cmd.c_str(), NULL, environ); - perror("execle"); // child shouldn't reach here if all is well. 
- exit(1); - } else if (sink_process_pid > 0) { - // parent process - the trace post processor - // stdout is now redirected to the write end of the sink_pipe - close(sink_pipe_fd[0]); - int r = dup2(sink_pipe_fd[1], STDOUT_FILENO); - } else { - cerr << "Failed to fork data sink process\n"; - perror("fork"); - exit(1); + + fwrite(&name_size, sizeof(uint64_t), 1, kernel_out); + fwrite(kernel_name.c_str(), kernel_name.size(), 1, kernel_out); + + fwrite(&header, sizeof(kernel_header), 1, kernel_out); + + for (unsigned tb_id = 0; tb_id < insts.size(); ++tb_id) { + if (insts[tb_id].warp_insts_array.size() > 0) { + // print total warp count in this thread block + unsigned total_warp_count = insts[tb_id].warp_insts_array.size(); + fwrite(&total_warp_count, sizeof(unsigned), 1, kernel_out); + + for (unsigned warp_id = 0; warp_id < insts[tb_id].warp_insts_array.size(); + ++warp_id) { + // print total inst count in this warp + unsigned total_inst_count = + insts[tb_id].warp_insts_array[warp_id].size(); + fwrite(&total_inst_count, sizeof(unsigned), 1, kernel_out); + + for (unsigned inst_id = 0; + inst_id < insts[tb_id].warp_insts_array[warp_id].size(); + ++inst_id) { + inst_trace_t &full_inst = + insts[tb_id].warp_insts_array[warp_id][inst_id]; + + if (!full_inst.base.is_mem) { + sim_inst_trace_t inst = full_inst.base; + inst_type_t type = INST_BASE; + + fwrite(&type, sizeof(inst_type_t), 1, kernel_out); + fwrite(&inst, sizeof(inst), 1, kernel_out); + } else { + std::bitset<32> mask(full_inst.base.active_mask & + full_inst.base.predicate_mask); + bool base_stride_success = false; + uint64_t base_addr = 0; + int stride = 0; + std::vector deltas; + bool base_delta_success = false; + + // try base+stride format + base_stride_success = + base_stride_compress(full_inst.addrs, mask, base_addr, stride); + if (!base_stride_success) { + // if base+stride fails, try base+delta format + base_delta_success = + base_delta_compress(full_inst.addrs, mask, base_addr, deltas); + } + + if 
(base_stride_success) { + sim_inst_trace_stride_t inst; + inst.base = full_inst.base; + inst.base_addr = base_addr; + inst.stride = stride; + inst_type_t type = INST_STRIDE; + + fwrite(&type, sizeof(inst_type_t), 1, kernel_out); + fwrite(&inst, sizeof(inst), 1, kernel_out); + } else if (base_delta_success) { + sim_inst_trace_delta_t inst; + inst.base = full_inst.base; + inst.base_addr = base_addr; + deltas.resize(32, 0ll); + memcpy(inst.delta, deltas.data(), sizeof(inst.delta)); + inst_type_t type = INST_DELTA; + + fwrite(&type, sizeof(inst_type_t), 1, kernel_out); + fwrite(&inst, sizeof(inst), 1, kernel_out); + } else { + // save the addresses as is + sim_inst_trace_flat_t inst; + inst.base = full_inst.base; + memcpy(inst.addrs, full_inst.addrs, sizeof(inst.addrs)); + inst_type_t type = INST_FLAT; + fwrite(&type, sizeof(inst_type_t), 1, kernel_out); + fwrite(&inst, sizeof(inst), 1, kernel_out); + } + } + } + } + } } + fclose(kernel_out); - cerr << "Processing file " << filepath << endl; + /* + // legacy code starts here. Pending to be removed. - vector insts; - unsigned grid_dim_x, grid_dim_y, grid_dim_z, tb_dim_x, tb_dim_y, tb_dim_z; - unsigned tb_id_x, tb_id_y, tb_id_z, tb_id, warpid_tb; - unsigned lineinfo, linenum; - string line; - stringstream ss; - string string1, string2; - bool found_grid_dim = false, found_block_dim = false; - - // Add a flag for LDGSTS instruction to indicate which one to remove - vector> ldgsts_flags; // true to remove, false to not - - // Important... without clear(), cin.eof() may evaluate to true on the second - // kernel + // Important... 
without clear(), cin.eof() may evaluate to true on the + // second kernel cin.clear(); clearerr(stdin); while (!cin.eof()) { @@ -315,14 +428,11 @@ void group_per_block(const char *filepath) { ss.ignore(); ss >> string1 >> string2; if (string1 == "grid" && string2 == "dim") { - sscanf(line.c_str(), "-grid dim = (%d,%d,%d)", &grid_dim_x, &grid_dim_y, - &grid_dim_z); - found_grid_dim = true; - } else if (string1 == "block" && string2 == "dim") { - sscanf(line.c_str(), "-block dim = (%d,%d,%d)", &tb_dim_x, &tb_dim_y, - &tb_dim_z); - found_block_dim = true; - } else if (string1 == "enable" && string2 == "lineinfo") { + sscanf(line.c_str(), "-grid dim = (%d,%d,%d)", &grid_dim_x, + &grid_dim_y, &grid_dim_z); found_grid_dim = true; } else if (string1 == + "block" && string2 == "dim") { sscanf(line.c_str(), "-block dim = + (%d,%d,%d)", &tb_dim_x, &tb_dim_y, &tb_dim_z); found_block_dim = true; } + else if (string1 == "enable" && string2 == "lineinfo") { sscanf(line.c_str(), "-enable lineinfo = %d", &lineinfo); } @@ -351,9 +461,8 @@ void group_per_block(const char *filepath) { ss.str(line); ss >> tb_id_x >> tb_id_y >> tb_id_z >> warpid_tb; tb_id = - tb_id_z * grid_dim_y * grid_dim_x + tb_id_y * grid_dim_x + tb_id_x; - if (!insts[tb_id].initialized) { - insts[tb_id].tb_id_x = tb_id_x; + tb_id_z * grid_dim_y * grid_dim_x + tb_id_y * grid_dim_x + + tb_id_x; if (!insts[tb_id].initialized) { insts[tb_id].tb_id_x = tb_id_x; insts[tb_id].tb_id_y = tb_id_y; insts[tb_id].tb_id_z = tb_id_z; insts[tb_id].initialized = true; @@ -384,7 +493,8 @@ void group_per_block(const char *filepath) { inst_ptr = warp_inst_lut.register_new_entry(rest_of_line); // One actual LDGSTS instruction includes 2 LDGSTS instructions in the - // trace, because it has two memory references. This is trying to remove + // trace, because it has two memory references. This is trying to + remove // the one with the shared memory address. 
if (opcode.find("LDGSTS") != string::npos) { @@ -398,33 +508,37 @@ void group_per_block(const char *filepath) { } } - for (unsigned i = 0; i < insts.size(); ++i) { + for (unsigned tb_id = 0; tb_id < insts.size(); ++tb_id) { // ofs< 0) { - cout << "\n" + if (insts[tb_id].initialized && insts[tb_id].warp_insts_array.size() > + 0) { cout << "\n" << "#BEGIN_TB" << "\n"; cout << "\n" - << "thread block = " << insts[i].tb_id_x << "," << insts[i].tb_id_y - << "," << insts[i].tb_id_z << "\n"; + << "thread block = " << insts[tb_id].tb_id_x << "," + << insts[tb_id].tb_id_y << "," << insts[tb_id].tb_id_z << "\n"; } else { - cerr << "Warning: Thread block " << insts[i].tb_id_x << "," - << insts[i].tb_id_y << "," << insts[i].tb_id_z << " is empty" + cerr << "Warning: Thread block " << insts[tb_id].tb_id_x << "," + << insts[tb_id].tb_id_y << "," << insts[tb_id].tb_id_z << " is + empty" << "\n"; continue; } - for (unsigned j = 0; j < insts[i].warp_insts_array.size(); ++j) { + for (unsigned warp_id = 0; warp_id < + insts[tb_id].warp_insts_array.size(); + ++warp_id) { cout << "\n" - << "warp = " << j << "\n"; - cout << "insts = " << insts[i].warp_insts_array[j].size() << "\n"; - if (insts[i].warp_insts_array[j].size() == 0) { - cerr << "Warning: Warp " << j << " in thread block" << insts[i].tb_id_x - << "," << insts[i].tb_id_y << "," << insts[i].tb_id_z - << " is empty" + << "warp = " << warp_id << "\n"; + cout << "insts = " << insts[tb_id].warp_insts_array[warp_id].size() + << "\n"; + if (insts[tb_id].warp_insts_array[warp_id].size() == 0) { + cerr << "Warning: Warp " << warp_id << " in thread block" + << insts[tb_id].tb_id_x << "," << insts[tb_id].tb_id_y << "," + << insts[tb_id].tb_id_z << " is empty" << "\n"; } - for (auto it = insts[i].warp_insts_array[j].cbegin(); - it != insts[i].warp_insts_array[j].cend(); ++it) { + for (auto it = insts[tb_id].warp_insts_array[warp_id].cbegin(); + it != insts[tb_id].warp_insts_array[warp_id].cend(); ++it) { // dereference once: const 
string* // dereference twice: const string cout << **it << "\n"; @@ -432,17 +546,7 @@ void group_per_block(const char *filepath) { } cout << endl << "#END_TB" << endl; } - - close(source_pipe_fd[0]); - close(source_pipe_fd[1]); - close(sink_pipe_fd[0]); - close(sink_pipe_fd[1]); - - // restore stdin/stdout file descriptor - dup2(preserved_stdin_fileno, STDIN_FILENO); - dup2(preserved_stdout_fileno, STDOUT_FILENO); - close(preserved_stdin_fileno); - close(preserved_stdout_fileno); + */ } void group_per_core(const char *filepath) {