diff --git a/util/tracer_nvbit/.gitignore b/util/tracer_nvbit/.gitignore
index c1adccf67..5545e9e6d 100644
--- a/util/tracer_nvbit/.gitignore
+++ b/util/tracer_nvbit/.gitignore
@@ -5,3 +5,5 @@ tracer_tool/tracer_tool.o
 tracer_tool/tracer_tool.so
 tracer_tool/inject_funcs.o
 tracer_tool/traces-processing/post-traces-processing
+traceDsm
+traceAsm
\ No newline at end of file
diff --git a/util/tracer_nvbit/tracer_tool/Makefile b/util/tracer_nvbit/tracer_tool/Makefile
index 950639750..7096821c9 100644
--- a/util/tracer_nvbit/tracer_tool/Makefile
+++ b/util/tracer_nvbit/tracer_tool/Makefile
@@ -43,16 +43,18 @@ current_dir := $(notdir $(patsubst %/,%,$(dir $(mkfile_path))))
 
 NVBIT_TOOL=$(current_dir).so
 
-all: $(NVBIT_TOOL)
+all: $(NVBIT_TOOL) traceDsm
 
 $(NVBIT_TOOL): $(OBJECTS) $(NVBIT_PATH)/libnvbit.a
 	$(NVCC) -arch=$(ARCH) -O3 $(OBJECTS) $(LIBS) $(NVCC_PATH) -lcuda -lcudart_static -shared -o $@
 
 %.o: %.cu common.h
-	$(NVCC) -dc -c -std=c++11 $(INCLUDES) -Xptxas -cloning=no -Xcompiler -Wall -arch=$(ARCH) -O3 -Xcompiler -fPIC $< -o $@
+	$(NVCC) -dc -c -std=c++11 $(INCLUDES) -Xptxas -cloning=no -Xcompiler -Wall -Xcompiler -Wno-unused-result -arch=$(ARCH) -O3 -Xcompiler -fPIC $< -o $@
 
 inject_funcs.o: inject_funcs.cu common.h
-	$(NVCC) $(INCLUDES) $(MAXRREGCOUNT_FLAG) -Xptxas -astoolspatch --keep-device-functions -arch=$(ARCH) -Xcompiler -Wall -Xcompiler -fPIC -c $< -o $@
+	$(NVCC) $(INCLUDES) $(MAXRREGCOUNT_FLAG) -Xptxas -astoolspatch --keep-device-functions -arch=$(ARCH) -Xcompiler -Wall -Xcompiler -Wno-unused-result -Xcompiler -fPIC -c $< -o $@
 
+traceDsm: traceDsm.cpp common.h # common.h is a rebuild trigger only, not a compiler input
+	$(CXX) -std=c++17 -O3 -g -Wno-unused-result -o $@ $<
 clean:
 	rm -f *.so *.o
diff --git a/util/tracer_nvbit/tracer_tool/common.h b/util/tracer_nvbit/tracer_tool/common.h
index 7eff0feb7..300d7e70f 100644
--- a/util/tracer_nvbit/tracer_tool/common.h
+++ b/util/tracer_nvbit/tracer_tool/common.h
@@ -1,26 +1,51 @@
 /* Author1: Mahmoud Khairy, abdallm@purdue.com - 2019 */
 /* Author2: Jason Shen, shen203@purdue.edu - 2019 */
 
-#include <stdint.h>
+#ifndef COMMON_H
+#define COMMON_H
 
-static __managed__ uint64_t total_dynamic_instr_counter = 0;
-static __managed__ uint64_t reported_dynamic_instr_counter = 0;
-static __managed__ bool stop_report = false;
+#include <assert.h>
+#include <cstdint>
+#include <cstdio>
+#include <stdexcept>
+#include <stdint.h>
+#include <string>
 
 /* information collected in the instrumentation function and passed
  * on the channel from the GPU to the CPU */
 #define MAX_SRC 5
+#define MAX_OPCODE_LENGTH 32
+
+typedef enum : uint8_t { // one-byte tag traceDsm reads ahead of each record
+  INST_BASE = 0, // record is a bare sim_inst_trace_t
+  INST_FLAT,     // record is a sim_inst_trace_flat_t
+  INST_DELTA,    // record is a sim_inst_trace_delta_t
+  INST_STRIDE,   // record is a sim_inst_trace_stride_t
+} inst_type_t;
 
 typedef struct {
-  int cta_id_x;
-  int cta_id_y;
-  int cta_id_z;
-  int warpid_tb;
-  int warpid_sm;
-  int sm_id;
-  int opcode_id;
-  uint64_t addrs[32];
-  uint32_t line_num;
+  unsigned kernel_id;
+  unsigned grid_dim_x;
+  unsigned grid_dim_y;
+  unsigned grid_dim_z;
+  unsigned block_dim_x;
+  unsigned block_dim_y;
+  unsigned block_dim_z;
+  unsigned shared_mem_bytes; // traceDsm prints this field as "-shmem"
+  unsigned shmem; // NOTE(review): not assigned or printed anywhere in this patch
+  unsigned nregs;
+  unsigned binary_version;
+  uint64_t cuda_stream_id;
+  uint64_t shmem_base_addr;
+  uint64_t local_mem_base_addr;
+  char nvbit_version[10];           // version text; must fit with its NUL
+  char accelsim_tracer_version[10]; // version text; must fit with its NUL
+  bool enable_lineinfo;
+} kernel_header; // the tracer fwrite()s this struct raw after the kernel name
+
+typedef struct {
+  char opcode[MAX_OPCODE_LENGTH];
+
   uint32_t vpc;
   bool is_mem;
   int32_t GPRDst;
@@ -30,4 +55,168 @@ typedef struct {
   uint32_t active_mask;
   uint32_t predicate_mask;
   uint64_t imm;
+} sim_inst_trace_t;
+
+typedef struct { // instruction record carrying each lane's address verbatim
+  sim_inst_trace_t base;
+  uint64_t addrs[32]; // indexed by lane; traceDsm prints only masked-in lanes
+} sim_inst_trace_flat_t;
+
+typedef struct { // instruction record with addresses stored relative to a base
+  sim_inst_trace_t base;
+
+  uint64_t base_addr;
+  int32_t delta[32]; // traceDsm rebuilds lane i's address as base_addr + delta[i]
+} sim_inst_trace_delta_t;
+
+typedef struct { // instruction record with addresses as base + constant stride
+  sim_inst_trace_t base;
+  uint64_t base_addr;
+  int32_t stride; // traceDsm rebuilds lane i's address as base_addr + stride * i
+} sim_inst_trace_stride_t;
+
+typedef struct { // per-instruction record filled in by instrument_inst
+  sim_inst_trace_t base;
+
+  int cta_id_x;
+  int cta_id_y;
+  int cta_id_z;
+  int warpid_tb;
+  int warpid_sm;
+  int sm_id;
+  int opcode_id; // key into the tracer's id_to_opcode_map
+  uint32_t line_num;
+  uint64_t addrs[32]; // kept last: the tracer omits it for non-memory records
 } inst_trace_t;
+
+typedef union { // storage large enough for any of the sim_inst_* record kinds
+  sim_inst_trace_t sim_inst_base; // every variant starts with this header
+  sim_inst_trace_delta_t sim_inst_delta;
+  sim_inst_trace_flat_t sim_inst_flat;
+  sim_inst_trace_stride_t sim_inst_stride;
+
+} sim_inst_u;
+
+// unsigned get_inst_size(inst_type_t type) {
+//   switch (type) {
+//   case INST_BASE:
+//     return sizeof(sim_inst_trace_t);
+//   case INST_FLAT:
+//     return sizeof(sim_inst_trace_flat_t);
+//   case INST_DELTA:
+//     return sizeof(sim_inst_trace_delta_t);
+//   case INST_STRIDE:
+//     return sizeof(sim_inst_trace_stride_t);
+//   default:
+//     assert(0);
+//     exit(1);
+//   }
+// }
+
+// True when `fullString` terminates with `ending` (an empty `ending` matches).
+inline bool hasEnding(const std::string &fullString,
+                      const std::string &ending) {
+  const size_t tail = ending.size();
+  if (fullString.size() < tail)
+    return false; // a suffix can never be longer than the whole string
+  return fullString.compare(fullString.size() - tail, tail, ending) == 0;
+}
+
+/**
+ * Opens a trace file for reading, decompressing `.xz` inputs through `xz -dc`
+ * @param filepath Path to a `.xz`, `.trace` or `.traceg` file
+ * @return Non-null FILE pointer; for `.xz` it is a popen() pipe (use pclose())
+ * @throws std::runtime_error if file type is unsupported or opening fails
+ */
+inline FILE *openFileForReading(const std::string &filepath) {
+  FILE *file = nullptr;
+  if (hasEnding(filepath, ".xz")) {
+    // popen() runs the command through the shell, so single-quote the path
+    // (embedded quotes become '\'') to keep spaces/metacharacters literal
+    std::string command = "xz -dc -- '";
+    for (char c : filepath)
+      command += (c == '\'') ? std::string("'\\''") : std::string(1, c);
+    file = popen((command + "'").c_str(), "r");
+  } else if (hasEnding(filepath, ".trace") || hasEnding(filepath, ".traceg")) {
+    // Use fopen for regular trace files
+    file = fopen(filepath.c_str(), "rb");
+  } else {
+    throw std::runtime_error("Unsupported file type: " + filepath);
+  }
+  if (!file) {
+    throw std::runtime_error("Failed to open file for reading: " + filepath);
+  }
+  return file;
+}
+
+/**
+ * Opens a file for writing, optionally piping the data through xz
+ * @param filepath Output path; ".xz" is appended when compression is enabled
+ * @param use_xz_compression Whether to compress with xz
+ * @return Non-null FILE pointer; with xz it is a popen() pipe (use pclose())
+ * @throws std::runtime_error if opening fails
+ */
+inline FILE *openFileForWriting(const std::string &filepath,
+                                bool use_xz_compression = false) {
+  if (!use_xz_compression) {
+    FILE *file = fopen(filepath.c_str(), "wb");
+    if (!file)
+      throw std::runtime_error("Failed to open file for writing: " + filepath);
+    return file;
+  }
+
+  // popen() runs the command through the shell, so single-quote the target
+  // (embedded quotes become '\'') to keep spaces/metacharacters literal
+  std::string command = "xz -1 -T0 > '";
+  for (char c : filepath + ".xz")
+    command += (c == '\'') ? std::string("'\\''") : std::string(1, c);
+  FILE *file = popen((command + "'").c_str(), "w");
+  if (!file)
+    throw std::runtime_error("Failed to open xz compression pipe for: " +
+                             filepath + ".xz");
+  return file;
+}
+
+/**
+ * Opens a file for writing with xz compression using a command buffer
+ * @param base_filepath Base filepath (".xz" is appended; inserted unquoted)
+ * @param cmd_buffer Caller-owned buffer that receives the xz shell command
+ * @param buffer_size Size of the command buffer in bytes
+ * @return Non-null FILE pointer from popen(); close it with pclose()
+ * @throws std::runtime_error if buffer is too small or opening fails
+ */
+inline FILE *openFileForWritingXz(const std::string &base_filepath,
+                                  char *cmd_buffer, size_t buffer_size) {
+  const int written = snprintf(cmd_buffer, buffer_size, "xz -1 -T0 > %s.xz",
+                               base_filepath.c_str());
+  // Negative means snprintf failed; >= buffer_size means it was truncated
+  if (written < 0 || static_cast<size_t>(written) >= buffer_size) {
+    throw std::runtime_error("Command buffer too small for xz compression");
+  }
+  FILE *file = popen(cmd_buffer, "w");
+  if (!file) {
+    throw std::runtime_error(
+        "Failed to open xz compression pipe for: " + base_filepath + ".xz");
+  }
+  return file;
+}
+
+/**
+ * Generates output filepath based on input filepath and desired extension
+ * @param input_filepath Input file path; a trailing ".xz" is stripped first
+ * @param new_extension New extension to append (without dot)
+ * @return Output filepath of the form "<stem>.<new_extension>"
+ */
+inline std::string generateOutputFilepath(const std::string &input_filepath,
+                                          const std::string &new_extension) {
+  static const std::string kXz = ".xz";
+  std::string stem = input_filepath;
+  // Drop a trailing ".xz" so compressed and plain inputs share one stem
+  if (stem.size() >= kXz.size() &&
+      stem.compare(stem.size() - kXz.size(), kXz.size(), kXz) == 0)
+    stem.erase(stem.size() - kXz.size());
+  // new_extension carries no dot, so the separator is added in both cases
+  return stem + "." + new_extension;
+}
+
+#endif
\ No newline at end of file
diff --git a/util/tracer_nvbit/tracer_tool/inject_funcs.cu b/util/tracer_nvbit/tracer_tool/inject_funcs.cu
index dfc3f8769..b697acaef 100644
--- a/util/tracer_nvbit/tracer_tool/inject_funcs.cu
+++ b/util/tracer_nvbit/tracer_tool/inject_funcs.cu
@@ -38,17 +38,17 @@ instrument_inst(int pred, int opcode_id, int32_t vpc, bool is_mem,
     }
   }
 
-  inst_trace_t ma;
+  inst_trace_t ma = {0};
 
   if (is_mem) {
     /* collect memory address information */
     for (int i = 0; i < 32; i++) {
       ma.addrs[i] = __shfl_sync(active_mask, addr, i);
     }
-    ma.width = width;
-    ma.is_mem = true;
+    ma.base.width = width;
+    ma.base.is_mem = true;
   } else {
-    ma.is_mem = false;
+    ma.base.is_mem = false;
   }
 
   int4 cta = get_ctaid();
@@ -62,17 +62,17 @@ instrument_inst(int pred, int opcode_id, int32_t vpc, bool is_mem,
   ma.cta_id_z = cta.z;
   ma.warpid_sm = get_warpid();
   ma.opcode_id = opcode_id;
-  ma.vpc = vpc;
-  ma.GPRDst = desReg;
-  ma.GPRSrcs[0] = srcReg1;
-  ma.GPRSrcs[1] = srcReg2;
-  ma.GPRSrcs[2] = srcReg3;
-  ma.GPRSrcs[3] = srcReg4;
-  ma.GPRSrcs[4] = srcReg5;
-  ma.numSrcs = srcNum;
-  ma.imm = immediate;
-  ma.active_mask = active_mask;
-  ma.predicate_mask = predicate_mask;
+  ma.base.vpc = vpc;
+  ma.base.GPRDst = desReg;
+  ma.base.GPRSrcs[0] = srcReg1;
+  ma.base.GPRSrcs[1] = srcReg2;
+  ma.base.GPRSrcs[2] = srcReg3;
+  ma.base.GPRSrcs[3] = srcReg4;
+  ma.base.GPRSrcs[4] = srcReg5;
+  ma.base.numSrcs = srcNum;
+  ma.base.imm = immediate;
+  ma.base.active_mask = active_mask;
+  ma.base.predicate_mask = predicate_mask;
   ma.sm_id = get_smid();
 
   /* first active lane pushes information on the channel */
diff --git a/util/tracer_nvbit/tracer_tool/traceDsm.cpp b/util/tracer_nvbit/tracer_tool/traceDsm.cpp
new file mode 100644
index 000000000..7e98a8f21
--- /dev/null
+++ b/util/tracer_nvbit/tracer_tool/traceDsm.cpp
@@ -0,0 +1,225 @@
+#include "common.h"
+#include <bitset>
+#include <fstream>
+#include <iostream>
+#include <string>
+
+// Largest kernel-name length accepted from the trace. A bigger value means the
+// input is corrupt (or not a trace), so it is rejected instead of allocated.
+static const uint64_t kMaxKernelNameBytes = 1u << 20;
+
+// Reads exactly `size` bytes into `dst`. Returns false on EOF or read error.
+static bool readExact(void *dst, size_t size, FILE *in) {
+  return size == 0 || fread(dst, size, 1, in) == 1;
+}
+
+// Releases the input with the call that matches how openFileForReading()
+// opened it: ".xz" inputs are popen() pipes and need pclose(), not fclose().
+static void closeInput(FILE *in, bool is_pipe) {
+  if (is_pipe) {
+    pclose(in);
+  } else {
+    fclose(in);
+  }
+}
+
+// Writes the kernel header as "-key = value" lines.
+static void printKernelHeader(FILE *out, const std::string &kernel_name,
+                              const kernel_header &header) {
+  fprintf(out, "-kernel name = %s\n", kernel_name.c_str());
+  fprintf(out, "-kernel id = %u\n", header.kernel_id);
+  fprintf(out, "-grid dim = (%u,%u,%u)\n", header.grid_dim_x,
+          header.grid_dim_y, header.grid_dim_z);
+  fprintf(out, "-block dim = (%u,%u,%u)\n", header.block_dim_x,
+          header.block_dim_y, header.block_dim_z);
+  fprintf(out, "-shmem = %u\n", header.shared_mem_bytes);
+  fprintf(out, "-nregs = %u\n", header.nregs);
+  fprintf(out, "-binary version = %u\n", header.binary_version);
+  fprintf(out, "-cuda stream id = %llu\n",
+          (unsigned long long)header.cuda_stream_id);
+  fprintf(out, "-shmem base_addr = 0x%016llx\n",
+          (unsigned long long)header.shmem_base_addr);
+  fprintf(out, "-local mem base_addr = 0x%016llx\n",
+          (unsigned long long)header.local_mem_base_addr);
+  // The version fields are fixed-size arrays taken from the file, so cap the
+  // print width rather than trusting them to be NUL-terminated.
+  fprintf(out, "-nvbit version = %.*s\n", (int)sizeof(header.nvbit_version),
+          header.nvbit_version);
+  fprintf(out, "-accelsim tracer version = %.*s\n",
+          (int)sizeof(header.accelsim_tracer_version),
+          header.accelsim_tracer_version);
+  fprintf(out, "-enable lineinfo = %d\n", header.enable_lineinfo);
+}
+
+// Reads one tagged instruction record: a one-byte inst_type_t followed by the
+// struct that tag selects. Returns false on a short read or an unknown tag.
+static bool readInstruction(FILE *in, inst_type_t &type, sim_inst_u &inst) {
+  if (!readExact(&type, sizeof(type), in)) {
+    return false;
+  }
+  switch (type) {
+  case INST_BASE:
+    return readExact(&inst.sim_inst_base, sizeof(sim_inst_trace_t), in);
+  case INST_FLAT:
+    return readExact(&inst.sim_inst_flat, sizeof(sim_inst_trace_flat_t), in);
+  case INST_DELTA:
+    return readExact(&inst.sim_inst_delta, sizeof(sim_inst_trace_delta_t), in);
+  case INST_STRIDE:
+    return readExact(&inst.sim_inst_stride, sizeof(sim_inst_trace_stride_t),
+                     in);
+  default:
+    fprintf(stderr, "Error: unknown instruction type %u\n", (unsigned)type);
+    return false;
+  }
+}
+
+// Writes one instruction as a single text line:
+// PC mask dest_num [dest] opcode src_num [srcs] width [addr info] imm
+static void printInstruction(FILE *out, inst_type_t type,
+                             const sim_inst_u &full_inst) {
+  const sim_inst_trace_t &inst = full_inst.sim_inst_base;
+  const uint32_t exec_mask = inst.active_mask & inst.predicate_mask;
+
+  fprintf(out, "%04x ", inst.vpc);
+  fprintf(out, "%08x ", exec_mask);
+
+  if (inst.GPRDst >= 0) {
+    fprintf(out, "1 R%d ", inst.GPRDst);
+  } else {
+    fprintf(out, "0 ");
+  }
+
+  // opcode is a fixed-size array from the file; cap the width when printing
+  fprintf(out, "%.*s ", MAX_OPCODE_LENGTH, inst.opcode);
+  fprintf(out, "%d ", inst.numSrcs);
+  for (int i = 0; i < MAX_SRC; i++) {
+    if (inst.GPRSrcs[i] >= 0) {
+      fprintf(out, "R%d ", inst.GPRSrcs[i]);
+    }
+  }
+  fprintf(out, "%d ", inst.width);
+
+  if (inst.is_mem) {
+    const std::bitset<32> mask(exec_mask);
+    if (type == INST_FLAT) {
+      // tag 0: every active lane's address is listed as stored
+      fprintf(out, "0 ");
+      for (int i = 0; i < 32; i++) {
+        if (mask[i]) {
+          fprintf(out, "0x%llx ",
+                  (unsigned long long)full_inst.sim_inst_flat.addrs[i]);
+        }
+      }
+    } else if (type == INST_DELTA) {
+      // tag 2: base address, then base + per-lane delta for active lanes
+      const sim_inst_trace_delta_t &d = full_inst.sim_inst_delta;
+      fprintf(out, "2 ");
+      fprintf(out, "0x%llx ", (unsigned long long)d.base_addr);
+      for (int i = 0; i < 32; i++) {
+        if (mask[i]) {
+          fprintf(out, "%llx ",
+                  (unsigned long long)(d.base_addr + (int64_t)d.delta[i]));
+        }
+      }
+    } else if (type == INST_STRIDE) {
+      // tag 1: base address, then base + lane * stride. The product is
+      // formed in 64 bits so a large stride cannot overflow int.
+      const sim_inst_trace_stride_t &s = full_inst.sim_inst_stride;
+      fprintf(out, "1 ");
+      fprintf(out, "0x%llx ", (unsigned long long)s.base_addr);
+      for (int i = 0; i < 32; i++) {
+        if (mask[i]) {
+          fprintf(out, "%llx ",
+                  (unsigned long long)(s.base_addr + (int64_t)s.stride * i));
+        }
+      }
+    }
+  }
+
+  // imm is stored unsigned but printed as a signed value
+  fprintf(out, "%lld \n", (long long)inst.imm);
+}
+
+// Converts the binary trace on `in` to text on `out`.
+// Returns 0 on success and 1 when the input is truncated or malformed.
+static int disassembleTrace(FILE *in, FILE *out) {
+  // Layout: u64 name length, name bytes, kernel_header, then thread blocks.
+  uint64_t name_size = 0;
+  if (!readExact(&name_size, sizeof(name_size), in) ||
+      name_size > kMaxKernelNameBytes) {
+    fprintf(stderr, "Error: missing or implausible kernel name length\n");
+    return 1;
+  }
+  std::string kernel_name(name_size, '\0');
+  kernel_header header;
+  if (!readExact(&kernel_name[0], name_size, in) ||
+      !readExact(&header, sizeof(header), in)) {
+    fprintf(stderr, "Error: truncated kernel header\n");
+    return 1;
+  }
+  printKernelHeader(out, kernel_name, header);
+
+  // Each thread block starts with its warp count; a clean EOF here ends the
+  // trace. Inside a block every count and record must be present.
+  unsigned tot_warp_in_id = 0;
+  unsigned tb_count = 0;
+  while (fread(&tot_warp_in_id, sizeof(unsigned), 1, in) == 1) {
+    fprintf(out, "#BEGIN_TB\n");
+    fprintf(out, "thread block = %u,%d,%d\n", tb_count, 0, 0);
+
+    for (unsigned warp = 0; warp < tot_warp_in_id; warp++) {
+      unsigned num_insts = 0;
+      if (!readExact(&num_insts, sizeof(num_insts), in)) {
+        fprintf(stderr, "Error: truncated trace (warp header)\n");
+        return 1;
+      }
+      fprintf(out, "warp = %u\n", warp);
+      fprintf(out, "insts = %u\n", num_insts);
+
+      for (unsigned n = 0; n < num_insts; n++) {
+        sim_inst_u full_inst;
+        inst_type_t inst_type;
+        if (!readInstruction(in, inst_type, full_inst)) {
+          fprintf(stderr, "Error: truncated or corrupt instruction record\n");
+          return 1;
+        }
+        printInstruction(out, inst_type, full_inst);
+      }
+    }
+    fprintf(out, "#END_TB\n");
+    tb_count++;
+  }
+  return 0;
+}
+
+int main(int argc, char *argv[]) {
+  if (argc != 2) {
+    printf("Usage: %s <trace_file>\n", argv[0]);
+    return 1;
+  }
+
+  const std::string filepath = argv[1];
+  const bool input_is_pipe = hasEnding(filepath, ".xz");
+  FILE *file = nullptr;
+  FILE *output_file = nullptr;
+
+  try {
+    file = openFileForReading(filepath);
+    output_file = openFileForWriting(filepath + ".txt", false);
+  } catch (const std::runtime_error &e) {
+    fprintf(stderr, "Error: %s\n", e.what());
+    if (file) {
+      closeInput(file, input_is_pipe); // output failed after input opened
+    }
+    return 1;
+  }
+
+  int status = disassembleTrace(file, output_file);
+  closeInput(file, input_is_pipe);
+  // fclose flushes buffered text, so a failure here means lost output
+  if (fclose(output_file) != 0) {
+    fprintf(stderr, "Error: failed to write %s.txt\n", filepath.c_str());
+    status = 1;
+  }
+  return status;
+}
\ No newline at end of file
diff --git a/util/tracer_nvbit/tracer_tool/tracer_tool.cu b/util/tracer_nvbit/tracer_tool/tracer_tool.cu
index f4649c5ee..73995eabf 100644
--- a/util/tracer_nvbit/tracer_tool/tracer_tool.cu
+++ b/util/tracer_nvbit/tracer_tool/tracer_tool.cu
@@ -2,6 +2,8 @@
 /* Author2: Jason Shen, shen203@purdue.edu - 2019 */
 
 #include <assert.h>
+#include <climits>
+#include <cstring>
 #include <inttypes.h>
 #include <stdint.h>
 #include <stdio.h>
@@ -33,6 +35,10 @@
 
 #define TRACER_VERSION "5"
 
+static __managed__ uint64_t total_dynamic_instr_counter = 0;
+static __managed__ uint64_t reported_dynamic_instr_counter = 0;
+static __managed__ bool stop_report = false;
+
 /* Channel used to communicate from GPU to CPU receiving thread */
 #define CHANNEL_SIZE (1l << 20)
 static __managed__ ChannelDev channel_dev;
@@ -83,6 +89,9 @@ std::unordered_map<CUcontext, FILE *> ctx_resultsFile;
 
 std::string kernel_ranges = "";
 
+std::unordered_map<CUcontext, std::vector<std::vector<unsigned>>>
+    tb_warp_inst_ct;
+
 struct KernelRange {
   uint64_t start;
   uint64_t end; // UINT64_MAX means open-ended
@@ -502,45 +511,57 @@ static void enter_kernel_launch(CUcontext ctx, CUfunction func,
           ctx_kernelid[ctx], ctx);
 
   if (!stop_report) {
-    if (!xz_compress_trace) {
-      ctx_resultsFile[ctx] = fopen(buffer, "w");
-      printf("Writing results to %s\n", buffer);
-    } else {
-      char cmd_buffer[1039];
-      sprintf(cmd_buffer, "xz -1 -T0 > %s.xz", buffer);
-      ctx_resultsFile[ctx] = popen(cmd_buffer, "w");
-      printf("Writing results to %s.xz\n", buffer);
+    try {
+      if (!xz_compress_trace) {
+        ctx_resultsFile[ctx] = openFileForWriting(buffer, false);
+        printf("Writing results to %s\n", buffer);
+      } else {
+        char cmd_buffer[1039];
+        ctx_resultsFile[ctx] =
+            openFileForWritingXz(buffer, cmd_buffer, sizeof(cmd_buffer));
+        printf("Writing results to %s.xz\n", buffer);
+      }
+    } catch (const std::runtime_error &e) {
+      perror("Failed to open results file");
+      fprintf(stderr, "Error: %s\n", e.what());
+      exit(1);
     }
 
-    // Writing header information
-    fprintf(ctx_resultsFile[ctx], "-kernel name = %s\n",
-            nvbit_get_func_name(ctx, func, true));
-    fprintf(ctx_resultsFile[ctx], "-kernel id = %d\n", ctx_kernelid[ctx]);
-    fprintf(ctx_resultsFile[ctx], "-grid dim = (%d,%d,%d)\n", gridDimX,
-            gridDimY, gridDimZ);
-    fprintf(ctx_resultsFile[ctx], "-block dim = (%d,%d,%d)\n", blockDimX,
-            blockDimY, blockDimZ);
-    fprintf(ctx_resultsFile[ctx], "-shmem = %d\n",
-            shmem_static_nbytes + sharedMemBytes);
-    fprintf(ctx_resultsFile[ctx], "-nregs = %d\n", nregs);
-    fprintf(ctx_resultsFile[ctx], "-binary version = %d\n", binary_version);
-    fprintf(ctx_resultsFile[ctx], "-cuda stream id = %lu\n", (uint64_t)hStream);
-    fprintf(ctx_resultsFile[ctx], "-shmem base_addr = 0x%016lx\n",
-            (uint64_t)nvbit_get_shmem_base_addr(ctx));
-    fprintf(ctx_resultsFile[ctx], "-local mem base_addr = 0x%016lx\n",
-            (uint64_t)nvbit_get_local_mem_base_addr(ctx));
-    fprintf(ctx_resultsFile[ctx], "-nvbit version = %s\n", NVBIT_VERSION);
-    fprintf(ctx_resultsFile[ctx], "-accelsim tracer version = %s\n",
-            TRACER_VERSION);
-    fprintf(ctx_resultsFile[ctx], "-enable lineinfo = %d\n", lineinfo);
-    fprintf(ctx_resultsFile[ctx], "\n");
-
-    fprintf(ctx_resultsFile[ctx],
-            "#traces format = [line_num] PC mask dest_num [reg_dests] "
-            "opcode src_num "
-            "[reg_srcs] mem_width [adrrescompress?] [mem_addresses] "
-            "immediate\n");
-    fprintf(ctx_resultsFile[ctx], "\n");
+    // Zero-initialize: the struct is written out raw below, so fields this
+    // code never assigns (e.g. shmem) and padding must not be indeterminate.
+    kernel_header header = {};
+    header.kernel_id = ctx_kernelid[ctx];
+    header.grid_dim_x = gridDimX;
+    header.grid_dim_y = gridDimY;
+    header.grid_dim_z = gridDimZ;
+    header.block_dim_x = blockDimX;
+    header.block_dim_y = blockDimY;
+    header.block_dim_z = blockDimZ;
+    header.shared_mem_bytes = shmem_static_nbytes + sharedMemBytes;
+    header.nregs = nregs;
+    header.binary_version = binary_version;
+    header.cuda_stream_id = (uint64_t)hStream;
+    header.shmem_base_addr = (uint64_t)nvbit_get_shmem_base_addr(ctx);
+    header.local_mem_base_addr = (uint64_t)nvbit_get_local_mem_base_addr(ctx);
+    // snprintf truncates instead of overrunning the fixed-size version fields
+    snprintf(header.nvbit_version, sizeof(header.nvbit_version), "%s",
+             NVBIT_VERSION);
+    snprintf(header.accelsim_tracer_version,
+             sizeof(header.accelsim_tracer_version), "%s", TRACER_VERSION);
+    header.enable_lineinfo = lineinfo;
+
+    // Trace prologue: u64 name length, the name bytes, then the raw header
+    const std::string kernel_name = nvbit_get_func_name(ctx, func, true);
+    const uint64_t name_size = kernel_name.size();
+    fwrite(&name_size, sizeof(uint64_t), 1, ctx_resultsFile[ctx]);
+    fwrite(kernel_name.c_str(), kernel_name.size(), 1, ctx_resultsFile[ctx]);
+    printf("Kernel name: %s\n", kernel_name.c_str());
+    fwrite(&header, sizeof(kernel_header), 1, ctx_resultsFile[ctx]);
+    // Per-warp instruction counters for every thread block (32 threads/warp)
+    unsigned tot_tb = header.grid_dim_x * header.grid_dim_y * header.grid_dim_z;
+    unsigned warps_per_tb =
+        (header.block_dim_x * header.block_dim_y * header.block_dim_z + 31) / 32;
+    tb_warp_inst_ct[ctx].assign(tot_tb, std::vector<unsigned>(warps_per_tb, 0));
   }
 
   kernelsFile = fopen(ctx_kernelslist[ctx].c_str(), "a");
@@ -833,61 +854,6 @@ bool check_opcode_contain(const std::vector<std::string> &opcode,
   return false;
 }
 
-bool base_stride_compress(const uint64_t *addrs, const std::bitset<32> &mask,
-                          uint64_t &base_addr, int &stride) {
-  // calulcate the difference between addresses
-  // write cosnsctive addresses with constant stride in a more
-  // compressed way (i.e. start adress and stride)
-  bool const_stride = true;
-  bool first_bit1_found = false;
-  bool last_bit1_found = false;
-
-  for (int s = 0; s < 32; s++) {
-    if (mask.test(s) && !first_bit1_found) {
-      first_bit1_found = true;
-      base_addr = addrs[s];
-      if (s < 31 && mask.test(s + 1))
-        stride = addrs[s + 1] - addrs[s];
-      else {
-        const_stride = false;
-        break;
-      }
-    } else if (first_bit1_found && !last_bit1_found) {
-      if (mask.test(s)) {
-        if (stride != addrs[s] - addrs[s - 1]) {
-          const_stride = false;
-          break;
-        }
-      } else
-        last_bit1_found = true;
-    } else if (last_bit1_found) {
-      if (mask.test(s)) {
-        const_stride = false;
-        break;
-      }
-    }
-  }
-
-  return const_stride;
-}
-
-void base_delta_compress(const uint64_t *addrs, const std::bitset<32> &mask,
-                         uint64_t &base_addr, std::vector<long long> &deltas) {
-  // save the delta from the previous address
-  bool first_bit1_found = false;
-  uint64_t last_address = 0;
-  for (int s = 0; s < 32; s++) {
-    if (mask.test(s) && !first_bit1_found) {
-      base_addr = addrs[s];
-      first_bit1_found = true;
-      last_address = addrs[s];
-    } else if (mask.test(s) && first_bit1_found) {
-      deltas.push_back(addrs[s] - last_address);
-      last_address = addrs[s];
-    }
-  }
-}
-
 void *recv_thread_fun(void *args) {
   CUcontext ctx = (CUcontext)args;
   char *recv_buffer = (char *)malloc(CHANNEL_SIZE);
@@ -898,6 +864,9 @@ void *recv_thread_fun(void *args) {
       uint32_t num_processed_bytes = 0;
       while (num_processed_bytes < num_recv_bytes) {
         inst_trace_t *ma = (inst_trace_t *)&recv_buffer[num_processed_bytes];
+        const std::string &opcode = id_to_opcode_map[ma->opcode_id];
+        assert(opcode.size() < MAX_OPCODE_LENGTH); // the NUL needs a slot too
+        snprintf(ma->base.opcode, MAX_OPCODE_LENGTH, "%s", opcode.c_str());
 
         /* when we get this cta_id_x it means the kernel has completed
          */
@@ -906,94 +875,110 @@ void *recv_thread_fun(void *args) {
           break;
         }
 
-        fprintf(ctx_resultsFile[ctx], "%d ", ma->cta_id_x);
-        fprintf(ctx_resultsFile[ctx], "%d ", ma->cta_id_y);
-        fprintf(ctx_resultsFile[ctx], "%d ", ma->cta_id_z);
-        fprintf(ctx_resultsFile[ctx], "%d ", ma->warpid_tb);
-        if (print_core_id) {
-          fprintf(ctx_resultsFile[ctx], "%d ", ma->sm_id);
-          fprintf(ctx_resultsFile[ctx], "%d ", ma->warpid_sm);
-        }
-        if (lineinfo) {
-          fprintf(ctx_resultsFile[ctx], "%d ", ma->line_num);
-        }
-        fprintf(ctx_resultsFile[ctx], "%04x ", ma->vpc); // Print the virtual PC
-        fprintf(ctx_resultsFile[ctx], "%08x ",
-                ma->active_mask & ma->predicate_mask);
-        if (ma->GPRDst >= 0) {
-          fprintf(ctx_resultsFile[ctx], "1 ");
-          fprintf(ctx_resultsFile[ctx], "R%d ", ma->GPRDst);
-        } else
-          fprintf(ctx_resultsFile[ctx], "0 ");
-
-        // Print the opcode.
-        fprintf(ctx_resultsFile[ctx], "%s ",
-                id_to_opcode_map[ma->opcode_id].c_str());
-        unsigned src_count = 0;
-        for (int s = 0; s < MAX_SRC; s++) // GPR srcs count.
-          if (ma->GPRSrcs[s] >= 0)
-            src_count++;
-        fprintf(ctx_resultsFile[ctx], "%d ", src_count);
-
-        for (int s = 0; s < MAX_SRC; s++) // GPR srcs.
-          if (ma->GPRSrcs[s] >= 0)
-            fprintf(ctx_resultsFile[ctx], "R%d ", ma->GPRSrcs[s]);
-
-        // print addresses
-        std::bitset<32> mask(ma->active_mask & ma->predicate_mask);
-        if (ma->is_mem) {
-          std::istringstream iss(id_to_opcode_map[ma->opcode_id]);
-          std::vector<std::string> tokens;
-          std::string token;
-          while (std::getline(iss, token, '.')) {
-            if (!token.empty())
-              tokens.push_back(token);
-          }
-          fprintf(ctx_resultsFile[ctx], "%d ",
-                  get_datawidth_from_opcode(tokens));
-
-          bool base_stride_success = false;
-          uint64_t base_addr = 0;
-          int stride = 0;
-          std::vector<long long> deltas;
-
-          if (enable_compress) {
-            // try base+stride format
-            base_stride_success =
-                base_stride_compress(ma->addrs, mask, base_addr, stride);
-            if (!base_stride_success) {
-              // if base+stride fails, try base+delta format
-              base_delta_compress(ma->addrs, mask, base_addr, deltas);
-            }
-          }
-
-          if (base_stride_success && enable_compress) {
-            // base + stride format
-            fprintf(ctx_resultsFile[ctx], "%u 0x%llx %d ",
-                    address_format::base_stride, base_addr, stride);
-          } else if (!base_stride_success && enable_compress) {
-            // base + delta format
-            fprintf(ctx_resultsFile[ctx], "%u 0x%llx ",
-                    address_format::base_delta, base_addr);
-            for (int s = 0; s < deltas.size(); s++) {
-              fprintf(ctx_resultsFile[ctx], "%lld ", deltas[s]);
-            }
-          } else {
-            // list all the addresses
-            fprintf(ctx_resultsFile[ctx], "%u ", address_format::list_all);
-            for (int s = 0; s < 32; s++) {
-              if (mask.test(s))
-                fprintf(ctx_resultsFile[ctx], "0x%016lx ", ma->addrs[s]);
-            }
-          }
-        } else {
-          fprintf(ctx_resultsFile[ctx], "0 ");
+        // Write the inst_trace_t structure as binary data to the file
+        unsigned size = sizeof(inst_trace_t);
+        if (!ma->base.is_mem) {
+          // write only the part without addrs
+          size = offsetof(inst_trace_t, addrs);
         }
-
-        // Print the immediate
-        fprintf(ctx_resultsFile[ctx], "%d ", ma->imm);
-
-        fprintf(ctx_resultsFile[ctx], "\n");
+        fwrite(&size, sizeof(unsigned), 1, ctx_resultsFile[ctx]);
+        fwrite(ma, size, 1, ctx_resultsFile[ctx]);
+        // unsigned tb_id_x = ma->cta_id_x;
+        // unsigned tb_id_y = ma->cta_id_y;
+        // unsigned tb_id_z = ma->cta_id_z;
+        // unsigned tb_id = tb_id_z * header.grid_dim_y * header.grid_dim_x +
+        //                  tb_id_y * header.grid_dim_x + tb_id_x;
+        // unsigned warp_id = ma->warpid_tb;
+        // tb_warp_inst_ct[ctx][tb_id][warp_id]++;
+
+        // fprintf(ctx_resultsFile[ctx], "%d ", ma->cta_id_x);
+        // fprintf(ctx_resultsFile[ctx], "%d ", ma->cta_id_y);
+        // fprintf(ctx_resultsFile[ctx], "%d ", ma->cta_id_z);
+        // fprintf(ctx_resultsFile[ctx], "%d ", ma->warpid_tb);
+        // if (print_core_id) {
+        //   fprintf(ctx_resultsFile[ctx], "%d ", ma->sm_id);
+        //   fprintf(ctx_resultsFile[ctx], "%d ", ma->warpid_sm);
+        // }
+        // if (lineinfo) {
+        //   fprintf(ctx_resultsFile[ctx], "%d ", ma->line_num);
+        // }
+        // fprintf(ctx_resultsFile[ctx], "%04x ", ma->vpc); // Print the
+        // virtual PC fprintf(ctx_resultsFile[ctx], "%08x ",
+        //         ma->active_mask & ma->predicate_mask);
+        // if (ma->GPRDst >= 0) {
+        //   fprintf(ctx_resultsFile[ctx], "1 ");
+        //   fprintf(ctx_resultsFile[ctx], "R%d ", ma->GPRDst);
+        // } else
+        //   fprintf(ctx_resultsFile[ctx], "0 ");
+
+        // // Print the opcode.
+        // fprintf(ctx_resultsFile[ctx], "%s ",
+        //         id_to_opcode_map[ma->opcode_id].c_str());
+        // unsigned src_count = 0;
+        // for (int s = 0; s < MAX_SRC; s++) // GPR srcs count.
+        //   if (ma->GPRSrcs[s] >= 0)
+        //     src_count++;
+        // fprintf(ctx_resultsFile[ctx], "%d ", src_count);
+
+        // for (int s = 0; s < MAX_SRC; s++) // GPR srcs.
+        //   if (ma->GPRSrcs[s] >= 0)
+        //     fprintf(ctx_resultsFile[ctx], "R%d ", ma->GPRSrcs[s]);
+
+        // // print addresses
+        // std::bitset<32> mask(ma->active_mask & ma->predicate_mask);
+        // if (ma->is_mem) {
+        //   std::istringstream iss(id_to_opcode_map[ma->opcode_id]);
+        //   std::vector<std::string> tokens;
+        //   std::string token;
+        //   while (std::getline(iss, token, '.')) {
+        //     if (!token.empty())
+        //       tokens.push_back(token);
+        //   }
+        //   fprintf(ctx_resultsFile[ctx], "%d ",
+        //           get_datawidth_from_opcode(tokens));
+
+        //   bool base_stride_success = false;
+        //   uint64_t base_addr = 0;
+        //   int stride = 0;
+        //   std::vector<long long> deltas;
+
+        //   if (enable_compress) {
+        //     // try base+stride format
+        //     base_stride_success =
+        //         base_stride_compress(ma->addrs, mask, base_addr, stride);
+        //     if (!base_stride_success) {
+        //       // if base+stride fails, try base+delta format
+        //       base_delta_compress(ma->addrs, mask, base_addr, deltas);
+        //     }
+        //   }
+
+        //   if (base_stride_success && enable_compress) {
+        //     // base + stride format
+        //     fprintf(ctx_resultsFile[ctx], "%u 0x%llx %d ",
+        //             address_format::base_stride, base_addr, stride);
+        //   } else if (!base_stride_success && enable_compress) {
+        //     // base + delta format
+        //     fprintf(ctx_resultsFile[ctx], "%u 0x%llx ",
+        //             address_format::base_delta, base_addr);
+        //     for (int s = 0; s < deltas.size(); s++) {
+        //       fprintf(ctx_resultsFile[ctx], "%lld ", deltas[s]);
+        //     }
+        //   } else {
+        //     // list all the addresses
+        //     fprintf(ctx_resultsFile[ctx], "%u ",
+        //     address_format::list_all); for (int s = 0; s < 32; s++) {
+        //       if (mask.test(s))
+        //         fprintf(ctx_resultsFile[ctx], "0x%016lx ", ma->addrs[s]);
+        //     }
+        //   }
+        // } else {
+        //   fprintf(ctx_resultsFile[ctx], "0 ");
+        // }
+
+        // // Print the immediate
+        // fprintf(ctx_resultsFile[ctx], "%d ", ma->imm);
+
+        // fprintf(ctx_resultsFile[ctx], "\n");
 
         num_processed_bytes += sizeof(inst_trace_t);
       }
diff --git a/util/tracer_nvbit/tracer_tool/traces-processing/Makefile b/util/tracer_nvbit/tracer_tool/traces-processing/Makefile
index 899a95329..acdcd54d5 100755
--- a/util/tracer_nvbit/tracer_tool/traces-processing/Makefile
+++ b/util/tracer_nvbit/tracer_tool/traces-processing/Makefile
@@ -1,7 +1,7 @@
 TARGET := post-traces-processing
 
 $(TARGET): post-traces-processing.cpp
-	g++ -std=c++17 -O3 -g -o $@ $^
+	g++ -std=c++17 -O3 -g -Wno-unused-result -o $@ $^
 
 run: $(TARGET)
 	./$(TARGET)
diff --git a/util/tracer_nvbit/tracer_tool/traces-processing/post-traces-processing.cpp b/util/tracer_nvbit/tracer_tool/traces-processing/post-traces-processing.cpp
index cb71f5d45..ed1ade8a7 100644
--- a/util/tracer_nvbit/tracer_tool/traces-processing/post-traces-processing.cpp
+++ b/util/tracer_nvbit/tracer_tool/traces-processing/post-traces-processing.cpp
@@ -1,3 +1,5 @@
+#include <bitset>
+#include <cstring>
 #include <deque>
 #include <fstream>
 #include <iostream>
@@ -9,6 +11,7 @@
 #include <unordered_map>
 #include <vector>
 
+#include "../common.h"
 #include <errno.h>
 #include <signal.h>
 #include <unistd.h>
@@ -18,13 +21,7 @@
 using namespace std;
 
 struct threadblock_info {
-  bool initialized;
-  unsigned tb_id_x, tb_id_y, tb_id_z;
-  vector<deque<const string *>> warp_insts_array;
-  threadblock_info() {
-    initialized = false;
-    tb_id_x = tb_id_y = tb_id_z = 0;
-  }
+  vector<deque<inst_trace_t>> warp_insts_array; // one deque of traced instructions per warp, indexed by warp id
 };
 
 /// @brief There exist significant repetition in the trace. The WarpInstLUT
@@ -81,13 +78,78 @@ struct WarpInstLUT {
 void group_per_block(const char *filepath);
 void group_per_core(const char *filepath);
 
-// This program works by redirecting the stdin/stdout to child processes. The
-// stdin is piped to a process that reads from disk the input trace file. The
-// stdout is piped to a process that writes to disk the post-process trace. We
-// should preserve the original file descriptors for stdin/stdout before doing
-// redirections.
-int preserved_stdin_fileno;
-int preserved_stdout_fileno;
+bool base_stride_compress(const uint64_t *addrs, const std::bitset<32> &mask,
+                          uint64_t &base_addr, int &stride) {
+  // Returns true when the active lanes in mask form a single contiguous run
+  // whose neighbouring addresses all differ by the same stride; base_addr is
+  // the first active address. An empty mask returns true with outputs untouched.
+  bool const_stride = true;
+  bool first_bit1_found = false; // set once the first active lane is seen
+  bool last_bit1_found = false; // set once the run of active lanes has ended
+
+  for (int s = 0; s < 32; s++) {
+    if (mask.test(s) && !first_bit1_found) {
+      first_bit1_found = true;
+      base_addr = addrs[s];
+      if (s < 31 && mask.test(s + 1))
+        stride = addrs[s + 1] - addrs[s]; // NOTE(review): uint64_t difference is narrowed to int
+      else { // the first active lane has no active neighbour at s + 1
+        const_stride = false;
+        break;
+      }
+    } else if (first_bit1_found && !last_bit1_found) {
+      if (mask.test(s)) {
+        if (stride != addrs[s] - addrs[s - 1]) { // stride is converted to uint64_t for this comparison
+          const_stride = false;
+          break;
+        }
+      } else
+        last_bit1_found = true;
+    } else if (last_bit1_found) {
+      if (mask.test(s)) { // an active lane after a gap: not a single run
+        const_stride = false;
+        break;
+      }
+    }
+  }
+
+  return const_stride;
+}
+
+bool base_delta_compress(const uint64_t *addrs, const std::bitset<32> &mask,
+                         uint64_t &base_addr, std::vector<int32_t> &deltas) {
+  // base_addr = first active address; deltas receives each later active lane's offset from the previous active lane
+  bool first_bit1_found = false;
+  uint64_t last_address = 0;
+  for (int s = 0; s < 32; s++) {
+    if (mask.test(s) && !first_bit1_found) {
+      base_addr = addrs[s];
+      first_bit1_found = true;
+      last_address = addrs[s];
+    } else if (mask.test(s) && first_bit1_found) {
+      // Check that the magnitude of the delta fits into int32_t
+      uint64_t delta;
+      if (addrs[s] >= last_address) {
+        delta = addrs[s] - last_address;
+        if (delta > INT32_MAX) {
+          // Overflow detected - return false (deltas keeps any entries appended so far)
+          return false;
+        }
+      } else {
+        delta = last_address - addrs[s];
+        if (delta > (uint64_t)INT32_MAX) {
+          // Overflow detected - return false (deltas keeps any entries appended so far)
+          return false;
+        }
+      }
+
+      // Delta fits in int32_t, append it to the vector (the vector is not cleared first)
+      deltas.push_back(addrs[s] - last_address); // NOTE(review): relies on uint64_t -> int32_t narrowing to yield the signed delta
+      last_address = addrs[s];
+    }
+  }
+  return true; // Success
+}
 
 std::vector<std::string> kernelslist_list;
 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -184,122 +246,173 @@ int main(int argc, char **argv) {
 // stderr stream. The io redirection will be restored by the time the function
 // returns.
 void group_per_block(const char *filepath) {
-  preserved_stdin_fileno = dup(STDIN_FILENO);
-  preserved_stdout_fileno = dup(STDOUT_FILENO);
-
-  string filepath_str{filepath};
-  WarpInstLUT warp_inst_lut;
-
-  pid_t sink_process_pid = 0;
-  string trace_sink_cmd;
-  int sink_pipe_fd[2];
-
-  pid_t source_process_pid = 0;
-  string trace_source_cmd;
-  int source_pipe_fd[2];
   string output_filepath;
-
-  bool input_file_is_xz = false;
-  int _l = filepath_str.length();
-  if (_l > 3 && filepath_str.substr(_l - 3, 3) == ".xz") {
-    // kernel-1.trace.xz --(xz -dc)--> f --(xz -1 -T0)--> kernel-1.traceg.xz
-    input_file_is_xz = true;
-    output_filepath = filepath_str.substr(0, _l - 3) + "g.xz";
-    trace_source_cmd = "xz -dc " + filepath_str;
-    trace_sink_cmd = "xz -1 -T0 > " + output_filepath;
-  } else if (_l > 6 && filepath_str.substr(_l - 6, 6) == ".trace") {
-    // kernel-2.trace --(cat)--> f --(cat)--> kernel-2.traceg
-    input_file_is_xz = false;
-    output_filepath = filepath_str + "g";
-    trace_source_cmd = "cat " + filepath_str;
-    trace_sink_cmd = "cat > " + output_filepath;
-  } else {
-    cerr << "Only support xz or raw text format. Unable to process - and "
-            "skipping - trace file "
-         << filepath_str << endl;
-    close(preserved_stdin_fileno);
-    close(preserved_stdout_fileno);
-    return;
+  // Reads the trace from pipe, groups instructions per thread block and warp, and writes them to kernel_out
+  FILE *pipe;
+  FILE *kernel_out;
+
+  try {
+    // Use utility function to open input file
+    pipe = openFileForReading(filepath);
+
+    // Generate output filepath and open output file
+    if (hasEnding(filepath, ".xz")) {
+      output_filepath = generateOutputFilepath(filepath, "g");
+      kernel_out =
+          openFileForWriting(output_filepath, true); // Use xz compression
+    } else if (hasEnding(filepath, ".trace")) {
+      output_filepath = string(filepath) + "g";
+      kernel_out = openFileForWriting(output_filepath, false); // No compression
+    } else {
+      throw std::runtime_error("Unsupported file type!");
+    }
+  } catch (const std::runtime_error &e) {
+    throw std::runtime_error("Failed to open files: " + string(e.what())); // re-thrown with added context
   }
 
-  // cerr << "source cmd is "<<trace_source_cmd<<"\n";
-  // cerr << "sink cmd is "<<trace_sink_cmd<<"\n";
+  cerr << "Processing file " << filepath << endl;
 
-  // fork a child process as the trace source
-  if (pipe(source_pipe_fd) != 0) {
-    cerr << "Failed to create pipe\n";
-    perror("pipe");
-    exit(1);
-  }
-  source_process_pid = fork();
-  if (source_process_pid == 0) {
-    //  child process
-    close(source_pipe_fd[0]);
-    dup2(source_pipe_fd[1], STDOUT_FILENO);
-
-    // When using GDB, sending Ctrl+C to the program will send a SIGINT signal
-    // to the child process as well, subsequently causing it to terminate. To
-    // avoid this, we let the child process ignore (SIG_IGN) the SIGINT signal.
-    // Reference:
-    // https://stackoverflow.com/questions/38404925/gdb-interrupt-running-process-without-killing-child-processes
-    signal(SIGINT, SIG_IGN);
-
-    execle("/bin/sh", "sh", "-c", trace_source_cmd.c_str(), NULL, environ);
-    perror("execle"); // child shouldn't reach here if all is well.
-    exit(1);
-  } else if (source_process_pid > 0) {
-    // parent process - the trace post processor
-    // stdin is now redirected to the read end of the source_pipe
-    close(source_pipe_fd[1]);
-    int r = dup2(source_pipe_fd[0], STDIN_FILENO);
-  } else {
-    cerr << "Failed to fork data source process\n";
-    perror("fork");
-    exit(1);
+  vector<threadblock_info> insts;
+
+  // Read the kernel name: a uint64_t length followed by that many bytes
+  std::string kernel_name;
+  uint64_t name_size;
+  fread(&name_size, sizeof(name_size), 1, pipe); // NOTE(review): fread results are never checked in this function
+  kernel_name.resize(name_size);
+  fread(kernel_name.data(), name_size, 1, pipe);
+
+  // Read the kernel_header struct that follows the name
+  kernel_header header;
+  fread(&header, sizeof(header), 1, pipe);
+
+  insts.resize(header.grid_dim_x * header.grid_dim_y * header.grid_dim_z); // one entry per thread block in the grid
+  vector<vector<bool>> ldgsts_flags(header.grid_dim_x * header.grid_dim_y *
+                                    header.grid_dim_z);
+
+  for (unsigned tb = 0; tb < insts.size(); ++tb) {
+    insts[tb].warp_insts_array.resize(ceil( // warps per block = ceil(threads per block / 32)
+        float(header.block_dim_x * header.block_dim_y * header.block_dim_z) /
+        32));
+
+    ldgsts_flags[tb].resize(ceil(
+        float(header.block_dim_x * header.block_dim_y * header.block_dim_z) /
+        32));
+    for (unsigned j = 0; j < ldgsts_flags[tb].size(); j++) {
+      ldgsts_flags[tb][j] = true; // true: the next LDGSTS record of this warp is dropped
+    }
   }
 
-  // fork a child process as the trace sink
-  if (pipe(sink_pipe_fd) != 0) {
-    cerr << "Failed to create pipe\n";
-    perror("pipe");
-    exit(1);
+  unsigned size;
+  while (fread(&size, sizeof(unsigned), 1, pipe)) { // each record is preceded by its byte size
+    inst_trace_t inst = {0};
+    fread(&inst, size, 1, pipe); // NOTE(review): size comes from the file and is not checked against sizeof(inst)
+
+    unsigned tb_id_x = inst.cta_id_x;
+    unsigned tb_id_y = inst.cta_id_y;
+    unsigned tb_id_z = inst.cta_id_z;
+    unsigned tb_id = tb_id_z * header.grid_dim_y * header.grid_dim_x + // linearized thread block index
+                     tb_id_y * header.grid_dim_x + tb_id_x;
+    unsigned warp_id = inst.warpid_tb; // NOTE(review): tb_id and warp_id are used as indices below without a bounds check
+
+    std::string opcode = inst.base.opcode;
+    if (opcode.find("LDGSTS") != string::npos) {
+      if (!ldgsts_flags[tb_id][warp_id]) {
+        insts[tb_id].warp_insts_array[warp_id].push_back(inst);
+      }
+      ldgsts_flags[tb_id][warp_id] = !ldgsts_flags[tb_id][warp_id]; // toggle: every other LDGSTS record per warp is kept, starting with the second
+    } else {
+      insts[tb_id].warp_insts_array[warp_id].push_back(inst);
+    }
   }
-  sink_process_pid = fork();
-  if (sink_process_pid == 0) {
-    // child process
-    close(sink_pipe_fd[1]);
-    dup2(sink_pipe_fd[0], STDIN_FILENO);
-    signal(SIGINT, SIG_IGN); // ignore SIGINT
-    execle("/bin/sh", "sh", "-c", trace_sink_cmd.c_str(), NULL, environ);
-    perror("execle"); // child shouldn't reach here if all is well.
-    exit(1);
-  } else if (sink_process_pid > 0) {
-    // parent process - the trace post processor
-    // stdout is now redirected to the write end of the sink_pipe
-    close(sink_pipe_fd[0]);
-    int r = dup2(sink_pipe_fd[1], STDOUT_FILENO);
-  } else {
-    cerr << "Failed to fork data sink process\n";
-    perror("fork");
-    exit(1);
+
+  fwrite(&name_size, sizeof(uint64_t), 1, kernel_out); // kernel name and header are written back unchanged
+  fwrite(kernel_name.c_str(), kernel_name.size(), 1, kernel_out);
+
+  fwrite(&header, sizeof(kernel_header), 1, kernel_out);
+
+  for (unsigned tb_id = 0; tb_id < insts.size(); ++tb_id) {
+    if (insts[tb_id].warp_insts_array.size() > 0) {
+      // write the total warp count of this thread block
+      unsigned total_warp_count = insts[tb_id].warp_insts_array.size();
+      fwrite(&total_warp_count, sizeof(unsigned), 1, kernel_out);
+
+      for (unsigned warp_id = 0; warp_id < insts[tb_id].warp_insts_array.size();
+           ++warp_id) {
+        // write the total instruction count of this warp
+        unsigned total_inst_count =
+            insts[tb_id].warp_insts_array[warp_id].size();
+        fwrite(&total_inst_count, sizeof(unsigned), 1, kernel_out);
+
+        for (unsigned inst_id = 0;
+             inst_id < insts[tb_id].warp_insts_array[warp_id].size();
+             ++inst_id) {
+          inst_trace_t &full_inst =
+              insts[tb_id].warp_insts_array[warp_id][inst_id];
+
+          if (!full_inst.base.is_mem) { // non-memory instruction: only the base record is written
+            sim_inst_trace_t inst = full_inst.base;
+            inst_type_t type = INST_BASE;
+
+            fwrite(&type, sizeof(inst_type_t), 1, kernel_out); // every record is preceded by its inst_type_t tag
+            fwrite(&inst, sizeof(inst), 1, kernel_out);
+          } else {
+            std::bitset<32> mask(full_inst.base.active_mask & // lanes set in both the active and the predicate mask
+                                 full_inst.base.predicate_mask);
+            bool base_stride_success = false;
+            uint64_t base_addr = 0;
+            int stride = 0;
+            std::vector<int32_t> deltas;
+            bool base_delta_success = false;
+
+            // try base+stride format
+            base_stride_success =
+                base_stride_compress(full_inst.addrs, mask, base_addr, stride);
+            if (!base_stride_success) {
+              // if base+stride fails, try base+delta format
+              base_delta_success =
+                  base_delta_compress(full_inst.addrs, mask, base_addr, deltas);
+            }
+
+            if (base_stride_success) {
+              sim_inst_trace_stride_t inst;
+              inst.base = full_inst.base;
+              inst.base_addr = base_addr;
+              inst.stride = stride;
+              inst_type_t type = INST_STRIDE;
+
+              fwrite(&type, sizeof(inst_type_t), 1, kernel_out);
+              fwrite(&inst, sizeof(inst), 1, kernel_out);
+            } else if (base_delta_success) {
+              sim_inst_trace_delta_t inst;
+              inst.base = full_inst.base;
+              inst.base_addr = base_addr;
+              deltas.resize(32, 0ll); // zero-pad before the memcpy below; presumably inst.delta holds 32 entries - confirm in common.h
+              memcpy(inst.delta, deltas.data(), sizeof(inst.delta));
+              inst_type_t type = INST_DELTA;
+
+              fwrite(&type, sizeof(inst_type_t), 1, kernel_out);
+              fwrite(&inst, sizeof(inst), 1, kernel_out);
+            } else {
+              // save the addresses as is
+              sim_inst_trace_flat_t inst;
+              inst.base = full_inst.base;
+              memcpy(inst.addrs, full_inst.addrs, sizeof(inst.addrs));
+              inst_type_t type = INST_FLAT;
+              fwrite(&type, sizeof(inst_type_t), 1, kernel_out);
+              fwrite(&inst, sizeof(inst), 1, kernel_out);
+            }
+          }
+        }
+      }
+    }
   }
+  fclose(kernel_out); // NOTE(review): pipe is not closed anywhere in the visible code
 
-  cerr << "Processing file " << filepath << endl;
+  /*
+  // legacy code starts here. Pending to be removed.
 
-  vector<threadblock_info> insts;
-  unsigned grid_dim_x, grid_dim_y, grid_dim_z, tb_dim_x, tb_dim_y, tb_dim_z;
-  unsigned tb_id_x, tb_id_y, tb_id_z, tb_id, warpid_tb;
-  unsigned lineinfo, linenum;
-  string line;
-  stringstream ss;
-  string string1, string2;
-  bool found_grid_dim = false, found_block_dim = false;
-
-  // Add a flag for LDGSTS instruction to indicate which one to remove
-  vector<vector<bool>> ldgsts_flags; // true to remove, false to not
-
-  // Important... without clear(), cin.eof() may evaluate to true on the second
-  // kernel
+  // Important... without clear(), cin.eof() may evaluate to true on the
+  // second kernel
   cin.clear();
   clearerr(stdin);
   while (!cin.eof()) {
@@ -315,14 +428,11 @@ void group_per_block(const char *filepath) {
       ss.ignore();
       ss >> string1 >> string2;
       if (string1 == "grid" && string2 == "dim") {
-        sscanf(line.c_str(), "-grid dim = (%d,%d,%d)", &grid_dim_x, &grid_dim_y,
-               &grid_dim_z);
-        found_grid_dim = true;
-      } else if (string1 == "block" && string2 == "dim") {
-        sscanf(line.c_str(), "-block dim = (%d,%d,%d)", &tb_dim_x, &tb_dim_y,
-               &tb_dim_z);
-        found_block_dim = true;
-      } else if (string1 == "enable" && string2 == "lineinfo") {
+        sscanf(line.c_str(), "-grid dim = (%d,%d,%d)", &grid_dim_x,
+  &grid_dim_y, &grid_dim_z); found_grid_dim = true; } else if (string1 ==
+  "block" && string2 == "dim") { sscanf(line.c_str(), "-block dim =
+  (%d,%d,%d)", &tb_dim_x, &tb_dim_y, &tb_dim_z); found_block_dim = true; }
+  else if (string1 == "enable" && string2 == "lineinfo") {
         sscanf(line.c_str(), "-enable lineinfo = %d", &lineinfo);
       }
 
@@ -351,9 +461,8 @@ void group_per_block(const char *filepath) {
       ss.str(line);
       ss >> tb_id_x >> tb_id_y >> tb_id_z >> warpid_tb;
       tb_id =
-          tb_id_z * grid_dim_y * grid_dim_x + tb_id_y * grid_dim_x + tb_id_x;
-      if (!insts[tb_id].initialized) {
-        insts[tb_id].tb_id_x = tb_id_x;
+          tb_id_z * grid_dim_y * grid_dim_x + tb_id_y * grid_dim_x +
+  tb_id_x; if (!insts[tb_id].initialized) { insts[tb_id].tb_id_x = tb_id_x;
         insts[tb_id].tb_id_y = tb_id_y;
         insts[tb_id].tb_id_z = tb_id_z;
         insts[tb_id].initialized = true;
@@ -384,7 +493,8 @@ void group_per_block(const char *filepath) {
         inst_ptr = warp_inst_lut.register_new_entry(rest_of_line);
 
       // One actual LDGSTS instruction includes 2 LDGSTS instructions in the
-      // trace, because it has two memory references. This is trying to remove
+      // trace, because it has two memory references. This is trying to
+  remove
       // the one with the shared memory address.
 
       if (opcode.find("LDGSTS") != string::npos) {
@@ -398,33 +508,37 @@ void group_per_block(const char *filepath) {
     }
   }
 
-  for (unsigned i = 0; i < insts.size(); ++i) {
+  for (unsigned tb_id = 0; tb_id < insts.size(); ++tb_id) {
     // ofs<<string<<"\n";
-    if (insts[i].initialized && insts[i].warp_insts_array.size() > 0) {
-      cout << "\n"
+    if (insts[tb_id].initialized && insts[tb_id].warp_insts_array.size() >
+  0) { cout << "\n"
            << "#BEGIN_TB"
            << "\n";
       cout << "\n"
-           << "thread block = " << insts[i].tb_id_x << "," << insts[i].tb_id_y
-           << "," << insts[i].tb_id_z << "\n";
+           << "thread block = " << insts[tb_id].tb_id_x << ","
+           << insts[tb_id].tb_id_y << "," << insts[tb_id].tb_id_z << "\n";
     } else {
-      cerr << "Warning: Thread block " << insts[i].tb_id_x << ","
-           << insts[i].tb_id_y << "," << insts[i].tb_id_z << " is empty"
+      cerr << "Warning: Thread block " << insts[tb_id].tb_id_x << ","
+           << insts[tb_id].tb_id_y << "," << insts[tb_id].tb_id_z << " is
+  empty"
            << "\n";
       continue;
     }
-    for (unsigned j = 0; j < insts[i].warp_insts_array.size(); ++j) {
+    for (unsigned warp_id = 0; warp_id <
+  insts[tb_id].warp_insts_array.size();
+         ++warp_id) {
       cout << "\n"
-           << "warp = " << j << "\n";
-      cout << "insts = " << insts[i].warp_insts_array[j].size() << "\n";
-      if (insts[i].warp_insts_array[j].size() == 0) {
-        cerr << "Warning: Warp " << j << " in thread block" << insts[i].tb_id_x
-             << "," << insts[i].tb_id_y << "," << insts[i].tb_id_z
-             << " is empty"
+           << "warp = " << warp_id << "\n";
+      cout << "insts = " << insts[tb_id].warp_insts_array[warp_id].size()
+           << "\n";
+      if (insts[tb_id].warp_insts_array[warp_id].size() == 0) {
+        cerr << "Warning: Warp " << warp_id << " in thread block"
+             << insts[tb_id].tb_id_x << "," << insts[tb_id].tb_id_y << ","
+             << insts[tb_id].tb_id_z << " is empty"
              << "\n";
       }
-      for (auto it = insts[i].warp_insts_array[j].cbegin();
-           it != insts[i].warp_insts_array[j].cend(); ++it) {
+      for (auto it = insts[tb_id].warp_insts_array[warp_id].cbegin();
+           it != insts[tb_id].warp_insts_array[warp_id].cend(); ++it) {
         // dereference once: const string*
         // dereference twice: const string
         cout << **it << "\n";
@@ -432,17 +546,7 @@ void group_per_block(const char *filepath) {
     }
     cout << endl << "#END_TB" << endl;
   }
-
-  close(source_pipe_fd[0]);
-  close(source_pipe_fd[1]);
-  close(sink_pipe_fd[0]);
-  close(sink_pipe_fd[1]);
-
-  // restore stdin/stdout file descriptor
-  dup2(preserved_stdin_fileno, STDIN_FILENO);
-  dup2(preserved_stdout_fileno, STDOUT_FILENO);
-  close(preserved_stdin_fileno);
-  close(preserved_stdout_fileno);
+  */
 }
 
 void group_per_core(const char *filepath) {