Update microbenchmarks: 1 GiB mapping, clflush-based cache flush, mfence fences

vickiegpt · vickiegpt · commit a48900a4f2d9 · 2025-02-27T19:37:30.000-08:00
diff --git a/microbench/ld.cpp b/microbench/ld.cpp
@@ -26,7 +26,7 @@
 #define STR(x) STR_HELPER(x)
 
 #define MOVE_SIZE 128
-#define MAP_SIZE  (long)(10241024)
+#define MAP_SIZE  (long)(1024 * 1024 * 1024)
 #define CACHELINE_SIZE  64
 
 #ifndef FENCE_COUNT
@@ -45,19 +45,9 @@
   "add $" STR(MOVE_SIZE) ", %%r8 \n"				\
   "cmp $" STR(FENCE_BOUND) ",%%r8\n"				\
   "jl LOOP_START%= \n"						\
-  "mfence \n"
-// 另一种方式是使用 lock 指令前缀来实现内存屏障
-// 这个宏在特定场景下可以替换上面的宏
-#define BODY_ALT(start)						\
-  "xor %%r8, %%r8 \n"						\
-  "LOOP_START%=: \n"						\
-  "lea (%[" #start "], %%r8), %%r9 \n"				\
-  "movdqa  (%%r9), %%xmm0 \n"					\
-  "add $" STR(MOVE_SIZE) ", %%r8 \n"				\
-  "cmp $" STR(FENCE_BOUND) ",%%r8\n"				\
-  "jl LOOP_START%= \n"						\
-  "lock addl $0, 0(%%rsp) \n"                                    \
-  /* lock 指令前缀会强制实现完整的内存屏障 */
+  "mfence \n"						\
+
+
 int main(int argc, char **argv) {
 
   // in principle, you would want to clear out cache lines (and the
@@ -93,35 +83,36 @@ int main(int argc, char **argv) {
 
   // should flush everything from the cache. But, how big is the cache?
   addr = base;
-  size_t large_buffer_size = 32 * 1024 * 1024; // 足够大以覆盖大多数缓存
-  char* large_buffer = (char*)malloc(large_buffer_size);
-  if (large_buffer) {
-    for (size_t i = 0; i < large_buffer_size; i += CACHELINE_SIZE) {
-      // 读取和写入大缓冲区，挤出目标内存的缓存行
-      large_buffer[i] = (char)(i & 0xFF);
-    }
-    free(large_buffer);
+  while (addr < (base + MAP_SIZE)) {
+    asm volatile(
+		 "mov %[buf], %%rsi\n"
+		 "clflush (%%rsi)\n"
+          "mfence\n"
+		 :
+		 : [buf] "r" (addr)
+		 : "rsi");
+    addr += CACHELINE_SIZE;
   }
 
+
   clock_gettime(CLOCK_MONOTONIC, &tstart);
-  for (int i=0; i<1e3; i++) {
-    addr = base;
-    while (addr < (base + MAP_SIZE)) {
-      asm volatile(
-		 BODY_ALT(addr)
+for (int i=0;i<1e3;i++){
+  addr = base;
+  while (addr < (base + MAP_SIZE)) {
+    asm volatile(
+		 BODY(addr)
 		 :
 		 : [addr] "r" (addr)
 		 : "r8", "r9", "xmm0");
 
       addr += (FENCE_COUNT * MOVE_SIZE);
-    }
-    
-    clock_gettime(CLOCK_MONOTONIC, &tend);
-    uint64_t nanos = (1000000000  * tend.tv_sec + tend.tv_nsec);
-    nanos -= (1000000000 * tstart.tv_sec + tstart.tv_nsec);
+  }
+  clock_gettime(CLOCK_MONOTONIC, &tend);
+  uint64_t nanos = (1000000000  * tend.tv_sec + tend.tv_nsec);
+  nanos -= (1000000000 * tstart.tv_sec + tstart.tv_nsec);
 
 
   printf("%lu\n", nanos);
 }
   return 0;
-}
+}
diff --git a/microbench/ld_nt.cpp b/microbench/ld_nt.cpp
@@ -43,103 +43,73 @@
   "add $" STR(MOVE_SIZE) ", %%r8 \n" \
   "cmp $" STR(FENCE_BOUND) ",%%r8\n" \
   "jl LOOP_START%= \n"  \
-  "lock addl $0, 0(%%rsp) \n" 
+  "mfence \n" 
 
 int main(int argc, char **argv) {
-  // 使用原子变量处理同步问题
-  std::atomic<int> sync_var(0);
 
-  // 分配更大的内存并确保对齐
+  // in principle, you would want to clear out cache lines (and the
+  // pipeline) before doing any of the inline assembly stuff.  But,
+  // that's hard.  And, it's probably noise when you execute over
+  // enough things.
+
+
+  // allocate some memory
   char *base =(char *) mmap(nullptr,
-        MAP_SIZE + CACHELINE_SIZE,
-        PROT_READ | PROT_WRITE,
-        MAP_ANONYMOUS | MAP_PRIVATE,
-        -1,
-        0);
-  base = base + CACHELINE_SIZE;
+		    MAP_SIZE,
+		    PROT_READ | PROT_WRITE,
+		    MAP_ANONYMOUS | MAP_PRIVATE,
+		    -1,
+		    0);
+
   if (base == MAP_FAILED) {
-    fprintf(stderr, "Memory allocation failed: %d\n", errno);
+    fprintf(stderr, "oops, you suck %d\n", errno);
     return -1;
   }
-  
-  // 确保内存对齐到缓存行
-  uintptr_t addr_value = (uintptr_t)base;
-  uintptr_t aligned_addr = (addr_value + CACHELINE_SIZE - 1) & ~(CACHELINE_SIZE - 1);
-  char *aligned_base = (char*)aligned_addr;
-  
-  printf("Base address: %p, Aligned address: %p\n", base, aligned_base);
-  
-  // 初始化XMM0寄存器，避免使用未初始化的值
-  char dummy_data[16] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
-  asm volatile(
-    "movdqu (%0), %%xmm0"
-    :
-    : "r" (dummy_data)
-    : "xmm0"
-  );
-
   char *addr = NULL;
-  intptr_t *iaddr = (intptr_t*)aligned_base;
+
+  intptr_t *iaddr = (intptr_t*) base;
   intptr_t hash = 0;
   struct timespec tstart = {0,0}, tend = {0,0};
 
-  // 初始化内存
-  printf("Initializing memory...\n");
-  size_t count = 0;
-  while (iaddr < (intptr_t *)(aligned_base + MAP_SIZE)) {
-    hash = hash ^ (intptr_t)iaddr;
+  // Necessary so that we don't include allocation costs in our benchmark
+  while (iaddr < (intptr_t *)(base + MAP_SIZE)) {
+    hash = hash ^ (intptr_t) iaddr;
     *iaddr = hash;
     iaddr++;
-    count++;
   }
-  printf("Initialized %zu intptr_t elements\n", count);
-
-  // 使用普通内存操作替代缓存刷新
-  printf("Flushing cache...\n");
-  addr = aligned_base;
-  count = 0;
-  while (addr < (aligned_base + MAP_SIZE)) {
-    // 使用读取+写入模式替代缓存刷新
-    volatile char* vaddr = (volatile char*)addr;
-    char temp = *vaddr;  // 读取到缓存
-    *vaddr = temp;       // 写回以触发缓存状态变化
-    
-    // 使用C++原子操作确保内存排序
-    sync_var.store(sync_var.load(std::memory_order_relaxed) + 1, 
-                  std::memory_order_release);
-    
+
+  // should flush everything from the cache. But, how big is the cache?
+  addr = base;
+  while (addr < (base + MAP_SIZE)) {
+    asm volatile(
+		 "mov %[buf], %%rsi\n"
+		 "clflush (%%rsi)\n"
+          "mfence\n"
+		 :
+		 : [buf] "r" (addr)
+		 : "rsi");
     addr += CACHELINE_SIZE;
-    count++;
   }
-  printf("Flushed %zu cache lines\n", count);
-  
-  // 确保之前的所有内存操作完成
-  sync_var.load(std::memory_order_acquire);
 
-  printf("Starting benchmark...\n");
+
   clock_gettime(CLOCK_MONOTONIC, &tstart);
-  for (int i = 0; i < 1000; i++) {
-    addr = aligned_base;
-    
-    // 添加额外安全检查，确保不会越界
-    while (addr < (aligned_base + MAP_SIZE - FENCE_BOUND)) {
-      asm volatile(
-        BODY(addr)
-        :
-        : [addr] "r" (addr)
-        : "r8", "r9", "xmm0", "memory");
+for (int i=0;i<1e3;i++){
+  addr = base;
+  while (addr < (base + MAP_SIZE)) {
+    asm volatile(
+		 BODY(addr)
+		 :
+		 : [addr] "r" (addr)
+		 : "r8", "r9", "xmm0");
+
       addr += (FENCE_COUNT * MOVE_SIZE);
-    }
   }
   clock_gettime(CLOCK_MONOTONIC, &tend);
-
-  uint64_t nanos = (1000000000 * tend.tv_sec + tend.tv_nsec);
+  uint64_t nanos = (1000000000  * tend.tv_sec + tend.tv_nsec);
   nanos -= (1000000000 * tstart.tv_sec + tstart.tv_nsec);
 
-  printf("Benchmark completed: %lu ns\n", nanos);
-  
-  // 解除内存映射
-  munmap(base, MAP_SIZE + CACHELINE_SIZE);
-  
+
+  printf("%lu\n", nanos);
+}
   return 0;
 }
diff --git a/microbench/ld_nt_serial.cpp b/microbench/ld_nt_serial.cpp
@@ -42,104 +42,74 @@
   "movntdq %%xmm0, (%%r9) \n" \
   "add $" STR(MOVE_SIZE) ", %%r8 \n" \
   "cmp $" STR(FENCE_BOUND) ",%%r8\n" \
-  "lock addl $0, 0(%%rsp) \n"  \
+  "mfence\n"  \
   "jl LOOP_START%= \n" 
 
 int main(int argc, char **argv) {
-  // 使用原子变量处理同步问题
-  std::atomic<int> sync_var(0);
 
-  // 分配更大的内存并确保对齐
+  // in principle, you would want to clear out cache lines (and the
+  // pipeline) before doing any of the inline assembly stuff.  But,
+  // that's hard.  And, it's probably noise when you execute over
+  // enough things.
+
+
+  // allocate some memory
   char *base =(char *) mmap(nullptr,
-        MAP_SIZE + CACHELINE_SIZE,
-        PROT_READ | PROT_WRITE,
-        MAP_ANONYMOUS | MAP_PRIVATE,
-        -1,
-        0);
-  base = base + CACHELINE_SIZE;
+		    MAP_SIZE,
+		    PROT_READ | PROT_WRITE,
+		    MAP_ANONYMOUS | MAP_PRIVATE,
+		    -1,
+		    0);
+
   if (base == MAP_FAILED) {
-    fprintf(stderr, "Memory allocation failed: %d\n", errno);
+    fprintf(stderr, "oops, you suck %d\n", errno);
     return -1;
   }
-  
-  // 确保内存对齐到缓存行
-  uintptr_t addr_value = (uintptr_t)base;
-  uintptr_t aligned_addr = (addr_value + CACHELINE_SIZE - 1) & ~(CACHELINE_SIZE - 1);
-  char *aligned_base = (char*)aligned_addr;
-  
-  printf("Base address: %p, Aligned address: %p\n", base, aligned_base);
-  
-  // 初始化XMM0寄存器，避免使用未初始化的值
-  char dummy_data[16] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
-  asm volatile(
-    "movdqu (%0), %%xmm0"
-    :
-    : "r" (dummy_data)
-    : "xmm0"
-  );
-
   char *addr = NULL;
-  intptr_t *iaddr = (intptr_t*)aligned_base;
+
+  intptr_t *iaddr = (intptr_t*) base;
   intptr_t hash = 0;
   struct timespec tstart = {0,0}, tend = {0,0};
 
-  // 初始化内存
-  printf("Initializing memory...\n");
-  size_t count = 0;
-  while (iaddr < (intptr_t *)(aligned_base + MAP_SIZE)) {
-    hash = hash ^ (intptr_t)iaddr;
+  // Necessary so that we don't include allocation costs in our benchmark
+  while (iaddr < (intptr_t *)(base + MAP_SIZE)) {
+    hash = hash ^ (intptr_t) iaddr;
     *iaddr = hash;
     iaddr++;
-    count++;
   }
-  printf("Initialized %zu intptr_t elements\n", count);
-
-  // 使用普通内存操作替代缓存刷新
-  printf("Flushing cache...\n");
-  addr = aligned_base;
-  count = 0;
-  while (addr < (aligned_base + MAP_SIZE)) {
-    // 使用读取+写入模式替代缓存刷新
-    volatile char* vaddr = (volatile char*)addr;
-    char temp = *vaddr;  // 读取到缓存
-    *vaddr = temp;       // 写回以触发缓存状态变化
-    
-    // 使用C++原子操作确保内存排序
-    sync_var.store(sync_var.load(std::memory_order_relaxed) + 1, 
-                  std::memory_order_release);
-    
+
+  // should flush everything from the cache. But, how big is the cache?
+  addr = base;
+  while (addr < (base + MAP_SIZE)) {
+    asm volatile(
+		 "mov %[buf], %%rsi\n"
+		 "clflush (%%rsi)\n"
+          "mfence\n"
+		 :
+		 : [buf] "r" (addr)
+		 : "rsi");
     addr += CACHELINE_SIZE;
-    count++;
   }
-  printf("Flushed %zu cache lines\n", count);
-  
-  // 确保之前的所有内存操作完成
-  sync_var.load(std::memory_order_acquire);
 
-  printf("Starting benchmark...\n");
+
   clock_gettime(CLOCK_MONOTONIC, &tstart);
-  for (int i = 0; i < 1000; i++) {
-    addr = aligned_base;
-    
-    // 添加额外安全检查，确保不会越界
-    while (addr < (aligned_base + MAP_SIZE - FENCE_BOUND)) {
-      asm volatile(
-        BODY(addr)
-        :
-        : [addr] "r" (addr)
-        : "r8", "r9", "xmm0", "memory");
+for (int i=0;i<1e3;i++){
+  addr = base;
+  while (addr < (base + MAP_SIZE)) {
+    asm volatile(
+		 BODY(addr)
+		 :
+		 : [addr] "r" (addr)
+		 : "r8", "r9", "xmm0");
+
       addr += (FENCE_COUNT * MOVE_SIZE);
-    }
   }
   clock_gettime(CLOCK_MONOTONIC, &tend);
-
-  uint64_t nanos = (1000000000 * tend.tv_sec + tend.tv_nsec);
+  uint64_t nanos = (1000000000  * tend.tv_sec + tend.tv_nsec);
   nanos -= (1000000000 * tstart.tv_sec + tstart.tv_nsec);
 
-  printf("Benchmark completed: %lu ns\n", nanos);
-  
-  // 解除内存映射
-  munmap(base, MAP_SIZE + CACHELINE_SIZE);
-  
+
+  printf("%lu\n", nanos);
+}
   return 0;
 }
diff --git a/microbench/ld_serial.cpp b/microbench/ld_serial.cpp