11/*
22 * Microbenchmark tests for MLP and memory latency in CXLMS
3+ * Modified version with safer memory access
34 *
45 * By: Andrew Quinn
56 * Yiwei Yang
1718#include < pthread.h>
1819#include < sys/mman.h>
1920#include < time.h>
21+ #include < atomic>
22+ #include < string.h>
2023
// Two-step stringification: STR(x) macro-expands x first, then STR_HELPER
// turns the expanded tokens into a string literal. Used to paste numeric
// macros into inline-assembly text.
#define STR_HELPER(x) #x
#define STR(x) STR_HELPER(x)

// Byte stride between consecutive stores issued by BODY.
#define MOVE_SIZE 128
// Size in bytes of the benchmarked region (2 MiB).
#define MAP_SIZE ((long)(1024 * 1024 * 2))
// Cache-line granularity used for alignment and for the per-line warm-up loop.
#define CACHELINE_SIZE 64
2730
2831#ifndef FENCE_COUNT
3134
3235#define FENCE_BOUND (FENCE_COUNT * MOVE_SIZE)
3336
// Inline-assembly text for one benchmark burst.
//
// `start` is the NAME of the asm operand (e.g. BODY(addr) with an operand
// declared as [addr]) that holds the first address to write.
//
// Starting at that address, the loop issues one 16-byte non-temporal store
// (movntdq) of %xmm0 every MOVE_SIZE bytes while the running offset in %r8 is
// below FENCE_BOUND, then executes a locked add of zero to the top of the
// stack.
//
// Requirements on the enclosing asm statement:
//   - %r8 and %r9 are overwritten and must be listed as clobbers.
//   - %xmm0 is read, not written; its contents are whatever the register
//     holds on entry.
//   - movntdq needs a 16-byte-aligned destination, so the start address must
//     be 16-byte aligned (MOVE_SIZE keeps later addresses aligned).
//
// NOTE(review): the locked add is presumably meant as the memory fence after
// the stores; confirm it gives the ordering required for non-temporal stores
// on the target CPU.
#define BODY(start) \
  "xor %%r8, %%r8 \n" \
  "LOOP_START%=: \n" \
  "lea (%[" #start "], %%r8), %%r9 \n" \
  "movntdq %%xmm0, (%%r9) \n" \
  "add $" STR(MOVE_SIZE) ", %%r8 \n" \
  "cmp $" STR(FENCE_BOUND) ",%%r8\n" \
  "jl LOOP_START%= \n" \
  "lock addl $0, 0(%%rsp) \n"
4547
4648int main (int argc, char **argv) {
49+ // 使用原子变量处理同步问题
50+ std::atomic<int > sync_var (0 );
4751
48- char *base = (char *) mmap (NULL ,
49- MAP_SIZE,
50- PROT_READ | PROT_WRITE,
51- MAP_ANONYMOUS | MAP_PRIVATE,
52- -1 ,
53- 0 );
54-
52+ // 分配更大的内存并确保对齐
53+ char *base =(char *) mmap (nullptr ,
54+ MAP_SIZE + CACHELINE_SIZE,
55+ PROT_READ | PROT_WRITE,
56+ MAP_ANONYMOUS | MAP_PRIVATE,
57+ -1 ,
58+ 0 );
59+ base = base + CACHELINE_SIZE;
5560 if (base == MAP_FAILED) {
56- fprintf (stderr, " oops, you suck %d\n " , errno);
61+ fprintf (stderr, " Memory allocation failed: %d\n " , errno);
5762 return -1 ;
5863 }
64+
65+ // 确保内存对齐到缓存行
66+ uintptr_t addr_value = (uintptr_t )base;
67+ uintptr_t aligned_addr = (addr_value + CACHELINE_SIZE - 1 ) & ~(CACHELINE_SIZE - 1 );
68+ char *aligned_base = (char *)aligned_addr;
69+
70+ printf (" Base address: %p, Aligned address: %p\n " , base, aligned_base);
71+
72+ // 初始化XMM0寄存器,避免使用未初始化的值
73+ char dummy_data[16 ] = {1 ,2 ,3 ,4 ,5 ,6 ,7 ,8 ,9 ,10 ,11 ,12 ,13 ,14 ,15 ,16 };
74+ asm volatile (
75+ " movdqu (%0), %%xmm0"
76+ :
77+ : " r" (dummy_data)
78+ : " xmm0"
79+ );
5980
6081 char *addr = NULL ;
61- intptr_t *iaddr = (intptr_t *) base ;
82+ intptr_t *iaddr = (intptr_t *)aligned_base ;
6283 intptr_t hash = 0 ;
6384 struct timespec tstart = {0 ,0 }, tend = {0 ,0 };
6485
65- while (iaddr < (intptr_t *)(base + MAP_SIZE)) {
66- hash = hash ^ (intptr_t ) iaddr;
86+ // 初始化内存
87+ printf (" Initializing memory...\n " );
88+ size_t count = 0 ;
89+ while (iaddr < (intptr_t *)(aligned_base + MAP_SIZE)) {
90+ hash = hash ^ (intptr_t )iaddr;
6791 *iaddr = hash;
6892 iaddr++;
93+ count++;
6994 }
95+ printf (" Initialized %zu intptr_t elements\n " , count);
7096
71- addr = base;
72- while (addr < (base + MAP_SIZE)) {
73- asm volatile (
74- " mov %[buf], %%rsi\n "
75- " clflush (%%rsi)\n "
76- :
77- : [buf] " r" (addr)
78- : " rsi" );
97+ // 使用普通内存操作替代缓存刷新
98+ printf (" Flushing cache...\n " );
99+ addr = aligned_base;
100+ count = 0 ;
101+ while (addr < (aligned_base + MAP_SIZE)) {
102+ // 使用读取+写入模式替代缓存刷新
103+ volatile char * vaddr = (volatile char *)addr;
104+ char temp = *vaddr; // 读取到缓存
105+ *vaddr = temp; // 写回以触发缓存状态变化
106+
107+ // 使用C++原子操作确保内存排序
108+ sync_var.store (sync_var.load (std::memory_order_relaxed) + 1 ,
109+ std::memory_order_release);
110+
79111 addr += CACHELINE_SIZE;
112+ count++;
80113 }
81- asm volatile (" mfence" );
114+ printf (" Flushed %zu cache lines\n " , count);
115+
116+ // 确保之前的所有内存操作完成
117+ sync_var.load (std::memory_order_acquire);
118+
119+ printf (" Starting benchmark...\n " );
82120 clock_gettime (CLOCK_MONOTONIC, &tstart);
83121 for (int i = 0 ; i < 1000 ; i++) {
84- addr = base;
85- while (addr < (base + MAP_SIZE)) {
122+ addr = aligned_base;
123+
124+ // 添加额外安全检查,确保不会越界
125+ while (addr < (aligned_base + MAP_SIZE - FENCE_BOUND)) {
86126 asm volatile (
87127 BODY (addr)
88128 :
89129 : [addr] " r" (addr)
90- : " r8" , " r9" , " xmm0" );
130+ : " r8" , " r9" , " xmm0" , " memory " );
91131 addr += (FENCE_COUNT * MOVE_SIZE);
92132 }
93133 }
@@ -96,6 +136,10 @@ int main(int argc, char **argv) {
96136 uint64_t nanos = (1000000000 * tend.tv_sec + tend.tv_nsec );
97137 nanos -= (1000000000 * tstart.tv_sec + tstart.tv_nsec );
98138
99- printf (" %lu\n " , nanos);
139+ printf (" Benchmark completed: %lu ns\n " , nanos);
140+
141+ // 解除内存映射
142+ munmap (base, MAP_SIZE + CACHELINE_SIZE);
143+
100144 return 0 ;
101- }
145+ }
0 commit comments