4242 " movntdq %%xmm0, (%%r9) \n " \
4343 " add $" STR(MOVE_SIZE) " , %%r8 \n " \
4444 " cmp $" STR(FENCE_BOUND) " ,%%r8\n " \
45- " lock addl $0, 0(%%rsp) \n " \
45+ " mfence \n " \
4646 " jl LOOP_START%= \n "
4747
4848int main (int argc, char **argv) {
49- // 使用原子变量处理同步问题
50- std::atomic<int > sync_var (0 );
5149
52- // 分配更大的内存并确保对齐
50+ // in principle, you would want to clear out cache lines (and the
51+ // pipeline) before doing any of the inline assembly stuff. But,
52+ // that's hard. And, it's probably noise when you execute over
53+ // enough things.
54+
55+
56+ // allocate some memory
5357 char *base =(char *) mmap (nullptr ,
54- MAP_SIZE + CACHELINE_SIZE ,
55- PROT_READ | PROT_WRITE,
56- MAP_ANONYMOUS | MAP_PRIVATE,
57- -1 ,
58- 0 );
59- base = base + CACHELINE_SIZE;
58+ MAP_SIZE,
59+ PROT_READ | PROT_WRITE,
60+ MAP_ANONYMOUS | MAP_PRIVATE,
61+ -1 ,
62+ 0 );
63+
6064 if (base == MAP_FAILED) {
61- fprintf (stderr, " Memory allocation failed: %d\n " , errno);
65+ fprintf (stderr, " oops, you suck %d\n " , errno);
6266 return -1 ;
6367 }
64-
65- // 确保内存对齐到缓存行
66- uintptr_t addr_value = (uintptr_t )base;
67- uintptr_t aligned_addr = (addr_value + CACHELINE_SIZE - 1 ) & ~(CACHELINE_SIZE - 1 );
68- char *aligned_base = (char *)aligned_addr;
69-
70- printf (" Base address: %p, Aligned address: %p\n " , base, aligned_base);
71-
72- // 初始化XMM0寄存器,避免使用未初始化的值
73- char dummy_data[16 ] = {1 ,2 ,3 ,4 ,5 ,6 ,7 ,8 ,9 ,10 ,11 ,12 ,13 ,14 ,15 ,16 };
74- asm volatile (
75- " movdqu (%0), %%xmm0"
76- :
77- : " r" (dummy_data)
78- : " xmm0"
79- );
80-
8168 char *addr = NULL ;
82- intptr_t *iaddr = (intptr_t *)aligned_base;
69+
70+ intptr_t *iaddr = (intptr_t *) base;
8371 intptr_t hash = 0 ;
8472 struct timespec tstart = {0 ,0 }, tend = {0 ,0 };
8573
86- // 初始化内存
87- printf (" Initializing memory...\n " );
88- size_t count = 0 ;
89- while (iaddr < (intptr_t *)(aligned_base + MAP_SIZE)) {
90- hash = hash ^ (intptr_t )iaddr;
74+ // Necessary so that we don't include allocation costs in our benchmark
75+ while (iaddr < (intptr_t *)(base + MAP_SIZE)) {
76+ hash = hash ^ (intptr_t ) iaddr;
9177 *iaddr = hash;
9278 iaddr++;
93- count++;
9479 }
95- printf (" Initialized %zu intptr_t elements\n " , count);
96-
97- // 使用普通内存操作替代缓存刷新
98- printf (" Flushing cache...\n " );
99- addr = aligned_base;
100- count = 0 ;
101- while (addr < (aligned_base + MAP_SIZE)) {
102- // 使用读取+写入模式替代缓存刷新
103- volatile char * vaddr = (volatile char *)addr;
104- char temp = *vaddr; // 读取到缓存
105- *vaddr = temp; // 写回以触发缓存状态变化
106-
107- // 使用C++原子操作确保内存排序
108- sync_var.store (sync_var.load (std::memory_order_relaxed) + 1 ,
109- std::memory_order_release);
110-
80+
81+ // should flush everything from the cache. But, how big is the cache?
82+ addr = base;
83+ while (addr < (base + MAP_SIZE)) {
84+ asm volatile (
85+ " mov %[buf], %%rsi\n "
86+ " clflush (%%rsi)\n "
87+ " mfence\n "
88+ :
89+ : [buf] " r" (addr)
90+ : " rsi" );
11191 addr += CACHELINE_SIZE;
112- count++;
11392 }
114- printf (" Flushed %zu cache lines\n " , count);
115-
116- // 确保之前的所有内存操作完成
117- sync_var.load (std::memory_order_acquire);
11893
119- printf ( " Starting benchmark... \n " );
94+
12095 clock_gettime (CLOCK_MONOTONIC, &tstart);
121- for (int i = 0 ; i < 1000 ; i++) {
122- addr = aligned_base;
123-
124- // 添加额外安全检查,确保不会越界
125- while (addr < (aligned_base + MAP_SIZE - FENCE_BOUND)) {
126- asm volatile (
127- BODY (addr)
128- :
129- : [addr] " r" (addr)
130- : " r8" , " r9" , " xmm0" , " memory" );
96+ for (int i=0 ;i<1e3 ;i++){
97+ addr = base;
98+ while (addr < (base + MAP_SIZE)) {
99+ asm volatile (
100+ BODY (addr)
101+ :
102+ : [addr] " r" (addr)
103+ : " r8" , " r9" , " xmm0" );
104+
131105 addr += (FENCE_COUNT * MOVE_SIZE);
132- }
133106 }
134107 clock_gettime (CLOCK_MONOTONIC, &tend);
135-
136- uint64_t nanos = (1000000000 * tend.tv_sec + tend.tv_nsec );
108+ uint64_t nanos = (1000000000 * tend.tv_sec + tend.tv_nsec );
137109 nanos -= (1000000000 * tstart.tv_sec + tstart.tv_nsec );
138110
139- printf (" Benchmark completed: %lu ns\n " , nanos);
140-
141- // 解除内存映射
142- munmap (base, MAP_SIZE + CACHELINE_SIZE);
143-
111+
112+ printf (" %lu\n " , nanos);
113+ }
144114 return 0 ;
145115}
0 commit comments