Skip to content

Commit a48900a

Browse files
committed
update benchmark
1 parent 2eaf32d commit a48900a

File tree

4 files changed

+178
-276
lines changed

4 files changed

+178
-276
lines changed

microbench/ld.cpp

Lines changed: 24 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
#define STR(x) STR_HELPER(x)
2727

2828
#define MOVE_SIZE 128
29-
#define MAP_SIZE (long)(10241024)
29+
#define MAP_SIZE (long)(1024 * 1024 * 1024)
3030
#define CACHELINE_SIZE 64
3131

3232
#ifndef FENCE_COUNT
@@ -45,19 +45,9 @@
4545
"add $" STR(MOVE_SIZE) ", %%r8 \n" \
4646
"cmp $" STR(FENCE_BOUND) ",%%r8\n" \
4747
"jl LOOP_START%= \n" \
48-
"mfence \n"
49-
// 另一种方式是使用 lock 指令前缀来实现内存屏障
50-
// 这个宏在特定场景下可以替换上面的宏
51-
#define BODY_ALT(start) \
52-
"xor %%r8, %%r8 \n" \
53-
"LOOP_START%=: \n" \
54-
"lea (%[" #start "], %%r8), %%r9 \n" \
55-
"movdqa (%%r9), %%xmm0 \n" \
56-
"add $" STR(MOVE_SIZE) ", %%r8 \n" \
57-
"cmp $" STR(FENCE_BOUND) ",%%r8\n" \
58-
"jl LOOP_START%= \n" \
59-
"lock addl $0, 0(%%rsp) \n" \
60-
/* lock 指令前缀会强制实现完整的内存屏障 */
48+
"mfence \n" \
49+
50+
6151
int main(int argc, char **argv) {
6252

6353
// in principle, you would want to clear out cache lines (and the
@@ -93,35 +83,36 @@ int main(int argc, char **argv) {
9383

9484
// should flush everything from the cache. But, how big is the cache?
9585
addr = base;
96-
size_t large_buffer_size = 32 * 1024 * 1024; // 足够大以覆盖大多数缓存
97-
char* large_buffer = (char*)malloc(large_buffer_size);
98-
if (large_buffer) {
99-
for (size_t i = 0; i < large_buffer_size; i += CACHELINE_SIZE) {
100-
// 读取和写入大缓冲区,挤出目标内存的缓存行
101-
large_buffer[i] = (char)(i & 0xFF);
102-
}
103-
free(large_buffer);
86+
while (addr < (base + MAP_SIZE)) {
87+
asm volatile(
88+
"mov %[buf], %%rsi\n"
89+
"clflush (%%rsi)\n"
90+
"mfence\n"
91+
:
92+
: [buf] "r" (addr)
93+
: "rsi");
94+
addr += CACHELINE_SIZE;
10495
}
10596

97+
10698
clock_gettime(CLOCK_MONOTONIC, &tstart);
107-
for (int i=0; i<1e3; i++) {
108-
addr = base;
109-
while (addr < (base + MAP_SIZE)) {
110-
asm volatile(
111-
BODY_ALT(addr)
99+
for (int i=0;i<1e3;i++){
100+
addr = base;
101+
while (addr < (base + MAP_SIZE)) {
102+
asm volatile(
103+
BODY(addr)
112104
:
113105
: [addr] "r" (addr)
114106
: "r8", "r9", "xmm0");
115107

116108
addr += (FENCE_COUNT * MOVE_SIZE);
117-
}
118-
119-
clock_gettime(CLOCK_MONOTONIC, &tend);
120-
uint64_t nanos = (1000000000 * tend.tv_sec + tend.tv_nsec);
121-
nanos -= (1000000000 * tstart.tv_sec + tstart.tv_nsec);
109+
}
110+
clock_gettime(CLOCK_MONOTONIC, &tend);
111+
uint64_t nanos = (1000000000 * tend.tv_sec + tend.tv_nsec);
112+
nanos -= (1000000000 * tstart.tv_sec + tstart.tv_nsec);
122113

123114

124115
printf("%lu\n", nanos);
125116
}
126117
return 0;
127-
}
118+
}

microbench/ld_nt.cpp

Lines changed: 45 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -43,103 +43,73 @@
4343
"add $" STR(MOVE_SIZE) ", %%r8 \n" \
4444
"cmp $" STR(FENCE_BOUND) ",%%r8\n" \
4545
"jl LOOP_START%= \n" \
46-
"lock addl $0, 0(%%rsp) \n"
46+
"mfence \n"
4747

4848
int main(int argc, char **argv) {
49-
// 使用原子变量处理同步问题
50-
std::atomic<int> sync_var(0);
5149

52-
// 分配更大的内存并确保对齐
50+
// in principle, you would want to clear out cache lines (and the
51+
// pipeline) before doing any of the inline assembly stuff. But,
52+
// that's hard. And, it's probably noise when you execute over
53+
// enough things.
54+
55+
56+
// allocate some memory
5357
char *base =(char *) mmap(nullptr,
54-
MAP_SIZE + CACHELINE_SIZE,
55-
PROT_READ | PROT_WRITE,
56-
MAP_ANONYMOUS | MAP_PRIVATE,
57-
-1,
58-
0);
59-
base = base + CACHELINE_SIZE;
58+
MAP_SIZE,
59+
PROT_READ | PROT_WRITE,
60+
MAP_ANONYMOUS | MAP_PRIVATE,
61+
-1,
62+
0);
63+
6064
if (base == MAP_FAILED) {
61-
fprintf(stderr, "Memory allocation failed: %d\n", errno);
65+
fprintf(stderr, "oops, you suck %d\n", errno);
6266
return -1;
6367
}
64-
65-
// 确保内存对齐到缓存行
66-
uintptr_t addr_value = (uintptr_t)base;
67-
uintptr_t aligned_addr = (addr_value + CACHELINE_SIZE - 1) & ~(CACHELINE_SIZE - 1);
68-
char *aligned_base = (char*)aligned_addr;
69-
70-
printf("Base address: %p, Aligned address: %p\n", base, aligned_base);
71-
72-
// 初始化XMM0寄存器,避免使用未初始化的值
73-
char dummy_data[16] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
74-
asm volatile(
75-
"movdqu (%0), %%xmm0"
76-
:
77-
: "r" (dummy_data)
78-
: "xmm0"
79-
);
80-
8168
char *addr = NULL;
82-
intptr_t *iaddr = (intptr_t*)aligned_base;
69+
70+
intptr_t *iaddr = (intptr_t*) base;
8371
intptr_t hash = 0;
8472
struct timespec tstart = {0,0}, tend = {0,0};
8573

86-
// 初始化内存
87-
printf("Initializing memory...\n");
88-
size_t count = 0;
89-
while (iaddr < (intptr_t *)(aligned_base + MAP_SIZE)) {
90-
hash = hash ^ (intptr_t)iaddr;
74+
// Necessary so that we don't include allocation costs in our benchmark
75+
while (iaddr < (intptr_t *)(base + MAP_SIZE)) {
76+
hash = hash ^ (intptr_t) iaddr;
9177
*iaddr = hash;
9278
iaddr++;
93-
count++;
9479
}
95-
printf("Initialized %zu intptr_t elements\n", count);
96-
97-
// 使用普通内存操作替代缓存刷新
98-
printf("Flushing cache...\n");
99-
addr = aligned_base;
100-
count = 0;
101-
while (addr < (aligned_base + MAP_SIZE)) {
102-
// 使用读取+写入模式替代缓存刷新
103-
volatile char* vaddr = (volatile char*)addr;
104-
char temp = *vaddr; // 读取到缓存
105-
*vaddr = temp; // 写回以触发缓存状态变化
106-
107-
// 使用C++原子操作确保内存排序
108-
sync_var.store(sync_var.load(std::memory_order_relaxed) + 1,
109-
std::memory_order_release);
110-
80+
81+
// should flush everything from the cache. But, how big is the cache?
82+
addr = base;
83+
while (addr < (base + MAP_SIZE)) {
84+
asm volatile(
85+
"mov %[buf], %%rsi\n"
86+
"clflush (%%rsi)\n"
87+
"mfence\n"
88+
:
89+
: [buf] "r" (addr)
90+
: "rsi");
11191
addr += CACHELINE_SIZE;
112-
count++;
11392
}
114-
printf("Flushed %zu cache lines\n", count);
115-
116-
// 确保之前的所有内存操作完成
117-
sync_var.load(std::memory_order_acquire);
11893

119-
printf("Starting benchmark...\n");
94+
12095
clock_gettime(CLOCK_MONOTONIC, &tstart);
121-
for (int i = 0; i < 1000; i++) {
122-
addr = aligned_base;
123-
124-
// 添加额外安全检查,确保不会越界
125-
while (addr < (aligned_base + MAP_SIZE - FENCE_BOUND)) {
126-
asm volatile(
127-
BODY(addr)
128-
:
129-
: [addr] "r" (addr)
130-
: "r8", "r9", "xmm0", "memory");
96+
for (int i=0;i<1e3;i++){
97+
addr = base;
98+
while (addr < (base + MAP_SIZE)) {
99+
asm volatile(
100+
BODY(addr)
101+
:
102+
: [addr] "r" (addr)
103+
: "r8", "r9", "xmm0");
104+
131105
addr += (FENCE_COUNT * MOVE_SIZE);
132-
}
133106
}
134107
clock_gettime(CLOCK_MONOTONIC, &tend);
135-
136-
uint64_t nanos = (1000000000 * tend.tv_sec + tend.tv_nsec);
108+
uint64_t nanos = (1000000000 * tend.tv_sec + tend.tv_nsec);
137109
nanos -= (1000000000 * tstart.tv_sec + tstart.tv_nsec);
138110

139-
printf("Benchmark completed: %lu ns\n", nanos);
140-
141-
// 解除内存映射
142-
munmap(base, MAP_SIZE + CACHELINE_SIZE);
143-
111+
112+
printf("%lu\n", nanos);
113+
}
144114
return 0;
145115
}

microbench/ld_nt_serial.cpp

Lines changed: 45 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -42,104 +42,74 @@
4242
"movntdq %%xmm0, (%%r9) \n" \
4343
"add $" STR(MOVE_SIZE) ", %%r8 \n" \
4444
"cmp $" STR(FENCE_BOUND) ",%%r8\n" \
45-
"lock addl $0, 0(%%rsp) \n" \
45+
"mfence\n" \
4646
"jl LOOP_START%= \n"
4747

4848
int main(int argc, char **argv) {
49-
// 使用原子变量处理同步问题
50-
std::atomic<int> sync_var(0);
5149

52-
// 分配更大的内存并确保对齐
50+
// in principle, you would want to clear out cache lines (and the
51+
// pipeline) before doing any of the inline assembly stuff. But,
52+
// that's hard. And, it's probably noise when you execute over
53+
// enough things.
54+
55+
56+
// allocate some memory
5357
char *base =(char *) mmap(nullptr,
54-
MAP_SIZE + CACHELINE_SIZE,
55-
PROT_READ | PROT_WRITE,
56-
MAP_ANONYMOUS | MAP_PRIVATE,
57-
-1,
58-
0);
59-
base = base + CACHELINE_SIZE;
58+
MAP_SIZE,
59+
PROT_READ | PROT_WRITE,
60+
MAP_ANONYMOUS | MAP_PRIVATE,
61+
-1,
62+
0);
63+
6064
if (base == MAP_FAILED) {
61-
fprintf(stderr, "Memory allocation failed: %d\n", errno);
65+
fprintf(stderr, "oops, you suck %d\n", errno);
6266
return -1;
6367
}
64-
65-
// 确保内存对齐到缓存行
66-
uintptr_t addr_value = (uintptr_t)base;
67-
uintptr_t aligned_addr = (addr_value + CACHELINE_SIZE - 1) & ~(CACHELINE_SIZE - 1);
68-
char *aligned_base = (char*)aligned_addr;
69-
70-
printf("Base address: %p, Aligned address: %p\n", base, aligned_base);
71-
72-
// 初始化XMM0寄存器,避免使用未初始化的值
73-
char dummy_data[16] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
74-
asm volatile(
75-
"movdqu (%0), %%xmm0"
76-
:
77-
: "r" (dummy_data)
78-
: "xmm0"
79-
);
80-
8168
char *addr = NULL;
82-
intptr_t *iaddr = (intptr_t*)aligned_base;
69+
70+
intptr_t *iaddr = (intptr_t*) base;
8371
intptr_t hash = 0;
8472
struct timespec tstart = {0,0}, tend = {0,0};
8573

86-
// 初始化内存
87-
printf("Initializing memory...\n");
88-
size_t count = 0;
89-
while (iaddr < (intptr_t *)(aligned_base + MAP_SIZE)) {
90-
hash = hash ^ (intptr_t)iaddr;
74+
// Necessary so that we don't include allocation costs in our benchmark
75+
while (iaddr < (intptr_t *)(base + MAP_SIZE)) {
76+
hash = hash ^ (intptr_t) iaddr;
9177
*iaddr = hash;
9278
iaddr++;
93-
count++;
9479
}
95-
printf("Initialized %zu intptr_t elements\n", count);
96-
97-
// 使用普通内存操作替代缓存刷新
98-
printf("Flushing cache...\n");
99-
addr = aligned_base;
100-
count = 0;
101-
while (addr < (aligned_base + MAP_SIZE)) {
102-
// 使用读取+写入模式替代缓存刷新
103-
volatile char* vaddr = (volatile char*)addr;
104-
char temp = *vaddr; // 读取到缓存
105-
*vaddr = temp; // 写回以触发缓存状态变化
106-
107-
// 使用C++原子操作确保内存排序
108-
sync_var.store(sync_var.load(std::memory_order_relaxed) + 1,
109-
std::memory_order_release);
110-
80+
81+
// should flush everything from the cache. But, how big is the cache?
82+
addr = base;
83+
while (addr < (base + MAP_SIZE)) {
84+
asm volatile(
85+
"mov %[buf], %%rsi\n"
86+
"clflush (%%rsi)\n"
87+
"mfence\n"
88+
:
89+
: [buf] "r" (addr)
90+
: "rsi");
11191
addr += CACHELINE_SIZE;
112-
count++;
11392
}
114-
printf("Flushed %zu cache lines\n", count);
115-
116-
// 确保之前的所有内存操作完成
117-
sync_var.load(std::memory_order_acquire);
11893

119-
printf("Starting benchmark...\n");
94+
12095
clock_gettime(CLOCK_MONOTONIC, &tstart);
121-
for (int i = 0; i < 1000; i++) {
122-
addr = aligned_base;
123-
124-
// 添加额外安全检查,确保不会越界
125-
while (addr < (aligned_base + MAP_SIZE - FENCE_BOUND)) {
126-
asm volatile(
127-
BODY(addr)
128-
:
129-
: [addr] "r" (addr)
130-
: "r8", "r9", "xmm0", "memory");
96+
for (int i=0;i<1e3;i++){
97+
addr = base;
98+
while (addr < (base + MAP_SIZE)) {
99+
asm volatile(
100+
BODY(addr)
101+
:
102+
: [addr] "r" (addr)
103+
: "r8", "r9", "xmm0");
104+
131105
addr += (FENCE_COUNT * MOVE_SIZE);
132-
}
133106
}
134107
clock_gettime(CLOCK_MONOTONIC, &tend);
135-
136-
uint64_t nanos = (1000000000 * tend.tv_sec + tend.tv_nsec);
108+
uint64_t nanos = (1000000000 * tend.tv_sec + tend.tv_nsec);
137109
nanos -= (1000000000 * tstart.tv_sec + tstart.tv_nsec);
138110

139-
printf("Benchmark completed: %lu ns\n", nanos);
140-
141-
// 解除内存映射
142-
munmap(base, MAP_SIZE + CACHELINE_SIZE);
143-
111+
112+
printf("%lu\n", nanos);
113+
}
144114
return 0;
145115
}

0 commit comments

Comments
 (0)