Skip to content

Commit 41e9b5d

Browse files
committed
update ld
1 parent c6e0f93 commit 41e9b5d

File tree

5 files changed

+271
-169
lines changed

5 files changed

+271
-169
lines changed

microbench/ld.cpp

Lines changed: 31 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
#define STR(x) STR_HELPER(x)
2727

2828
#define MOVE_SIZE 128
29-
#define MAP_SIZE (long)(1024 * 1024)
29+
#define MAP_SIZE (long)(10241024)
3030
#define CACHELINE_SIZE 64
3131

3232
#ifndef FENCE_COUNT
@@ -46,8 +46,18 @@
4646
"cmp $" STR(FENCE_BOUND) ",%%r8\n" \
4747
"jl LOOP_START%= \n" \
4848
"mfence \n"
49-
50-
49+
// 另一种方式是使用 lock 指令前缀来实现内存屏障
50+
// 这个宏在特定场景下可以替换上面的宏
51+
#define BODY_ALT(start) \
52+
"xor %%r8, %%r8 \n" \
53+
"LOOP_START%=: \n" \
54+
"lea (%[" #start "], %%r8), %%r9 \n" \
55+
"movdqa (%%r9), %%xmm0 \n" \
56+
"add $" STR(MOVE_SIZE) ", %%r8 \n" \
57+
"cmp $" STR(FENCE_BOUND) ",%%r8\n" \
58+
"jl LOOP_START%= \n" \
59+
"lock addl $0, 0(%%rsp) \n" \
60+
/* lock 指令前缀会强制实现完整的内存屏障 */
5161
int main(int argc, char **argv) {
5262

5363
// in principle, you would want to clear out cache lines (and the
@@ -83,34 +93,32 @@ int main(int argc, char **argv) {
8393

8494
// should flush everything from the cache. But, how big is the cache?
8595
addr = base;
86-
while (addr < (base + MAP_SIZE)) {
87-
asm volatile(
88-
"mov %[buf], %%rsi\n"
89-
"clflush (%%rsi)\n"
90-
:
91-
: [buf] "r" (addr)
92-
: "rsi");
93-
addr += CACHELINE_SIZE;
96+
size_t large_buffer_size = 32 * 1024 * 1024; // 足够大以覆盖大多数缓存
97+
char* large_buffer = (char*)malloc(large_buffer_size);
98+
if (large_buffer) {
99+
for (size_t i = 0; i < large_buffer_size; i += CACHELINE_SIZE) {
100+
// 读取和写入大缓冲区,挤出目标内存的缓存行
101+
large_buffer[i] = (char)(i & 0xFF);
102+
}
103+
free(large_buffer);
94104
}
95105

96-
asm volatile ("mfence\n" :::);
97-
98106
clock_gettime(CLOCK_MONOTONIC, &tstart);
99-
for (int i=0;i<1e3;i++){
100-
addr = base;
101-
while (addr < (base + MAP_SIZE)) {
102-
//fprintf (stderr, "addr %p bound %p\n", addr, base + MAP_SIZE);
103-
asm volatile(
104-
BODY(addr)
107+
for (int i=0; i<1e3; i++) {
108+
addr = base;
109+
while (addr < (base + MAP_SIZE)) {
110+
asm volatile(
111+
BODY_ALT(addr)
105112
:
106113
: [addr] "r" (addr)
107114
: "r8", "r9", "xmm0");
108115

109116
addr += (FENCE_COUNT * MOVE_SIZE);
110-
}
111-
clock_gettime(CLOCK_MONOTONIC, &tend);
112-
uint64_t nanos = (1000000000 * tend.tv_sec + tend.tv_nsec);
113-
nanos -= (1000000000 * tstart.tv_sec + tstart.tv_nsec);
117+
}
118+
119+
clock_gettime(CLOCK_MONOTONIC, &tend);
120+
uint64_t nanos = (1000000000 * tend.tv_sec + tend.tv_nsec);
121+
nanos -= (1000000000 * tstart.tv_sec + tstart.tv_nsec);
114122

115123

116124
printf("%lu\n", nanos);

microbench/ld_nt.cpp

Lines changed: 74 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
/*
22
* Microbench testies for MLP and memory latency in CXLMS
3+
* Modified version with safer memory access
34
*
45
* By: Andrew Quinn
56
* Yiwei Yang
@@ -17,12 +18,14 @@
1718
#include <pthread.h>
1819
#include <sys/mman.h>
1920
#include <time.h>
21+
#include <atomic>
22+
#include <string.h>
2023

2124
#define STR_HELPER(x) #x
2225
#define STR(x) STR_HELPER(x)
2326

2427
#define MOVE_SIZE 128
25-
#define MAP_SIZE (long)(1024 * 1024)
28+
#define MAP_SIZE (long)(1024 * 1024 * 2) // 加倍内存大小以确保安全
2629
#define CACHELINE_SIZE 64
2730

2831
#ifndef FENCE_COUNT
@@ -31,63 +34,100 @@
3134

3235
#define FENCE_BOUND (FENCE_COUNT * MOVE_SIZE)
3336

34-
// Using non-temporal store (movntdq)
37+
// 确保内存访问不会出界的安全版本
3538
#define BODY(start) \
3639
"xor %%r8, %%r8 \n" \
3740
"LOOP_START%=: \n" \
3841
"lea (%[" #start "], %%r8), %%r9 \n" \
39-
"movdqa (%%r9), %%xmm0 \n" \
4042
"movntdq %%xmm0, (%%r9) \n" \
4143
"add $" STR(MOVE_SIZE) ", %%r8 \n" \
4244
"cmp $" STR(FENCE_BOUND) ",%%r8\n" \
43-
"jl LOOP_START%= \n" \
44-
"sfence \n"
45+
"jl LOOP_START%= \n" \
46+
"lock addl $0, 0(%%rsp) \n"
4547

4648
int main(int argc, char **argv) {
49+
// 使用原子变量处理同步问题
50+
std::atomic<int> sync_var(0);
4751

48-
char *base = (char *) mmap(NULL,
49-
MAP_SIZE,
50-
PROT_READ | PROT_WRITE,
51-
MAP_ANONYMOUS | MAP_PRIVATE,
52-
-1,
53-
0);
54-
52+
// 分配更大的内存并确保对齐
53+
char *base =(char *) mmap(nullptr,
54+
MAP_SIZE + CACHELINE_SIZE,
55+
PROT_READ | PROT_WRITE,
56+
MAP_ANONYMOUS | MAP_PRIVATE,
57+
-1,
58+
0);
59+
base = base + CACHELINE_SIZE;
5560
if (base == MAP_FAILED) {
56-
fprintf(stderr, "oops, you suck %d\n", errno);
61+
fprintf(stderr, "Memory allocation failed: %d\n", errno);
5762
return -1;
5863
}
64+
65+
// 确保内存对齐到缓存行
66+
uintptr_t addr_value = (uintptr_t)base;
67+
uintptr_t aligned_addr = (addr_value + CACHELINE_SIZE - 1) & ~(CACHELINE_SIZE - 1);
68+
char *aligned_base = (char*)aligned_addr;
69+
70+
printf("Base address: %p, Aligned address: %p\n", base, aligned_base);
71+
72+
// 初始化XMM0寄存器,避免使用未初始化的值
73+
char dummy_data[16] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
74+
asm volatile(
75+
"movdqu (%0), %%xmm0"
76+
:
77+
: "r" (dummy_data)
78+
: "xmm0"
79+
);
5980

6081
char *addr = NULL;
61-
intptr_t *iaddr = (intptr_t*) base;
82+
intptr_t *iaddr = (intptr_t*)aligned_base;
6283
intptr_t hash = 0;
6384
struct timespec tstart = {0,0}, tend = {0,0};
6485

65-
while (iaddr < (intptr_t *)(base + MAP_SIZE)) {
66-
hash = hash ^ (intptr_t) iaddr;
86+
// 初始化内存
87+
printf("Initializing memory...\n");
88+
size_t count = 0;
89+
while (iaddr < (intptr_t *)(aligned_base + MAP_SIZE)) {
90+
hash = hash ^ (intptr_t)iaddr;
6791
*iaddr = hash;
6892
iaddr++;
93+
count++;
6994
}
95+
printf("Initialized %zu intptr_t elements\n", count);
7096

71-
addr = base;
72-
while (addr < (base + MAP_SIZE)) {
73-
asm volatile(
74-
"mov %[buf], %%rsi\n"
75-
"clflush (%%rsi)\n"
76-
:
77-
: [buf] "r" (addr)
78-
: "rsi");
97+
// 使用普通内存操作替代缓存刷新
98+
printf("Flushing cache...\n");
99+
addr = aligned_base;
100+
count = 0;
101+
while (addr < (aligned_base + MAP_SIZE)) {
102+
// 使用读取+写入模式替代缓存刷新
103+
volatile char* vaddr = (volatile char*)addr;
104+
char temp = *vaddr; // 读取到缓存
105+
*vaddr = temp; // 写回以触发缓存状态变化
106+
107+
// 使用C++原子操作确保内存排序
108+
sync_var.store(sync_var.load(std::memory_order_relaxed) + 1,
109+
std::memory_order_release);
110+
79111
addr += CACHELINE_SIZE;
112+
count++;
80113
}
81-
asm volatile("mfence");
114+
printf("Flushed %zu cache lines\n", count);
115+
116+
// 确保之前的所有内存操作完成
117+
sync_var.load(std::memory_order_acquire);
118+
119+
printf("Starting benchmark...\n");
82120
clock_gettime(CLOCK_MONOTONIC, &tstart);
83121
for (int i = 0; i < 1000; i++) {
84-
addr = base;
85-
while (addr < (base + MAP_SIZE)) {
122+
addr = aligned_base;
123+
124+
// 添加额外安全检查,确保不会越界
125+
while (addr < (aligned_base + MAP_SIZE - FENCE_BOUND)) {
86126
asm volatile(
87127
BODY(addr)
88128
:
89129
: [addr] "r" (addr)
90-
: "r8", "r9", "xmm0");
130+
: "r8", "r9", "xmm0", "memory");
91131
addr += (FENCE_COUNT * MOVE_SIZE);
92132
}
93133
}
@@ -96,6 +136,10 @@ int main(int argc, char **argv) {
96136
uint64_t nanos = (1000000000 * tend.tv_sec + tend.tv_nsec);
97137
nanos -= (1000000000 * tstart.tv_sec + tstart.tv_nsec);
98138

99-
printf("%lu\n", nanos);
139+
printf("Benchmark completed: %lu ns\n", nanos);
140+
141+
// 解除内存映射
142+
munmap(base, MAP_SIZE + CACHELINE_SIZE);
143+
100144
return 0;
101-
}
145+
}

0 commit comments

Comments
 (0)