Skip to content

Commit d1a2fe2

Browse files
committed
add benchmarks
1 parent cd84d57 commit d1a2fe2

File tree

8 files changed

+132
-107
lines changed

8 files changed

+132
-107
lines changed

.gitmodules

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,3 +25,6 @@
2525
[submodule "workloads/ann-benchmarks"]
2626
path = workloads/ann-benchmarks
2727
url = https://github.com/erikbern/ann-benchmarks
28+
[submodule "workloads/YCSB"]
29+
path = workloads/YCSB
30+
url = https://github.com/brianfrankcooper/YCSB/

microbench/CMakeLists.txt

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,3 +125,22 @@ add_executable(st128 st.cpp)
125125
target_compile_definitions(st128 PRIVATE -DFENCE_COUNT=128)
126126
add_executable(st256 st.cpp)
127127
target_compile_definitions(st256 PRIVATE -DFENCE_COUNT=256)
128+
129+
# Build one st_serial<N> executable per fence count. Every variant compiles
# the same st_serial.cpp and differs only in the FENCE_COUNT definition.
foreach(fence_count IN ITEMS 1 2 4 8 16 32 64 128 256)
  add_executable(st_serial${fence_count} st_serial.cpp)
  target_compile_definitions(st_serial${fence_count}
    PRIVATE FENCE_COUNT=${fence_count}
  )
endforeach()

microbench/st.cpp

Lines changed: 45 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,3 @@
1-
/*
2-
* Microbench testies for MLP and memory latency in CXLMS
3-
*
4-
* By: Andrew Quinn
5-
* Yiwei Yang
6-
*
7-
* Copyright 2023 Regents of the Univeristy of California
8-
* UC Santa Cruz Sluglab.
9-
*/
10-
11-
121
#include <errno.h>
132
#include <stdio.h>
143
#include <assert.h>
@@ -18,103 +7,89 @@
187
#include <cpuid.h>
198
#include <pthread.h>
209
#include <stdlib.h>
21-
2210
#include <sys/mman.h>
2311

24-
2512
#define STR_HELPER(x) #x
2613
#define STR(x) STR_HELPER(x)
27-
2814
#define MOVE_SIZE 128
2915
#define MAP_SIZE (long)(1024 * 1024 * 1024)
3016
#define CACHELINE_SIZE 64
31-
3217
#ifndef FENCE_COUNT
3318
#define FENCE_COUNT 8
3419
#endif
35-
3620
#define FENCE_BOUND (FENCE_COUNT * MOVE_SIZE)
3721

38-
// we need to jump in MOVE_SIZE increments otherwise segfault!
39-
40-
#define BODY(start) \
41-
"xor %%r8, %%r8 \n" \
42-
"pxor %%xmm1, %%xmm1 \n" \
43-
"LOOP_START%=: \n" \
44-
"lea (%[" #start "], %%r8), %%r9 \n" \
45-
"movdqa %%xmm1, (%%r9) \n" \
46-
"add $" STR(MOVE_SIZE) ", %%r8 \n" \
47-
"cmp $" STR(FENCE_BOUND) ",%%r8\n" \
48-
"jl LOOP_START%= \n" \
49-
"clflush (%%r9) \n" \
50-
"mfence \n"
51-
22+
// Modified BODY macro with all fence instructions (clflush/mfence) removed.
// Expands to an inline-assembly string that zeroes r8 (used as a byte offset)
// and xmm1, then repeatedly stores xmm1 with movdqa to (start + r8), advancing
// r8 by MOVE_SIZE until it reaches FENCE_BOUND. After the loop it loads 0 into
// eax and executes cpuid. `start` is the name of the asm operand that holds
// the base address. The asm statement using this macro must list
// rax/rbx/rcx/rdx, r8, r9 and xmm1 as clobbers.
23+
#define BODY(start) \
24+
"xor %%r8, %%r8 \n" \
25+
"pxor %%xmm1, %%xmm1 \n" \
26+
"LOOP_START%=: \n" \
27+
"lea (%[" #start "], %%r8), %%r9 \n" \
28+
"movdqa %%xmm1, (%%r9) \n" \
29+
"add $" STR(MOVE_SIZE) ", %%r8 \n" \
30+
"cmp $" STR(FENCE_BOUND) ",%%r8\n" \
31+
"jl LOOP_START%= \n" \
32+
"mov $0, %%eax \n" \
33+
"cpuid \n" // cpuid serves as the serialization point in place of the memory fence
5234

5335
int main(int argc, char **argv) {
54-
55-
// in principle, you would want to clear out cache lines (and the
56-
// pipeline) before doing any of the inline assembly stuff. But,
57-
// that's hard. And, its probably noise when you execute over
58-
// enough things.
59-
60-
61-
// allocate some meomery
62-
char *base =(char *) mmap(nullptr,
63-
MAP_SIZE,
64-
PROT_READ | PROT_WRITE,
65-
MAP_ANONYMOUS | MAP_PRIVATE,
66-
-1,
67-
0);
68-
36+
char *base = (char *) mmap(nullptr,
37+
MAP_SIZE,
38+
PROT_READ | PROT_WRITE,
39+
MAP_ANONYMOUS | MAP_PRIVATE,
40+
-1,
41+
0);
6942
if (base == MAP_FAILED) {
7043
fprintf(stderr, "oops, you suck %d\n", errno);
7144
return -1;
7245
}
73-
char *addr = NULL;
7446

47+
char *addr = NULL;
7548
intptr_t *iaddr = (intptr_t*) base;
7649
intptr_t hash = 0;
7750
struct timespec tstart = {0,0}, tend = {0,0};
7851

79-
// Necessary so that we don't include allocation costs in our benchmark
52+
// Fill the mapping so that every page is actually allocated before timing
8053
while (iaddr < (intptr_t *)(base + MAP_SIZE)) {
8154
hash = hash ^ (intptr_t) iaddr;
8255
*iaddr = hash;
8356
iaddr++;
8457
}
8558

86-
// should flush everything from the cache. But, how big is the cache?
87-
addr = base;
88-
while (addr < (base + MAP_SIZE)) {
89-
asm volatile(
90-
"mov %[buf], %%rsi\n"
91-
"clflush (%%rsi)\n"
92-
:
93-
: [buf] "r" (addr)
94-
: "rsi");
95-
addr += CACHELINE_SIZE;
59+
// Alternative to flushing the cache: touch a memory region larger than the cache
60+
size_t cache_clear_size = 32 * 1024 * 1024; // 大于典型的L3缓存
61+
char *cache_clear = (char *)malloc(cache_clear_size);
62+
if (cache_clear) {
63+
volatile char temp = 0;
64+
// Walk the buffer in a loop to evict the previously cached contents
65+
for (size_t i = 0; i < cache_clear_size; i += CACHELINE_SIZE) {
66+
cache_clear[i] = (char)i;
67+
temp += cache_clear[i]; // 确保访问不被优化掉
68+
}
69+
free(cache_clear);
9670
}
9771

98-
asm volatile ("mfence\n" :::);
72+
// Use the cpuid instruction as a serialization point
73+
unsigned int eax, ebx, ecx, edx;
74+
__cpuid(0, eax, ebx, ecx, edx);
9975

10076
clock_gettime(CLOCK_MONOTONIC, &tstart);
10177
addr = base;
78+
10279
while (addr < (base + MAP_SIZE)) {
103-
//fprintf (stderr, "addr %p bound %p\n", addr, base + MAP_SIZE);
10480
asm volatile(
105-
BODY(addr)
106-
:
107-
: [addr] "r" (addr)
108-
: "r8", "r9", "xmm0");
109-
110-
addr += (FENCE_COUNT * MOVE_SIZE);
81+
BODY(addr)
82+
:
83+
: [addr] "r" (addr)
84+
: "rax", "rbx", "rcx", "rdx", "r8", "r9", "xmm0", "xmm1", "memory");
85+
addr += (FENCE_COUNT * MOVE_SIZE);
11186
}
87+
11288
clock_gettime(CLOCK_MONOTONIC, &tend);
113-
uint64_t nanos = (1000000000 * tend.tv_sec + tend.tv_nsec);
89+
uint64_t nanos = (1000000000 * tend.tv_sec + tend.tv_nsec);
11490
nanos -= (1000000000 * tstart.tv_sec + tstart.tv_nsec);
115-
116-
11791
printf("%lu\n", nanos);
118-
return 0;
119-
}
12092

93+
munmap(base, MAP_SIZE);
94+
return 0;
95+
}

microbench/st_serial.cpp

Lines changed: 21 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,6 @@
77
* Copyright 2023 Regents of the University of California
88
* UC Santa Cruz Sluglab.
99
*/
10-
11-
1210
#include <errno.h>
1311
#include <stdio.h>
1412
#include <assert.h>
@@ -18,25 +16,18 @@
1816
#include <cpuid.h>
1917
#include <pthread.h>
2018
#include <stdlib.h>
21-
2219
#include <sys/mman.h>
23-
24-
2520
#define STR_HELPER(x) #x
2621
#define STR(x) STR_HELPER(x)
27-
2822
#define MOVE_SIZE 128
2923
#define MAP_SIZE (long)(1024 * 1024 * 1024)
3024
#define CACHELINE_SIZE 64
31-
3225
#ifndef FENCE_COUNT
3326
#define FENCE_COUNT 8
3427
#endif
35-
3628
#define FENCE_BOUND (FENCE_COUNT * MOVE_SIZE)
37-
3829
// we need to jump in MOVE_SIZE increments otherwise segfault!
39-
30+
// Modified BODY macro: clflush and mfence are removed, but the instruction order is kept
4031
#define BODY(start) \
4132
"xor %%r8, %%r8 \n" \
4233
"pxor %%xmm1, %%xmm1 \n" \
@@ -45,57 +36,54 @@
4536
"movdqa %%xmm1, (%%r9) \n" \
4637
"add $" STR(MOVE_SIZE) ", %%r8 \n" \
4738
"cmp $" STR(FENCE_BOUND) ",%%r8\n" \
48-
"clflush (%%r9) \n" \
49-
"mfence \n" \
39+
"mov $0, %%eax \n" \
40+
"cpuid \n" \
5041
"jl LOOP_START%= \n"
5142

52-
5343
int main(int argc, char **argv) {
54-
5544
// in principle, you would want to clear out cache lines (and the
5645
// pipeline) before doing any of the inline assembly stuff. But,
5746
// that's hard. And, its probably noise when you execute over
5847
// enough things.
59-
60-
6148
// allocate some memory
6249
char *base =(char *) mmap(nullptr,
6350
MAP_SIZE,
6451
PROT_READ | PROT_WRITE,
6552
MAP_ANONYMOUS | MAP_PRIVATE,
6653
-1,
6754
0);
68-
6955
if (base == MAP_FAILED) {
7056
fprintf(stderr, "oops, you suck %d\n", errno);
7157
return -1;
7258
}
7359
char *addr = NULL;
74-
7560
intptr_t *iaddr = (intptr_t*) base;
7661
intptr_t hash = 0;
7762
struct timespec tstart = {0,0}, tend = {0,0};
78-
7963
// Necessary so that we don't include allocation costs in our benchmark
8064
while (iaddr < (intptr_t *)(base + MAP_SIZE)) {
8165
hash = hash ^ (intptr_t) iaddr;
8266
*iaddr = hash;
8367
iaddr++;
8468
}
8569

86-
// should flush everything from the cache. But, how big is the cache?
70+
// Code section that replaces the cache flush
8771
addr = base;
88-
while (addr < (base + MAP_SIZE)) {
89-
asm volatile(
90-
"mov %[buf], %%rsi\n"
91-
"clflush (%%rsi)\n"
92-
:
93-
: [buf] "r" (addr)
94-
: "rsi");
95-
addr += CACHELINE_SIZE;
72+
// Touch a large block of memory instead of flushing the cache
73+
size_t cache_clear_size = 32 * 1024 * 1024; // 大于典型的L3缓存
74+
char *cache_clear = (char *)malloc(cache_clear_size);
75+
if (cache_clear) {
76+
volatile char temp = 0;
77+
for (size_t i = 0; i < cache_clear_size; i += CACHELINE_SIZE) {
78+
cache_clear[i] = (char)i;
79+
temp += cache_clear[i]; // 确保访问不被优化掉
80+
}
81+
free(cache_clear);
9682
}
9783

98-
asm volatile ("mfence\n" :::);
84+
// Use cpuid in place of the memory barrier
85+
unsigned int eax, ebx, ecx, edx;
86+
__cpuid(0, eax, ebx, ecx, edx);
9987

10088
clock_gettime(CLOCK_MONOTONIC, &tstart);
10189
addr = base;
@@ -105,16 +93,14 @@ int main(int argc, char **argv) {
10593
BODY(addr)
10694
:
10795
: [addr] "r" (addr)
108-
: "r8", "r9", "xmm0");
109-
96+
: "rax", "rbx", "rcx", "rdx", "r8", "r9", "xmm0", "xmm1", "memory");
11097
addr += (FENCE_COUNT * MOVE_SIZE);
11198
}
11299
clock_gettime(CLOCK_MONOTONIC, &tend);
113100
uint64_t nanos = (1000000000 * tend.tv_sec + tend.tv_nsec);
114101
nanos -= (1000000000 * tstart.tv_sec + tstart.tv_nsec);
115-
116-
117102
printf("%lu\n", nanos);
118-
return 0;
119-
}
120103

104+
munmap(base, MAP_SIZE);
105+
return 0;
106+
}

workloads/CMakeLists.txt

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,44 @@
11
add_subdirectory(gapbs)
2+
add_subdirectory(llama.cpp)
3+
add_subdirectory(vsag)
4+
5+
# memcached_build()
#
# Declares the custom targets that build the bundled memcached sources with
# their own autotools build system and copy the resulting binary into the
# current binary directory:
#
#   memcached_cfg  -> runs ./autogen.sh and ./configure in the source tree
#   memcached_make -> runs make in the source tree
#   memcached_copy -> copies the built `memcached` binary to the binary dir
#   memcached      -> convenience target that drives the whole chain
#
# This is deliberately a macro rather than a function: `memcached_src` is set
# in the caller's scope, so code following the call can still read it.
macro(memcached_build)
  set(memcached_src ${CMAKE_CURRENT_SOURCE_DIR}/memcached)

  add_custom_target(memcached_cfg
    COMMAND ./autogen.sh
    COMMAND ./configure --enable-jobserver=no --enable-memaslap
            --enable-static --enable-shared=off
    WORKING_DIRECTORY ${memcached_src}
    VERBATIM
  )

  add_custom_target(memcached_make
    COMMAND make -j 4
    WORKING_DIRECTORY ${memcached_src}
    VERBATIM
  )

  add_custom_target(memcached_copy
    COMMAND ${CMAKE_COMMAND} -E copy
            ${memcached_src}/memcached
            ${CMAKE_CURRENT_BINARY_DIR}/memcached
    VERBATIM
  )

  add_custom_target(memcached)

  # Target-to-target ordering is expressed with add_dependencies(), which is
  # the documented mechanism for custom targets. The copy step must run after
  # the make step; without that edge, building `memcached` would try to copy
  # a binary that has not been built yet.
  add_dependencies(memcached_make memcached_cfg)
  add_dependencies(memcached_copy memcached_make)
  add_dependencies(memcached memcached_copy)
endmacro()
23+
24+
# memcached_ycsb_build()
#
# Declares the custom targets that drive the YCSB memcached workload:
#
#   memcached_ycsb_cfg          -> builds the YCSB memcached binding with Maven
#   memcached_ycsb_gen_workload -> runs the `load` and `run` phases of
#                                  workloada, writing outputLoad.txt and
#                                  outputRun.txt into the YCSB source directory
#   memcached_ycsb_copy         -> copies both output files to the binary dir
#
# Kept as a macro so `memcached_ycsb_src` stays visible in the caller's scope.
#
# NOTE(review): memcached_ycsb_copy has no ordering dependency on
# memcached_ycsb_gen_workload, so the workload target must be built first.
# Confirm whether the copy step should trigger the workload automatically.
macro(memcached_ycsb_build)
  set(memcached_ycsb_src ${CMAKE_CURRENT_SOURCE_DIR}/YCSB)

  add_custom_target(memcached_ycsb_cfg
    COMMAND mvn -pl site.ycsb:memcached-binding -am clean package
    WORKING_DIRECTORY ${memcached_ycsb_src}
    VERBATIM
  )

  # The two phases are separate COMMANDs, which run in order and stop at the
  # first failure, so no shell `&&` is needed. The `>` redirections still rely
  # on the commands being run through a shell.
  add_custom_target(memcached_ycsb_gen_workload
    COMMAND ./bin/ycsb load memcached -s -P workloads/workloada > outputLoad.txt
    COMMAND ./bin/ycsb run memcached -s -P workloads/workloada > outputRun.txt
    WORKING_DIRECTORY ${memcached_ycsb_src}
  )
  add_dependencies(memcached_ycsb_gen_workload memcached_ycsb_cfg)

  # Both result files are produced in the YCSB directory (the working
  # directory of the workload target), so both are copied from there.
  add_custom_target(memcached_ycsb_copy
    COMMAND ${CMAKE_COMMAND} -E copy
            ${memcached_ycsb_src}/outputLoad.txt
            ${CMAKE_CURRENT_BINARY_DIR}/outputLoad.txt
    COMMAND ${CMAKE_COMMAND} -E copy
            ${memcached_ycsb_src}/outputRun.txt
            ${CMAKE_CURRENT_BINARY_DIR}/outputRun.txt
    VERBATIM
  )
endmacro()
42+
43+
memcached_build()
44+
memcached_ycsb_build()

workloads/YCSB

Submodule YCSB added at 33296cd

workloads/memcached-ycsb

Submodule memcached-ycsb deleted from fe8cbf0

workloads/vectordb

Submodule vectordb deleted from 53cd4a5

0 commit comments

Comments
 (0)