Skip to content

Commit 2eaf32d

Browse files
committed
update new
1 parent d61c3eb commit 2eaf32d

File tree

7 files changed

+223
-56
lines changed

7 files changed

+223
-56
lines changed

include/cxlcounter.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,7 @@
1414

1515
#include <cstdint>
1616
#include <map>
17-
#include <string>
1817
#include <tuple>
19-
#include <unordered_map>
2018
#include <vector>
2119

2220
/** TODO: Decide whether to use PEBS to record the state. Add back invalidation, huge-page/page migration, and prefetch. */

include/cxlendpoint.h

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,14 +19,14 @@
1919
#include <map>
2020
#include <string>
2121
#include <tuple>
22-
#include <unordered_map>
22+
#include <unordered_set>
2323
#include <vector>
2424
#define ROB_SIZE 512
2525

2626
/**
 * One entry of an endpoint's occupation list.
 *
 * All fields are value-initialised, so a default-constructed entry is all zeros
 * instead of holding indeterminate values.
 */
struct occupation_info {
    uint64_t timestamp{}; // time associated with this entry (units not visible here — TODO confirm)
    uint64_t address{}; // address of the entry; presumably a physical address — verify against callers
    uint64_t access_count{}; // presumably the number of accesses recorded for `address` — TODO confirm
};
3131
struct rob_info {
3232
std::map<int, int64_t> m_bandwidth, m_count;
@@ -62,6 +62,8 @@ class CXLMemExpander : public CXLEndPoint {
6262
uint64_t capacity;
6363

6464
std::vector<occupation_info> occupation; // timestamp, pa
65+
std::unordered_set<uint64_t> address_cache{};
66+
bool cache_valid = false;
6567
CXLMemExpanderEvent counter{};
6668
CXLMemExpanderEvent last_counter{};
6769
mutable std::shared_mutex occupationMutex_; // shared mutex: allows multiple concurrent readers
@@ -82,6 +84,28 @@ class CXLMemExpander : public CXLEndPoint {
8284
double dramlatency) override; // traverse the tree to calculate the latency
8385
double calculate_bandwidth(const std::vector<std::tuple<uint64_t, uint64_t>> &elem) override;
8486
void delete_entry(uint64_t addr, uint64_t length) override;
87+
void update_address_cache() {
88+
if (cache_valid) return;
89+
90+
address_cache.clear();
91+
for (const auto& occ : occupation) {
92+
address_cache.insert(occ.address);
93+
}
94+
cache_valid = true;
95+
}
96+
// 当 occupation 更新时调用此函数
97+
void invalidate_cache() {
98+
cache_valid = false;
99+
}
100+
101+
// 检查地址是否在 occupation 中
102+
bool is_address_local(uint64_t addr) {
103+
if (!cache_valid) {
104+
update_address_cache();
105+
}
106+
107+
return address_cache.find(addr) != address_cache.end();
108+
}
85109
};
86110
class CXLSwitch : public CXLEndPoint {
87111
public:

include/perf.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@
2626
#include <sys/ioctl.h>
2727
#include <sys/syscall.h>
2828
#include <sys/types.h>
29-
#include <thread>
3029
#include <tuple>
3130
#include <unistd.h>
3231

microbench/st_serial.cpp

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
/*
2+
* Microbenchmark tests for MLP and memory latency in CXLMS
3+
*
4+
* By: Andrew Quinn
5+
* Yiwei Yang
6+
*
7+
* Copyright 2023 Regents of the University of California
8+
* UC Santa Cruz Sluglab.
9+
*/
10+
11+
12+
#include <errno.h>
13+
#include <stdio.h>
14+
#include <assert.h>
15+
#include <stdio.h>
16+
#include <stdlib.h>
17+
#include <stdint.h>
18+
#include <cpuid.h>
19+
#include <pthread.h>
20+
#include <stdlib.h>
21+
22+
#include <sys/mman.h>
23+
24+
25+
// Two-level stringification: STR(x) macro-expands x first, then STR_HELPER turns the
// result into a string literal (used to splice constants into the inline asm).
#define STR_HELPER(x) #x
#define STR(x) STR_HELPER(x)

// Byte distance between consecutive stores inside one asm block.
#define MOVE_SIZE 128
// Size of the anonymous mapping: 1 GiB. Fully parenthesised so the cast cannot
// re-associate with neighbouring tokens (e.g. `sizeof MAP_SIZE`).
#define MAP_SIZE ((long)(1024 * 1024 * 1024))
// Stride used when flushing the mapping with clflush.
#define CACHELINE_SIZE 64

// Number of store + mfence iterations per asm block; can be overridden at build
// time by defining FENCE_COUNT before this point (e.g. on the compiler command line).
#ifndef FENCE_COUNT
#define FENCE_COUNT 8
#endif

// Bytes covered by one asm block. This is stringified into the asm template via
// STR(), so it must stay an expression the assembler can evaluate.
#define FENCE_BOUND (FENCE_COUNT * MOVE_SIZE)
37+
38+
// we need to jump in MOVE_SIZE increments otherwise segfault!
39+
40+
/*
 * Inline-asm template for one benchmark block.
 *
 * `start` is the NAME of the asm input operand holding the block's base address.
 * Starting at offset 0, the loop stores the zeroed xmm1 to base + offset (movdqa, so
 * the address must be 16-byte aligned), advances the offset by MOVE_SIZE, issues an
 * mfence, and repeats while the offset is below FENCE_BOUND.
 *
 * The template writes r8, r9, xmm1, the flags and memory; the enclosing asm
 * statement is responsible for declaring those clobbers.
 */
#define BODY(start)                                                                  \
    "xor %%r8, %%r8 \n"                   /* r8 = byte offset within the block */    \
    "pxor %%xmm1, %%xmm1 \n"              /* xmm1 = 0, the value that is stored */   \
    "LOOP_START%=: \n"                    /* %= gives each asm a unique label */     \
    "lea (%[" #start "], %%r8), %%r9 \n"  /* r9 = base + offset */                   \
    "movdqa %%xmm1, (%%r9) \n"            /* aligned 16-byte store */                \
    "add $" STR(MOVE_SIZE) ", %%r8 \n"    /* advance to the next store slot */       \
    "cmp $" STR(FENCE_BOUND) ",%%r8\n"    /* set flags for the loop test */          \
    "mfence \n"                           /* fence after every store */              \
    "jl LOOP_START%= \n"                  /* loop while offset < FENCE_BOUND */
50+
51+
52+
int main(int argc, char **argv) {
53+
54+
// in principle, you would want to clear out cache lines (and the
55+
// pipeline) before doing any of the inline assembly stuff. But,
56+
// that's hard. And, its probably noise when you execute over
57+
// enough things.
58+
59+
60+
// allocate some meomery
61+
char *base =(char *) mmap(nullptr,
62+
MAP_SIZE,
63+
PROT_READ | PROT_WRITE,
64+
MAP_ANONYMOUS | MAP_PRIVATE,
65+
-1,
66+
0);
67+
68+
if (base == MAP_FAILED) {
69+
fprintf(stderr, "oops, you suck %d\n", errno);
70+
return -1;
71+
}
72+
char *addr = NULL;
73+
74+
intptr_t *iaddr = (intptr_t*) base;
75+
intptr_t hash = 0;
76+
struct timespec tstart = {0,0}, tend = {0,0};
77+
78+
// Necessary so that we don't include allocation costs in our benchmark
79+
while (iaddr < (intptr_t *)(base + MAP_SIZE)) {
80+
hash = hash ^ (intptr_t) iaddr;
81+
*iaddr = hash;
82+
iaddr++;
83+
}
84+
85+
// should flush everything from the cache. But, how big is the cache?
86+
addr = base;
87+
while (addr < (base + MAP_SIZE)) {
88+
asm volatile(
89+
"mov %[buf], %%rsi\n"
90+
"clflush (%%rsi)\n"
91+
:
92+
: [buf] "r" (addr)
93+
: "rsi");
94+
addr += CACHELINE_SIZE;
95+
}
96+
97+
asm volatile ("mfence\n" :::);
98+
99+
clock_gettime(CLOCK_MONOTONIC, &tstart);
100+
addr = base;
101+
while (addr < (base + MAP_SIZE)) {
102+
//fprintf (stderr, "addr %p bound %p\n", addr, base + MAP_SIZE);
103+
asm volatile(
104+
BODY(addr)
105+
:
106+
: [addr] "r" (addr)
107+
: "r8", "r9", "xmm0");
108+
109+
addr += (FENCE_COUNT * MOVE_SIZE);
110+
}
111+
clock_gettime(CLOCK_MONOTONIC, &tend);
112+
uint64_t nanos = (1000000000 * tend.tv_sec + tend.tv_nsec);
113+
nanos -= (1000000000 * tstart.tv_sec + tstart.tv_nsec);
114+
115+
116+
printf("%lu\n", nanos);
117+
return 0;
118+
}
119+

src/cxlcontroller.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,10 @@ void CXLController::insert_one(thread_info &t_info, lbr &lbr) {
103103
ring_buffer.push(lbr);
104104

105105
for (int i = 0; i < llcm_count; i++) {
106+
if (t_info.llcm_type.empty()) {
107+
// If llcm_type is empty, push a 0 so the front()/pop() calls below have an element
108+
t_info.llcm_type.push(0);
109+
}
106110
rob.m_count[t_info.llcm_type.front()]++;
107111
t_info.llcm_type_rob.push(t_info.llcm_type.front());
108112
t_info.llcm_type.pop();

0 commit comments

Comments
 (0)