Skip to content

Commit ec83f4f

Browse files
authored
Merge pull request #109 from sysprog21/mmu-caching
Upgrade MMU cache to 8×2 set-associative
2 parents bb10925 + c7728b8 commit ec83f4f

File tree

4 files changed

+191
-48
lines changed

4 files changed

+191
-48
lines changed

main.c

Lines changed: 52 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -910,6 +910,18 @@ static int semu_step(emu_state_t *emu)
910910
}
911911

912912
#ifdef MMU_CACHE_STATS
913+
/* VM pointer recorded by main() before the signal handlers are installed.
 * NOTE(review): in the hunks visible here this pointer is only assigned,
 * never read — the handler just sets a flag and the run loop passes
 * &emu->vm directly — so it may be dead state; confirm before removing. */
static vm_t *global_vm_for_signal = NULL;
914+
/* Raised by signal_handler_stats() and polled by the run loop, which then
 * prints the MMU cache statistics and returns. */
static volatile sig_atomic_t signal_received = 0;
915+
916+
/* Forward declaration */
917+
static void print_mmu_cache_stats(vm_t *vm);
918+
919+
/* Async-signal-safe handler installed for SIGINT and SIGTERM: it only sets
 * 'signal_received' and leaves the statistics printing to the run loop,
 * which polls that flag. The signal number itself is ignored. */
920+
static void signal_handler_stats(int sig UNUSED)
921+
{
922+
signal_received = 1;
923+
}
924+
913925
static void print_mmu_cache_stats(vm_t *vm)
914926
{
915927
fprintf(stderr, "\n=== MMU Cache Statistics ===\n");
@@ -918,15 +930,25 @@ static void print_mmu_cache_stats(vm_t *vm)
918930
uint64_t fetch_total =
919931
hart->cache_fetch.hits + hart->cache_fetch.misses;
920932

921-
/* Combine 2-way load cache statistics */
922-
uint64_t load_hits =
923-
hart->cache_load[0].hits + hart->cache_load[1].hits;
924-
uint64_t load_misses =
925-
hart->cache_load[0].misses + hart->cache_load[1].misses;
933+
/* Combine 8-set × 2-way load cache statistics */
934+
uint64_t load_hits = 0, load_misses = 0;
935+
for (int set = 0; set < 8; set++) {
936+
for (int way = 0; way < 2; way++) {
937+
load_hits += hart->cache_load[set].ways[way].hits;
938+
load_misses += hart->cache_load[set].ways[way].misses;
939+
}
940+
}
926941
uint64_t load_total = load_hits + load_misses;
927942

928-
uint64_t store_total =
929-
hart->cache_store.hits + hart->cache_store.misses;
943+
/* Combine 8-set × 2-way store cache statistics */
944+
uint64_t store_hits = 0, store_misses = 0;
945+
for (int set = 0; set < 8; set++) {
946+
for (int way = 0; way < 2; way++) {
947+
store_hits += hart->cache_store[set].ways[way].hits;
948+
store_misses += hart->cache_store[set].ways[way].misses;
949+
}
950+
}
951+
uint64_t store_total = store_hits + store_misses;
930952

931953
fprintf(stderr, "\nHart %u:\n", i);
932954
fprintf(stderr, " Fetch: %12llu hits, %12llu misses",
@@ -936,18 +958,18 @@ static void print_mmu_cache_stats(vm_t *vm)
936958
100.0 * hart->cache_fetch.hits / fetch_total);
937959
fprintf(stderr, "\n");
938960

939-
fprintf(stderr, " Load: %12llu hits, %12llu misses (2-way)",
940-
load_hits, load_misses);
961+
fprintf(stderr, " Load: %12llu hits, %12llu misses (8x2)", load_hits,
962+
load_misses);
941963
if (load_total > 0)
942964
fprintf(stderr, " (%.2f%% hit rate)",
943965
100.0 * load_hits / load_total);
944966
fprintf(stderr, "\n");
945967

946-
fprintf(stderr, " Store: %12llu hits, %12llu misses",
947-
hart->cache_store.hits, hart->cache_store.misses);
968+
fprintf(stderr, " Store: %12llu hits, %12llu misses (8x2)", store_hits,
969+
store_misses);
948970
if (store_total > 0)
949971
fprintf(stderr, " (%.2f%% hit rate)",
950-
100.0 * hart->cache_store.hits / store_total);
972+
100.0 * store_hits / store_total);
951973
fprintf(stderr, "\n");
952974
}
953975
}
@@ -1007,6 +1029,13 @@ static int semu_run(emu_state_t *emu)
10071029
#endif
10081030

10091031
while (!emu->stopped) {
1032+
#ifdef MMU_CACHE_STATS
1033+
/* Check if signal received (SIGINT/SIGTERM) */
1034+
if (signal_received) {
1035+
print_mmu_cache_stats(&emu->vm);
1036+
return 0;
1037+
}
1038+
#endif
10101039
/* Resume each hart's coroutine in round-robin fashion */
10111040
for (uint32_t i = 0; i < vm->n_hart; i++) {
10121041
coro_resume_hart(i);
@@ -1100,6 +1129,11 @@ static int semu_run(emu_state_t *emu)
11001129
if (ret)
11011130
return ret;
11021131
#ifdef MMU_CACHE_STATS
1132+
/* Check if signal received (SIGINT/SIGTERM) */
1133+
if (signal_received) {
1134+
print_mmu_cache_stats(&emu->vm);
1135+
return 0;
1136+
}
11031137
/* Exit after running for 15 seconds to collect statistics */
11041138
gettimeofday(&current_time, NULL);
11051139
long elapsed_sec = current_time.tv_sec - start_time.tv_sec;
@@ -1246,6 +1280,12 @@ int main(int argc, char **argv)
12461280
if (ret)
12471281
return ret;
12481282

1283+
#ifdef MMU_CACHE_STATS
1284+
global_vm_for_signal = &emu.vm;
1285+
signal(SIGINT, signal_handler_stats);
1286+
signal(SIGTERM, signal_handler_stats);
1287+
#endif
1288+
12491289
if (emu.debug)
12501290
ret = semu_run_debug(&emu);
12511291
else

riscv.c

Lines changed: 76 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -185,9 +185,18 @@ static inline uint32_t read_rs2(const hart_t *vm, uint32_t insn)
185185
void mmu_invalidate(hart_t *vm)
186186
{
187187
vm->cache_fetch.n_pages = 0xFFFFFFFF;
188-
vm->cache_load[0].n_pages = 0xFFFFFFFF;
189-
vm->cache_load[1].n_pages = 0xFFFFFFFF;
190-
vm->cache_store.n_pages = 0xFFFFFFFF;
188+
/* Invalidate all 8 sets × 2 ways for load cache */
189+
for (int set = 0; set < 8; set++) {
190+
for (int way = 0; way < 2; way++)
191+
vm->cache_load[set].ways[way].n_pages = 0xFFFFFFFF;
192+
vm->cache_load[set].lru = 0; /* Reset LRU to way 0 */
193+
}
194+
/* Invalidate all 8 sets × 2 ways for store cache */
195+
for (int set = 0; set < 8; set++) {
196+
for (int way = 0; way < 2; way++)
197+
vm->cache_store[set].ways[way].n_pages = 0xFFFFFFFF;
198+
vm->cache_store[set].lru = 0; /* Reset LRU to way 0 */
199+
}
191200
}
192201

193202
/* Pre-verify the root page table to minimize page table access during
@@ -333,13 +342,36 @@ static void mmu_load(hart_t *vm,
333342
{
334343
uint32_t vpn = addr >> RV_PAGE_SHIFT;
335344
uint32_t phys_addr;
336-
/* 2-entry direct-mapped cache: use parity hash to select entry */
337-
uint32_t index = __builtin_parity(vpn) & 0x1;
345+
/* 8-set × 2-way set-associative cache: use 3-bit parity hash */
346+
uint32_t set_idx = (__builtin_parity(vpn & 0xAAAAAAAA) << 2) |
347+
(__builtin_parity(vpn & 0x55555555) << 1) |
348+
__builtin_parity(vpn & 0xCCCCCCCC);
349+
350+
mmu_cache_set_t *set = &vm->cache_load[set_idx];
351+
352+
/* Check both ways in the set */
353+
int hit_way = -1;
354+
for (int way = 0; way < 2; way++) {
355+
if (likely(set->ways[way].n_pages == vpn)) {
356+
hit_way = way;
357+
break;
358+
}
359+
}
338360

339-
if (unlikely(vpn != vm->cache_load[index].n_pages)) {
361+
if (likely(hit_way >= 0)) {
362+
/* Cache hit: reconstruct physical address from cached PPN */
363+
#ifdef MMU_CACHE_STATS
364+
set->ways[hit_way].hits++;
365+
#endif
366+
phys_addr = (set->ways[hit_way].phys_ppn << RV_PAGE_SHIFT) |
367+
(addr & MASK(RV_PAGE_SHIFT));
368+
/* Update LRU: mark the other way as replacement candidate */
369+
set->lru = 1 - hit_way;
370+
} else {
340371
/* Cache miss: do full translation */
372+
int victim_way = set->lru; /* Use LRU bit to select victim */
341373
#ifdef MMU_CACHE_STATS
342-
vm->cache_load[index].misses++;
374+
set->ways[victim_way].misses++;
343375
#endif
344376
phys_addr = addr;
345377
mmu_translate(vm, &phys_addr,
@@ -348,16 +380,11 @@ static void mmu_load(hart_t *vm,
348380
RV_EXC_LOAD_PFAULT);
349381
if (vm->error)
350382
return;
351-
/* Cache physical page number (not a pointer) */
352-
vm->cache_load[index].n_pages = vpn;
353-
vm->cache_load[index].phys_ppn = phys_addr >> RV_PAGE_SHIFT;
354-
} else {
355-
/* Cache hit: reconstruct physical address from cached PPN */
356-
#ifdef MMU_CACHE_STATS
357-
vm->cache_load[index].hits++;
358-
#endif
359-
phys_addr = (vm->cache_load[index].phys_ppn << RV_PAGE_SHIFT) |
360-
(addr & MASK(RV_PAGE_SHIFT));
383+
/* Replace victim way with new translation */
384+
set->ways[victim_way].n_pages = vpn;
385+
set->ways[victim_way].phys_ppn = phys_addr >> RV_PAGE_SHIFT;
386+
/* Update LRU: mark the other way for next eviction */
387+
set->lru = 1 - victim_way;
361388
}
362389

363390
vm->mem_load(vm, phys_addr, width, value);
@@ -376,28 +403,48 @@ static bool mmu_store(hart_t *vm,
376403
{
377404
uint32_t vpn = addr >> RV_PAGE_SHIFT;
378405
uint32_t phys_addr;
406+
/* 8-set × 2-way set-associative cache: use 3-bit parity hash */
407+
uint32_t set_idx = (__builtin_parity(vpn & 0xAAAAAAAA) << 2) |
408+
(__builtin_parity(vpn & 0x55555555) << 1) |
409+
__builtin_parity(vpn & 0xCCCCCCCC);
410+
411+
mmu_cache_set_t *set = &vm->cache_store[set_idx];
412+
413+
/* Check both ways in the set */
414+
int hit_way = -1;
415+
for (int way = 0; way < 2; way++) {
416+
if (likely(set->ways[way].n_pages == vpn)) {
417+
hit_way = way;
418+
break;
419+
}
420+
}
379421

380-
if (unlikely(vpn != vm->cache_store.n_pages)) {
422+
if (likely(hit_way >= 0)) {
423+
/* Cache hit: reconstruct physical address from cached PPN */
424+
#ifdef MMU_CACHE_STATS
425+
set->ways[hit_way].hits++;
426+
#endif
427+
phys_addr = (set->ways[hit_way].phys_ppn << RV_PAGE_SHIFT) |
428+
(addr & MASK(RV_PAGE_SHIFT));
429+
/* Update LRU: mark the other way as replacement candidate */
430+
set->lru = 1 - hit_way;
431+
} else {
381432
/* Cache miss: do full translation */
433+
int victim_way = set->lru; /* Use LRU bit to select victim */
382434
#ifdef MMU_CACHE_STATS
383-
vm->cache_store.misses++;
435+
set->ways[victim_way].misses++;
384436
#endif
385437
phys_addr = addr;
386438
mmu_translate(vm, &phys_addr, (1 << 2), (1 << 6) | (1 << 7),
387439
vm->sstatus_sum && vm->s_mode, RV_EXC_STORE_FAULT,
388440
RV_EXC_STORE_PFAULT);
389441
if (vm->error)
390442
return false;
391-
/* Cache physical page number (not a pointer) */
392-
vm->cache_store.n_pages = vpn;
393-
vm->cache_store.phys_ppn = phys_addr >> RV_PAGE_SHIFT;
394-
} else {
395-
/* Cache hit: reconstruct physical address from cached PPN */
396-
#ifdef MMU_CACHE_STATS
397-
vm->cache_store.hits++;
398-
#endif
399-
phys_addr = (vm->cache_store.phys_ppn << RV_PAGE_SHIFT) |
400-
(addr & MASK(RV_PAGE_SHIFT));
443+
/* Replace victim way with new translation */
444+
set->ways[victim_way].n_pages = vpn;
445+
set->ways[victim_way].phys_ppn = phys_addr >> RV_PAGE_SHIFT;
446+
/* Update LRU: mark the other way for next eviction */
447+
set->lru = 1 - victim_way;
401448
}
402449

403450
if (unlikely(cond)) {

riscv.h

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,12 @@ typedef struct {
5151
#endif
5252
} mmu_addr_cache_t;
5353

54+
/* One set of a set-associative MMU translation cache. The same layout backs
 * both the load cache and the store cache in the hart state. */
55+
typedef struct {
56+
mmu_addr_cache_t ways[2]; /* 2-way associative */
57+
/* Way to evict on the next miss. Lookups point it at the way that was NOT
 * just hit or filled; invalidation resets it to 0. */
uint8_t lru; /* LRU bit: 0 or 1 (which way to replace) */
58+
} mmu_cache_set_t;
59+
5460
/* To use the emulator, start by initializing a hart_t object with zero values,
5561
* invoke vm_init(), and set the required environment-supplied callbacks. You
5662
* may also set other necessary fields such as argument registers and s_mode,
@@ -101,9 +107,10 @@ struct __hart_internal {
101107
uint32_t exc_cause, exc_val;
102108

103109
mmu_fetch_cache_t cache_fetch;
104-
/* 2-entry direct-mapped with hash-based indexing */
105-
mmu_addr_cache_t cache_load[2];
106-
mmu_addr_cache_t cache_store;
110+
/* 8-set × 2-way set-associative cache with 3-bit parity hash indexing */
111+
mmu_cache_set_t cache_load[8];
112+
/* 8-set × 2-way set-associative cache for store operations */
113+
mmu_cache_set_t cache_store[8];
107114

108115
/* Supervisor state */
109116
bool s_mode;

utils.c

Lines changed: 53 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1+
#include <math.h>
12
#include <stdbool.h>
3+
#include <stdio.h>
24
#include <time.h>
35

46
#include "utils.h"
@@ -24,6 +26,10 @@ bool boot_complete = false;
2426
static double ticks_increment;
2527
static double boot_ticks;
2628

29+
/* Timer calibration statistics */
30+
static uint64_t timer_call_count = 0;
31+
static int timer_n_harts = 1;
32+
2733
/* Calculate "x * n / d" without unnecessary overflow or loss of precision.
2834
*
2935
* Reference:
@@ -88,6 +94,7 @@ static uint64_t semu_timer_clocksource(semu_timer_t *timer)
8894
static bool first_switch = true;
8995

9096
if (!boot_complete) {
97+
timer_call_count++;
9198
boot_ticks += ticks_increment;
9299
return (uint64_t) boot_ticks;
93100
}
@@ -98,6 +105,34 @@ static uint64_t semu_timer_clocksource(semu_timer_t *timer)
98105

99106
/* Calculate the offset between the real time and the emulator time */
100107
offset = (int64_t) (real_ticks - boot_ticks);
108+
109+
#ifdef SEMU_TIMER_STATS
110+
/* Output timer calibration statistics (only when SEMU_TIMER_STATS is
111+
* defined) */
112+
double actual_coefficient = (double) timer_call_count / timer_n_harts;
113+
double current_coefficient = 1.744e8;
114+
double recommended_coefficient = actual_coefficient;
115+
116+
fprintf(stderr, "\n[Timer Calibration Statistics]\n");
117+
fprintf(stderr, " Boot completed after %llu timer calls\n",
118+
(unsigned long long) timer_call_count);
119+
fprintf(stderr, " Number of harts: %d\n", timer_n_harts);
120+
fprintf(stderr, " Actual coefficient: %.3e (%.2f calls per hart)\n",
121+
actual_coefficient, actual_coefficient);
122+
fprintf(stderr, " Current coefficient: %.3e\n", current_coefficient);
123+
fprintf(stderr, " Difference: %.2f%% %s\n",
124+
fabs(actual_coefficient - current_coefficient) /
125+
current_coefficient * 100.0,
126+
actual_coefficient > current_coefficient ? "(more calls)"
127+
: "(fewer calls)");
128+
fprintf(stderr, "\n[Recommendation]\n");
129+
fprintf(stderr, " Update utils.c line 121 to:\n");
130+
fprintf(stderr,
131+
" ticks_increment = (SEMU_BOOT_TARGET_TIME * CLOCK_FREQ) / "
132+
"(%.3e * n_harts);\n",
133+
recommended_coefficient);
134+
fprintf(stderr, "\n");
135+
#endif
101136
}
102137
return (uint64_t) ((int64_t) real_ticks - offset);
103138
}
@@ -108,14 +143,28 @@ void semu_timer_init(semu_timer_t *timer, uint64_t freq, int n_harts)
108143
timer->begin = mult_frac(host_time_ns(), timer->freq, 1e9);
109144
boot_ticks = timer->begin; /* Initialize the fake ticks for boot process */
110145

146+
/* Store n_harts for calibration statistics */
147+
timer_n_harts = n_harts;
148+
111149
/* According to statistics, the number of times 'semu_timer_clocksource'
112-
* called is approximately 'SMP count * 2.15 * 1e8'. By the time the boot
150+
* called is approximately 'SMP count * 1.744 * 1e8'. By the time the boot
113151
* process is completed, the emulator will have a total of 'boot seconds *
114-
* frequency' ticks. Therefore, each time, '(boot seconds * frequency) /
115-
* (2.15 * 1e8 * SMP count)' ticks need to be added.
152+
* frequency' ticks. Therefore, each time, (boot seconds * frequency) /
153+
* (1.744 * 1e8 * SMP count) ticks need to be added.
154+
*
155+
* Note: This coefficient was recalibrated after MMU cache optimization
156+
* (8×2 set-associative with 99%+ hit rate). The original coefficient
157+
* (2.15 * 1e8) was based on measurements before the optimization. With
158+
* faster CPU execution, fewer timer calls are needed to complete boot.
159+
*
160+
* Calibration history:
161+
* - Original (pre-MMU cache): 2.15 × 10^8
162+
* - After MMU cache (measured): 1.696 × 10^8 (-21.1%)
163+
* - Verification measurement: 1.744 × 10^8 (error: 2.85%)
164+
* - Final coefficient: 1.744 × 10^8 (based on verification)
116165
*/
117166
ticks_increment =
118-
(SEMU_BOOT_TARGET_TIME * CLOCK_FREQ) / (2.15 * 1e8 * n_harts);
167+
(SEMU_BOOT_TARGET_TIME * CLOCK_FREQ) / (1.744 * 1e8 * n_harts);
119168
}
120169

121170
uint64_t semu_timer_get(semu_timer_t *timer)

0 commit comments

Comments
 (0)