Skip to content

Commit 6dcfd84

Browse files
authored
Merge pull request #333 from qwe661234/add_profiler
Add profiler
2 parents 1351c44 + ee8ac93 commit 6dcfd84

File tree

14 files changed

+450
-94
lines changed

14 files changed

+450
-94
lines changed

README.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,23 @@ _Example Instructions Histogram_
255255
_Example Registers Histogram_
256256
![Registers Histogram Example](docs/histogram-reg.png)
257257

258+
## RISC-V Basic Block Usage Statistics
259+
260+
To install [lolviz](https://github.com/parrt/lolviz), run:
261+
```shell
262+
$ pip install lolviz
263+
```
264+
For macOS users, installing Graphviz might also be required:
265+
```shell
266+
$ brew install graphviz
267+
```
268+
First, users need to create the directory `prof` and generate profiling data by executing `rv32emu`:
269+
270+
```shell
271+
$ ./build/rv32emu -p ./build/[test_program].elf
272+
$ ./tools/rv_profiler [--start-address|--stop-address|--graph-ir] [test_program]
273+
```
274+
258275
## Contributing
259276

260277
See [CONTRIBUTING.md](CONTRIBUTING.md) for contribution guidelines.

src/cache.c

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6,21 +6,14 @@
66
#include <assert.h>
77
#include <stdbool.h>
88
#include <stddef.h>
9+
#include <stdio.h>
910
#include <stdlib.h>
1011
#include <string.h>
1112

1213
#include "cache.h"
1314
#include "mpool.h"
1415
#include "utils.h"
1516

16-
/* Currently, THRESHOLD is set to identify hot spots. Once the using frequency
17-
* for a block exceeds the THRESHOLD, the tier-1 JIT compiler process is
18-
* triggered.
19-
* FIXME: Implement effective profiler to detect hot spots, instead of simply
20-
* relying on THRESHOLD.
21-
*/
22-
#define THRESHOLD 4096
23-
2417
static uint32_t cache_size, cache_size_bits;
2518
static struct mpool *cache_mp;
2619

@@ -166,7 +159,7 @@ cache_t *cache_create(int size_bits)
166159
return NULL;
167160
}
168161

169-
void *cache_get(const cache_t *cache, uint32_t key)
162+
void *cache_get(const cache_t *cache, uint32_t key, bool update)
170163
{
171164
if (!cache->capacity ||
172165
hlist_empty(&cache->map->ht_list_head[cache_hash(key)]))
@@ -192,7 +185,7 @@ void *cache_get(const cache_t *cache, uint32_t key)
192185
* code. The generated C code is then compiled into machine code by the
193186
* target compiler.
194187
*/
195-
if (entry->frequency < THRESHOLD) {
188+
if (update && entry->frequency < THRESHOLD) {
196189
list_del_init(&entry->list);
197190
list_add(&entry->list, cache->lists[entry->frequency++]);
198191
}
@@ -242,7 +235,7 @@ void cache_free(cache_t *cache)
242235
free(cache);
243236
}
244237

245-
uint32_t cache_freq(struct cache *cache, uint32_t key)
238+
uint32_t cache_freq(const struct cache *cache, uint32_t key)
246239
{
247240
if (!cache->capacity ||
248241
hlist_empty(&cache->map->ht_list_head[cache_hash(key)]))
@@ -282,4 +275,16 @@ bool cache_hot(const struct cache *cache, uint32_t key)
282275
}
283276
return false;
284277
}
278+
void cache_profile(const struct cache *cache,
279+
FILE *output_file,
280+
prof_func_t func)
281+
{
282+
assert(func);
283+
for (int i = 0; i < THRESHOLD; i++) {
284+
lfu_entry_t *entry, *safe;
285+
list_for_each_entry_safe (entry, safe, cache->lists[i], list) {
286+
func(entry->value, entry->frequency, output_file);
287+
}
288+
}
289+
}
285290
#endif

src/cache.h

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,13 @@
77

88
#include <stdbool.h>
99
#include <stdint.h>
10+
#include <stdio.h>
11+
12+
/* Currently, THRESHOLD is set to identify hot spots. Once the using frequency
13+
* for a block exceeds the THRESHOLD, the tier-1 JIT compiler process is
14+
* triggered.
15+
*/
16+
#define THRESHOLD 4096
1017

1118
struct cache;
1219

@@ -20,9 +27,10 @@ struct cache *cache_create(int size_bits);
2027
* cache_get - retrieve the specified entry from the cache
2128
* @cache: a pointer points to target cache
2229
* @key: the key of the specified entry
30+
* @update: update frequency or not
2331
* @return: the specified entry or NULL
2432
*/
25-
void *cache_get(const struct cache *cache, uint32_t key);
33+
void *cache_get(const struct cache *cache, uint32_t key, bool update);
2634

2735
/**
2836
* cache_put - insert a new entry into the cache
@@ -48,6 +56,11 @@ void cache_free(struct cache *cache);
4856
* @key: the key of the specified entry
4957
*/
5058
bool cache_hot(const struct cache *cache, uint32_t key);
59+
60+
typedef void (*prof_func_t)(void *, uint32_t, FILE *);
61+
void cache_profile(const struct cache *cache,
62+
FILE *output_file,
63+
prof_func_t func);
5164
#endif
5265

53-
uint32_t cache_freq(struct cache *cache, uint32_t key);
66+
uint32_t cache_freq(const struct cache *cache, uint32_t key);

src/emulate.c

Lines changed: 36 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -305,6 +305,7 @@ static block_t *block_alloc(riscv_t *rv)
305305
block->translatable = true;
306306
block->hot = false;
307307
block->backward = false;
308+
block->has_loops = false;
308309
INIT_LIST_HEAD(&block->list);
309310
#endif
310311
return block;
@@ -383,6 +384,11 @@ static bool is_branch_taken = false;
383384
/* record the program counter of the previous block */
384385
static uint32_t last_pc = 0;
385386

387+
#if RV32_HAS(JIT)
388+
static set_t pc_set;
389+
static bool has_loops = false;
390+
#endif
391+
386392
/* Interpreter-based execution path */
387393
#define RVOP(inst, code, asm) \
388394
static bool do_##inst(riscv_t *rv, rv_insn_t *ir, uint64_t cycle, \
@@ -628,8 +634,10 @@ static void block_translate(riscv_t *rv, block_t *block)
628634
#endif
629635
/* stop on branch */
630636
if (insn_is_branch(ir->opcode)) {
637+
#if RV32_HAS(JIT)
631638
if (ir->imm < 0)
632639
block->backward = true;
640+
#endif
633641
if (ir->opcode == rv_insn_jalr
634642
#if RV32_HAS(EXT_C)
635643
|| ir->opcode == rv_insn_cjalr || ir->opcode == rv_insn_cjr
@@ -950,7 +958,7 @@ static block_t *block_find_or_translate(riscv_t *rv)
950958
block_t *next = block_find(map, rv->PC);
951959
#else
952960
/* lookup the next block in the block cache */
953-
block_t *next = (block_t *) cache_get(rv->block_cache, rv->PC);
961+
block_t *next = (block_t *) cache_get(rv->block_cache, rv->PC, true);
954962
#endif
955963

956964
if (!next) {
@@ -986,7 +994,7 @@ static block_t *block_find_or_translate(riscv_t *rv)
986994
rv_insn_t *taken = delete_target->ir_tail->branch_taken,
987995
*untaken = delete_target->ir_tail->branch_untaken;
988996
if (taken && taken->pc != delete_target->pc_start) {
989-
block_t *target = cache_get(rv->block_cache, taken->pc);
997+
block_t *target = cache_get(rv->block_cache, taken->pc, false);
990998
bool flag = false;
991999
list_for_each_entry_safe (entry, safe, &target->list, list) {
9921000
if (entry->block == delete_target) {
@@ -998,7 +1006,8 @@ static block_t *block_find_or_translate(riscv_t *rv)
9981006
assert(flag);
9991007
}
10001008
if (untaken && untaken->pc != delete_target->pc_start) {
1001-
block_t *target = cache_get(rv->block_cache, untaken->pc);
1009+
block_t *target =
1010+
cache_get(rv->block_cache, untaken->pc, false);
10021011
assert(target);
10031012
bool flag = false;
10041013
list_for_each_entry_safe (entry, safe, &target->list, list) {
@@ -1039,6 +1048,22 @@ static block_t *block_find_or_translate(riscv_t *rv)
10391048
}
10401049

10411050
#if RV32_HAS(JIT)
1051+
static bool runtime_profiler(riscv_t *rv, block_t *block)
1052+
{
1053+
/* Based on our observation, a high percentage of true hotspots involve high
1054+
* using frequency, loops or backward jumps. Therefore, we believe our
1055+
* profiler can use three indices to detect hotspots */
1056+
uint32_t freq = cache_freq(rv->block_cache, block->pc_start);
1057+
/* to profile the block after chaining, the block should be executed first
1058+
*/
1059+
if (unlikely(freq >= 2 && (block->backward || block->has_loops)))
1060+
return true;
1061+
/* using frequency exceeds predetermined threshold */
1062+
if (unlikely(freq == THRESHOLD))
1063+
return true;
1064+
return false;
1065+
}
1066+
10421067
typedef void (*exec_block_func_t)(riscv_t *rv, uintptr_t);
10431068
#endif
10441069

@@ -1056,7 +1081,7 @@ void rv_step(riscv_t *rv, int32_t cycles)
10561081
#if !RV32_HAS(JIT)
10571082
prev = block_find(&rv->block_map, last_pc);
10581083
#else
1059-
prev = cache_get(rv->block_cache, last_pc);
1084+
prev = cache_get(rv->block_cache, last_pc, false);
10601085
#endif
10611086
}
10621087
/* lookup the next block in block map or translate a new block,
@@ -1116,17 +1141,16 @@ void rv_step(riscv_t *rv, int32_t cycles)
11161141
prev = NULL;
11171142
continue;
11181143
} /* check if using frequency of block exceed threshold */
1119-
else if (block->translatable &&
1120-
((block->backward &&
1121-
cache_freq(rv->block_cache, block->pc_start) >= 1024) ||
1122-
cache_hot(rv->block_cache, block->pc_start))) {
1144+
else if (block->translatable && runtime_profiler(rv, block)) {
11231145
block->hot = true;
11241146
block->offset = jit_translate(rv, block);
11251147
((exec_block_func_t) state->buf)(
11261148
rv, (uintptr_t) (state->buf + block->offset));
11271149
prev = NULL;
11281150
continue;
11291151
}
1152+
set_reset(&pc_set);
1153+
has_loops = false;
11301154
#endif
11311155
/* execute the block by interpreter */
11321156
const rv_insn_t *ir = block->ir_head;
@@ -1135,6 +1159,10 @@ void rv_step(riscv_t *rv, int32_t cycles)
11351159
prev = NULL;
11361160
break;
11371161
}
1162+
#if RV32_HAS(JIT)
1163+
if (has_loops && !block->has_loops)
1164+
block->has_loops = true;
1165+
#endif
11381166
prev = block;
11391167
}
11401168
}

src/jit.c

Lines changed: 4 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -1163,60 +1163,6 @@ static void muldivmod(struct jit_state *state,
11631163
}
11641164
#endif /* RV32_HAS(EXT_M) */
11651165

1166-
#define SET_SIZE_BITS 10
1167-
#define SET_SIZE (1 << SET_SIZE_BITS)
1168-
#define SET_SLOTS_SIZE 32
1169-
HASH_FUNC_IMPL(set_hash, SET_SIZE_BITS, 1 << SET_SIZE_BITS);
1170-
1171-
/* The set consists of SET_SIZE buckets, with each bucket containing
1172-
* SET_SLOTS_SIZE slots.
1173-
*/
1174-
typedef struct {
1175-
uint32_t table[SET_SIZE][SET_SLOTS_SIZE];
1176-
} set_t;
1177-
1178-
/**
1179-
* set_reset - clear a set
1180-
* @set: a pointer points to target set
1181-
*/
1182-
static inline void set_reset(set_t *set)
1183-
{
1184-
memset(set, 0, sizeof(set_t));
1185-
}
1186-
1187-
/**
1188-
* set_add - insert a new element into the set
1189-
* @set: a pointer points to target set
1190-
* @key: the key of the inserted entry
1191-
*/
1192-
static bool set_add(set_t *set, uint32_t key)
1193-
{
1194-
const uint32_t index = set_hash(key);
1195-
uint8_t count = 0;
1196-
while (set->table[index][count]) {
1197-
if (set->table[index][count++] == key)
1198-
return false;
1199-
}
1200-
1201-
set->table[index][count] = key;
1202-
return true;
1203-
}
1204-
1205-
/**
1206-
* set_has - check whether the element exist in the set or not
1207-
* @set: a pointer points to target set
1208-
* @key: the key of the inserted entry
1209-
*/
1210-
static bool set_has(set_t *set, uint32_t key)
1211-
{
1212-
const uint32_t index = set_hash(key);
1213-
for (uint8_t count = 0; set->table[index][count]; count++) {
1214-
if (set->table[index][count] == key)
1215-
return true;
1216-
}
1217-
return false;
1218-
}
1219-
12201166
static void prepare_translate(struct jit_state *state)
12211167
{
12221168
#if defined(__x86_64__)
@@ -1497,12 +1443,14 @@ static void translate_chained_block(struct jit_state *state,
14971443
translate(state, rv, block);
14981444
rv_insn_t *ir = block->ir_tail;
14991445
if (ir->branch_untaken && !set_has(set, ir->branch_untaken->pc)) {
1500-
block_t *block1 = cache_get(rv->block_cache, ir->branch_untaken->pc);
1446+
block_t *block1 =
1447+
cache_get(rv->block_cache, ir->branch_untaken->pc, false);
15011448
if (block1->translatable)
15021449
translate_chained_block(state, rv, block1, set);
15031450
}
15041451
if (ir->branch_taken && !set_has(set, ir->branch_taken->pc)) {
1505-
block_t *block1 = cache_get(rv->block_cache, ir->branch_taken->pc);
1452+
block_t *block1 =
1453+
cache_get(rv->block_cache, ir->branch_taken->pc, false);
15061454
if (block1->translatable)
15071455
translate_chained_block(state, rv, block1, set);
15081456
}

src/main.c

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,11 +37,15 @@ static const char *opt_prog_name = "a.out";
3737
/* target argc and argv */
3838
static int prog_argc;
3939
static char **prog_args;
40-
static const char *optstr = "tgqmhd:a:";
40+
static const char *optstr = "tgqmhpd:a:";
4141

4242
/* enable misaligned memory access */
4343
static bool opt_misaligned = false;
4444

45+
/* dump profiling data */
46+
static bool opt_prof_data = false;
47+
static char *prof_out_file;
48+
4549
#define MEMIO(op) on_mem_##op
4650
#define IO_HANDLER_IMPL(type, op, RW) \
4751
static IIF(RW)( \
@@ -108,6 +112,7 @@ static void print_usage(const char *filename)
108112
" -a [filename] : dump signature to the given file, "
109113
"required by arch-test test\n"
110114
" -m : enable misaligned memory access\n"
115+
" -p : generate profiling data\n"
111116
" -h : show this message\n",
112117
filename);
113118
}
@@ -137,6 +142,9 @@ static bool parse_args(int argc, char **args)
137142
case 'm':
138143
opt_misaligned = true;
139144
break;
145+
case 'p':
146+
opt_prof_data = true;
147+
break;
140148
case 'd':
141149
opt_dump_regs = true;
142150
registers_out_file = optarg;
@@ -158,6 +166,12 @@ static bool parse_args(int argc, char **args)
158166
*/
159167
prog_args = &args[optind];
160168
opt_prog_name = prog_args[0];
169+
if (opt_prof_data) {
170+
char *prog_name = malloc(strlen(opt_prog_name) - 11);
171+
strncpy(prog_name, opt_prog_name + 8, strlen(opt_prog_name) - 12);
172+
prof_out_file = malloc(strlen(opt_prog_name) + 1);
173+
sprintf(prof_out_file, "./prof/%s.prof", prog_name);
174+
}
161175
return true;
162176
}
163177

@@ -264,6 +278,8 @@ int main(int argc, char **args)
264278
if (opt_arch_test)
265279
dump_test_signature(elf);
266280

281+
if (opt_prof_data)
282+
rv_profile(rv, prof_out_file);
267283
/* finalize the RISC-V runtime */
268284
elf_delete(elf);
269285
rv_delete(rv);

0 commit comments

Comments
 (0)