Skip to content

Commit 2bc4abb

Browse files
committed
Add ios cycle counter support
1 parent 3ad67da commit 2bc4abb

File tree

2 files changed

+132
-53
lines changed

2 files changed

+132
-53
lines changed

include/utils.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,8 @@
2929

3030
// utilities
3131
std::map<const char *, size_t> get_cache_sizes();
32-
char **generate_random_pointer_chasing(size_t size);
32+
char **generate_random_pointer_chasing(size_t size,
33+
size_t granularity = (size_t)-1);
3334

3435
// get time or cycles
3536
// unit: ns or cycle
@@ -108,12 +109,14 @@ extern bool nasm;
108109
void define_gadgets_array(FILE *fp, const char *name);
109110
void add_gadget(FILE *fp, const char *format, ...);
110111
void emit_nasm_nops(FILE *fp, int repeat);
112+
void emit_multibyte_nops(FILE *fp, int length);
111113
// load address to register on arm64
112114
void arm64_la(FILE *fp, int reg, const char *format, ...);
113115

114116
// convert virtual address to physical
115117
int virt_to_phys_user(uintptr_t *paddr, uintptr_t vaddr);
116118

119+
// phr length
117120
#if defined(APPLE_M1_FIRESTORM)
118121
#define PHR_BRANCHES 100
119122
#define PHRB_BRANCHES 28

src/utils.cpp

Lines changed: 128 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -85,16 +85,21 @@ uint64_t get_time() {
8585
return (uint64_t)tv.tv_sec * 1000000000 + (uint64_t)tv.tv_usec * 1000;
8686
}
8787

88-
char **generate_random_pointer_chasing(size_t size) {
89-
int page_size = getpagesize();
90-
if (size < (size_t)page_size) {
88+
char **generate_random_pointer_chasing(size_t size, size_t granularity) {
89+
if (granularity == (size_t)-1) {
90+
// use page size as granularity
91+
granularity = getpagesize();
92+
}
93+
94+
if (size < granularity) {
9195
return NULL;
9296
}
9397

94-
int page_pointer_count = page_size / sizeof(char *);
98+
// number of pointers within each `granularity` bytes
99+
int pointer_count = granularity / sizeof(char *);
95100
int count = size / sizeof(char *);
96-
// every page one pointer
97-
int index_count = size / page_size;
101+
// every `granularity` bytes has one pointer
102+
int index_count = size / granularity;
98103
char **buffer = new char *[count];
99104
int *index = new int[index_count];
100105

@@ -115,11 +120,11 @@ char **generate_random_pointer_chasing(size_t size) {
115120

116121
// init circular list
117122
for (int i = 0; i < index_count - 1; i++) {
118-
buffer[index[i] * page_pointer_count] =
119-
(char *)&buffer[index[i + 1] * page_pointer_count];
123+
buffer[index[i] * pointer_count] =
124+
(char *)&buffer[index[i + 1] * pointer_count];
120125
}
121-
buffer[index[index_count - 1] * page_pointer_count] =
122-
(char *)&buffer[index[0] * page_pointer_count];
126+
buffer[index[index_count - 1] * pointer_count] =
127+
(char *)&buffer[index[0] * pointer_count];
123128

124129
delete[] index;
125130

@@ -324,17 +329,16 @@ struct counter_mapping {
324329
uint32_t type;
325330
uint64_t config;
326331

327-
// for subtract fallback
328-
// name = name1 - name2
329-
bool subtract;
330-
const char *name1;
331-
const char *name2;
332-
333332
// for computed counter
334333
const char *source_counters;
335334
void *compute_fn;
336335
};
337336

337+
// Helper for computed counters: given exactly two source counter values,
// returns the first minus the second. The subtraction is unsigned, so a second
// value larger than the first wraps around rather than going negative.
// NOTE(review): presumably referenced as a `compute_fn` from
// counters_mapping.h (not visible here) — confirm before changing the
// signature.
static uint64_t compute_subtract(const std::vector<uint64_t> counters) {
338+
assert(counters.size() == 2);
339+
return counters[0] - counters[1];
340+
}
341+
338342
static counter_per_cycle
339343
compute_counter_per_cycle(const std::vector<uint64_t> counters) {
340344
assert(counters.size() == 2);
@@ -347,23 +351,17 @@ compute_counter_per_cycle(const std::vector<uint64_t> counters) {
347351
// collect counter mappings
348352
std::vector<counter_mapping> counter_mappings = {
349353
#define DEFINE_COUNTER(_name, _uarch, _type, _config) \
350-
counter_mapping{#_name, _uarch, _uarch, _type, _config, \
351-
false, NULL, NULL, NULL, NULL},
354+
counter_mapping{#_name, _uarch, _uarch, _type, _config, NULL, NULL},
352355
#define DEFINE_COUNTER_RANGE(_name, _uarch, _type, _config) \
353-
counter_mapping{#_name, _uarch##_begin, _uarch##_end, _type, _config, \
354-
false, NULL, NULL, NULL, NULL},
355-
#define DEFINE_COUNTER_SUBTRACT(_name, _name1, _name2) \
356-
counter_mapping{#_name, all_begin, all_end, 0, 0, \
357-
true, #_name1, #_name2, NULL, NULL},
356+
counter_mapping{#_name, _uarch##_begin, _uarch##_end, _type, \
357+
_config, NULL, NULL},
358358
#define DEFINE_COMPUTED_COUNTER(_name, _ret_type, _uarch, _fn, ...) \
359-
counter_mapping{#_name, _uarch, _uarch, 0, 0, false, \
360-
NULL, NULL, #__VA_ARGS__, (void *)_fn},
359+
counter_mapping{#_name, _uarch, _uarch, 0, 0, #__VA_ARGS__, (void *)_fn},
361360
#define DEFINE_COMPUTED_COUNTER_RANGE(_name, _ret_type, _uarch, _fn, ...) \
362-
counter_mapping{#_name, _uarch##_begin, _uarch##_end, 0, 0, false, \
363-
NULL, NULL, #__VA_ARGS__, (void *)_fn},
361+
counter_mapping{#_name, _uarch##_begin, _uarch##_end, 0, \
362+
0, #__VA_ARGS__, (void *)_fn},
364363
#include "include/counters_mapping.h"
365364
#undef DEFINE_COUNTER
366-
#undef DEFINE_COUNTER_SUBTRACT
367365
#undef DEFINE_COMPUTED_COUNTER_RANGE
368366
};
369367

@@ -376,9 +374,6 @@ struct counter_mapping find_mapping(const char *name) {
376374
if (mapping.source_counters) {
377375
printf("Found perf counter for %s: computed from %s\n", name,
378376
mapping.source_counters);
379-
} else if (mapping.subtract) {
380-
printf("Found perf counter for %s: %s - %s\n", name, mapping.name1,
381-
mapping.name2);
382377
} else {
383378
printf("Found perf counter for %s: type=0x%x config=0x%lx\n", name,
384379
mapping.type, mapping.config);
@@ -412,17 +407,7 @@ std::vector<std::string> split_counters(const std::string &counters) {
412407
void setup_perf_##name() { \
413408
fprintf(stderr, "Recording PMU counter for %s\n", #name); \
414409
counter_mapping mapping = find_mapping(#name); \
415-
if (mapping.subtract) { \
416-
counter_mapping mapping1 = find_mapping(mapping.name1); \
417-
counter_mapping mapping2 = find_mapping(mapping.name2); \
418-
assert(!mapping1.subtract); \
419-
assert(!mapping2.subtract); \
420-
perf_counter_##name = setup_perf_common(mapping1.type, mapping1.config); \
421-
perf_counter_##name##_2 = \
422-
setup_perf_common(mapping2.type, mapping2.config); \
423-
} else { \
424-
perf_counter_##name = setup_perf_common(mapping.type, mapping.config); \
425-
} \
410+
perf_counter_##name = setup_perf_common(mapping.type, mapping.config); \
426411
}
427412

428413
#define DECLARE_COMPUTED_COUNTER(_type, name) \
@@ -697,16 +682,74 @@ top_down perf_end_top_down() { return top_down{}; }
697682
#elif defined(__APPLE__) && defined(IOS)
698683
// ios
699684

700-
#define DEFINE_COUNTER(name) \
701-
uint64_t perf_read_##name() { return get_time(); } \
702-
void setup_perf_##name() { printf("Using time instead of PMU\n"); } \
703-
void setup_perf_##name##_per_cycle() {} \
704-
counter_per_cycle perf_read_##name##_per_cycle() { \
705-
return counter_per_cycle(); \
706-
}
685+
// Adapted from
686+
// https://github.com/junjie1475/iOS-microbench/blob/main/iOS-microbench/main.c
707687

708-
#include "include/counters.h"
688+
// One record of thread counters as filled in by
// proc_pidinfo(PROC_PIDTHREADCOUNTS). This file only reads ptcd_instructions
// and ptcd_cycles.
// NOTE(review): the layout presumably mirrors the definition in xnu's
// bsd/sys/proc_info.h — confirm before reordering or resizing any field.
struct proc_threadcounts_data {
689+
uint64_t ptcd_instructions;
690+
uint64_t ptcd_cycles;
691+
uint64_t ptcd_user_time_mach;
692+
uint64_t ptcd_system_time_mach;
693+
uint64_t ptcd_energy_nj;
694+
};
695+
696+
// Header of the buffer handed to proc_pidinfo(PROC_PIDTHREADCOUNTS), followed
// by a flexible array of counter records, one per perf level (this file
// allocates room for two of them).
// NOTE(review): ptc_len presumably reports how many ptc_counts entries are
// valid; it is never read in this file — confirm against xnu's proc_info.h.
struct proc_threadcounts {
697+
uint16_t ptc_len;
698+
uint16_t ptc_reserved0;
699+
uint32_t ptc_reserved1;
700+
struct proc_threadcounts_data ptc_counts[];
701+
};
702+
703+
// https://github.com/apple-oss-distributions/xnu/blob/aca3beaa3dfbd42498b42c5e5ce20a938e6554e5/bsd/sys/proc_info.h#L927
704+
#define PROC_PIDTHREADCOUNTS 34
705+
#define PROC_PIDTHREADCOUNTS_SIZE (sizeof(struct proc_threadcounts))
706+
extern "C" int proc_pidinfo(int pid, int flavor, uint64_t arg, void *buffer,
707+
int buffersize);
708+
709+
// only support cycles and instructions
710+
711+
static uint64_t tid;
712+
static int countsize;
713+
static pid_t pid;
714+
static proc_threadcounts *rbuf = NULL;
715+
716+
// Prepare the file-level state shared by perf_read_cycles() and
// perf_read_instructions(): records the pid and the calling thread's id, and
// makes sure `rbuf` points at a zeroed buffer with room for the header plus
// two counter records.
//
// May be called more than once (setup_perf_cycles() and
// setup_perf_instructions() both call it): the buffer is allocated on the
// first call only and simply re-zeroed afterwards. If the allocation fails a
// message is written to stderr and `rbuf` stays NULL.
void setup_perf_common() {
  pid = getpid();
  printf("Got pid %d\n", pid);
  // 2: p and e, two perf levels
  countsize = sizeof(struct proc_threadcounts) +
              2 * sizeof(struct proc_threadcounts_data);
  // reuse the buffer from an earlier call instead of leaking it
  if (rbuf == NULL) {
    rbuf = (struct proc_threadcounts *)malloc(countsize);
    if (rbuf == NULL) {
      fprintf(stderr, "Failed to allocate %d bytes for thread counters\n",
              countsize);
      return;
    }
  }
  memset(rbuf, 0, countsize);
  pthread_threadid_np(pthread_self(), &tid);
  // tid is a 64-bit value; "%d" would consume it as an int
  printf("Got tid %llu\n", (unsigned long long)tid);
}
727+
728+
uint64_t perf_read_cycles() {
729+
proc_pidinfo(pid, PROC_PIDTHREADCOUNTS, tid, rbuf, countsize);
730+
// read all cores
731+
return rbuf->ptc_counts[0].ptcd_cycles + rbuf->ptc_counts[1].ptcd_cycles;
732+
}
733+
734+
uint64_t perf_read_instructions() {
735+
proc_pidinfo(pid, PROC_PIDTHREADCOUNTS, tid, rbuf, countsize);
736+
// read all cores
737+
return rbuf->ptc_counts[0].ptcd_instructions +
738+
rbuf->ptc_counts[1].ptcd_instructions;
739+
}
740+
741+
// Cycle counting needs no per-counter state on iOS: it only requires the
// shared pid/tid/buffer setup.
void setup_perf_cycles() { setup_perf_common(); }
742+
743+
// Instruction counting needs no per-counter state on iOS: it only requires the
// shared pid/tid/buffer setup.
void setup_perf_instructions() { setup_perf_common(); }
744+
745+
// provide dummy impl
746+
747+
// Stub generator: for every DEFINE_COUNTER entry pulled in from
// counters_mapping.h, defines a perf_read_<name>() that always returns 0 and
// an empty setup_perf_<name>().
// NOTE(review): elsewhere in this file the same header is expanded through a
// four-argument DEFINE_COUNTER(_name, _uarch, _type, _config); confirm that it
// also works with this two-argument form and that it does not define
// perf_read_cycles / perf_read_instructions a second time — TODO verify.
#define DEFINE_COUNTER(name, event) \
748+
uint64_t perf_read_##name() { return 0; } \
749+
void setup_perf_##name() {}
750+
#include "include/counters_mapping.h"
709751
#undef DEFINE_COUNTER
752+
710753
#endif
711754

712755
void setup_time_or_cycles() { setup_perf_cycles(); }
@@ -715,8 +758,8 @@ uint64_t get_time_or_cycles() {
715758
#ifdef __linux__
716759
if (perf_counter_cycles.fd >= 0) {
717760
#elif defined(__APPLE__) && defined(IOS)
718-
// no pmu
719-
if (false) {
761+
// perf initialized
762+
if (rbuf) {
720763
#elif defined(__APPLE__) && !defined(IOS)
721764
// perf initialized
722765
if (lib_kperf != NULL) {
@@ -751,6 +794,12 @@ void bind_to_core() {
751794
fprintf(stderr, "Bind to E core on macOS\n");
752795
pthread_set_qos_class_self_np(QOS_CLASS_BACKGROUND, 0);
753796
#endif
797+
#elif defined(IOS)
798+
// TODO: make it configurable
799+
// it is also not very reliable
800+
// p core
801+
fprintf(stderr, "Bind to P core on iOS\n");
802+
pthread_set_qos_class_self_np(QOS_CLASS_USER_INTERACTIVE, 0);
754803
#endif
755804
}
756805

@@ -811,6 +860,33 @@ void emit_nasm_nops(FILE *fp, int repeat) {
811860
fprintf(fp, "\t%%endrep\n");
812861
}
813862

863+
// Write a single x86 NOP instruction that is exactly `length` bytes long to
// `fp`, spelled out as one `.byte 0x..` assembler directive per line.
//
// fp:     destination stream for the generated assembly
// length: size of the NOP in bytes; must be in [1, 15] (checked with assert)
void emit_multibyte_nops(FILE *fp, int length) {
  // encodings[i] holds the (i + 1)-byte NOP. Trailing slots of the shorter
  // rows are zero padding and are never emitted. The table is a static
  // constant so it is not rebuilt (and heap-allocated) on every call.
  static const uint8_t encodings[15][15] = {
      {0x90},
      {0x66, 0x90},
      {0x0F, 0x1F, 0x00},
      {0x0F, 0x1F, 0x40, 0x00},
      {0x0F, 0x1F, 0x44, 0x00, 0x00},
      {0x66, 0x0F, 0x1F, 0x44, 0x00, 0x00},
      {0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00},
      {0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
      {0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
      {0x66, 0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
      {0x66, 0x66, 0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
      {0x66, 0x66, 0x66, 0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
      {0x66, 0x66, 0x66, 0x66, 0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00,
       0x00},
      {0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00,
       0x00, 0x00},
      {0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00,
       0x00, 0x00, 0x00},
  };
  assert(length >= 1 && length <= 15);
  const uint8_t *encoding = encodings[length - 1];
  for (int i = 0; i < length; i++) {
    fprintf(fp, "\t.byte 0x%x\n", encoding[i]);
  }
}
889+
814890
void arm64_la(FILE *fp, int reg, const char *format, ...) {
815891
va_list args;
816892
va_list tmp;

0 commit comments

Comments
 (0)