@@ -85,16 +85,21 @@ uint64_t get_time() {
8585 return (uint64_t )tv.tv_sec * 1000000000 + (uint64_t )tv.tv_usec * 1000 ;
8686}
8787
88- char **generate_random_pointer_chasing (size_t size) {
89- int page_size = getpagesize ();
90- if (size < (size_t )page_size) {
88+ char **generate_random_pointer_chasing (size_t size, size_t granularity) {
89+ if (granularity == (size_t )-1 ) {
90+ // use page size as granularity
91+ granularity = getpagesize ();
92+ }
93+
94+ if (size < granularity) {
9195 return NULL ;
9296 }
9397
94- int page_pointer_count = page_size / sizeof (char *);
98+ // number of pointers within each `granularity` bytes
99+ int pointer_count = granularity / sizeof (char *);
95100 int count = size / sizeof (char *);
96- // every page one pointer
97- int index_count = size / page_size ;
101+ // every `granularity` bytes has one pointer
102+ int index_count = size / granularity ;
98103 char **buffer = new char *[count];
99104 int *index = new int [index_count];
100105
@@ -115,11 +120,11 @@ char **generate_random_pointer_chasing(size_t size) {
115120
116121 // init circular list
117122 for (int i = 0 ; i < index_count - 1 ; i++) {
118- buffer[index[i] * page_pointer_count ] =
119- (char *)&buffer[index[i + 1 ] * page_pointer_count ];
123+ buffer[index[i] * pointer_count ] =
124+ (char *)&buffer[index[i + 1 ] * pointer_count ];
120125 }
121- buffer[index[index_count - 1 ] * page_pointer_count ] =
122- (char *)&buffer[index[0 ] * page_pointer_count ];
126+ buffer[index[index_count - 1 ] * pointer_count ] =
127+ (char *)&buffer[index[0 ] * pointer_count ];
123128
124129 delete[] index;
125130
@@ -324,17 +329,16 @@ struct counter_mapping {
324329 uint32_t type;
325330 uint64_t config;
326331
327- // for subtract fallback
328- // name = name1 - name2
329- bool subtract;
330- const char *name1;
331- const char *name2;
332-
333332 // for computed counter
334333 const char *source_counters;
335334 void *compute_fn;
336335};
337336
// Computed-counter helper: returns the first source counter minus the second.
// Expects exactly two values; the subtraction is unsigned, so it wraps around
// when the second value is larger than the first.
static uint64_t compute_subtract(const std::vector<uint64_t> counters) {
  assert(counters.size() == 2);
  const uint64_t lhs = counters[0];
  const uint64_t rhs = counters[1];
  return lhs - rhs;
}
341+
338342static counter_per_cycle
339343compute_counter_per_cycle (const std::vector<uint64_t > counters) {
340344 assert (counters.size () == 2 );
@@ -347,23 +351,17 @@ compute_counter_per_cycle(const std::vector<uint64_t> counters) {
347351// collect counter mappings
348352std::vector<counter_mapping> counter_mappings = {
349353#define DEFINE_COUNTER (_name, _uarch, _type, _config ) \
350- counter_mapping{#_name, _uarch, _uarch, _type, _config, \
351- false , NULL , NULL , NULL , NULL },
354+ counter_mapping{#_name, _uarch, _uarch, _type, _config, NULL , NULL },
352355#define DEFINE_COUNTER_RANGE (_name, _uarch, _type, _config ) \
353- counter_mapping{#_name, _uarch##_begin, _uarch##_end, _type, _config, \
354- false , NULL , NULL , NULL , NULL },
355- #define DEFINE_COUNTER_SUBTRACT (_name, _name1, _name2 ) \
356- counter_mapping{#_name, all_begin, all_end, 0 , 0 , \
357- true , #_name1, #_name2, NULL , NULL },
356+ counter_mapping{#_name, _uarch##_begin, _uarch##_end, _type, \
357+ _config, NULL , NULL },
358358#define DEFINE_COMPUTED_COUNTER (_name, _ret_type, _uarch, _fn, ...) \
359- counter_mapping{#_name, _uarch, _uarch, 0 , 0 , false , \
360- NULL , NULL , #__VA_ARGS__, (void *)_fn},
359+ counter_mapping{#_name, _uarch, _uarch, 0 , 0 , #__VA_ARGS__, (void *)_fn},
361360#define DEFINE_COMPUTED_COUNTER_RANGE (_name, _ret_type, _uarch, _fn, ...) \
362- counter_mapping{#_name, _uarch##_begin, _uarch##_end, 0 , 0 , false , \
363- NULL , NULL , #__VA_ARGS__, (void *)_fn},
361+ counter_mapping{#_name, _uarch##_begin, _uarch##_end, 0 , \
362+ 0 , #__VA_ARGS__, (void *)_fn},
364363#include " include/counters_mapping.h"
365364#undef DEFINE_COUNTER
366- #undef DEFINE_COUNTER_SUBTRACT
367365#undef DEFINE_COMPUTED_COUNTER_RANGE
368366};
369367
@@ -376,9 +374,6 @@ struct counter_mapping find_mapping(const char *name) {
376374 if (mapping.source_counters ) {
377375 printf (" Found perf counter for %s: computed from %s\n " , name,
378376 mapping.source_counters );
379- } else if (mapping.subtract ) {
380- printf (" Found perf counter for %s: %s - %s\n " , name, mapping.name1 ,
381- mapping.name2 );
382377 } else {
383378 printf (" Found perf counter for %s: type=0x%x config=0x%lx\n " , name,
384379 mapping.type , mapping.config );
@@ -412,17 +407,7 @@ std::vector<std::string> split_counters(const std::string &counters) {
412407 void setup_perf_##name() { \
413408 fprintf (stderr, " Recording PMU counter for %s\n " , #name); \
414409 counter_mapping mapping = find_mapping (#name); \
415- if (mapping.subtract ) { \
416- counter_mapping mapping1 = find_mapping (mapping.name1 ); \
417- counter_mapping mapping2 = find_mapping (mapping.name2 ); \
418- assert (!mapping1.subtract ); \
419- assert (!mapping2.subtract ); \
420- perf_counter_##name = setup_perf_common (mapping1.type , mapping1.config ); \
421- perf_counter_##name##_2 = \
422- setup_perf_common (mapping2.type , mapping2.config ); \
423- } else { \
424- perf_counter_##name = setup_perf_common (mapping.type , mapping.config ); \
425- } \
410+ perf_counter_##name = setup_perf_common (mapping.type , mapping.config ); \
426411 }
427412
428413#define DECLARE_COMPUTED_COUNTER (_type, name ) \
@@ -697,16 +682,74 @@ top_down perf_end_top_down() { return top_down{}; }
697682#elif defined(__APPLE__) && defined(IOS)
698683// ios
699684
700- #define DEFINE_COUNTER (name ) \
701- uint64_t perf_read_##name() { return get_time (); } \
702- void setup_perf_##name() { printf (" Using time instead of PMU\n " ); } \
703- void setup_perf_##name##_per_cycle() {} \
704- counter_per_cycle perf_read_##name##_per_cycle() { \
705- return counter_per_cycle (); \
706- }
685+ // Adapted from
686+ // https://github.com/junjie1475/iOS-microbench/blob/main/iOS-microbench/main.c
707687
708- #include " include/counters.h"
// Per-perf-level counter record filled in by
// proc_pidinfo(PROC_PIDTHREADCOUNTS). Field order and types must stay exactly
// as declared here.
struct proc_threadcounts_data {
  uint64_t ptcd_instructions;     // retired instructions
  uint64_t ptcd_cycles;           // elapsed cycles
  uint64_t ptcd_user_time_mach;   // presumably user time in mach time units
  uint64_t ptcd_system_time_mach; // presumably system time in mach time units
  uint64_t ptcd_energy_nj;        // presumably energy in nanojoules
};

// Header followed by a variable number of proc_threadcounts_data records
// (one per perf level; the caller decides how many to allocate room for).
struct proc_threadcounts {
  uint16_t ptc_len;
  uint16_t ptc_reserved0;
  uint32_t ptc_reserved1;
  proc_threadcounts_data ptc_counts[];
};
702+
// proc_pidinfo flavor number for per-thread counters, see
// https://github.com/apple-oss-distributions/xnu/blob/aca3beaa3dfbd42498b42c5e5ce20a938e6554e5/bsd/sys/proc_info.h#L927
#define PROC_PIDTHREADCOUNTS 34
// Size of the fixed header only, without any trailing per-level records.
#define PROC_PIDTHREADCOUNTS_SIZE (sizeof(struct proc_threadcounts))
// Declared by hand instead of pulling in a header.
extern "C" int proc_pidinfo(int pid, int flavor, uint64_t arg, void *buffer,
                            int buffersize);
708+
709+ // only support cycles and instructions
710+
711+ static uint64_t tid;
712+ static int countsize;
713+ static pid_t pid;
714+ static proc_threadcounts *rbuf = NULL ;
715+
716+ void setup_perf_common () {
717+ pid = getpid ();
718+ printf (" Got pid %d\n " , pid);
719+ // 2: p and e, two perf levels
720+ countsize = sizeof (struct proc_threadcounts ) +
721+ 2 * sizeof (struct proc_threadcounts_data );
722+ rbuf = (struct proc_threadcounts *)malloc (countsize);
723+ memset (rbuf, 0 , countsize);
724+ pthread_threadid_np (pthread_self (), &tid);
725+ printf (" Got tid %d\n " , tid);
726+ }
727+
728+ uint64_t perf_read_cycles () {
729+ proc_pidinfo (pid, PROC_PIDTHREADCOUNTS, tid, rbuf, countsize);
730+ // read all cores
731+ return rbuf->ptc_counts [0 ].ptcd_cycles + rbuf->ptc_counts [1 ].ptcd_cycles ;
732+ }
733+
734+ uint64_t perf_read_instructions () {
735+ proc_pidinfo (pid, PROC_PIDTHREADCOUNTS, tid, rbuf, countsize);
736+ // read all cores
737+ return rbuf->ptc_counts [0 ].ptcd_instructions +
738+ rbuf->ptc_counts [1 ].ptcd_instructions ;
739+ }
740+
741+ void setup_perf_cycles () { setup_perf_common (); }
742+
743+ void setup_perf_instructions () { setup_perf_common (); }
744+
745+ // provide dummy impl
746+
747+ #define DEFINE_COUNTER (name, event ) \
748+ uint64_t perf_read_##name() { return 0 ; } \
749+ void setup_perf_##name() {}
750+ #include " include/counters_mapping.h"
709751#undef DEFINE_COUNTER
752+
710753#endif
711754
712755void setup_time_or_cycles () { setup_perf_cycles (); }
@@ -715,8 +758,8 @@ uint64_t get_time_or_cycles() {
715758#ifdef __linux__
716759 if (perf_counter_cycles.fd >= 0 ) {
717760#elif defined(__APPLE__) && defined(IOS)
718- // no pmu
719- if (false ) {
761+ // perf initialized
762+ if (rbuf ) {
720763#elif defined(__APPLE__) && !defined(IOS)
721764 // perf initialized
722765 if (lib_kperf != NULL ) {
@@ -751,6 +794,12 @@ void bind_to_core() {
751794 fprintf (stderr, " Bind to E core on macOS\n " );
752795 pthread_set_qos_class_self_np (QOS_CLASS_BACKGROUND, 0 );
753796#endif
797+ #elif defined(IOS)
798+ // TODO: make it configurable
799+ // it is also not very reliable
800+ // p core
801+ fprintf (stderr, " Bind to P core on iOS\n " );
802+ pthread_set_qos_class_self_np (QOS_CLASS_USER_INTERACTIVE, 0 );
754803#endif
755804}
756805
@@ -811,6 +860,33 @@ void emit_nasm_nops(FILE *fp, int repeat) {
811860 fprintf (fp, " \t %%endrep\n " );
812861}
813862
// Emit a single x86 NOP instruction that is exactly `length` bytes long to
// `fp`, written as one "\t.byte 0x.." directive per byte.
//
// fp:     output stream receiving the assembly text
// length: size of the NOP in bytes, must be within 1..15
//
// An out-of-range length trips the assert; in builds where asserts are
// compiled out nothing is emitted instead of reading past the table.
void emit_multibyte_nops(FILE *fp, int length) {
  static const int kMaxNopLength = 15;
  // Row i holds the (i + 1)-byte encoding; unused trailing bytes are zero.
  // The table is static so it is not rebuilt on every call.
  static const uint8_t kNopEncodings[kMaxNopLength][kMaxNopLength] = {
      {0x90},
      {0x66, 0x90},
      {0x0F, 0x1F, 0x00},
      {0x0F, 0x1F, 0x40, 0x00},
      {0x0F, 0x1F, 0x44, 0x00, 0x00},
      {0x66, 0x0F, 0x1F, 0x44, 0x00, 0x00},
      {0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00},
      {0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
      {0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
      {0x66, 0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
      {0x66, 0x66, 0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
      {0x66, 0x66, 0x66, 0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
      {0x66, 0x66, 0x66, 0x66, 0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00,
       0x00},
      {0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00,
       0x00, 0x00},
      {0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00,
       0x00, 0x00, 0x00},
  };

  assert(length >= 1 && length <= kMaxNopLength);
  if (length < 1 || length > kMaxNopLength) {
    return;
  }

  const uint8_t *encoding = kNopEncodings[length - 1];
  for (int i = 0; i < length; i++) {
    fprintf(fp, "\t.byte 0x%x\n", static_cast<unsigned>(encoding[i]));
  }
}
889+
814890void arm64_la (FILE *fp, int reg, const char *format, ...) {
815891 va_list args;
816892 va_list tmp;
0 commit comments