Skip to content
4 changes: 4 additions & 0 deletions include/counters_mapping.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,10 @@ DEFINE_COUNTER_RANGE(cycles, all, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES)
DEFINE_COUNTER_RANGE(instructions, all, PERF_TYPE_HARDWARE,
PERF_COUNT_HW_INSTRUCTIONS)

// cache misses and cache references (generic perf hardware events)
// NOTE(review): llc_loads is backed by PERF_COUNT_HW_CACHE_REFERENCES, which
// counts cache accesses rather than loads only; whether these two events refer
// to the last-level cache depends on the CPU - confirm the llc_ naming.
DEFINE_COUNTER_RANGE(llc_misses, all, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES)
DEFINE_COUNTER_RANGE(llc_loads, all, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES)

// branch mispredictions
DEFINE_COUNTER_RANGE(branch_misses, all, PERF_TYPE_HARDWARE,
PERF_COUNT_HW_BRANCH_MISSES)
Expand Down
6 changes: 5 additions & 1 deletion include/uarch.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ enum uarch {
// qualcomm
oryon,
// arm
cortex_a53,
cortex_a55,
cortex_a73,
cortex_a77,
cortex_a78,
cortex_x1,
Expand All @@ -23,7 +26,7 @@ enum uarch {
neoverse_v2,
// hisilicon
tsv110,

tsv200m,
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are there any references to this model? If not, I'd rather not include it.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As you know, HiSilicon does not release any official specs, so I'm not entirely sure. But it refers to the microarchitecture of the chip on my OrangePi AI Pro.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I understand, but please avoid non-public architecture names.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could I use the SKU to refer to the architecture? If it is not set, users of this HiSilicon chip might run into problems.

unknown_arm64,
arm64_begin = firestorm,
arm64_end = unknown_arm64,
Expand All @@ -41,6 +44,7 @@ enum uarch {
sunny_cove,
skylake,
broadwell,
whiskylake,
// amd
zen1,
zen2,
Expand Down
3 changes: 3 additions & 0 deletions include/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -129,5 +129,8 @@ int virt_to_phys_user(uintptr_t *paddr, uintptr_t vaddr);
#ifndef PHR_BRANCHES
#define PHR_BRANCHES 64
#endif
#ifndef PHRB_BRANCHES
#define PHRB_BRANCHES 32
#endif

#endif
88 changes: 81 additions & 7 deletions meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@ cpp_args = []
link_args = []
cpu = target_machine.cpu_family()

avx2_support = false
avx512f_support = false
sve_support = false

if get_option('ios')
message('Configured for iOS')
cpp_args += ['-DIOS', '-std=c++11', '-DHOST_AARCH64', '-march=armv8.4-a']
Expand All @@ -20,12 +24,24 @@ else
foreach line : r.stderr().strip().split('\n')
message(line)
endforeach
foreach line : r.stdout().strip().split('\n')
cpp_args += [line]
# Each stdout line of the detector is either a compiler flag (starts with '-')
# or an "<ISA> detected" marker that enables the matching optional binaries.
# startswith() is used rather than line[0]: when the detector prints nothing,
# split() still yields one empty string, and indexing an empty string is a
# configure-time error. Empty or unrecognized lines are ignored.
foreach line : r.stdout().strip().split('\n')
  if line.startswith('-')
    cpp_args += [line]
  elif line == 'AVX2 detected'
    avx2_support = true
  elif line == 'AVX512F detected'
    avx512f_support = true
  elif line == 'SVE detected'
    sve_support = true
  endif
endforeach
message('Got CXXFLAGS:', cpp_args)
endif

if cpu == 'x86_64'
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't we already print CXXFLAGS above?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh it's for ISA detection.

message('Got CXXFLAGS:', cpp_args)
endif


message('Final CXXFLAGS:', cpp_args)
message('Final LDFLAGS:', link_args)
Expand All @@ -49,18 +65,76 @@ endforeach
libs = []

if cpu == 'x86_64'
gather_avx2 = executable('gather_avx2',
'src/gather.cpp',
cpp_args: ['-DAVX2', '-mavx2'],
link_with: utils,
install: true)
if avx2_support
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we can just build these binaries without checking if the cpu actually supports it? It allows us to build them on one machine, and run them on another.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's fine. I was just afraid that it might cause confusion.

gather_avx2 = executable('gather_avx2',
'src/gather.cpp',
cpp_args: ['-DAVX2', '-mavx2'],
link_with: utils,
install: true)
endif
if avx512f_support
gather_avx512 = executable('gather_avx512',
'src/gather.cpp',
cpp_args: ['-DAVX512', '-mavx512f'],
link_with: utils,
install: true)
endif
elif cpu == 'aarch64'
# gather_neon = executable('gather_neon',
# 'src/gather_aarch64.cpp',
# cpp_args: ['-DNEON'],
# link_with: utils,
# install: true)
if sve_support
gather_sve = executable('gather_sve',
'src/gather_aarch64.cpp',
cpp_args: ['-DSVE', '-march=armv8.6-a+sve'],
link_with: utils,
install: true)
sve_fp32_add = executable('sve_fp32_add',
'src/simd_aarch64.cpp',
cpp_args: ['-DSVE_FP32_ADD', '-march=armv8.6-a+sve'],
link_with: utils,
install: true)
sve_fp64_add = executable('sve_fp64_add',
'src/simd_aarch64.cpp',
cpp_args: ['-DSVE_FP64_ADD', '-march=armv8.6-a+sve'],
link_with: utils,
install: true)
sve_fp32_fma = executable('sve_fp32_fma',
'src/simd_aarch64.cpp',
cpp_args: ['-DSVE_FP32_FMA', '-march=armv8.6-a+sve'],
link_with: utils,
install: true)
sve_fp64_fma = executable('sve_fp64_fma',
'src/simd_aarch64.cpp',
cpp_args: ['-DSVE_FP64_FMA', '-march=armv8.6-a+sve'],
link_with: utils,
install: true)
endif
neon_fp32_add = executable('neon_fp32_add',
'src/simd_aarch64.cpp',
cpp_args: ['-DNEON_FP32_ADD'],
link_with: utils,
install: true)
neon_fp64_add = executable('neon_fp64_add',
'src/simd_aarch64.cpp',
cpp_args: ['-DNEON_FP64_ADD'],
link_with: utils,
install: true)
neon_fp32_fma = executable('neon_fp32_fma',
'src/simd_aarch64.cpp',
cpp_args: ['-DNEON_FP32_FMA'],
link_with: utils,
install: true)
neon_fp64_fma = executable('neon_fp64_fma',
'src/simd_aarch64.cpp',
cpp_args: ['-DNEON_FP64_FMA'],
link_with: utils,
install: true)
endif


cpp = meson.get_compiler('cpp')
cpu = target_machine.cpu_family()

Expand Down
16 changes: 16 additions & 0 deletions src/detect_uarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,15 @@ int main() {
case cortex_a77:
printf("-DARM_CORTEX_A77\n");
break;
case cortex_a53:
printf("-DARM_CORTEX_A53\n");
break;
case cortex_a55:
printf("-DARM_CORTEX_A55\n");
break;
case cortex_a73:
printf("-DARM_CORTEX_A73\n");
break;
case cortex_x1:
printf("-DARM_CORTEX_X1\n");
break;
Expand All @@ -48,6 +57,9 @@ int main() {
case tsv110:
printf("-DHISILICON_TSV110\n");
break;
case tsv200m:
printf("-DHISILICON_TSV200M\n");
break;
case unknown_arm64:
break;
case golden_cove:
Expand All @@ -70,6 +82,10 @@ int main() {
printf("-DINTEL\n");
printf("-DINTEL_BROADWELL\n");
break;
case whiskylake:
printf("-DINTEL\n");
printf("-DINTEL_WHISKYLAKE\n");
break;
case zen1:
printf("-DAMD\n");
printf("-DAMD_ZEN1\n");
Expand Down
2 changes: 1 addition & 1 deletion src/elimination.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

extern void elimination(FILE *fp);
int main(int argc, char *argv[]) {
FILE *fp = fopen("elimination.csv", "w");
FILE *fp = fopen("../run_results/elimination.csv", "w");
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Where does this path come from?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, it's my fault for not fixing it before opening my PR...

assert(fp);
elimination(fp);

Expand Down
104 changes: 104 additions & 0 deletions src/gather_aarch64.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
#include "include/utils.h"
#include <arm_sve.h>
#include <time.h>
#include <unistd.h>

// Accumulator that test_1 adds its result into and that main prints at the
// end (presumably so the gathers have an observable use - confirm).
int res = 0;
// Element count of the table that the gathers read from.
const int n = 4000;
// Table indexed by the gather loads; main stores each start index at its own
// position before the measurement.
uint32_t array[n] = {0};
// Number of loop iterations executed by one call to test_1.
const int repeat = 1200;
// Gathers per loop iteration. main divides the counters by this value, so it
// has to equal the number of hand-written gather statements in test_1.
const int unroll = 16;

// Benchmark kernel: runs `repeat` iterations of 16 SVE gather loads.
//
// With SVE defined, one vector of 32-bit indices is loaded from `indices`
// under an all-true predicate. Every gather reads `array` at the current
// index vector and its result becomes the index vector of the next gather,
// so the loads form a serial dependency chain. After the loop the final
// vector is stored to a temporary buffer and its first lane is added to the
// global `res`.
//
// Without SVE defined the function body is empty; the NEON section is a
// placeholder with no code.
//
// indices: start indices, read as one full SVE vector (svcntw() elements).
void test_1(uint32_t *indices) {
#ifdef SVE
// scratch buffer holding one full vector of 32-bit lanes
uint32_t tmp[svcntw()];
svbool_t pg = svptrue_b32();
svuint32_t index = svld1_u32(pg, indices);
for (int i = 0; i < repeat; i++) {
// 16 gathers per iteration; keep this count in sync with `unroll`
index = svld1_gather_u32index_u32(pg, array, index);
index = svld1_gather_u32index_u32(pg, array, index);
index = svld1_gather_u32index_u32(pg, array, index);
index = svld1_gather_u32index_u32(pg, array, index);
index = svld1_gather_u32index_u32(pg, array, index);
index = svld1_gather_u32index_u32(pg, array, index);
index = svld1_gather_u32index_u32(pg, array, index);
index = svld1_gather_u32index_u32(pg, array, index);
index = svld1_gather_u32index_u32(pg, array, index);
index = svld1_gather_u32index_u32(pg, array, index);
index = svld1_gather_u32index_u32(pg, array, index);
index = svld1_gather_u32index_u32(pg, array, index);
index = svld1_gather_u32index_u32(pg, array, index);
index = svld1_gather_u32index_u32(pg, array, index);
index = svld1_gather_u32index_u32(pg, array, index);
index = svld1_gather_u32index_u32(pg, array, index);
}
svst1_u32(pg, tmp, index); // store the vector to the array
res += tmp[0];
#endif
#ifdef NEON

#endif
}

// Entry point of the gather benchmark.
//
// Takes no command-line options; any option prints the usage line to stderr
// and exits with EXIT_FAILURE. Binds to a core, sets up the instruction and
// cycle counters, builds one vector of random start indices in [0, 32), makes
// every used table slot point at itself, warms up, then times `m` calls to
// test_1 and prints per-gather cycles, per-gather instructions, IPC and the
// accumulated `res`. Returns 0 on success.
int main(int argc, char *argv[]) {
  int opt;
  // The option string is empty, so every option lands in the default branch.
  while ((opt = getopt(argc, argv, "")) != -1) {
    switch (opt) {
    default:
      fprintf(stderr, "Usage: %s\n", argv[0]);
      exit(EXIT_FAILURE);
    }
  }

  bind_to_core();
  setup_perf_instructions();
  setup_perf_cycles();

  // Number of 32-bit lanes in one vector. A single #if/#elif chain keeps
  // `vlen` from being defined twice when both macros are set.
#if defined(SVE)
  const int vlen = svcntw();
#elif defined(NEON)
  const int vlen = 4;
#endif
  uint32_t indices[vlen];
  srand(time(NULL));
  for (int i = 0; i < vlen; i++) {
    indices[i] = rand() % 32;
  }

  printf("Numbers:");
  for (int i = 0; i < vlen; i++) {
    // array[k] = k for every start index k, so each gather returns the same
    // indices it was given.
    printf(" %u", indices[i]);
    array[indices[i]] = indices[i];
  }
  printf("\n");

  int warmup = 1000;

  for (int i = 0; i < warmup; i++) {
    test_1(indices);
  }

  int m = 50000;
  uint64_t cycles_before = perf_read_cycles();
  uint64_t instructions_before = perf_read_instructions();

  for (int i = 0; i < m; i++) {
    test_1(indices);
  }

  uint64_t cycles_after = perf_read_cycles();
  uint64_t instructions_after = perf_read_instructions();

  uint64_t cycles = cycles_after - cycles_before;
  uint64_t instructions = instructions_after - instructions_before;

  // Counters are unsigned 64-bit; cast to unsigned long long so the format
  // specifier matches the argument type on every platform.
  printf("%llu cycles, %llu instructions, %.2lf ipc, %d ans\n",
         (unsigned long long)(cycles / m / repeat / unroll),
         (unsigned long long)(instructions / m / repeat / unroll),
         (double)instructions / cycles, res);
  return 0;
}
2 changes: 1 addition & 1 deletion src/ipc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ int main(int argc, char *argv[]) {
test_1();
}

int m = 50000;
int m = 100000;
uint64_t cycles_before = perf_read_cycles();
uint64_t instructions_before = perf_read_instructions();

Expand Down
Loading