diff --git a/include/counters_mapping.h b/include/counters_mapping.h index 5fe6b20..92084f3 100644 --- a/include/counters_mapping.h +++ b/include/counters_mapping.h @@ -67,6 +67,10 @@ DEFINE_COUNTER_RANGE(cycles, all, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES) DEFINE_COUNTER_RANGE(instructions, all, PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS) +// cache misses and loads +DEFINE_COUNTER_RANGE(llc_misses, all, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES) +DEFINE_COUNTER_RANGE(llc_loads, all, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES) + // branch mispredictions DEFINE_COUNTER_RANGE(branch_misses, all, PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_MISSES) diff --git a/include/uarch.h b/include/uarch.h index 4e8f2a2..810004a 100644 --- a/include/uarch.h +++ b/include/uarch.h @@ -14,6 +14,9 @@ enum uarch { // qualcomm oryon, // arm + cortex_a53, + cortex_a55, + cortex_a73, cortex_a77, cortex_a78, cortex_x1, @@ -23,7 +26,7 @@ enum uarch { neoverse_v2, // hisilicon tsv110, - + tsv200m, unknown_arm64, arm64_begin = firestorm, arm64_end = unknown_arm64, @@ -41,6 +44,7 @@ enum uarch { sunny_cove, skylake, broadwell, + whiskylake, // amd zen1, zen2, diff --git a/include/utils.h b/include/utils.h index 849e0bc..f00e433 100644 --- a/include/utils.h +++ b/include/utils.h @@ -129,5 +129,8 @@ int virt_to_phys_user(uintptr_t *paddr, uintptr_t vaddr); #ifndef PHR_BRANCHES #define PHR_BRANCHES 64 #endif +#ifndef PHRB_BRANCHES +#define PHRB_BRANCHES 32 +#endif #endif diff --git a/meson.build b/meson.build index 89d71ee..d9ba3a1 100644 --- a/meson.build +++ b/meson.build @@ -4,6 +4,10 @@ cpp_args = [] link_args = [] cpu = target_machine.cpu_family() +avx2_support = false +avx512f_support = false +sve_support = false + if get_option('ios') message('Configured for iOS') cpp_args += ['-DIOS', '-std=c++11', '-DHOST_AARCH64', '-march=armv8.4-a'] @@ -20,12 +24,24 @@ else foreach line : r.stderr().strip().split('\n') message(line) endforeach - foreach line : r.stdout().strip().split('\n') - cpp_args += [line] + foreach line : r.stdout().strip().split('\n') # check whether the line first char is - + if line[0] == '-' + cpp_args += [line] + elif line == 'AVX2 detected' + avx2_support = true + elif line == 'AVX512F detected' + avx512f_support = true + elif line == 'SVE detected' + sve_support = true + endif endforeach message('Got CXXFLAGS:', cpp_args) endif +if cpu == 'x86_64' + message('Got CXXFLAGS:', cpp_args) +endif + message('Final CXXFLAGS:', cpp_args) message('Final LDFLAGS:', link_args) @@ -49,18 +65,81 @@ endforeach libs = [] if cpu == 'x86_64' - gather_avx2 = executable('gather_avx2', - 'src/gather.cpp', - cpp_args: ['-DAVX2', '-mavx2'], - link_with: utils, - install: true) + if avx2_support + gather_avx2 = executable('gather_avx2', + 'src/gather.cpp', + cpp_args: ['-DAVX2', '-mavx2'], + link_with: utils, + install: true) + div_avx2 = executable('div_avx2', + 'src/div.cpp', + cpp_args: ['-DAVX2', '-mavx2'], + link_with: utils, + install: true) + endif + if avx512f_support gather_avx512 = executable('gather_avx512', 'src/gather.cpp', cpp_args: ['-DAVX512', '-mavx512f'], link_with: utils, install: true) + endif +elif cpu == 'aarch64' + # gather_neon = executable('gather_neon', + # 'src/gather_aarch64.cpp', + # cpp_args: ['-DNEON'], + # link_with: utils, + # install: true) + if sve_support + gather_sve = executable('gather_sve', + 'src/gather_aarch64.cpp', + cpp_args: ['-DSVE', '-march=armv8.6-a+sve'], + link_with: utils, + install: true) + sve_fp32_add = executable('sve_fp32_add', + 'src/simd_aarch64.cpp', + cpp_args: ['-DSVE_FP32_ADD', '-march=armv8.6-a+sve'], + link_with: utils, + install: true) + sve_fp64_add = executable('sve_fp64_add', + 'src/simd_aarch64.cpp', + cpp_args: ['-DSVE_FP64_ADD', '-march=armv8.6-a+sve'], + link_with: utils, + install: true) + sve_fp32_fma = executable('sve_fp32_fma', + 'src/simd_aarch64.cpp', + cpp_args: ['-DSVE_FP32_FMA', '-march=armv8.6-a+sve'], + link_with: utils, + install: true) + sve_fp64_fma = executable('sve_fp64_fma', + 'src/simd_aarch64.cpp', + cpp_args: ['-DSVE_FP64_FMA', '-march=armv8.6-a+sve'], + link_with: utils, + install: true) + endif + neon_fp32_add = executable('neon_fp32_add', + 'src/simd_aarch64.cpp', + cpp_args: ['-DNEON_FP32_ADD'], + link_with: utils, + install: true) + neon_fp64_add = executable('neon_fp64_add', + 'src/simd_aarch64.cpp', + cpp_args: ['-DNEON_FP64_ADD'], + link_with: utils, + install: true) + neon_fp32_fma = executable('neon_fp32_fma', + 'src/simd_aarch64.cpp', + cpp_args: ['-DNEON_FP32_FMA'], + link_with: utils, + install: true) + neon_fp64_fma = executable('neon_fp64_fma', + 'src/simd_aarch64.cpp', + cpp_args: ['-DNEON_FP64_FMA'], + link_with: utils, + install: true) endif + cpp = meson.get_compiler('cpp') cpu = target_machine.cpu_family() diff --git a/src/detect_uarch.cpp b/src/detect_uarch.cpp index 44ae0e1..19cdcfb 100644 --- a/src/detect_uarch.cpp +++ b/src/detect_uarch.cpp @@ -30,6 +30,15 @@ int main() { case cortex_a77: printf("-DARM_CORTEX_A77\n"); break; + case cortex_a53: + printf("-DARM_CORTEX_A53\n"); + break; + case cortex_a55: + printf("-DARM_CORTEX_A55\n"); + break; + case cortex_a73: + printf("-DARM_CORTEX_A73\n"); + break; case cortex_x1: printf("-DARM_CORTEX_X1\n"); break; @@ -70,6 +79,10 @@ int main() { printf("-DINTEL\n"); printf("-DINTEL_BROADWELL\n"); break; + case whiskylake: + printf("-DINTEL\n"); + printf("-DINTEL_WHISKYLAKE\n"); + break; case zen1: printf("-DAMD\n"); printf("-DAMD_ZEN1\n"); diff --git a/src/div.cpp b/src/div.cpp new file mode 100644 index 0000000..46e190d --- /dev/null +++ b/src/div.cpp @@ -0,0 +1,127 @@ +#include "include/utils.h" +#include +#include +#include + +int res = 0; +const int n = 1000; +int array[n] = {0}; +const int repeat = 500; +const int unroll = 16; + +void test_1(int *indices) { +#ifdef AVX2 + __m256d index = _mm256_set1_pd(3.33); + __m256d d0 = _mm256_set1_pd(1.0001); + for (int i = 0; i < repeat; i++) { + index = _mm256_div_pd(index, d0); + index = _mm256_div_pd(index, d0); + index = _mm256_div_pd(index, d0); + index = _mm256_div_pd(index, d0); + index = _mm256_div_pd(index, d0); + index = _mm256_div_pd(index, d0); + index = _mm256_div_pd(index, d0); + index = _mm256_div_pd(index, d0); + index = _mm256_div_pd(index, d0); + index = _mm256_div_pd(index, d0); + index = _mm256_div_pd(index, d0); + index = _mm256_div_pd(index, d0); + index = _mm256_div_pd(index, d0); + index = _mm256_div_pd(index, d0); + index = _mm256_div_pd(index, d0); + index = _mm256_div_pd(index, d0); + } + res += index[0]; +#endif +#ifdef AVX512 + __m512d index = _mm512_set1_pd(3.33); + __m512d d0 = _mm512_set1_pd(1.0001); + for (int i = 0; i < repeat; i++) { + index = _mm512_div_pd(index, d0); + index = _mm512_div_pd(index, d0); + index = _mm512_div_pd(index, d0); + index = _mm512_div_pd(index, d0); + index = _mm512_div_pd(index, d0); + index = _mm512_div_pd(index, d0); + index = _mm512_div_pd(index, d0); + index = _mm512_div_pd(index, d0); + index = _mm512_div_pd(index, d0); + index = _mm512_div_pd(index, d0); + index = _mm512_div_pd(index, d0); + index = _mm512_div_pd(index, d0); + index = _mm512_div_pd(index, d0); + index = _mm512_div_pd(index, d0); + index = _mm512_div_pd(index, d0); + index = _mm512_div_pd(index, d0); + } + res += index[0]; +#endif +} + +int main(int argc, char *argv[]) { + + int opt; + while ((opt = getopt(argc, argv, "")) != -1) { + switch (opt) { + default: + fprintf(stderr, "Usage: %s [-p]\n", argv[0]); + exit(EXIT_FAILURE); + } + } + + bind_to_core(); + setup_perf_instructions(); + setup_perf_cycles(); + + // int indices[] = {0, 1, 2, 3, 4, 5, 6, 7}; +#ifdef AVX2 + const int vlen = 8; +#endif +#ifdef AVX512 + const int vlen = 16; +#endif + int indices[vlen]; + srand(time(NULL)); + for (int i = 0; i < vlen; i++) { + indices[i] = rand() % 32; + } + + printf("Numbers:"); + for (int i = 0; i < vlen; i++) { + // generate patterns + printf(" %d", indices[i]); + array[indices[i]] = indices[i]; + } + printf("\n"); + + int warmup = 1000; + + for (int i = 0; i < warmup; i++) { + test_1(indices); + } + + int m = 50000; + uint64_t cycles_before = perf_read_cycles(); + uint64_t instructions_before = perf_read_instructions(); + + for (int i = 0; i < m; i++) { + test_1(indices); + } + + uint64_t cycles_after = perf_read_cycles(); + uint64_t instructions_after = perf_read_instructions(); + + // i9-14900K: AVX2 24 cycles + // i9-12900KS: AVX2 24 cycles + // i9-10980XE: AVX2 38 cycles, AVX512 43 cycles + // EPYC 9654: AVX2 20 cycles, AVX512 33 cycles + // EPYC 7742: AVX2 21 cycles + // EPYC 7551: AVX2 20 cycles + printf("%ld cycles, %ld instructions, %.2lf ipc, %d ans\n", + (cycles_after - cycles_before) / m / repeat / unroll, + (instructions_after - instructions_before) / m / repeat / unroll, + (double)(instructions_after - instructions_before) / + (cycles_after - cycles_before), + res); + return 0; +} diff --git a/src/gather_aarch64.cpp b/src/gather_aarch64.cpp new file mode 100644 index 0000000..2fbdc97 --- /dev/null +++ b/src/gather_aarch64.cpp @@ -0,0 +1,104 @@ +#include "include/utils.h" +#include +#include +#include + +int res = 0; +const int n = 4000; +uint32_t array[n] = {0}; +const int repeat = 1200; +const int unroll = 16; + +void test_1(uint32_t *indices) { +#ifdef SVE + uint32_t tmp[svcntw()]; + svbool_t pg = svptrue_b32(); + svuint32_t index = svld1_u32(pg, indices); + for (int i = 0; i < repeat; i++) { + index = svld1_gather_u32index_u32(pg, array, index); + index = svld1_gather_u32index_u32(pg, array, index); + index = svld1_gather_u32index_u32(pg, array, index); + index = svld1_gather_u32index_u32(pg, array, index); + index = svld1_gather_u32index_u32(pg, array, index); + index = svld1_gather_u32index_u32(pg, array, index); + index = svld1_gather_u32index_u32(pg, array, index); + index = svld1_gather_u32index_u32(pg, array, index); + index = svld1_gather_u32index_u32(pg, array, index); + index = svld1_gather_u32index_u32(pg, array, index); + index = svld1_gather_u32index_u32(pg, array, index); + index = svld1_gather_u32index_u32(pg, array, index); + index = svld1_gather_u32index_u32(pg, array, index); + index = svld1_gather_u32index_u32(pg, array, index); + index = svld1_gather_u32index_u32(pg, array, index); + index = svld1_gather_u32index_u32(pg, array, index); + } + svst1_u32(pg, tmp, index); // 存储向量到数组 + res += tmp[0]; +#endif +#ifdef NEON + +#endif +} + +int main(int argc, char *argv[]) { + + int opt; + while ((opt = getopt(argc, argv, "")) != -1) { + switch (opt) { + default: + fprintf(stderr, "Usage: %s [-p]\n", argv[0]); + exit(EXIT_FAILURE); + } + } + + bind_to_core(); + setup_perf_instructions(); + setup_perf_cycles(); + + // int indices[] = {0, 1, 2, 3, 4, 5, 6, 7}; +#ifdef NEON + const int vlen = 4; +#endif + +#ifdef SVE + const int vlen = svcntw(); +#endif + uint32_t indices[vlen]; + srand(time(NULL)); + for (int i = 0; i < vlen; i++) { + indices[i] = rand() % 32; + } + + printf("Numbers:"); + for (int i = 0; i < vlen; i++) { + // generate patterns + printf(" %d", indices[i]); + array[indices[i]] = indices[i]; + } + printf("\n"); + + int warmup = 1000; + + for (int i = 0; i < warmup; i++) { + test_1(indices); + } + + int m = 50000; + uint64_t cycles_before = perf_read_cycles(); + uint64_t instructions_before = perf_read_instructions(); + + for (int i = 0; i < m; i++) { + test_1(indices); + } + + uint64_t cycles_after = perf_read_cycles(); + uint64_t instructions_after = perf_read_instructions(); + + printf("%ld cycles, %ld instructions, %.2lf ipc, %d ans\n", + (cycles_after - cycles_before) / m / repeat / unroll, + (instructions_after - instructions_before) / m / repeat / unroll, + (double)(instructions_after - instructions_before) / + (cycles_after - cycles_before), + res); + return 0; +} diff --git a/src/ipc.cpp b/src/ipc.cpp index 342f920..c68164b 100644 --- a/src/ipc.cpp +++ b/src/ipc.cpp @@ -37,7 +37,7 @@ int main(int argc, char *argv[]) { test_1(); } - int m = 50000; + int m = 100000; uint64_t cycles_before = perf_read_cycles(); uint64_t instructions_before = perf_read_instructions(); diff --git a/src/simd_aarch64.cpp b/src/simd_aarch64.cpp new file mode 100644 index 0000000..42525ef --- /dev/null +++ b/src/simd_aarch64.cpp @@ -0,0 +1,312 @@ +#include "include/utils.h" +#include +#include +#include +#include + +int res = 0; +const int n = 4000; +uint32_t array[n] = {0}; +const int repeat = 1200; +const int unroll = 16; + +#ifdef SVE_FP32_ADD +void test1(float *indices) { + float tmp[svcntw()]; + svbool_t pg = svptrue_b32(); + svfloat32_t v0 = svdup_f32(1.0); + svfloat32_t v1 = svld1_f32(pg, indices); + for (int i = 0; i < n; i++) { + v1 = svadd_f32_z(pg, v1, v0); + v1 = svadd_f32_z(pg, v1, v0); + v1 = svadd_f32_z(pg, v1, v0); + v1 = svadd_f32_z(pg, v1, v0); + v1 = svadd_f32_z(pg, v1, v0); + v1 = svadd_f32_z(pg, v1, v0); + v1 = svadd_f32_z(pg, v1, v0); + v1 = svadd_f32_z(pg, v1, v0); + v1 = svadd_f32_z(pg, v1, v0); + v1 = svadd_f32_z(pg, v1, v0); + v1 = svadd_f32_z(pg, v1, v0); + v1 = svadd_f32_z(pg, v1, v0); + v1 = svadd_f32_z(pg, v1, v0); + v1 = svadd_f32_z(pg, v1, v0); + v1 = svadd_f32_z(pg, v1, v0); + v1 = svadd_f32_z(pg, v1, v0); + } + svst1_f32(pg, tmp, v1); + res += tmp[0]; +} +#endif + +#ifdef NEON_FP32_ADD +void test1(float *indices) { + float tmp[4]; + float32x4_t v0 = vdupq_n_f32(1.0); + float32x4_t v1 = vld1q_f32(indices); + for (int i = 0; i < n; i++) { + v1 = vaddq_f32(v1, v0); + v1 = vaddq_f32(v1, v0); + v1 = vaddq_f32(v1, v0); + v1 = vaddq_f32(v1, v0); + v1 = vaddq_f32(v1, v0); + v1 = vaddq_f32(v1, v0); + v1 = vaddq_f32(v1, v0); + v1 = vaddq_f32(v1, v0); + v1 = vaddq_f32(v1, v0); + v1 = vaddq_f32(v1, v0); + v1 = vaddq_f32(v1, v0); + v1 = vaddq_f32(v1, v0); + v1 = vaddq_f32(v1, v0); + v1 = vaddq_f32(v1, v0); + v1 = vaddq_f32(v1, v0); + v1 = vaddq_f32(v1, v0); + } + vst1q_f32(tmp, v1); + res += tmp[0]; +} +#endif + +#ifdef SVE_FP64_ADD +void test1(double *indices) { + double tmp[svcntd()]; + svbool_t pg = svptrue_b64(); + svfloat64_t v0 = svdup_f64(1.0); + svfloat64_t v1 = svld1_f64(pg, indices); + for (int i = 0; i < n; i++) { + v1 = svadd_f64_z(pg, v1, v0); + v1 = svadd_f64_z(pg, v1, v0); + v1 = svadd_f64_z(pg, v1, v0); + v1 = svadd_f64_z(pg, v1, v0); + v1 = svadd_f64_z(pg, v1, v0); + v1 = svadd_f64_z(pg, v1, v0); + v1 = svadd_f64_z(pg, v1, v0); + v1 = svadd_f64_z(pg, v1, v0); + v1 = svadd_f64_z(pg, v1, v0); + v1 = svadd_f64_z(pg, v1, v0); + v1 = svadd_f64_z(pg, v1, v0); + v1 = svadd_f64_z(pg, v1, v0); + v1 = svadd_f64_z(pg, v1, v0); + v1 = svadd_f64_z(pg, v1, v0); + v1 = svadd_f64_z(pg, v1, v0); + v1 = svadd_f64_z(pg, v1, v0); + } + svst1_f64(pg, tmp, v1); + res += tmp[0]; +} +#endif + +#ifdef NEON_FP64_ADD +void test1(double *indices) { + double tmp[2]; + float64x2_t v0 = vdupq_n_f64(1.0); + float64x2_t v1 = vld1q_f64(indices); + for (int i = 0; i < n; i++) { + v1 = vaddq_f64(v1, v0); + v1 = vaddq_f64(v1, v0); + v1 = vaddq_f64(v1, v0); + v1 = vaddq_f64(v1, v0); + v1 = vaddq_f64(v1, v0); + v1 = vaddq_f64(v1, v0); + v1 = vaddq_f64(v1, v0); + v1 = vaddq_f64(v1, v0); + v1 = vaddq_f64(v1, v0); + v1 = vaddq_f64(v1, v0); + v1 = vaddq_f64(v1, v0); + v1 = vaddq_f64(v1, v0); + v1 = vaddq_f64(v1, v0); + v1 = vaddq_f64(v1, v0); + v1 = vaddq_f64(v1, v0); + v1 = vaddq_f64(v1, v0); + } + vst1q_f64(tmp, v1); + res += tmp[0]; +} +#endif + +#ifdef SVE_FP32_FMA +void test1(float *indices) { + float tmp[svcntw()]; + svbool_t pg = svptrue_b32(); + svfloat32_t v0 = svdup_f32(1.0); + svfloat32_t v1 = svld1_f32(pg, indices); + for (int i = 0; i < n; i++) { + v1 = svmla_n_f32_z(pg, v1, v0, 1.0); + v1 = svmla_n_f32_z(pg, v1, v0, 1.0); + v1 = svmla_n_f32_z(pg, v1, v0, 1.0); + v1 = svmla_n_f32_z(pg, v1, v0, 1.0); + v1 = svmla_n_f32_z(pg, v1, v0, 1.0); + v1 = svmla_n_f32_z(pg, v1, v0, 1.0); + v1 = svmla_n_f32_z(pg, v1, v0, 1.0); + v1 = svmla_n_f32_z(pg, v1, v0, 1.0); + v1 = svmla_n_f32_z(pg, v1, v0, 1.0); + v1 = svmla_n_f32_z(pg, v1, v0, 1.0); + v1 = svmla_n_f32_z(pg, v1, v0, 1.0); + v1 = svmla_n_f32_z(pg, v1, v0, 1.0); + v1 = svmla_n_f32_z(pg, v1, v0, 1.0); + v1 = svmla_n_f32_z(pg, v1, v0, 1.0); + v1 = svmla_n_f32_z(pg, v1, v0, 1.0); + v1 = svmla_n_f32_z(pg, v1, v0, 1.0); + } + svst1_f32(pg, tmp, v1); + res += tmp[0]; +} +#endif + +#ifdef NEON_FP32_FMA +void test1(float *indices) { + float tmp[4]; + float32x4_t v0 = vdupq_n_f32(1.0); + float32x4_t v1 = vld1q_f32(indices); + for (int i = 0; i < n; i++) { + v1 = vfmaq_n_f32(v1, v0, 1.0); + v1 = vfmaq_n_f32(v1, v0, 1.0); + v1 = vfmaq_n_f32(v1, v0, 1.0); + v1 = vfmaq_n_f32(v1, v0, 1.0); + v1 = vfmaq_n_f32(v1, v0, 1.0); + v1 = vfmaq_n_f32(v1, v0, 1.0); + v1 = vfmaq_n_f32(v1, v0, 1.0); + v1 = vfmaq_n_f32(v1, v0, 1.0); + v1 = vfmaq_n_f32(v1, v0, 1.0); + v1 = vfmaq_n_f32(v1, v0, 1.0); + v1 = vfmaq_n_f32(v1, v0, 1.0); + v1 = vfmaq_n_f32(v1, v0, 1.0); + v1 = vfmaq_n_f32(v1, v0, 1.0); + v1 = vfmaq_n_f32(v1, v0, 1.0); + v1 = vfmaq_n_f32(v1, v0, 1.0); + v1 = vfmaq_n_f32(v1, v0, 1.0); + } + vst1q_f32(tmp, v1); + res += tmp[0]; +} +#endif + +#ifdef SVE_FP64_FMA +void test1(double *indices) { + double tmp[svcntd()]; + svbool_t pg = svptrue_b64(); + svfloat64_t v0 = svdup_f64(1.0); + svfloat64_t v1 = svld1_f64(pg, indices); + for (int i = 0; i < n; i++) { + v1 = svmla_n_f64_z(pg, v1, v0, 1.0); + v1 = svmla_n_f64_z(pg, v1, v0, 1.0); + v1 = svmla_n_f64_z(pg, v1, v0, 1.0); + v1 = svmla_n_f64_z(pg, v1, v0, 1.0); + v1 = svmla_n_f64_z(pg, v1, v0, 1.0); + v1 = svmla_n_f64_z(pg, v1, v0, 1.0); + v1 = svmla_n_f64_z(pg, v1, v0, 1.0); + v1 = svmla_n_f64_z(pg, v1, v0, 1.0); + v1 = svmla_n_f64_z(pg, v1, v0, 1.0); + v1 = svmla_n_f64_z(pg, v1, v0, 1.0); + v1 = svmla_n_f64_z(pg, v1, v0, 1.0); + v1 = svmla_n_f64_z(pg, v1, v0, 1.0); + v1 = svmla_n_f64_z(pg, v1, v0, 1.0); + v1 = svmla_n_f64_z(pg, v1, v0, 1.0); + v1 = svmla_n_f64_z(pg, v1, v0, 1.0); + v1 = svmla_n_f64_z(pg, v1, v0, 1.0); + } + svst1_f64(pg, tmp, v1); + res += tmp[0]; +} +#endif + +#ifdef NEON_FP64_FMA +void test1(double *indices) { + double tmp[2]; + float64x2_t v0 = vdupq_n_f64(1.0); + float64x2_t v1 = vld1q_f64(indices); + for (int i = 0; i < n; i++) { + v1 = vfmaq_n_f64(v1, v0, 1.0); + v1 = vfmaq_n_f64(v1, v0, 1.0); + v1 = vfmaq_n_f64(v1, v0, 1.0); + v1 = vfmaq_n_f64(v1, v0, 1.0); + v1 = vfmaq_n_f64(v1, v0, 1.0); + v1 = vfmaq_n_f64(v1, v0, 1.0); + v1 = vfmaq_n_f64(v1, v0, 1.0); + v1 = vfmaq_n_f64(v1, v0, 1.0); + v1 = vfmaq_n_f64(v1, v0, 1.0); + v1 = vfmaq_n_f64(v1, v0, 1.0); + v1 = vfmaq_n_f64(v1, v0, 1.0); + v1 = vfmaq_n_f64(v1, v0, 1.0); + v1 = vfmaq_n_f64(v1, v0, 1.0); + v1 = vfmaq_n_f64(v1, v0, 1.0); + v1 = vfmaq_n_f64(v1, v0, 1.0); + v1 = vfmaq_n_f64(v1, v0, 1.0); + } + vst1q_f64(tmp, v1); + res += tmp[0]; +} +#endif + +int main(int argc, char *argv[]) { + + int opt; + while ((opt = getopt(argc, argv, "")) != -1) { + switch (opt) { + default: + fprintf(stderr, "Usage: %s [-p]\n", argv[0]); + exit(EXIT_FAILURE); + } + } + + bind_to_core(); + setup_perf_instructions(); + setup_perf_cycles(); + + // int indices[] = {0, 1, 2, 3, 4, 5, 6, 7}; +#if defined(NEON_FP32_ADD) || defined(NEON_FP32_FMA) + const int vlen = 4; + float indices[vlen]; +#endif + +#if defined(NEON_FP64_ADD) || defined(NEON_FP64_FMA) + const int vlen = 2; + double indices[vlen]; +#endif + +#if defined(SVE_FP32_ADD) || defined(SVE_FP32_FMA) + const int vlen = svcntw(); + float indices[vlen]; +#endif + +#if defined(SVE_FP64_ADD) || defined(SVE_FP64_FMA) + const int vlen = svcntd(); + double indices[vlen]; +#endif + for (int i = 0; i < vlen; i++) { + indices[i] = i + 1.0; + } + + // printf("Numbers:"); + // for (int i = 0; i < vlen; i++) { + // // generate patterns + // printf(" %d", indices[i]); + // array[indices[i]] = indices[i]; + // } + // printf("\n"); + + int warmup = 1000; + + for (int i = 0; i < warmup; i++) { + test1(indices); + } + + int m = 50000; + uint64_t cycles_before = perf_read_cycles(); + uint64_t instructions_before = perf_read_instructions(); + + for (int i = 0; i < m; i++) { + test1(indices); + } + + uint64_t cycles_after = perf_read_cycles(); + uint64_t instructions_after = perf_read_instructions(); + + printf("%ld cycles, %ld instructions, %.2lf ipc, %d ans\n", + (cycles_after - cycles_before) / m / repeat / unroll, + (instructions_after - instructions_before) / m / repeat / unroll, + (double)(instructions_after - instructions_before) / + (cycles_after - cycles_before), + res); + return 0; +} diff --git a/src/uarch.cpp b/src/uarch.cpp index dbc3b70..7c292fa 100644 --- a/src/uarch.cpp +++ b/src/uarch.cpp @@ -74,6 +74,9 @@ enum uarch get_uarch_inner() { // arm64 int implementer = 0; int part = 0; + bool sve = false; + bool avx512f = false; + bool avx2 = false; while (std::getline(t, line)) { size_t pos = line.find(':'); @@ -91,11 +94,33 @@ enum uarch get_uarch_inner() { part = std::stoi(value, nullptr, 16); } else if (key == "Model Name" && value == " Loongson-3C5000") { return la464; + } else if (key == "flags") { + if (value.find("avx512f") != std::string::npos && !avx512f) { + avx512f = true; + } + if (value.find("avx2") != std::string::npos && !avx2) { + avx2 = true; + } + } else if (key == "Features") { + if (value.find("sve") != std::string::npos && !sve) { + sve = true; + } } } } fprintf(stderr, "Found CPU family %d, model %d, implementer %d, part %d\n", family, model, implementer, part); + + if (avx2) { + fprintf(stdout, "AVX2 detected\n"); + } + if (avx512f) { + fprintf(stdout, "AVX512F detected\n"); + } + if (sve) { + fprintf(stdout, "SVE detected\n"); + } + if (family == 6 && model == 183) { fprintf(stderr, "Intel Raptor Lake detected\n"); fprintf(stderr, "Configured for Golden Cove\n"); @@ -113,6 +138,9 @@ enum uarch get_uarch_inner() { fprintf(stderr, "Intel Broadwell detected\n"); return broadwell; // https://en.wikichip.org/wiki/amd/cpuid + } else if (family == 6 && model == 142) { + fprintf(stderr, "Intel Whiskylake detected\n"); + return skylake; } else if (family == 23 && model == 1) { fprintf(stderr, "AMD Zen 1 detected\n"); return zen1; @@ -135,6 +163,18 @@ enum uarch get_uarch_inner() { } else if (implementer == 0x41 && part == 0xd0d) { fprintf(stderr, "ARM Cortex A77 detected\n"); return cortex_a77; + } else if (implementer == 0x41 && part == 0xd41) { + fprintf(stderr, "ARM Cortex A78 detected\n"); + return cortex_a78; + } else if (implementer == 0x41 && part == 0xd05) { + fprintf(stderr, "ARM Cortex A55 detected\n"); + return cortex_a55; + } else if (implementer == 0x41 && part == 0xd03) { + fprintf(stderr, "ARM Cortex A75 detected\n"); + return cortex_a53; + } else if (implementer == 0x41 && part == 0xd09) { + fprintf(stderr, "ARM Cortex A76 detected\n"); + return cortex_a73; } else if (implementer == 0x41 && part == 0xd40) { fprintf(stderr, "ARM Neoverse V1 detected\n"); return neoverse_v1;