-
Notifications
You must be signed in to change notification settings - Fork 4
Fix some problems and add more ARM support #6
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 13 commits
858cf90
d7a607b
1cc99a1
78647d4
223f872
93cb3ad
365f81a
49b31da
61f0c36
b1b1b07
3665e2b
f7efc92
7b1cc7f
5f3f0f6
46f0c57
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -14,6 +14,9 @@ enum uarch { | |
| // qualcomm | ||
| oryon, | ||
| // arm | ||
| cortex_a53, | ||
| cortex_a55, | ||
| cortex_a73, | ||
| cortex_a77, | ||
| cortex_a78, | ||
| cortex_x1, | ||
|
|
@@ -23,7 +26,7 @@ enum uarch { | |
| neoverse_v2, | ||
| // hisilicon | ||
| tsv110, | ||
|
|
||
| tsv200m, | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Are there any references to this model? If not, I'd rather not include this. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You know HiSilicon does not release any official specs, so I'm not entirely sure. But it refers to the microarchitecture on my OrangePi AI Pro. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I understand, but please avoid non-public architecture names. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could I use the SKU to refer to the architecture? If it is not set, there might be some problems for users of this HiSilicon chip. |
||
| unknown_arm64, | ||
| arm64_begin = firestorm, | ||
| arm64_end = unknown_arm64, | ||
|
|
@@ -41,6 +44,7 @@ enum uarch { | |
| sunny_cove, | ||
| skylake, | ||
| broadwell, | ||
| whiskylake, | ||
| // amd | ||
| zen1, | ||
| zen2, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -4,6 +4,10 @@ cpp_args = [] | |
| link_args = [] | ||
| cpu = target_machine.cpu_family() | ||
|
|
||
| avx2_support = false | ||
| avx512f_support = false | ||
| sve_support = false | ||
|
|
||
| if get_option('ios') | ||
| message('Configured for iOS') | ||
| cpp_args += ['-DIOS', '-std=c++11', '-DHOST_AARCH64', '-march=armv8.4-a'] | ||
|
|
@@ -20,12 +24,24 @@ else | |
| foreach line : r.stderr().strip().split('\n') | ||
| message(line) | ||
| endforeach | ||
| foreach line : r.stdout().strip().split('\n') | ||
| cpp_args += [line] | ||
| foreach line : r.stdout().strip().split('\n') # check whether the line first char is - | ||
| if line[0] == '-' | ||
| cpp_args += [line] | ||
| elif line == 'AVX2 detected' | ||
| avx2_support = true | ||
| elif line == 'AVX512F detected' | ||
| avx512f_support = true | ||
| elif line == 'SVE detected' | ||
| sve_support = true | ||
| endif | ||
| endforeach | ||
| message('Got CXXFLAGS:', cpp_args) | ||
| endif | ||
|
|
||
| if cpu == 'x86_64' | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Don't we already print CXXFLAGS above? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh it's for ISA detection. |
||
| message('Got CXXFLAGS:', cpp_args) | ||
| endif | ||
|
|
||
|
|
||
| message('Final CXXFLAGS:', cpp_args) | ||
| message('Final LDFLAGS:', link_args) | ||
|
|
@@ -49,18 +65,76 @@ endforeach | |
| libs = [] | ||
|
|
||
| if cpu == 'x86_64' | ||
| gather_avx2 = executable('gather_avx2', | ||
| 'src/gather.cpp', | ||
| cpp_args: ['-DAVX2', '-mavx2'], | ||
| link_with: utils, | ||
| install: true) | ||
| if avx2_support | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we can just build these binaries without checking whether the CPU actually supports them? That allows us to build them on one machine and run them on another. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's all right. I'm just afraid that it may cause confusion. |
||
| gather_avx2 = executable('gather_avx2', | ||
| 'src/gather.cpp', | ||
| cpp_args: ['-DAVX2', '-mavx2'], | ||
| link_with: utils, | ||
| install: true) | ||
| endif | ||
| if avx512f_support | ||
| gather_avx512 = executable('gather_avx512', | ||
| 'src/gather.cpp', | ||
| cpp_args: ['-DAVX512', '-mavx512f'], | ||
| link_with: utils, | ||
| install: true) | ||
| endif | ||
| elif cpu == 'aarch64' | ||
| # gather_neon = executable('gather_neon', | ||
| # 'src/gather_aarch64.cpp', | ||
| # cpp_args: ['-DNEON'], | ||
| # link_with: utils, | ||
| # install: true) | ||
| if sve_support | ||
| gather_sve = executable('gather_sve', | ||
| 'src/gather_aarch64.cpp', | ||
| cpp_args: ['-DSVE', '-march=armv8.6-a+sve'], | ||
| link_with: utils, | ||
| install: true) | ||
| sve_fp32_add = executable('sve_fp32_add', | ||
| 'src/simd_aarch64.cpp', | ||
| cpp_args: ['-DSVE_FP32_ADD', '-march=armv8.6-a+sve'], | ||
| link_with: utils, | ||
| install: true) | ||
| sve_fp64_add = executable('sve_fp64_add', | ||
| 'src/simd_aarch64.cpp', | ||
| cpp_args: ['-DSVE_FP64_ADD', '-march=armv8.6-a+sve'], | ||
| link_with: utils, | ||
| install: true) | ||
| sve_fp32_fma = executable('sve_fp32_fma', | ||
| 'src/simd_aarch64.cpp', | ||
| cpp_args: ['-DSVE_FP32_FMA', '-march=armv8.6-a+sve'], | ||
| link_with: utils, | ||
| install: true) | ||
| sve_fp64_fma = executable('sve_fp64_fma', | ||
| 'src/simd_aarch64.cpp', | ||
| cpp_args: ['-DSVE_FP64_FMA', '-march=armv8.6-a+sve'], | ||
| link_with: utils, | ||
| install: true) | ||
| endif | ||
| neon_fp32_add = executable('neon_fp32_add', | ||
| 'src/simd_aarch64.cpp', | ||
| cpp_args: ['-DNEON_FP32_ADD'], | ||
| link_with: utils, | ||
| install: true) | ||
| neon_fp64_add = executable('neon_fp64_add', | ||
| 'src/simd_aarch64.cpp', | ||
| cpp_args: ['-DNEON_FP64_ADD'], | ||
| link_with: utils, | ||
| install: true) | ||
| neon_fp32_fma = executable('neon_fp32_fma', | ||
| 'src/simd_aarch64.cpp', | ||
| cpp_args: ['-DNEON_FP32_FMA'], | ||
| link_with: utils, | ||
| install: true) | ||
| neon_fp64_fma = executable('neon_fp64_fma', | ||
| 'src/simd_aarch64.cpp', | ||
| cpp_args: ['-DNEON_FP64_FMA'], | ||
| link_with: utils, | ||
| install: true) | ||
| endif | ||
|
|
||
|
|
||
| cpp = meson.get_compiler('cpp') | ||
| cpu = target_machine.cpu_family() | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -6,7 +6,7 @@ | |
|
|
||
| extern void elimination(FILE *fp); | ||
| int main(int argc, char *argv[]) { | ||
| FILE *fp = fopen("elimination.csv", "w"); | ||
| FILE *fp = fopen("../run_results/elimination.csv", "w"); | ||
|
||
| assert(fp); | ||
| elimination(fp); | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,104 @@ | ||
| #include "include/utils.h" | ||
| #include <arm_sve.h> | ||
| #include <time.h> | ||
| #include <unistd.h> | ||
|
|
||
// Running sum that test_1() adds one gathered lane to on every call; main()
// prints it at the end as "ans".
| int res = 0; | ||
// Number of 32-bit elements in the table the gathers read from.
| const int n = 4000; | ||
// Gather table: main() stores each random index at its own position, and
// test_1() uses it as the base of every gather load.
| uint32_t array[n] = {0}; | ||
// Iterations of the unrolled loop inside a single test_1() call.
| const int repeat = 1200; | ||
// Gather loads per loop iteration in test_1(); main() divides by this, so it
// has to match the number of hand-unrolled statements there.
| const int unroll = 16; | ||
|
|
||
/*
 * Benchmark kernel: runs a chain of dependent 32-bit gather loads.
 *
 * indices: initial lane indices into the global `array`. When built with
 *          -DSVE, svcntw() elements are loaded from it, so the caller must
 *          provide at least that many.
 *
 * Every gather uses the result of the previous gather as its index vector,
 * so the loads in one iteration form a single dependency chain. The 16
 * statements are unrolled by hand on purpose; their count must stay equal to
 * the global `unroll` that main() divides the counters by.
 *
 * Lane 0 of the final vector is added to the global `res` (presumably so the
 * loads cannot be optimized away -- confirm against the build flags).
 * Without -DSVE the function does nothing.
 */
void test_1(uint32_t *indices) {
#ifdef SVE
  // An SVE vector is at most 2048 bits wide, i.e. 64 32-bit lanes, so a
  // fixed-size buffer holds any vector without a variable-length array
  // (VLAs are not standard C++).
  enum { max_sve_words = 2048 / 32 };
  uint32_t tmp[max_sve_words];
  svbool_t pg = svptrue_b32();
  svuint32_t index = svld1_u32(pg, indices);
  for (int i = 0; i < repeat; i++) {
    index = svld1_gather_u32index_u32(pg, array, index);
    index = svld1_gather_u32index_u32(pg, array, index);
    index = svld1_gather_u32index_u32(pg, array, index);
    index = svld1_gather_u32index_u32(pg, array, index);
    index = svld1_gather_u32index_u32(pg, array, index);
    index = svld1_gather_u32index_u32(pg, array, index);
    index = svld1_gather_u32index_u32(pg, array, index);
    index = svld1_gather_u32index_u32(pg, array, index);
    index = svld1_gather_u32index_u32(pg, array, index);
    index = svld1_gather_u32index_u32(pg, array, index);
    index = svld1_gather_u32index_u32(pg, array, index);
    index = svld1_gather_u32index_u32(pg, array, index);
    index = svld1_gather_u32index_u32(pg, array, index);
    index = svld1_gather_u32index_u32(pg, array, index);
    index = svld1_gather_u32index_u32(pg, array, index);
    index = svld1_gather_u32index_u32(pg, array, index);
  }
  // Store the vector into the array so that one lane can be read back.
  svst1_u32(pg, tmp, index);
  res += tmp[0];
#else
  // Nothing is loaded in non-SVE builds; silence the unused-parameter warning.
  (void)indices;
#endif
#ifdef NEON
  // NOTE(review): the NEON variant of this kernel is not implemented.
#endif
}
|
|
||
| int main(int argc, char *argv[]) { | ||
|
|
||
| int opt; | ||
| while ((opt = getopt(argc, argv, "")) != -1) { | ||
| switch (opt) { | ||
| default: | ||
| fprintf(stderr, "Usage: %s [-p]\n", argv[0]); | ||
| exit(EXIT_FAILURE); | ||
| } | ||
| } | ||
|
|
||
| bind_to_core(); | ||
| setup_perf_instructions(); | ||
| setup_perf_cycles(); | ||
|
|
||
| // int indices[] = {0, 1, 2, 3, 4, 5, 6, 7}; | ||
| #ifdef NEON | ||
| const int vlen = 4; | ||
| #endif | ||
|
|
||
| #ifdef SVE | ||
| const int vlen = svcntw(); | ||
| #endif | ||
| uint32_t indices[vlen]; | ||
| srand(time(NULL)); | ||
| for (int i = 0; i < vlen; i++) { | ||
| indices[i] = rand() % 32; | ||
| } | ||
|
|
||
| printf("Numbers:"); | ||
| for (int i = 0; i < vlen; i++) { | ||
| // generate patterns | ||
| printf(" %d", indices[i]); | ||
| array[indices[i]] = indices[i]; | ||
| } | ||
| printf("\n"); | ||
|
|
||
| int warmup = 1000; | ||
|
|
||
| for (int i = 0; i < warmup; i++) { | ||
| test_1(indices); | ||
| } | ||
|
|
||
| int m = 50000; | ||
| uint64_t cycles_before = perf_read_cycles(); | ||
| uint64_t instructions_before = perf_read_instructions(); | ||
|
|
||
| for (int i = 0; i < m; i++) { | ||
| test_1(indices); | ||
| } | ||
|
|
||
| uint64_t cycles_after = perf_read_cycles(); | ||
| uint64_t instructions_after = perf_read_instructions(); | ||
|
|
||
| printf("%ld cycles, %ld instructions, %.2lf ipc, %d ans\n", | ||
| (cycles_after - cycles_before) / m / repeat / unroll, | ||
| (instructions_after - instructions_before) / m / repeat / unroll, | ||
| (double)(instructions_after - instructions_before) / | ||
| (cycles_after - cycles_before), | ||
| res); | ||
| return 0; | ||
| } |
Uh oh!
There was an error while loading. Please reload this page.