diff --git a/include/counters_mapping.h b/include/counters_mapping.h
index 5fe6b20..92084f3 100644
--- a/include/counters_mapping.h
+++ b/include/counters_mapping.h
@@ -67,6 +67,10 @@ DEFINE_COUNTER_RANGE(cycles, all, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES)
 DEFINE_COUNTER_RANGE(instructions, all, PERF_TYPE_HARDWARE,
                      PERF_COUNT_HW_INSTRUCTIONS)
 
+// cache misses and loads
+DEFINE_COUNTER_RANGE(llc_misses, all, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES)
+DEFINE_COUNTER_RANGE(llc_loads, all, PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES)
+
 // branch mispredictions
 DEFINE_COUNTER_RANGE(branch_misses, all, PERF_TYPE_HARDWARE,
                      PERF_COUNT_HW_BRANCH_MISSES)
diff --git a/include/uarch.h b/include/uarch.h
index 4e8f2a2..810004a 100644
--- a/include/uarch.h
+++ b/include/uarch.h
@@ -14,6 +14,9 @@ enum uarch {
   // qualcomm
   oryon,
   // arm
+  cortex_a53,
+  cortex_a55,
+  cortex_a73,
   cortex_a77,
   cortex_a78,
   cortex_x1,
@@ -23,7 +26,7 @@ enum uarch {
   neoverse_v2,
   // hisilicon
   tsv110,
-
+  tsv200m,
   unknown_arm64,
   arm64_begin = firestorm,
   arm64_end = unknown_arm64,
@@ -41,6 +44,7 @@ enum uarch {
   sunny_cove,
   skylake,
   broadwell,
+  whiskylake,
   // amd
   zen1,
   zen2,
diff --git a/include/utils.h b/include/utils.h
index 849e0bc..f00e433 100644
--- a/include/utils.h
+++ b/include/utils.h
@@ -129,5 +129,8 @@ int virt_to_phys_user(uintptr_t *paddr, uintptr_t vaddr);
 #ifndef PHR_BRANCHES
 #define PHR_BRANCHES 64
 #endif
+#ifndef PHRB_BRANCHES
+#define PHRB_BRANCHES 32
+#endif
 
 #endif
diff --git a/meson.build b/meson.build
index 89d71ee..d9ba3a1 100644
--- a/meson.build
+++ b/meson.build
@@ -4,6 +4,10 @@ cpp_args = []
 link_args = []
 cpu = target_machine.cpu_family()
 
+avx2_support = false
+avx512f_support = false
+sve_support = false
+
 if get_option('ios')
 	message('Configured for iOS')
 	cpp_args += ['-DIOS', '-std=c++11', '-DHOST_AARCH64', '-march=armv8.4-a']
@@ -20,12 +24,24 @@ else
 	foreach line : r.stderr().strip().split('\n')
 		message(line)
 	endforeach
-	foreach line : r.stdout().strip().split('\n')
-		cpp_args += [line]
+	foreach line : r.stdout().strip().split('\n') # check whether the line first char is -
+		if line[0] == '-'
+			cpp_args += [line]
+		elif line == 'AVX2 detected'
+			avx2_support = true
+		elif line == 'AVX512F detected'
+			avx512f_support = true
+		elif line == 'SVE detected'
+			sve_support = true
+		endif
 	endforeach
 	message('Got CXXFLAGS:', cpp_args)
 endif
 
+if cpu == 'x86_64'
+	message('Got CXXFLAGS:', cpp_args)
+endif
+
 
 message('Final CXXFLAGS:', cpp_args)
 message('Final LDFLAGS:', link_args)
@@ -49,18 +65,81 @@ endforeach
 libs = []
 
 if cpu == 'x86_64'
-	gather_avx2 = executable('gather_avx2',
-		'src/gather.cpp',
-		cpp_args: ['-DAVX2', '-mavx2'],
-		link_with: utils,
-		install: true)
+	if avx2_support
+		gather_avx2 = executable('gather_avx2',
+			'src/gather.cpp',
+			cpp_args: ['-DAVX2', '-mavx2'],
+			link_with: utils,
+			install: true)
+		div_avx2 = executable('div_avx2',
+			'src/div.cpp',
+			cpp_args: ['-DAVX2', '-mavx2'],
+			link_with: utils,
+			install: true)
+	endif
+	if avx512f_support
 	gather_avx512 = executable('gather_avx512',
 		'src/gather.cpp',
 		cpp_args: ['-DAVX512', '-mavx512f'],
 		link_with: utils,
 		install: true)
+	endif
+elif cpu == 'aarch64'
+	# gather_neon = executable('gather_neon',
+	# 	'src/gather_aarch64.cpp',
+	# 	cpp_args: ['-DNEON'],
+	# 	link_with: utils,
+	# 	install: true)
+	if sve_support
+		gather_sve = executable('gather_sve',
+			'src/gather_aarch64.cpp',
+			cpp_args: ['-DSVE', '-march=armv8.6-a+sve'],
+			link_with: utils,
+			install: true)
+		sve_fp32_add = executable('sve_fp32_add',
+			'src/simd_aarch64.cpp',
+			cpp_args: ['-DSVE_FP32_ADD', '-march=armv8.6-a+sve'],
+			link_with: utils,
+			install: true)
+		sve_fp64_add = executable('sve_fp64_add',
+			'src/simd_aarch64.cpp',
+			cpp_args: ['-DSVE_FP64_ADD', '-march=armv8.6-a+sve'],
+			link_with: utils,
+			install: true)
+		sve_fp32_fma = executable('sve_fp32_fma',
+			'src/simd_aarch64.cpp',
+			cpp_args: ['-DSVE_FP32_FMA', '-march=armv8.6-a+sve'],
+			link_with: utils,
+			install: true)
+		sve_fp64_fma = executable('sve_fp64_fma',
+			'src/simd_aarch64.cpp',
+			cpp_args: ['-DSVE_FP64_FMA', '-march=armv8.6-a+sve'],
+			link_with: utils,
+			install: true)
+	endif
+	neon_fp32_add = executable('neon_fp32_add',
+		'src/simd_aarch64.cpp',
+		cpp_args: ['-DNEON_FP32_ADD'],
+		link_with: utils,
+		install: true)
+	neon_fp64_add = executable('neon_fp64_add',
+		'src/simd_aarch64.cpp',
+		cpp_args: ['-DNEON_FP64_ADD'],
+		link_with: utils,
+		install: true)
+	neon_fp32_fma = executable('neon_fp32_fma',
+		'src/simd_aarch64.cpp',
+		cpp_args: ['-DNEON_FP32_FMA'],
+		link_with: utils,
+		install: true)
+	neon_fp64_fma = executable('neon_fp64_fma',
+		'src/simd_aarch64.cpp',
+		cpp_args: ['-DNEON_FP64_FMA'],
+		link_with: utils,
+		install: true)
 endif
 
+
 cpp = meson.get_compiler('cpp')
 cpu = target_machine.cpu_family()
 
diff --git a/src/detect_uarch.cpp b/src/detect_uarch.cpp
index 44ae0e1..19cdcfb 100644
--- a/src/detect_uarch.cpp
+++ b/src/detect_uarch.cpp
@@ -30,6 +30,15 @@ int main() {
   case cortex_a77:
     printf("-DARM_CORTEX_A77\n");
     break;
+  case cortex_a53:
+    printf("-DARM_CORTEX_A53\n");
+    break;
+  case cortex_a55:
+    printf("-DARM_CORTEX_A55\n");
+    break;
+  case cortex_a73:
+    printf("-DARM_CORTEX_A73\n");
+    break;
   case cortex_x1:
     printf("-DARM_CORTEX_X1\n");
     break;
@@ -70,6 +79,10 @@ int main() {
     printf("-DINTEL\n");
     printf("-DINTEL_BROADWELL\n");
     break;
+  case whiskylake:
+    printf("-DINTEL\n");
+    printf("-DINTEL_WHISKYLAKE\n");
+    break;
   case zen1:
     printf("-DAMD\n");
     printf("-DAMD_ZEN1\n");
diff --git a/src/div.cpp b/src/div.cpp
new file mode 100644
index 0000000..46e190d
--- /dev/null
+++ b/src/div.cpp
@@ -0,0 +1,127 @@
+#include "include/utils.h"
+#include <immintrin.h>
+#include <time.h>
+#include <unistd.h>
+
+int res = 0;
+const int n = 1000;
+int array[n] = {0};
+const int repeat = 500;
+const int unroll = 16;
+
+void test_1(int *indices) {
+#ifdef AVX2
+  __m256d index = _mm256_set1_pd(3.33);
+  __m256d d0 = _mm256_set1_pd(1.0001);
+  for (int i = 0; i < repeat; i++) {
+    index = _mm256_div_pd(index, d0);
+    index = _mm256_div_pd(index, d0);
+    index = _mm256_div_pd(index, d0);
+    index = _mm256_div_pd(index, d0);
+    index = _mm256_div_pd(index, d0);
+    index = _mm256_div_pd(index, d0);
+    index = _mm256_div_pd(index, d0);
+    index = _mm256_div_pd(index, d0);
+    index = _mm256_div_pd(index, d0);
+    index = _mm256_div_pd(index, d0);
+    index = _mm256_div_pd(index, d0);
+    index = _mm256_div_pd(index, d0);
+    index = _mm256_div_pd(index, d0);
+    index = _mm256_div_pd(index, d0);
+    index = _mm256_div_pd(index, d0);
+    index = _mm256_div_pd(index, d0);
+  }
+  res += index[0];
+#endif
+#ifdef AVX512
+  __m512d index = _mm512_set1_pd(3.33);
+  __m512d d0 = _mm512_set1_pd(1.0001);
+  for (int i = 0; i < repeat; i++) {
+    index = _mm512_div_pd(index, d0);
+    index = _mm512_div_pd(index, d0);
+    index = _mm512_div_pd(index, d0);
+    index = _mm512_div_pd(index, d0);
+    index = _mm512_div_pd(index, d0);
+    index = _mm512_div_pd(index, d0);
+    index = _mm512_div_pd(index, d0);
+    index = _mm512_div_pd(index, d0);
+    index = _mm512_div_pd(index, d0);
+    index = _mm512_div_pd(index, d0);
+    index = _mm512_div_pd(index, d0);
+    index = _mm512_div_pd(index, d0);
+    index = _mm512_div_pd(index, d0);
+    index = _mm512_div_pd(index, d0);
+    index = _mm512_div_pd(index, d0);
+    index = _mm512_div_pd(index, d0);
+  }
+  res += index[0];
+#endif
+}
+
+int main(int argc, char *argv[]) {
+
+  int opt;
+  while ((opt = getopt(argc, argv, "")) != -1) {
+    switch (opt) {
+    default:
+      fprintf(stderr, "Usage: %s [-p]\n", argv[0]);
+      exit(EXIT_FAILURE);
+    }
+  }
+
+  bind_to_core();
+  setup_perf_instructions();
+  setup_perf_cycles();
+
+  // int indices[] = {0, 1, 2, 3, 4, 5, 6, 7};
+#ifdef AVX2
+  const int vlen = 8;
+#endif
+#ifdef AVX512
+  const int vlen = 16;
+#endif
+  int indices[vlen];
+  srand(time(NULL));
+  for (int i = 0; i < vlen; i++) {
+    indices[i] = rand() % 32;
+  }
+
+  printf("Numbers:");
+  for (int i = 0; i < vlen; i++) {
+    // generate patterns
+    printf(" %d", indices[i]);
+    array[indices[i]] = indices[i];
+  }
+  printf("\n");
+
+  int warmup = 1000;
+
+  for (int i = 0; i < warmup; i++) {
+    test_1(indices);
+  }
+
+  int m = 50000;
+  uint64_t cycles_before = perf_read_cycles();
+  uint64_t instructions_before = perf_read_instructions();
+
+  for (int i = 0; i < m; i++) {
+    test_1(indices);
+  }
+
+  uint64_t cycles_after = perf_read_cycles();
+  uint64_t instructions_after = perf_read_instructions();
+
+  // i9-14900K: AVX2 24 cycles
+  // i9-12900KS: AVX2 24 cycles
+  // i9-10980XE: AVX2 38 cycles, AVX512 43 cycles
+  // EPYC 9654: AVX2 20 cycles, AVX512 33 cycles
+  // EPYC 7742: AVX2 21 cycles
+  // EPYC 7551: AVX2 20 cycles
+  printf("%ld cycles, %ld instructions, %.2lf ipc, %d ans\n",
+         (cycles_after - cycles_before) / m / repeat / unroll,
+         (instructions_after - instructions_before) / m / repeat / unroll,
+         (double)(instructions_after - instructions_before) /
+             (cycles_after - cycles_before),
+         res);
+  return 0;
+}
diff --git a/src/gather_aarch64.cpp b/src/gather_aarch64.cpp
new file mode 100644
index 0000000..2fbdc97
--- /dev/null
+++ b/src/gather_aarch64.cpp
@@ -0,0 +1,104 @@
+#include "include/utils.h"
+#include <arm_sve.h>
+#include <time.h>
+#include <unistd.h>
+
+int res = 0;
+const int n = 4000;
+uint32_t array[n] = {0};
+const int repeat = 1200;
+const int unroll = 16;
+
+void test_1(uint32_t *indices) {
+#ifdef SVE
+  uint32_t tmp[svcntw()];
+  svbool_t pg = svptrue_b32();
+  svuint32_t index = svld1_u32(pg, indices);
+  for (int i = 0; i < repeat; i++) {
+    index = svld1_gather_u32index_u32(pg, array, index);
+    index = svld1_gather_u32index_u32(pg, array, index);
+    index = svld1_gather_u32index_u32(pg, array, index);
+    index = svld1_gather_u32index_u32(pg, array, index);
+    index = svld1_gather_u32index_u32(pg, array, index);
+    index = svld1_gather_u32index_u32(pg, array, index);
+    index = svld1_gather_u32index_u32(pg, array, index);
+    index = svld1_gather_u32index_u32(pg, array, index);
+    index = svld1_gather_u32index_u32(pg, array, index);
+    index = svld1_gather_u32index_u32(pg, array, index);
+    index = svld1_gather_u32index_u32(pg, array, index);
+    index = svld1_gather_u32index_u32(pg, array, index);
+    index = svld1_gather_u32index_u32(pg, array, index);
+    index = svld1_gather_u32index_u32(pg, array, index);
+    index = svld1_gather_u32index_u32(pg, array, index);
+    index = svld1_gather_u32index_u32(pg, array, index);
+  }
+  svst1_u32(pg, tmp, index); // 存储向量到数组
+  res += tmp[0];
+#endif
+#ifdef NEON
+
+#endif
+}
+
+int main(int argc, char *argv[]) {
+
+  int opt;
+  while ((opt = getopt(argc, argv, "")) != -1) {
+    switch (opt) {
+    default:
+      fprintf(stderr, "Usage: %s [-p]\n", argv[0]);
+      exit(EXIT_FAILURE);
+    }
+  }
+
+  bind_to_core();
+  setup_perf_instructions();
+  setup_perf_cycles();
+
+  // int indices[] = {0, 1, 2, 3, 4, 5, 6, 7};
+#ifdef NEON
+  const int vlen = 4;
+#endif
+
+#ifdef SVE
+  const int vlen = svcntw();
+#endif
+  uint32_t indices[vlen];
+  srand(time(NULL));
+  for (int i = 0; i < vlen; i++) {
+    indices[i] = rand() % 32;
+  }
+
+  printf("Numbers:");
+  for (int i = 0; i < vlen; i++) {
+    // generate patterns
+    printf(" %d", indices[i]);
+    array[indices[i]] = indices[i];
+  }
+  printf("\n");
+
+  int warmup = 1000;
+
+  for (int i = 0; i < warmup; i++) {
+    test_1(indices);
+  }
+
+  int m = 50000;
+  uint64_t cycles_before = perf_read_cycles();
+  uint64_t instructions_before = perf_read_instructions();
+
+  for (int i = 0; i < m; i++) {
+    test_1(indices);
+  }
+
+  uint64_t cycles_after = perf_read_cycles();
+  uint64_t instructions_after = perf_read_instructions();
+
+  printf("%ld cycles, %ld instructions, %.2lf ipc, %d ans\n",
+         (cycles_after - cycles_before) / m / repeat / unroll,
+         (instructions_after - instructions_before) / m / repeat / unroll,
+         (double)(instructions_after - instructions_before) /
+             (cycles_after - cycles_before),
+         res);
+  return 0;
+}
diff --git a/src/ipc.cpp b/src/ipc.cpp
index 342f920..c68164b 100644
--- a/src/ipc.cpp
+++ b/src/ipc.cpp
@@ -37,7 +37,7 @@ int main(int argc, char *argv[]) {
     test_1();
   }
 
-  int m = 50000;
+  int m = 100000;
   uint64_t cycles_before = perf_read_cycles();
   uint64_t instructions_before = perf_read_instructions();
 
diff --git a/src/simd_aarch64.cpp b/src/simd_aarch64.cpp
new file mode 100644
index 0000000..42525ef
--- /dev/null
+++ b/src/simd_aarch64.cpp
@@ -0,0 +1,312 @@
+#include "include/utils.h"
+#include <arm_neon.h>
+#include <arm_sve.h>
+#include <time.h>
+#include <unistd.h>
+
+int res = 0;
+const int n = 4000;
+uint32_t array[n] = {0};
+const int repeat = 1200;
+const int unroll = 16;
+
+#ifdef SVE_FP32_ADD
+void test1(float *indices) {
+  float tmp[svcntw()];
+  svbool_t pg = svptrue_b32();
+  svfloat32_t v0 = svdup_f32(1.0);
+  svfloat32_t v1 = svld1_f32(pg, indices);
+  for (int i = 0; i < n; i++) {
+    v1 = svadd_f32_z(pg, v1, v0);
+    v1 = svadd_f32_z(pg, v1, v0);
+    v1 = svadd_f32_z(pg, v1, v0);
+    v1 = svadd_f32_z(pg, v1, v0);
+    v1 = svadd_f32_z(pg, v1, v0);
+    v1 = svadd_f32_z(pg, v1, v0);
+    v1 = svadd_f32_z(pg, v1, v0);
+    v1 = svadd_f32_z(pg, v1, v0);
+    v1 = svadd_f32_z(pg, v1, v0);
+    v1 = svadd_f32_z(pg, v1, v0);
+    v1 = svadd_f32_z(pg, v1, v0);
+    v1 = svadd_f32_z(pg, v1, v0);
+    v1 = svadd_f32_z(pg, v1, v0);
+    v1 = svadd_f32_z(pg, v1, v0);
+    v1 = svadd_f32_z(pg, v1, v0);
+    v1 = svadd_f32_z(pg, v1, v0);
+  }
+  svst1_f32(pg, tmp, v1);
+  res += tmp[0];
+}
+#endif
+
+#ifdef NEON_FP32_ADD
+void test1(float *indices) {
+  float tmp[4];
+  float32x4_t v0 = vdupq_n_f32(1.0);
+  float32x4_t v1 = vld1q_f32(indices);
+  for (int i = 0; i < n; i++) {
+    v1 = vaddq_f32(v1, v0);
+    v1 = vaddq_f32(v1, v0);
+    v1 = vaddq_f32(v1, v0);
+    v1 = vaddq_f32(v1, v0);
+    v1 = vaddq_f32(v1, v0);
+    v1 = vaddq_f32(v1, v0);
+    v1 = vaddq_f32(v1, v0);
+    v1 = vaddq_f32(v1, v0);
+    v1 = vaddq_f32(v1, v0);
+    v1 = vaddq_f32(v1, v0);
+    v1 = vaddq_f32(v1, v0);
+    v1 = vaddq_f32(v1, v0);
+    v1 = vaddq_f32(v1, v0);
+    v1 = vaddq_f32(v1, v0);
+    v1 = vaddq_f32(v1, v0);
+    v1 = vaddq_f32(v1, v0);
+  }
+  vst1q_f32(tmp, v1);
+  res += tmp[0];
+}
+#endif
+
+#ifdef SVE_FP64_ADD
+void test1(double *indices) {
+  double tmp[svcntd()];
+  svbool_t pg = svptrue_b64();
+  svfloat64_t v0 = svdup_f64(1.0);
+  svfloat64_t v1 = svld1_f64(pg, indices);
+  for (int i = 0; i < n; i++) {
+    v1 = svadd_f64_z(pg, v1, v0);
+    v1 = svadd_f64_z(pg, v1, v0);
+    v1 = svadd_f64_z(pg, v1, v0);
+    v1 = svadd_f64_z(pg, v1, v0);
+    v1 = svadd_f64_z(pg, v1, v0);
+    v1 = svadd_f64_z(pg, v1, v0);
+    v1 = svadd_f64_z(pg, v1, v0);
+    v1 = svadd_f64_z(pg, v1, v0);
+    v1 = svadd_f64_z(pg, v1, v0);
+    v1 = svadd_f64_z(pg, v1, v0);
+    v1 = svadd_f64_z(pg, v1, v0);
+    v1 = svadd_f64_z(pg, v1, v0);
+    v1 = svadd_f64_z(pg, v1, v0);
+    v1 = svadd_f64_z(pg, v1, v0);
+    v1 = svadd_f64_z(pg, v1, v0);
+    v1 = svadd_f64_z(pg, v1, v0);
+  }
+  svst1_f64(pg, tmp, v1);
+  res += tmp[0];
+}
+#endif
+
+#ifdef NEON_FP64_ADD
+void test1(double *indices) {
+  double tmp[2];
+  float64x2_t v0 = vdupq_n_f64(1.0);
+  float64x2_t v1 = vld1q_f64(indices);
+  for (int i = 0; i < n; i++) {
+    v1 = vaddq_f64(v1, v0);
+    v1 = vaddq_f64(v1, v0);
+    v1 = vaddq_f64(v1, v0);
+    v1 = vaddq_f64(v1, v0);
+    v1 = vaddq_f64(v1, v0);
+    v1 = vaddq_f64(v1, v0);
+    v1 = vaddq_f64(v1, v0);
+    v1 = vaddq_f64(v1, v0);
+    v1 = vaddq_f64(v1, v0);
+    v1 = vaddq_f64(v1, v0);
+    v1 = vaddq_f64(v1, v0);
+    v1 = vaddq_f64(v1, v0);
+    v1 = vaddq_f64(v1, v0);
+    v1 = vaddq_f64(v1, v0);
+    v1 = vaddq_f64(v1, v0);
+    v1 = vaddq_f64(v1, v0);
+  }
+  vst1q_f64(tmp, v1);
+  res += tmp[0];
+}
+#endif
+
+#ifdef SVE_FP32_FMA
+void test1(float *indices) {
+  float tmp[svcntw()];
+  svbool_t pg = svptrue_b32();
+  svfloat32_t v0 = svdup_f32(1.0);
+  svfloat32_t v1 = svld1_f32(pg, indices);
+  for (int i = 0; i < n; i++) {
+    v1 = svmla_n_f32_z(pg, v1, v0, 1.0);
+    v1 = svmla_n_f32_z(pg, v1, v0, 1.0);
+    v1 = svmla_n_f32_z(pg, v1, v0, 1.0);
+    v1 = svmla_n_f32_z(pg, v1, v0, 1.0);
+    v1 = svmla_n_f32_z(pg, v1, v0, 1.0);
+    v1 = svmla_n_f32_z(pg, v1, v0, 1.0);
+    v1 = svmla_n_f32_z(pg, v1, v0, 1.0);
+    v1 = svmla_n_f32_z(pg, v1, v0, 1.0);
+    v1 = svmla_n_f32_z(pg, v1, v0, 1.0);
+    v1 = svmla_n_f32_z(pg, v1, v0, 1.0);
+    v1 = svmla_n_f32_z(pg, v1, v0, 1.0);
+    v1 = svmla_n_f32_z(pg, v1, v0, 1.0);
+    v1 = svmla_n_f32_z(pg, v1, v0, 1.0);
+    v1 = svmla_n_f32_z(pg, v1, v0, 1.0);
+    v1 = svmla_n_f32_z(pg, v1, v0, 1.0);
+    v1 = svmla_n_f32_z(pg, v1, v0, 1.0);
+  }
+  svst1_f32(pg, tmp, v1);
+  res += tmp[0];
+}
+#endif
+
+#ifdef NEON_FP32_FMA
+void test1(float *indices) {
+  float tmp[4];
+  float32x4_t v0 = vdupq_n_f32(1.0);
+  float32x4_t v1 = vld1q_f32(indices);
+  for (int i = 0; i < n; i++) {
+    v1 = vfmaq_n_f32(v1, v0, 1.0);
+    v1 = vfmaq_n_f32(v1, v0, 1.0);
+    v1 = vfmaq_n_f32(v1, v0, 1.0);
+    v1 = vfmaq_n_f32(v1, v0, 1.0);
+    v1 = vfmaq_n_f32(v1, v0, 1.0);
+    v1 = vfmaq_n_f32(v1, v0, 1.0);
+    v1 = vfmaq_n_f32(v1, v0, 1.0);
+    v1 = vfmaq_n_f32(v1, v0, 1.0);
+    v1 = vfmaq_n_f32(v1, v0, 1.0);
+    v1 = vfmaq_n_f32(v1, v0, 1.0);
+    v1 = vfmaq_n_f32(v1, v0, 1.0);
+    v1 = vfmaq_n_f32(v1, v0, 1.0);
+    v1 = vfmaq_n_f32(v1, v0, 1.0);
+    v1 = vfmaq_n_f32(v1, v0, 1.0);
+    v1 = vfmaq_n_f32(v1, v0, 1.0);
+    v1 = vfmaq_n_f32(v1, v0, 1.0);
+  }
+  vst1q_f32(tmp, v1);
+  res += tmp[0];
+}
+#endif
+
+#ifdef SVE_FP64_FMA
+void test1(double *indices) {
+  double tmp[svcntd()];
+  svbool_t pg = svptrue_b64();
+  svfloat64_t v0 = svdup_f64(1.0);
+  svfloat64_t v1 = svld1_f64(pg, indices);
+  for (int i = 0; i < n; i++) {
+    v1 = svmla_n_f64_z(pg, v1, v0, 1.0);
+    v1 = svmla_n_f64_z(pg, v1, v0, 1.0);
+    v1 = svmla_n_f64_z(pg, v1, v0, 1.0);
+    v1 = svmla_n_f64_z(pg, v1, v0, 1.0);
+    v1 = svmla_n_f64_z(pg, v1, v0, 1.0);
+    v1 = svmla_n_f64_z(pg, v1, v0, 1.0);
+    v1 = svmla_n_f64_z(pg, v1, v0, 1.0);
+    v1 = svmla_n_f64_z(pg, v1, v0, 1.0);
+    v1 = svmla_n_f64_z(pg, v1, v0, 1.0);
+    v1 = svmla_n_f64_z(pg, v1, v0, 1.0);
+    v1 = svmla_n_f64_z(pg, v1, v0, 1.0);
+    v1 = svmla_n_f64_z(pg, v1, v0, 1.0);
+    v1 = svmla_n_f64_z(pg, v1, v0, 1.0);
+    v1 = svmla_n_f64_z(pg, v1, v0, 1.0);
+    v1 = svmla_n_f64_z(pg, v1, v0, 1.0);
+    v1 = svmla_n_f64_z(pg, v1, v0, 1.0);
+  }
+  svst1_f64(pg, tmp, v1);
+  res += tmp[0];
+}
+#endif
+
+#ifdef NEON_FP64_FMA
+void test1(double *indices) {
+  double tmp[2];
+  float64x2_t v0 = vdupq_n_f64(1.0);
+  float64x2_t v1 = vld1q_f64(indices);
+  for (int i = 0; i < n; i++) {
+    v1 = vfmaq_n_f64(v1, v0, 1.0);
+    v1 = vfmaq_n_f64(v1, v0, 1.0);
+    v1 = vfmaq_n_f64(v1, v0, 1.0);
+    v1 = vfmaq_n_f64(v1, v0, 1.0);
+    v1 = vfmaq_n_f64(v1, v0, 1.0);
+    v1 = vfmaq_n_f64(v1, v0, 1.0);
+    v1 = vfmaq_n_f64(v1, v0, 1.0);
+    v1 = vfmaq_n_f64(v1, v0, 1.0);
+    v1 = vfmaq_n_f64(v1, v0, 1.0);
+    v1 = vfmaq_n_f64(v1, v0, 1.0);
+    v1 = vfmaq_n_f64(v1, v0, 1.0);
+    v1 = vfmaq_n_f64(v1, v0, 1.0);
+    v1 = vfmaq_n_f64(v1, v0, 1.0);
+    v1 = vfmaq_n_f64(v1, v0, 1.0);
+    v1 = vfmaq_n_f64(v1, v0, 1.0);
+    v1 = vfmaq_n_f64(v1, v0, 1.0);
+  }
+  vst1q_f64(tmp, v1);
+  res += tmp[0];
+}
+#endif
+
+int main(int argc, char *argv[]) {
+
+  int opt;
+  while ((opt = getopt(argc, argv, "")) != -1) {
+    switch (opt) {
+    default:
+      fprintf(stderr, "Usage: %s [-p]\n", argv[0]);
+      exit(EXIT_FAILURE);
+    }
+  }
+
+  bind_to_core();
+  setup_perf_instructions();
+  setup_perf_cycles();
+
+  // int indices[] = {0, 1, 2, 3, 4, 5, 6, 7};
+#if defined(NEON_FP32_ADD) || defined(NEON_FP32_FMA)
+  const int vlen = 4;
+  float indices[vlen];
+#endif
+
+#if defined(NEON_FP64_ADD) || defined(NEON_FP64_FMA)
+  const int vlen = 2;
+  double indices[vlen];
+#endif
+
+#if defined(SVE_FP32_ADD) || defined(SVE_FP32_FMA)
+  const int vlen = svcntw();
+  float indices[vlen];
+#endif
+
+#if defined(SVE_FP64_ADD) || defined(SVE_FP64_FMA)
+  const int vlen = svcntd();
+  double indices[vlen];
+#endif
+  for (int i = 0; i < vlen; i++) {
+    indices[i] = i + 1.0;
+  }
+
+  // printf("Numbers:");
+  // for (int i = 0; i < vlen; i++) {
+  //   // generate patterns
+  //   printf(" %d", indices[i]);
+  //   array[indices[i]] = indices[i];
+  // }
+  // printf("\n");
+
+  int warmup = 1000;
+
+  for (int i = 0; i < warmup; i++) {
+    test1(indices);
+  }
+
+  int m = 50000;
+  uint64_t cycles_before = perf_read_cycles();
+  uint64_t instructions_before = perf_read_instructions();
+
+  for (int i = 0; i < m; i++) {
+    test1(indices);
+  }
+
+  uint64_t cycles_after = perf_read_cycles();
+  uint64_t instructions_after = perf_read_instructions();
+
+  printf("%ld cycles, %ld instructions, %.2lf ipc, %d ans\n",
+         (cycles_after - cycles_before) / m / repeat / unroll,
+         (instructions_after - instructions_before) / m / repeat / unroll,
+         (double)(instructions_after - instructions_before) /
+             (cycles_after - cycles_before),
+         res);
+  return 0;
+}
diff --git a/src/uarch.cpp b/src/uarch.cpp
index dbc3b70..7c292fa 100644
--- a/src/uarch.cpp
+++ b/src/uarch.cpp
@@ -74,6 +74,9 @@ enum uarch get_uarch_inner() {
   // arm64
   int implementer = 0;
   int part = 0;
+  bool sve = false;
+  bool avx512f = false;
+  bool avx2 = false;
 
   while (std::getline(t, line)) {
     size_t pos = line.find(':');
@@ -91,11 +94,33 @@ enum uarch get_uarch_inner() {
         part = std::stoi(value, nullptr, 16);
       } else if (key == "Model Name" && value == " Loongson-3C5000") {
         return la464;
+      } else if (key == "flags") {
+        if (value.find("avx512f") != std::string::npos && !avx512f) {
+          avx512f = true;
+        }
+        if (value.find("avx2") != std::string::npos && !avx2) {
+          avx2 = true;
+        }
+      } else if (key == "Features") {
+        if (value.find("sve") != std::string::npos && !sve) {
+          sve = true;
+        }
       }
     }
   }
   fprintf(stderr, "Found CPU family %d, model %d, implementer %d, part %d\n",
           family, model, implementer, part);
+
+  if (avx2) {
+    fprintf(stdout, "AVX2 detected\n");
+  }
+  if (avx512f) {
+    fprintf(stdout, "AVX512F detected\n");
+  }
+  if (sve) {
+    fprintf(stdout, "SVE detected\n");
+  }
+
   if (family == 6 && model == 183) {
     fprintf(stderr, "Intel Raptor Lake detected\n");
     fprintf(stderr, "Configured for Golden Cove\n");
@@ -113,6 +138,9 @@ enum uarch get_uarch_inner() {
     fprintf(stderr, "Intel Broadwell detected\n");
     return broadwell;
     // https://en.wikichip.org/wiki/amd/cpuid
+  } else if (family == 6 && model == 142) {
+    fprintf(stderr, "Intel Whiskylake detected\n");
+    return skylake;
   } else if (family == 23 && model == 1) {
     fprintf(stderr, "AMD Zen 1 detected\n");
     return zen1;
@@ -135,6 +163,18 @@ enum uarch get_uarch_inner() {
   } else if (implementer == 0x41 && part == 0xd0d) {
     fprintf(stderr, "ARM Cortex A77 detected\n");
     return cortex_a77;
+  } else if (implementer == 0x41 && part == 0xd41) {
+    fprintf(stderr, "ARM Cortex A78 detected\n");
+    return cortex_a78;
+  } else if (implementer == 0x41 && part == 0xd05) {
+    fprintf(stderr, "ARM Cortex A55 detected\n");
+    return cortex_a55;
+  } else if (implementer == 0x41 && part == 0xd03) {
+    fprintf(stderr, "ARM Cortex A75 detected\n");
+    return cortex_a53;
+  } else if (implementer == 0x41 && part == 0xd09) {
+    fprintf(stderr, "ARM Cortex A76 detected\n");
+    return cortex_a73;
   } else if (implementer == 0x41 && part == 0xd40) {
     fprintf(stderr, "ARM Neoverse V1 detected\n");
     return neoverse_v1;