@@ -101,6 +101,8 @@ struct jit_uni_kernel_t : public jit_uni_eltwise_kernel_t {
101101
102102 void generate () override {
103103 const bool is_fwd = pd_->is_fwd ();
104+ // Note: load type may not be the same as compute type
105+ const auto simd_elems_per_load = simd_elems (data_type (), isa);
104106
105107 preamble ();
106108 XReg param = param1;
@@ -116,7 +118,7 @@ struct jit_uni_kernel_t : public jit_uni_eltwise_kernel_t {
116118 ldr (reg_work_amount, ptr (X_TMP_0));
117119 eltwise_injector_->load_table_addr ();
118120 Label vectorized_loop_start, remainder_loop_start, remainder_loop_end;
119- cmp (reg_work_amount, simd_w () );
121+ cmp (reg_work_amount, simd_elems_per_load );
120122 b (LT, remainder_loop_start);
121123 L (vectorized_loop_start);
122124
@@ -163,17 +165,17 @@ struct jit_uni_kernel_t : public jit_uni_eltwise_kernel_t {
163165 }
164166 }
165167
166- const auto shift = vlen ();
167168 store_vector (reg_dst, vmm_src.s );
168169 // Update pointers for the next iteration
169170 // Note: we use X_TMP_0 as a temporary register to avoid conflicts with
170171 // other registers.
171- add_imm (reg_src, reg_src, shift, X_TMP_0);
172- add_imm (reg_dst, reg_dst, shift, X_TMP_0);
173- if (!is_fwd) add_imm (reg_diff_dst, reg_diff_dst, shift, X_TMP_0);
172+ add_imm (reg_src, reg_src, simd_bytes (isa), X_TMP_0);
173+ add_imm (reg_dst, reg_dst, simd_bytes (isa), X_TMP_0);
174+ if (!is_fwd)
175+ add_imm (reg_diff_dst, reg_diff_dst, simd_bytes (isa), X_TMP_0);
174176
175- sub_imm (reg_work_amount, reg_work_amount, simd_w () , X_TMP_0);
176- cmp (reg_work_amount, simd_w () );
177+ sub_imm (reg_work_amount, reg_work_amount, simd_elems_per_load , X_TMP_0);
178+ cmp (reg_work_amount, simd_elems_per_load );
177179 b (GE, vectorized_loop_start);
178180
179181 // tail processing
@@ -214,7 +216,7 @@ struct jit_uni_kernel_t : public jit_uni_eltwise_kernel_t {
214216
215217 add_imm (reg_src, reg_src, dtype_size (), X_TMP_0);
216218 add_imm (reg_dst, reg_dst, dtype_size (), X_TMP_0);
217- add_imm (reg_diff_dst, reg_diff_dst, dtype_size (), X_TMP_0);
219+ if (!is_fwd) add_imm (reg_diff_dst, reg_diff_dst, dtype_size (), X_TMP_0);
218220 subs (reg_work_amount, reg_work_amount, 1 );
219221
220222 b (remainder_loop_start);
@@ -229,12 +231,6 @@ struct jit_uni_kernel_t : public jit_uni_eltwise_kernel_t {
229231private:
230232 using TReg = typename cpu_isa_traits<isa>::TReg;
231233 using TRegS = typename cpu_isa_traits<isa>::TRegS;
232- int vlen () {
233- // TODO: If we do decide to add a different enum for
234- // VLA SVE, we should handle this in cpu_isa_traits
235- return isa == asimd ? cpu_isa_traits<asimd>::vlen : get_sve_length ();
236- }
237- int simd_w () { return vlen () / dtype_size (); }
238234
239235 XReg reg_src = x11;
240236 XReg reg_dst = x8;
@@ -472,7 +468,8 @@ status_t jit_uni_eltwise_fwd_t<isa>::execute(const exec_ctx_t &ctx) const {
472468
473469 const memory_desc_wrapper data_d (pd ()->src_md ());
474470 const auto nelems = data_d.nelems (true );
475- const int simd_w = 64 / data_d.data_type_size ();
471+ // Number of elements in a cacheline. We don't want threads to share a cacheline.
472+ const int cacheline_elems = 64 / data_d.data_type_size ();
476473
477474 const data_type_t src_dt = pd ()->src_md ()->data_type ;
478475 const auto offset_bytes
@@ -484,9 +481,10 @@ status_t jit_uni_eltwise_fwd_t<isa>::execute(const exec_ctx_t &ctx) const {
484481 parallel (0 , [&](const int ithr, const int nthr) {
485482 dim_t start {0 }, end {0 };
486483
487- balance211 (utils::div_up (nelems, simd_w), nthr, ithr, start, end);
488- start = nstl::min (nelems, start * simd_w);
489- end = nstl::min (nelems, end * simd_w);
484+ balance211 (
485+ utils::div_up (nelems, cacheline_elems), nthr, ithr, start, end);
486+ start = nstl::min (nelems, start * cacheline_elems);
487+ end = nstl::min (nelems, end * cacheline_elems);
490488 if (start == end) return ;
491489
492490 jit_args_t args;
@@ -563,7 +561,8 @@ status_t jit_uni_eltwise_bwd_t<isa>::execute(const exec_ctx_t &ctx) const {
563561 const memory_desc_wrapper data_d (pd ()->data_md ());
564562 const memory_desc_wrapper diff_data_d (pd ()->diff_src_md ());
565563 const auto nelems = data_d.nelems (true );
566- const int simd_w = 64 / data_d.data_type_size ();
564+ // Number of elements in a cacheline. We don't want threads to share a cacheline.
565+ const int cacheline_elems = 64 / data_d.data_type_size ();
567566
568567 const data_type_t data_dt = pd ()->use_dst () ? pd ()->dst_md ()->data_type
569568 : pd ()->src_md ()->data_type ;
@@ -579,9 +578,10 @@ status_t jit_uni_eltwise_bwd_t<isa>::execute(const exec_ctx_t &ctx) const {
579578 parallel (0 , [&](const int ithr, const int nthr) {
580579 dim_t start {0 }, end {0 };
581580
582- balance211 (utils::div_up (nelems, simd_w), nthr, ithr, start, end);
583- start = nstl::min (nelems, start * simd_w);
584- end = nstl::min (nelems, end * simd_w);
581+ balance211 (
582+ utils::div_up (nelems, cacheline_elems), nthr, ithr, start, end);
583+ start = nstl::min (nelems, start * cacheline_elems);
584+ end = nstl::min (nelems, end * cacheline_elems);
585585 if (start == end) return ;
586586
587587 jit_args_t args;
0 commit comments