Add mli_prv_load_mac overloads that multiply-accumulate an array element with a scalar value

Fargk · Fargk · commit cd35b35818b0 · 2019-03-13T12:55:26.000+03:00
diff --git a/lib/src/kernels/eltwise/mli_krn_eltwise.h b/lib/src/kernels/eltwise/mli_krn_eltwise.h
@@ -357,7 +357,7 @@ static inline void __attribute__ ((always_inline)) eltwise_op_mul_fx (
         if ((out_size & 0x3) || (out_size < 0x7)) {
             for (int j = 0; j < (out_size & 0x3); j++) {
                 auto acc = mli_prv_init_accu((io_T)0);
-                mli_prv_load_mac(&acc, vec++, (const io_T *__restrict) &broadcast_val);
+                mli_prv_load_mac(&acc, vec++, broadcast_val);
                 mli_prv_clip_and_store_output(out++, &acc, mul_out_shift);
             }
             for (int j = 0; j < (out_size & ~0x3) / 2; j++) {
@@ -431,7 +431,7 @@ static inline void __attribute__ ((always_inline)) eltwise_op_mul_with_restricts
         if ((out_size & 0x3) || (out_size < 0x7)) {
             for (int j = 0; j < (out_size & 0x3); j++) {
                 auto acc = mli_prv_init_accu((io_T)0);
-                mli_prv_load_mac(&acc, vec++, (const io_T *__restrict) &broadcast_val);
+                mli_prv_load_mac(&acc, vec++, broadcast_val);
                 mli_prv_clip_and_store_output(out++, &acc, mul_out_shift);
             }
             for (int j = 0; j < (out_size & ~0x3) / 2; j++) {
diff --git a/lib/src/private/mli_prv_dsp.h b/lib/src/private/mli_prv_dsp.h
@@ -67,6 +67,12 @@ static inline void __attribute__ ((always_inline)) mli_prv_load_mac(
         MLI_PTR(in_T) __restrict in,
         MLI_PTR(w_T) __restrict k);
 
+template < typename in_T, typename w_T, typename acc_T >
+static inline void __attribute__ ((always_inline)) mli_prv_load_mac(
+        acc_T * accu,
+        MLI_PTR(in_T) __restrict in,
+        w_T k); /* overload taking the second operand 'k' by value instead of through a pointer */
+
 template < typename in_T, typename w_T, typename acc_T >
 static inline void __attribute__ ((always_inline)) mli_prv_load_mac_vec2(
         acc_T * accu,
@@ -550,6 +556,37 @@ static inline void __attribute__ ((always_inline)) mli_prv_load_mac(
     *accu = _dmachbl(*in, *(MLI_PTR(uint8_t)) k);
 }
 
+static inline void __attribute__ ((always_inline)) mli_prv_load_mac(
+        accum40_t * accu,
+        const MLI_PTR(int16_t) __restrict in,
+        const int16_t k) {
+    *accu = fx_a40_mac_q15(*accu, *in, k); /* 'k' is a scalar passed by value; only 'in' is dereferenced */
+}
+
+static inline void __attribute__ ((always_inline)) mli_prv_load_mac(
+        int32_t * accu,
+        const MLI_PTR(int8_t) __restrict in,
+        const int8_t k) {
+    /* The 'in' pointer is cast to unsigned to make sure no sign extension happens on the load;
+     * this way the 'second' byte contains zeros and it is safe to use dmac.
+     * The sign extension happens inside the dmachbl operation.
+     * 'k' is a scalar passed by value (not loaded); it needs sign extension because we need a 16bit value.
+     * The value of the second half is don't care because it will be multiplied by 0.
+     */
+    *accu = _dmachbl(k, *(MLI_PTR(uint8_t)) in);
+}
+
+static inline void __attribute__ ((always_inline)) mli_prv_load_mac(
+        int32_t * accu, const MLI_PTR(int16_t) __restrict in,
+        const int8_t k) {
+    /* The scalar 'k' (not the 'in' pointer) is cast to unsigned here to make sure it is not sign extended;
+     * this way the 'second' byte contains zeros and it is safe to use dmac.
+     * The sign extension happens inside the dmachbl operation.
+     * 'in' points to int16_t, so dereferencing it already yields the 16bit value we need.
+     * The value of the second half is don't care because it will be multiplied by 0.
+     */
+    *accu = _dmachbl(*in, (uint8_t)k);
+}
 
 static inline void __attribute__ ((always_inline)) mli_prv_load_mac_vec2(
         accum40_t * accu,