Use pointers to pointers for depthwise and create specific functions for conv2d to save pointer state after a function call

kiyaev · JaccovG · commit 536cdb23a7b6 · 2020-03-06T13:24:33.000+01:00
diff --git a/lib/src/kernels/convolution/mli_krn_conv2d_hwc.h b/lib/src/kernels/convolution/mli_krn_conv2d_hwc.h
@@ -83,9 +83,9 @@ static __attribute__ ((always_inline)) void depthwise_convolution2D_hwcn_nopad(
     const int out_increment_in_ch_loop = channel_per_loop - out_compensation_row_loop;
     const int out_increment_in_ch_loop_v = channels_per_loop_v - out_compensation_row_loop;
 
-    MLI_PTR(io_T) __restrict in_ptr = (MLI_PTR(io_T) __restrict)in_ftrs;
+    const MLI_PTR(io_T) __restrict in_ptr = (MLI_PTR(io_T) __restrict)in_ftrs;
     MLI_CONV_OUT_PTR(io_T) __restrict out_ptr = (MLI_CONV_OUT_PTR(io_T) __restrict)out_ftrs;
-    MLI_PTR(w_T) __restrict w_ptr = (MLI_PTR(w_T) __restrict)weights;
+    const MLI_PTR(w_T) __restrict w_ptr = (MLI_PTR(w_T) __restrict)weights;
     // MLI_PTR(w_T) __restrict w_ptr_local = (MLI_PTR(w_T) __restrict)weights;
     int out_ch_idx = 0;
 
@@ -114,20 +114,21 @@ static __attribute__ ((always_inline)) void depthwise_convolution2D_hwcn_nopad(
                 __builtin_assume(amount_columns > 0);
 
                 __v2i32_t v2accu_dotprod = v2acc_weights_add;
-                dotprod2D_hwc_v(in_ptr, w_ptr, &v2accu_dotprod, kernel_width, kernel_height,
-                                in_col_step, in_row_step, krn_col_step, krn_row_step);
-                in_ptr += in_increment_clmn_loop;
                 for (int W_idx = 0; W_idx < amount_columns; W_idx++) {
+                    dotprod2D_hwc_v(&in_ptr, &w_ptr, &v2accu_dotprod, kernel_width, kernel_height,
+                                    in_col_step, in_row_step, krn_col_step, krn_row_step);
+                    // compensate for the input tensor pointer increment done inside dotprod2D_hwc_v
+                    in_ptr += in_increment_clmn_loop - kernel_height * in_row_step;
+                    // compensate for the weights pointer increment done inside dotprod2D_hwc_v
+                    w_ptr -= kernel_height * krn_row_step;
+
                     // Cast result to output type
                     mli_prv_clip_relu_store_output_v(out_ptr, &v2accu_dotprod, v2quant_params, val_min_limit, val_max_limit);
                     out_ptr += out_increment_clmn_loop;
 
                     v2accu_dotprod = v2acc_weights_add;
-                    dotprod2D_hwc_v(in_ptr, w_ptr, &v2accu_dotprod, kernel_width, kernel_height,
-                                    in_col_step, in_row_step, krn_col_step, krn_row_step);
-                    in_ptr += in_increment_clmn_loop;
                 } // for W_idx
-                in_ptr += in_increment_row_loop - stride_width * filters * in_ch;
+                in_ptr += in_increment_row_loop;
                 out_ptr += out_increment_row_loop;
             } // for H_idx
             in_ptr -= in_compensation_row_loop;
@@ -154,28 +155,31 @@ static __attribute__ ((always_inline)) void depthwise_convolution2D_hwcn_nopad(
                     // out_val = (x-x_zp)*(w) + b) = -sum_i(w*x_zp) + sum(x*w) + b
                     //============================================
                     __v2i32_t accu = v2global_other_additives;
-                    accu = dotprod2D_inp_width_v(in_ptr, w_ptr, &accu, kernel_width, kernel_height,
+                    accu = dotprod2D_inp_width_v(&in_ptr, &w_ptr, &accu, kernel_width, kernel_height,
                                         in_col_step, in_row_step, krn_col_step, krn_row_step, in_increment_clmn_loop);
-
+                    // compensate for the input tensor pointer increment done inside dotprod2D_inp_width_v
+                    in_ptr += 2 * in_increment_clmn_loop - kernel_height * in_row_step;
+                    // compensate for the weights pointer increment done inside dotprod2D_inp_width_v
+                    w_ptr -= kernel_height * krn_row_step;
                     // Cast result to output type
                     mli_prv_clip_relu_store_output_inp_width_v(out_ptr, &accu, &quant_params, val_min_limit, val_max_limit, out_increment_clmn_loop);
-
-                    in_ptr += 2 * in_increment_clmn_loop;
                     out_ptr += 2 * out_increment_clmn_loop;
                 } // for W_idx
                 if( amount_columns & 0x1) {
                     // Convolution core. Here calculations performes in a unfolded expression way: 
                     // out_val = (x-x_zp)*(w) + b) = -sum_i(w*x_zp) + sum(x*w) + b
                     //============================================
                     acc_T accu = 0;
-                    accu = dotprod2D(in_ptr, w_ptr, accu, kernel_width, kernel_height,
+                    accu = dotprod2D(&in_ptr, &w_ptr, accu, kernel_width, kernel_height,
                                         in_col_step, in_row_step, krn_col_step, krn_row_step);
+                    // compensate for the input tensor pointer increment done inside dotprod2D
+                    in_ptr += in_increment_clmn_loop - kernel_height * in_row_step;
+                    // compensate for the weights pointer increment done inside dotprod2D
+                    w_ptr -= kernel_height * krn_row_step;
                     accu += global_other_additives;
 
                     // Cast result to output type
                     mli_prv_clip_relu_store_output(out_ptr, accu, &quant_params, val_min_limit, val_max_limit);
-
-                    in_ptr += in_increment_clmn_loop;
                     out_ptr += out_increment_clmn_loop;
                 }
                 in_ptr += in_increment_row_loop;
@@ -882,3 +886,4 @@ LOOP_PIPELINE_ENABLE
 
 #endif // _MLI_KRN_CONV2D_HWC_H_
 
+
diff --git a/lib/src/kernels/convolution/mli_krn_dotprod.h b/lib/src/kernels/convolution/mli_krn_dotprod.h
@@ -75,6 +75,130 @@ static void __attribute__ ((always_inline)) dotprod2D_hwc_v (
     }
 }
 
+// 2D multiply-accumulate over a width x height window: every step hands the current *in and *krn to mli_prv_load_mac_vec2 with accu.
+// in and krn point at the caller's pointers, which are advanced in place: on return *in has moved by height * in_row_step
+// and *krn by height * kern_row_step (the step values as passed in). The caller has to compensate for that movement.
+template < typename in_T, typename w_T, typename acc_T >
+static void __attribute__ ((always_inline)) dotprod2D_hwc_d (
+        const MLI_PTR(in_T) __restrict *in, // in/out: address of the caller's input pointer
+        const MLI_PTR(w_T) __restrict *krn, // in/out: address of the caller's kernel pointer
+        acc_T * accu,        // accumulator, handed by pointer to the MAC helper
+        const int width, // inner (column) loop trip count
+        const int height, // outer (row) loop trip count
+        int in_col_step, // added to *in after every column
+        int in_row_step, // distance between row starts of the input (adjusted below)
+        int kern_col_step, // added to *krn after every column
+        int kern_row_step) { // distance between row starts of the kernel (adjusted below)
+    in_row_step -= width * in_col_step; // the column loop already moves width * col_step per row,
+    kern_row_step -= width * kern_col_step; // so keep only the remainder: each row then nets exactly one row step
+
+#pragma clang loop unroll(full)
+    for (int32_t row = 0; row < height; row++) {
+#pragma clang loop unroll(full)
+        for (int32_t clmn = 0; clmn < width; clmn++) {
+            mli_prv_load_mac_vec2 (accu, *in, *krn);
+            *krn += kern_col_step; // writes through to the caller's kernel pointer
+            *in += in_col_step; // writes through to the caller's input pointer
+        }
+        *in += in_row_step; // move from the end of this row to the start of the next one
+        *krn += kern_row_step;
+    }
+}
+
+// 2D multiply-accumulate over a width x height window: each step loads from *krn and *in with mli_prv_load_2_samples
+// and feeds both vectors to mli_math_mac_fx_vec2 with accu. in and krn point at the caller's pointers, which are advanced
+// in place: *in ends up moved by height * in_row_step and *krn by height * kern_row_step. The caller has to compensate.
+template < typename in_T, typename w_T, typename acc_T >
+static void __attribute__ ((always_inline)) dotprod2D_hwc_v (
+        const MLI_PTR(in_T) __restrict *in, // in/out: address of the caller's input pointer
+        const MLI_PTR(w_T) __restrict *krn, // in/out: address of the caller's kernel pointer
+        acc_T * accu,        // accumulator, handed by pointer to the MAC helper
+        const int width, // inner (column) loop trip count
+        const int height, // outer (row) loop trip count
+        int in_col_step, // added to *in after every column
+        int in_row_step, // distance between row starts of the input (adjusted below)
+        int kern_col_step, // added to *krn after every column
+        int kern_row_step) { // distance between row starts of the kernel (adjusted below)
+
+    in_row_step -= width * in_col_step; // the column loop already moves width * col_step per row,
+    kern_row_step -= width * kern_col_step; // so keep only the remainder: each row then nets exactly one row step
+#pragma clang loop unroll(full)
+    for (int32_t row = 0; row < height; row++) {
+#pragma clang loop unroll(full)
+        for (int32_t clmn = 0; clmn < width; clmn++) {
+            v2q15_t k_v = mli_prv_load_2_samples(*krn); // kernel values are loaded first, then the pointer is bumped
+            *krn += kern_col_step;
+            v2q15_t tx = mli_prv_load_2_samples(*in); // then the matching input values
+            *in += in_col_step;
+            mli_math_mac_fx_vec2 (accu, tx, k_v);
+        }
+        *in += in_row_step; // move from the end of this row to the start of the next one
+        *krn += kern_row_step;
+    }
+}
+
+// 2D multiply-accumulate producing two lanes at once: lane 0 uses the input at the current position, lane 1 the input
+// in_width_step elements further on, and both lanes are multiplied by the same scalar kernel value. Returns *accu.
+// inp and krn point at the caller's pointers; on return they have moved by height * row_step each and the caller must compensate.
+template <typename io_T, typename w_T, typename acc_T>
+static acc_T __attribute__ ((always_inline)) dotprod2D_inp_width_v(
+        const MLI_PTR(io_T) __restrict *inp, // in/out: address of the caller's input pointer
+        const MLI_PTR(w_T)  __restrict *krn, // in/out: address of the caller's kernel pointer
+        acc_T *accu, // accumulator, handed by pointer to the MAC helper and returned by value
+        const int width, // inner (column) loop trip count
+        const int height, // outer (row) loop trip count
+        int in_col_step, // added to *inp after every column
+        int in_row_step, // distance between row starts of the input (adjusted below)
+        int kern_col_step, // added to *krn after every column
+        int kern_row_step, // distance between row starts of the kernel (adjusted below)
+        int in_width_step) { // element offset between the two input samples processed together
+    in_row_step -= width * in_col_step; // the column loop already moves width * col_step per row,
+    kern_row_step -= width * kern_col_step; // so keep only the remainder: each row then nets exactly one row step
+#pragma clang loop unroll(full)
+    for (int row = 0; row < height; row++) {
+#pragma clang loop unroll(full)
+        for (int clmn = 0; clmn < width; clmn++) {
+            int16_t k = **krn; // one kernel value, converted to 16 bit
+            v2q15_t k_v = { k, k }; // duplicated into both lanes
+            v2q15_t in_v = {(*inp)[0], (*inp)[in_width_step]}; // two input samples in_width_step apart
+            mli_math_mac_fx_vec2(accu, in_v, k_v);
+            *inp += in_col_step; // writes through to the caller's input pointer
+            *krn += kern_col_step; // writes through to the caller's kernel pointer
+        }
+        *inp += in_row_step; // move from the end of this row to the start of the next one
+        *krn += kern_row_step;
+    }
+    return *accu;
+}
+
+// Scalar 2D multiply-accumulate over a width x height window: accu = mli_math_mac_fx(accu, **in, **krn) per element.
+// accu is taken by value and the final value is returned. in and krn point at the caller's pointers, which are advanced
+// in place: *in ends up moved by height * in_row_step and *krn by height * kern_row_step. The caller has to compensate.
+template <typename io_T, typename w_T, typename acc_T>
+static acc_T __attribute__ ((always_inline)) dotprod2D(
+        const MLI_PTR(io_T) __restrict *in, // in/out: address of the caller's input pointer
+        const MLI_PTR(w_T)  __restrict *krn, // in/out: address of the caller's kernel pointer
+        acc_T accu, // starting accumulator value
+        const int width, // inner (column) loop trip count
+        const int height, // outer (row) loop trip count
+        int in_col_step, // added to *in after every column
+        int in_row_step, // distance between row starts of the input (adjusted below)
+        int kern_col_step, // added to *krn after every column
+        int kern_row_step) { // distance between row starts of the kernel (adjusted below)
+    in_row_step -= width * in_col_step; // the column loop already moves width * col_step per row,
+    kern_row_step -= width * kern_col_step; // so keep only the remainder: each row then nets exactly one row step
+    for (int row = 0; row < height; row++) {
+        for (int clmn = 0; clmn < width; clmn++) {
+            accu = mli_math_mac_fx(accu, (**in), (**krn));
+            *in += in_col_step; // writes through to the caller's input pointer
+            *krn += kern_col_step; // writes through to the caller's kernel pointer
+        }
+        *in += in_row_step; // move from the end of this row to the start of the next one
+        *krn += kern_row_step;
+    }
+    return accu;
+}
+
 template < typename in_T, typename w_T, typename acc_T >
 static void __attribute__ ((always_inline)) dotprod2D_hwc_d (
         const MLI_PTR(in_T) __restrict in, 
diff --git a/lib/src/kernels/pooling/mli_krn_reduce_sum2d.h b/lib/src/kernels/pooling/mli_krn_reduce_sum2d.h
@@ -298,6 +298,48 @@ inline acc_T __attribute__((always_inline)) reduce_sum2D(
     return accu;
 }
 
+// Multiply-accumulates (mli_math_mac_fx_vec2) the samples of a width x height window with {mul, mul} into *v2acc; returns *v2acc.
+// in points at the caller's pointer, which is advanced in place and must be compensated by the caller. Net movement of *in:
+// height * in_row_step when width == 1 or in the general case, but width * in_col_step when height == 1 and width != 1.
+template <typename io_T, typename acc_T>
+static inline acc_T __attribute__((always_inline)) reduce_sum2D_v(
+        const MLI_PTR(io_T) __restrict *in, // in/out: address of the caller's input pointer
+        const int16_t mul, // scalar multiplier, duplicated into both lanes below
+        acc_T *v2acc, // accumulator, handed by pointer to the MAC helper and returned by value
+
+        const int width, // column count of the window
+        const int height, // row count of the window
+        int in_col_step, // added to *in after every column
+        int in_row_step) { // distance between row starts (adjusted in the general case only)
+
+    v2q15_t v2mul = {mul, mul};
+    if (width == 1){ // single column: walk straight down, one row step per sample
+#pragma clang loop unroll(full)
+        for (int row = 0; row < height; row++) {
+            mli_math_mac_fx_vec2(v2acc, mli_prv_load_2_samples(*in), v2mul);
+            *in += in_row_step;
+        }
+    } else if (height == 1){ // single row: walk along it; no row step is ever applied in this branch
+#pragma clang loop unroll(full)
+        for (int clmn = 0; clmn < width; clmn++) {
+            mli_math_mac_fx_vec2(v2acc, mli_prv_load_2_samples(*in), v2mul);
+            *in += in_col_step;
+        }
+    } else {
+        in_row_step -= width * in_col_step; // column loop already moves width * col_step; keep only the remainder
+#pragma clang loop unroll(full)
+        for (int row = 0; row < height; row++) {
+#pragma clang loop unroll(full)
+            for (int clmn = 0; clmn < width; clmn++) {
+                mli_math_mac_fx_vec2(v2acc, mli_prv_load_2_samples(*in), v2mul);
+                *in += in_col_step;
+            }
+            *in += in_row_step; // move from the end of this row to the start of the next one
+        }
+    }
+    return *v2acc;
+}
+
 template <typename io_T, typename acc_T>
 static inline acc_T __attribute__((always_inline)) reduce_sum2D_v(
         const MLI_PTR(io_T) __restrict in,
diff --git a/lib/src/private/mli_prv_quant.h b/lib/src/private/mli_prv_quant.h
@@ -125,6 +125,25 @@ inline mli_acc32_t __attribute__ ((always_inline)) weights_additive(
         return init_accum;
 }
 
+// Returns the S8ASYM zero-point term -(in_offset * sum of weights) over a width x height window, accumulated into *init_accum
+// by reduce_sum2D_v. weights points at the caller's pointer: when in_offset != 0 it is advanced by reduce_sum2D_v and is NOT
+// rewound here, so the caller must compensate; when in_offset == 0 nothing is read and *weights is left untouched.
+template <typename acc_T>
+inline acc_T __attribute__ ((always_inline)) weights_additive_v(
+        const MLI_PTR(int8_t) __restrict *weights, acc_T *init_accum,
+        const s8asym_quant_specific_params* quant_params,
+        const int width,  const int height, int col_step, int row_step) {
+
+    // returns -(in_zero_point * cumsum(weights)) For S8ASYM
+    if (quant_params->in_offset != 0) {
+        // No rewind is attempted here: arithmetic on `weights` itself would only move this function's
+        // local pointer-to-pointer (invisible to the caller, and out-of-range pointer arithmetic).
+        // The advance of *weights made by reduce_sum2D_v is therefore left for the caller to undo.
+        return reduce_sum2D_v(weights, -quant_params->in_offset, init_accum, width, height, col_step, row_step);
+    } else {
+        return *init_accum;
+    }
+}
 
 template <typename acc_T>
 inline acc_T __attribute__ ((always_inline)) weights_additive_v(