Merge pull request #56 from foss-for-synopsys-dwc-arc-processors/optimize

JaccovG · web-flow · commit 4f2688c8b24d · 2019-05-28T17:53:51.000+02:00
Optimize
diff --git a/README.md b/README.md
@@ -73,6 +73,15 @@ Building of embARC MLI library
 
 5. Result Quality shall be "S/N=1823.9     (65.2 db)"
 		
+## Optimizations for code size
+------------------------------
+By default the embARC MLI Library is built for optimal speed. If code size needs to be reduced, there are two things that can be done:
+1. For convolution and pooling layers there are specialized functions for specific kernel sizes; they are called by wrapper functions based on the parameters.
+These parameters are compile-time constants in the application, so the application can directly call the specialized functions. This will reduce the overall code size.
+Please be aware that the list of specializations is not guaranteed to be backwards compatible between releases.
+
+2. Use a different optimization mode when calling the makefile. OPTMODE=size will optimize for size. The default is OPTMODE=speed.
+	'gmake TCF_FILE=../../hw/em9d.tcf OPTMODE=size'
 
 ## Known Issues
 ---------------
diff --git a/build/rules.mk b/build/rules.mk
@@ -64,6 +64,8 @@ quote=$(subst %,$(Q)%, \
 # Global settings
 #=============================================================
 TOOLCHAIN ?= gnu
+# optimization mode
+OPTMODE ?= speed
 
 export DEBUG_BUILD?=ON
 #export ASM_OUT?=OFF
@@ -76,6 +78,13 @@ endif
 #    # CFLAGS += -Hon=Print_var_info
 #endif
 
+ifeq ($(OPTMODE),size)
+	CFLAGS += -O2 -Hlto
+endif
+ifeq ($(OPTMODE),speed)
+	CFLAGS += -O3
+endif
+
 #=============================================================
 # Files and directories
 #=============================================================
diff --git a/examples/example_cifar10_caffe/Makefile b/examples/example_cifar10_caffe/Makefile
@@ -28,7 +28,7 @@ BUILD_DIR    ?= ./obj
 OUT_NAME     ?= example_cifar10_caffe
 ifeq ($(TOOLCHAIN),mwdt)
 # MWDT specific options
-CFLAGS       =  -Hnocopyr -Hpurge -Hheap=8K -Hstack=1K -Hfxapi -e_start -Bgrouplib -Hldopt=-q -O0 -Hsdata0
+CFLAGS       =  -Hnocopyr -Hpurge -Hheap=8K -Hstack=1K -Hfxapi -e_start -Bgrouplib -Hldopt=-q -Hsdata0 -Xdsp_ctrl=postshift,guard,convergent -Hdense_prologue
 else
 PREBUILT_LIB ?= $(EMBARC_MLI_DIR)/examples/prebuilt/libmli.a
 
diff --git a/examples/example_cifar10_caffe/cifar10_model_chw.c b/examples/example_cifar10_caffe/cifar10_model_chw.c
@@ -434,7 +434,7 @@ static void check_result(
 //========================================================================================
 #if (MODEL_BIT_DEPTH != MODEL_FX_8)
 static inline mli_status maxpool_chw(const mli_tensor *in, const mli_pool_cfg *cfg, mli_tensor *out) {
-    return mli_krn_maxpool_chw_fx16_k3x3(in, cfg, out);
+    return mli_krn_maxpool_chw_fx16_k3x3_krnpad(in, cfg, out);
 }
 
 static inline mli_status avepool_chw(const mli_tensor *in, const mli_pool_cfg *cfg, mli_tensor *out) {
@@ -455,7 +455,7 @@ static inline mli_status mli_krn_permute_fx(const mli_tensor *in, const mli_perm
 
 #else // MODEL_BIT_DEPTH == (MODEL_FX_8W16D || MODEL_FX_8W16D)
 static inline mli_status maxpool_chw(const mli_tensor *in, const mli_pool_cfg *cfg, mli_tensor *out) {
-    return mli_krn_maxpool_chw_fx8_k3x3(in, cfg, out);
+    return mli_krn_maxpool_chw_fx8_k3x3_krnpad(in, cfg, out);
 }
 
 static inline mli_status avepool_chw(const mli_tensor *in, const mli_pool_cfg *cfg, mli_tensor *out) {
diff --git a/examples/example_har_smartphone/Makefile b/examples/example_har_smartphone/Makefile
@@ -28,7 +28,7 @@ BUILD_DIR    ?= ./obj
 OUT_NAME     ?= example_har_smartphone
 ifeq ($(TOOLCHAIN),mwdt)
 # MWDT specific options
-CFLAGS       =  -Hnocopyr -Hpurge -Hheap=8K -Hstack=1K -Hfxapi -e_start -Bgrouplib -Hldopt=-q -O0 -Hsdata0
+CFLAGS       =  -Hnocopyr -Hpurge -Hheap=8K -Hstack=1K -Hfxapi -e_start -Bgrouplib -Hldopt=-q -Hsdata0 -Xdsp_ctrl=postshift,guard,convergent -Hdense_prologue
 else
 PREBUILT_LIB ?= $(EMBARC_MLI_DIR)/examples/prebuilt/libmli.a
 
diff --git a/include/api/mli_krn_maxpool_spec_api.h b/include/api/mli_krn_maxpool_spec_api.h
@@ -101,6 +101,8 @@ mli_status mli_krn_maxpool_chw_fx16_k2x2(const mli_tensor * in, const mli_pool_c
 mli_status mli_krn_maxpool_chw_fx16_k2x2_ch1(const mli_tensor * in, const mli_pool_cfg * cfg, mli_tensor * out);
 mli_status mli_krn_maxpool_chw_fx16_k3x3(const mli_tensor * in, const mli_pool_cfg * cfg, mli_tensor * out);
 mli_status mli_krn_maxpool_chw_fx16_k3x3_ch1(const mli_tensor * in, const mli_pool_cfg * cfg, mli_tensor * out);
+mli_status mli_krn_maxpool_chw_fx16_k2x2_krnpad(const mli_tensor * in, const mli_pool_cfg * cfg, mli_tensor * out);
+mli_status mli_krn_maxpool_chw_fx16_k3x3_krnpad(const mli_tensor * in, const mli_pool_cfg * cfg, mli_tensor * out);
 mli_status mli_krn_maxpool_chw_fx16_generic(const mli_tensor * in, const mli_pool_cfg * cfg, mli_tensor * out);
 
 mli_status mli_krn_maxpool_chw_fx8_k2x2_str1_nopad(const mli_tensor * in, const mli_pool_cfg * cfg, mli_tensor * out);
@@ -180,6 +182,8 @@ mli_status mli_krn_maxpool_chw_fx8_k2x2(const mli_tensor * in, const mli_pool_cf
 mli_status mli_krn_maxpool_chw_fx8_k2x2_ch1(const mli_tensor * in, const mli_pool_cfg * cfg, mli_tensor * out);
 mli_status mli_krn_maxpool_chw_fx8_k3x3(const mli_tensor * in, const mli_pool_cfg * cfg, mli_tensor * out);
 mli_status mli_krn_maxpool_chw_fx8_k3x3_ch1(const mli_tensor * in, const mli_pool_cfg * cfg, mli_tensor * out);
+mli_status mli_krn_maxpool_chw_fx8_k2x2_krnpad(const mli_tensor * in, const mli_pool_cfg * cfg, mli_tensor * out);
+mli_status mli_krn_maxpool_chw_fx8_k3x3_krnpad(const mli_tensor * in, const mli_pool_cfg * cfg, mli_tensor * out);
 mli_status mli_krn_maxpool_chw_fx8_generic(const mli_tensor * in, const mli_pool_cfg * cfg, mli_tensor * out);
 
 #ifdef __cplusplus
diff --git a/lib/gen/func.py b/lib/gen/func.py
@@ -124,10 +124,15 @@ def print_padding_condition(self, split=False):
             else:
                 cond = "(1)"
         elif self.padding == "nopad":
-                cond  = "(padding_top == 0) && "
-                cond += "(padding_bot == 0) && "
-                cond += "(padding_left == 0) && "
-                cond += "(padding_right == 0)"
+            cond  = "(padding_top == 0) && "
+            cond += "(padding_bot == 0) && "
+            cond += "(padding_left == 0) && "
+            cond += "(padding_right == 0)"
+        elif self.padding == "krnpad" and (self.kernel_h > 0) and (self.kernel_w > 0):
+            cond  = "(padding_top <= " + str(int((self.kernel_h - 1) / 2)) + ") && "
+            cond += "(padding_bot <= " + str(int(self.kernel_h / 2)) + ") && "
+            cond += "(padding_left <= " + str(int((self.kernel_w -1) / 2)) + ") && "
+            cond += "(padding_right <= " + str(int(self.kernel_w / 2)) + ")"
         else:
             cond = "(1)"
         return cond
diff --git a/lib/gen/mli_krn_avepool_gen.py b/lib/gen/mli_krn_avepool_gen.py
@@ -2,6 +2,18 @@
 from codegen import Codegen
 import sys
 
+# This script is used to generate the specialized versions for the avepool functions.
+# The specialized functions can be called directly from the application, or the generated
+# wrapper function can be called. The script builds a list with specializations by
+# (optionally) fixing strides, kernel sizes, number of channels, or padding mode for
+# different value ranges. A value 0 means that the specific parameter is not fixed.
+# After the complete list is built, the code is generated based on a function template,
+# and inserted into the file template. This script can be used to generate the cc files for
+# different bit precisions, and it can also generate the header file that contains the
+# function prototypes of all specializations. For normal operation of the lib there is no
+# need to update the script.
+# The script can be executed with python 2.7
+
 #------------------------------------------------------------
 # avepool functions chw
 #------------------------------------------------------------
diff --git a/lib/gen/mli_krn_conv2d_gen.py b/lib/gen/mli_krn_conv2d_gen.py
@@ -2,6 +2,18 @@
 from codegen import Codegen
 import sys
 
+# This script is used to generate the specialized versions for the conv2d functions.
+# The specialized functions can be called directly from the application, or the generated
+# wrapper function can be called. The script builds a list with specializations by
+# (optionally) fixing strides, kernel sizes, number of channels, or padding mode for
+# different value ranges. A value 0 means that the specific parameter is not fixed.
+# After the complete list is built, the code is generated based on a function template,
+# and inserted into the file template. This script can be used to generate the cc files for
+# different bit precisions, and it can also generate the header file that contains the
+# function prototypes of all specializations. For normal operation of the lib there is no
+# need to update the script.
+# The script can be executed with python 2.7
+
 #------------------------------------------------------------
 # convolution functions chw
 #------------------------------------------------------------
diff --git a/lib/gen/mli_krn_depthwise_conv2d_gen.py b/lib/gen/mli_krn_depthwise_conv2d_gen.py
@@ -2,6 +2,18 @@
 from codegen import Codegen
 import sys
 
+# This script is used to generate the specialized versions for the depthwise_conv2d functions.
+# The specialized functions can be called directly from the application, or the generated
+# wrapper function can be called. The script builds a list with specializations by
+# (optionally) fixing strides, kernel sizes, number of channels, or padding mode for
+# different value ranges. A value 0 means that the specific parameter is not fixed.
+# After the complete list is built, the code is generated based on a function template,
+# and inserted into the file template. This script can be used to generate the cc files for
+# different bit precisions, and it can also generate the header file that contains the
+# function prototypes of all specializations. For normal operation of the lib there is no
+# need to update the script.
+# The script can be executed with python 2.7
+
 #------------------------------------------------------------
 # convolution functions chw
 #------------------------------------------------------------
diff --git a/lib/gen/mli_krn_maxpool_gen.py b/lib/gen/mli_krn_maxpool_gen.py
@@ -2,6 +2,18 @@
 from codegen import Codegen
 import sys
 
+# This script is used to generate the specialized versions for the maxpool functions.
+# The specialized functions can be called directly from the application, or the generated
+# wrapper function can be called. The script builds a list with specializations by
+# (optionally) fixing strides, kernel sizes, number of channels, or padding mode for
+# different value ranges. A value 0 means that the specific parameter is not fixed.
+# After the complete list is built, the code is generated based on a function template,
+# and inserted into the file template. This script can be used to generate the cc files for
+# different bit precisions, and it can also generate the header file that contains the
+# function prototypes of all specializations. For normal operation of the lib there is no
+# need to update the script.
+# The script can be executed with python 2.7
+
 #------------------------------------------------------------
 # maxpool functions chw
 #------------------------------------------------------------
@@ -53,35 +65,41 @@
 channel_range = [0,1,3]
 f_list.extend([Func(fbase, k, k, ch, stride, stride, corefunc, "krnpad") for k in kernel_range for ch in channel_range])
 
-corefunc = "maxpool_chw_krnpad"
+corefunc = "maxpool_chw_pad"
 stride = 1
 kernel_range = range(4,11)
 channel_range = [0,1,3]
 f_list.extend([Func(fbase, k, k, ch, stride, stride, corefunc, "krnpad") for k in kernel_range for ch in channel_range])
 
 #stride = 1, 1xk and kx1 versions
-corefunc = "maxpool_chw_krnpad"
+corefunc = "maxpool_chw_pad"
 stride = 1
 kernel_range = range(2,4)
 channel_range = [0,1]
 f_list.extend([Func(fbase, 1, k, ch, stride, stride, corefunc, "krnpad") for k in kernel_range for ch in channel_range])
 f_list.extend([Func(fbase, k, 1, ch, stride, stride, corefunc, "krnpad") for k in kernel_range for ch in channel_range])
 
 #fix single dimension, others flex
-corefunc = "maxpool_chw_krnpad"
+corefunc = "maxpool_chw_pad"
 stride = 1
 f_list.extend([Func(fbase, 1, 0, 0, stride, stride, corefunc, "")]) #k_width == 1
 f_list.extend([Func(fbase, 0, 1, 0, stride, stride, corefunc, "")]) #k_heigth == 1
 f_list.extend([Func(fbase, 0, 0, 1, stride, stride, corefunc, "")]) #channels == 1
 
-corefunc = "maxpool_chw_krnpad_small"
+corefunc = "maxpool_chw_pad"
 stride = 0
 kernel_range = [2,3]
 channel_range = [0,1]
 f_list.extend([Func(fbase, k, k, ch, stride, stride, corefunc, "") for k in kernel_range for ch in channel_range])
 
+corefunc = "maxpool_chw_krnpad_small"
+stride = 0
+kernel_range = [2,3]
+channel_range = [0]
+f_list.extend([Func(fbase, k, k, ch, stride, stride, corefunc, "krnpad") for k in kernel_range for ch in channel_range])
+
 #at last add the generic function that can be used in the else branch in the wrapper.
-corefunc = "maxpool_chw_krnpad"
+corefunc = "maxpool_chw_pad"
 default_func = Func(fbase, 0, 0, 0, 0, 0, corefunc, generic=True)
 f_list.append(default_func)
 
diff --git a/lib/make/makefile b/lib/make/makefile
@@ -53,7 +53,7 @@ LIBRARY_DIR ?= ../../bin
 OUT_NAME    ?= libmli
 BUILD_DIR   ?= ../../obj
 GEN_DIR     ?= ../gen
-CFLAGS      += -O3 -Xdsp_ctrl=postshift,guard,convergent -Hnocopyr -Hfxapi -Hpurge -Hsdata0 -Hdense_prologue
+CFLAGS      += -Xdsp_ctrl=postshift,guard,convergent -Hnocopyr -Hfxapi -Hpurge -Hsdata0 -Hdense_prologue
 
 vpath %.py  $(GEN_DIR)
 vpath %.txt  $(GEN_DIR)
diff --git a/lib/src/kernels/pooling/mli_krn_maxpool_chw.h b/lib/src/kernels/pooling/mli_krn_maxpool_chw.h
@@ -379,7 +379,7 @@ static inline void __attribute__((always_inline)) maxpool_chw_small(
 }
 
 template <typename io_T>
-static inline void __attribute__((always_inline)) maxpool_chw_krnpad(
+static inline void __attribute__((always_inline)) maxpool_chw_pad(
         const MLI_PTR(io_T) __restrict in_ftrs,
         MLI_OUT_PTR(io_T) __restrict out_ftrs,
         const int row_beg,
diff --git a/lib/src/kernels/pooling/mli_krn_maxpool_chw_fx16.cc b/lib/src/kernels/pooling/mli_krn_maxpool_chw_fx16.cc
diff --git a/lib/src/kernels/pooling/mli_krn_maxpool_chw_fx8.cc b/lib/src/kernels/pooling/mli_krn_maxpool_chw_fx8.cc
diff --git a/lib/src/private/mli_prv_dsp.h b/lib/src/private/mli_prv_dsp.h

Original file line number	Diff line number	Diff line change
`@@ -434,7 +434,7 @@ static void check_result(`
`434`	`434`	`//========================================================================================`
`435`	`435`	`#if (MODEL_BIT_DEPTH != MODEL_FX_8)`
`436`	`436`	`static inline mli_status maxpool_chw(const mli_tensor *in, const mli_pool_cfg *cfg, mli_tensor *out) {`
`437`		`- return mli_krn_maxpool_chw_fx16_k3x3(in, cfg, out);`
	`437`	`+ return mli_krn_maxpool_chw_fx16_k3x3_krnpad(in, cfg, out);`
`438`	`438`	`}`
`439`	`439`
`440`	`440`	`static inline mli_status avepool_chw(const mli_tensor *in, const mli_pool_cfg *cfg, mli_tensor *out) {`
`@@ -455,7 +455,7 @@ static inline mli_status mli_krn_permute_fx(const mli_tensor *in, const mli_perm`
`455`	`455`
`456`	`456`	`#else // MODEL_BIT_DEPTH == (MODEL_FX_8W16D \|\| MODEL_FX_8W16D)`
`457`	`457`	`static inline mli_status maxpool_chw(const mli_tensor *in, const mli_pool_cfg *cfg, mli_tensor *out) {`
`458`		`- return mli_krn_maxpool_chw_fx8_k3x3(in, cfg, out);`
	`458`	`+ return mli_krn_maxpool_chw_fx8_k3x3_krnpad(in, cfg, out);`
`459`	`459`	`}`
`460`	`460`
`461`	`461`	`static inline mli_status avepool_chw(const mli_tensor *in, const mli_pool_cfg *cfg, mli_tensor *out) {`