Skip to content

Commit 4f2688c

Browse files
authored
Merge pull request #56 from foss-for-synopsys-dwc-arc-processors/optimize
Optimize
2 parents 455fc11 + 3c1dbea commit 4f2688c

File tree

16 files changed

+498
-91
lines changed

16 files changed

+498
-91
lines changed

README.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,15 @@ Building of embARC MLI library
7373

7474
5. Result Quality shall be "S/N=1823.9 (65.2 db)"
7575

76+
## Optimizations for code size
77+
------------------------------
78+
By default the embARC MLI Library is built for optimal speed. If code size needs to be reduced, there are two things that can be done:
79+
1. For convolution and pooling layers there are specialized functions for specific kernel sizes; they are called by wrapper functions based on the parameters.
80+
These parameters are compile-time constants in the application, so the application can directly call the specialized functions. This will reduce overall code size.
81+
Please be aware that the list of specializations is not guaranteed to be backwards compatible between releases.
82+
83+
2. Use a different optimization mode when calling the makefile. OPTMODE=size will optimize for size. The default is OPTMODE=speed.
84+
'gmake TCF_FILE=../../hw/em9d.tcf OPTMODE=size'
7685

7786
## Known Issues
7887
---------------

build/rules.mk

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,8 @@ quote=$(subst %,$(Q)%, \
6464
# Global settings
6565
#=============================================================
6666
TOOLCHAIN ?= gnu
67+
#optimization mode
68+
OPTMODE ?= speed
6769

6870
export DEBUG_BUILD?=ON
6971
#export ASM_OUT?=OFF
@@ -76,6 +78,13 @@ endif
7678
# # CFLAGS += -Hon=Print_var_info
7779
#endif
7880

81+
ifeq ($(OPTMODE),size)
82+
CFLAGS += -O2 -Hlto
83+
endif
84+
ifeq ($(OPTMODE),speed)
85+
CFLAGS += -O3
86+
endif
87+
7988
#=============================================================
8089
# Files and directories
8190
#=============================================================

examples/example_cifar10_caffe/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ BUILD_DIR ?= ./obj
2828
OUT_NAME ?= example_cifar10_caffe
2929
ifeq ($(TOOLCHAIN),mwdt)
3030
# MWDT specific options
31-
CFLAGS = -Hnocopyr -Hpurge -Hheap=8K -Hstack=1K -Hfxapi -e_start -Bgrouplib -Hldopt=-q -O0 -Hsdata0
31+
CFLAGS = -Hnocopyr -Hpurge -Hheap=8K -Hstack=1K -Hfxapi -e_start -Bgrouplib -Hldopt=-q -Hsdata0 -Xdsp_ctrl=postshift,guard,convergent -Hdense_prologue
3232
else
3333
PREBUILT_LIB ?= $(EMBARC_MLI_DIR)/examples/prebuilt/libmli.a
3434

examples/example_cifar10_caffe/cifar10_model_chw.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -434,7 +434,7 @@ static void check_result(
434434
//========================================================================================
435435
#if (MODEL_BIT_DEPTH != MODEL_FX_8)
436436
static inline mli_status maxpool_chw(const mli_tensor *in, const mli_pool_cfg *cfg, mli_tensor *out) {
437-
return mli_krn_maxpool_chw_fx16_k3x3(in, cfg, out);
437+
return mli_krn_maxpool_chw_fx16_k3x3_krnpad(in, cfg, out);
438438
}
439439

440440
static inline mli_status avepool_chw(const mli_tensor *in, const mli_pool_cfg *cfg, mli_tensor *out) {
@@ -455,7 +455,7 @@ static inline mli_status mli_krn_permute_fx(const mli_tensor *in, const mli_perm
455455

456456
#else // MODEL_BIT_DEPTH == (MODEL_FX_8W16D || MODEL_FX_8W16D)
457457
static inline mli_status maxpool_chw(const mli_tensor *in, const mli_pool_cfg *cfg, mli_tensor *out) {
458-
return mli_krn_maxpool_chw_fx8_k3x3(in, cfg, out);
458+
return mli_krn_maxpool_chw_fx8_k3x3_krnpad(in, cfg, out);
459459
}
460460

461461
static inline mli_status avepool_chw(const mli_tensor *in, const mli_pool_cfg *cfg, mli_tensor *out) {

examples/example_har_smartphone/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ BUILD_DIR ?= ./obj
2828
OUT_NAME ?= example_har_smartphone
2929
ifeq ($(TOOLCHAIN),mwdt)
3030
# MWDT specific options
31-
CFLAGS = -Hnocopyr -Hpurge -Hheap=8K -Hstack=1K -Hfxapi -e_start -Bgrouplib -Hldopt=-q -O0 -Hsdata0
31+
CFLAGS = -Hnocopyr -Hpurge -Hheap=8K -Hstack=1K -Hfxapi -e_start -Bgrouplib -Hldopt=-q -Hsdata0 -Xdsp_ctrl=postshift,guard,convergent -Hdense_prologue
3232
else
3333
PREBUILT_LIB ?= $(EMBARC_MLI_DIR)/examples/prebuilt/libmli.a
3434

include/api/mli_krn_maxpool_spec_api.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,8 @@ mli_status mli_krn_maxpool_chw_fx16_k2x2(const mli_tensor * in, const mli_pool_c
101101
mli_status mli_krn_maxpool_chw_fx16_k2x2_ch1(const mli_tensor * in, const mli_pool_cfg * cfg, mli_tensor * out);
102102
mli_status mli_krn_maxpool_chw_fx16_k3x3(const mli_tensor * in, const mli_pool_cfg * cfg, mli_tensor * out);
103103
mli_status mli_krn_maxpool_chw_fx16_k3x3_ch1(const mli_tensor * in, const mli_pool_cfg * cfg, mli_tensor * out);
104+
mli_status mli_krn_maxpool_chw_fx16_k2x2_krnpad(const mli_tensor * in, const mli_pool_cfg * cfg, mli_tensor * out);
105+
mli_status mli_krn_maxpool_chw_fx16_k3x3_krnpad(const mli_tensor * in, const mli_pool_cfg * cfg, mli_tensor * out);
104106
mli_status mli_krn_maxpool_chw_fx16_generic(const mli_tensor * in, const mli_pool_cfg * cfg, mli_tensor * out);
105107

106108
mli_status mli_krn_maxpool_chw_fx8_k2x2_str1_nopad(const mli_tensor * in, const mli_pool_cfg * cfg, mli_tensor * out);
@@ -180,6 +182,8 @@ mli_status mli_krn_maxpool_chw_fx8_k2x2(const mli_tensor * in, const mli_pool_cf
180182
mli_status mli_krn_maxpool_chw_fx8_k2x2_ch1(const mli_tensor * in, const mli_pool_cfg * cfg, mli_tensor * out);
181183
mli_status mli_krn_maxpool_chw_fx8_k3x3(const mli_tensor * in, const mli_pool_cfg * cfg, mli_tensor * out);
182184
mli_status mli_krn_maxpool_chw_fx8_k3x3_ch1(const mli_tensor * in, const mli_pool_cfg * cfg, mli_tensor * out);
185+
mli_status mli_krn_maxpool_chw_fx8_k2x2_krnpad(const mli_tensor * in, const mli_pool_cfg * cfg, mli_tensor * out);
186+
mli_status mli_krn_maxpool_chw_fx8_k3x3_krnpad(const mli_tensor * in, const mli_pool_cfg * cfg, mli_tensor * out);
183187
mli_status mli_krn_maxpool_chw_fx8_generic(const mli_tensor * in, const mli_pool_cfg * cfg, mli_tensor * out);
184188

185189
#ifdef __cplusplus

lib/gen/func.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -124,10 +124,15 @@ def print_padding_condition(self, split=False):
124124
else:
125125
cond = "(1)"
126126
elif self.padding == "nopad":
127-
cond = "(padding_top == 0) && "
128-
cond += "(padding_bot == 0) && "
129-
cond += "(padding_left == 0) && "
130-
cond += "(padding_right == 0)"
127+
cond = "(padding_top == 0) && "
128+
cond += "(padding_bot == 0) && "
129+
cond += "(padding_left == 0) && "
130+
cond += "(padding_right == 0)"
131+
elif self.padding == "krnpad" and (self.kernel_h > 0) and (self.kernel_w > 0):
132+
cond = "(padding_top <= " + str(int((self.kernel_h - 1) / 2)) + ") && "
133+
cond += "(padding_bot <= " + str(int(self.kernel_h / 2)) + ") && "
134+
cond += "(padding_left <= " + str(int((self.kernel_w -1) / 2)) + ") && "
135+
cond += "(padding_right <= " + str(int(self.kernel_w / 2)) + ")"
131136
else:
132137
cond = "(1)"
133138
return cond

lib/gen/mli_krn_avepool_gen.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,18 @@
22
from codegen import Codegen
33
import sys
44

5+
# This script is used to generate the specialized versions for the avepool functions.
6+
# The specialized functions can be called directly from the application, or the generated
7+
# wrapper function can be called. The script builds a list with specializations by
8+
# (optionally) fixing strides, kernel sizes, number of channels, or padding mode for
9+
# different value ranges. A value 0 means that the specific parameter is not fixed.
10+
# After the complete list is built, the code is generated based on a function template,
11+
# and inserted into the file template. This script can be used to generate the cc files for
12+
# different bit precisions, and it can also generate the header file that contains the
13+
# function prototypes of all specializations. For normal operation of the lib there is no
14+
# need to update the script.
15+
# The script can be executed with python 2.7
16+
517
#------------------------------------------------------------
618
# avepool functions chw
719
#------------------------------------------------------------

lib/gen/mli_krn_conv2d_gen.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,18 @@
22
from codegen import Codegen
33
import sys
44

5+
# This script is used to generate the specialized versions for the conv2d functions.
6+
# The specialized functions can be called directly from the application, or the generated
7+
# wrapper function can be called. The script builds a list with specializations by
8+
# (optionally) fixing strides, kernel sizes, number of channels, or padding mode for
9+
# different value ranges. A value 0 means that the specific parameter is not fixed.
10+
# After the complete list is built, the code is generated based on a function template,
11+
# and inserted into the file template. This script can be used to generate the cc files for
12+
# different bit precisions, and it can also generate the header file that contains the
13+
# function prototypes of all specializations. For normal operation of the lib there is no
14+
# need to update the script.
15+
# The script can be executed with python 2.7
16+
517
#------------------------------------------------------------
618
# convolution functions chw
719
#------------------------------------------------------------

lib/gen/mli_krn_depthwise_conv2d_gen.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,18 @@
22
from codegen import Codegen
33
import sys
44

5+
# This script is used to generate the specialized versions for the depthwise_conv2d functions.
6+
# The specialized functions can be called directly from the application, or the generated
7+
# wrapper function can be called. The script builds a list with specializations by
8+
# (optionally) fixing strides, kernel sizes, number of channels, or padding mode for
9+
# different value ranges. A value 0 means that the specific parameter is not fixed.
10+
# After the complete list is built, the code is generated based on a function template,
11+
# and inserted into the file template. This script can be used to generate the cc files for
12+
# different bit precisions, and it can also generate the header file that contains the
13+
# function prototypes of all specializations. For normal operation of the lib there is no
14+
# need to update the script.
15+
# The script can be executed with python 2.7
16+
517
#------------------------------------------------------------
618
# convolution functions chw
719
#------------------------------------------------------------

0 commit comments

Comments
 (0)