Skip to content

Commit c25a5ba

Browse files
authored
Merge pull request cnlohr#857 from monte-monte/master
Add Coremark to projects
2 parents c271574 + 548a32d commit c25a5ba

File tree

10 files changed

+1811
-0
lines changed

10 files changed

+1811
-0
lines changed

.gitmodules

Whitespace-only changes.

projects/coremark/Makefile

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
# Default goal: build the benchmark and flash it to the connected board.
all : flash
.PHONY: all

# Base name of the build products ($(TARGET).elf, $(TARGET).bin, ...).
TARGET:=coremark

# --- Target selection --------------------------------------------------------
# Enable exactly one TARGET_MCU, plus the TARGET_MCU_PACKAGE listed right below
# it (if any).
#
# Keep explanatory comments on their own line. make keeps the blanks between a
# value and a trailing '#', so "X:=CH32V203F8 #64K" yields "CH32V203F8 " and
# breaks every name derived from it (e.g. generated_$(TARGET_MCU_PACKAGE)_RAM.ld).
# TARGET_MCU:=CH32V006
# TARGET_MCU:=CH32X035
# TARGET_MCU:=CH32V103
# TARGET_MCU:=CH32V203
# 64K version:
# TARGET_MCU_PACKAGE:=CH32V203F8
# TARGET_MCU:=CH32V208
# TARGET_MCU_PACKAGE:=CH32V208WBU6
# TARGET_MCU:=CH32V307
TARGET_MCU:=CH570
TARGET_MCU_PACKAGE:=CH570D
# TARGET_MCU:=CH585
# TARGET_MCU_PACKAGE:=CH585F

# --- Benchmark settings ------------------------------------------------------
# Directory that holds the Coremark sources (used for -I and for the file list).
COREMARK:=coremark
# NOTE(review): not referenced in this file; presumably read by ch32fun.mk.
ENABLE_FPU:=0
# Passed as -DITERATIONS. Standard is 4K; if you get runs shorter than 10 s,
# increase this number.
CORE_ITERATIONS=4000
# Passed as -DTOTAL_DATA_SIZE. Will result in this much RAM use.
CORE_DATA_SIZE=2000
# Passed as -DMEM_METHOD: 0 - STATIC, 1 - HEAP, 2 - STACK
CORE_MEM_METHOD=0
# Passed as -DMEM_LOCATION; keep it in sync with CORE_MEM_METHOD by hand.
CORE_MEM_STR="STATIC"

# Placeholder for extra linker flags.
LDFLAGS+=

# --- Optimisation flags used for the Coremark sources ------------------------
# OPT_FLAGS:=-O3 -funroll-all-loops -finline-limit=600 -ftree-dominator-opts -fno-if-conversion2 -fselective-scheduling -fno-code-hoisting
# OPT_FLAGS:=-Os -flto -ffunction-sections -fdata-sections -fmessage-length=0 -msmall-data-limit=8
OPT_FLAGS:=-O3
28+
29+
# Compiler flags for the Coremark translation units. Deliberately a recursive
# (=) variable: several of the names it references are not assigned in this
# Makefile before this point and must resolve when the recipe runs.
COREMARK_CFLAGS=$(CFLAGS_ARCH) \
	-DTARGET_MCU=$(TARGET_MCU) -DMCU_PACKAGE=$(MCU_PACKAGE) \
	-static-libgcc -I$(NEWLIB) -I$(CH32FUN) -nostdlib -I. -Wall $(EXTRA_CFLAGS) \
	-I$(COREMARK) \
	-DCOMPILER_FLAGS=\""$(OPT_FLAGS)"\" \
	-DITERATIONS=$(CORE_ITERATIONS) \
	-DTOTAL_DATA_SIZE=$(CORE_DATA_SIZE) \
	-DMEM_METHOD=$(CORE_MEM_METHOD) \
	-DMEM_LOCATION=\"$(CORE_MEM_STR)\" \
	-DMAIN_HAS_NOARGC -ffreestanding $(OPT_FLAGS)

# Shared ch32fun build rules and variables.
include ../../ch32fun/ch32fun.mk

# Appended after the include so it extends the CFLAGS set up there.
CFLAGS+=-ffreestanding

# Port layer (local) followed by the benchmark sources from $(COREMARK).
COREMARK_FILES=core_portme.c \
	$(COREMARK)/core_list_join.c \
	$(COREMARK)/core_main.c \
	$(COREMARK)/core_matrix.c \
	$(COREMARK)/core_state.c \
	$(COREMARK)/core_util.c

# Name of the preprocessed run-from-RAM linker script for the selected package.
RAM_LD_FILE:=generated_$(TARGET_MCU_PACKAGE)_RAM.ld
35+
36+
# Generate the run-from-RAM linker script by running the C preprocessor over
# the app_in_ram.ld template with the target selection macros defined.
#
# - app_in_ram.ld is a prerequisite so that edits to the template regenerate
#   the script.
# - The output goes to a temporary file first and is only renamed on success.
#   The template contains #error branches; with a plain "> $@" a failed
#   preprocess would leave an empty script behind that make then treats as
#   up to date.
$(RAM_LD_FILE): app_in_ram.ld
	$(PREFIX)-gcc -E -P -x c -DTARGET_MCU=$(TARGET_MCU) -DMCU_PACKAGE=$(MCU_PACKAGE) -DTARGET_MCU_LD=$(TARGET_MCU_LD) -DTARGET_MCU_MEMORY_SPLIT=$(TARGET_MCU_MEMORY_SPLIT) $< > $@.tmp || { rm -f $@.tmp; exit 1; }
	mv -f $@.tmp $@
38+
39+
# Build the benchmark ELF.
#
# ch32fun.c and the third-party printf are compiled with the regular ch32fun
# CFLAGS; the Coremark sources are compiled and linked in one step with
# COREMARK_CFLAGS so that OPT_FLAGS only affects the benchmark itself.
#
# The target is spelled $(TARGET).elf (not a literal coremark.elf) so it keeps
# matching the $(TARGET).bin / clean rules if TARGET is ever renamed, and the
# sources are listed as prerequisites so that editing them triggers a rebuild.
#
# NOTE(review): the linker script is selected through a target-specific
# LINKER_SCRIPT on runfromram and cannot be expressed as a prerequisite here,
# so run "make clean" when switching between the flash and RAM builds.
#
# To build without the third-party printf library, link without printf.o:
# $(PREFIX)-gcc -o $@ ch32fun.o $(COREMARK_FILES) $(COREMARK_CFLAGS) $(LDFLAGS)
$(TARGET).elf: $(SYSTEM_C) printf.c $(COREMARK_FILES)
	$(PREFIX)-gcc -c -o ch32fun.o $(SYSTEM_C) $(CFLAGS)
	$(PREFIX)-gcc -c -o printf.o printf.c $(CFLAGS)
	$(PREFIX)-gcc -o $@ ch32fun.o printf.o $(COREMARK_FILES) $(COREMARK_CFLAGS) $(LDFLAGS)
44+
45+
# Address the image is written to and started from. It must equal the RAM
# ORIGIN that app_in_ram.ld selects for the chosen MCU/package. The default
# matches most entries, but some packages there use 0x20003800 or 0x20100000,
# e.g. "make runfromram RAM_BASE=0x20003800".
RAM_BASE?=0x20000000

.PHONY: runfromram

# Link against the generated RAM linker script instead of the default one.
# The target-specific value is inherited by the prerequisites built for this
# goal.
runfromram: LINKER_SCRIPT:=$(RAM_LD_FILE)

# Upload $(TARGET).bin to RAM with minichlink and start it there.
# NOTE(review): the "-s <reg> <value>" pairs look like raw RISC-V debug-module
# register writes (0x10 dmcontrol, 0x04 data0, 0x17 command) that halt/reset
# the core, set the program counter to RAM_BASE and resume; -T then opens the
# terminal. Confirm against the minichlink documentation before changing them.
runfromram: $(RAM_LD_FILE) $(TARGET).bin
	$(MINICHLINK)/minichlink -ks 0x10 0x80000001 -s 0x10 0x80000003 -s 0x10 0x80000001 -s 0x18 0 -w $(TARGET).bin $(RAM_BASE) -s 0x04 $(RAM_BASE) -s 0x17 0x002307b1 -s 0x10 0x40000001 -T
49+
50+
# Neither target produces a file of its own name.
.PHONY: flash clean

# Flash through cv_flash, which is not defined in this Makefile
# (presumably provided by ch32fun.mk).
flash: cv_flash

# Remove everything this Makefile (and the generated linker scripts) produce.
# The leading '-' keeps clean best-effort, but unlike "|| true" it still
# reports a failing rm instead of hiding it.
clean:
	-rm -rf ch32fun.o printf.o $(TARGET).elf $(TARGET).bin $(TARGET)_ext.bin $(TARGET).hex $(TARGET).lst $(TARGET).map $(GENERATED_LD_FILE) $(RAM_LD_FILE)
53+

projects/coremark/README.md

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
# Coremark build script for ch32fun
2+
3+
This allows you to easily build and run the [Coremark](https://www.eembc.org/coremark/index.php) benchmark on any MCU that is supported by ch32fun. Coremark is an industry-adopted benchmark for roughly comparing the core performance of different MCUs and other embedded processors. Like any other benchmark, it can't be used to measure the absolute performance of a device, but it is nevertheless a useful tool when comparing and choosing among the many chip models available on the market. Running this test with different compilers and/or compiler settings can also show what influences the performance of your chip and how, which may help you optimize your code if needed.
4+
5+
If you want to compare your results with other known runs, you can visit this page: https://www.eembc.org/coremark/scores.php
6+
7+
Also there is an official result list for various WCH MCUs: https://special.wch.cn/zh_cn/coremark_scores/#/
8+
There were some doubts about the legitimacy of those numbers, but they are pretty close to what I measured on these chips in my own tests, although the official benchmark was compiled with pretty aggressive gcc flags that you would rarely see in everyday use.
9+
10+
## How to use
11+
12+
- You will need a copy of [Coremark](https://github.com/eembc/coremark) in the ``coremark`` subfolder. Alternatively, you can point to the Coremark in some other location on your PC by editing ``COREMARK`` variable in the ``Makefile``.
13+
- Select your MCU model in the ``Makefile``
14+
- Connect your board to the PC and do ``make`` in this folder.
15+
- Open the terminal in minichlink, or do ``make terminal``, wait until the test is done running and results are printed.
16+
17+
## Quirks and optimizations
18+
19+
As stated in the Coremark description, for a result to be valid the benchmark needs to run for at least 10 seconds. The duration of the test depends on the number of iterations. By default it is set to 4000, but that is too many for slower processors; you may want to decrease it to 1500 or even 500. Why does it matter? This *port* uses the raw SysTick value to measure elapsed time, and if SysTick runs at HCLK and the counter is 32 bits wide, it will likely overflow and wrap around during a long test, which inevitably produces a wrong result. So choose a number of iterations such that the test takes 10-20 seconds to complete. You can easily change the number of iterations by editing the ``CORE_ITERATIONS`` variable in the ``Makefile``.
20+
21+
Another setting you can change (but shouldn't if you want a comparable result) is ``CORE_DATA_SIZE``. By default it is 2000, and the benchmark will use this many bytes of RAM for its data buffer. This is why you can't run this benchmark on chips like ch32v003 or ch32v002 and get a number that is comparable with other known results.
22+
23+
``CORE_MEM_METHOD`` is a setting that allows you to choose where the data is stored. The options are: Static - 0, Heap - 1 and Stack - 2. Most results online I've seen were using the Stack option.
24+
25+
``OPT_FLAGS`` - gcc flags for optimizing the compilation of the Coremark files. Usually they are compiled with ``-O3``. WCH used ``-Ofast -funroll-all-loops -finline-limit=600 -ftree-dominator-opts -fno-if-conversion2 -fselective-scheduling -fno-code-hoisting`` for their official table. Some of these flags may even decrease the performance on some chips, so you'll have to experiment with them. ``-funroll-all-loops`` doubles the size of the resulting binary, and the size will depend on the number of iterations. The default compiler settings used in ch32fun are ``-Os -flto -ffunction-sections -fdata-sections -fmessage-length=0 -msmall-data-limit=8``; you can use them to see how much performance is traded for space optimization.
26+
27+
The ``printf`` function that is used in ch32fun lacks float support, but to see the Coremark results you need it. This is why there is a third-party printf library in this folder.
28+
29+
## Running from RAM
30+
31+
Many of WCH's chips have rather slow flash memory. This impacts the performance drastically in some cases. The most advanced chips like ch32v20x and ch32v30x use an SRAM cache for the *zero-wait flash* to increase their performance, but chips from the ch5xx family don't have that cache even though they have pretty fast cores. To measure the core performance on such chips you may want to run the benchmark from RAM. You can do it by using the ``make runfromram`` command. But be aware that the resulting firmware size plus the data buffer must be smaller than the RAM size of your MCU. You can achieve this by changing the compiler flags and disabling the third-party printf library. If you disable the printf library, the results will show only the total number of ticks and you will have to calculate the score yourself, but this is a worthy tradeoff.
32+
33+
## Some results
34+
35+
| |Marks |Flash/RAM|Frequency|Iterations|Compiler version|Build flags|
36+
|:- |:-: |:-: |:-: |:-: |:-: |:-: |
37+
|CH570 |27 |Flash |50MHz |500 |gcc 13.2.0 |-O3 |
38+
|CH570 |187 |RAM |100MHz |2500 |gcc 13.2.0 |-Os -flto -ffunction-sections -fdata-sections -fmessage-length=0 -msmall-data-limit=8 |
39+
|CH32V006|57 |Flash |48MHz |1500 |gcc 13.2.0 |-O3 |
40+
|CH32V006|40 |Flash |48MHz |1500 |gcc 13.2.0 |-Os -flto -ffunction-sections -fdata-sections -fmessage-length=0 -msmall-data-limit=8 |
41+
|CH32V006|62 |Flash |48MHz |1500 |gcc 13.2.0 |-O3 -funroll-all-loops -finline-limit=600 -ftree-dominator-opts -fno-if-conversion2 -fselective-scheduling -fno-code-hoisting |
42+
|CH32X035|29 |Flash |48MHz |500 |gcc 13.2.0 |-Os -flto -ffunction-sections -fdata-sections -fmessage-length=0 -msmall-data-limit=8 |
43+
|CH32X035|71 |RAM |48MHz |1500 |gcc 13.2.0 |-Os -flto -ffunction-sections -fdata-sections -fmessage-length=0 -msmall-data-limit=8 |
44+
|CH32X035|106 |RAM |48MHz |1500 |gcc 13.2.0 |-O3 |
45+
|CH32X035|115 |RAM |48MHz |1500 |gcc 13.2.0 |-O3 -finline-limit=600 -ftree-dominator-opts -fno-if-conversion2 -fselective-scheduling -fno-code-hoisting |
46+
|CH585 |242 |RAM |78MHz |4000 |gcc 13.2.0 |-O3 -funroll-all-loops -finline-limit=600 -ftree-dominator-opts -fno-if-conversion2 -fselective-scheduling -fno-code-hoisting |
47+
|CH32V307|266 |Flash (zero-wait) |144MHz |4000 |gcc 13.2.0 |-Os -flto -ffunction-sections -fdata-sections -fmessage-length=0 -msmall-data-limit=8 |
48+
|CH32V307|384 |Flash (zero-wait) |144MHz |4000 |gcc 13.2.0 |-O3 |
49+
|CH32V307|459 |Flash (zero-wait) |144MHz |5000 |gcc 13.2.0 |-O3 -funroll-all-loops -finline-limit=600 -ftree-dominator-opts -fno-if-conversion2 -fselective-scheduling -fno-code-hoisting|
50+
|CH32V307|463 |Flash (zero-wait) |144MHz |5000 |gcc 13.2.0 |-O3 -funroll-all-loops -finline-limit=600 -ftree-dominator-opts -fno-if-conversion2 -fselective-scheduling |
51+
|CH32V307|469 |Flash (zero-wait) |144MHz |5000 |gcc 13.2.0 |-Ofast -funroll-all-loops -finline-limit=600 -ftree-dominator-opts -fno-if-conversion2 -fselective-scheduling -fno-code-hoisting |
52+
53+
I've included multiple ch32v307 results to demonstrate how unintuitively some compiler flags can influence the performance. Adding ``-fno-code-hoisting`` to ``-O3`` produced a worse result here, unlike on all the other chips I've tried, but with ``-Ofast`` it makes the result better.

projects/coremark/app_in_ram.ld

Lines changed: 280 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,280 @@
1+
/* Execution starts at the InterruptVector symbol. */
ENTRY( InterruptVector )

/*
 * Run-from-RAM linker script template.
 *
 * This file is run through the C preprocessor before it reaches the linker.
 * TARGET_MCU_LD, MCU_PACKAGE and TARGET_MCU_MEMORY_SPLIT must be defined as
 * integers on the preprocessor command line; together they select the single
 * RAM region that holds the complete image (code, data and bss).
 * Unsupported combinations stop the preprocessor with #error.
 */

MEMORY
{
#if TARGET_MCU_LD == 0
	/* v003 */
	RAM (xrw) : ORIGIN = 0x20000000, LENGTH = 2K

#elif TARGET_MCU_LD == 1
	/* v10x */
#if MCU_PACKAGE == 1
	RAM (xrw) : ORIGIN = 0x20000000, LENGTH = 20K
#elif MCU_PACKAGE == 2
	RAM (xrw) : ORIGIN = 0x20000000, LENGTH = 10K
#else
#error "Unknown MCU package"
#endif /* v10x package */

#elif TARGET_MCU_LD == 2
	/* v20x */
#if MCU_PACKAGE == 1
	RAM (xrw) : ORIGIN = 0x20000000, LENGTH = 20K
#elif MCU_PACKAGE == 2
	RAM (xrw) : ORIGIN = 0x20000000, LENGTH = 10K
#elif MCU_PACKAGE == 3
	RAM (xrw) : ORIGIN = 0x20000000, LENGTH = 64K
#else
#error "Unknown MCU package"
#endif /* v20x package */

#elif TARGET_MCU_LD == 3
	/* v30x */
#if MCU_PACKAGE == 1
	/* RAM size follows TARGET_MCU_MEMORY_SPLIT; any other value means 128K. */
#if TARGET_MCU_MEMORY_SPLIT == 1
	RAM (xrw) : ORIGIN = 0x20000000, LENGTH = 96K
#elif TARGET_MCU_MEMORY_SPLIT == 2
	RAM (xrw) : ORIGIN = 0x20000000, LENGTH = 64K
#elif TARGET_MCU_MEMORY_SPLIT == 3
	RAM (xrw) : ORIGIN = 0x20000000, LENGTH = 32K
#else
	RAM (xrw) : ORIGIN = 0x20000000, LENGTH = 128K
#endif /* memory split */
#elif MCU_PACKAGE == 2
	RAM (xrw) : ORIGIN = 0x20000000, LENGTH = 32K
#else
#error "Unknown MCU package"
#endif /* v30x package */

#elif TARGET_MCU_LD == 4
	/* x03x */
#if MCU_PACKAGE == 1
	RAM (xrw) : ORIGIN = 0x20000000, LENGTH = 20K
#else
#error "Unknown MCU package"
#endif /* x03x package */

#elif TARGET_MCU_LD == 5
	/* CH32V002 */
	RAM (xrw) : ORIGIN = 0x20000000, LENGTH = 4K

#elif TARGET_MCU_LD == 6
	/* CH32V005, CH32V004 */
	RAM (xrw) : ORIGIN = 0x20000000, LENGTH = 6K

#elif TARGET_MCU_LD == 7
	/* CH32V006, CH32V007 */
	RAM (xrw) : ORIGIN = 0x20000000, LENGTH = 8K

#elif TARGET_MCU_LD == 8
	/* CH582/3/4/5 */
#if MCU_PACKAGE == 2 || MCU_PACKAGE == 3
	RAM (xrw) : ORIGIN = 0x20000000, LENGTH = 32K
#elif MCU_PACKAGE == 4
	RAM (xrw) : ORIGIN = 0x20000000, LENGTH = 96K
#elif MCU_PACKAGE == 5
	RAM (xrw) : ORIGIN = 0x20000000, LENGTH = 128K
#else
#error "Unknown MCU package"
#endif /* CH58x package */

#elif TARGET_MCU_LD == 9
	/* CH591/2: both known packages share the same RAM layout. */
#if MCU_PACKAGE == 1 || MCU_PACKAGE == 2
	RAM (xrw) : ORIGIN = 0x20000000, LENGTH = 26K
#else
#error "Unknown MCU package"
#endif /* CH59x package */

#elif TARGET_MCU_LD == 10
	/* CH57x: packages 1 and 3 use a different RAM origin and size. */
#if MCU_PACKAGE == 0 || MCU_PACKAGE == 2
	RAM (xrw) : ORIGIN = 0x20000000, LENGTH = 12K
#elif MCU_PACKAGE == 1 || MCU_PACKAGE == 3
	RAM (xrw) : ORIGIN = 0x20003800, LENGTH = 18K
#else
#error "Unknown MCU package"
#endif /* CH57x package */

#elif TARGET_MCU_LD == 11
	/* CH32H41x: all known packages share the same RAM layout. */
#if MCU_PACKAGE == 1 || MCU_PACKAGE == 2 || MCU_PACKAGE == 3
	RAM (xrw) : ORIGIN = 0x20100000, LENGTH = 512K
#else
#error "Unknown MCU package"
#endif /* CH32H41x package */

#else
#error "Unknown MCU target"
#endif /* TARGET_MCU_LD */
}
102+
103+
/*
 * Output section layout for the run-from-RAM image.
 * Every output section below is assigned with ">RAM AT>RAM", i.e. both its
 * run address and its load address are in the single RAM region declared in
 * MEMORY; nothing is placed in flash.
 * The order of the sections is significant: they are laid out in RAM in the
 * order they appear here.
 */
SECTIONS
104+
{
/* First output section, so it is placed at the start of RAM. It keeps the
   .init input sections in link order (SORT_NONE), bracketed by
   _sinit/_einit.
   NOTE(review): presumably this is where InterruptVector (the ENTRY symbol)
   lives - confirm against the startup code. */
105+
.init :
106+
{
107+
_sinit = .;
108+
. = ALIGN(4);
109+
KEEP(*(SORT_NONE(.init)))
110+
. = ALIGN(4);
111+
_einit = .;
112+
} >RAM AT>RAM
113+
114+
/* For TARGET_MCU_LD 8, 9 and 10 (labelled CH582/3/4/5, CH591/2 and CH57x in
   the MEMORY block) the .highcode and .data sections are emitted here, before
   .text. All other targets get .data after .dlalign further down. */
#if TARGET_MCU_LD == 8 || TARGET_MCU_LD == 9 || TARGET_MCU_LD == 10
/* Empty section that only defines _highcode_lma at the current location. */
115+
.highcodelalign :
116+
{
117+
. = ALIGN(4);
118+
PROVIDE(_highcode_lma = .);
119+
} >RAM AT>RAM
120+
121+
/* All .highcode* input sections, bracketed by _highcode_vma_start/_end. */
.highcode :
122+
{
123+
. = ALIGN(4);
124+
PROVIDE(_highcode_vma_start = .);
125+
*(.highcode*)
126+
. = ALIGN(4);
127+
PROVIDE(_highcode_vma_end = .);
128+
} >RAM AT>RAM
129+
130+
/* Initialised data, small data and small read-only data. __global_pointer$
   is set 0x7f8 bytes past the aligned start of this section; _edata marks
   its end. */
.data :
131+
{
132+
. = ALIGN(4);
133+
__global_pointer$ = . + 0x7f8;
134+
*(.gnu.linkonce.r.*)
135+
*(.data .data.*)
136+
*(.gnu.linkonce.d.*)
137+
. = ALIGN(4);
138+
*(.sdata .sdata.*)
139+
*(.sdata2*)
140+
*(.gnu.linkonce.s.*)
141+
. = ALIGN(4);
142+
*(.srodata.cst16)
143+
*(.srodata.cst8)
144+
*(.srodata.cst4)
145+
*(.srodata.cst2)
146+
*(.srodata .srodata.*)
147+
. = ALIGN(4);
148+
PROVIDE( _edata = .);
149+
} >RAM AT>RAM
150+
#endif
151+
152+
/* Program code and read-only data. */
.text :
153+
{
154+
. = ALIGN(4);
155+
*(.text)
156+
*(.text.*)
157+
*(.rodata)
158+
*(.rodata*)
159+
*(.gnu.linkonce.t.*)
160+
. = ALIGN(4);
161+
} >RAM AT>RAM
162+
163+
/* .fini input sections, kept in link order. */
.fini :
164+
{
165+
KEEP(*(SORT_NONE(.fini)))
166+
. = ALIGN(4);
167+
} >RAM AT>RAM
168+
169+
/* Both symbols mark the location right after .fini. */
PROVIDE( _etext = . );
170+
PROVIDE( _eitcm = . );
171+
172+
/* Pre-initialisation function pointer table with hidden start/end symbols. */
.preinit_array :
173+
{
174+
PROVIDE_HIDDEN (__preinit_array_start = .);
175+
KEEP (*(.preinit_array))
176+
PROVIDE_HIDDEN (__preinit_array_end = .);
177+
} >RAM AT>RAM
178+
179+
/* Constructor table: priority-sorted entries first, then the unsorted
   .init_array and the .ctors entries that do not come from
   crtbegin/crtend objects. */
.init_array :
180+
{
181+
PROVIDE_HIDDEN (__init_array_start = .);
182+
KEEP (*(SORT_BY_INIT_PRIORITY(.init_array.*) SORT_BY_INIT_PRIORITY(.ctors.*)))
183+
KEEP (*(.init_array EXCLUDE_FILE (*crtbegin.o *crtbegin?.o *crtend.o *crtend?.o ) .ctors))
184+
PROVIDE_HIDDEN (__init_array_end = .);
185+
} >RAM AT>RAM
186+
187+
/* Destructor table, built the same way as .init_array above. */
.fini_array :
188+
{
189+
PROVIDE_HIDDEN (__fini_array_start = .);
190+
KEEP (*(SORT_BY_INIT_PRIORITY(.fini_array.*) SORT_BY_INIT_PRIORITY(.dtors.*)))
191+
KEEP (*(.fini_array EXCLUDE_FILE (*crtbegin.o *crtbegin?.o *crtend.o *crtend?.o ) .dtors))
192+
PROVIDE_HIDDEN (__fini_array_end = .);
193+
} >RAM AT>RAM
194+
195+
.ctors :
196+
{
197+
/* gcc uses crtbegin.o to find the start of
198+
the constructors, so we make sure it is
199+
first. Because this is a wildcard, it
200+
doesn't matter if the user does not
201+
actually link against crtbegin.o; the
202+
linker won't look for a file to match a
203+
wildcard. The wildcard also means that it
204+
doesn't matter which directory crtbegin.o
205+
is in. */
206+
KEEP (*crtbegin.o(.ctors))
207+
KEEP (*crtbegin?.o(.ctors))
208+
/* We don't want to include the .ctor section from
209+
the crtend.o file until after the sorted ctors.
210+
The .ctor section from the crtend file contains the
211+
end of ctors marker and it must be last */
212+
KEEP (*(EXCLUDE_FILE (*crtend.o *crtend?.o ) .ctors))
213+
KEEP (*(SORT(.ctors.*)))
214+
KEEP (*(.ctors))
215+
} >RAM AT>RAM
216+
217+
/* Same ordering as .ctors above, applied to the .dtors input sections. */
.dtors :
218+
{
219+
KEEP (*crtbegin.o(.dtors))
220+
KEEP (*crtbegin?.o(.dtors))
221+
KEEP (*(EXCLUDE_FILE (*crtend.o *crtend?.o ) .dtors))
222+
KEEP (*(SORT(.dtors.*)))
223+
KEEP (*(.dtors))
224+
} >RAM AT>RAM
225+
226+
/* Empty section that only defines _data_vma at the current location. */
.dalign :
227+
{
228+
. = ALIGN(4);
229+
PROVIDE(_data_vma = .);
230+
} >RAM AT>RAM
231+
232+
/* Empty section that only defines _data_lma at the current location.
   NOTE(review): since .dalign holds nothing and both sections live in RAM,
   _data_lma should resolve to the same address as _data_vma, which would make
   a startup copy from _data_lma to _data_vma a copy onto itself - confirm
   against the startup code. */
.dlalign :
233+
{
234+
. = ALIGN(4);
235+
PROVIDE(_data_lma = .);
236+
} >RAM AT>RAM
237+
/* Every target except TARGET_MCU_LD 8, 9 and 10 gets .data here; the
   contents are identical to the .data section in the #if branch above. */
#if TARGET_MCU_LD != 8 && TARGET_MCU_LD != 9 && TARGET_MCU_LD != 10
238+
.data :
239+
{
240+
. = ALIGN(4);
241+
__global_pointer$ = . + 0x7f8;
242+
*(.gnu.linkonce.r.*)
243+
*(.data .data.*)
244+
*(.gnu.linkonce.d.*)
245+
. = ALIGN(4);
246+
*(.sdata .sdata.*)
247+
*(.sdata2*)
248+
*(.gnu.linkonce.s.*)
249+
. = ALIGN(4);
250+
*(.srodata.cst16)
251+
*(.srodata.cst8)
252+
*(.srodata.cst4)
253+
*(.srodata.cst2)
254+
*(.srodata .srodata.*)
255+
. = ALIGN(4);
256+
PROVIDE( _edata = .);
257+
} >RAM AT>RAM
258+
#endif
/* Zero-initialised data, bracketed by _sbss/_ebss. */
259+
.bss :
260+
{
261+
. = ALIGN(4);
262+
PROVIDE( _sbss = .);
263+
*(.sbss*)
264+
*(.gnu.linkonce.sb.*)
265+
*(.bss*)
266+
*(.gnu.linkonce.b.*)
267+
*(COMMON*)
268+
. = ALIGN(4);
269+
PROVIDE( _ebss = .);
270+
} >RAM AT>RAM
271+
272+
/* End-of-image markers: _end aliases _ebss, end is the current location. */
PROVIDE( _end = _ebss);
273+
PROVIDE( end = . );
274+
275+
/* One past the last byte of the RAM region.
   NOTE(review): presumably used as the initial stack pointer - confirm
   against the startup code. */
PROVIDE( _eusrstack = ORIGIN(RAM) + LENGTH(RAM));
276+
}
277+
278+
279+
280+

0 commit comments

Comments
 (0)