Skip to content

Commit efa8019

Browse files
loganekJames Marshjammar1Zzzabiyakawenyongh
authored
Merge dev/simd for fast-interp (#4131)
* Implement the first few SIMD opcodes for fast interpreter (v128.const, v128.any_true) (#3818) Tested on the following code: ``` (module (import "wasi_snapshot_preview1" "proc_exit" (func $proc_exit (param i32))) (memory (export "memory") 1) ;; WASI entry point (func $main (export "_start") v128.const i8x16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 v128.any_true if unreachable end v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 v128.any_true i32.const 0 i32.eq if unreachable end i32.const 0 call $proc_exit ) ) ``` * implement POP_V128() This is to simplify the simd implementation for fast interpreter * Add all SIMD operations into wasm_interp_fast switch * Add V128 comparison operations Tested using ``` (module (import "wasi_snapshot_preview1" "proc_exit" (func $proc_exit (param i32))) (memory (export "memory") 1) (func $assert_true (param v128) local.get 0 v128.any_true i32.eqz if unreachable end ) (func $main (export "_start") ;; Test v128.not v128.const i8x16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 v128.not v128.const i8x16 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 i8x16.eq call $assert_true ;; Test v128.and v128.const i8x16 255 255 255 255 0 0 0 0 255 255 255 255 0 0 0 0 v128.const i8x16 255 255 0 0 255 255 0 0 255 255 0 0 255 255 0 0 v128.and v128.const i8x16 255 255 0 0 0 0 0 0 255 255 0 0 0 0 0 0 i8x16.eq call $assert_true ;; Test v128.andnot v128.const i8x16 255 255 255 255 0 0 0 0 255 255 255 255 0 0 0 0 v128.const i8x16 255 255 0 0 255 255 0 0 255 255 0 0 255 255 0 0 v128.andnot v128.const i8x16 0 0 255 255 0 0 0 0 0 0 255 255 0 0 0 0 i8x16.eq call $assert_true ;; Test v128.or v128.const i8x16 255 255 0 0 0 0 255 255 255 255 0 0 0 0 255 0 v128.const i8x16 0 0 255 255 255 255 0 0 0 0 255 255 255 255 0 0 v128.or v128.const i8x16 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 0 i8x16.eq call $assert_true ;; Test v128.xor v128.const i8x16 255 255 0 0 255 255 0 0 255 255 0 0 255 255 0 0 v128.const i8x16 255 255 255 255 0 0 0 0 255 255 
255 255 0 0 0 0 v128.xor v128.const i8x16 0 0 255 255 255 255 0 0 0 0 255 255 255 255 0 0 i8x16.eq call $assert_true i32.const 0 call $proc_exit ) ) ``` * Add first NEON SIMD opcode implementations to fast interpreter (#3859) Add some implementations of SIMD opcodes using NEON instructions. Tested using: ```wast (module (import "wasi_snapshot_preview1" "proc_exit" (func $proc_exit (param i32))) (memory (export "memory") 1) (func $assert_true (param v128) local.get 0 v128.any_true i32.eqz if unreachable end ) (func $main (export "_start") i32.const 0 i32.const 32 memory.grow drop i32.const 0 v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 v128.store i32.const 0 v128.load v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 i8x16.eq call $assert_true i32.const 16 v128.const i8x16 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 v128.store i32.const 16 v128.load v128.const i8x16 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 i8x16.eq call $assert_true i32.const 0 v128.load v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 i8x16.eq call $assert_true drop i32.const 0 i32.const 1 memory.grow drop i32.const 0 i64.const 0x7F80FF017E02FE80 i64.store i32.const 0 v128.load8x8_s v128.const i16x8 127 -128 -1 1 126 2 -2 -128 i16x8.eq call $assert_true i32.const 0 i64.const 0x80FE027E01FF807F i64.store i32.const 0 v128.load8x8_u v128.const i16x8 128 254 2 126 1 255 128 127 i16x8.eq call $assert_true i32.const 0 i64.const 0x8000FFFE7FFF0001 i64.store i32.const 0 v128.load16x4_s v128.const i32x4 -32768 -2 32767 1 i32x4.eq call $assert_true i32.const 0 i64.const 0x8000FFFE7FFF0001 i64.store i32.const 0 v128.load16x4_u v128.const i32x4 32768 65534 32767 1 i32x4.eq call $assert_true i32.const 0 i64.const 0x8000000000000001 i64.store i32.const 0 v128.load32x2_s v128.const i64x2 -2147483648 1 i64x2.eq call $assert_true i32.const 0 i64.const 0x8000000000000001 i64.store i32.const 0 v128.load32x2_u v128.const i64x2 2147483648 1 i64x2.eq call $assert_true call $proc_exit ) ) 
``` * Emit imm for lane extract and replace (#3906) * Fix replacement value not being correct (#3919) * Implement load lanes opcodes for wasm (#3942) * Implement final SIMD opcodes: store lane (#4001) * Fix load/store (#4054) * Correctly use unsigned functions (#4055) * implement local and function calls for v128 in the fast interpreter * Fix splat opcodes, add V128 handling in preserve_referenced_local and reserve_block_ret * Fix incorrect memory overflow values + SIMD ifdefs * Fix load/load_splat macros * correct endif wasm loader * Update core/iwasm/interpreter/wasm_opcode.h * Fix spec tests when WASM_CPU_SUPPORTS_UNALIGNED_ADDR_ACCESS is 0 * Resolve merge conflicts arising from main -> dev/simd_for_interp and implement fast interpreter const offset loader support for V128 * Enable SIMDe tests on CI * Document WAMR_BUILD_LIB_SIMDE --------- Co-authored-by: James Marsh <[email protected]> Co-authored-by: jammar1 <[email protected]> Co-authored-by: Maks Litskevich <[email protected]> Co-authored-by: Marcin Kolny <[email protected]> Co-authored-by: Wenyong Huang <[email protected]>
1 parent c30e65b commit efa8019

File tree

13 files changed

+2337
-221
lines changed

13 files changed

+2337
-221
lines changed

.github/workflows/compilation_on_android_ubuntu.yml

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,7 @@ jobs:
158158
"-DWAMR_BUILD_PERF_PROFILING=1",
159159
"-DWAMR_BUILD_REF_TYPES=1",
160160
"-DWAMR_BUILD_SIMD=1",
161+
"-DWAMR_BUILD_LIB_SIMDE=1",
161162
"-DWAMR_BUILD_TAIL_CALL=1",
162163
"-DWAMR_DISABLE_HW_BOUND_CHECK=1",
163164
"-DWAMR_BUILD_MEMORY64=1",
@@ -178,11 +179,9 @@ jobs:
178179
make_options_feature: "-DWAMR_BUILD_MULTI_MODULE=1"
179180
- make_options_run_mode: $MULTI_TIER_JIT_BUILD_OPTIONS
180181
make_options_feature: "-DWAMR_BUILD_MULTI_MODULE=1"
181-
# SIMD only on JIT/AOT mode
182+
# SIMD only on JIT/AOT/fast interpreter mode
182183
- make_options_run_mode: $CLASSIC_INTERP_BUILD_OPTIONS
183184
make_options_feature: "-DWAMR_BUILD_SIMD=1"
184-
- make_options_run_mode: $FAST_INTERP_BUILD_OPTIONS
185-
make_options_feature: "-DWAMR_BUILD_SIMD=1"
186185
# DEBUG_INTERP only on CLASSIC INTERP mode
187186
- make_options_run_mode: $AOT_BUILD_OPTIONS
188187
make_options_feature: "-DWAMR_BUILD_DEBUG_INTERP=1"
@@ -649,11 +648,9 @@ jobs:
649648
test_option: $WAMR_COMPILER_TEST_OPTIONS
650649
exclude:
651650
# incompatible modes and features
652-
# classic-interp and fast-interp don't support simd
651+
# classic-interp doesn't support simd
653652
- running_mode: "classic-interp"
654653
test_option: $SIMD_TEST_OPTIONS
655-
- running_mode: "fast-interp"
656-
test_option: $SIMD_TEST_OPTIONS
657654
# llvm jit doesn't support multi module
658655
- running_mode: "jit"
659656
test_option: $MULTI_MODULES_TEST_OPTIONS

.github/workflows/compilation_on_sgx.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ env:
4949
# ref types enabled in wamrc by default, so we need to enable it for iwasm in AOT mode
5050
AOT_BUILD_OPTIONS: "-DWAMR_BUILD_AOT=1 -DWAMR_BUILD_FAST_INTERP=0 -DWAMR_BUILD_INTERP=0 -DWAMR_BUILD_JIT=0 -DWAMR_BUILD_LAZY_JIT=0 -DWAMR_BUILD_REF_TYPES=1"
5151
CLASSIC_INTERP_BUILD_OPTIONS: "-DWAMR_BUILD_AOT=0 -DWAMR_BUILD_FAST_INTERP=0 -DWAMR_BUILD_INTERP=1 -DWAMR_BUILD_JIT=0 -DWAMR_BUILD_LAZY_JIT=0"
52-
FAST_INTERP_BUILD_OPTIONS: "-DWAMR_BUILD_AOT=0 -DWAMR_BUILD_FAST_INTERP=1 -DWAMR_BUILD_INTERP=1 -DWAMR_BUILD_JIT=0 -DWAMR_BUILD_LAZY_JIT=0"
52+
FAST_INTERP_BUILD_OPTIONS: "-DWAMR_BUILD_AOT=0 -DWAMR_BUILD_FAST_INTERP=1 -DWAMR_BUILD_INTERP=1 -DWAMR_BUILD_JIT=0 -DWAMR_BUILD_LAZY_JIT=0 -DWAMR_BUILD_SIMD=0"
5353
FAST_JIT_BUILD_OPTIONS: "-DWAMR_BUILD_AOT=1 -DWAMR_BUILD_FAST_INTERP=0 -DWAMR_BUILD_INTERP=1 -DWAMR_BUILD_FAST_JIT=1 -DWAMR_BUILD_JIT=0 -DWAMR_BUILD_LAZY_JIT=1"
5454
LLVM_LAZY_JIT_BUILD_OPTIONS: "-DWAMR_BUILD_AOT=1 -DWAMR_BUILD_FAST_INTERP=0 -DWAMR_BUILD_INTERP=0 -DWAMR_BUILD_JIT=1 -DWAMR_BUILD_LAZY_JIT=1"
5555
LLVM_EAGER_JIT_BUILD_OPTIONS: "-DWAMR_BUILD_AOT=1 -DWAMR_BUILD_FAST_INTERP=0 -DWAMR_BUILD_INTERP=0 -DWAMR_BUILD_JIT=1 -DWAMR_BUILD_LAZY_JIT=0"
@@ -97,7 +97,7 @@ jobs:
9797
"-DWAMR_BUILD_PERF_PROFILING=1",
9898
"-DWAMR_BUILD_REF_TYPES=1",
9999
# doesn't support
100-
# "-DWAMR_BUILD_SIMD=1",
100+
"-DWAMR_BUILD_SIMD=0",
101101
"-DWAMR_BUILD_TAIL_CALL=1",
102102
"-DWAMR_DISABLE_HW_BOUND_CHECK=1",
103103
"-DWAMR_BUILD_SGX_IPFS=1",

build-scripts/config_common.cmake

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -300,6 +300,9 @@ endif ()
300300
if (WAMR_BUILD_LIB_RATS EQUAL 1)
301301
message (" Lib rats enabled")
302302
endif()
303+
if ((WAMR_BUILD_LIB_SIMDE EQUAL 1))
304+
message (" Lib simde enabled")
305+
endif()
303306
################## WAMR features ##################
304307
if (WAMR_BUILD_MULTI_MODULE EQUAL 1)
305308
add_definitions (-DWASM_ENABLE_MULTI_MODULE=1)
@@ -371,11 +374,17 @@ else ()
371374
message (" Wakeup of blocking operations enabled")
372375
endif ()
373376
if (WAMR_BUILD_SIMD EQUAL 1)
374-
if (NOT WAMR_BUILD_TARGET MATCHES "RISCV64.*")
375-
add_definitions (-DWASM_ENABLE_SIMD=1)
376-
else ()
377+
if (WAMR_BUILD_FAST_INTERP EQUAL 1 AND WAMR_BUILD_SIMDE EQUAL 0)
378+
set(SIMD_ENABLED 0)
379+
message(" SIMD disabled for fast-interp as simde is not being built")
380+
elseif (WAMR_BUILD_TARGET MATCHES "RISCV64.*")
381+
set(SIMD_ENABLED 0)
377382
message (" SIMD disabled due to not supported on target RISCV64")
383+
else()
384+
set(SIMD_ENABLED 1)
385+
message (" SIMD enabled")
378386
endif ()
387+
add_definitions(-DWASM_ENABLE_SIMD=${SIMD_ENABLED})
379388
endif ()
380389
if (WAMR_BUILD_AOT_STACK_FRAME EQUAL 1)
381390
add_definitions (-DWASM_ENABLE_AOT_STACK_FRAME=1)

build-scripts/runtime_lib.cmake

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,16 @@ if (WAMR_BUILD_LIB_RATS EQUAL 1)
155155
include (${IWASM_DIR}/libraries/lib-rats/lib_rats.cmake)
156156
endif ()
157157

158+
# SIMDe is only considered when both SIMD and the fast interpreter are enabled.
# On the "windows" platform it is skipped and WAMR_BUILD_SIMDE is set to 0;
# on every other platform simde.cmake is included and WAMR_BUILD_SIMDE is set
# to 1. When the outer condition is false WAMR_BUILD_SIMDE is left untouched.
if (WAMR_BUILD_SIMD EQUAL 1 AND WAMR_BUILD_FAST_INTERP EQUAL 1)
159+
if (WAMR_BUILD_PLATFORM STREQUAL "windows")
160+
message(STATUS "SIMDe doesnt support platform " ${WAMR_BUILD_PLATFORM})
161+
set(WAMR_BUILD_SIMDE 0)
162+
else()
163+
include (${IWASM_DIR}/libraries/simde/simde.cmake)
164+
set (WAMR_BUILD_SIMDE 1)
165+
endif()
166+
endif ()
167+
158168
if (WAMR_BUILD_WASM_CACHE EQUAL 1)
159169
include (${WAMR_ROOT_DIR}/build-scripts/involve_boringssl.cmake)
160170
endif ()

core/config.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -322,6 +322,12 @@
322322
#define WASM_ENABLE_SIMD 0
323323
#endif
324324

325+
/* SIMDe (used in the fast interpreter for SIMD opcodes) is disabled by
326+
default; it stays off unless WASM_ENABLE_SIMDE is already defined */
327+
#ifndef WASM_ENABLE_SIMDE
328+
#define WASM_ENABLE_SIMDE 0
329+
#endif
330+
325331
/* GC performance profiling */
326332
#ifndef WASM_ENABLE_GC_PERF_PROFILING
327333
#define WASM_ENABLE_GC_PERF_PROFILING 0

core/iwasm/common/wasm_loader_common.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,8 @@ is_valid_value_type(uint8 type)
151151
bool
152152
is_valid_value_type_for_interpreter(uint8 value_type)
153153
{
154-
#if (WASM_ENABLE_WAMR_COMPILER == 0) && (WASM_ENABLE_JIT == 0)
154+
#if (WASM_ENABLE_WAMR_COMPILER == 0) && (WASM_ENABLE_JIT == 0) \
155+
&& (WASM_ENABLE_FAST_INTERP == 0)
155156
/*
156157
* Note: regardless of WASM_ENABLE_SIMD, our interpreters don't have
157158
* SIMD implemented. It's safer to reject v128, especially for the

core/iwasm/common/wasm_runtime_common.h

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,10 @@ extern "C" {
3737
do { \
3838
*(int64 *)(addr) = (int64)(value); \
3939
} while (0)
40+
/* Write the V128 `value` to `addr` with a single store through a V128
 * pointer; `addr` is cast to (V128 *) and no alignment check is made.
 * NOTE(review): this looks like the variant for CPUs that support unaligned
 * access - confirm against the enclosing #if, which is outside this hunk. */
#define PUT_V128_TO_ADDR(addr, value) \
41+
do { \
42+
*(V128 *)(addr) = (value); \
43+
} while (0)
4044
#define PUT_F64_TO_ADDR(addr, value) \
4145
do { \
4246
*(float64 *)(addr) = (float64)(value); \
@@ -49,6 +53,7 @@ extern "C" {
4953
#define GET_I64_FROM_ADDR(addr) (*(int64 *)(addr))
5054
#define GET_F64_FROM_ADDR(addr) (*(float64 *)(addr))
5155
#define GET_REF_FROM_ADDR(addr) (*(void **)(addr))
56+
#define GET_V128_FROM_ADDR(addr) (*(V128 *)(addr))
5257

5358
/* For STORE opcodes */
5459
#define STORE_I64 PUT_I64_TO_ADDR
@@ -68,13 +73,20 @@ STORE_U8(void *addr, uint8_t value)
6873
*(uint8 *)addr = value;
6974
}
7075

76+
/* Store the V128 `value` at `addr` with one direct write through a V128
 * pointer. No alignment handling is performed here.
 * NOTE(review): presumably the variant used when the CPU supports unaligned
 * access - confirm against the enclosing #if, which is outside this hunk. */
static inline void
77+
STORE_V128(void *addr, V128 value)
78+
{
79+
*(V128 *)addr = value;
80+
}
81+
7182
/* For LOAD opcodes */
7283
#define LOAD_I64(addr) (*(int64 *)(addr))
7384
#define LOAD_F64(addr) (*(float64 *)(addr))
7485
#define LOAD_I32(addr) (*(int32 *)(addr))
7586
#define LOAD_U32(addr) (*(uint32 *)(addr))
7687
#define LOAD_I16(addr) (*(int16 *)(addr))
7788
#define LOAD_U16(addr) (*(uint16 *)(addr))
89+
#define LOAD_V128(addr) (*(V128 *)(addr))
7890

7991
#define STORE_PTR(addr, ptr) \
8092
do { \
@@ -83,6 +95,15 @@ STORE_U8(void *addr, uint8_t value)
8395

8496
#else /* WASM_CPU_SUPPORTS_UNALIGNED_ADDR_ACCESS != 0 */
8597

98+
/* Write the V128 `value` to `addr` as four consecutive 32-bit words, copying
 * lanes i32x4[0..3] in order, so only uint32-sized stores are issued.
 * Note that `value` is expanded four times, so it should not be an
 * expression with side effects. */
#define PUT_V128_TO_ADDR(addr, value) \
99+
do { \
100+
uint32 *addr_u32 = (uint32 *)(addr); \
101+
addr_u32[0] = (value).i32x4[0]; \
102+
addr_u32[1] = (value).i32x4[1]; \
103+
addr_u32[2] = (value).i32x4[2]; \
104+
addr_u32[3] = (value).i32x4[3]; \
105+
} while (0)
106+
86107
#define PUT_I64_TO_ADDR(addr, value) \
87108
do { \
88109
uint32 *addr_u32 = (uint32 *)(addr); \
@@ -124,6 +145,17 @@ STORE_U8(void *addr, uint8_t value)
124145
} while (0)
125146
#endif
126147

148+
/* Read a V128 from `addr` as four consecutive 32-bit words and return it.
 * Word k of `addr` becomes lane i32x4[k] of the result, so only
 * uint32-sized loads are issued. This mirrors the word-wise
 * PUT_V128_TO_ADDR macro. */
static inline V128
149+
GET_V128_FROM_ADDR(uint32 *addr)
150+
{
151+
V128 ret;
152+
ret.i32x4[0] = addr[0];
153+
ret.i32x4[1] = addr[1];
154+
ret.i32x4[2] = addr[2];
155+
ret.i32x4[3] = addr[3];
156+
return ret;
157+
}
158+
127159
static inline int64
128160
GET_I64_FROM_ADDR(uint32 *addr)
129161
{
@@ -239,7 +271,94 @@ STORE_U16(void *addr, uint16_t value)
239271
((uint8_t *)(addr))[0] = u.u8[0];
240272
((uint8_t *)(addr))[1] = u.u8[1];
241273
}
274+
275+
/* Store the 16-byte V128 `value` at `addr`, using the widest access size
 * that the alignment of `addr` allows: one V128 store when 16-byte aligned,
 * otherwise two uint64, four uint32, eight uint16, or sixteen uint8 stores.
 * The union `u` is used to view `value` as arrays of the narrower types.
 * NOTE(review): presumably the variant for CPUs without unaligned access
 * support - confirm against the enclosing #if, which is outside this hunk. */
static inline void
276+
STORE_V128(void *addr, V128 value)
277+
{
278+
uintptr_t addr_ = (uintptr_t)(addr);
279+
union {
280+
V128 val;
281+
uint64 u64[2];
282+
uint32 u32[4];
283+
uint16 u16[8];
284+
uint8 u8[16];
285+
} u;
286+
287+
/* 16-byte aligned: a single full-width store */
if ((addr_ & (uintptr_t)15) == 0) {
288+
*(V128 *)addr = value;
289+
}
290+
/* 8-byte aligned: two 64-bit stores */
else if ((addr_ & (uintptr_t)7) == 0) {
291+
u.val = value;
292+
((uint64 *)(addr))[0] = u.u64[0];
293+
((uint64 *)(addr))[1] = u.u64[1];
294+
}
295+
/* 4-byte aligned: four 32-bit stores */
else if ((addr_ & (uintptr_t)3) == 0) {
296+
u.val = value;
297+
((uint32 *)addr)[0] = u.u32[0];
298+
((uint32 *)addr)[1] = u.u32[1];
299+
((uint32 *)addr)[2] = u.u32[2];
300+
((uint32 *)addr)[3] = u.u32[3];
301+
}
302+
/* 2-byte aligned: eight 16-bit stores */
else if ((addr_ & (uintptr_t)1) == 0) {
303+
u.val = value;
304+
((uint16 *)addr)[0] = u.u16[0];
305+
((uint16 *)addr)[1] = u.u16[1];
306+
((uint16 *)addr)[2] = u.u16[2];
307+
((uint16 *)addr)[3] = u.u16[3];
308+
((uint16 *)addr)[4] = u.u16[4];
309+
((uint16 *)addr)[5] = u.u16[5];
310+
((uint16 *)addr)[6] = u.u16[6];
311+
((uint16 *)addr)[7] = u.u16[7];
312+
}
313+
/* odd address: fall back to byte-by-byte stores */
else {
314+
u.val = value;
315+
for (int i = 0; i < 16; i++)
316+
((uint8 *)addr)[i] = u.u8[i];
317+
}
318+
}
319+
242320
/* For LOAD opcodes */
321+
/* Load a 16-byte V128 from `addr`, using the widest access size that the
 * alignment of `addr` allows: one V128 load when 16-byte aligned, otherwise
 * two uint64, four uint32, eight uint16, or sixteen uint8 loads.
 * The union `u` is not initialized up front; every non-returning branch
 * below fills all 16 bytes of it before `u.val` is returned.
 * NOTE(review): presumably the variant for CPUs without unaligned access
 * support - confirm against the enclosing #if, which is outside this hunk. */
static inline V128
322+
LOAD_V128(void *addr)
323+
{
324+
uintptr_t addr1 = (uintptr_t)addr;
325+
union {
326+
V128 val;
327+
uint64 u64[2];
328+
uint32 u32[4];
329+
uint16 u16[8];
330+
uint8 u8[16];
331+
} u;
332+
/* 16-byte aligned: a single full-width load, returned directly */
if ((addr1 & (uintptr_t)15) == 0)
333+
return *(V128 *)addr;
334+
335+
/* 8-byte aligned: two 64-bit loads */
if ((addr1 & (uintptr_t)7) == 0) {
336+
u.u64[0] = ((uint64 *)addr)[0];
337+
u.u64[1] = ((uint64 *)addr)[1];
338+
}
339+
/* 4-byte aligned: four 32-bit loads */
else if ((addr1 & (uintptr_t)3) == 0) {
340+
u.u32[0] = ((uint32 *)addr)[0];
341+
u.u32[1] = ((uint32 *)addr)[1];
342+
u.u32[2] = ((uint32 *)addr)[2];
343+
u.u32[3] = ((uint32 *)addr)[3];
344+
}
345+
/* 2-byte aligned: eight 16-bit loads */
else if ((addr1 & (uintptr_t)1) == 0) {
346+
u.u16[0] = ((uint16 *)addr)[0];
347+
u.u16[1] = ((uint16 *)addr)[1];
348+
u.u16[2] = ((uint16 *)addr)[2];
349+
u.u16[3] = ((uint16 *)addr)[3];
350+
u.u16[4] = ((uint16 *)addr)[4];
351+
u.u16[5] = ((uint16 *)addr)[5];
352+
u.u16[6] = ((uint16 *)addr)[6];
353+
u.u16[7] = ((uint16 *)addr)[7];
354+
}
355+
/* odd address: fall back to byte-by-byte loads */
else {
356+
for (int i = 0; i < 16; i++)
357+
u.u8[i] = ((uint8 *)addr)[i];
358+
}
359+
return u.val;
360+
}
361+
243362
static inline int64
244363
LOAD_I64(void *addr)
245364
{

0 commit comments

Comments
 (0)