@@ -1023,6 +1023,272 @@ pub(crate) fn memcopy(
    Ok(inc_pc(pc)?)
}

+#[cfg(feature = "experimental")]
+#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
+fn slices_equal_neon(a: &[u8], b: &[u8]) -> bool {
+    use std::arch::aarch64::*;
+
+    if a.len() != b.len() {
+        return false;
+    }
+
+    let len = a.len();
+    let mut i = 0;
+    const CHUNK: usize = 96;
+
+    // For small slices the SIMD setup overhead outweighs the benefit,
+    // so fall back to the scalar comparison.
+    if len < CHUNK {
+        return slices_equal_fallback(a, b);
+    }
+
+    unsafe {
+        while i + CHUNK <= len {
+            // Compare six 16-byte blocks per iteration and AND the results:
+            // a lane stays 0xFF only if every corresponding byte matched.
+            let mut cmp =
+                vceqq_u8(vld1q_u8(a.as_ptr().add(i)), vld1q_u8(b.as_ptr().add(i)));
+
+            cmp = vandq_u8(
+                cmp,
+                vceqq_u8(
+                    vld1q_u8(a.as_ptr().add(i + 16)),
+                    vld1q_u8(b.as_ptr().add(i + 16)),
+                ),
+            );
+            cmp = vandq_u8(
+                cmp,
+                vceqq_u8(
+                    vld1q_u8(a.as_ptr().add(i + 32)),
+                    vld1q_u8(b.as_ptr().add(i + 32)),
+                ),
+            );
+            cmp = vandq_u8(
+                cmp,
+                vceqq_u8(
+                    vld1q_u8(a.as_ptr().add(i + 48)),
+                    vld1q_u8(b.as_ptr().add(i + 48)),
+                ),
+            );
+            cmp = vandq_u8(
+                cmp,
+                vceqq_u8(
+                    vld1q_u8(a.as_ptr().add(i + 64)),
+                    vld1q_u8(b.as_ptr().add(i + 64)),
+                ),
+            );
+            cmp = vandq_u8(
+                cmp,
+                vceqq_u8(
+                    vld1q_u8(a.as_ptr().add(i + 80)),
+                    vld1q_u8(b.as_ptr().add(i + 80)),
+                ),
+            );
+
+            // All lanes must be 0xFF. vminvq_u8 reduces to the smallest lane,
+            // so a single mismatched byte drives the minimum to 0x00.
+            // (vmaxvq_u8 would only detect that *some* lane matched.)
+            if vminvq_u8(cmp) != 0xFF {
+                return false;
+            }
+
+            i += CHUNK;
+        }
+
+        // Scalar comparison for the remainder
+        a[i..] == b[i..]
+    }
+}
+
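For orientation, here is a minimal single-register version of the same NEON idea, without the six-way unrolling above. The helper name `slices_equal_neon_simple` is hypothetical and not part of this diff; it only illustrates the load/compare/reduce pattern.

#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
unsafe fn slices_equal_neon_simple(a: &[u8], b: &[u8]) -> bool {
    use std::arch::aarch64::*;
    debug_assert_eq!(a.len(), b.len());
    let mut i = 0;
    // One 16-byte comparison per iteration.
    while i + 16 <= a.len() {
        let eq = vceqq_u8(vld1q_u8(a.as_ptr().add(i)), vld1q_u8(b.as_ptr().add(i)));
        // vminvq_u8 returns the smallest lane; any mismatched byte yields 0x00.
        if vminvq_u8(eq) != 0xFF {
            return false;
        }
        i += 16;
    }
    a[i..] == b[i..]
}
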
+#[cfg(feature = "experimental")]
+#[cfg(all(target_arch = "x86_64", target_feature = "avx2"))]
+fn slices_equal_avx2(a: &[u8], b: &[u8]) -> bool {
+    use std::arch::x86_64::*;
+
+    if a.len() != b.len() {
+        return false;
+    }
+
+    let len = a.len();
+    let mut i = 0;
+    const CHUNK: usize = 256;
+
+    // For small slices the SIMD setup overhead outweighs the benefit,
+    // so fall back to the scalar comparison.
+    if len < CHUNK {
+        return slices_equal_fallback(a, b);
+    }
+
+    unsafe {
+        // _mm256_movemask_epi8 packs the high bit of each byte lane into an
+        // i32, so an all-equal 32-byte block yields -1 (all 32 bits set).
+        // Independent accumulators keep the AND dependency chains short.
+        let mut aggregate_mask_a = -1i32;
+        let mut aggregate_mask_b = -1i32;
+        let mut aggregate_mask_c = -1i32;
+        let mut aggregate_mask_d = -1i32;
+        let mut aggregate_mask_a_b = -1i32;
+        let mut aggregate_mask_c_d = -1i32;
+
+        while i + CHUNK <= len {
+            // Eight unaligned 32-byte loads from each slice per iteration.
+            let simd_a1 = _mm256_loadu_si256(a.as_ptr().add(i) as *const _);
+            let simd_b1 = _mm256_loadu_si256(b.as_ptr().add(i) as *const _);
+
+            let simd_a2 = _mm256_loadu_si256(a.as_ptr().add(i + 32) as *const _);
+            let simd_b2 = _mm256_loadu_si256(b.as_ptr().add(i + 32) as *const _);
+
+            let simd_a3 = _mm256_loadu_si256(a.as_ptr().add(i + 64) as *const _);
+            let simd_b3 = _mm256_loadu_si256(b.as_ptr().add(i + 64) as *const _);
+
+            let simd_a4 = _mm256_loadu_si256(a.as_ptr().add(i + 96) as *const _);
+            let simd_b4 = _mm256_loadu_si256(b.as_ptr().add(i + 96) as *const _);
+
+            let simd_a5 = _mm256_loadu_si256(a.as_ptr().add(i + 128) as *const _);
+            let simd_b5 = _mm256_loadu_si256(b.as_ptr().add(i + 128) as *const _);
+
+            let simd_a6 = _mm256_loadu_si256(a.as_ptr().add(i + 160) as *const _);
+            let simd_b6 = _mm256_loadu_si256(b.as_ptr().add(i + 160) as *const _);
+
+            let simd_a7 = _mm256_loadu_si256(a.as_ptr().add(i + 192) as *const _);
+            let simd_b7 = _mm256_loadu_si256(b.as_ptr().add(i + 192) as *const _);
+
+            let simd_a8 = _mm256_loadu_si256(a.as_ptr().add(i + 224) as *const _);
+            let simd_b8 = _mm256_loadu_si256(b.as_ptr().add(i + 224) as *const _);
+
+            let cmp1 = _mm256_movemask_epi8(_mm256_cmpeq_epi8(simd_a1, simd_b1));
+            let cmp2 = _mm256_movemask_epi8(_mm256_cmpeq_epi8(simd_a2, simd_b2));
+            let cmp3 = _mm256_movemask_epi8(_mm256_cmpeq_epi8(simd_a3, simd_b3));
+            let cmp4 = _mm256_movemask_epi8(_mm256_cmpeq_epi8(simd_a4, simd_b4));
+            let cmp5 = _mm256_movemask_epi8(_mm256_cmpeq_epi8(simd_a5, simd_b5));
+            let cmp6 = _mm256_movemask_epi8(_mm256_cmpeq_epi8(simd_a6, simd_b6));
+            let cmp7 = _mm256_movemask_epi8(_mm256_cmpeq_epi8(simd_a7, simd_b7));
+            let cmp8 = _mm256_movemask_epi8(_mm256_cmpeq_epi8(simd_a8, simd_b8));
+
+            aggregate_mask_a &= cmp1 & cmp2;
+            aggregate_mask_b &= cmp3 & cmp4;
+            aggregate_mask_c &= cmp5 & cmp6;
+            aggregate_mask_d &= cmp7 & cmp8;
+
+            aggregate_mask_a_b &= aggregate_mask_a & aggregate_mask_b;
+            aggregate_mask_c_d &= aggregate_mask_c & aggregate_mask_d;
+
+            // Any cleared bit means at least one byte differed in this chunk.
+            if aggregate_mask_a_b & aggregate_mask_c_d != -1i32 {
+                return false;
+            }
+
+            i += CHUNK;
+        }
+
+        // Scalar comparison for the remainder
+        a[i..] == b[i..]
+    }
+}
+
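An alternative to the compile-time `target_feature` gating used here is runtime detection, which lets a generic x86_64 build still take the AVX2 path on capable CPUs. A sketch under that assumption; `slice_eq_runtime` and the re-gated inner function are hypothetical, not part of this diff:

#[cfg(target_arch = "x86_64")]
fn slice_eq_runtime(a: &[u8], b: &[u8]) -> bool {
    // Hypothetical re-declaration of the AVX2 routine: #[target_feature]
    // lets it be compiled for AVX2 even when the crate's baseline target
    // does not enable it, at the cost of making the function unsafe to call.
    #[target_feature(enable = "avx2")]
    unsafe fn avx2_path(a: &[u8], b: &[u8]) -> bool {
        // The body would be identical to slices_equal_avx2 above; the scalar
        // fallback stands in here only to keep the sketch self-contained.
        slices_equal_fallback(a, b)
    }

    if std::is_x86_feature_detected!("avx2") {
        // SAFETY: only reached when the CPU reports AVX2 support.
        unsafe { avx2_path(a, b) }
    } else {
        slices_equal_fallback(a, b)
    }
}
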
+#[cfg(feature = "experimental")]
+// _mm512_cmpeq_epi8_mask requires AVX-512BW (which implies AVX-512F).
+#[cfg(all(target_arch = "x86_64", target_feature = "avx512bw"))]
+fn slices_equal_avx512(a: &[u8], b: &[u8]) -> bool {
+    use std::arch::x86_64::*;
+
+    if a.len() != b.len() {
+        return false;
+    }
+
+    let len = a.len();
+    let mut i = 0;
+    const CHUNK: usize = 512;
+
+    // For small slices the SIMD setup overhead outweighs the benefit,
+    // so fall back to the scalar comparison.
+    if len < CHUNK {
+        return slices_equal_fallback(a, b);
+    }
+
+    unsafe {
+        while i + CHUNK <= len {
+            // Eight unaligned 64-byte loads from each slice per iteration.
+            let simd_a1 = _mm512_loadu_si512(a.as_ptr().add(i) as *const _);
+            let simd_b1 = _mm512_loadu_si512(b.as_ptr().add(i) as *const _);
+
+            let simd_a2 = _mm512_loadu_si512(a.as_ptr().add(i + 64) as *const _);
+            let simd_b2 = _mm512_loadu_si512(b.as_ptr().add(i + 64) as *const _);
+
+            let simd_a3 = _mm512_loadu_si512(a.as_ptr().add(i + 128) as *const _);
+            let simd_b3 = _mm512_loadu_si512(b.as_ptr().add(i + 128) as *const _);
+
+            let simd_a4 = _mm512_loadu_si512(a.as_ptr().add(i + 192) as *const _);
+            let simd_b4 = _mm512_loadu_si512(b.as_ptr().add(i + 192) as *const _);
+
+            let simd_a5 = _mm512_loadu_si512(a.as_ptr().add(i + 256) as *const _);
+            let simd_b5 = _mm512_loadu_si512(b.as_ptr().add(i + 256) as *const _);
+
+            let simd_a6 = _mm512_loadu_si512(a.as_ptr().add(i + 320) as *const _);
+            let simd_b6 = _mm512_loadu_si512(b.as_ptr().add(i + 320) as *const _);
+
+            let simd_a7 = _mm512_loadu_si512(a.as_ptr().add(i + 384) as *const _);
+            let simd_b7 = _mm512_loadu_si512(b.as_ptr().add(i + 384) as *const _);
+
+            let simd_a8 = _mm512_loadu_si512(a.as_ptr().add(i + 448) as *const _);
+            let simd_b8 = _mm512_loadu_si512(b.as_ptr().add(i + 448) as *const _);
+
+            // Each comparison yields a 64-bit mask with one bit per byte lane.
+            let cmp1 = _mm512_cmpeq_epi8_mask(simd_a1, simd_b1);
+            let cmp2 = _mm512_cmpeq_epi8_mask(simd_a2, simd_b2);
+            let cmp3 = _mm512_cmpeq_epi8_mask(simd_a3, simd_b3);
+            let cmp4 = _mm512_cmpeq_epi8_mask(simd_a4, simd_b4);
+            let cmp5 = _mm512_cmpeq_epi8_mask(simd_a5, simd_b5);
+            let cmp6 = _mm512_cmpeq_epi8_mask(simd_a6, simd_b6);
+            let cmp7 = _mm512_cmpeq_epi8_mask(simd_a7, simd_b7);
+            let cmp8 = _mm512_cmpeq_epi8_mask(simd_a8, simd_b8);
+
+            // Combine the masks as a balanced tree to keep the AND
+            // dependency chain short.
+            let cmp1_2 = cmp1 & cmp2;
+            let cmp3_4 = cmp3 & cmp4;
+            let cmp5_6 = cmp5 & cmp6;
+            let cmp7_8 = cmp7 & cmp8;
+
+            let cmp1_4 = cmp1_2 & cmp3_4;
+            let cmp5_8 = cmp5_6 & cmp7_8;
+
+            let full_cmp = cmp1_4 & cmp5_8;
+
+            // All 512 byte lanes must have matched.
+            if full_cmp != u64::MAX {
+                return false;
+            }
+
+            i += CHUNK;
+        }
+
+        // Scalar comparison for the remainder
+        a[i..] == b[i..]
+    }
+}
+
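All three SIMD paths share one shape: compare `CHUNK` bytes per iteration (six NEON registers, eight AVX2 registers, or eight AVX-512 registers), bail out on the first mismatching block, and hand the tail to the scalar fallback. A scalar sketch of that common structure; the helper is hypothetical, for illustration only:

fn slices_equal_chunked(a: &[u8], b: &[u8], chunk: usize) -> bool {
    if a.len() != b.len() {
        return false;
    }
    let mut i = 0;
    // Main loop: whole chunks only.
    while i + chunk <= a.len() {
        if a[i..i + chunk] != b[i..i + chunk] {
            return false;
        }
        i += chunk;
    }
    // Remainder shorter than one chunk.
    a[i..] == b[i..]
}
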
+#[inline]
+fn slices_equal_fallback(a: &[u8], b: &[u8]) -> bool {
+    a == b
+}
+
+#[inline]
+fn slice_eq(a: &[u8], b: &[u8]) -> bool {
+    #[cfg(feature = "experimental")]
+    {
+        // Dispatch is resolved at compile time from the enabled target
+        // features; the guards are mutually exclusive, so at most one
+        // SIMD path is compiled in.
+        #[cfg(all(target_arch = "x86_64", target_feature = "avx512bw"))]
+        {
+            return slices_equal_avx512(a, b);
+        }
+        #[cfg(all(
+            target_arch = "x86_64",
+            target_feature = "avx2",
+            not(target_feature = "avx512bw")
+        ))]
+        {
+            return slices_equal_avx2(a, b);
+        }
+        #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
+        {
+            return slices_equal_neon(a, b);
+        }
+
+        #[allow(unreachable_code)]
+        slices_equal_fallback(a, b)
+    }
+    #[cfg(not(feature = "experimental"))]
+    {
+        slices_equal_fallback(a, b)
+    }
+}
+
+#[test]
+fn slice_eq_test() {
+    let a = [1u8; 20000];
+    let b = [1u8; 20000];
+
+    assert!(slice_eq(&a, &b));
+
+    // A single differing byte must be detected.
+    let mut c = [1u8; 20000];
+    c[19999] = 2;
+    assert!(!slice_eq(&a, &c));
+}
+
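A broader test could exercise lengths around the chunk boundaries so both the SIMD loop and the scalar remainder run, and cross-check against the fallback. A sketch; the test name and length set are illustrative assumptions:

#[test]
fn slice_eq_matches_fallback_on_varied_lengths() {
    for len in [0usize, 1, 95, 96, 97, 255, 256, 257, 511, 512, 513, 1000] {
        let a: Vec<u8> = (0..len).map(|x| (x % 251) as u8).collect();
        let mut b = a.clone();
        assert!(slice_eq(&a, &b));
        if len > 0 {
            // Flip one bit in the last byte; both paths must report inequality.
            b[len - 1] ^= 1;
            assert!(!slice_eq(&a, &b));
            assert_eq!(slice_eq(&a, &b), slices_equal_fallback(&a, &b));
        }
    }
}
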
pub(crate) fn memeq(
    memory: &mut MemoryInstance,
    result: &mut Word,
@@ -1031,7 +1297,9 @@ pub(crate) fn memeq(
    c: Word,
    d: Word,
) -> SimpleResult<()> {
-    *result = (memory.read(b, d)? == memory.read(c, d)?) as Word;
+    let range_a = memory.read(b, d)?;
+    let range_b = memory.read(c, d)?;
+    *result = slice_eq(range_a, range_b) as Word;
    Ok(inc_pc(pc)?)
}
