Skip to content

Commit 348f01f

Browse files
authored
[WebAssembly] Combine i128 to v16i8 for setcc & expand memcmp for 16 byte loads with simd128 (llvm#149461)
Fixes llvm#149230. Previously, even with SIMD enabled via `-mattr=+simd128`, the compiler could not use v128 to optimize loads and setcc of i128; it legalized them into consecutive i64 operations instead. This PR adds support for setcc of i128 by converting it to a v16i8 comparison reduced with `alltrue` (for `EQ`) or `anytrue` (for `NE`); consequently, this benefits memcmp of 16 bytes or more (when simd128 is present). The optimization is enabled when each comparison operand is either a load or an integer constant of type i128, the comparison code is `EQ` or `NE`, and the function does not carry the `NoImplicitFloat` attribute. Inspiration taken from RISCV's isel lowering.
1 parent 44f41f5 commit 348f01f

File tree

4 files changed

+150
-18
lines changed

4 files changed

+150
-18
lines changed

llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp

Lines changed: 52 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3383,15 +3383,65 @@ static SDValue TryMatchTrue(SDNode *N, EVT VecVT, SelectionDAG &DAG) {
33833383
return DAG.getZExtOrTrunc(Ret, DL, N->getValueType(0));
33843384
}
33853385

3386+
/// Try to convert a i128 comparison to a v16i8 comparison before type
3387+
/// legalization splits it up into chunks
3388+
static SDValue
3389+
combineVectorSizedSetCCEquality(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
3390+
const WebAssemblySubtarget *Subtarget) {
3391+
3392+
SDLoc DL(N);
3393+
SDValue X = N->getOperand(0);
3394+
SDValue Y = N->getOperand(1);
3395+
EVT VT = N->getValueType(0);
3396+
EVT OpVT = X.getValueType();
3397+
3398+
SelectionDAG &DAG = DCI.DAG;
3399+
if (DCI.DAG.getMachineFunction().getFunction().hasFnAttribute(
3400+
Attribute::NoImplicitFloat))
3401+
return SDValue();
3402+
3403+
ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
3404+
// We're looking for an oversized integer equality comparison with SIMD
3405+
if (!OpVT.isScalarInteger() || !OpVT.isByteSized() || OpVT != MVT::i128 ||
3406+
!Subtarget->hasSIMD128() || !isIntEqualitySetCC(CC))
3407+
return SDValue();
3408+
3409+
// Don't perform this combine if constructing the vector will be expensive.
3410+
auto IsVectorBitCastCheap = [](SDValue X) {
3411+
X = peekThroughBitcasts(X);
3412+
return isa<ConstantSDNode>(X) || X.getOpcode() == ISD::LOAD;
3413+
};
3414+
3415+
if (!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y))
3416+
return SDValue();
3417+
3418+
SDValue VecX = DAG.getBitcast(MVT::v16i8, X);
3419+
SDValue VecY = DAG.getBitcast(MVT::v16i8, Y);
3420+
SDValue Cmp = DAG.getSetCC(DL, MVT::v16i8, VecX, VecY, CC);
3421+
3422+
SDValue Intr =
3423+
DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
3424+
{DAG.getConstant(CC == ISD::SETEQ ? Intrinsic::wasm_alltrue
3425+
: Intrinsic::wasm_anytrue,
3426+
DL, MVT::i32),
3427+
Cmp});
3428+
3429+
return DAG.getSetCC(DL, VT, Intr, DAG.getConstant(0, DL, MVT::i32), CC);
3430+
}
3431+
33863432
static SDValue performSETCCCombine(SDNode *N,
3387-
TargetLowering::DAGCombinerInfo &DCI) {
3433+
TargetLowering::DAGCombinerInfo &DCI,
3434+
const WebAssemblySubtarget *Subtarget) {
33883435
if (!DCI.isBeforeLegalize())
33893436
return SDValue();
33903437

33913438
EVT VT = N->getValueType(0);
33923439
if (!VT.isScalarInteger())
33933440
return SDValue();
33943441

3442+
if (SDValue V = combineVectorSizedSetCCEquality(N, DCI, Subtarget))
3443+
return V;
3444+
33953445
SDValue LHS = N->getOperand(0);
33963446
if (LHS->getOpcode() != ISD::BITCAST)
33973447
return SDValue();
@@ -3571,7 +3621,7 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
35713621
case ISD::BITCAST:
35723622
return performBitcastCombine(N, DCI);
35733623
case ISD::SETCC:
3574-
return performSETCCCombine(N, DCI);
3624+
return performSETCCCombine(N, DCI, Subtarget);
35753625
case ISD::VECTOR_SHUFFLE:
35763626
return performVECTOR_SHUFFLECombine(N, DCI);
35773627
case ISD::SIGN_EXTEND:

llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,8 @@ WebAssemblyTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
147147

148148
Options.AllowOverlappingLoads = true;
149149

150-
// TODO: Teach WebAssembly backend about load v128.
150+
if (ST->hasSIMD128())
151+
Options.LoadSizes.push_back(16);
151152

152153
Options.LoadSizes.append({8, 4, 2, 1});
153154
Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);

llvm/test/CodeGen/WebAssembly/memcmp-expand.ll

Lines changed: 7 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2-
; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers | FileCheck %s
2+
; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 | FileCheck %s
33

44
target triple = "wasm32-unknown-unknown"
55

@@ -127,24 +127,16 @@ define i1 @memcmp_expand_8(ptr %a, ptr %b) {
127127
ret i1 %res
128128
}
129129

130-
; TODO: Should be using a single load i64x2 or equivalent in bitsizes
131130
define i1 @memcmp_expand_16(ptr %a, ptr %b) {
132131
; CHECK-LABEL: memcmp_expand_16:
133132
; CHECK: .functype memcmp_expand_16 (i32, i32) -> (i32)
134133
; CHECK-NEXT: # %bb.0:
135-
; CHECK-NEXT: i64.load $push7=, 0($0):p2align=0
136-
; CHECK-NEXT: i64.load $push6=, 0($1):p2align=0
137-
; CHECK-NEXT: i64.xor $push8=, $pop7, $pop6
138-
; CHECK-NEXT: i32.const $push0=, 8
139-
; CHECK-NEXT: i32.add $push3=, $0, $pop0
140-
; CHECK-NEXT: i64.load $push4=, 0($pop3):p2align=0
141-
; CHECK-NEXT: i32.const $push11=, 8
142-
; CHECK-NEXT: i32.add $push1=, $1, $pop11
143-
; CHECK-NEXT: i64.load $push2=, 0($pop1):p2align=0
144-
; CHECK-NEXT: i64.xor $push5=, $pop4, $pop2
145-
; CHECK-NEXT: i64.or $push9=, $pop8, $pop5
146-
; CHECK-NEXT: i64.eqz $push10=, $pop9
147-
; CHECK-NEXT: return $pop10
134+
; CHECK-NEXT: v128.load $push1=, 0($0):p2align=0
135+
; CHECK-NEXT: v128.load $push0=, 0($1):p2align=0
136+
; CHECK-NEXT: i8x16.eq $push2=, $pop1, $pop0
137+
; CHECK-NEXT: i8x16.all_true $push3=, $pop2
138+
; CHECK-NEXT: i32.eqz $push4=, $pop3
139+
; CHECK-NEXT: return $pop4
148140
%cmp_16 = call i32 @memcmp(ptr %a, ptr %b, i32 16)
149141
%res = icmp eq i32 %cmp_16, 0
150142
ret i1 %res
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 | FileCheck %s
3+
4+
target triple = "wasm32-unknown-unknown"
5+
6+
declare i32 @memcmp(ptr, ptr, i32)
7+
8+
define i1 @setcc_load(ptr %a, ptr %b) {
9+
; CHECK-LABEL: setcc_load:
10+
; CHECK: .functype setcc_load (i32, i32) -> (i32)
11+
; CHECK-NEXT: # %bb.0:
12+
; CHECK-NEXT: v128.load $push1=, 0($0):p2align=0
13+
; CHECK-NEXT: v128.load $push0=, 0($1):p2align=0
14+
; CHECK-NEXT: i8x16.eq $push2=, $pop1, $pop0
15+
; CHECK-NEXT: i8x16.all_true $push3=, $pop2
16+
; CHECK-NEXT: i32.eqz $push4=, $pop3
17+
; CHECK-NEXT: return $pop4
18+
%cmp_16 = call i32 @memcmp(ptr %a, ptr %b, i32 16)
19+
%res = icmp eq i32 %cmp_16, 0
20+
ret i1 %res
21+
}
22+
23+
; INFO: Negative test: noimplicitfloat disables simd
24+
define i1 @setcc_load_should_not_vectorize(ptr %a, ptr %b) noimplicitfloat {
25+
; CHECK-LABEL: setcc_load_should_not_vectorize:
26+
; CHECK: .functype setcc_load_should_not_vectorize (i32, i32) -> (i32)
27+
; CHECK-NEXT: # %bb.0:
28+
; CHECK-NEXT: i64.load $push4=, 0($0):p2align=0
29+
; CHECK-NEXT: i64.load $push3=, 0($1):p2align=0
30+
; CHECK-NEXT: i64.xor $push5=, $pop4, $pop3
31+
; CHECK-NEXT: i64.load $push1=, 8($0):p2align=0
32+
; CHECK-NEXT: i64.load $push0=, 8($1):p2align=0
33+
; CHECK-NEXT: i64.xor $push2=, $pop1, $pop0
34+
; CHECK-NEXT: i64.or $push6=, $pop5, $pop2
35+
; CHECK-NEXT: i64.eqz $push7=, $pop6
36+
; CHECK-NEXT: return $pop7
37+
%cmp_16 = call i32 @memcmp(ptr %a, ptr %b, i32 16)
38+
%res = icmp eq i32 %cmp_16, 0
39+
ret i1 %res
40+
}
41+
42+
define i1 @setcc_eq_const_i128(ptr %ptr) {
43+
; CHECK-LABEL: setcc_eq_const_i128:
44+
; CHECK: .functype setcc_eq_const_i128 (i32) -> (i32)
45+
; CHECK-NEXT: # %bb.0:
46+
; CHECK-NEXT: v128.load $push0=, 0($0)
47+
; CHECK-NEXT: v128.const $push1=, 6, 0
48+
; CHECK-NEXT: i8x16.eq $push2=, $pop0, $pop1
49+
; CHECK-NEXT: i8x16.all_true $push3=, $pop2
50+
; CHECK-NEXT: i32.eqz $push4=, $pop3
51+
; CHECK-NEXT: return $pop4
52+
%l = load i128, ptr %ptr
53+
%res = icmp eq i128 %l, 6
54+
ret i1 %res
55+
}
56+
57+
define i1 @setcc_ne_const_i128(ptr %ptr) {
58+
; CHECK-LABEL: setcc_ne_const_i128:
59+
; CHECK: .functype setcc_ne_const_i128 (i32) -> (i32)
60+
; CHECK-NEXT: # %bb.0:
61+
; CHECK-NEXT: v128.load $push0=, 0($0)
62+
; CHECK-NEXT: v128.const $push1=, 16, 0
63+
; CHECK-NEXT: i8x16.ne $push2=, $pop0, $pop1
64+
; CHECK-NEXT: v128.any_true $push3=, $pop2
65+
; CHECK-NEXT: return $pop3
66+
%l = load i128, ptr %ptr
67+
%res = icmp ne i128 %l, 16
68+
ret i1 %res
69+
}
70+
71+
; INFO: Negative test: only eq and ne are combined
72+
define i1 @setcc_slt_const_i128(ptr %ptr) {
73+
; CHECK-LABEL: setcc_slt_const_i128:
74+
; CHECK: .functype setcc_slt_const_i128 (i32) -> (i32)
75+
; CHECK-NEXT: # %bb.0:
76+
; CHECK-NEXT: i64.load $push2=, 0($0)
77+
; CHECK-NEXT: i64.const $push3=, 25
78+
; CHECK-NEXT: i64.lt_u $push4=, $pop2, $pop3
79+
; CHECK-NEXT: i64.load $push8=, 8($0)
80+
; CHECK-NEXT: local.tee $push7=, $1=, $pop8
81+
; CHECK-NEXT: i64.const $push0=, 0
82+
; CHECK-NEXT: i64.lt_s $push1=, $pop7, $pop0
83+
; CHECK-NEXT: i64.eqz $push5=, $1
84+
; CHECK-NEXT: i32.select $push6=, $pop4, $pop1, $pop5
85+
; CHECK-NEXT: return $pop6
86+
%l = load i128, ptr %ptr
87+
%res = icmp slt i128 %l, 25
88+
ret i1 %res
89+
}

0 commit comments

Comments
 (0)