Skip to content

Commit 39ac696

Browse files
committed
Reapply "[WebAssembly] Combine i128 to v16i8 for setcc & expand memcmp for 16 byte loads with simd128" (llvm#153360)
This reverts commit d32793c.
1 parent f9b9e9b commit 39ac696

File tree

4 files changed

+150
-18
lines changed

4 files changed

+150
-18
lines changed

llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp

Lines changed: 52 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3386,15 +3386,65 @@ static SDValue TryMatchTrue(SDNode *N, EVT VecVT, SelectionDAG &DAG) {
33863386
return DAG.getZExtOrTrunc(Ret, DL, N->getValueType(0));
33873387
}
33883388

3389+
/// Try to convert a i128 comparison to a v16i8 comparison before type
3390+
/// legalization splits it up into chunks
3391+
static SDValue
3392+
combineVectorSizedSetCCEquality(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
3393+
const WebAssemblySubtarget *Subtarget) {
3394+
3395+
SDLoc DL(N);
3396+
SDValue X = N->getOperand(0);
3397+
SDValue Y = N->getOperand(1);
3398+
EVT VT = N->getValueType(0);
3399+
EVT OpVT = X.getValueType();
3400+
3401+
SelectionDAG &DAG = DCI.DAG;
3402+
if (DCI.DAG.getMachineFunction().getFunction().hasFnAttribute(
3403+
Attribute::NoImplicitFloat))
3404+
return SDValue();
3405+
3406+
ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
3407+
// We're looking for an oversized integer equality comparison with SIMD
3408+
if (!OpVT.isScalarInteger() || !OpVT.isByteSized() || OpVT != MVT::i128 ||
3409+
!Subtarget->hasSIMD128() || !isIntEqualitySetCC(CC))
3410+
return SDValue();
3411+
3412+
// Don't perform this combine if constructing the vector will be expensive.
3413+
auto IsVectorBitCastCheap = [](SDValue X) {
3414+
X = peekThroughBitcasts(X);
3415+
return isa<ConstantSDNode>(X) || X.getOpcode() == ISD::LOAD;
3416+
};
3417+
3418+
if (!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y))
3419+
return SDValue();
3420+
3421+
SDValue VecX = DAG.getBitcast(MVT::v16i8, X);
3422+
SDValue VecY = DAG.getBitcast(MVT::v16i8, Y);
3423+
SDValue Cmp = DAG.getSetCC(DL, MVT::v16i8, VecX, VecY, CC);
3424+
3425+
SDValue Intr =
3426+
DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
3427+
{DAG.getConstant(CC == ISD::SETEQ ? Intrinsic::wasm_alltrue
3428+
: Intrinsic::wasm_anytrue,
3429+
DL, MVT::i32),
3430+
Cmp});
3431+
3432+
return DAG.getSetCC(DL, VT, Intr, DAG.getConstant(0, DL, MVT::i32), CC);
3433+
}
3434+
33893435
static SDValue performSETCCCombine(SDNode *N,
3390-
TargetLowering::DAGCombinerInfo &DCI) {
3436+
TargetLowering::DAGCombinerInfo &DCI,
3437+
const WebAssemblySubtarget *Subtarget) {
33913438
if (!DCI.isBeforeLegalize())
33923439
return SDValue();
33933440

33943441
EVT VT = N->getValueType(0);
33953442
if (!VT.isScalarInteger())
33963443
return SDValue();
33973444

3445+
if (SDValue V = combineVectorSizedSetCCEquality(N, DCI, Subtarget))
3446+
return V;
3447+
33983448
SDValue LHS = N->getOperand(0);
33993449
if (LHS->getOpcode() != ISD::BITCAST)
34003450
return SDValue();
@@ -3574,7 +3624,7 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
35743624
case ISD::BITCAST:
35753625
return performBitcastCombine(N, DCI);
35763626
case ISD::SETCC:
3577-
return performSETCCCombine(N, DCI);
3627+
return performSETCCCombine(N, DCI, Subtarget);
35783628
case ISD::VECTOR_SHUFFLE:
35793629
return performVECTOR_SHUFFLECombine(N, DCI);
35803630
case ISD::SIGN_EXTEND:

llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,8 @@ WebAssemblyTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
147147

148148
Options.AllowOverlappingLoads = true;
149149

150-
// TODO: Teach WebAssembly backend about load v128.
150+
if (ST->hasSIMD128())
151+
Options.LoadSizes.push_back(16);
151152

152153
Options.LoadSizes.append({8, 4, 2, 1});
153154
Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);

llvm/test/CodeGen/WebAssembly/memcmp-expand.ll

Lines changed: 7 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2-
; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers | FileCheck %s
2+
; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 | FileCheck %s
33

44
target triple = "wasm32-unknown-unknown"
55

@@ -127,24 +127,16 @@ define i1 @memcmp_expand_8(ptr %a, ptr %b) {
127127
ret i1 %res
128128
}
129129

130-
; TODO: Should be using a single load i64x2 or equivalent in bitsizes
131130
define i1 @memcmp_expand_16(ptr %a, ptr %b) {
132131
; CHECK-LABEL: memcmp_expand_16:
133132
; CHECK: .functype memcmp_expand_16 (i32, i32) -> (i32)
134133
; CHECK-NEXT: # %bb.0:
135-
; CHECK-NEXT: i64.load $push7=, 0($0):p2align=0
136-
; CHECK-NEXT: i64.load $push6=, 0($1):p2align=0
137-
; CHECK-NEXT: i64.xor $push8=, $pop7, $pop6
138-
; CHECK-NEXT: i32.const $push0=, 8
139-
; CHECK-NEXT: i32.add $push3=, $0, $pop0
140-
; CHECK-NEXT: i64.load $push4=, 0($pop3):p2align=0
141-
; CHECK-NEXT: i32.const $push11=, 8
142-
; CHECK-NEXT: i32.add $push1=, $1, $pop11
143-
; CHECK-NEXT: i64.load $push2=, 0($pop1):p2align=0
144-
; CHECK-NEXT: i64.xor $push5=, $pop4, $pop2
145-
; CHECK-NEXT: i64.or $push9=, $pop8, $pop5
146-
; CHECK-NEXT: i64.eqz $push10=, $pop9
147-
; CHECK-NEXT: return $pop10
134+
; CHECK-NEXT: v128.load $push1=, 0($0):p2align=0
135+
; CHECK-NEXT: v128.load $push0=, 0($1):p2align=0
136+
; CHECK-NEXT: i8x16.eq $push2=, $pop1, $pop0
137+
; CHECK-NEXT: i8x16.all_true $push3=, $pop2
138+
; CHECK-NEXT: i32.eqz $push4=, $pop3
139+
; CHECK-NEXT: return $pop4
148140
%cmp_16 = call i32 @memcmp(ptr %a, ptr %b, i32 16)
149141
%res = icmp eq i32 %cmp_16, 0
150142
ret i1 %res
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 | FileCheck %s
3+
4+
target triple = "wasm32-unknown-unknown"
5+
6+
declare i32 @memcmp(ptr, ptr, i32)
7+
8+
; Compares 16 bytes at %a and %b via memcmp and tests the result against zero.
; The CHECK lines expect two v128 loads, i8x16.eq, i8x16.all_true and i32.eqz.
; NOTE(review): i32.eqz of all_true is 1 when some byte differs, which looks
; inverted for `memcmp(...) == 0` - confirm these expectations.
define i1 @setcc_load(ptr %a, ptr %b) {
9+
; CHECK-LABEL: setcc_load:
10+
; CHECK: .functype setcc_load (i32, i32) -> (i32)
11+
; CHECK-NEXT: # %bb.0:
12+
; CHECK-NEXT: v128.load $push1=, 0($0):p2align=0
13+
; CHECK-NEXT: v128.load $push0=, 0($1):p2align=0
14+
; CHECK-NEXT: i8x16.eq $push2=, $pop1, $pop0
15+
; CHECK-NEXT: i8x16.all_true $push3=, $pop2
16+
; CHECK-NEXT: i32.eqz $push4=, $pop3
17+
; CHECK-NEXT: return $pop4
18+
%cmp_16 = call i32 @memcmp(ptr %a, ptr %b, i32 16)
19+
%res = icmp eq i32 %cmp_16, 0
20+
ret i1 %res
21+
}
22+
23+
; INFO: Negative test: the noimplicitfloat attribute disables the SIMD combine
24+
; Same 16-byte memcmp-equals-zero pattern, but the function is marked
; noimplicitfloat. The CHECK lines expect scalar i64 loads combined with
; xor/or and i64.eqz, with no v128 instructions.
define i1 @setcc_load_should_not_vectorize(ptr %a, ptr %b) noimplicitfloat {
25+
; CHECK-LABEL: setcc_load_should_not_vectorize:
26+
; CHECK: .functype setcc_load_should_not_vectorize (i32, i32) -> (i32)
27+
; CHECK-NEXT: # %bb.0:
28+
; CHECK-NEXT: i64.load $push4=, 0($0):p2align=0
29+
; CHECK-NEXT: i64.load $push3=, 0($1):p2align=0
30+
; CHECK-NEXT: i64.xor $push5=, $pop4, $pop3
31+
; CHECK-NEXT: i64.load $push1=, 8($0):p2align=0
32+
; CHECK-NEXT: i64.load $push0=, 8($1):p2align=0
33+
; CHECK-NEXT: i64.xor $push2=, $pop1, $pop0
34+
; CHECK-NEXT: i64.or $push6=, $pop5, $pop2
35+
; CHECK-NEXT: i64.eqz $push7=, $pop6
36+
; CHECK-NEXT: return $pop7
37+
%cmp_16 = call i32 @memcmp(ptr %a, ptr %b, i32 16)
38+
%res = icmp eq i32 %cmp_16, 0
39+
ret i1 %res
40+
}
41+
42+
; Loads an i128 from %ptr and tests it for equality with the constant 6.
; The CHECK lines expect a v128 load and constant, i8x16.eq, i8x16.all_true
; and a trailing i32.eqz.
; NOTE(review): i32.eqz of all_true is 1 when the values differ, which looks
; inverted for `icmp eq` - confirm these expectations.
define i1 @setcc_eq_const_i128(ptr %ptr) {
43+
; CHECK-LABEL: setcc_eq_const_i128:
44+
; CHECK: .functype setcc_eq_const_i128 (i32) -> (i32)
45+
; CHECK-NEXT: # %bb.0:
46+
; CHECK-NEXT: v128.load $push0=, 0($0)
47+
; CHECK-NEXT: v128.const $push1=, 6, 0
48+
; CHECK-NEXT: i8x16.eq $push2=, $pop0, $pop1
49+
; CHECK-NEXT: i8x16.all_true $push3=, $pop2
50+
; CHECK-NEXT: i32.eqz $push4=, $pop3
51+
; CHECK-NEXT: return $pop4
52+
%l = load i128, ptr %ptr
53+
%res = icmp eq i128 %l, 6
54+
ret i1 %res
55+
}
56+
57+
; Loads an i128 from %ptr and tests it for inequality with the constant 16.
; The CHECK lines expect i8x16.ne reduced with v128.any_true, whose result is
; returned directly.
define i1 @setcc_ne_const_i128(ptr %ptr) {
58+
; CHECK-LABEL: setcc_ne_const_i128:
59+
; CHECK: .functype setcc_ne_const_i128 (i32) -> (i32)
60+
; CHECK-NEXT: # %bb.0:
61+
; CHECK-NEXT: v128.load $push0=, 0($0)
62+
; CHECK-NEXT: v128.const $push1=, 16, 0
63+
; CHECK-NEXT: i8x16.ne $push2=, $pop0, $pop1
64+
; CHECK-NEXT: v128.any_true $push3=, $pop2
65+
; CHECK-NEXT: return $pop3
66+
%l = load i128, ptr %ptr
67+
%res = icmp ne i128 %l, 16
68+
ret i1 %res
69+
}
70+
71+
; INFO: Negative test: only eq and ne comparisons are combined
72+
; Loads an i128 from %ptr and performs a signed less-than against 25.
; The CHECK lines expect scalar i64 loads and compares joined by i32.select,
; with no v128 instructions.
define i1 @setcc_slt_const_i128(ptr %ptr) {
73+
; CHECK-LABEL: setcc_slt_const_i128:
74+
; CHECK: .functype setcc_slt_const_i128 (i32) -> (i32)
75+
; CHECK-NEXT: # %bb.0:
76+
; CHECK-NEXT: i64.load $push2=, 0($0)
77+
; CHECK-NEXT: i64.const $push3=, 25
78+
; CHECK-NEXT: i64.lt_u $push4=, $pop2, $pop3
79+
; CHECK-NEXT: i64.load $push8=, 8($0)
80+
; CHECK-NEXT: local.tee $push7=, $1=, $pop8
81+
; CHECK-NEXT: i64.const $push0=, 0
82+
; CHECK-NEXT: i64.lt_s $push1=, $pop7, $pop0
83+
; CHECK-NEXT: i64.eqz $push5=, $1
84+
; CHECK-NEXT: i32.select $push6=, $pop4, $pop1, $pop5
85+
; CHECK-NEXT: return $pop6
86+
%l = load i128, ptr %ptr
87+
%res = icmp slt i128 %l, 25
88+
ret i1 %res
89+
}

0 commit comments

Comments
 (0)