Skip to content

Commit d7a29e5

Browse files
authored
[WebAssembly] Reapply #149461 with correct CondCode in combine of SETCC (#153703)
This PR reapplies #149461 In the original `combineVectorSizedSetCCEquality`, the result of setcc is being negated by returning setcc with the same cond code, leading to wrong logic. For example, with ```llvm %cmp_16 = call i32 @memcmp(ptr %a, ptr %b, i32 16) %res = icmp eq i32 %cmp_16, 0 ``` the original PR producese all_true and then also compares the result equal to 0 (using the same SETEQ in the returning setcc), meaning that semantically, it effectively is calling icmp ne. Instead, the PR should have use SETNE in the returning setcc, this way, all true return 1, then it is compared again ne 0, which is equivalent to icmp eq.
1 parent 79cf877 commit d7a29e5

File tree

4 files changed

+148
-18
lines changed

4 files changed

+148
-18
lines changed

llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp

Lines changed: 53 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3386,15 +3386,66 @@ static SDValue TryMatchTrue(SDNode *N, EVT VecVT, SelectionDAG &DAG) {
33863386
return DAG.getZExtOrTrunc(Ret, DL, N->getValueType(0));
33873387
}
33883388

3389+
/// Try to convert a i128 comparison to a v16i8 comparison before type
3390+
/// legalization splits it up into chunks
3391+
static SDValue
3392+
combineVectorSizedSetCCEquality(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
3393+
const WebAssemblySubtarget *Subtarget) {
3394+
3395+
SDLoc DL(N);
3396+
SDValue X = N->getOperand(0);
3397+
SDValue Y = N->getOperand(1);
3398+
EVT VT = N->getValueType(0);
3399+
EVT OpVT = X.getValueType();
3400+
3401+
SelectionDAG &DAG = DCI.DAG;
3402+
if (DCI.DAG.getMachineFunction().getFunction().hasFnAttribute(
3403+
Attribute::NoImplicitFloat))
3404+
return SDValue();
3405+
3406+
ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
3407+
// We're looking for an oversized integer equality comparison with SIMD
3408+
if (!OpVT.isScalarInteger() || !OpVT.isByteSized() || OpVT != MVT::i128 ||
3409+
!Subtarget->hasSIMD128() || !isIntEqualitySetCC(CC))
3410+
return SDValue();
3411+
3412+
// Don't perform this combine if constructing the vector will be expensive.
3413+
auto IsVectorBitCastCheap = [](SDValue X) {
3414+
X = peekThroughBitcasts(X);
3415+
return isa<ConstantSDNode>(X) || X.getOpcode() == ISD::LOAD;
3416+
};
3417+
3418+
if (!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y))
3419+
return SDValue();
3420+
3421+
SDValue VecX = DAG.getBitcast(MVT::v16i8, X);
3422+
SDValue VecY = DAG.getBitcast(MVT::v16i8, Y);
3423+
SDValue Cmp = DAG.getSetCC(DL, MVT::v16i8, VecX, VecY, CC);
3424+
3425+
SDValue Intr =
3426+
DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
3427+
{DAG.getConstant(CC == ISD::SETEQ ? Intrinsic::wasm_alltrue
3428+
: Intrinsic::wasm_anytrue,
3429+
DL, MVT::i32),
3430+
Cmp});
3431+
3432+
return DAG.getSetCC(DL, VT, Intr, DAG.getConstant(0, DL, MVT::i32),
3433+
ISD::SETNE);
3434+
}
3435+
33893436
static SDValue performSETCCCombine(SDNode *N,
3390-
TargetLowering::DAGCombinerInfo &DCI) {
3437+
TargetLowering::DAGCombinerInfo &DCI,
3438+
const WebAssemblySubtarget *Subtarget) {
33913439
if (!DCI.isBeforeLegalize())
33923440
return SDValue();
33933441

33943442
EVT VT = N->getValueType(0);
33953443
if (!VT.isScalarInteger())
33963444
return SDValue();
33973445

3446+
if (SDValue V = combineVectorSizedSetCCEquality(N, DCI, Subtarget))
3447+
return V;
3448+
33983449
SDValue LHS = N->getOperand(0);
33993450
if (LHS->getOpcode() != ISD::BITCAST)
34003451
return SDValue();
@@ -3574,7 +3625,7 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
35743625
case ISD::BITCAST:
35753626
return performBitcastCombine(N, DCI);
35763627
case ISD::SETCC:
3577-
return performSETCCCombine(N, DCI);
3628+
return performSETCCCombine(N, DCI, Subtarget);
35783629
case ISD::VECTOR_SHUFFLE:
35793630
return performVECTOR_SHUFFLECombine(N, DCI);
35803631
case ISD::SIGN_EXTEND:

llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,8 @@ WebAssemblyTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
147147

148148
Options.AllowOverlappingLoads = true;
149149

150-
// TODO: Teach WebAssembly backend about load v128.
150+
if (ST->hasSIMD128())
151+
Options.LoadSizes.push_back(16);
151152

152153
Options.LoadSizes.append({8, 4, 2, 1});
153154
Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);

llvm/test/CodeGen/WebAssembly/memcmp-expand.ll

Lines changed: 6 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2-
; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers | FileCheck %s
2+
; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 | FileCheck %s
33

44
target triple = "wasm32-unknown-unknown"
55

@@ -127,24 +127,15 @@ define i1 @memcmp_expand_8(ptr %a, ptr %b) {
127127
ret i1 %res
128128
}
129129

130-
; TODO: Should be using a single load i64x2 or equivalent in bitsizes
131130
define i1 @memcmp_expand_16(ptr %a, ptr %b) {
132131
; CHECK-LABEL: memcmp_expand_16:
133132
; CHECK: .functype memcmp_expand_16 (i32, i32) -> (i32)
134133
; CHECK-NEXT: # %bb.0:
135-
; CHECK-NEXT: i64.load $push7=, 0($0):p2align=0
136-
; CHECK-NEXT: i64.load $push6=, 0($1):p2align=0
137-
; CHECK-NEXT: i64.xor $push8=, $pop7, $pop6
138-
; CHECK-NEXT: i32.const $push0=, 8
139-
; CHECK-NEXT: i32.add $push3=, $0, $pop0
140-
; CHECK-NEXT: i64.load $push4=, 0($pop3):p2align=0
141-
; CHECK-NEXT: i32.const $push11=, 8
142-
; CHECK-NEXT: i32.add $push1=, $1, $pop11
143-
; CHECK-NEXT: i64.load $push2=, 0($pop1):p2align=0
144-
; CHECK-NEXT: i64.xor $push5=, $pop4, $pop2
145-
; CHECK-NEXT: i64.or $push9=, $pop8, $pop5
146-
; CHECK-NEXT: i64.eqz $push10=, $pop9
147-
; CHECK-NEXT: return $pop10
134+
; CHECK-NEXT: v128.load $push1=, 0($0):p2align=0
135+
; CHECK-NEXT: v128.load $push0=, 0($1):p2align=0
136+
; CHECK-NEXT: i8x16.eq $push2=, $pop1, $pop0
137+
; CHECK-NEXT: i8x16.all_true $push3=, $pop2
138+
; CHECK-NEXT: return $pop3
148139
%cmp_16 = call i32 @memcmp(ptr %a, ptr %b, i32 16)
149140
%res = icmp eq i32 %cmp_16, 0
150141
ret i1 %res
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 | FileCheck %s
3+
4+
target triple = "wasm32-unknown-unknown"
5+
6+
declare i32 @memcmp(ptr, ptr, i32)
7+
8+
define i1 @setcc_load(ptr %a, ptr %b) {
9+
; CHECK-LABEL: setcc_load:
10+
; CHECK: .functype setcc_load (i32, i32) -> (i32)
11+
; CHECK-NEXT: # %bb.0:
12+
; CHECK-NEXT: v128.load $push1=, 0($0):p2align=0
13+
; CHECK-NEXT: v128.load $push0=, 0($1):p2align=0
14+
; CHECK-NEXT: i8x16.eq $push2=, $pop1, $pop0
15+
; CHECK-NEXT: i8x16.all_true $push3=, $pop2
16+
; CHECK-NEXT: return $pop3
17+
%cmp_16 = call i32 @memcmp(ptr %a, ptr %b, i32 16)
18+
%res = icmp eq i32 %cmp_16, 0
19+
ret i1 %res
20+
}
21+
22+
; INFO: Negative test: noimplicitfloat disables simd
23+
define i1 @setcc_load_should_not_vectorize(ptr %a, ptr %b) noimplicitfloat {
24+
; CHECK-LABEL: setcc_load_should_not_vectorize:
25+
; CHECK: .functype setcc_load_should_not_vectorize (i32, i32) -> (i32)
26+
; CHECK-NEXT: # %bb.0:
27+
; CHECK-NEXT: i64.load $push4=, 0($0):p2align=0
28+
; CHECK-NEXT: i64.load $push3=, 0($1):p2align=0
29+
; CHECK-NEXT: i64.xor $push5=, $pop4, $pop3
30+
; CHECK-NEXT: i64.load $push1=, 8($0):p2align=0
31+
; CHECK-NEXT: i64.load $push0=, 8($1):p2align=0
32+
; CHECK-NEXT: i64.xor $push2=, $pop1, $pop0
33+
; CHECK-NEXT: i64.or $push6=, $pop5, $pop2
34+
; CHECK-NEXT: i64.eqz $push7=, $pop6
35+
; CHECK-NEXT: return $pop7
36+
%cmp_16 = call i32 @memcmp(ptr %a, ptr %b, i32 16)
37+
%res = icmp eq i32 %cmp_16, 0
38+
ret i1 %res
39+
}
40+
41+
define i1 @setcc_eq_const_i128(ptr %ptr) {
42+
; CHECK-LABEL: setcc_eq_const_i128:
43+
; CHECK: .functype setcc_eq_const_i128 (i32) -> (i32)
44+
; CHECK-NEXT: # %bb.0:
45+
; CHECK-NEXT: v128.load $push0=, 0($0)
46+
; CHECK-NEXT: v128.const $push1=, 6, 0
47+
; CHECK-NEXT: i8x16.eq $push2=, $pop0, $pop1
48+
; CHECK-NEXT: i8x16.all_true $push3=, $pop2
49+
; CHECK-NEXT: return $pop3
50+
%l = load i128, ptr %ptr
51+
%res = icmp eq i128 %l, 6
52+
ret i1 %res
53+
}
54+
55+
define i1 @setcc_ne_const_i128(ptr %ptr) {
56+
; CHECK-LABEL: setcc_ne_const_i128:
57+
; CHECK: .functype setcc_ne_const_i128 (i32) -> (i32)
58+
; CHECK-NEXT: # %bb.0:
59+
; CHECK-NEXT: v128.load $push0=, 0($0)
60+
; CHECK-NEXT: v128.const $push1=, 16, 0
61+
; CHECK-NEXT: i8x16.ne $push2=, $pop0, $pop1
62+
; CHECK-NEXT: v128.any_true $push3=, $pop2
63+
; CHECK-NEXT: return $pop3
64+
%l = load i128, ptr %ptr
65+
%res = icmp ne i128 %l, 16
66+
ret i1 %res
67+
}
68+
69+
; INFO: Negative test: only eq and ne works
70+
define i1 @setcc_slt_const_i128(ptr %ptr) {
71+
; CHECK-LABEL: setcc_slt_const_i128:
72+
; CHECK: .functype setcc_slt_const_i128 (i32) -> (i32)
73+
; CHECK-NEXT: # %bb.0:
74+
; CHECK-NEXT: i64.load $push2=, 0($0)
75+
; CHECK-NEXT: i64.const $push3=, 25
76+
; CHECK-NEXT: i64.lt_u $push4=, $pop2, $pop3
77+
; CHECK-NEXT: i64.load $push8=, 8($0)
78+
; CHECK-NEXT: local.tee $push7=, $1=, $pop8
79+
; CHECK-NEXT: i64.const $push0=, 0
80+
; CHECK-NEXT: i64.lt_s $push1=, $pop7, $pop0
81+
; CHECK-NEXT: i64.eqz $push5=, $1
82+
; CHECK-NEXT: i32.select $push6=, $pop4, $pop1, $pop5
83+
; CHECK-NEXT: return $pop6
84+
%l = load i128, ptr %ptr
85+
%res = icmp slt i128 %l, 25
86+
ret i1 %res
87+
}

0 commit comments

Comments
 (0)