Skip to content

Commit f369a53

Browse files
[DAGCombiner] Fold select into partial.reduce.add operands. (#167857)
This generates better codegen when using partial reductions with predication.
```
partial_reduce_*mla(acc, sel(p, mul(*ext(a), *ext(b)), splat(0)), splat(1))
  -> partial_reduce_*mla(acc, sel(p, a, splat(0)), b)
partial.reduce.*mla(acc, sel(p, *ext(op), splat(0)), splat(1))
  -> partial.reduce.*mla(acc, sel(p, op, splat(0)), splat(trunc(1)))
```
1 parent 7c34848 commit f369a53

File tree

5 files changed

+247
-28
lines changed

5 files changed

+247
-28
lines changed

llvm/include/llvm/CodeGen/SelectionDAGNodes.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1968,6 +1968,10 @@ LLVM_ABI bool isOnesOrOnesSplat(SDValue N, bool AllowUndefs = false);
19681968
/// Build vector implicit truncation is allowed.
19691969
LLVM_ABI bool isZeroOrZeroSplat(SDValue N, bool AllowUndefs = false);
19701970

1971+
/// Return true if the value is a constant (+/-)0.0 floating-point value or a
1972+
/// splatted vector thereof (with no undefs unless \p AllowUndefs is set).
1973+
LLVM_ABI bool isZeroOrZeroSplatFP(SDValue N, bool AllowUndefs = false);
1974+
19711975
/// Return true if \p V is either an integer or FP constant.
19721976
inline bool isIntOrFPConstant(SDValue V) {
19731977
return isa<ConstantSDNode>(V) || isa<ConstantFPSDNode>(V);

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 54 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -13018,22 +13018,34 @@ SDValue DAGCombiner::visitPARTIAL_REDUCE_MLA(SDNode *N) {
1301813018
return SDValue();
1301913019
}
1302013020

13021-
// partial_reduce_*mla(acc, mul(ext(a), ext(b)), splat(1))
13021+
// partial_reduce_*mla(acc, mul(*ext(a), *ext(b)), splat(1))
1302213022
// -> partial_reduce_*mla(acc, a, b)
1302313023
//
13024-
// partial_reduce_*mla(acc, mul(ext(x), splat(C)), splat(1))
13025-
// -> partial_reduce_*mla(acc, x, C)
13024+
// partial_reduce_*mla(acc, mul(*ext(x), splat(C)), splat(1))
13025+
// -> partial_reduce_*mla(acc, x, splat(C))
1302613026
//
13027-
// partial_reduce_fmla(acc, fmul(fpext(a), fpext(b)), splat(1.0))
13028-
// -> partial_reduce_fmla(acc, a, b)
13027+
// partial_reduce_*mla(acc, sel(p, mul(*ext(a), *ext(b)), splat(0)), splat(1))
13028+
// -> partial_reduce_*mla(acc, sel(p, a, splat(0)), b)
13029+
//
13030+
// partial_reduce_*mla(acc, sel(p, mul(*ext(a), splat(C)), splat(0)), splat(1))
13031+
// -> partial_reduce_*mla(acc, sel(p, a, splat(0)), splat(C))
1302913032
SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) {
1303013033
SDLoc DL(N);
1303113034
auto *Context = DAG.getContext();
1303213035
SDValue Acc = N->getOperand(0);
1303313036
SDValue Op1 = N->getOperand(1);
1303413037
SDValue Op2 = N->getOperand(2);
13035-
1303613038
unsigned Opc = Op1->getOpcode();
13039+
13040+
// Handle predication by moving the SELECT into the operand of the MUL.
13041+
SDValue Pred;
13042+
if (Opc == ISD::VSELECT && (isZeroOrZeroSplat(Op1->getOperand(2)) ||
13043+
isZeroOrZeroSplatFP(Op1->getOperand(2)))) {
13044+
Pred = Op1->getOperand(0);
13045+
Op1 = Op1->getOperand(1);
13046+
Opc = Op1->getOpcode();
13047+
}
13048+
1303713049
if (Opc != ISD::MUL && Opc != ISD::FMUL && Opc != ISD::SHL)
1303813050
return SDValue();
1303913051

@@ -13068,6 +13080,19 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) {
1306813080
SDValue LHSExtOp = LHS->getOperand(0);
1306913081
EVT LHSExtOpVT = LHSExtOp.getValueType();
1307013082

13083+
// When Pred is set, replace Op with select(Pred, Op, splat(0)) and freeze
13084+
// OtherOp to keep the same semantics when moving the selects into the MUL
13085+
// operands.
13086+
auto ApplyPredicate = [&](SDValue &Op, SDValue &OtherOp) {
13087+
if (Pred) {
13088+
EVT OpVT = Op.getValueType();
13089+
SDValue Zero = OpVT.isFloatingPoint() ? DAG.getConstantFP(0.0, DL, OpVT)
13090+
: DAG.getConstant(0, DL, OpVT);
13091+
Op = DAG.getSelect(DL, OpVT, Pred, Op, Zero);
13092+
OtherOp = DAG.getFreeze(OtherOp);
13093+
}
13094+
};
13095+
1307113096
// partial_reduce_*mla(acc, mul(*ext(x), splat(C)), splat(1))
1307213097
// -> partial_reduce_*mla(acc, x, splat(C))
1307313098
APInt C;
@@ -13090,8 +13115,9 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) {
1309013115
TLI.getTypeToTransformTo(*Context, LHSExtOpVT)))
1309113116
return SDValue();
1309213117

13093-
return DAG.getNode(NewOpcode, DL, N->getValueType(0), Acc, LHSExtOp,
13094-
DAG.getConstant(CTrunc, DL, LHSExtOpVT));
13118+
SDValue C = DAG.getConstant(CTrunc, DL, LHSExtOpVT);
13119+
ApplyPredicate(C, LHSExtOp);
13120+
return DAG.getNode(NewOpcode, DL, N->getValueType(0), Acc, LHSExtOp, C);
1309513121
}
1309613122

1309713123
unsigned RHSOpcode = RHS->getOpcode();
@@ -13132,17 +13158,17 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) {
1313213158
TLI.getTypeToTransformTo(*Context, LHSExtOpVT)))
1313313159
return SDValue();
1313413160

13161+
ApplyPredicate(RHSExtOp, LHSExtOp);
1313513162
return DAG.getNode(NewOpc, DL, N->getValueType(0), Acc, LHSExtOp, RHSExtOp);
1313613163
}
1313713164

13138-
// partial.reduce.umla(acc, zext(op), splat(1))
13139-
// -> partial.reduce.umla(acc, op, splat(trunc(1)))
13140-
// partial.reduce.smla(acc, sext(op), splat(1))
13141-
// -> partial.reduce.smla(acc, op, splat(trunc(1)))
13165+
// partial.reduce.*mla(acc, *ext(op), splat(1))
13166+
// -> partial.reduce.*mla(acc, op, splat(trunc(1)))
1314213167
// partial.reduce.sumla(acc, sext(op), splat(1))
1314313168
// -> partial.reduce.smla(acc, op, splat(trunc(1)))
13144-
// partial.reduce.fmla(acc, fpext(op), splat(1.0))
13145-
// -> partial.reduce.fmla(acc, op, splat(1.0))
13169+
//
13170+
// partial.reduce.*mla(acc, sel(p, *ext(op), splat(0)), splat(1))
13171+
// -> partial.reduce.*mla(acc, sel(p, op, splat(0)), splat(trunc(1)))
1314613172
SDValue DAGCombiner::foldPartialReduceAdd(SDNode *N) {
1314713173
SDLoc DL(N);
1314813174
SDValue Acc = N->getOperand(0);
@@ -13152,7 +13178,15 @@ SDValue DAGCombiner::foldPartialReduceAdd(SDNode *N) {
1315213178
if (!llvm::isOneOrOneSplat(Op2) && !llvm::isOneOrOneSplatFP(Op2))
1315313179
return SDValue();
1315413180

13181+
SDValue Pred;
1315513182
unsigned Op1Opcode = Op1.getOpcode();
13183+
if (Op1Opcode == ISD::VSELECT && (isZeroOrZeroSplat(Op1->getOperand(2)) ||
13184+
isZeroOrZeroSplatFP(Op1->getOperand(2)))) {
13185+
Pred = Op1->getOperand(0);
13186+
Op1 = Op1->getOperand(1);
13187+
Op1Opcode = Op1->getOpcode();
13188+
}
13189+
1315613190
if (!ISD::isExtOpcode(Op1Opcode) && Op1Opcode != ISD::FP_EXTEND)
1315713191
return SDValue();
1315813192

@@ -13181,6 +13215,12 @@ SDValue DAGCombiner::foldPartialReduceAdd(SDNode *N) {
1318113215
? DAG.getConstantFP(1, DL, UnextOp1VT)
1318213216
: DAG.getConstant(1, DL, UnextOp1VT);
1318313217

13218+
if (Pred) {
13219+
SDValue Zero = N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA
13220+
? DAG.getConstantFP(0, DL, UnextOp1VT)
13221+
: DAG.getConstant(0, DL, UnextOp1VT);
13222+
Constant = DAG.getSelect(DL, UnextOp1VT, Pred, Constant, Zero);
13223+
}
1318413224
return DAG.getNode(NewOpcode, DL, N->getValueType(0), Acc, UnextOp1,
1318513225
Constant);
1318613226
}

llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12971,6 +12971,11 @@ bool llvm::isZeroOrZeroSplat(SDValue N, bool AllowUndefs) {
1297112971
return C && C->isZero();
1297212972
}
1297312973

12974+
bool llvm::isZeroOrZeroSplatFP(SDValue N, bool AllowUndefs) {
12975+
ConstantFPSDNode *C = isConstOrConstSplatFP(N, AllowUndefs);
12976+
return C && C->isZero();
12977+
}
12978+
1297412979
HandleSDNode::~HandleSDNode() {
1297512980
DropOperands();
1297612981
}
Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
2+
; RUN: llc < %s | FileCheck %s
3+
4+
target triple = "aarch64"
5+
6+
define <4 x i32> @predicate_dot_fixed_length(<4 x i32> %acc, <16 x i1> %p, <16 x i8> %a, <16 x i8> %b) #0 {
7+
; CHECK-LABEL: predicate_dot_fixed_length:
8+
; CHECK: // %bb.0:
9+
; CHECK-NEXT: shl v1.16b, v1.16b, #7
10+
; CHECK-NEXT: cmlt v1.16b, v1.16b, #0
11+
; CHECK-NEXT: and v1.16b, v1.16b, v3.16b
12+
; CHECK-NEXT: sdot v0.4s, v2.16b, v1.16b
13+
; CHECK-NEXT: ret
14+
%ext.1 = sext <16 x i8> %a to <16 x i32>
15+
%ext.2 = sext <16 x i8> %b to <16 x i32>
16+
%mul = mul nsw <16 x i32> %ext.1, %ext.2
17+
%sel = select <16 x i1> %p, <16 x i32> %mul, <16 x i32> zeroinitializer
18+
%red = call <4 x i32> @llvm.vector.partial.reduce.add(<4 x i32> %acc, <16 x i32> %sel)
19+
ret <4 x i32> %red
20+
}
21+
22+
define <4 x i32> @predicate_dot_by_C_fixed_length(<4 x i32> %acc, <16 x i1> %p, <16 x i8> %a) #0 {
23+
; CHECK-LABEL: predicate_dot_by_C_fixed_length:
24+
; CHECK: // %bb.0:
25+
; CHECK-NEXT: shl v1.16b, v1.16b, #7
26+
; CHECK-NEXT: movi v3.16b, #127
27+
; CHECK-NEXT: cmlt v1.16b, v1.16b, #0
28+
; CHECK-NEXT: and v1.16b, v1.16b, v3.16b
29+
; CHECK-NEXT: sdot v0.4s, v2.16b, v1.16b
30+
; CHECK-NEXT: ret
31+
%ext.1 = sext <16 x i8> %a to <16 x i32>
32+
%mul = mul nsw <16 x i32> %ext.1, splat(i32 127)
33+
%sel = select <16 x i1> %p, <16 x i32> %mul, <16 x i32> zeroinitializer
34+
%red = call <4 x i32> @llvm.vector.partial.reduce.add(<4 x i32> %acc, <16 x i32> %sel)
35+
ret <4 x i32> %red
36+
}
37+
38+
define <vscale x 4 x i32> @predicate_dot_scalable(<vscale x 4 x i32> %acc, <vscale x 16 x i1> %p, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
39+
; CHECK-LABEL: predicate_dot_scalable:
40+
; CHECK: // %bb.0:
41+
; CHECK-NEXT: movi v3.2d, #0000000000000000
42+
; CHECK-NEXT: sel z2.b, p0, z2.b, z3.b
43+
; CHECK-NEXT: sdot z0.s, z1.b, z2.b
44+
; CHECK-NEXT: ret
45+
%ext.1 = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
46+
%ext.2 = sext <vscale x 16 x i8> %b to <vscale x 16 x i32>
47+
%mul = mul nsw <vscale x 16 x i32> %ext.1, %ext.2
48+
%sel = select <vscale x 16 x i1> %p, <vscale x 16 x i32> %mul, <vscale x 16 x i32> zeroinitializer
49+
%red = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %sel)
50+
ret <vscale x 4 x i32> %red
51+
}
52+
53+
define <vscale x 4 x i32> @predicate_dot_by_C_scalable(<vscale x 4 x i32> %acc, <vscale x 16 x i1> %p, <vscale x 16 x i8> %a) #0 {
54+
; CHECK-LABEL: predicate_dot_by_C_scalable:
55+
; CHECK: // %bb.0:
56+
; CHECK-NEXT: mov z2.b, p0/z, #127 // =0x7f
57+
; CHECK-NEXT: sdot z0.s, z1.b, z2.b
58+
; CHECK-NEXT: ret
59+
%ext.1 = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
60+
%mul = mul nsw <vscale x 16 x i32> %ext.1, splat(i32 127)
61+
%sel = select <vscale x 16 x i1> %p, <vscale x 16 x i32> %mul, <vscale x 16 x i32> zeroinitializer
62+
%red = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %sel)
63+
ret <vscale x 4 x i32> %red
64+
}
65+
66+
define <4 x i32> @predicate_ext_mul_fixed_length(<4 x i32> %acc, <16 x i1> %p, <16 x i8> %a) #0 {
67+
; CHECK-LABEL: predicate_ext_mul_fixed_length:
68+
; CHECK: // %bb.0:
69+
; CHECK-NEXT: movi v3.16b, #1
70+
; CHECK-NEXT: and v1.16b, v1.16b, v3.16b
71+
; CHECK-NEXT: sdot v0.4s, v2.16b, v1.16b
72+
; CHECK-NEXT: ret
73+
%ext = sext <16 x i8> %a to <16 x i32>
74+
%sel = select <16 x i1> %p, <16 x i32> %ext, <16 x i32> zeroinitializer
75+
%red = call <4 x i32> @llvm.vector.partial.reduce.add(<4 x i32> %acc, <16 x i32> %sel)
76+
ret <4 x i32> %red
77+
}
78+
79+
define <vscale x 4 x i32> @predicate_ext_mul_scalable(<vscale x 4 x i32> %acc, <vscale x 16 x i1> %p, <vscale x 16 x i8> %a) #0 {
80+
; CHECK-LABEL: predicate_ext_mul_scalable:
81+
; CHECK: // %bb.0:
82+
; CHECK-NEXT: mov z2.b, p0/z, #1 // =0x1
83+
; CHECK-NEXT: sdot z0.s, z1.b, z2.b
84+
; CHECK-NEXT: ret
85+
%ext = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
86+
%sel = select <vscale x 16 x i1> %p, <vscale x 16 x i32> %ext, <vscale x 16 x i32> zeroinitializer
87+
%red = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %sel)
88+
ret <vscale x 4 x i32> %red
89+
}
90+
91+
define <4 x float> @predicated_fdot_fixed_length(<4 x float> %acc, <8 x i1> %p, <8 x half> %a, <8 x half> %b) #1 {
92+
; CHECK-LABEL: predicated_fdot_fixed_length:
93+
; CHECK: // %bb.0:
94+
; CHECK-NEXT: ushll v1.8h, v1.8b, #0
95+
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
96+
; CHECK-NEXT: // kill: def $q2 killed $q2 def $z2
97+
; CHECK-NEXT: shl v1.8h, v1.8h, #15
98+
; CHECK-NEXT: cmlt v1.8h, v1.8h, #0
99+
; CHECK-NEXT: and v1.16b, v1.16b, v3.16b
100+
; CHECK-NEXT: fdot z0.s, z2.h, z1.h
101+
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
102+
; CHECK-NEXT: ret
103+
%ext.1 = fpext <8 x half> %a to <8 x float>
104+
%ext.2 = fpext <8 x half> %b to <8 x float>
105+
%mul = fmul <8 x float> %ext.1, %ext.2
106+
%sel = select <8 x i1> %p, <8 x float> %mul, <8 x float> zeroinitializer
107+
%red = call <4 x float> @llvm.vector.partial.reduce.fadd(<4 x float> %acc, <8 x float> %sel)
108+
ret <4 x float> %red
109+
}
110+
111+
define <vscale x 4 x float> @predicated_fdot_scalable(<vscale x 4 x float> %acc, <vscale x 8 x i1> %p, <vscale x 8 x half> %a, <vscale x 8 x half> %b) #1 {
112+
; CHECK-LABEL: predicated_fdot_scalable:
113+
; CHECK: // %bb.0:
114+
; CHECK-NEXT: movi v3.2d, #0000000000000000
115+
; CHECK-NEXT: sel z2.h, p0, z2.h, z3.h
116+
; CHECK-NEXT: fdot z0.s, z1.h, z2.h
117+
; CHECK-NEXT: ret
118+
%ext.1 = fpext <vscale x 8 x half> %a to <vscale x 8 x float>
119+
%ext.2 = fpext <vscale x 8 x half> %b to <vscale x 8 x float>
120+
%mul = fmul <vscale x 8 x float> %ext.1, %ext.2
121+
%sel = select <vscale x 8 x i1> %p, <vscale x 8 x float> %mul, <vscale x 8 x float> zeroinitializer
122+
%red = call <vscale x 4 x float> @llvm.vector.partial.reduce.fadd(<vscale x 4 x float> %acc, <vscale x 8 x float> %sel)
123+
ret <vscale x 4 x float> %red
124+
}
125+
126+
define <4 x float> @predicated_fpext_fmul_fixed_length(<4 x float> %acc, <8 x i1> %p, <8 x half> %a) #1 {
127+
; CHECK-LABEL: predicated_fpext_fmul_fixed_length:
128+
; CHECK: // %bb.0:
129+
; CHECK-NEXT: ushll v1.8h, v1.8b, #0
130+
; CHECK-NEXT: movi v3.8h, #60, lsl #8
131+
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
132+
; CHECK-NEXT: // kill: def $q2 killed $q2 def $z2
133+
; CHECK-NEXT: shl v1.8h, v1.8h, #15
134+
; CHECK-NEXT: cmlt v1.8h, v1.8h, #0
135+
; CHECK-NEXT: and v1.16b, v1.16b, v3.16b
136+
; CHECK-NEXT: fdot z0.s, z2.h, z1.h
137+
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
138+
; CHECK-NEXT: ret
139+
%ext = fpext <8 x half> %a to <8 x float>
140+
%sel = select <8 x i1> %p, <8 x float> %ext, <8 x float> zeroinitializer
141+
%red = call <4 x float> @llvm.vector.partial.reduce.fadd(<4 x float> %acc, <8 x float> %sel)
142+
ret <4 x float> %red
143+
}
144+
145+
define <vscale x 4 x float> @predicated_fpext_fmul_scalable(<vscale x 4 x float> %acc, <vscale x 8 x i1> %p, <vscale x 8 x half> %a) #1 {
146+
; CHECK-LABEL: predicated_fpext_fmul_scalable:
147+
; CHECK: // %bb.0:
148+
; CHECK-NEXT: movi v2.2d, #0000000000000000
149+
; CHECK-NEXT: fmov z2.h, p0/m, #1.00000000
150+
; CHECK-NEXT: fdot z0.s, z1.h, z2.h
151+
; CHECK-NEXT: ret
152+
%ext = fpext <vscale x 8 x half> %a to <vscale x 8 x float>
153+
%sel = select <vscale x 8 x i1> %p, <vscale x 8 x float> %ext, <vscale x 8 x float> zeroinitializer
154+
%red = call <vscale x 4 x float> @llvm.vector.partial.reduce.fadd(<vscale x 4 x float> %acc, <vscale x 8 x float> %sel)
155+
ret <vscale x 4 x float> %red
156+
}
157+
158+
attributes #0 = { nounwind "target-features"="+sve,+dotprod" }
159+
attributes #1 = { nounwind "target-features"="+sve2p1,+dotprod" }

llvm/test/CodeGen/RISCV/rvv/zvqdotq-sdnode.ll

Lines changed: 25 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -996,20 +996,31 @@ entry:
996996
}
997997

998998
define <vscale x 2 x i32> @partial_reduce_select(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b, <vscale x 8 x i1> %m) {
999-
; CHECK-LABEL: partial_reduce_select:
1000-
; CHECK: # %bb.0: # %entry
1001-
; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma
1002-
; CHECK-NEXT: vsext.vf2 v12, v8
1003-
; CHECK-NEXT: vsext.vf2 v14, v9
1004-
; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
1005-
; CHECK-NEXT: vmv.v.i v8, 0
1006-
; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu
1007-
; CHECK-NEXT: vwmul.vv v8, v12, v14, v0.t
1008-
; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma
1009-
; CHECK-NEXT: vadd.vv v8, v11, v8
1010-
; CHECK-NEXT: vadd.vv v9, v9, v10
1011-
; CHECK-NEXT: vadd.vv v8, v9, v8
1012-
; CHECK-NEXT: ret
999+
; NODOT-LABEL: partial_reduce_select:
1000+
; NODOT: # %bb.0: # %entry
1001+
; NODOT-NEXT: vsetvli a0, zero, e16, m2, ta, ma
1002+
; NODOT-NEXT: vsext.vf2 v12, v8
1003+
; NODOT-NEXT: vsext.vf2 v14, v9
1004+
; NODOT-NEXT: vsetvli zero, zero, e32, m4, ta, ma
1005+
; NODOT-NEXT: vmv.v.i v8, 0
1006+
; NODOT-NEXT: vsetvli zero, zero, e16, m2, ta, mu
1007+
; NODOT-NEXT: vwmul.vv v8, v12, v14, v0.t
1008+
; NODOT-NEXT: vsetvli a0, zero, e32, m1, ta, ma
1009+
; NODOT-NEXT: vadd.vv v8, v11, v8
1010+
; NODOT-NEXT: vadd.vv v9, v9, v10
1011+
; NODOT-NEXT: vadd.vv v8, v9, v8
1012+
; NODOT-NEXT: ret
1013+
;
1014+
; DOT-LABEL: partial_reduce_select:
1015+
; DOT: # %bb.0: # %entry
1016+
; DOT-NEXT: vsetvli a0, zero, e8, m1, ta, ma
1017+
; DOT-NEXT: vmv.v.i v10, 0
1018+
; DOT-NEXT: vmerge.vvm v10, v10, v9, v0
1019+
; DOT-NEXT: vsetvli a0, zero, e32, m1, ta, ma
1020+
; DOT-NEXT: vmv.v.i v9, 0
1021+
; DOT-NEXT: vqdot.vv v9, v8, v10
1022+
; DOT-NEXT: vmv.v.v v8, v9
1023+
; DOT-NEXT: ret
10131024
entry:
10141025
%a.sext = sext <vscale x 8 x i8> %a to <vscale x 8 x i32>
10151026
%b.sext = sext <vscale x 8 x i8> %b to <vscale x 8 x i32>

0 commit comments

Comments
 (0)