Skip to content

Commit e557ad6

Browse files
authored
[WebAssembly] v8i8 mul support (#151145)
During DAG combine, promote the operands to v8i16 by concatenating with an undef vector and then use extmul_low to perform the mul at i16. Finally, shuffle the low bytes out of the i16 elements into the result vector.
1 parent a684610 commit e557ad6

File tree

2 files changed

+44
-87
lines changed

2 files changed

+44
-87
lines changed

llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp

Lines changed: 41 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -3588,34 +3588,53 @@ static SDValue performMulCombine(SDNode *N,
35883588
if (auto Res = TryWideExtMulCombine(N, DCI.DAG))
35893589
return Res;
35903590

3591-
// We don't natively support v16i8 mul, but we do support v8i16 so split the
3592-
// inputs and extend them to v8i16. Only do this before legalization in case
3593-
// a narrow vector is widened and may be simplified later.
3594-
if (!DCI.isBeforeLegalize() || VT != MVT::v16i8)
3591+
// We don't natively support v16i8 or v8i8 mul, but we do support v8i16. So,
3592+
// extend them to v8i16. Only do this before legalization in case a narrow
3593+
// vector is widened and may be simplified later.
3594+
if (!DCI.isBeforeLegalize() || (VT != MVT::v8i8 && VT != MVT::v16i8))
35953595
return SDValue();
35963596

35973597
SDLoc DL(N);
35983598
SelectionDAG &DAG = DCI.DAG;
35993599
SDValue LHS = N->getOperand(0);
36003600
SDValue RHS = N->getOperand(1);
3601-
SDValue LowLHS =
3602-
DAG.getNode(WebAssemblyISD::EXTEND_LOW_U, DL, MVT::v8i16, LHS);
3603-
SDValue HighLHS =
3604-
DAG.getNode(WebAssemblyISD::EXTEND_HIGH_U, DL, MVT::v8i16, LHS);
3605-
SDValue LowRHS =
3606-
DAG.getNode(WebAssemblyISD::EXTEND_LOW_U, DL, MVT::v8i16, RHS);
3607-
SDValue HighRHS =
3608-
DAG.getNode(WebAssemblyISD::EXTEND_HIGH_U, DL, MVT::v8i16, RHS);
3609-
3610-
SDValue MulLow =
3611-
DAG.getBitcast(VT, DAG.getNode(ISD::MUL, DL, MVT::v8i16, LowLHS, LowRHS));
3612-
SDValue MulHigh = DAG.getBitcast(
3613-
VT, DAG.getNode(ISD::MUL, DL, MVT::v8i16, HighLHS, HighRHS));
3614-
3615-
// Take the low byte of each lane.
3616-
return DAG.getVectorShuffle(
3617-
VT, DL, MulLow, MulHigh,
3618-
{0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30});
3601+
EVT MulVT = MVT::v8i16;
3602+
3603+
if (VT == MVT::v8i8) {
3604+
SDValue PromotedLHS = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, LHS,
3605+
DAG.getUNDEF(MVT::v8i8));
3606+
SDValue PromotedRHS = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, RHS,
3607+
DAG.getUNDEF(MVT::v8i8));
3608+
SDValue LowLHS =
3609+
DAG.getNode(WebAssemblyISD::EXTEND_LOW_U, DL, MulVT, PromotedLHS);
3610+
SDValue LowRHS =
3611+
DAG.getNode(WebAssemblyISD::EXTEND_LOW_U, DL, MulVT, PromotedRHS);
3612+
SDValue MulLow = DAG.getBitcast(
3613+
MVT::v16i8, DAG.getNode(ISD::MUL, DL, MulVT, LowLHS, LowRHS));
3614+
// Take the low byte of each lane.
3615+
SDValue Shuffle = DAG.getVectorShuffle(
3616+
MVT::v16i8, DL, MulLow, DAG.getUNDEF(MVT::v16i8),
3617+
{0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1});
3618+
return extractSubVector(Shuffle, 0, DAG, DL, 64);
3619+
} else {
3620+
assert(VT == MVT::v16i8 && "Expected v16i8");
3621+
SDValue LowLHS = DAG.getNode(WebAssemblyISD::EXTEND_LOW_U, DL, MulVT, LHS);
3622+
SDValue LowRHS = DAG.getNode(WebAssemblyISD::EXTEND_LOW_U, DL, MulVT, RHS);
3623+
SDValue HighLHS =
3624+
DAG.getNode(WebAssemblyISD::EXTEND_HIGH_U, DL, MulVT, LHS);
3625+
SDValue HighRHS =
3626+
DAG.getNode(WebAssemblyISD::EXTEND_HIGH_U, DL, MulVT, RHS);
3627+
3628+
SDValue MulLow =
3629+
DAG.getBitcast(VT, DAG.getNode(ISD::MUL, DL, MulVT, LowLHS, LowRHS));
3630+
SDValue MulHigh =
3631+
DAG.getBitcast(VT, DAG.getNode(ISD::MUL, DL, MulVT, HighLHS, HighRHS));
3632+
3633+
// Take the low byte of each lane.
3634+
return DAG.getVectorShuffle(
3635+
VT, DL, MulLow, MulHigh,
3636+
{0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30});
3637+
}
36193638
}
36203639

36213640
SDValue

llvm/test/CodeGen/WebAssembly/narrow-simd-mul.ll

Lines changed: 3 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -5,71 +5,9 @@ define <8 x i8> @mul_v8i8(<8 x i8> %a, <8 x i8> %b) {
55
; CHECK-LABEL: mul_v8i8:
66
; CHECK: .functype mul_v8i8 (v128, v128) -> (v128)
77
; CHECK-NEXT: # %bb.0:
8-
; CHECK-NEXT: i8x16.extract_lane_u $push4=, $0, 0
9-
; CHECK-NEXT: i8x16.extract_lane_u $push3=, $1, 0
10-
; CHECK-NEXT: i32.mul $push5=, $pop4, $pop3
11-
; CHECK-NEXT: i8x16.splat $push6=, $pop5
12-
; CHECK-NEXT: i8x16.extract_lane_u $push1=, $0, 1
13-
; CHECK-NEXT: i8x16.extract_lane_u $push0=, $1, 1
14-
; CHECK-NEXT: i32.mul $push2=, $pop1, $pop0
15-
; CHECK-NEXT: i8x16.replace_lane $push7=, $pop6, 1, $pop2
16-
; CHECK-NEXT: i8x16.extract_lane_u $push9=, $0, 2
17-
; CHECK-NEXT: i8x16.extract_lane_u $push8=, $1, 2
18-
; CHECK-NEXT: i32.mul $push10=, $pop9, $pop8
19-
; CHECK-NEXT: i8x16.replace_lane $push11=, $pop7, 2, $pop10
20-
; CHECK-NEXT: i8x16.extract_lane_u $push13=, $0, 3
21-
; CHECK-NEXT: i8x16.extract_lane_u $push12=, $1, 3
22-
; CHECK-NEXT: i32.mul $push14=, $pop13, $pop12
23-
; CHECK-NEXT: i8x16.replace_lane $push15=, $pop11, 3, $pop14
24-
; CHECK-NEXT: i8x16.extract_lane_u $push17=, $0, 4
25-
; CHECK-NEXT: i8x16.extract_lane_u $push16=, $1, 4
26-
; CHECK-NEXT: i32.mul $push18=, $pop17, $pop16
27-
; CHECK-NEXT: i8x16.replace_lane $push19=, $pop15, 4, $pop18
28-
; CHECK-NEXT: i8x16.extract_lane_u $push21=, $0, 5
29-
; CHECK-NEXT: i8x16.extract_lane_u $push20=, $1, 5
30-
; CHECK-NEXT: i32.mul $push22=, $pop21, $pop20
31-
; CHECK-NEXT: i8x16.replace_lane $push23=, $pop19, 5, $pop22
32-
; CHECK-NEXT: i8x16.extract_lane_u $push25=, $0, 6
33-
; CHECK-NEXT: i8x16.extract_lane_u $push24=, $1, 6
34-
; CHECK-NEXT: i32.mul $push26=, $pop25, $pop24
35-
; CHECK-NEXT: i8x16.replace_lane $push27=, $pop23, 6, $pop26
36-
; CHECK-NEXT: i8x16.extract_lane_u $push29=, $0, 7
37-
; CHECK-NEXT: i8x16.extract_lane_u $push28=, $1, 7
38-
; CHECK-NEXT: i32.mul $push30=, $pop29, $pop28
39-
; CHECK-NEXT: i8x16.replace_lane $push31=, $pop27, 7, $pop30
40-
; CHECK-NEXT: i8x16.extract_lane_u $push33=, $0, 8
41-
; CHECK-NEXT: i8x16.extract_lane_u $push32=, $1, 8
42-
; CHECK-NEXT: i32.mul $push34=, $pop33, $pop32
43-
; CHECK-NEXT: i8x16.replace_lane $push35=, $pop31, 8, $pop34
44-
; CHECK-NEXT: i8x16.extract_lane_u $push37=, $0, 9
45-
; CHECK-NEXT: i8x16.extract_lane_u $push36=, $1, 9
46-
; CHECK-NEXT: i32.mul $push38=, $pop37, $pop36
47-
; CHECK-NEXT: i8x16.replace_lane $push39=, $pop35, 9, $pop38
48-
; CHECK-NEXT: i8x16.extract_lane_u $push41=, $0, 10
49-
; CHECK-NEXT: i8x16.extract_lane_u $push40=, $1, 10
50-
; CHECK-NEXT: i32.mul $push42=, $pop41, $pop40
51-
; CHECK-NEXT: i8x16.replace_lane $push43=, $pop39, 10, $pop42
52-
; CHECK-NEXT: i8x16.extract_lane_u $push45=, $0, 11
53-
; CHECK-NEXT: i8x16.extract_lane_u $push44=, $1, 11
54-
; CHECK-NEXT: i32.mul $push46=, $pop45, $pop44
55-
; CHECK-NEXT: i8x16.replace_lane $push47=, $pop43, 11, $pop46
56-
; CHECK-NEXT: i8x16.extract_lane_u $push49=, $0, 12
57-
; CHECK-NEXT: i8x16.extract_lane_u $push48=, $1, 12
58-
; CHECK-NEXT: i32.mul $push50=, $pop49, $pop48
59-
; CHECK-NEXT: i8x16.replace_lane $push51=, $pop47, 12, $pop50
60-
; CHECK-NEXT: i8x16.extract_lane_u $push53=, $0, 13
61-
; CHECK-NEXT: i8x16.extract_lane_u $push52=, $1, 13
62-
; CHECK-NEXT: i32.mul $push54=, $pop53, $pop52
63-
; CHECK-NEXT: i8x16.replace_lane $push55=, $pop51, 13, $pop54
64-
; CHECK-NEXT: i8x16.extract_lane_u $push57=, $0, 14
65-
; CHECK-NEXT: i8x16.extract_lane_u $push56=, $1, 14
66-
; CHECK-NEXT: i32.mul $push58=, $pop57, $pop56
67-
; CHECK-NEXT: i8x16.replace_lane $push59=, $pop55, 14, $pop58
68-
; CHECK-NEXT: i8x16.extract_lane_u $push61=, $0, 15
69-
; CHECK-NEXT: i8x16.extract_lane_u $push60=, $1, 15
70-
; CHECK-NEXT: i32.mul $push62=, $pop61, $pop60
71-
; CHECK-NEXT: i8x16.replace_lane $push63=, $pop59, 15, $pop62
72-
; CHECK-NEXT: return $pop63
8+
; CHECK-NEXT: i16x8.extmul_low_i8x16_u $push0=, $0, $1
9+
; CHECK-NEXT: i8x16.shuffle $push1=, $pop0, $1, 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0
10+
; CHECK-NEXT: return $pop1
7311
%mul = mul <8 x i8> %a, %b
7412
ret <8 x i8> %mul
7513
}

0 commit comments

Comments
 (0)