Skip to content

Commit b8059e7

Browse files
authored
[X86] Avoid extra PMADDUBSW(X,AND(Y)) in <X x i8> multiplication (#168262)
On SSSE3 targets we use PMADDUBSW of odd/even elements with suitable masking to avoid having to extend/truncate with `<X x i16>` types and to avoid additional Port0/5 pressure. However, the lower i8 elements in each pair can safely use PMULLW directly without any pre-masking, as we only use the lower i8 bits of the result, which are only affected by the lower i8 bits of the inputs.
1 parent f12ad95 commit b8059e7

24 files changed

+520
-625
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29629,9 +29629,9 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
2962929629
}
2963029630
if (!(IsLoLaneAllZeroOrUndef || IsHiLaneAllZeroOrUndef)) {
2963129631
SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(0x00FF, dl, ExVT));
29632-
SDValue BLo = DAG.getNode(ISD::AND, dl, VT, Mask, B);
2963329632
SDValue BHi = DAG.getNode(X86ISD::ANDNP, dl, VT, Mask, B);
29634-
SDValue RLo = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BLo);
29633+
SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, DAG.getBitcast(ExVT, A),
29634+
DAG.getBitcast(ExVT, B));
2963529635
SDValue RHi = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BHi);
2963629636
RLo = DAG.getNode(ISD::AND, dl, VT, DAG.getBitcast(VT, RLo), Mask);
2963729637
RHi = DAG.getNode(X86ISD::VSHLI, dl, ExVT, RHi,

llvm/test/CodeGen/X86/avx2-arith.ll

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -121,14 +121,13 @@ define <16 x i8> @mul_v16i8(<16 x i8> %i, <16 x i8> %j) nounwind readnone {
121121
define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
122122
; CHECK-LABEL: mul_v32i8:
123123
; CHECK: # %bb.0:
124-
; CHECK-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
125-
; CHECK-NEXT: vpand %ymm2, %ymm1, %ymm3
126-
; CHECK-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3
127-
; CHECK-NEXT: vpand %ymm2, %ymm3, %ymm3
128-
; CHECK-NEXT: vpandn %ymm1, %ymm2, %ymm1
124+
; CHECK-NEXT: vpmullw %ymm1, %ymm0, %ymm2
125+
; CHECK-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
126+
; CHECK-NEXT: vpand %ymm3, %ymm2, %ymm2
127+
; CHECK-NEXT: vpandn %ymm1, %ymm3, %ymm1
129128
; CHECK-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0
130129
; CHECK-NEXT: vpsllw $8, %ymm0, %ymm0
131-
; CHECK-NEXT: vpor %ymm0, %ymm3, %ymm0
130+
; CHECK-NEXT: vpor %ymm0, %ymm2, %ymm0
132131
; CHECK-NEXT: ret{{[l|q]}}
133132
%x = mul <32 x i8> %i, %j
134133
ret <32 x i8> %x

llvm/test/CodeGen/X86/combine-mul.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -504,7 +504,7 @@ define <16 x i8> @PR35579(<16 x i8> %x) {
504504
; SSE-NEXT: movdqa %xmm0, %xmm1
505505
; SSE-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
506506
; SSE-NEXT: psllw $8, %xmm1
507-
; SSE-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,2,0,4,0,2,0,8,0,2,0,4,0,2,0]
507+
; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,1,2,1,4,1,2,1,8,1,2,1,4,1,2,1]
508508
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
509509
; SSE-NEXT: por %xmm1, %xmm0
510510
; SSE-NEXT: retq

llvm/test/CodeGen/X86/gfni-shifts.ll

Lines changed: 31 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -388,7 +388,7 @@ define <16 x i8> @constant_shl_v16i8(<16 x i8> %a) nounwind {
388388
; GFNISSE-NEXT: movdqa %xmm0, %xmm1
389389
; GFNISSE-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
390390
; GFNISSE-NEXT: psllw $8, %xmm1
391-
; GFNISSE-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
391+
; GFNISSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
392392
; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
393393
; GFNISSE-NEXT: por %xmm1, %xmm0
394394
; GFNISSE-NEXT: retq
@@ -397,7 +397,7 @@ define <16 x i8> @constant_shl_v16i8(<16 x i8> %a) nounwind {
397397
; GFNIAVX1: # %bb.0:
398398
; GFNIAVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
399399
; GFNIAVX1-NEXT: vpsllw $8, %xmm1, %xmm1
400-
; GFNIAVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
400+
; GFNIAVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
401401
; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
402402
; GFNIAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
403403
; GFNIAVX1-NEXT: retq
@@ -1213,21 +1213,20 @@ define <32 x i8> @splatvar_ashr_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
12131213
define <32 x i8> @constant_shl_v32i8(<32 x i8> %a) nounwind {
12141214
; GFNISSE-LABEL: constant_shl_v32i8:
12151215
; GFNISSE: # %bb.0:
1216-
; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm2 = [1,4,16,64,128,32,8,2]
1216+
; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
12171217
; GFNISSE-NEXT: movdqa %xmm0, %xmm3
1218-
; GFNISSE-NEXT: pmaddubsw %xmm2, %xmm3
1218+
; GFNISSE-NEXT: pmullw %xmm2, %xmm3
12191219
; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
12201220
; GFNISSE-NEXT: pand %xmm4, %xmm3
12211221
; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
12221222
; GFNISSE-NEXT: pmaddubsw %xmm5, %xmm0
12231223
; GFNISSE-NEXT: psllw $8, %xmm0
12241224
; GFNISSE-NEXT: por %xmm3, %xmm0
1225-
; GFNISSE-NEXT: movdqa %xmm1, %xmm3
1226-
; GFNISSE-NEXT: pmaddubsw %xmm2, %xmm3
1227-
; GFNISSE-NEXT: pand %xmm4, %xmm3
1225+
; GFNISSE-NEXT: pmullw %xmm1, %xmm2
1226+
; GFNISSE-NEXT: pand %xmm4, %xmm2
12281227
; GFNISSE-NEXT: pmaddubsw %xmm5, %xmm1
12291228
; GFNISSE-NEXT: psllw $8, %xmm1
1230-
; GFNISSE-NEXT: por %xmm3, %xmm1
1229+
; GFNISSE-NEXT: por %xmm2, %xmm1
12311230
; GFNISSE-NEXT: retq
12321231
;
12331232
; GFNIAVX1-LABEL: constant_shl_v32i8:
@@ -1239,9 +1238,9 @@ define <32 x i8> @constant_shl_v32i8(<32 x i8> %a) nounwind {
12391238
; GFNIAVX1-NEXT: vpmaddubsw %xmm1, %xmm3, %xmm1
12401239
; GFNIAVX1-NEXT: vpsllw $8, %xmm1, %xmm1
12411240
; GFNIAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
1242-
; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = [1,4,16,64,128,32,8,2]
1243-
; GFNIAVX1-NEXT: vpmaddubsw %xmm2, %xmm3, %xmm3
1244-
; GFNIAVX1-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm0
1241+
; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
1242+
; GFNIAVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm3
1243+
; GFNIAVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
12451244
; GFNIAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
12461245
; GFNIAVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
12471246
; GFNIAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
@@ -1251,14 +1250,14 @@ define <32 x i8> @constant_shl_v32i8(<32 x i8> %a) nounwind {
12511250
; GFNIAVX2: # %bb.0:
12521251
; GFNIAVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
12531252
; GFNIAVX2-NEXT: vpsllw $8, %ymm1, %ymm1
1254-
; GFNIAVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
1253+
; GFNIAVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
12551254
; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
12561255
; GFNIAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
12571256
; GFNIAVX2-NEXT: retq
12581257
;
12591258
; GFNIAVX512VL-LABEL: constant_shl_v32i8:
12601259
; GFNIAVX512VL: # %bb.0:
1261-
; GFNIAVX512VL-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
1260+
; GFNIAVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
12621261
; GFNIAVX512VL-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
12631262
; GFNIAVX512VL-NEXT: vpsllw $8, %ymm0, %ymm0
12641263
; GFNIAVX512VL-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 | (ymm1 & m32bcst)
@@ -2521,33 +2520,32 @@ define <64 x i8> @splatvar_ashr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
25212520
define <64 x i8> @constant_shl_v64i8(<64 x i8> %a) nounwind {
25222521
; GFNISSE-LABEL: constant_shl_v64i8:
25232522
; GFNISSE: # %bb.0:
2524-
; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm4 = [1,4,16,64,128,32,8,2]
2523+
; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
25252524
; GFNISSE-NEXT: movdqa %xmm0, %xmm6
2526-
; GFNISSE-NEXT: pmaddubsw %xmm4, %xmm6
2525+
; GFNISSE-NEXT: pmullw %xmm4, %xmm6
25272526
; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
25282527
; GFNISSE-NEXT: pand %xmm5, %xmm6
25292528
; GFNISSE-NEXT: movdqa {{.*#+}} xmm7 = [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
25302529
; GFNISSE-NEXT: pmaddubsw %xmm7, %xmm0
25312530
; GFNISSE-NEXT: psllw $8, %xmm0
25322531
; GFNISSE-NEXT: por %xmm6, %xmm0
25332532
; GFNISSE-NEXT: movdqa %xmm1, %xmm6
2534-
; GFNISSE-NEXT: pmaddubsw %xmm4, %xmm6
2533+
; GFNISSE-NEXT: pmullw %xmm4, %xmm6
25352534
; GFNISSE-NEXT: pand %xmm5, %xmm6
25362535
; GFNISSE-NEXT: pmaddubsw %xmm7, %xmm1
25372536
; GFNISSE-NEXT: psllw $8, %xmm1
25382537
; GFNISSE-NEXT: por %xmm6, %xmm1
25392538
; GFNISSE-NEXT: movdqa %xmm2, %xmm6
2540-
; GFNISSE-NEXT: pmaddubsw %xmm4, %xmm6
2539+
; GFNISSE-NEXT: pmullw %xmm4, %xmm6
25412540
; GFNISSE-NEXT: pand %xmm5, %xmm6
25422541
; GFNISSE-NEXT: pmaddubsw %xmm7, %xmm2
25432542
; GFNISSE-NEXT: psllw $8, %xmm2
25442543
; GFNISSE-NEXT: por %xmm6, %xmm2
2545-
; GFNISSE-NEXT: movdqa %xmm3, %xmm6
2546-
; GFNISSE-NEXT: pmaddubsw %xmm4, %xmm6
2547-
; GFNISSE-NEXT: pand %xmm5, %xmm6
2544+
; GFNISSE-NEXT: pmullw %xmm3, %xmm4
2545+
; GFNISSE-NEXT: pand %xmm5, %xmm4
25482546
; GFNISSE-NEXT: pmaddubsw %xmm7, %xmm3
25492547
; GFNISSE-NEXT: psllw $8, %xmm3
2550-
; GFNISSE-NEXT: por %xmm6, %xmm3
2548+
; GFNISSE-NEXT: por %xmm4, %xmm3
25512549
; GFNISSE-NEXT: retq
25522550
;
25532551
; GFNIAVX1-LABEL: constant_shl_v64i8:
@@ -2559,9 +2557,9 @@ define <64 x i8> @constant_shl_v64i8(<64 x i8> %a) nounwind {
25592557
; GFNIAVX1-NEXT: vpmaddubsw %xmm2, %xmm4, %xmm5
25602558
; GFNIAVX1-NEXT: vpsllw $8, %xmm5, %xmm5
25612559
; GFNIAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3
2562-
; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = [1,4,16,64,128,32,8,2]
2563-
; GFNIAVX1-NEXT: vpmaddubsw %xmm5, %xmm4, %xmm4
2564-
; GFNIAVX1-NEXT: vpmaddubsw %xmm5, %xmm0, %xmm0
2560+
; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
2561+
; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm4, %xmm4
2562+
; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm0, %xmm0
25652563
; GFNIAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
25662564
; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
25672565
; GFNIAVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
@@ -2572,26 +2570,26 @@ define <64 x i8> @constant_shl_v64i8(<64 x i8> %a) nounwind {
25722570
; GFNIAVX1-NEXT: vpmaddubsw %xmm2, %xmm6, %xmm2
25732571
; GFNIAVX1-NEXT: vpsllw $8, %xmm2, %xmm2
25742572
; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
2575-
; GFNIAVX1-NEXT: vpmaddubsw %xmm5, %xmm6, %xmm3
2576-
; GFNIAVX1-NEXT: vpmaddubsw %xmm5, %xmm1, %xmm1
2573+
; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm6, %xmm3
2574+
; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm1, %xmm1
25772575
; GFNIAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
25782576
; GFNIAVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
25792577
; GFNIAVX1-NEXT: vorps %ymm2, %ymm1, %ymm1
25802578
; GFNIAVX1-NEXT: retq
25812579
;
25822580
; GFNIAVX2-LABEL: constant_shl_v64i8:
25832581
; GFNIAVX2: # %bb.0:
2584-
; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
2582+
; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
25852583
; GFNIAVX2-NEXT: # ymm2 = mem[0,1,0,1]
2586-
; GFNIAVX2-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm3
2584+
; GFNIAVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm3
25872585
; GFNIAVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
25882586
; GFNIAVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
25892587
; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
25902588
; GFNIAVX2-NEXT: # ymm5 = mem[0,1,0,1]
25912589
; GFNIAVX2-NEXT: vpmaddubsw %ymm5, %ymm0, %ymm0
25922590
; GFNIAVX2-NEXT: vpsllw $8, %ymm0, %ymm0
25932591
; GFNIAVX2-NEXT: vpor %ymm0, %ymm3, %ymm0
2594-
; GFNIAVX2-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm2
2592+
; GFNIAVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm2
25952593
; GFNIAVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
25962594
; GFNIAVX2-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm1
25972595
; GFNIAVX2-NEXT: vpsllw $8, %ymm1, %ymm1
@@ -2601,10 +2599,10 @@ define <64 x i8> @constant_shl_v64i8(<64 x i8> %a) nounwind {
26012599
; GFNIAVX512VL-LABEL: constant_shl_v64i8:
26022600
; GFNIAVX512VL: # %bb.0:
26032601
; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
2604-
; GFNIAVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
2602+
; GFNIAVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
26052603
; GFNIAVX512VL-NEXT: # ymm2 = mem[0,1,0,1]
2606-
; GFNIAVX512VL-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm3
2607-
; GFNIAVX512VL-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm2
2604+
; GFNIAVX512VL-NEXT: vpmullw %ymm2, %ymm1, %ymm3
2605+
; GFNIAVX512VL-NEXT: vpmullw %ymm2, %ymm0, %ymm2
26082606
; GFNIAVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
26092607
; GFNIAVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
26102608
; GFNIAVX512VL-NEXT: # ymm3 = mem[0,1,0,1]
@@ -2618,7 +2616,7 @@ define <64 x i8> @constant_shl_v64i8(<64 x i8> %a) nounwind {
26182616
;
26192617
; GFNIAVX512BW-LABEL: constant_shl_v64i8:
26202618
; GFNIAVX512BW: # %bb.0:
2621-
; GFNIAVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
2619+
; GFNIAVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
26222620
; GFNIAVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
26232621
; GFNIAVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0
26242622
; GFNIAVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 | (zmm1 & m32bcst)

0 commit comments

Comments
 (0)