Skip to content

Commit ec6a15f

Browse files
authored
[X86] optimize masked truncated saturating stores (#169827)
Combine the saturating operation into the masked truncating store. https://godbolt.org/z/n1YfavKP6 ```asm _mm256_mask_cvtusepi16_storeu_epi8_manual: # @_mm256_mask_cvtusepi16_storeu_epi8_manual kmovd k1, esi vmovdqa ymm0, ymmword ptr [rdx] vpminuw ymm0, ymm0, ymmword ptr [rip + .LCPI0_0] vpmovwb xmmword ptr [rdi] {k1}, ymm0 vzeroupper ret _mm256_mask_cvtusepi16_storeu_epi8_intrinsic: # @_mm256_mask_cvtusepi16_storeu_epi8_intrinsic kmovd k1, esi vmovdqa ymm0, ymmword ptr [rdx] vpmovuswb xmmword ptr [rdi] {k1}, ymm0 vzeroupper ret ```
1 parent bd21095 commit ec6a15f

File tree

3 files changed

+158
-135
lines changed

3 files changed

+158
-135
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 41 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -53523,18 +53523,48 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
5352353523
if (Mst->isCompressingStore())
5352453524
return SDValue();
5352553525

53526-
EVT VT = Mst->getValue().getValueType();
53526+
if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
53527+
return ScalarStore;
53528+
5352753529
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53530+
SDLoc DL(N);
5352853531

53529-
if (Mst->isTruncatingStore())
53530-
return SDValue();
53532+
SDValue Mask = Mst->getMask();
53533+
SDValue Value = Mst->getValue();
53534+
EVT MemVT = Mst->getMemoryVT();
53535+
EVT VT = Value.getValueType();
5353153536

53532-
if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
53533-
return ScalarStore;
53537+
// See if the truncating store can be a saturating truncated store.
53538+
if (Mst->isTruncatingStore()) {
53539+
if (VT.isVector() && MemVT.isVector() && VT.getScalarType().isInteger() &&
53540+
MemVT.getScalarType().isInteger() &&
53541+
VT.getVectorNumElements() == MemVT.getVectorNumElements() &&
53542+
Subtarget.hasBWI() && Subtarget.hasVLX()) {
53543+
53544+
SDValue SatSrc;
53545+
unsigned Opc;
53546+
if (SDValue SVal = detectSSatPattern(Value, MemVT)) {
53547+
SatSrc = SVal;
53548+
Opc = X86ISD::VMTRUNCSTORES;
53549+
} else if (SDValue UVal = detectUSatPattern(Value, MemVT, DAG, DL)) {
53550+
SatSrc = UVal;
53551+
Opc = X86ISD::VMTRUNCSTOREUS;
53552+
} else {
53553+
return SDValue();
53554+
}
53555+
53556+
SDVTList VTs = DAG.getVTList(MVT::Other);
53557+
SDValue Ops[] = {Mst->getChain(), SatSrc, Mst->getBasePtr(), Mask};
53558+
MachineMemOperand *MMO = Mst->getMemOperand();
53559+
return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
53560+
}
53561+
53562+
// Otherwise don't combine if this store already truncates.
53563+
return SDValue();
53564+
}
5353453565

5353553566
// If the mask value has been legalized to a non-boolean vector, try to
5353653567
// simplify ops leading up to it. We only demand the MSB of each lane.
53537-
SDValue Mask = Mst->getMask();
5353853568
if (Mask.getScalarValueSizeInBits() != 1) {
5353953569
APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
5354053570
if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
@@ -53550,14 +53580,12 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
5355053580
Mst->getAddressingMode());
5355153581
}
5355253582

53553-
SDValue Value = Mst->getValue();
5355453583
if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
53555-
TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
53556-
Mst->getMemoryVT())) {
53557-
return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
53558-
Mst->getBasePtr(), Mst->getOffset(), Mask,
53559-
Mst->getMemoryVT(), Mst->getMemOperand(),
53560-
Mst->getAddressingMode(), true);
53584+
TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(), MemVT)) {
53585+
return DAG.getMaskedStore(Mst->getChain(), DL, Value.getOperand(0),
53586+
Mst->getBasePtr(), Mst->getOffset(), Mask, MemVT,
53587+
Mst->getMemOperand(), Mst->getAddressingMode(),
53588+
true);
5356153589
}
5356253590

5356353591
return SDValue();

llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll

Lines changed: 60 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,9 @@
44
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefixes=AVX,AVX1
55
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefixes=AVX,AVX2
66
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
7-
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl | FileCheck %s --check-prefixes=AVX512VL,AVX512FVL
7+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl | FileCheck %s --check-prefixes=AVX512FVL
88
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
9-
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl,avx512bw | FileCheck %s --check-prefixes=AVX512VL,AVX512BWVL
9+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl,avx512bw | FileCheck %s --check-prefixes=AVX512BWVL
1010

1111
define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
1212
; SSE2-LABEL: truncstore_v8i64_v8i32:
@@ -350,14 +350,21 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
350350
; AVX512-NEXT: vzeroupper
351351
; AVX512-NEXT: retq
352352
;
353-
; AVX512VL-LABEL: truncstore_v8i64_v8i32:
354-
; AVX512VL: # %bb.0:
355-
; AVX512VL-NEXT: vptestmd %ymm1, %ymm1, %k1
356-
; AVX512VL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
357-
; AVX512VL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
358-
; AVX512VL-NEXT: vpmovqd %zmm0, (%rdi) {%k1}
359-
; AVX512VL-NEXT: vzeroupper
360-
; AVX512VL-NEXT: retq
353+
; AVX512FVL-LABEL: truncstore_v8i64_v8i32:
354+
; AVX512FVL: # %bb.0:
355+
; AVX512FVL-NEXT: vptestmd %ymm1, %ymm1, %k1
356+
; AVX512FVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
357+
; AVX512FVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
358+
; AVX512FVL-NEXT: vpmovqd %zmm0, (%rdi) {%k1}
359+
; AVX512FVL-NEXT: vzeroupper
360+
; AVX512FVL-NEXT: retq
361+
;
362+
; AVX512BWVL-LABEL: truncstore_v8i64_v8i32:
363+
; AVX512BWVL: # %bb.0:
364+
; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1
365+
; AVX512BWVL-NEXT: vpmovsqd %zmm0, (%rdi) {%k1}
366+
; AVX512BWVL-NEXT: vzeroupper
367+
; AVX512BWVL-NEXT: retq
361368
%a = icmp ne <8 x i32> %mask, zeroinitializer
362369
%b = icmp slt <8 x i64> %x, <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
363370
%c = select <8 x i1> %b, <8 x i64> %x, <8 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
@@ -964,9 +971,7 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
964971
; AVX512BWVL-LABEL: truncstore_v8i64_v8i16:
965972
; AVX512BWVL: # %bb.0:
966973
; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1
967-
; AVX512BWVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
968-
; AVX512BWVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
969-
; AVX512BWVL-NEXT: vpmovqw %zmm0, (%rdi) {%k1}
974+
; AVX512BWVL-NEXT: vpmovsqw %zmm0, (%rdi) {%k1}
970975
; AVX512BWVL-NEXT: vzeroupper
971976
; AVX512BWVL-NEXT: retq
972977
%a = icmp ne <8 x i32> %mask, zeroinitializer
@@ -1572,9 +1577,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
15721577
; AVX512BWVL-LABEL: truncstore_v8i64_v8i8:
15731578
; AVX512BWVL: # %bb.0:
15741579
; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1
1575-
; AVX512BWVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
1576-
; AVX512BWVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
1577-
; AVX512BWVL-NEXT: vpmovqb %zmm0, (%rdi) {%k1}
1580+
; AVX512BWVL-NEXT: vpmovsqb %zmm0, (%rdi) {%k1}
15781581
; AVX512BWVL-NEXT: vzeroupper
15791582
; AVX512BWVL-NEXT: retq
15801583
%a = icmp ne <8 x i32> %mask, zeroinitializer
@@ -1788,14 +1791,21 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
17881791
; AVX512-NEXT: vzeroupper
17891792
; AVX512-NEXT: retq
17901793
;
1791-
; AVX512VL-LABEL: truncstore_v4i64_v4i32:
1792-
; AVX512VL: # %bb.0:
1793-
; AVX512VL-NEXT: vptestmd %xmm1, %xmm1, %k1
1794-
; AVX512VL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
1795-
; AVX512VL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
1796-
; AVX512VL-NEXT: vpmovqd %ymm0, (%rdi) {%k1}
1797-
; AVX512VL-NEXT: vzeroupper
1798-
; AVX512VL-NEXT: retq
1794+
; AVX512FVL-LABEL: truncstore_v4i64_v4i32:
1795+
; AVX512FVL: # %bb.0:
1796+
; AVX512FVL-NEXT: vptestmd %xmm1, %xmm1, %k1
1797+
; AVX512FVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
1798+
; AVX512FVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
1799+
; AVX512FVL-NEXT: vpmovqd %ymm0, (%rdi) {%k1}
1800+
; AVX512FVL-NEXT: vzeroupper
1801+
; AVX512FVL-NEXT: retq
1802+
;
1803+
; AVX512BWVL-LABEL: truncstore_v4i64_v4i32:
1804+
; AVX512BWVL: # %bb.0:
1805+
; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1
1806+
; AVX512BWVL-NEXT: vpmovsqd %ymm0, (%rdi) {%k1}
1807+
; AVX512BWVL-NEXT: vzeroupper
1808+
; AVX512BWVL-NEXT: retq
17991809
%a = icmp ne <4 x i32> %mask, zeroinitializer
18001810
%b = icmp slt <4 x i64> %x, <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
18011811
%c = select <4 x i1> %b, <4 x i64> %x, <4 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
@@ -2141,9 +2151,7 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
21412151
; AVX512BWVL-LABEL: truncstore_v4i64_v4i16:
21422152
; AVX512BWVL: # %bb.0:
21432153
; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1
2144-
; AVX512BWVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
2145-
; AVX512BWVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
2146-
; AVX512BWVL-NEXT: vpmovqw %ymm0, (%rdi) {%k1}
2154+
; AVX512BWVL-NEXT: vpmovsqw %ymm0, (%rdi) {%k1}
21472155
; AVX512BWVL-NEXT: vzeroupper
21482156
; AVX512BWVL-NEXT: retq
21492157
%a = icmp ne <4 x i32> %mask, zeroinitializer
@@ -2495,9 +2503,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
24952503
; AVX512BWVL-LABEL: truncstore_v4i64_v4i8:
24962504
; AVX512BWVL: # %bb.0:
24972505
; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1
2498-
; AVX512BWVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
2499-
; AVX512BWVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
2500-
; AVX512BWVL-NEXT: vpmovqb %ymm0, (%rdi) {%k1}
2506+
; AVX512BWVL-NEXT: vpmovsqb %ymm0, (%rdi) {%k1}
25012507
; AVX512BWVL-NEXT: vzeroupper
25022508
; AVX512BWVL-NEXT: retq
25032509
%a = icmp ne <4 x i32> %mask, zeroinitializer
@@ -2641,13 +2647,19 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
26412647
; AVX512-NEXT: vzeroupper
26422648
; AVX512-NEXT: retq
26432649
;
2644-
; AVX512VL-LABEL: truncstore_v2i64_v2i32:
2645-
; AVX512VL: # %bb.0:
2646-
; AVX512VL-NEXT: vptestmq %xmm1, %xmm1, %k1
2647-
; AVX512VL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
2648-
; AVX512VL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
2649-
; AVX512VL-NEXT: vpmovqd %xmm0, (%rdi) {%k1}
2650-
; AVX512VL-NEXT: retq
2650+
; AVX512FVL-LABEL: truncstore_v2i64_v2i32:
2651+
; AVX512FVL: # %bb.0:
2652+
; AVX512FVL-NEXT: vptestmq %xmm1, %xmm1, %k1
2653+
; AVX512FVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
2654+
; AVX512FVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
2655+
; AVX512FVL-NEXT: vpmovqd %xmm0, (%rdi) {%k1}
2656+
; AVX512FVL-NEXT: retq
2657+
;
2658+
; AVX512BWVL-LABEL: truncstore_v2i64_v2i32:
2659+
; AVX512BWVL: # %bb.0:
2660+
; AVX512BWVL-NEXT: vptestmq %xmm1, %xmm1, %k1
2661+
; AVX512BWVL-NEXT: vpmovsqd %xmm0, (%rdi) {%k1}
2662+
; AVX512BWVL-NEXT: retq
26512663
%a = icmp ne <2 x i64> %mask, zeroinitializer
26522664
%b = icmp slt <2 x i64> %x, <i64 2147483647, i64 2147483647>
26532665
%c = select <2 x i1> %b, <2 x i64> %x, <2 x i64> <i64 2147483647, i64 2147483647>
@@ -2832,9 +2844,7 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
28322844
; AVX512BWVL-LABEL: truncstore_v2i64_v2i16:
28332845
; AVX512BWVL: # %bb.0:
28342846
; AVX512BWVL-NEXT: vptestmq %xmm1, %xmm1, %k1
2835-
; AVX512BWVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
2836-
; AVX512BWVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
2837-
; AVX512BWVL-NEXT: vpmovqw %xmm0, (%rdi) {%k1}
2847+
; AVX512BWVL-NEXT: vpmovsqw %xmm0, (%rdi) {%k1}
28382848
; AVX512BWVL-NEXT: retq
28392849
%a = icmp ne <2 x i64> %mask, zeroinitializer
28402850
%b = icmp slt <2 x i64> %x, <i64 32767, i64 32767>
@@ -3018,9 +3028,7 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
30183028
; AVX512BWVL-LABEL: truncstore_v2i64_v2i8:
30193029
; AVX512BWVL: # %bb.0:
30203030
; AVX512BWVL-NEXT: vptestmq %xmm1, %xmm1, %k1
3021-
; AVX512BWVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
3022-
; AVX512BWVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
3023-
; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rdi) {%k1}
3031+
; AVX512BWVL-NEXT: vpmovsqb %xmm0, (%rdi) {%k1}
30243032
; AVX512BWVL-NEXT: retq
30253033
%a = icmp ne <2 x i64> %mask, zeroinitializer
30263034
%b = icmp slt <2 x i64> %x, <i64 127, i64 127>
@@ -3816,9 +3824,7 @@ define void @truncstore_v16i32_v16i16(<16 x i32> %x, ptr %p, <16 x i32> %mask) {
38163824
; AVX512BWVL-LABEL: truncstore_v16i32_v16i16:
38173825
; AVX512BWVL: # %bb.0:
38183826
; AVX512BWVL-NEXT: vptestmd %zmm1, %zmm1, %k1
3819-
; AVX512BWVL-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
3820-
; AVX512BWVL-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
3821-
; AVX512BWVL-NEXT: vpmovdw %zmm0, (%rdi) {%k1}
3827+
; AVX512BWVL-NEXT: vpmovsdw %zmm0, (%rdi) {%k1}
38223828
; AVX512BWVL-NEXT: vzeroupper
38233829
; AVX512BWVL-NEXT: retq
38243830
%a = icmp ne <16 x i32> %mask, zeroinitializer
@@ -4594,9 +4600,7 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) {
45944600
; AVX512BWVL-LABEL: truncstore_v16i32_v16i8:
45954601
; AVX512BWVL: # %bb.0:
45964602
; AVX512BWVL-NEXT: vptestmd %zmm1, %zmm1, %k1
4597-
; AVX512BWVL-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
4598-
; AVX512BWVL-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
4599-
; AVX512BWVL-NEXT: vpmovdb %zmm0, (%rdi) {%k1}
4603+
; AVX512BWVL-NEXT: vpmovsdb %zmm0, (%rdi) {%k1}
46004604
; AVX512BWVL-NEXT: vzeroupper
46014605
; AVX512BWVL-NEXT: retq
46024606
%a = icmp ne <16 x i32> %mask, zeroinitializer
@@ -5034,9 +5038,7 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
50345038
; AVX512BWVL-LABEL: truncstore_v8i32_v8i16:
50355039
; AVX512BWVL: # %bb.0:
50365040
; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1
5037-
; AVX512BWVL-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
5038-
; AVX512BWVL-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
5039-
; AVX512BWVL-NEXT: vpmovdw %ymm0, (%rdi) {%k1}
5041+
; AVX512BWVL-NEXT: vpmovsdw %ymm0, (%rdi) {%k1}
50405042
; AVX512BWVL-NEXT: vzeroupper
50415043
; AVX512BWVL-NEXT: retq
50425044
%a = icmp ne <8 x i32> %mask, zeroinitializer
@@ -5473,9 +5475,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
54735475
; AVX512BWVL-LABEL: truncstore_v8i32_v8i8:
54745476
; AVX512BWVL: # %bb.0:
54755477
; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1
5476-
; AVX512BWVL-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
5477-
; AVX512BWVL-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
5478-
; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rdi) {%k1}
5478+
; AVX512BWVL-NEXT: vpmovsdb %ymm0, (%rdi) {%k1}
54795479
; AVX512BWVL-NEXT: vzeroupper
54805480
; AVX512BWVL-NEXT: retq
54815481
%a = icmp ne <8 x i32> %mask, zeroinitializer
@@ -5686,9 +5686,7 @@ define void @truncstore_v4i32_v4i16(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
56865686
; AVX512BWVL-LABEL: truncstore_v4i32_v4i16:
56875687
; AVX512BWVL: # %bb.0:
56885688
; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1
5689-
; AVX512BWVL-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
5690-
; AVX512BWVL-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
5691-
; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rdi) {%k1}
5689+
; AVX512BWVL-NEXT: vpmovsdw %xmm0, (%rdi) {%k1}
56925690
; AVX512BWVL-NEXT: retq
56935691
%a = icmp ne <4 x i32> %mask, zeroinitializer
56945692
%b = icmp slt <4 x i32> %x, <i32 32767, i32 32767, i32 32767, i32 32767>
@@ -5904,9 +5902,7 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
59045902
; AVX512BWVL-LABEL: truncstore_v4i32_v4i8:
59055903
; AVX512BWVL: # %bb.0:
59065904
; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1
5907-
; AVX512BWVL-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
5908-
; AVX512BWVL-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
5909-
; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rdi) {%k1}
5905+
; AVX512BWVL-NEXT: vpmovsdb %xmm0, (%rdi) {%k1}
59105906
; AVX512BWVL-NEXT: retq
59115907
%a = icmp ne <4 x i32> %mask, zeroinitializer
59125908
%b = icmp slt <4 x i32> %x, <i32 127, i32 127, i32 127, i32 127>
@@ -7332,9 +7328,7 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) {
73327328
; AVX512BWVL-LABEL: truncstore_v32i16_v32i8:
73337329
; AVX512BWVL: # %bb.0:
73347330
; AVX512BWVL-NEXT: vptestmb %ymm1, %ymm1, %k1
7335-
; AVX512BWVL-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
7336-
; AVX512BWVL-NEXT: vpmaxsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
7337-
; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rdi) {%k1}
7331+
; AVX512BWVL-NEXT: vpmovswb %zmm0, (%rdi) {%k1}
73387332
; AVX512BWVL-NEXT: vzeroupper
73397333
; AVX512BWVL-NEXT: retq
73407334
%a = icmp ne <32 x i8> %mask, zeroinitializer
@@ -8083,9 +8077,7 @@ define void @truncstore_v16i16_v16i8(<16 x i16> %x, ptr %p, <16 x i8> %mask) {
80838077
; AVX512BWVL-LABEL: truncstore_v16i16_v16i8:
80848078
; AVX512BWVL: # %bb.0:
80858079
; AVX512BWVL-NEXT: vptestmb %xmm1, %xmm1, %k1
8086-
; AVX512BWVL-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
8087-
; AVX512BWVL-NEXT: vpmaxsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
8088-
; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rdi) {%k1}
8080+
; AVX512BWVL-NEXT: vpmovswb %ymm0, (%rdi) {%k1}
80898081
; AVX512BWVL-NEXT: vzeroupper
80908082
; AVX512BWVL-NEXT: retq
80918083
%a = icmp ne <16 x i8> %mask, zeroinitializer
@@ -8445,9 +8437,7 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) {
84458437
; AVX512BWVL-LABEL: truncstore_v8i16_v8i8:
84468438
; AVX512BWVL: # %bb.0:
84478439
; AVX512BWVL-NEXT: vptestmw %xmm1, %xmm1, %k1
8448-
; AVX512BWVL-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
8449-
; AVX512BWVL-NEXT: vpmaxsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
8450-
; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rdi) {%k1}
8440+
; AVX512BWVL-NEXT: vpmovswb %xmm0, (%rdi) {%k1}
84518441
; AVX512BWVL-NEXT: retq
84528442
%a = icmp ne <8 x i16> %mask, zeroinitializer
84538443
%b = icmp slt <8 x i16> %x, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>

0 commit comments

Comments
 (0)