Commit ec062f6

RKSimon authored and memfrob committed
[ScalarizeMaskedMemIntrin] Scalarize constant mask expandload as shuffle(build_vector,pass_through)
As noticed on D66004, scalarization of an expandload with a constant mask as a chain of irregular loads+inserts makes it tricky to optimize before lowering, resulting in difficulties in merging loads etc.

This patch instead scalarizes the expansion to a build_vector(load0, load1, undef, load2, ...) style pattern and then performs a blend shuffle with the pass-through vector. This allows us to more easily make use of all the build_vector combines, merging of consecutive loads, etc.

Differential Revision: https://reviews.llvm.org/D85416
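To make the new shape concrete, here is a hand-written IR sketch (illustrative only, not output of the pass) for a hypothetical <4 x float> expandload whose constant mask is <1, 1, 0, 1>; value names follow the Load/Res scheme the pass uses:

define <4 x float> @expand_sketch(float* %base, <4 x float> %pass) {
  ; Enabled lanes 0, 1 and 3 read consecutive memory slots 0, 1 and 2:
  ; MemIndex advances only for enabled lanes (the GEP for slot 0 folds away).
  %Load0 = load float, float* %base, align 1
  %p1 = getelementptr inbounds float, float* %base, i32 1
  %Load1 = load float, float* %p1, align 1
  %p2 = getelementptr inbounds float, float* %base, i32 2
  %Load3 = load float, float* %p2, align 1
  ; build_vector-style insert chain over undef; the disabled lane 2 gets undef.
  %Res0 = insertelement <4 x float> undef, float %Load0, i32 0
  %Res1 = insertelement <4 x float> %Res0, float %Load1, i32 1
  %Res2 = insertelement <4 x float> %Res1, float undef, i32 2
  %Res3 = insertelement <4 x float> %Res2, float %Load3, i32 3
  ; Blend with the pass-through: shuffle index 6 = 2 + VectorWidth selects
  ; element 2 of the second operand, so lane 2 comes from %pass.
  %blend = shufflevector <4 x float> %Res3, <4 x float> %pass, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
  ret <4 x float> %blend
}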
1 parent d3018a9 commit ec062f6

File tree

2 files changed: +56 -57 lines changed

llvm/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp

Lines changed: 19 additions & 8 deletions
@@ -622,18 +622,29 @@ static void scalarizeMaskedExpandLoad(CallInst *CI, bool &ModifiedDT) {
   Value *VResult = PassThru;
 
   // Shorten the way if the mask is a vector of constants.
+  // Create a build_vector pattern, with loads/undefs as necessary and then
+  // shuffle blend with the pass through value.
   if (isConstantIntVector(Mask)) {
     unsigned MemIndex = 0;
+    VResult = UndefValue::get(VecType);
+    SmallVector<int, 16> ShuffleMask(VectorWidth, UndefMaskElem);
     for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
-      if (cast<Constant>(Mask)->getAggregateElement(Idx)->isNullValue())
-        continue;
-      Value *NewPtr = Builder.CreateConstInBoundsGEP1_32(EltTy, Ptr, MemIndex);
-      LoadInst *Load = Builder.CreateAlignedLoad(EltTy, NewPtr, Align(1),
-                                                 "Load" + Twine(Idx));
-      VResult =
-          Builder.CreateInsertElement(VResult, Load, Idx, "Res" + Twine(Idx));
-      ++MemIndex;
+      Value *InsertElt;
+      if (cast<Constant>(Mask)->getAggregateElement(Idx)->isNullValue()) {
+        InsertElt = UndefValue::get(EltTy);
+        ShuffleMask[Idx] = Idx + VectorWidth;
+      } else {
+        Value *NewPtr =
+            Builder.CreateConstInBoundsGEP1_32(EltTy, Ptr, MemIndex);
+        InsertElt = Builder.CreateAlignedLoad(EltTy, NewPtr, Align(1),
+                                              "Load" + Twine(Idx));
+        ShuffleMask[Idx] = Idx;
+        ++MemIndex;
+      }
+      VResult = Builder.CreateInsertElement(VResult, InsertElt, Idx,
+                                            "Res" + Twine(Idx));
     }
+    VResult = Builder.CreateShuffleVector(VResult, PassThru, ShuffleMask);
     CI->replaceAllUsesWith(VResult);
     CI->eraseFromParent();
     return;
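For contrast, the removed code expanded the same hypothetical mask <1, 1, 0, 1> by inserting each load directly into the pass-through chain; since the chain does not start from undef it never matches a build_vector pattern, which is what made merging the consecutive loads difficult (sketch, same caveats as above):

define <4 x float> @expand_old_sketch(float* %base, <4 x float> %pass) {
  %Load0 = load float, float* %base, align 1
  %Res0 = insertelement <4 x float> %pass, float %Load0, i32 0
  %p1 = getelementptr inbounds float, float* %base, i32 1
  %Load1 = load float, float* %p1, align 1
  %Res1 = insertelement <4 x float> %Res0, float %Load1, i32 1
  ; Disabled lane 2 was simply skipped, leaving %pass's element in place.
  %p2 = getelementptr inbounds float, float* %base, i32 2
  %Load3 = load float, float* %p2, align 1
  %Res3 = insertelement <4 x float> %Res1, float %Load3, i32 3
  ret <4 x float> %Res3
}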

llvm/test/CodeGen/X86/masked_expandload.ll

Lines changed: 37 additions & 49 deletions
@@ -1222,29 +1222,27 @@ define <2 x float> @expandload_v2f32_v2i1(float* %base, <2 x float> %src0, <2 x
 define <4 x float> @expandload_v4f32_const(float* %base, <4 x float> %src0) {
 ; SSE2-LABEL: expandload_v4f32_const:
 ; SSE2:       ## %bb.0:
-; SSE2-NEXT:    movss (%rdi), %xmm2 ## xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT:    movss 4(%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm2[0,0]
 ; SSE2-NEXT:    movss 8(%rdi), %xmm2 ## xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[3,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[0,2]
+; SSE2-NEXT:    movsd (%rdi), %xmm1 ## xmm1 = mem[0],zero
+; SSE2-NEXT:    movaps %xmm1, %xmm3
+; SSE2-NEXT:    movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm3[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
 ; SSE2-NEXT:    movaps %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE42-LABEL: expandload_v4f32_const:
 ; SSE42:       ## %bb.0:
-; SSE42-NEXT:    movss (%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero
-; SSE42-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; SSE42-NEXT:    insertps $16, 4(%rdi), %xmm0 ## xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; SSE42-NEXT:    insertps $32, 8(%rdi), %xmm0 ## xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; SSE42-NEXT:    movsd (%rdi), %xmm1 ## xmm1 = mem[0],zero
+; SSE42-NEXT:    insertps $32, 8(%rdi), %xmm1 ## xmm1 = xmm1[0,1],mem[0],xmm1[3]
+; SSE42-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
 ; SSE42-NEXT:    retq
 ;
 ; AVX1OR2-LABEL: expandload_v4f32_const:
 ; AVX1OR2:       ## %bb.0:
-; AVX1OR2-NEXT:    vmovss (%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero
-; AVX1OR2-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; AVX1OR2-NEXT:    vinsertps $16, 4(%rdi), %xmm0, %xmm0 ## xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; AVX1OR2-NEXT:    vinsertps $32, 8(%rdi), %xmm0, %xmm0 ## xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; AVX1OR2-NEXT:    vmovsd (%rdi), %xmm1 ## xmm1 = mem[0],zero
+; AVX1OR2-NEXT:    vinsertps $32, 8(%rdi), %xmm1, %xmm1 ## xmm1 = xmm1[0,1],mem[0],xmm1[3]
+; AVX1OR2-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
 ; AVX1OR2-NEXT:    retq
 ;
 ; AVX512F-LABEL: expandload_v4f32_const:
@@ -1277,55 +1275,45 @@ define <4 x float> @expandload_v4f32_const(float* %base, <4 x float> %src0) {
 define <16 x float> @expandload_v16f32_const(float* %base, <16 x float> %src0) {
 ; SSE2-LABEL: expandload_v16f32_const:
 ; SSE2:       ## %bb.0:
+; SSE2-NEXT:    movss 52(%rdi), %xmm0 ## xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movsd 44(%rdi), %xmm4 ## xmm4 = mem[0],zero
+; SSE2-NEXT:    movaps %xmm4, %xmm6
+; SSE2-NEXT:    movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0]
+; SSE2-NEXT:    movss 40(%rdi), %xmm0 ## xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movsd 32(%rdi), %xmm5 ## xmm5 = mem[0],zero
+; SSE2-NEXT:    movaps %xmm5, %xmm7
+; SSE2-NEXT:    movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0]
 ; SSE2-NEXT:    movups (%rdi), %xmm0
 ; SSE2-NEXT:    movups 16(%rdi), %xmm1
-; SSE2-NEXT:    movss 32(%rdi), %xmm5 ## xmm5 = mem[0],zero,zero,zero
-; SSE2-NEXT:    movss 36(%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
-; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,0],xmm5[0,0]
-; SSE2-NEXT:    movss 40(%rdi), %xmm5 ## xmm5 = mem[0],zero,zero,zero
-; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,0],xmm2[3,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[2,0],xmm5[0,2]
-; SSE2-NEXT:    movss 44(%rdi), %xmm2 ## xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT:    movss 48(%rdi), %xmm5 ## xmm5 = mem[0],zero,zero,zero
-; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,0],xmm2[0,0]
-; SSE2-NEXT:    movss 52(%rdi), %xmm2 ## xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm3[3,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,0],xmm2[0,2]
-; SSE2-NEXT:    movaps %xmm4, %xmm2
-; SSE2-NEXT:    movaps %xmm5, %xmm3
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,0],xmm7[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,0],xmm6[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0]
+; SSE2-NEXT:    movaps %xmm5, %xmm2
+; SSE2-NEXT:    movaps %xmm4, %xmm3
 ; SSE2-NEXT:    retq
 ;
 ; SSE42-LABEL: expandload_v16f32_const:
 ; SSE42:       ## %bb.0:
+; SSE42-NEXT:    movsd 44(%rdi), %xmm4 ## xmm4 = mem[0],zero
+; SSE42-NEXT:    insertps $32, 52(%rdi), %xmm4 ## xmm4 = xmm4[0,1],mem[0],xmm4[3]
+; SSE42-NEXT:    movsd 32(%rdi), %xmm5 ## xmm5 = mem[0],zero
+; SSE42-NEXT:    insertps $32, 40(%rdi), %xmm5 ## xmm5 = xmm5[0,1],mem[0],xmm5[3]
 ; SSE42-NEXT:    movups (%rdi), %xmm0
 ; SSE42-NEXT:    movups 16(%rdi), %xmm1
-; SSE42-NEXT:    movss 32(%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
-; SSE42-NEXT:    blendps {{.*#+}} xmm2 = xmm4[0],xmm2[1,2,3]
-; SSE42-NEXT:    insertps $16, 36(%rdi), %xmm2 ## xmm2 = xmm2[0],mem[0],xmm2[2,3]
-; SSE42-NEXT:    insertps $32, 40(%rdi), %xmm2 ## xmm2 = xmm2[0,1],mem[0],xmm2[3]
-; SSE42-NEXT:    movss 44(%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
-; SSE42-NEXT:    blendps {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3]
-; SSE42-NEXT:    insertps $16, 48(%rdi), %xmm3 ## xmm3 = xmm3[0],mem[0],xmm3[2,3]
-; SSE42-NEXT:    insertps $32, 52(%rdi), %xmm3 ## xmm3 = xmm3[0,1],mem[0],xmm3[3]
+; SSE42-NEXT:    blendps {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3]
+; SSE42-NEXT:    blendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
 ; SSE42-NEXT:    retq
 ;
 ; AVX1OR2-LABEL: expandload_v16f32_const:
 ; AVX1OR2:       ## %bb.0:
-; AVX1OR2-NEXT:    vmovups (%rdi), %xmm0
-; AVX1OR2-NEXT:    vmovsd 16(%rdi), %xmm2 ## xmm2 = mem[0],zero
-; AVX1OR2-NEXT:    vinsertps $32, 24(%rdi), %xmm2, %xmm2 ## xmm2 = xmm2[0,1],mem[0],xmm2[3]
-; AVX1OR2-NEXT:    vinsertps $48, 28(%rdi), %xmm2, %xmm2 ## xmm2 = xmm2[0,1,2],mem[0]
-; AVX1OR2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1OR2-NEXT:    vmovss 32(%rdi), %xmm2 ## xmm2 = mem[0],zero,zero,zero
-; AVX1OR2-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0],xmm1[1,2,3]
-; AVX1OR2-NEXT:    vinsertps $16, 36(%rdi), %xmm2, %xmm2 ## xmm2 = xmm2[0],mem[0],xmm2[2,3]
+; AVX1OR2-NEXT:    vmovsd 44(%rdi), %xmm0 ## xmm0 = mem[0],zero
+; AVX1OR2-NEXT:    vinsertps $32, 52(%rdi), %xmm0, %xmm0 ## xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; AVX1OR2-NEXT:    vmovsd 32(%rdi), %xmm2 ## xmm2 = mem[0],zero
 ; AVX1OR2-NEXT:    vinsertps $32, 40(%rdi), %xmm2, %xmm2 ## xmm2 = xmm2[0,1],mem[0],xmm2[3]
-; AVX1OR2-NEXT:    vmovss 44(%rdi), %xmm3 ## xmm3 = mem[0],zero,zero,zero
-; AVX1OR2-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX1OR2-NEXT:    vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3]
-; AVX1OR2-NEXT:    vinsertps $16, 48(%rdi), %xmm1, %xmm1 ## xmm1 = xmm1[0],mem[0],xmm1[2,3]
-; AVX1OR2-NEXT:    vinsertps $32, 52(%rdi), %xmm1, %xmm1 ## xmm1 = xmm1[0,1],mem[0],xmm1[3]
-; AVX1OR2-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1OR2-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm2
+; AVX1OR2-NEXT:    vmovups (%rdi), %ymm0
+; AVX1OR2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
 ; AVX1OR2-NEXT:    retq
 ;
 ; AVX512F-LABEL: expandload_v16f32_const:
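For orientation, the two affected tests exercise @llvm.masked.expandload.* with all-constant masks. The IR below is a sketch of their shape, with the mask values inferred from which blend lanes keep the pass-through in the CHECK lines above; the authoritative IR is in the unchanged part of the test file:

declare <4 x float> @llvm.masked.expandload.v4f32(float*, <4 x i1>, <4 x float>)
declare <16 x float> @llvm.masked.expandload.v16f32(float*, <16 x i1>, <16 x float>)

; expandload_v4f32_const: lanes 0-2 load consecutive floats, lane 3 keeps
; %src0 (each blend above preserves only element 3 of the pass-through).
define <4 x float> @v4_shape(float* %base, <4 x float> %src0) {
  %res = call <4 x float> @llvm.masked.expandload.v4f32(float* %base, <4 x i1> <i1 true, i1 true, i1 true, i1 false>, <4 x float> %src0)
  ret <4 x float> %res
}

; expandload_v16f32_const: apparently lanes 11 and 15 are disabled; the 14
; enabled lanes read 14 consecutive floats (offsets 0..52 in the asm).
define <16 x float> @v16_shape(float* %base, <16 x float> %src0) {
  %res = call <16 x float> @llvm.masked.expandload.v16f32(float* %base, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false>, <16 x float> %src0)
  ret <16 x float> %res
}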
