@@ -1222,29 +1222,27 @@ define <2 x float> @expandload_v2f32_v2i1(float* %base, <2 x float> %src0, <2 x
 define <4 x float> @expandload_v4f32_const(float* %base, <4 x float> %src0) {
 ; SSE2-LABEL: expandload_v4f32_const:
 ; SSE2:       ## %bb.0:
-; SSE2-NEXT:    movss (%rdi), %xmm2 ## xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT:    movss 4(%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm2[0,0]
 ; SSE2-NEXT:    movss 8(%rdi), %xmm2 ## xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[3,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[0,2]
+; SSE2-NEXT:    movsd (%rdi), %xmm1 ## xmm1 = mem[0],zero
+; SSE2-NEXT:    movaps %xmm1, %xmm3
+; SSE2-NEXT:    movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm3[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
 ; SSE2-NEXT:    movaps %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE42-LABEL: expandload_v4f32_const:
 ; SSE42:       ## %bb.0:
-; SSE42-NEXT:    movss (%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero
-; SSE42-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; SSE42-NEXT:    insertps $16, 4(%rdi), %xmm0 ## xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; SSE42-NEXT:    insertps $32, 8(%rdi), %xmm0 ## xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; SSE42-NEXT:    movsd (%rdi), %xmm1 ## xmm1 = mem[0],zero
+; SSE42-NEXT:    insertps $32, 8(%rdi), %xmm1 ## xmm1 = xmm1[0,1],mem[0],xmm1[3]
+; SSE42-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
 ; SSE42-NEXT:    retq
 ;
 ; AVX1OR2-LABEL: expandload_v4f32_const:
 ; AVX1OR2:       ## %bb.0:
-; AVX1OR2-NEXT:    vmovss (%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero
-; AVX1OR2-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; AVX1OR2-NEXT:    vinsertps $16, 4(%rdi), %xmm0, %xmm0 ## xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; AVX1OR2-NEXT:    vinsertps $32, 8(%rdi), %xmm0, %xmm0 ## xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; AVX1OR2-NEXT:    vmovsd (%rdi), %xmm1 ## xmm1 = mem[0],zero
+; AVX1OR2-NEXT:    vinsertps $32, 8(%rdi), %xmm1, %xmm1 ## xmm1 = xmm1[0,1],mem[0],xmm1[3]
+; AVX1OR2-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
 ; AVX1OR2-NEXT:    retq
 ;
 ; AVX512F-LABEL: expandload_v4f32_const:
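(For reference: the hunk above shows only the regenerated CHECK lines, not the IR under test. Judging from the lanes the new code fills, three consecutive loads from %base with lane 3 passed through from %src0, the v4f32 test plausibly calls the expandload intrinsic with a constant <1,1,1,0> mask. A minimal sketch, with the mask constant inferred from the codegen rather than copied from the test file:

; Sketch only: mask inferred from the CHECK lines above, not verbatim.
define <4 x float> @expandload_v4f32_const(float* %base, <4 x float> %src0) {
  ; Lanes 0-2 take consecutive elements from %base; lane 3 keeps %src0.
  %res = call <4 x float> @llvm.masked.expandload.v4f32(float* %base, <4 x i1> <i1 true, i1 true, i1 true, i1 false>, <4 x float> %src0)
  ret <4 x float> %res
}
declare <4 x float> @llvm.masked.expandload.v4f32(float*, <4 x i1>, <4 x float>)
)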
@@ -1277,55 +1275,45 @@ define <4 x float> @expandload_v4f32_const(float* %base, <4 x float> %src0) {
 define <16 x float> @expandload_v16f32_const(float* %base, <16 x float> %src0) {
 ; SSE2-LABEL: expandload_v16f32_const:
 ; SSE2:       ## %bb.0:
+; SSE2-NEXT:    movss 52(%rdi), %xmm0 ## xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movsd 44(%rdi), %xmm4 ## xmm4 = mem[0],zero
+; SSE2-NEXT:    movaps %xmm4, %xmm6
+; SSE2-NEXT:    movlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0]
+; SSE2-NEXT:    movss 40(%rdi), %xmm0 ## xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movsd 32(%rdi), %xmm5 ## xmm5 = mem[0],zero
+; SSE2-NEXT:    movaps %xmm5, %xmm7
+; SSE2-NEXT:    movlhps {{.*#+}} xmm7 = xmm7[0],xmm0[0]
 ; SSE2-NEXT:    movups (%rdi), %xmm0
 ; SSE2-NEXT:    movups 16(%rdi), %xmm1
-; SSE2-NEXT:    movss 32(%rdi), %xmm5 ## xmm5 = mem[0],zero,zero,zero
-; SSE2-NEXT:    movss 36(%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
-; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,0],xmm5[0,0]
-; SSE2-NEXT:    movss 40(%rdi), %xmm5 ## xmm5 = mem[0],zero,zero,zero
-; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,0],xmm2[3,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[2,0],xmm5[0,2]
-; SSE2-NEXT:    movss 44(%rdi), %xmm2 ## xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT:    movss 48(%rdi), %xmm5 ## xmm5 = mem[0],zero,zero,zero
-; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,0],xmm2[0,0]
-; SSE2-NEXT:    movss 52(%rdi), %xmm2 ## xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm3[3,0]
-; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,0],xmm2[0,2]
-; SSE2-NEXT:    movaps %xmm4, %xmm2
-; SSE2-NEXT:    movaps %xmm5, %xmm3
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,0],xmm7[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,0],xmm6[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0]
+; SSE2-NEXT:    movaps %xmm5, %xmm2
+; SSE2-NEXT:    movaps %xmm4, %xmm3
 ; SSE2-NEXT:    retq
 ;
 ; SSE42-LABEL: expandload_v16f32_const:
 ; SSE42:       ## %bb.0:
+; SSE42-NEXT:    movsd 44(%rdi), %xmm4 ## xmm4 = mem[0],zero
+; SSE42-NEXT:    insertps $32, 52(%rdi), %xmm4 ## xmm4 = xmm4[0,1],mem[0],xmm4[3]
+; SSE42-NEXT:    movsd 32(%rdi), %xmm5 ## xmm5 = mem[0],zero
+; SSE42-NEXT:    insertps $32, 40(%rdi), %xmm5 ## xmm5 = xmm5[0,1],mem[0],xmm5[3]
 ; SSE42-NEXT:    movups (%rdi), %xmm0
 ; SSE42-NEXT:    movups 16(%rdi), %xmm1
-; SSE42-NEXT:    movss 32(%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
-; SSE42-NEXT:    blendps {{.*#+}} xmm2 = xmm4[0],xmm2[1,2,3]
-; SSE42-NEXT:    insertps $16, 36(%rdi), %xmm2 ## xmm2 = xmm2[0],mem[0],xmm2[2,3]
-; SSE42-NEXT:    insertps $32, 40(%rdi), %xmm2 ## xmm2 = xmm2[0,1],mem[0],xmm2[3]
-; SSE42-NEXT:    movss 44(%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
-; SSE42-NEXT:    blendps {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3]
-; SSE42-NEXT:    insertps $16, 48(%rdi), %xmm3 ## xmm3 = xmm3[0],mem[0],xmm3[2,3]
-; SSE42-NEXT:    insertps $32, 52(%rdi), %xmm3 ## xmm3 = xmm3[0,1],mem[0],xmm3[3]
+; SSE42-NEXT:    blendps {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3]
+; SSE42-NEXT:    blendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
 ; SSE42-NEXT:    retq
 ;
 ; AVX1OR2-LABEL: expandload_v16f32_const:
 ; AVX1OR2:       ## %bb.0:
-; AVX1OR2-NEXT:    vmovups (%rdi), %xmm0
-; AVX1OR2-NEXT:    vmovsd 16(%rdi), %xmm2 ## xmm2 = mem[0],zero
-; AVX1OR2-NEXT:    vinsertps $32, 24(%rdi), %xmm2, %xmm2 ## xmm2 = xmm2[0,1],mem[0],xmm2[3]
-; AVX1OR2-NEXT:    vinsertps $48, 28(%rdi), %xmm2, %xmm2 ## xmm2 = xmm2[0,1,2],mem[0]
-; AVX1OR2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1OR2-NEXT:    vmovss 32(%rdi), %xmm2 ## xmm2 = mem[0],zero,zero,zero
-; AVX1OR2-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0],xmm1[1,2,3]
-; AVX1OR2-NEXT:    vinsertps $16, 36(%rdi), %xmm2, %xmm2 ## xmm2 = xmm2[0],mem[0],xmm2[2,3]
+; AVX1OR2-NEXT:    vmovsd 44(%rdi), %xmm0 ## xmm0 = mem[0],zero
+; AVX1OR2-NEXT:    vinsertps $32, 52(%rdi), %xmm0, %xmm0 ## xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; AVX1OR2-NEXT:    vmovsd 32(%rdi), %xmm2 ## xmm2 = mem[0],zero
 ; AVX1OR2-NEXT:    vinsertps $32, 40(%rdi), %xmm2, %xmm2 ## xmm2 = xmm2[0,1],mem[0],xmm2[3]
-; AVX1OR2-NEXT:    vmovss 44(%rdi), %xmm3 ## xmm3 = mem[0],zero,zero,zero
-; AVX1OR2-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX1OR2-NEXT:    vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3]
-; AVX1OR2-NEXT:    vinsertps $16, 48(%rdi), %xmm1, %xmm1 ## xmm1 = xmm1[0],mem[0],xmm1[2,3]
-; AVX1OR2-NEXT:    vinsertps $32, 52(%rdi), %xmm1, %xmm1 ## xmm1 = xmm1[0,1],mem[0],xmm1[3]
-; AVX1OR2-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1OR2-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm2
+; AVX1OR2-NEXT:    vmovups (%rdi), %ymm0
+; AVX1OR2-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7]
 ; AVX1OR2-NEXT:    retq
 ;
 ; AVX512F-LABEL: expandload_v16f32_const:
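(The v16f32 variant's mask can be read off the same way from the offsets loaded above: 0-40 fill lanes 0-10 contiguously, 44/48/52 fill lanes 12-14, and lanes 11 and 15 are blended in from %src0. A hedged reconstruction under the same caveat:

; Sketch only: 14 active lanes, inferred from the offsets in the CHECK lines.
define <16 x float> @expandload_v16f32_const(float* %base, <16 x float> %src0) {
  %res = call <16 x float> @llvm.masked.expandload.v16f32(float* %base, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false>, <16 x float> %src0)
  ret <16 x float> %res
}
declare <16 x float> @llvm.masked.expandload.v16f32(float*, <16 x i1>, <16 x float>)
)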