@@ -302,16 +302,18 @@ define <8 x i16> @gep01_load_i16_insert_v8i16_deref_minalign(ptr align 2 derefer
302302 ret <8 x i16 > %r
303303}
304304
305- ; Negative test - if we are shuffling a load from the base pointer, the address offset
306- ; must be a multiple of element size.
307- ; TODO: Could bitcast around this limitation.
308-
309305define <4 x i32 > @gep01_bitcast_load_i32_from_v16i8_insert_v4i32 (ptr align 1 dereferenceable (16 ) %p ) {
310- ; CHECK-LABEL: @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(
311- ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
312- ; CHECK-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
313- ; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
314- ; CHECK-NEXT: ret <4 x i32> [[R]]
306+ ; SSE2-LABEL: @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(
307+ ; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
308+ ; SSE2-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
309+ ; SSE2-NEXT: [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
310+ ; SSE2-NEXT: ret <4 x i32> [[R]]
311+ ;
312+ ; AVX2-LABEL: @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(
313+ ; AVX2-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[P:%.*]], align 1
314+ ; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
315+ ; AVX2-NEXT: [[R:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
316+ ; AVX2-NEXT: ret <4 x i32> [[R]]
315317;
316318 %gep = getelementptr inbounds <16 x i8 >, ptr %p , i64 0 , i64 1
317319 %s = load i32 , ptr %gep , align 1
@@ -320,11 +322,17 @@ define <4 x i32> @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(ptr align 1 der
320322}
321323
322324define <2 x i64 > @gep01_bitcast_load_i64_from_v16i8_insert_v2i64 (ptr align 1 dereferenceable (16 ) %p ) {
323- ; CHECK-LABEL: @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(
324- ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
325- ; CHECK-NEXT: [[S:%.*]] = load i64, ptr [[GEP]], align 1
326- ; CHECK-NEXT: [[R:%.*]] = insertelement <2 x i64> poison, i64 [[S]], i64 0
327- ; CHECK-NEXT: ret <2 x i64> [[R]]
325+ ; SSE2-LABEL: @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(
326+ ; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
327+ ; SSE2-NEXT: [[S:%.*]] = load i64, ptr [[GEP]], align 1
328+ ; SSE2-NEXT: [[R:%.*]] = insertelement <2 x i64> poison, i64 [[S]], i64 0
329+ ; SSE2-NEXT: ret <2 x i64> [[R]]
330+ ;
331+ ; AVX2-LABEL: @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(
332+ ; AVX2-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[P:%.*]], align 1
333+ ; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
334+ ; AVX2-NEXT: [[R:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
335+ ; AVX2-NEXT: ret <2 x i64> [[R]]
328336;
329337 %gep = getelementptr inbounds <16 x i8 >, ptr %p , i64 0 , i64 1
330338 %s = load i64 , ptr %gep , align 1
@@ -333,11 +341,17 @@ define <2 x i64> @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(ptr align 1 der
333341}
334342
335343define <4 x i32 > @gep11_bitcast_load_i32_from_v16i8_insert_v4i32 (ptr align 1 dereferenceable (16 ) %p ) {
336- ; CHECK-LABEL: @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(
337- ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 11
338- ; CHECK-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
339- ; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
340- ; CHECK-NEXT: ret <4 x i32> [[R]]
344+ ; SSE2-LABEL: @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(
345+ ; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 11
346+ ; SSE2-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
347+ ; SSE2-NEXT: [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
348+ ; SSE2-NEXT: ret <4 x i32> [[R]]
349+ ;
350+ ; AVX2-LABEL: @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(
351+ ; AVX2-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[P:%.*]], align 1
352+ ; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
353+ ; AVX2-NEXT: [[R:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
354+ ; AVX2-NEXT: ret <4 x i32> [[R]]
341355;
342356 %gep = getelementptr inbounds <16 x i8 >, ptr %p , i64 0 , i64 11
343357 %s = load i32 , ptr %gep , align 1
@@ -346,11 +360,17 @@ define <4 x i32> @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(ptr align 1 der
346360}
347361
348362define <4 x i32 > @gep01_bitcast_load_i32_from_v8i16_insert_v4i32 (ptr align 1 dereferenceable (16 ) %p ) {
349- ; CHECK-LABEL: @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(
350- ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
351- ; CHECK-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
352- ; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
353- ; CHECK-NEXT: ret <4 x i32> [[R]]
363+ ; SSE2-LABEL: @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(
364+ ; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
365+ ; SSE2-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
366+ ; SSE2-NEXT: [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
367+ ; SSE2-NEXT: ret <4 x i32> [[R]]
368+ ;
369+ ; AVX2-LABEL: @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(
370+ ; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 1
371+ ; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 1, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
372+ ; AVX2-NEXT: [[R:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32>
373+ ; AVX2-NEXT: ret <4 x i32> [[R]]
354374;
355375 %gep = getelementptr inbounds <8 x i16 >, ptr %p , i64 0 , i64 1
356376 %s = load i32 , ptr %gep , align 1
@@ -359,11 +379,17 @@ define <4 x i32> @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(ptr align 1 der
359379}
360380
361381define <2 x i64 > @gep01_bitcast_load_i64_from_v8i16_insert_v2i64 (ptr align 1 dereferenceable (16 ) %p ) {
362- ; CHECK-LABEL: @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(
363- ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
364- ; CHECK-NEXT: [[S:%.*]] = load i64, ptr [[GEP]], align 1
365- ; CHECK-NEXT: [[R:%.*]] = insertelement <2 x i64> poison, i64 [[S]], i64 0
366- ; CHECK-NEXT: ret <2 x i64> [[R]]
382+ ; SSE2-LABEL: @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(
383+ ; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
384+ ; SSE2-NEXT: [[S:%.*]] = load i64, ptr [[GEP]], align 1
385+ ; SSE2-NEXT: [[R:%.*]] = insertelement <2 x i64> poison, i64 [[S]], i64 0
386+ ; SSE2-NEXT: ret <2 x i64> [[R]]
387+ ;
388+ ; AVX2-LABEL: @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(
389+ ; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 1
390+ ; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 poison, i32 poison, i32 poison, i32 poison>
391+ ; AVX2-NEXT: [[R:%.*]] = bitcast <8 x i16> [[TMP2]] to <2 x i64>
392+ ; AVX2-NEXT: ret <2 x i64> [[R]]
367393;
368394 %gep = getelementptr inbounds <8 x i16 >, ptr %p , i64 0 , i64 1
369395 %s = load i64 , ptr %gep , align 1
@@ -372,23 +398,29 @@ define <2 x i64> @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(ptr align 1 der
372398}
373399
; Test: an i32 is loaded (align 1) through a GEP to element index 5 of a <8 x i16>
; (byte offset 10 from %p) and inserted into lane 0 of a poison <4 x i32>.
; The SSE2 check lines expect the same GEP / scalar load / insertelement sequence as
; the input IR. The AVX2 check lines expect a full <8 x i16> load from %p, a shuffle
; that moves source elements 5 and 6 into lanes 0 and 1, and a bitcast to <4 x i32>.
374400define <4 x i32 > @gep05_bitcast_load_i32_from_v8i16_insert_v4i32 (ptr align 1 dereferenceable (16 ) %p ) {
375- ; CHECK-LABEL: @gep05_bitcast_load_i32_from_v8i16_insert_v4i32(
376- ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 5
377- ; CHECK-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
378- ; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
379- ; CHECK-NEXT: ret <4 x i32> [[R]]
401+ ; SSE2-LABEL: @gep05_bitcast_load_i32_from_v8i16_insert_v4i32(
402+ ; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 5
403+ ; SSE2-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
404+ ; SSE2-NEXT: [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
405+ ; SSE2-NEXT: ret <4 x i32> [[R]]
406+ ;
407+ ; AVX2-LABEL: @gep05_bitcast_load_i32_from_v8i16_insert_v4i32(
408+ ; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 1
409+ ; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 5, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
410+ ; AVX2-NEXT: [[R:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32>
411+ ; AVX2-NEXT: ret <4 x i32> [[R]]
380412;
381413 %gep = getelementptr inbounds <8 x i16 >, ptr %p , i64 0 , i64 5
382414 %s = load i32 , ptr %gep , align 1
383415 %r = insertelement <4 x i32 > poison, i32 %s , i64 0
384416 ret <4 x i32 > %r
385417}
386418
387- define <2 x i64 > @gep01_bitcast_load_i32_from_v4i32_insert_v2i64 (ptr align 1 dereferenceable (16 ) %p ) nofree nosync {
419+ define <2 x i64 > @gep01_bitcast_load_i32_from_v4i32_insert_v2i64 (ptr align 1 dereferenceable (16 ) %p ) {
388420; CHECK-LABEL: @gep01_bitcast_load_i32_from_v4i32_insert_v2i64(
389- ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <4 x i32>, ptr [[P:%.*]], i64 0, i64 1
390- ; CHECK-NEXT: [[S:%.*]] = load i64, ptr [[GEP]], align 1
391- ; CHECK-NEXT: [[R:%.*]] = insertelement <2 x i64> poison, i64 [[S]], i64 0
421+ ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 1
422+ ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
423+ ; CHECK-NEXT: [[R:%.*]] = bitcast <4 x i32> [[TMP2]] to <2 x i64>
392424; CHECK-NEXT: ret <2 x i64> [[R]]
393425;
394426 %gep = getelementptr inbounds <4 x i32 >, ptr %p , i64 0 , i64 1
0 commit comments