@@ -246,7 +246,7 @@ define <4 x float> @vtrnQf(ptr %A, ptr %B) nounwind {
246246 ret <4 x float > %tmp5
247247}
248248
249- ; Undef shuffle indices should not prevent matching to VTRN:
249+ ; Undef shuffle indices (even at the start of the shuffle mask) should not prevent matching to VTRN:
250250
251251define <8 x i8 > @vtrni8_undef (ptr %A , ptr %B ) nounwind {
252252; CHECKLE-LABEL: vtrni8_undef:
@@ -302,3 +302,115 @@ define <8 x i16> @vtrnQi16_undef(ptr %A, ptr %B) nounwind {
302302 %tmp5 = add <8 x i16 > %tmp3 , %tmp4
303303 ret <8 x i16 > %tmp5
304304}
305+
; Test vtrnQi16_undef_01: loads two <8 x i16> vectors, shuffles them twice and
; returns the sum of the two shuffles.
; Lanes 0 and 1 are undef in both masks (the first mask is also undef in lane 3,
; the second in lanes 4, 6 and 7). Every defined index agrees with the TRN1
; pattern <0,8,2,10,4,12,6,14> (first shuffle) or the TRN2 pattern
; <1,9,3,11,5,13,7,15> (second shuffle); the CHECK lines expect a trn1/trn2
; pair on .8h under both prefixes.
; The CHECKBE lines additionally expect rev64 + ext on the result before ret.
; NOTE(review): CHECKLE/CHECKBE are presumably the little-/big-endian runs --
; the RUN lines are outside this view, confirm there.
306+ define <8 x i16 > @vtrnQi16_undef_01 (ptr %A , ptr %B ) nounwind {
307+ ; CHECKLE-LABEL: vtrnQi16_undef_01:
308+ ; CHECKLE: // %bb.0:
309+ ; CHECKLE-NEXT: ldr q0, [x0]
310+ ; CHECKLE-NEXT: ldr q1, [x1]
311+ ; CHECKLE-NEXT: trn1 v2.8h, v0.8h, v1.8h
312+ ; CHECKLE-NEXT: trn2 v0.8h, v0.8h, v1.8h
313+ ; CHECKLE-NEXT: add v0.8h, v2.8h, v0.8h
314+ ; CHECKLE-NEXT: ret
315+ ;
316+ ; CHECKBE-LABEL: vtrnQi16_undef_01:
317+ ; CHECKBE: // %bb.0:
318+ ; CHECKBE-NEXT: ld1 { v0.8h }, [x0]
319+ ; CHECKBE-NEXT: ld1 { v1.8h }, [x1]
320+ ; CHECKBE-NEXT: trn1 v2.8h, v0.8h, v1.8h
321+ ; CHECKBE-NEXT: trn2 v0.8h, v0.8h, v1.8h
322+ ; CHECKBE-NEXT: add v0.8h, v2.8h, v0.8h
323+ ; CHECKBE-NEXT: rev64 v0.8h, v0.8h
324+ ; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
325+ ; CHECKBE-NEXT: ret
326+ %tmp1 = load <8 x i16 >, ptr %A
327+ %tmp2 = load <8 x i16 >, ptr %B
; %tmp3 is the TRN1-shaped mask, %tmp4 the TRN2-shaped mask; indices >= 8 select from %tmp2.
328+ %tmp3 = shufflevector <8 x i16 > %tmp1 , <8 x i16 > %tmp2 , <8 x i32 > <i32 undef , i32 undef , i32 2 , i32 undef , i32 4 , i32 12 , i32 6 , i32 14 >
329+ %tmp4 = shufflevector <8 x i16 > %tmp1 , <8 x i16 > %tmp2 , <8 x i32 > <i32 undef , i32 undef , i32 3 , i32 11 , i32 undef , i32 13 , i32 undef , i32 undef >
330+ %tmp5 = add <8 x i16 > %tmp3 , %tmp4
331+ ret <8 x i16 > %tmp5
332+ }
333+
; Test vtrnQi16_undef_0: loads two <8 x i16> vectors, shuffles them twice and
; returns the sum of the two shuffles.
; Lane 0 is undef in both masks while lane 1 is defined (8 and 9 respectively);
; the first mask is also undef in lanes 2 and 3, the second in lanes 6 and 7.
; Every defined index agrees with the TRN1 pattern <0,8,2,10,4,12,6,14> (first
; shuffle) or the TRN2 pattern <1,9,3,11,5,13,7,15> (second shuffle); the CHECK
; lines expect a trn1/trn2 pair on .8h under both prefixes.
; The CHECKBE lines additionally expect rev64 + ext on the result before ret.
; NOTE(review): CHECKLE/CHECKBE are presumably the little-/big-endian runs --
; the RUN lines are outside this view, confirm there.
334+ define <8 x i16 > @vtrnQi16_undef_0 (ptr %A , ptr %B ) nounwind {
335+ ; CHECKLE-LABEL: vtrnQi16_undef_0:
336+ ; CHECKLE: // %bb.0:
337+ ; CHECKLE-NEXT: ldr q0, [x0]
338+ ; CHECKLE-NEXT: ldr q1, [x1]
339+ ; CHECKLE-NEXT: trn1 v2.8h, v0.8h, v1.8h
340+ ; CHECKLE-NEXT: trn2 v0.8h, v0.8h, v1.8h
341+ ; CHECKLE-NEXT: add v0.8h, v2.8h, v0.8h
342+ ; CHECKLE-NEXT: ret
343+ ;
344+ ; CHECKBE-LABEL: vtrnQi16_undef_0:
345+ ; CHECKBE: // %bb.0:
346+ ; CHECKBE-NEXT: ld1 { v0.8h }, [x0]
347+ ; CHECKBE-NEXT: ld1 { v1.8h }, [x1]
348+ ; CHECKBE-NEXT: trn1 v2.8h, v0.8h, v1.8h
349+ ; CHECKBE-NEXT: trn2 v0.8h, v0.8h, v1.8h
350+ ; CHECKBE-NEXT: add v0.8h, v2.8h, v0.8h
351+ ; CHECKBE-NEXT: rev64 v0.8h, v0.8h
352+ ; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
353+ ; CHECKBE-NEXT: ret
354+ %tmp1 = load <8 x i16 >, ptr %A
355+ %tmp2 = load <8 x i16 >, ptr %B
; %tmp3 is the TRN1-shaped mask, %tmp4 the TRN2-shaped mask; indices >= 8 select from %tmp2.
356+ %tmp3 = shufflevector <8 x i16 > %tmp1 , <8 x i16 > %tmp2 , <8 x i32 > <i32 undef , i32 8 , i32 undef , i32 undef , i32 4 , i32 12 , i32 6 , i32 14 >
357+ %tmp4 = shufflevector <8 x i16 > %tmp1 , <8 x i16 > %tmp2 , <8 x i32 > <i32 undef , i32 9 , i32 3 , i32 11 , i32 5 , i32 13 , i32 undef , i32 undef >
358+ %tmp5 = add <8 x i16 > %tmp3 , %tmp4
359+ ret <8 x i16 > %tmp5
360+ }
361+
; Test vtrnQi32_undef_1: loads two <4 x i32> vectors, shuffles them twice and
; returns the sum of the two shuffles.
; Lane 0 is defined and only lane 1 is undef in each mask. The defined indices
; agree with the TRN1 pattern <0,4,2,6> (first shuffle) and the TRN2 pattern
; <1,5,3,7> (second shuffle); the CHECK lines expect a trn1/trn2 pair on .4s
; under both prefixes.
; The CHECKBE lines additionally expect rev64 + ext on the result before ret.
; NOTE(review): CHECKLE/CHECKBE are presumably the little-/big-endian runs --
; the RUN lines are outside this view, confirm there.
362+ define <4 x i32 > @vtrnQi32_undef_1 (ptr %A , ptr %B ) nounwind {
363+ ; CHECKLE-LABEL: vtrnQi32_undef_1:
364+ ; CHECKLE: // %bb.0:
365+ ; CHECKLE-NEXT: ldr q0, [x0]
366+ ; CHECKLE-NEXT: ldr q1, [x1]
367+ ; CHECKLE-NEXT: trn1 v2.4s, v0.4s, v1.4s
368+ ; CHECKLE-NEXT: trn2 v0.4s, v0.4s, v1.4s
369+ ; CHECKLE-NEXT: add v0.4s, v2.4s, v0.4s
370+ ; CHECKLE-NEXT: ret
371+ ;
372+ ; CHECKBE-LABEL: vtrnQi32_undef_1:
373+ ; CHECKBE: // %bb.0:
374+ ; CHECKBE-NEXT: ld1 { v0.4s }, [x0]
375+ ; CHECKBE-NEXT: ld1 { v1.4s }, [x1]
376+ ; CHECKBE-NEXT: trn1 v2.4s, v0.4s, v1.4s
377+ ; CHECKBE-NEXT: trn2 v0.4s, v0.4s, v1.4s
378+ ; CHECKBE-NEXT: add v0.4s, v2.4s, v0.4s
379+ ; CHECKBE-NEXT: rev64 v0.4s, v0.4s
380+ ; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
381+ ; CHECKBE-NEXT: ret
382+ %tmp1 = load <4 x i32 >, ptr %A
383+ %tmp2 = load <4 x i32 >, ptr %B
; %tmp3 is the TRN1-shaped mask, %tmp4 the TRN2-shaped mask; indices >= 4 select from %tmp2.
384+ %tmp3 = shufflevector <4 x i32 > %tmp1 , <4 x i32 > %tmp2 , <4 x i32 > <i32 0 , i32 undef , i32 2 , i32 6 >
385+ %tmp4 = shufflevector <4 x i32 > %tmp1 , <4 x i32 > %tmp2 , <4 x i32 > <i32 1 , i32 undef , i32 3 , i32 7 >
386+ %tmp5 = add <4 x i32 > %tmp3 , %tmp4
387+ ret <4 x i32 > %tmp5
388+ }
389+
; Test vtrnQi8_undef_012: loads two <16 x i8> vectors, shuffles them twice and
; returns the sum of the two shuffles.
; Lanes 0, 1 and 2 are undef in both masks (the first mask is also undef in
; lanes 5 and 8, the second in lanes 7 and 12). Every defined index agrees with
; the TRN1 pattern <0,16,2,18,...,14,30> (first shuffle) or the TRN2 pattern
; <1,17,3,19,...,15,31> (second shuffle); the CHECK lines expect a trn1/trn2
; pair on .16b under both prefixes.
; The CHECKBE lines additionally expect rev64 + ext on the result before ret.
; NOTE(review): CHECKLE/CHECKBE are presumably the little-/big-endian runs --
; the RUN lines are outside this view, confirm there.
390+ define <16 x i8 > @vtrnQi8_undef_012 (ptr %A , ptr %B ) nounwind {
391+ ; CHECKLE-LABEL: vtrnQi8_undef_012:
392+ ; CHECKLE: // %bb.0:
393+ ; CHECKLE-NEXT: ldr q0, [x0]
394+ ; CHECKLE-NEXT: ldr q1, [x1]
395+ ; CHECKLE-NEXT: trn1 v2.16b, v0.16b, v1.16b
396+ ; CHECKLE-NEXT: trn2 v0.16b, v0.16b, v1.16b
397+ ; CHECKLE-NEXT: add v0.16b, v2.16b, v0.16b
398+ ; CHECKLE-NEXT: ret
399+ ;
400+ ; CHECKBE-LABEL: vtrnQi8_undef_012:
401+ ; CHECKBE: // %bb.0:
402+ ; CHECKBE-NEXT: ld1 { v0.16b }, [x0]
403+ ; CHECKBE-NEXT: ld1 { v1.16b }, [x1]
404+ ; CHECKBE-NEXT: trn1 v2.16b, v0.16b, v1.16b
405+ ; CHECKBE-NEXT: trn2 v0.16b, v0.16b, v1.16b
406+ ; CHECKBE-NEXT: add v0.16b, v2.16b, v0.16b
407+ ; CHECKBE-NEXT: rev64 v0.16b, v0.16b
408+ ; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
409+ ; CHECKBE-NEXT: ret
410+ %tmp1 = load <16 x i8 >, ptr %A
411+ %tmp2 = load <16 x i8 >, ptr %B
; %tmp3 is the TRN1-shaped mask, %tmp4 the TRN2-shaped mask; indices >= 16 select from %tmp2.
412+ %tmp3 = shufflevector <16 x i8 > %tmp1 , <16 x i8 > %tmp2 , <16 x i32 > <i32 undef , i32 undef , i32 undef , i32 18 , i32 4 , i32 undef , i32 6 , i32 22 , i32 undef , i32 24 , i32 10 , i32 26 , i32 12 , i32 28 , i32 14 , i32 30 >
413+ %tmp4 = shufflevector <16 x i8 > %tmp1 , <16 x i8 > %tmp2 , <16 x i32 > <i32 undef , i32 undef , i32 undef , i32 19 , i32 5 , i32 21 , i32 7 , i32 undef , i32 9 , i32 25 , i32 11 , i32 27 , i32 undef , i32 29 , i32 15 , i32 31 >
414+ %tmp5 = add <16 x i8 > %tmp3 , %tmp4
415+ ret <16 x i8 > %tmp5
416+ }
0 commit comments