|  | 
|  | 1 | +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py | 
|  | 2 | +; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s | 
|  | 3 | + | 
|  | 4 | +; Test optimization of DUP (splat) of zero-extended narrow scalar loads. | 
|  | 5 | +; The value should be loaded directly into a SIMD&FP register (ldr b/h/s) so that no GPR->SIMD transfer is needed. | 
|  | 6 | + | 
|  |  | +; Splat of an i8 load zero-extended to i16 into all four lanes of <4 x i16>. | 
|  |  | +; The checks expect the byte to be loaded straight into b0 and broadcast from | 
|  |  | +; lane h[0], with no general-purpose register in between. | 
|  | 7 | +define <4 x i16> @test_dup_zextload_i8_v4i16(ptr %p) { | 
|  | 8 | +; CHECK-LABEL: test_dup_zextload_i8_v4i16: | 
|  | 9 | +; CHECK:       // %bb.0: | 
|  | 10 | +; CHECK-NEXT:    ldr b0, [x0] | 
|  | 11 | +; CHECK-NEXT:    dup v0.4h, v0.h[0] | 
|  | 12 | +; CHECK-NEXT:    ret | 
|  | 13 | +  %load = load i8, ptr %p, align 1 | 
|  | 14 | +  %ext = zext i8 %load to i16 | 
|  | 15 | +  %vec = insertelement <4 x i16> poison, i16 %ext, i32 0 | 
|  | 16 | +  %dup = shufflevector <4 x i16> %vec, <4 x i16> poison, <4 x i32> zeroinitializer | 
|  | 17 | +  ret <4 x i16> %dup | 
|  | 18 | +} | 
|  | 19 | + | 
|  |  | +; Splat of an i8 load zero-extended to i16 into all eight lanes of <8 x i16> | 
|  |  | +; (128-bit result). The checks expect ldr b0 followed by a DUP from lane h[0], | 
|  |  | +; with no general-purpose register in between. | 
|  | 20 | +define <8 x i16> @test_dup_zextload_i8_v8i16(ptr %p) { | 
|  | 21 | +; CHECK-LABEL: test_dup_zextload_i8_v8i16: | 
|  | 22 | +; CHECK:       // %bb.0: | 
|  | 23 | +; CHECK-NEXT:    ldr b0, [x0] | 
|  | 24 | +; CHECK-NEXT:    dup v0.8h, v0.h[0] | 
|  | 25 | +; CHECK-NEXT:    ret | 
|  | 26 | +  %load = load i8, ptr %p, align 1 | 
|  | 27 | +  %ext = zext i8 %load to i16 | 
|  | 28 | +  %vec = insertelement <8 x i16> poison, i16 %ext, i32 0 | 
|  | 29 | +  %dup = shufflevector <8 x i16> %vec, <8 x i16> poison, <8 x i32> zeroinitializer | 
|  | 30 | +  ret <8 x i16> %dup | 
|  | 31 | +} | 
|  | 32 | + | 
|  |  | +; Splat of an i8 load zero-extended to i32 into both lanes of <2 x i32>. | 
|  |  | +; The checks expect ldr b0 followed by a DUP from lane s[0], with no | 
|  |  | +; general-purpose register in between. | 
|  | 33 | +define <2 x i32> @test_dup_zextload_i8_v2i32(ptr %p) { | 
|  | 34 | +; CHECK-LABEL: test_dup_zextload_i8_v2i32: | 
|  | 35 | +; CHECK:       // %bb.0: | 
|  | 36 | +; CHECK-NEXT:    ldr b0, [x0] | 
|  | 37 | +; CHECK-NEXT:    dup v0.2s, v0.s[0] | 
|  | 38 | +; CHECK-NEXT:    ret | 
|  | 39 | +  %load = load i8, ptr %p, align 1 | 
|  | 40 | +  %ext = zext i8 %load to i32 | 
|  | 41 | +  %vec = insertelement <2 x i32> poison, i32 %ext, i32 0 | 
|  | 42 | +  %dup = shufflevector <2 x i32> %vec, <2 x i32> poison, <2 x i32> zeroinitializer | 
|  | 43 | +  ret <2 x i32> %dup | 
|  | 44 | +} | 
|  | 45 | + | 
|  |  | +; Splat of an i8 load zero-extended to i32 into all four lanes of <4 x i32> | 
|  |  | +; (128-bit result). The checks expect ldr b0 followed by a DUP from lane s[0], | 
|  |  | +; with no general-purpose register in between. | 
|  | 46 | +define <4 x i32> @test_dup_zextload_i8_v4i32(ptr %p) { | 
|  | 47 | +; CHECK-LABEL: test_dup_zextload_i8_v4i32: | 
|  | 48 | +; CHECK:       // %bb.0: | 
|  | 49 | +; CHECK-NEXT:    ldr b0, [x0] | 
|  | 50 | +; CHECK-NEXT:    dup v0.4s, v0.s[0] | 
|  | 51 | +; CHECK-NEXT:    ret | 
|  | 52 | +  %load = load i8, ptr %p, align 1 | 
|  | 53 | +  %ext = zext i8 %load to i32 | 
|  | 54 | +  %vec = insertelement <4 x i32> poison, i32 %ext, i32 0 | 
|  | 55 | +  %dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer | 
|  | 56 | +  ret <4 x i32> %dup | 
|  | 57 | +} | 
|  | 58 | + | 
|  |  | +; Splat of a zero-extended i8 load into <4 x i32>, where the byte is read from | 
|  |  | +; %p + 4. The checks expect the constant offset to be folded into the load's | 
|  |  | +; immediate addressing mode ([x0, #4]) rather than computed separately. | 
|  | 59 | +define <4 x i32> @test_dup_zextload_i8_v4i32_offset(ptr %p) { | 
|  | 60 | +; CHECK-LABEL: test_dup_zextload_i8_v4i32_offset: | 
|  | 61 | +; CHECK:       // %bb.0: | 
|  | 62 | +; CHECK-NEXT:    ldr b0, [x0, #4] | 
|  | 63 | +; CHECK-NEXT:    dup v0.4s, v0.s[0] | 
|  | 64 | +; CHECK-NEXT:    ret | 
|  | 65 | +  %addr = getelementptr inbounds i8, ptr %p, i64 4 | 
|  | 66 | +  %load = load i8, ptr %addr, align 1 | 
|  | 67 | +  %ext = zext i8 %load to i32 | 
|  | 68 | +  %vec = insertelement <4 x i32> poison, i32 %ext, i32 0 | 
|  | 69 | +  %dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer | 
|  | 70 | +  ret <4 x i32> %dup | 
|  | 71 | +} | 
|  | 72 | + | 
|  |  | +; Splat of a zero-extended i8 load into <4 x i32>, where the byte is read from | 
|  |  | +; %p + %offset (a runtime i64 byte offset). The checks expect the offset to be | 
|  |  | +; folded into the load's register-offset addressing mode ([x0, x1]). | 
|  | 73 | +define <4 x i32> @test_dup_zextload_i8_v4i32_reg_offset(ptr %p, i64 %offset) { | 
|  | 74 | +; CHECK-LABEL: test_dup_zextload_i8_v4i32_reg_offset: | 
|  | 75 | +; CHECK:       // %bb.0: | 
|  | 76 | +; CHECK-NEXT:    ldr b0, [x0, x1] | 
|  | 77 | +; CHECK-NEXT:    dup v0.4s, v0.s[0] | 
|  | 78 | +; CHECK-NEXT:    ret | 
|  | 79 | +  %addr = getelementptr inbounds i8, ptr %p, i64 %offset | 
|  | 80 | +  %load = load i8, ptr %addr, align 1 | 
|  | 81 | +  %ext = zext i8 %load to i32 | 
|  | 82 | +  %vec = insertelement <4 x i32> poison, i32 %ext, i32 0 | 
|  | 83 | +  %dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer | 
|  | 84 | +  ret <4 x i32> %dup | 
|  | 85 | +} | 
|  | 86 | + | 
|  |  | +; Splat of an i8 load zero-extended to i64 into both lanes of <2 x i64>. | 
|  |  | +; The checks expect ldr b0 followed by a DUP from lane d[0], with no | 
|  |  | +; general-purpose register in between. | 
|  | 87 | +define <2 x i64> @test_dup_zextload_i8_v2i64(ptr %p) { | 
|  | 88 | +; CHECK-LABEL: test_dup_zextload_i8_v2i64: | 
|  | 89 | +; CHECK:       // %bb.0: | 
|  | 90 | +; CHECK-NEXT:    ldr b0, [x0] | 
|  | 91 | +; CHECK-NEXT:    dup v0.2d, v0.d[0] | 
|  | 92 | +; CHECK-NEXT:    ret | 
|  | 93 | +  %load = load i8, ptr %p, align 1 | 
|  | 94 | +  %ext = zext i8 %load to i64 | 
|  | 95 | +  %vec = insertelement <2 x i64> poison, i64 %ext, i32 0 | 
|  | 96 | +  %dup = shufflevector <2 x i64> %vec, <2 x i64> poison, <2 x i32> zeroinitializer | 
|  | 97 | +  ret <2 x i64> %dup | 
|  | 98 | +} | 
|  | 99 | + | 
|  |  | +; Splat of an i16 load zero-extended to i32 into both lanes of <2 x i32>. | 
|  |  | +; The checks expect ldr h0 followed by a DUP from lane s[0], with no | 
|  |  | +; general-purpose register in between. | 
|  |  | +; NOTE(review): the i16 load is marked align 1 (under-aligned) - presumably | 
|  |  | +; deliberate; confirm. | 
|  | 100 | +define <2 x i32> @test_dup_zextload_i16_v2i32(ptr %p) { | 
|  | 101 | +; CHECK-LABEL: test_dup_zextload_i16_v2i32: | 
|  | 102 | +; CHECK:       // %bb.0: | 
|  | 103 | +; CHECK-NEXT:    ldr h0, [x0] | 
|  | 104 | +; CHECK-NEXT:    dup v0.2s, v0.s[0] | 
|  | 105 | +; CHECK-NEXT:    ret | 
|  | 106 | +  %load = load i16, ptr %p, align 1 | 
|  | 107 | +  %ext = zext i16 %load to i32 | 
|  | 108 | +  %vec = insertelement <2 x i32> poison, i32 %ext, i32 0 | 
|  | 109 | +  %dup = shufflevector <2 x i32> %vec, <2 x i32> poison, <2 x i32> zeroinitializer | 
|  | 110 | +  ret <2 x i32> %dup | 
|  | 111 | +} | 
|  | 112 | + | 
|  |  | +; Splat of an i16 load zero-extended to i32 into all four lanes of <4 x i32> | 
|  |  | +; (128-bit result). The checks expect ldr h0 followed by a DUP from lane s[0], | 
|  |  | +; with no general-purpose register in between. | 
|  |  | +; NOTE(review): the i16 load is marked align 1 (under-aligned) - presumably | 
|  |  | +; deliberate; confirm. | 
|  | 113 | +define <4 x i32> @test_dup_zextload_i16_v4i32(ptr %p) { | 
|  | 114 | +; CHECK-LABEL: test_dup_zextload_i16_v4i32: | 
|  | 115 | +; CHECK:       // %bb.0: | 
|  | 116 | +; CHECK-NEXT:    ldr h0, [x0] | 
|  | 117 | +; CHECK-NEXT:    dup v0.4s, v0.s[0] | 
|  | 118 | +; CHECK-NEXT:    ret | 
|  | 119 | +  %load = load i16, ptr %p, align 1 | 
|  | 120 | +  %ext = zext i16 %load to i32 | 
|  | 121 | +  %vec = insertelement <4 x i32> poison, i32 %ext, i32 0 | 
|  | 122 | +  %dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer | 
|  | 123 | +  ret <4 x i32> %dup | 
|  | 124 | +} | 
|  | 125 | + | 
|  |  | +; Splat of a zero-extended i16 load into <4 x i32>, where the halfword is read | 
|  |  | +; from i16 element index 4 of %p, i.e. byte offset 8. The checks expect that | 
|  |  | +; offset to be folded into the load's immediate addressing mode ([x0, #8]). | 
|  |  | +; NOTE(review): the i16 load is marked align 1 (under-aligned) - presumably | 
|  |  | +; deliberate; confirm. | 
|  | 126 | +define <4 x i32> @test_dup_zextload_i16_v4i32_offset(ptr %p) { | 
|  | 127 | +; CHECK-LABEL: test_dup_zextload_i16_v4i32_offset: | 
|  | 128 | +; CHECK:       // %bb.0: | 
|  | 129 | +; CHECK-NEXT:    ldr h0, [x0, #8] | 
|  | 130 | +; CHECK-NEXT:    dup v0.4s, v0.s[0] | 
|  | 131 | +; CHECK-NEXT:    ret | 
|  | 132 | +  %addr = getelementptr inbounds i16, ptr %p, i64 4 | 
|  | 133 | +  %load = load i16, ptr %addr, align 1 | 
|  | 134 | +  %ext = zext i16 %load to i32 | 
|  | 135 | +  %vec = insertelement <4 x i32> poison, i32 %ext, i32 0 | 
|  | 136 | +  %dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer | 
|  | 137 | +  ret <4 x i32> %dup | 
|  | 138 | +} | 
|  | 139 | + | 
|  |  | +; Splat of a zero-extended i16 load into <4 x i32>, where the halfword is read | 
|  |  | +; from i16 element index %offset of %p (a runtime index, scaled by 2 bytes). | 
|  |  | +; The checks expect the scaled index to be folded into the load's | 
|  |  | +; register-offset addressing mode ([x0, x1, lsl #1]). | 
|  |  | +; NOTE(review): the i16 load is marked align 1 (under-aligned) - presumably | 
|  |  | +; deliberate; confirm. | 
|  | 140 | +define <4 x i32> @test_dup_zextload_i16_v4i32_reg_offset(ptr %p, i64 %offset) { | 
|  | 141 | +; CHECK-LABEL: test_dup_zextload_i16_v4i32_reg_offset: | 
|  | 142 | +; CHECK:       // %bb.0: | 
|  | 143 | +; CHECK-NEXT:    ldr h0, [x0, x1, lsl #1] | 
|  | 144 | +; CHECK-NEXT:    dup v0.4s, v0.s[0] | 
|  | 145 | +; CHECK-NEXT:    ret | 
|  | 146 | +  %addr = getelementptr inbounds i16, ptr %p, i64 %offset | 
|  | 147 | +  %load = load i16, ptr %addr, align 1 | 
|  | 148 | +  %ext = zext i16 %load to i32 | 
|  | 149 | +  %vec = insertelement <4 x i32> poison, i32 %ext, i32 0 | 
|  | 150 | +  %dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer | 
|  | 151 | +  ret <4 x i32> %dup | 
|  | 152 | +} | 
|  | 153 | + | 
|  |  | +; Splat of an i16 load zero-extended to i64 into both lanes of <2 x i64>. | 
|  |  | +; The checks expect ldr h0 followed by a DUP from lane d[0], with no | 
|  |  | +; general-purpose register in between. | 
|  |  | +; NOTE(review): the i16 load is marked align 1 (under-aligned) - presumably | 
|  |  | +; deliberate; confirm. | 
|  | 154 | +define <2 x i64> @test_dup_zextload_i16_v2i64(ptr %p) { | 
|  | 155 | +; CHECK-LABEL: test_dup_zextload_i16_v2i64: | 
|  | 156 | +; CHECK:       // %bb.0: | 
|  | 157 | +; CHECK-NEXT:    ldr h0, [x0] | 
|  | 158 | +; CHECK-NEXT:    dup v0.2d, v0.d[0] | 
|  | 159 | +; CHECK-NEXT:    ret | 
|  | 160 | +  %load = load i16, ptr %p, align 1 | 
|  | 161 | +  %ext = zext i16 %load to i64 | 
|  | 162 | +  %vec = insertelement <2 x i64> poison, i64 %ext, i32 0 | 
|  | 163 | +  %dup = shufflevector <2 x i64> %vec, <2 x i64> poison, <2 x i32> zeroinitializer | 
|  | 164 | +  ret <2 x i64> %dup | 
|  | 165 | +} | 
|  | 166 | + | 
|  |  | +; Splat of an i32 load zero-extended to i64 into both lanes of <2 x i64>. | 
|  |  | +; The checks expect ldr s0 followed by a DUP from lane d[0], with no | 
|  |  | +; general-purpose register in between. | 
|  |  | +; NOTE(review): the i32 load is marked align 1 (under-aligned) - presumably | 
|  |  | +; deliberate; confirm. | 
|  | 167 | +define <2 x i64> @test_dup_zextload_i32_v2i64(ptr %p) { | 
|  | 168 | +; CHECK-LABEL: test_dup_zextload_i32_v2i64: | 
|  | 169 | +; CHECK:       // %bb.0: | 
|  | 170 | +; CHECK-NEXT:    ldr s0, [x0] | 
|  | 171 | +; CHECK-NEXT:    dup v0.2d, v0.d[0] | 
|  | 172 | +; CHECK-NEXT:    ret | 
|  | 173 | +  %load = load i32, ptr %p, align 1 | 
|  | 174 | +  %ext = zext i32 %load to i64 | 
|  | 175 | +  %vec = insertelement <2 x i64> poison, i64 %ext, i32 0 | 
|  | 176 | +  %dup = shufflevector <2 x i64> %vec, <2 x i64> poison, <2 x i32> zeroinitializer | 
|  | 177 | +  ret <2 x i64> %dup | 
|  | 178 | +} | 
0 commit comments