; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s

; Test optimization of DUP with extended narrow loads
; This should avoid GPR->SIMD transfers by loading directly into vector registers
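;
; As a sketch of the intent (the baseline GPR lowering shown here is an
; assumption mirroring the sext negative test below; it is not an extra RUN
; test): broadcasting a zero-extended narrow load such as
;   %l = load i8, ptr %p
;   %e = zext i8 %l to i32
; would otherwise round-trip through a GPR, roughly:
;   ldrb w8, [x0]
;   dup  v0.4s, w8
; whereas loading straight into a SIMD register avoids the transfer, since
; ldr b0 zeroes the remaining bits of v0:
;   ldr  b0, [x0]
;   dup  v0.4s, v0.s[0]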

define <4 x i32> @test_dup_zextload_i8_v4i32(ptr %p) {
; CHECK-LABEL: test_dup_zextload_i8_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr b0, [x0]
; CHECK-NEXT:    dup v0.4s, v0.s[0]
; CHECK-NEXT:    ret
  %load = load i8, ptr %p, align 1
  %ext = zext i8 %load to i32
  %vec = insertelement <4 x i32> undef, i32 %ext, i32 0
  %dup = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> zeroinitializer
  ret <4 x i32> %dup
}

define <4 x i32> @test_dup_zextload_i16_v4i32(ptr %p) {
; CHECK-LABEL: test_dup_zextload_i16_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr h0, [x0]
; CHECK-NEXT:    dup v0.4s, v0.s[0]
; CHECK-NEXT:    ret
  %load = load i16, ptr %p, align 2
  %ext = zext i16 %load to i32
  %vec = insertelement <4 x i32> undef, i32 %ext, i32 0
  %dup = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> zeroinitializer
  ret <4 x i32> %dup
}

define <2 x i32> @test_dup_zextload_i8_v2i32(ptr %p) {
; CHECK-LABEL: test_dup_zextload_i8_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr b0, [x0]
; CHECK-NEXT:    dup v0.2s, v0.s[0]
; CHECK-NEXT:    ret
  %load = load i8, ptr %p, align 1
  %ext = zext i8 %load to i32
  %vec = insertelement <2 x i32> undef, i32 %ext, i32 0
  %dup = shufflevector <2 x i32> %vec, <2 x i32> undef, <2 x i32> zeroinitializer
  ret <2 x i32> %dup
}

define <2 x i32> @test_dup_zextload_i16_v2i32(ptr %p) {
; CHECK-LABEL: test_dup_zextload_i16_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr h0, [x0]
; CHECK-NEXT:    dup v0.2s, v0.s[0]
; CHECK-NEXT:    ret
  %load = load i16, ptr %p, align 2
  %ext = zext i16 %load to i32
  %vec = insertelement <2 x i32> undef, i32 %ext, i32 0
  %dup = shufflevector <2 x i32> %vec, <2 x i32> undef, <2 x i32> zeroinitializer
  ret <2 x i32> %dup
}

define <8 x i16> @test_dup_zextload_i8_v8i16(ptr %p) {
; CHECK-LABEL: test_dup_zextload_i8_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr b0, [x0]
; CHECK-NEXT:    dup v0.8h, v0.h[0]
; CHECK-NEXT:    ret
  %load = load i8, ptr %p, align 1
  %ext = zext i8 %load to i16
  %vec = insertelement <8 x i16> undef, i16 %ext, i32 0
  %dup = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> zeroinitializer
  ret <8 x i16> %dup
}

define <4 x i16> @test_dup_zextload_i8_v4i16(ptr %p) {
; CHECK-LABEL: test_dup_zextload_i8_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr b0, [x0]
; CHECK-NEXT:    dup v0.4h, v0.h[0]
; CHECK-NEXT:    ret
  %load = load i8, ptr %p, align 1
  %ext = zext i8 %load to i16
  %vec = insertelement <4 x i16> undef, i16 %ext, i32 0
  %dup = shufflevector <4 x i16> %vec, <4 x i16> undef, <4 x i32> zeroinitializer
  ret <4 x i16> %dup
}

; Test with immediate offset addressing
define <4 x i32> @test_dup_zextload_i8_v4i32_offset(ptr %p) {
; CHECK-LABEL: test_dup_zextload_i8_v4i32_offset:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr b0, [x0, #4]
; CHECK-NEXT:    dup v0.4s, v0.s[0]
; CHECK-NEXT:    ret
  %addr = getelementptr inbounds i8, ptr %p, i64 4
  %load = load i8, ptr %addr, align 1
  %ext = zext i8 %load to i32
  %vec = insertelement <4 x i32> undef, i32 %ext, i32 0
  %dup = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> zeroinitializer
  ret <4 x i32> %dup
}

define <4 x i32> @test_dup_zextload_i16_v4i32_offset(ptr %p) {
; CHECK-LABEL: test_dup_zextload_i16_v4i32_offset:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr h0, [x0, #8]
; CHECK-NEXT:    dup v0.4s, v0.s[0]
; CHECK-NEXT:    ret
  %addr = getelementptr inbounds i16, ptr %p, i64 4
  %load = load i16, ptr %addr, align 2
  %ext = zext i16 %load to i32
  %vec = insertelement <4 x i32> undef, i32 %ext, i32 0
  %dup = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> zeroinitializer
  ret <4 x i32> %dup
}

; Test with register offset addressing
define <4 x i32> @test_dup_zextload_i8_v4i32_reg_offset(ptr %p, i64 %offset) {
; CHECK-LABEL: test_dup_zextload_i8_v4i32_reg_offset:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr b0, [x0, x1]
; CHECK-NEXT:    dup v0.4s, v0.s[0]
; CHECK-NEXT:    ret
  %addr = getelementptr inbounds i8, ptr %p, i64 %offset
  %load = load i8, ptr %addr, align 1
  %ext = zext i8 %load to i32
  %vec = insertelement <4 x i32> undef, i32 %ext, i32 0
  %dup = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> zeroinitializer
  ret <4 x i32> %dup
}

define <4 x i32> @test_dup_zextload_i16_v4i32_reg_offset(ptr %p, i64 %offset) {
; CHECK-LABEL: test_dup_zextload_i16_v4i32_reg_offset:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr h0, [x0, x1, lsl #1]
; CHECK-NEXT:    dup v0.4s, v0.s[0]
; CHECK-NEXT:    ret
  %addr = getelementptr inbounds i16, ptr %p, i64 %offset
  %load = load i16, ptr %addr, align 2
  %ext = zext i16 %load to i32
  %vec = insertelement <4 x i32> undef, i32 %ext, i32 0
  %dup = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> zeroinitializer
  ret <4 x i32> %dup
}

; Negative test: sign-extended loads should not use this optimization, since
; there is no sign-extending SIMD&FP load and the value must go through a GPR
define <4 x i32> @test_dup_sextload_i8_v4i32(ptr %p) {
; CHECK-LABEL: test_dup_sextload_i8_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldrsb w8, [x0]
; CHECK-NEXT:    dup v0.4s, w8
; CHECK-NEXT:    ret
  %load = load i8, ptr %p, align 1
  %ext = sext i8 %load to i32
  %vec = insertelement <4 x i32> undef, i32 %ext, i32 0
  %dup = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> zeroinitializer
  ret <4 x i32> %dup
}

; Negative test: i32 loads don't need this optimization; ld1r already
; broadcasts a full-width load directly from memory
define <4 x i32> @test_dup_load_i32_v4i32(ptr %p) {
; CHECK-LABEL: test_dup_load_i32_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1r { v0.4s }, [x0]
; CHECK-NEXT:    ret
  %load = load i32, ptr %p, align 4
  %vec = insertelement <4 x i32> undef, i32 %load, i32 0
  %dup = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> zeroinitializer
  ret <4 x i32> %dup
}

; Test that truncate(dup(zextload)) doesn't generate unnecessary XTN
define <8 x i8> @test_truncate_dup_zextload_i8_v8i8(ptr %p) {
; CHECK-LABEL: test_truncate_dup_zextload_i8_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1r { v0.8b }, [x0]
; CHECK-NEXT:    ret
  %load = load i8, ptr %p, align 1
  %ext = zext i8 %load to i16
  %vec = insertelement <8 x i16> undef, i16 %ext, i32 0
  %dup = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> zeroinitializer
  %trunc = trunc <8 x i16> %dup to <8 x i8>
  ret <8 x i8> %trunc
}

; Test with i32 to i8 truncation
define <8 x i8> @test_truncate_dup_zextload_i8_from_i32_v8i8(ptr %p) {
; CHECK-LABEL: test_truncate_dup_zextload_i8_from_i32_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1r { v0.8b }, [x0]
; CHECK-NEXT:    ret
  %load = load i8, ptr %p, align 1
  %ext = zext i8 %load to i32
  %vec = insertelement <4 x i32> undef, i32 %ext, i32 0
  %dup = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> zeroinitializer
  %trunc = trunc <4 x i32> %dup to <4 x i8>
  ; Widen to v8i8 to match the test output
  %result = shufflevector <4 x i8> %trunc, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x i8> %result
}

; Test with i16 load truncated to i8
define <8 x i8> @test_truncate_dup_zextload_i16_to_i8_v8i8(ptr %p) {
; CHECK-LABEL: test_truncate_dup_zextload_i16_to_i8_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1r { v0.8b }, [x0]
; CHECK-NEXT:    ret
  %load = load i16, ptr %p, align 2
  %ext = zext i16 %load to i32
  %vec = insertelement <4 x i32> undef, i32 %ext, i32 0
  %dup = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> zeroinitializer
  %trunc = trunc <4 x i32> %dup to <4 x i8>
  ; Widen to v8i8 to match the test output
  %result = shufflevector <4 x i8> %trunc, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x i8> %result
}

; Test generalized truncate(dup(scalar_to_vector)) for the non-load case
define <8 x i8> @test_truncate_dup_scalar_i32_to_i8_v8i8(i32 %val) {
; CHECK-LABEL: test_truncate_dup_scalar_i32_to_i8_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    dup v0.8b, w0
; CHECK-NEXT:    ret
  %vec = insertelement <4 x i32> undef, i32 %val, i32 0
  %dup = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> zeroinitializer
  %trunc = trunc <4 x i32> %dup to <4 x i8>
  ; Widen to v8i8 to match the test output
  %result = shufflevector <4 x i8> %trunc, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x i8> %result
}

; Test generalized truncate(dup(scalar_to_vector)) from i16 to i8
define <8 x i8> @test_truncate_dup_scalar_i16_to_i8_v8i8(i16 %val) {
; CHECK-LABEL: test_truncate_dup_scalar_i16_to_i8_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    dup v0.8b, w0
; CHECK-NEXT:    ret
  %vec = insertelement <8 x i16> undef, i16 %val, i32 0
  %dup = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> zeroinitializer
  %trunc = trunc <8 x i16> %dup to <8 x i8>
  ret <8 x i8> %trunc
}