| 
 | 1 | +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5  | 
1 | 2 | ; RUN: opt < %s -S -mtriple=aarch64-none-linux-gnu -mattr=+neon -passes=early-cse -earlycse-debug-hash | FileCheck %s  | 
2 | 3 | ; RUN: opt < %s -S -mtriple=aarch64-none-linux-gnu -mattr=+neon -aa-pipeline=basic-aa -passes='early-cse<memssa>' | FileCheck %s  | 
3 | 4 | 
 
  | 
4 | 5 | define <4 x i32> @test_cse(ptr %a, [2 x <4 x i32>] %s.coerce, i32 %n) {  | 
 | 6 | +; CHECK-LABEL: define <4 x i32> @test_cse(  | 
 | 7 | +; CHECK-SAME: ptr [[A:%.*]], [2 x <4 x i32>] [[S_COERCE:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {  | 
 | 8 | +; CHECK-NEXT:  [[ENTRY:.*]]:  | 
 | 9 | +; CHECK-NEXT:    [[S_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[S_COERCE]], 0  | 
 | 10 | +; CHECK-NEXT:    [[S_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[S_COERCE]], 1  | 
 | 11 | +; CHECK-NEXT:    br label %[[FOR_COND:.*]]  | 
 | 12 | +; CHECK:       [[FOR_COND]]:  | 
 | 13 | +; CHECK-NEXT:    [[I_0:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY:.*]] ]  | 
 | 14 | +; CHECK-NEXT:    [[RES_0:%.*]] = phi <4 x i32> [ undef, %[[ENTRY]] ], [ [[CALL:%.*]], %[[FOR_BODY]] ]  | 
 | 15 | +; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_0]], [[N]]  | 
 | 16 | +; CHECK-NEXT:    br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_END:.*]]  | 
 | 17 | +; CHECK:       [[FOR_BODY]]:  | 
 | 18 | +; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[S_COERCE_FCA_0_EXTRACT]] to <16 x i8>  | 
 | 19 | +; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[S_COERCE_FCA_1_EXTRACT]] to <16 x i8>  | 
 | 20 | +; CHECK-NEXT:    [[TMP2:%.*]] = insertvalue { <4 x i32>, <4 x i32> } poison, <4 x i32> [[S_COERCE_FCA_0_EXTRACT]], 0  | 
 | 21 | +; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { <4 x i32>, <4 x i32> } [[TMP2]], <4 x i32> [[S_COERCE_FCA_1_EXTRACT]], 1  | 
 | 22 | +; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[S_COERCE_FCA_0_EXTRACT]], <4 x i32> [[S_COERCE_FCA_1_EXTRACT]], ptr [[A]])  | 
 | 23 | +; CHECK-NEXT:    [[CALL]] = call <4 x i32> @vaddq_s32(<4 x i32> [[S_COERCE_FCA_0_EXTRACT]], <4 x i32> [[S_COERCE_FCA_0_EXTRACT]])  | 
 | 24 | +; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_0]], 1  | 
 | 25 | +; CHECK-NEXT:    br label %[[FOR_COND]]  | 
 | 26 | +; CHECK:       [[FOR_END]]:  | 
 | 27 | +; CHECK-NEXT:    ret <4 x i32> [[RES_0]]  | 
 | 28 | +;  | 
5 | 29 | entry:  | 
6 | 30 | ; Check that @llvm.aarch64.neon.ld2 is optimized away by Early CSE.  | 
7 |  | -; CHECK-LABEL: @test_cse  | 
8 |  | -; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0  | 
9 | 31 |   %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0  | 
10 | 32 |   %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1  | 
11 | 33 |   br label %for.cond  | 
@@ -34,11 +56,32 @@ for.end:                                          ; preds = %for.cond  | 
34 | 56 | }  | 
35 | 57 | 
 
  | 
36 | 58 | define <4 x i32> @test_cse2(ptr %a, [2 x <4 x i32>] %s.coerce, i32 %n) {  | 
 | 59 | +; CHECK-LABEL: define <4 x i32> @test_cse2(  | 
 | 60 | +; CHECK-SAME: ptr [[A:%.*]], [2 x <4 x i32>] [[S_COERCE:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {  | 
 | 61 | +; CHECK-NEXT:  [[ENTRY:.*]]:  | 
 | 62 | +; CHECK-NEXT:    [[S_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[S_COERCE]], 0  | 
 | 63 | +; CHECK-NEXT:    [[S_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[S_COERCE]], 1  | 
 | 64 | +; CHECK-NEXT:    br label %[[FOR_COND:.*]]  | 
 | 65 | +; CHECK:       [[FOR_COND]]:  | 
 | 66 | +; CHECK-NEXT:    [[I_0:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY:.*]] ]  | 
 | 67 | +; CHECK-NEXT:    [[RES_0:%.*]] = phi <4 x i32> [ undef, %[[ENTRY]] ], [ [[CALL:%.*]], %[[FOR_BODY]] ]  | 
 | 68 | +; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_0]], [[N]]  | 
 | 69 | +; CHECK-NEXT:    br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_END:.*]]  | 
 | 70 | +; CHECK:       [[FOR_BODY]]:  | 
 | 71 | +; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[S_COERCE_FCA_0_EXTRACT]] to <16 x i8>  | 
 | 72 | +; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[S_COERCE_FCA_1_EXTRACT]] to <16 x i8>  | 
 | 73 | +; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[S_COERCE_FCA_0_EXTRACT]], <4 x i32> [[S_COERCE_FCA_0_EXTRACT]], ptr [[A]])  | 
 | 74 | +; CHECK-NEXT:    [[TMP2:%.*]] = insertvalue { <4 x i32>, <4 x i32> } poison, <4 x i32> [[S_COERCE_FCA_0_EXTRACT]], 0  | 
 | 75 | +; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue { <4 x i32>, <4 x i32> } [[TMP2]], <4 x i32> [[S_COERCE_FCA_1_EXTRACT]], 1  | 
 | 76 | +; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[S_COERCE_FCA_0_EXTRACT]], <4 x i32> [[S_COERCE_FCA_1_EXTRACT]], ptr [[A]])  | 
 | 77 | +; CHECK-NEXT:    [[CALL]] = call <4 x i32> @vaddq_s32(<4 x i32> [[S_COERCE_FCA_0_EXTRACT]], <4 x i32> [[S_COERCE_FCA_0_EXTRACT]])  | 
 | 78 | +; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_0]], 1  | 
 | 79 | +; CHECK-NEXT:    br label %[[FOR_COND]]  | 
 | 80 | +; CHECK:       [[FOR_END]]:  | 
 | 81 | +; CHECK-NEXT:    ret <4 x i32> [[RES_0]]  | 
 | 82 | +;  | 
37 | 83 | entry:  | 
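38 | 84 | ; Intended to check that the first @llvm.aarch64.neon.st2 is optimized away by Early CSE. NOTE: the autogenerated CHECK lines above still expect both st2 calls in for.body, so the first st2 is currently not removed.  | 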
39 |  | -; CHECK-LABEL: @test_cse2  | 
40 |  | -; CHECK-NOT: call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> %3, <4 x i32> %3, ptr %0)  | 
41 |  | -; CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> %s.coerce.fca.0.extract, <4 x i32> %s.coerce.fca.1.extract, ptr %a)  | 
42 | 85 |   %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0  | 
43 | 86 |   %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1  | 
44 | 87 |   br label %for.cond  | 
@@ -68,11 +111,26 @@ for.end:                                          ; preds = %for.cond  | 
68 | 111 | }  | 
69 | 112 | 
 
  | 
70 | 113 | define <4 x i32> @test_cse3(ptr %a, [2 x <4 x i32>] %s.coerce, i32 %n) #0 {  | 
 | 114 | +; CHECK-LABEL: define <4 x i32> @test_cse3(  | 
 | 115 | +; CHECK-SAME: ptr [[A:%.*]], [2 x <4 x i32>] [[S_COERCE:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {  | 
 | 116 | +; CHECK-NEXT:  [[ENTRY:.*]]:  | 
 | 117 | +; CHECK-NEXT:    br label %[[FOR_COND:.*]]  | 
 | 118 | +; CHECK:       [[FOR_COND]]:  | 
 | 119 | +; CHECK-NEXT:    [[I_0:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY:.*]] ]  | 
 | 120 | +; CHECK-NEXT:    [[RES_0:%.*]] = phi <4 x i32> [ undef, %[[ENTRY]] ], [ [[CALL:%.*]], %[[FOR_BODY]] ]  | 
 | 121 | +; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_0]], [[N]]  | 
 | 122 | +; CHECK-NEXT:    br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_END:.*]]  | 
 | 123 | +; CHECK:       [[FOR_BODY]]:  | 
 | 124 | +; CHECK-NEXT:    [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr [[A]])  | 
 | 125 | +; CHECK-NEXT:    [[VLD2_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLD2]], 0  | 
 | 126 | +; CHECK-NEXT:    [[CALL]] = call <4 x i32> @vaddq_s32(<4 x i32> [[VLD2_FCA_0_EXTRACT]], <4 x i32> [[VLD2_FCA_0_EXTRACT]])  | 
 | 127 | +; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_0]], 1  | 
 | 128 | +; CHECK-NEXT:    br label %[[FOR_COND]]  | 
 | 129 | +; CHECK:       [[FOR_END]]:  | 
 | 130 | +; CHECK-NEXT:    ret <4 x i32> [[RES_0]]  | 
 | 131 | +;  | 
71 | 132 | entry:  | 
72 | 133 | ; Check that the duplicate @llvm.aarch64.neon.ld2 is optimized away by Early CSE, leaving a single ld2 call in for.body.  | 
73 |  | -; CHECK-LABEL: @test_cse3  | 
74 |  | -; CHECK: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0  | 
75 |  | -; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0  | 
76 | 134 |   %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0  | 
77 | 135 |   %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1  | 
78 | 136 |   br label %for.cond  | 
@@ -100,11 +158,33 @@ for.end:                                          ; preds = %for.cond  | 
100 | 158 | 
 
  | 
101 | 159 | 
 
  | 
102 | 160 | define <4 x i32> @test_nocse(ptr %a, ptr %b, [2 x <4 x i32>] %s.coerce, i32 %n) {  | 
 | 161 | +; CHECK-LABEL: define <4 x i32> @test_nocse(  | 
 | 162 | +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], [2 x <4 x i32>] [[S_COERCE:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {  | 
 | 163 | +; CHECK-NEXT:  [[ENTRY:.*]]:  | 
 | 164 | +; CHECK-NEXT:    [[S_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[S_COERCE]], 0  | 
 | 165 | +; CHECK-NEXT:    [[S_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[S_COERCE]], 1  | 
 | 166 | +; CHECK-NEXT:    br label %[[FOR_COND:.*]]  | 
 | 167 | +; CHECK:       [[FOR_COND]]:  | 
 | 168 | +; CHECK-NEXT:    [[I_0:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY:.*]] ]  | 
 | 169 | +; CHECK-NEXT:    [[RES_0:%.*]] = phi <4 x i32> [ undef, %[[ENTRY]] ], [ [[CALL:%.*]], %[[FOR_BODY]] ]  | 
 | 170 | +; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_0]], [[N]]  | 
 | 171 | +; CHECK-NEXT:    br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_END:.*]]  | 
 | 172 | +; CHECK:       [[FOR_BODY]]:  | 
 | 173 | +; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[S_COERCE_FCA_0_EXTRACT]] to <16 x i8>  | 
 | 174 | +; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[S_COERCE_FCA_1_EXTRACT]] to <16 x i8>  | 
 | 175 | +; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[S_COERCE_FCA_0_EXTRACT]], <4 x i32> [[S_COERCE_FCA_1_EXTRACT]], ptr [[A]])  | 
 | 176 | +; CHECK-NEXT:    store i32 0, ptr [[B]], align 4  | 
 | 177 | +; CHECK-NEXT:    [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr [[A]])  | 
 | 178 | +; CHECK-NEXT:    [[VLD2_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLD2]], 0  | 
 | 179 | +; CHECK-NEXT:    [[CALL]] = call <4 x i32> @vaddq_s32(<4 x i32> [[VLD2_FCA_0_EXTRACT]], <4 x i32> [[VLD2_FCA_0_EXTRACT]])  | 
 | 180 | +; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_0]], 1  | 
 | 181 | +; CHECK-NEXT:    br label %[[FOR_COND]]  | 
 | 182 | +; CHECK:       [[FOR_END]]:  | 
 | 183 | +; CHECK-NEXT:    ret <4 x i32> [[RES_0]]  | 
 | 184 | +;  | 
103 | 185 | entry:  | 
104 | 186 | ; Check that the store prevents @llvm.aarch64.neon.ld2 from being optimized  | 
105 | 187 | ; away by Early CSE.  | 
106 |  | -; CHECK-LABEL: @test_nocse  | 
107 |  | -; CHECK: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0  | 
108 | 188 |   %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0  | 
109 | 189 |   %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1  | 
110 | 190 |   br label %for.cond  | 
@@ -134,11 +214,33 @@ for.end:                                          ; preds = %for.cond  | 
134 | 214 | }  | 
135 | 215 | 
 
  | 
136 | 216 | define <4 x i32> @test_nocse2(ptr %a, [2 x <4 x i32>] %s.coerce, i32 %n) {  | 
 | 217 | +; CHECK-LABEL: define <4 x i32> @test_nocse2(  | 
 | 218 | +; CHECK-SAME: ptr [[A:%.*]], [2 x <4 x i32>] [[S_COERCE:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {  | 
 | 219 | +; CHECK-NEXT:  [[ENTRY:.*]]:  | 
 | 220 | +; CHECK-NEXT:    [[S_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[S_COERCE]], 0  | 
 | 221 | +; CHECK-NEXT:    [[S_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[S_COERCE]], 1  | 
 | 222 | +; CHECK-NEXT:    br label %[[FOR_COND:.*]]  | 
 | 223 | +; CHECK:       [[FOR_COND]]:  | 
 | 224 | +; CHECK-NEXT:    [[I_0:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY:.*]] ]  | 
 | 225 | +; CHECK-NEXT:    [[RES_0:%.*]] = phi <4 x i32> [ undef, %[[ENTRY]] ], [ [[CALL:%.*]], %[[FOR_BODY]] ]  | 
 | 226 | +; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_0]], [[N]]  | 
 | 227 | +; CHECK-NEXT:    br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_END:.*]]  | 
 | 228 | +; CHECK:       [[FOR_BODY]]:  | 
 | 229 | +; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[S_COERCE_FCA_0_EXTRACT]] to <16 x i8>  | 
 | 230 | +; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[S_COERCE_FCA_1_EXTRACT]] to <16 x i8>  | 
 | 231 | +; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[S_COERCE_FCA_0_EXTRACT]], <4 x i32> [[S_COERCE_FCA_1_EXTRACT]], ptr [[A]])  | 
 | 232 | +; CHECK-NEXT:    [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0(ptr [[A]])  | 
 | 233 | +; CHECK-NEXT:    [[VLD3_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], 0  | 
 | 234 | +; CHECK-NEXT:    [[VLD3_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], 2  | 
 | 235 | +; CHECK-NEXT:    [[CALL]] = call <4 x i32> @vaddq_s32(<4 x i32> [[VLD3_FCA_0_EXTRACT]], <4 x i32> [[VLD3_FCA_2_EXTRACT]])  | 
 | 236 | +; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_0]], 1  | 
 | 237 | +; CHECK-NEXT:    br label %[[FOR_COND]]  | 
 | 238 | +; CHECK:       [[FOR_END]]:  | 
 | 239 | +; CHECK-NEXT:    ret <4 x i32> [[RES_0]]  | 
 | 240 | +;  | 
137 | 241 | entry:  | 
138 | 242 | ; Check that @llvm.aarch64.neon.ld3 is not optimized away by Early CSE due  | 
139 | 243 | ; to mismatch between st2 and ld3.  | 
140 |  | -; CHECK-LABEL: @test_nocse2  | 
141 |  | -; CHECK: call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0  | 
142 | 244 |   %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0  | 
143 | 245 |   %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1  | 
144 | 246 |   br label %for.cond  | 
@@ -167,12 +269,33 @@ for.end:                                          ; preds = %for.cond  | 
167 | 269 | }  | 
168 | 270 | 
 
  | 
169 | 271 | define <4 x i32> @test_nocse3(ptr %a, [2 x <4 x i32>] %s.coerce, i32 %n) {  | 
 | 272 | +; CHECK-LABEL: define <4 x i32> @test_nocse3(  | 
 | 273 | +; CHECK-SAME: ptr [[A:%.*]], [2 x <4 x i32>] [[S_COERCE:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {  | 
 | 274 | +; CHECK-NEXT:  [[ENTRY:.*]]:  | 
 | 275 | +; CHECK-NEXT:    [[S_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[S_COERCE]], 0  | 
 | 276 | +; CHECK-NEXT:    [[S_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[S_COERCE]], 1  | 
 | 277 | +; CHECK-NEXT:    br label %[[FOR_COND:.*]]  | 
 | 278 | +; CHECK:       [[FOR_COND]]:  | 
 | 279 | +; CHECK-NEXT:    [[I_0:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY:.*]] ]  | 
 | 280 | +; CHECK-NEXT:    [[RES_0:%.*]] = phi <4 x i32> [ undef, %[[ENTRY]] ], [ [[CALL:%.*]], %[[FOR_BODY]] ]  | 
 | 281 | +; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_0]], [[N]]  | 
 | 282 | +; CHECK-NEXT:    br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_END:.*]]  | 
 | 283 | +; CHECK:       [[FOR_BODY]]:  | 
 | 284 | +; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[S_COERCE_FCA_0_EXTRACT]] to <16 x i8>  | 
 | 285 | +; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[S_COERCE_FCA_1_EXTRACT]] to <16 x i8>  | 
 | 286 | +; CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> [[S_COERCE_FCA_1_EXTRACT]], <4 x i32> [[S_COERCE_FCA_0_EXTRACT]], <4 x i32> [[S_COERCE_FCA_0_EXTRACT]], ptr [[A]])  | 
 | 287 | +; CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[S_COERCE_FCA_0_EXTRACT]], <4 x i32> [[S_COERCE_FCA_0_EXTRACT]], ptr [[A]])  | 
 | 288 | +; CHECK-NEXT:    [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0(ptr [[A]])  | 
 | 289 | +; CHECK-NEXT:    [[VLD3_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], 0  | 
 | 290 | +; CHECK-NEXT:    [[CALL]] = call <4 x i32> @vaddq_s32(<4 x i32> [[VLD3_FCA_0_EXTRACT]], <4 x i32> [[VLD3_FCA_0_EXTRACT]])  | 
 | 291 | +; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_0]], 1  | 
 | 292 | +; CHECK-NEXT:    br label %[[FOR_COND]]  | 
 | 293 | +; CHECK:       [[FOR_END]]:  | 
 | 294 | +; CHECK-NEXT:    ret <4 x i32> [[RES_0]]  | 
 | 295 | +;  | 
170 | 296 | entry:  | 
171 | 297 | ; Check that @llvm.aarch64.neon.st3 is not optimized away by Early CSE due to  | 
172 | 298 | ; mismatch between st2 and st3.  | 
173 |  | -; CHECK-LABEL: @test_nocse3  | 
174 |  | -; CHECK: call void @llvm.aarch64.neon.st3.v4i32.p0  | 
175 |  | -; CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0  | 
176 | 299 |   %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0  | 
177 | 300 |   %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1  | 
178 | 301 |   br label %for.cond  | 
@@ -214,6 +337,12 @@ declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr)  | 
214 | 337 | declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0(ptr)  | 
215 | 338 | 
 
  | 
216 | 339 | define internal fastcc <4 x i32> @vaddq_s32(<4 x i32> %__p0, <4 x i32> %__p1) {  | 
 | 340 | +; CHECK-LABEL: define internal fastcc <4 x i32> @vaddq_s32(  | 
 | 341 | +; CHECK-SAME: <4 x i32> [[__P0:%.*]], <4 x i32> [[__P1:%.*]]) #[[ATTR0]] {  | 
 | 342 | +; CHECK-NEXT:  [[ENTRY:.*:]]  | 
 | 343 | +; CHECK-NEXT:    [[ADD:%.*]] = add <4 x i32> [[__P0]], [[__P1]]  | 
 | 344 | +; CHECK-NEXT:    ret <4 x i32> [[ADD]]  | 
 | 345 | +;  | 
217 | 346 | entry:  | 
218 | 347 |   %add = add <4 x i32> %__p0, %__p1  | 
219 | 348 |   ret <4 x i32> %add  | 
 | 
0 commit comments