@@ -9,20 +9,122 @@ target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-
99@kernel4 = global [512 x float ] zeroinitializer , align 4
1010@src_data = global [1536 x float ] zeroinitializer , align 4
1111
12- ; We don't want to vectorize most loops containing gathers because they are
13- ; expensive.
14- ; Make sure we don't vectorize it.
12+ ; The cost of the gathers in the loop is offset by the vector math, so the loop is expected to be vectorized.
1513
1614define float @_Z4testmm (i64 %size , i64 %offset ) {
1715; CHECK-LABEL: define float @_Z4testmm(
1816; CHECK-SAME: i64 [[SIZE:%.*]], i64 [[OFFSET:%.*]]) {
1917; CHECK-NEXT: [[ENTRY:.*]]:
18+ ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SIZE]], 4
19+ ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
20+ ; CHECK: [[VECTOR_PH]]:
21+ ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[SIZE]], 4
22+ ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[SIZE]], [[N_MOD_VF]]
23+ ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[OFFSET]], i64 0
24+ ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
25+ ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
26+ ; CHECK: [[VECTOR_BODY]]:
27+ ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
28+ ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
29+ ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP26:%.*]], %[[VECTOR_BODY]] ]
30+ ; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP48:%.*]], %[[VECTOR_BODY]] ]
31+ ; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP70:%.*]], %[[VECTOR_BODY]] ]
32+ ; CHECK-NEXT: [[TMP75:%.*]] = add <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
33+ ; CHECK-NEXT: [[TMP76:%.*]] = mul <4 x i64> [[TMP75]], splat (i64 3)
34+ ; CHECK-NEXT: [[TMP77:%.*]] = extractelement <4 x i64> [[TMP76]], i32 0
35+ ; CHECK-NEXT: [[TMP78:%.*]] = extractelement <4 x i64> [[TMP76]], i32 1
36+ ; CHECK-NEXT: [[TMP79:%.*]] = extractelement <4 x i64> [[TMP76]], i32 2
37+ ; CHECK-NEXT: [[TMP80:%.*]] = extractelement <4 x i64> [[TMP76]], i32 3
38+ ; CHECK-NEXT: [[TMP81:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP77]]
39+ ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP78]]
40+ ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP79]]
41+ ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP80]]
42+ ; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP81]], align 4
43+ ; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP7]], align 4
44+ ; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP8]], align 4
45+ ; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP9]], align 4
46+ ; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x float> poison, float [[TMP10]], i32 0
47+ ; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x float> [[TMP14]], float [[TMP11]], i32 1
48+ ; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[TMP12]], i32 2
49+ ; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x float> [[TMP16]], float [[TMP13]], i32 3
50+ ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [512 x float], ptr @kernel, i64 0, i64 [[INDEX]]
51+ ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP18]], align 4
52+ ; CHECK-NEXT: [[TMP19:%.*]] = fmul fast <4 x float> [[TMP17]], [[WIDE_LOAD]]
53+ ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [512 x float], ptr @kernel2, i64 0, i64 [[INDEX]]
54+ ; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x float>, ptr [[TMP20]], align 4
55+ ; CHECK-NEXT: [[TMP21:%.*]] = fmul fast <4 x float> [[TMP19]], [[WIDE_LOAD3]]
56+ ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [512 x float], ptr @kernel3, i64 0, i64 [[INDEX]]
57+ ; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP22]], align 4
58+ ; CHECK-NEXT: [[TMP23:%.*]] = fmul fast <4 x float> [[TMP21]], [[WIDE_LOAD4]]
59+ ; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [512 x float], ptr @kernel4, i64 0, i64 [[INDEX]]
60+ ; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP24]], align 4
61+ ; CHECK-NEXT: [[TMP25:%.*]] = fmul fast <4 x float> [[TMP23]], [[WIDE_LOAD5]]
62+ ; CHECK-NEXT: [[TMP26]] = fadd fast <4 x float> [[VEC_PHI]], [[TMP25]]
63+ ; CHECK-NEXT: [[TMP27:%.*]] = add <4 x i64> [[TMP76]], splat (i64 1)
64+ ; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP27]], i32 0
65+ ; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x i64> [[TMP27]], i32 1
66+ ; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i64> [[TMP27]], i32 2
67+ ; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x i64> [[TMP27]], i32 3
68+ ; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP28]]
69+ ; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP29]]
70+ ; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP30]]
71+ ; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP31]]
72+ ; CHECK-NEXT: [[TMP36:%.*]] = load float, ptr [[TMP32]], align 4
73+ ; CHECK-NEXT: [[TMP37:%.*]] = load float, ptr [[TMP33]], align 4
74+ ; CHECK-NEXT: [[TMP38:%.*]] = load float, ptr [[TMP34]], align 4
75+ ; CHECK-NEXT: [[TMP39:%.*]] = load float, ptr [[TMP35]], align 4
76+ ; CHECK-NEXT: [[TMP40:%.*]] = insertelement <4 x float> poison, float [[TMP36]], i32 0
77+ ; CHECK-NEXT: [[TMP41:%.*]] = insertelement <4 x float> [[TMP40]], float [[TMP37]], i32 1
78+ ; CHECK-NEXT: [[TMP42:%.*]] = insertelement <4 x float> [[TMP41]], float [[TMP38]], i32 2
79+ ; CHECK-NEXT: [[TMP43:%.*]] = insertelement <4 x float> [[TMP42]], float [[TMP39]], i32 3
80+ ; CHECK-NEXT: [[TMP44:%.*]] = fmul fast <4 x float> [[WIDE_LOAD]], [[TMP43]]
81+ ; CHECK-NEXT: [[TMP45:%.*]] = fmul fast <4 x float> [[WIDE_LOAD3]], [[TMP44]]
82+ ; CHECK-NEXT: [[TMP46:%.*]] = fmul fast <4 x float> [[WIDE_LOAD4]], [[TMP45]]
83+ ; CHECK-NEXT: [[TMP47:%.*]] = fmul fast <4 x float> [[WIDE_LOAD5]], [[TMP46]]
84+ ; CHECK-NEXT: [[TMP48]] = fadd fast <4 x float> [[VEC_PHI1]], [[TMP47]]
85+ ; CHECK-NEXT: [[TMP49:%.*]] = add <4 x i64> [[TMP76]], splat (i64 2)
86+ ; CHECK-NEXT: [[TMP50:%.*]] = extractelement <4 x i64> [[TMP49]], i32 0
87+ ; CHECK-NEXT: [[TMP51:%.*]] = extractelement <4 x i64> [[TMP49]], i32 1
88+ ; CHECK-NEXT: [[TMP52:%.*]] = extractelement <4 x i64> [[TMP49]], i32 2
89+ ; CHECK-NEXT: [[TMP53:%.*]] = extractelement <4 x i64> [[TMP49]], i32 3
90+ ; CHECK-NEXT: [[TMP54:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP50]]
91+ ; CHECK-NEXT: [[TMP55:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP51]]
92+ ; CHECK-NEXT: [[TMP56:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP52]]
93+ ; CHECK-NEXT: [[TMP57:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP53]]
94+ ; CHECK-NEXT: [[TMP58:%.*]] = load float, ptr [[TMP54]], align 4
95+ ; CHECK-NEXT: [[TMP59:%.*]] = load float, ptr [[TMP55]], align 4
96+ ; CHECK-NEXT: [[TMP60:%.*]] = load float, ptr [[TMP56]], align 4
97+ ; CHECK-NEXT: [[TMP61:%.*]] = load float, ptr [[TMP57]], align 4
98+ ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x float> poison, float [[TMP58]], i32 0
99+ ; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x float> [[TMP62]], float [[TMP59]], i32 1
100+ ; CHECK-NEXT: [[TMP64:%.*]] = insertelement <4 x float> [[TMP63]], float [[TMP60]], i32 2
101+ ; CHECK-NEXT: [[TMP65:%.*]] = insertelement <4 x float> [[TMP64]], float [[TMP61]], i32 3
102+ ; CHECK-NEXT: [[TMP66:%.*]] = fmul fast <4 x float> [[WIDE_LOAD]], [[TMP65]]
103+ ; CHECK-NEXT: [[TMP67:%.*]] = fmul fast <4 x float> [[WIDE_LOAD3]], [[TMP66]]
104+ ; CHECK-NEXT: [[TMP68:%.*]] = fmul fast <4 x float> [[WIDE_LOAD4]], [[TMP67]]
105+ ; CHECK-NEXT: [[TMP69:%.*]] = fmul fast <4 x float> [[WIDE_LOAD5]], [[TMP68]]
106+ ; CHECK-NEXT: [[TMP70]] = fadd fast <4 x float> [[VEC_PHI2]], [[TMP69]]
107+ ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
108+ ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
109+ ; CHECK-NEXT: [[TMP71:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
110+ ; CHECK-NEXT: br i1 [[TMP71]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
111+ ; CHECK: [[MIDDLE_BLOCK]]:
112+ ; CHECK-NEXT: [[TMP72:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP26]])
113+ ; CHECK-NEXT: [[TMP73:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP48]])
114+ ; CHECK-NEXT: [[TMP74:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP70]])
115+ ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SIZE]], [[N_VEC]]
116+ ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
117+ ; CHECK: [[SCALAR_PH]]:
118+ ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
119+ ; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP72]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ]
120+ ; CHECK-NEXT: [[BC_MERGE_RDX6:%.*]] = phi float [ [[TMP73]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ]
121+ ; CHECK-NEXT: [[BC_MERGE_RDX7:%.*]] = phi float [ [[TMP74]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ]
20122; CHECK-NEXT: br label %[[LOOP:.*]]
21123; CHECK: [[LOOP]]:
22- ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0 , %[[ENTRY ]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
23- ; CHECK-NEXT: [[RDX_0:%.*]] = phi float [ 0.000000e+00 , %[[ENTRY ]] ], [ [[RDX_0_NEXT:%.*]], %[[LOOP]] ]
24- ; CHECK-NEXT: [[RDX_1:%.*]] = phi float [ 0.000000e+00 , %[[ENTRY ]] ], [ [[RDX_1_NEXT:%.*]], %[[LOOP]] ]
25- ; CHECK-NEXT: [[RED_2:%.*]] = phi float [ 0.000000e+00 , %[[ENTRY ]] ], [ [[RDX_2_NEXT:%.*]], %[[LOOP]] ]
124+ ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]] , %[[SCALAR_PH ]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
125+ ; CHECK-NEXT: [[RDX_0:%.*]] = phi float [ [[BC_MERGE_RDX]] , %[[SCALAR_PH ]] ], [ [[RDX_0_NEXT:%.*]], %[[LOOP]] ]
126+ ; CHECK-NEXT: [[RDX_1:%.*]] = phi float [ [[BC_MERGE_RDX6]] , %[[SCALAR_PH ]] ], [ [[RDX_1_NEXT:%.*]], %[[LOOP]] ]
127+ ; CHECK-NEXT: [[RED_2:%.*]] = phi float [ [[BC_MERGE_RDX7]] , %[[SCALAR_PH ]] ], [ [[RDX_2_NEXT:%.*]], %[[LOOP]] ]
26128; CHECK-NEXT: [[ADD:%.*]] = add i64 [[IV]], [[OFFSET]]
27129; CHECK-NEXT: [[MUL:%.*]] = mul i64 [[ADD]], 3
28130; CHECK-NEXT: [[GEP_SRC_DATA:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[MUL]]
@@ -58,11 +160,11 @@ define float @_Z4testmm(i64 %size, i64 %offset) {
58160; CHECK-NEXT: [[RDX_2_NEXT]] = fadd fast float [[RED_2]], [[MUL29]]
59161; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
60162; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], [[SIZE]]
61- ; CHECK-NEXT: br i1 [[EXITCOND]], label %[[LOOP]], label %[[EXIT:.* ]]
163+ ; CHECK-NEXT: br i1 [[EXITCOND]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP3:![0-9]+ ]]
62164; CHECK: [[EXIT]]:
63- ; CHECK-NEXT: [[RDX_0_NEXT_LCSSA:%.*]] = phi float [ [[RDX_0_NEXT]], %[[LOOP]] ]
64- ; CHECK-NEXT: [[RDX_1_NEXT_LCSSA:%.*]] = phi float [ [[RDX_1_NEXT]], %[[LOOP]] ]
65- ; CHECK-NEXT: [[RDX_2_NEXT_LCSSA:%.*]] = phi float [ [[RDX_2_NEXT]], %[[LOOP]] ]
165+ ; CHECK-NEXT: [[RDX_0_NEXT_LCSSA:%.*]] = phi float [ [[RDX_0_NEXT]], %[[LOOP]] ], [ [[TMP72]], %[[MIDDLE_BLOCK]] ]
166+ ; CHECK-NEXT: [[RDX_1_NEXT_LCSSA:%.*]] = phi float [ [[RDX_1_NEXT]], %[[LOOP]] ], [ [[TMP73]], %[[MIDDLE_BLOCK]] ]
167+ ; CHECK-NEXT: [[RDX_2_NEXT_LCSSA:%.*]] = phi float [ [[RDX_2_NEXT]], %[[LOOP]] ], [ [[TMP74]], %[[MIDDLE_BLOCK]] ]
66168; CHECK-NEXT: [[RES_0:%.*]] = fadd float [[RDX_0_NEXT_LCSSA]], [[RDX_1_NEXT_LCSSA]]
67169; CHECK-NEXT: [[RES_1:%.*]] = fadd float [[RES_0]], [[RDX_2_NEXT_LCSSA]]
68170; CHECK-NEXT: ret float [[RES_1]]
@@ -117,3 +219,9 @@ exit:
117219 %res.1 = fadd float %res.0 , %rdx.2.next
118220 ret float %res.1
119221}
222+ ;.
223+ ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
224+ ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
225+ ; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
226+ ; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
227+ ;.
0 commit comments