This patch teaches GVN to eliminate redundant masked loads and to forward previously loaded or computed values through a select. This is possible when the same mask is used for masked stores and loads that access the same memory location.
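For illustration only (this sketch is not part of the test file, and %v, %p and %reload are placeholder names), a masked store followed by a masked load of the same pointer with the same mask can have the reload rewritten as a select over the stored value, or replaced by the stored value directly when the passthrough already matches:

  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %v, ptr %p, i32 1, <vscale x 4 x i1> %mask)
  %reload = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %p, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> %passthrough)

; after GVN the reload becomes:

  %reload = select <vscale x 4 x i1> %mask, <vscale x 4 x float> %v, <vscale x 4 x float> %passthrough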
; CHECK-NEXT: [[LOAD_0_16:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[GEP_0_16]], i32 1, <4 x i1> [[MASK]], <4 x float> zeroinitializer)
; CHECK-NEXT: [[FMUL:%.*]] = fmul <4 x float> [[LOAD_0_0]], [[LOAD_0_16]]
; CHECK-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <4 x i1> [[MASK]])
; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[MASK]], <4 x float> [[FMUL]], <4 x float> [[PASSTHROUGH:%.*]]
; CHECK-NEXT: ret <4 x float> [[TMP3]]
;
  %mask = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %a, i32 %b)
  %load.0.0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
  %gep.0.16 = getelementptr i8, ptr %0, i32 16
  %load.0.16 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %gep.0.16, i32 1, <4 x i1> %mask, <4 x float> zeroinitializer)
  %fmul = fmul <4 x float> %load.0.0, %load.0.16
  call void @llvm.masked.store.v4f32.p0(<4 x float> %fmul, ptr %1, i32 1, <4 x i1> %mask)
  %load.1.0 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %1, i32 1, <4 x i1> %mask, <4 x float> %passthrough)
  ret <4 x float> %load.1.0
}

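; The store and the final load use the same address, mask and passthrough, so GVN forwards the stored value and removes the load.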
define <vscale x 4 x float> @forward_masked_load_scalable(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) {
; CHECK-LABEL: @forward_masked_load_scalable(
; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP0:%.*]], i32 1, <vscale x 4 x i1> [[TMP3]], <vscale x 4 x float> [[PASSTHROUGH:%.*]])
; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP4]], ptr [[TMP1:%.*]], i32 1, <vscale x 4 x i1> [[TMP3]])
; CHECK-NEXT: ret <vscale x 4 x float> [[TMP4]]
;
  %6 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
  %7 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %0, i32 1, <vscale x 4 x i1> %6, <vscale x 4 x float> %passthrough)
  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %7, ptr %1, i32 1, <vscale x 4 x i1> %6)
  %8 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %1, i32 1, <vscale x 4 x i1> %6, <vscale x 4 x float> %passthrough)
  ret <vscale x 4 x float> %8
}

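; The available value was loaded with a different passthrough (zeroinitializer) than the reload expects, so GVN leaves the second load alone.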
define <vscale x 4 x float> @bail_on_different_passthrough(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) {
; CHECK-LABEL: @bail_on_different_passthrough(
; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP0:%.*]], i32 1, <vscale x 4 x i1> [[TMP3]], <vscale x 4 x float> zeroinitializer)
; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP4]], ptr [[TMP1:%.*]], i32 1, <vscale x 4 x i1> [[TMP3]])
; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[TMP3]], <vscale x 4 x float> [[PASSTHROUGH:%.*]])
; CHECK-NEXT: ret <vscale x 4 x float> [[TMP5]]
;
  %6 = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
  %7 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %0, i32 1, <vscale x 4 x i1> %6, <vscale x 4 x float> zeroinitializer)
  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %7, ptr %1, i32 1, <vscale x 4 x i1> %6)
  %8 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %1, i32 1, <vscale x 4 x i1> %6, <vscale x 4 x float> %passthrough)
  ret <vscale x 4 x float> %8
}

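; The stored value is computed by an fmul, so the reload is replaced by a select between that value and the reload's passthrough.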
define <vscale x 4 x float> @forward_binop_with_sel_scalable(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) {
; CHECK-LABEL: @forward_binop_with_sel_scalable(
; CHECK-NEXT: [[MASK:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
; CHECK-NEXT: [[LOAD_0_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP0:%.*]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer)
; CHECK-NEXT: [[LOAD_0_16:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[GEP_0_16]], i32 1, <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> zeroinitializer)
; CHECK-NEXT: [[FMUL:%.*]] = fmul <vscale x 4 x float> [[LOAD_0_0]], [[LOAD_0_16]]
; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <vscale x 4 x i1> [[MASK]])
; CHECK-NEXT: [[TMP3:%.*]] = select <vscale x 4 x i1> [[MASK]], <vscale x 4 x float> [[FMUL]], <vscale x 4 x float> [[PASSTHROUGH:%.*]]
; CHECK-NEXT: ret <vscale x 4 x float> [[TMP3]]
;
  %mask = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
  %load.0.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %0, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
  %gep.0.16 = getelementptr i8, ptr %0, i32 16
  %load.0.16 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %gep.0.16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> zeroinitializer)
  %fmul = fmul <vscale x 4 x float> %load.0.0, %load.0.16
  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %fmul, ptr %1, i32 1, <vscale x 4 x i1> %mask)
  %load.1.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %1, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> %passthrough)
  ret <vscale x 4 x float> %load.1.0
}

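; The reload's mask differs from the store's mask, so no forwarding happens and the load is kept.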
define <vscale x 4 x float> @load_mask_differs(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) {
; CHECK-LABEL: @load_mask_differs(
; CHECK-NEXT: [[MASK0:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 8)
; CHECK-NEXT: [[MASK1:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
; CHECK-NEXT: [[LOAD_0_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP0:%.*]], i32 1, <vscale x 4 x i1> [[MASK0]], <vscale x 4 x float> zeroinitializer)
; CHECK-NEXT: [[LOAD_0_16:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[GEP_0_16]], i32 1, <vscale x 4 x i1> [[MASK0]], <vscale x 4 x float> zeroinitializer)
; CHECK-NEXT: [[FMUL:%.*]] = fmul <vscale x 4 x float> [[LOAD_0_0]], [[LOAD_0_16]]
; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <vscale x 4 x i1> [[MASK0]])
; CHECK-NEXT: [[LOAD_1_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[MASK1]], <vscale x 4 x float> [[PASSTHROUGH:%.*]])
; CHECK-NEXT: ret <vscale x 4 x float> [[LOAD_1_0]]
;
  %mask0 = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 8)
  %mask1 = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
  %load.0.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %0, i32 1, <vscale x 4 x i1> %mask0, <vscale x 4 x float> zeroinitializer)
  %gep.0.16 = getelementptr i8, ptr %0, i32 16
  %load.0.16 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %gep.0.16, i32 1, <vscale x 4 x i1> %mask0, <vscale x 4 x float> zeroinitializer)
  %fmul = fmul <vscale x 4 x float> %load.0.0, %load.0.16
  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %fmul, ptr %1, i32 1, <vscale x 4 x i1> %mask0)
  %load.1.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %1, i32 1, <vscale x 4 x i1> %mask1, <vscale x 4 x float> %passthrough)
  ret <vscale x 4 x float> %load.1.0
}

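; The store's mask differs from the reload's mask, so no forwarding happens and the load is kept.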
define <vscale x 4 x float> @store_mask_differs(ptr %0, ptr %1, <vscale x 4 x float> %passthrough) {
; CHECK-LABEL: @store_mask_differs(
; CHECK-NEXT: [[MASK0:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 8)
; CHECK-NEXT: [[MASK1:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
; CHECK-NEXT: [[LOAD_0_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP0:%.*]], i32 1, <vscale x 4 x i1> [[MASK0]], <vscale x 4 x float> zeroinitializer)
; CHECK-NEXT: [[LOAD_0_16:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[GEP_0_16]], i32 1, <vscale x 4 x i1> [[MASK0]], <vscale x 4 x float> zeroinitializer)
; CHECK-NEXT: [[FMUL:%.*]] = fmul <vscale x 4 x float> [[LOAD_0_0]], [[LOAD_0_16]]
; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[FMUL]], ptr [[TMP1:%.*]], i32 1, <vscale x 4 x i1> [[MASK1]])
; CHECK-NEXT: [[LOAD_1_0:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP1]], i32 1, <vscale x 4 x i1> [[MASK0]], <vscale x 4 x float> [[PASSTHROUGH:%.*]])
; CHECK-NEXT: ret <vscale x 4 x float> [[LOAD_1_0]]
;
  %mask0 = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 8)
  %mask1 = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 4)
  %load.0.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %0, i32 1, <vscale x 4 x i1> %mask0, <vscale x 4 x float> zeroinitializer)
  %gep.0.16 = getelementptr i8, ptr %0, i32 16
  %load.0.16 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %gep.0.16, i32 1, <vscale x 4 x i1> %mask0, <vscale x 4 x float> zeroinitializer)
  %fmul = fmul <vscale x 4 x float> %load.0.0, %load.0.16
  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> %fmul, ptr %1, i32 1, <vscale x 4 x i1> %mask1)
  %load.1.0 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %1, i32 1, <vscale x 4 x i1> %mask0, <vscale x 4 x float> %passthrough)
  ret <vscale x 4 x float> %load.1.0
}

; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read)
declare <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr captures(none), i32 immarg, <vscale x 4 x i1>, <vscale x 4 x float>) #1

; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: write)
declare void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float>, ptr captures(none), i32 immarg, <vscale x 4 x i1>) #2

; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
declare <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32, i32) #3

declare <128 x i8> @llvm.masked.load.v128i8.p0(ptr, i32, <128 x i1>, <128 x i8>)
declare void @llvm.masked.store.v128i8.p0(<128 x i8>, ptr, i32, <128 x i1>)