@@ -151,8 +151,8 @@ define protected amdgpu_kernel void @nand(i32 addrspace(1)* %p, %S addrspace(1)*
151151 ret void
152152}
153153
154- define protected amdgpu_kernel void @max(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
155- ; CHECK-LABEL: max:
154+ define protected amdgpu_kernel void @max_workgroup(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
155+ ; CHECK-LABEL: max_workgroup:
156156; CHECK: ; %bb.0:
157157; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
158158; CHECK-NEXT: v_mov_b32_e32 v0, 0
@@ -165,6 +165,41 @@ define protected amdgpu_kernel void @max(i32 addrspace(1)* %p, %S addrspace(1)*
165165; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
166166; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
167167; CHECK-NEXT: global_store_dword v[0:1], v2, off
168+ ; CHECK-NEXT: s_endpgm
169+ %n32 = atomicrmw max i32 addrspace(1)* %p, i32 1 syncscope("workgroup") monotonic
170+ %n64 = zext i32 %n32 to i64
171+ %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
172+ store float 1.0, float addrspace(1)* %p1
173+ ret void
174+ }
175+
176+ define protected amdgpu_kernel void @max(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
177+ ; CHECK-LABEL: max:
178+ ; CHECK: ; %bb.0:
179+ ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
180+ ; CHECK-NEXT: s_mov_b64 s[4:5], 0
181+ ; CHECK-NEXT: v_mov_b32_e32 v1, 0
182+ ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
183+ ; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0
184+ ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
185+ ; CHECK-NEXT: v_mov_b32_e32 v0, s6
186+ ; CHECK-NEXT: .LBB7_1: ; %atomicrmw.start
187+ ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
188+ ; CHECK-NEXT: v_mov_b32_e32 v3, v0
189+ ; CHECK-NEXT: v_max_i32_e32 v2, 1, v3
190+ ; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc
191+ ; CHECK-NEXT: s_waitcnt vmcnt(0)
192+ ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
193+ ; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
194+ ; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
195+ ; CHECK-NEXT: s_cbranch_execnz .LBB7_1
196+ ; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
197+ ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
198+ ; CHECK-NEXT: v_mov_b32_e32 v2, s2
199+ ; CHECK-NEXT: v_mov_b32_e32 v3, s3
200+ ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3]
201+ ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
202+ ; CHECK-NEXT: global_store_dword v[0:1], v2, off
168203; CHECK-NEXT: s_endpgm
169204 %n32 = atomicrmw max i32 addrspace(1)* %p, i32 1 monotonic
170205 %n64 = zext i32 %n32 to i64
@@ -173,8 +208,8 @@ define protected amdgpu_kernel void @max(i32 addrspace(1)* %p, %S addrspace(1)*
173208 ret void
174209}
175210
176- define protected amdgpu_kernel void @min(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
177- ; CHECK-LABEL: min:
211+ define protected amdgpu_kernel void @min_workgroup(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
212+ ; CHECK-LABEL: min_workgroup:
178213; CHECK: ; %bb.0:
179214; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
180215; CHECK-NEXT: v_mov_b32_e32 v0, 0
@@ -187,6 +222,41 @@ define protected amdgpu_kernel void @min(i32 addrspace(1)* %p, %S addrspace(1)*
187222; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
188223; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
189224; CHECK-NEXT: global_store_dword v[0:1], v2, off
225+ ; CHECK-NEXT: s_endpgm
226+ %n32 = atomicrmw min i32 addrspace(1)* %p, i32 1 syncscope("workgroup") monotonic
227+ %n64 = zext i32 %n32 to i64
228+ %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
229+ store float 1.0, float addrspace(1)* %p1
230+ ret void
231+ }
232+
233+ define protected amdgpu_kernel void @min(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
234+ ; CHECK-LABEL: min:
235+ ; CHECK: ; %bb.0:
236+ ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
237+ ; CHECK-NEXT: s_mov_b64 s[4:5], 0
238+ ; CHECK-NEXT: v_mov_b32_e32 v1, 0
239+ ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
240+ ; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0
241+ ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
242+ ; CHECK-NEXT: v_mov_b32_e32 v0, s6
243+ ; CHECK-NEXT: .LBB9_1: ; %atomicrmw.start
244+ ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
245+ ; CHECK-NEXT: v_mov_b32_e32 v3, v0
246+ ; CHECK-NEXT: v_min_i32_e32 v2, 1, v3
247+ ; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc
248+ ; CHECK-NEXT: s_waitcnt vmcnt(0)
249+ ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
250+ ; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
251+ ; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
252+ ; CHECK-NEXT: s_cbranch_execnz .LBB9_1
253+ ; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
254+ ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
255+ ; CHECK-NEXT: v_mov_b32_e32 v2, s2
256+ ; CHECK-NEXT: v_mov_b32_e32 v3, s3
257+ ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3]
258+ ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
259+ ; CHECK-NEXT: global_store_dword v[0:1], v2, off
190260; CHECK-NEXT: s_endpgm
191261 %n32 = atomicrmw min i32 addrspace(1)* %p, i32 1 monotonic
192262 %n64 = zext i32 %n32 to i64
@@ -195,8 +265,8 @@ define protected amdgpu_kernel void @min(i32 addrspace(1)* %p, %S addrspace(1)*
195265 ret void
196266}
197267
198- define protected amdgpu_kernel void @umax(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
199- ; CHECK-LABEL: umax:
268+ define protected amdgpu_kernel void @umax_workgroup(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
269+ ; CHECK-LABEL: umax_workgroup:
200270; CHECK: ; %bb.0:
201271; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
202272; CHECK-NEXT: v_mov_b32_e32 v0, 0
@@ -209,6 +279,41 @@ define protected amdgpu_kernel void @umax(i32 addrspace(1)* %p, %S addrspace(1)*
209279; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
210280; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
211281; CHECK-NEXT: global_store_dword v[0:1], v2, off
282+ ; CHECK-NEXT: s_endpgm
283+ %n32 = atomicrmw umax i32 addrspace(1)* %p, i32 1 syncscope("workgroup") monotonic
284+ %n64 = zext i32 %n32 to i64
285+ %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
286+ store float 1.0, float addrspace(1)* %p1
287+ ret void
288+ }
289+
290+ define protected amdgpu_kernel void @umax(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
291+ ; CHECK-LABEL: umax:
292+ ; CHECK: ; %bb.0:
293+ ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
294+ ; CHECK-NEXT: s_mov_b64 s[4:5], 0
295+ ; CHECK-NEXT: v_mov_b32_e32 v1, 0
296+ ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
297+ ; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0
298+ ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
299+ ; CHECK-NEXT: v_mov_b32_e32 v0, s6
300+ ; CHECK-NEXT: .LBB11_1: ; %atomicrmw.start
301+ ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
302+ ; CHECK-NEXT: v_mov_b32_e32 v3, v0
303+ ; CHECK-NEXT: v_max_u32_e32 v2, 1, v3
304+ ; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc
305+ ; CHECK-NEXT: s_waitcnt vmcnt(0)
306+ ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
307+ ; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
308+ ; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
309+ ; CHECK-NEXT: s_cbranch_execnz .LBB11_1
310+ ; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
311+ ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
312+ ; CHECK-NEXT: v_mov_b32_e32 v2, s2
313+ ; CHECK-NEXT: v_mov_b32_e32 v3, s3
314+ ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3]
315+ ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
316+ ; CHECK-NEXT: global_store_dword v[0:1], v2, off
212317; CHECK-NEXT: s_endpgm
213318 %n32 = atomicrmw umax i32 addrspace(1)* %p, i32 1 monotonic
214319 %n64 = zext i32 %n32 to i64
@@ -217,8 +322,8 @@ define protected amdgpu_kernel void @umax(i32 addrspace(1)* %p, %S addrspace(1)*
217322 ret void
218323}
219324
220- define protected amdgpu_kernel void @umin(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
221- ; CHECK-LABEL: umin:
325+ define protected amdgpu_kernel void @umin_workgroup(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
326+ ; CHECK-LABEL: umin_workgroup:
222327; CHECK: ; %bb.0:
223328; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
224329; CHECK-NEXT: v_mov_b32_e32 v0, 0
@@ -231,6 +336,41 @@ define protected amdgpu_kernel void @umin(i32 addrspace(1)* %p, %S addrspace(1)*
231336; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
232337; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
233338; CHECK-NEXT: global_store_dword v[0:1], v2, off
339+ ; CHECK-NEXT: s_endpgm
340+ %n32 = atomicrmw umin i32 addrspace(1)* %p, i32 1 syncscope("workgroup") monotonic
341+ %n64 = zext i32 %n32 to i64
342+ %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
343+ store float 1.0, float addrspace(1)* %p1
344+ ret void
345+ }
346+
347+ define protected amdgpu_kernel void @umin(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
348+ ; CHECK-LABEL: umin:
349+ ; CHECK: ; %bb.0:
350+ ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
351+ ; CHECK-NEXT: s_mov_b64 s[4:5], 0
352+ ; CHECK-NEXT: v_mov_b32_e32 v1, 0
353+ ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
354+ ; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0
355+ ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
356+ ; CHECK-NEXT: v_mov_b32_e32 v0, s6
357+ ; CHECK-NEXT: .LBB13_1: ; %atomicrmw.start
358+ ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
359+ ; CHECK-NEXT: v_mov_b32_e32 v3, v0
360+ ; CHECK-NEXT: v_min_u32_e32 v2, 1, v3
361+ ; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc
362+ ; CHECK-NEXT: s_waitcnt vmcnt(0)
363+ ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
364+ ; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
365+ ; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
366+ ; CHECK-NEXT: s_cbranch_execnz .LBB13_1
367+ ; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
368+ ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
369+ ; CHECK-NEXT: v_mov_b32_e32 v2, s2
370+ ; CHECK-NEXT: v_mov_b32_e32 v3, s3
371+ ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3]
372+ ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
373+ ; CHECK-NEXT: global_store_dword v[0:1], v2, off
234374; CHECK-NEXT: s_endpgm
235375 %n32 = atomicrmw umin i32 addrspace(1)* %p, i32 1 monotonic
236376 %n64 = zext i32 %n32 to i64
@@ -337,7 +477,7 @@ define protected amdgpu_kernel void @fadd(float addrspace(1)* %p, %S addrspace(1
337477; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0
338478; CHECK-NEXT: s_waitcnt lgkmcnt(0)
339479; CHECK-NEXT: v_mov_b32_e32 v0, s6
340- ; CHECK-NEXT: .LBB14_1: ; %atomicrmw.start
480+ ; CHECK-NEXT: .LBB18_1: ; %atomicrmw.start
341481; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
342482; CHECK-NEXT: v_mov_b32_e32 v3, v0
343483; CHECK-NEXT: v_add_f32_e32 v2, 1.0, v3
@@ -346,7 +486,7 @@ define protected amdgpu_kernel void @fadd(float addrspace(1)* %p, %S addrspace(1
346486; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
347487; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
348488; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
349- ; CHECK-NEXT: s_cbranch_execnz .LBB14_1
489+ ; CHECK-NEXT: s_cbranch_execnz .LBB18_1
350490; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
351491; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
352492; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v0
@@ -374,7 +514,7 @@ define protected amdgpu_kernel void @fsub(float addrspace(1)* %p, %S addrspace(1
374514; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0
375515; CHECK-NEXT: s_waitcnt lgkmcnt(0)
376516; CHECK-NEXT: v_mov_b32_e32 v0, s6
377- ; CHECK-NEXT: .LBB15_1: ; %atomicrmw.start
517+ ; CHECK-NEXT: .LBB19_1: ; %atomicrmw.start
378518; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
379519; CHECK-NEXT: v_mov_b32_e32 v3, v0
380520; CHECK-NEXT: v_add_f32_e32 v2, -1.0, v3
@@ -383,7 +523,7 @@ define protected amdgpu_kernel void @fsub(float addrspace(1)* %p, %S addrspace(1
383523; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
384524; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
385525; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
386- ; CHECK-NEXT: s_cbranch_execnz .LBB15_1
526+ ; CHECK-NEXT: s_cbranch_execnz .LBB19_1
387527; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
388528; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
389529; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v0
0 commit comments