Skip to content

Commit 80e5678

Browse files
committed
32-bit case
Note that this does very little, because we only use VGPR classes for FP types (though this doesn't particularly make any sense), and we legalize normal loads and stores to integer types.
1 parent 842bd36 commit 80e5678

17 files changed

+1009
-1018
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -91,11 +91,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
9191
addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
9292

9393
addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
94-
addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
94+
95+
const SIRegisterInfo *TRI = STI.getRegisterInfo();
96+
const TargetRegisterClass *V32RegClass =
97+
TRI->getDefaultVectorSuperClassForBitWidth(32);
98+
addRegisterClass(MVT::f32, V32RegClass);
9599

96100
addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
97101

98-
const SIRegisterInfo *TRI = STI.getRegisterInfo();
99102
const TargetRegisterClass *V64RegClass =
100103
TRI->getDefaultVectorSuperClassForBitWidth(64);
101104

llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll

Lines changed: 190 additions & 190 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -381,17 +381,17 @@ define float @no_unsafe(ptr %addr, float %val) {
381381
; GFX90A-LABEL: no_unsafe:
382382
; GFX90A: ; %bb.0:
383383
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
384-
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
384+
; GFX90A-NEXT: flat_load_dword v5, v[0:1]
385385
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
386386
; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start
387387
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
388388
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
389-
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
390389
; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
391390
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
392391
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
393392
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
394393
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
394+
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
395395
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
396396
; GFX90A-NEXT: s_cbranch_execnz .LBB3_1
397397
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end

llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-rtn.ll

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ define amdgpu_ps float @buffer_atomic_fadd_f32_offset_rtn(float %val, <4 x i32>
1818
; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0
1919
; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
2020
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3
21-
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
21+
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
2222
; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN]]
2323
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0
2424
;
@@ -53,7 +53,7 @@ define amdgpu_ps float @buffer_atomic_fadd_f32_offen_rtn(float %val, <4 x i32> i
5353
; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
5454
; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
5555
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
56-
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
56+
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
5757
; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN]]
5858
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0
5959
;
@@ -89,7 +89,7 @@ define amdgpu_ps float @buffer_atomic_fadd_f32_idxen_rtn(float %val, <4 x i32> i
8989
; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0
9090
; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr0
9191
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3
92-
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
92+
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
9393
; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN]]
9494
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0
9595
;
@@ -127,7 +127,7 @@ define amdgpu_ps float @buffer_atomic_fadd_f32_bothen_rtn(float %val, <4 x i32>
127127
; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr0
128128
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY3]], %subreg.sub3
129129
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
130-
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
130+
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
131131
; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN]]
132132
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0
133133
;
@@ -170,7 +170,7 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_offset_rtn(float %val, ptr ad
170170
; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
171171
; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
172172
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3
173-
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
173+
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
174174
; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN]]
175175
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0
176176
;
@@ -217,7 +217,7 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_offen_rtn(float %val, ptr add
217217
; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
218218
; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
219219
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
220-
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
220+
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
221221
; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN]]
222222
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0
223223
;
@@ -265,7 +265,7 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_idxen_rtn(float %val, ptr add
265265
; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
266266
; GFX90A_GFX942-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
267267
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3
268-
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
268+
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
269269
; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN]]
270270
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0
271271
;
@@ -315,7 +315,7 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_bothen_rtn(float %val, ptr ad
315315
; GFX90A_GFX942-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
316316
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY9]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
317317
; GFX90A_GFX942-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
318-
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
318+
; GFX90A_GFX942-NEXT: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:av_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
319319
; GFX90A_GFX942-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN]]
320320
; GFX90A_GFX942-NEXT: SI_RETURN_TO_EPILOG $vgpr0
321321
;

llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -421,19 +421,19 @@ define protected amdgpu_kernel void @fadd(ptr addrspace(1) %p, ptr addrspace(1)
421421
; CHECK: ; %bb.0:
422422
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
423423
; CHECK-NEXT: s_mov_b64 s[4:5], 0
424-
; CHECK-NEXT: v_mov_b32_e32 v1, 0
424+
; CHECK-NEXT: v_mov_b32_e32 v2, 0
425425
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
426426
; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0
427427
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
428-
; CHECK-NEXT: v_mov_b32_e32 v0, s6
428+
; CHECK-NEXT: v_mov_b32_e32 v1, s6
429429
; CHECK-NEXT: .LBB18_1: ; %atomicrmw.start
430430
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
431-
; CHECK-NEXT: v_mov_b32_e32 v3, v0
432-
; CHECK-NEXT: v_add_f32_e32 v2, 1.0, v3
433-
; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc
431+
; CHECK-NEXT: v_add_f32_e32 v0, 1.0, v1
432+
; CHECK-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
434433
; CHECK-NEXT: s_waitcnt vmcnt(0)
435-
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
434+
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
436435
; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
436+
; CHECK-NEXT: v_mov_b32_e32 v1, v0
437437
; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
438438
; CHECK-NEXT: s_cbranch_execnz .LBB18_1
439439
; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -458,19 +458,19 @@ define protected amdgpu_kernel void @fsub(ptr addrspace(1) %p, ptr addrspace(1)
458458
; CHECK: ; %bb.0:
459459
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
460460
; CHECK-NEXT: s_mov_b64 s[4:5], 0
461-
; CHECK-NEXT: v_mov_b32_e32 v1, 0
461+
; CHECK-NEXT: v_mov_b32_e32 v2, 0
462462
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
463463
; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0
464464
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
465-
; CHECK-NEXT: v_mov_b32_e32 v0, s6
465+
; CHECK-NEXT: v_mov_b32_e32 v1, s6
466466
; CHECK-NEXT: .LBB19_1: ; %atomicrmw.start
467467
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
468-
; CHECK-NEXT: v_mov_b32_e32 v3, v0
469-
; CHECK-NEXT: v_add_f32_e32 v2, -1.0, v3
470-
; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc
468+
; CHECK-NEXT: v_add_f32_e32 v0, -1.0, v1
469+
; CHECK-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
471470
; CHECK-NEXT: s_waitcnt vmcnt(0)
472-
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
471+
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
473472
; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
473+
; CHECK-NEXT: v_mov_b32_e32 v1, v0
474474
; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
475475
; CHECK-NEXT: s_cbranch_execnz .LBB19_1
476476
; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end

llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4448,18 +4448,18 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ig
44484448
; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
44494449
; GFX90A: ; %bb.0:
44504450
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4451-
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
4451+
; GFX90A-NEXT: flat_load_dword v5, v[0:1]
44524452
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
44534453
; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start
44544454
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
44554455
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4456-
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
44574456
; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
44584457
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
44594458
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
44604459
; GFX90A-NEXT: buffer_wbinvl1
44614460
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
44624461
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4462+
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
44634463
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
44644464
; GFX90A-NEXT: s_cbranch_execnz .LBB22_1
44654465
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4771,18 +4771,18 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr %ptr,
47714771
; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory:
47724772
; GFX90A: ; %bb.0:
47734773
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4774-
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
4774+
; GFX90A-NEXT: flat_load_dword v5, v[0:1]
47754775
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
47764776
; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start
47774777
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
47784778
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4779-
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
47804779
; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
47814780
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
47824781
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
47834782
; GFX90A-NEXT: buffer_wbinvl1
47844783
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
47854784
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4785+
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
47864786
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
47874787
; GFX90A-NEXT: s_cbranch_execnz .LBB24_1
47884788
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5462,18 +5462,18 @@ define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdg
54625462
; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory:
54635463
; GFX90A: ; %bb.0:
54645464
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5465-
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
5465+
; GFX90A-NEXT: flat_load_dword v5, v[0:1]
54665466
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
54675467
; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start
54685468
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
54695469
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5470-
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
54715470
; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
54725471
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
54735472
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
54745473
; GFX90A-NEXT: buffer_wbinvl1
54755474
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
54765475
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5476+
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
54775477
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
54785478
; GFX90A-NEXT: s_cbranch_execnz .LBB28_1
54795479
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end

0 commit comments

Comments
 (0)