Skip to content

Commit 53c3b21

Browse files
committed
[AMDGPU] Add DAG mutation to improve scheduling before barriers
Add scheduler DAG mutation to add data dependencies between atomic fences and preceding memory reads. This allows some modelling of the impact an atomic fence can have on outstanding memory accesses. This is beneficial when a fence would cause wait count insertion, as more instructions will be scheduled before the fence hiding memory latency. It also reduces the risk of a fence causing a premature wait on all active memory operations.
1 parent e088334 commit 53c3b21

14 files changed

+1075
-908
lines changed
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
//===--- AMDGPUBarrierLatency.cpp - AMDGPU Barrier Latency ----------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
/// \file This file contains a DAG scheduling mutation to add latency to
10+
/// barrier edges between ATOMIC_FENCE instructions and preceding
11+
/// memory accesses potentially affected by the fence.
12+
/// This is beneficial when a fence would cause wait count insertion,
13+
/// as more instructions will be scheduled before the fence hiding
14+
/// memory latency.
15+
/// It also reduces the risk of a fence causing a premature wait
16+
/// on all active memory operations.
17+
//
18+
//===----------------------------------------------------------------------===//
19+
20+
#include "AMDGPUBarrierLatency.h"
21+
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
22+
#include "SIInstrInfo.h"
23+
#include "llvm/CodeGen/ScheduleDAGInstrs.h"
24+
25+
using namespace llvm;
26+
27+
namespace {
28+
29+
class BarrierLatency : public ScheduleDAGMutation {
30+
public:
31+
BarrierLatency() = default;
32+
void apply(ScheduleDAGInstrs *DAG) override;
33+
};
34+
35+
static bool isMemLoad(const MachineInstr *MI) {
36+
auto isLoad = [](const MachineInstr *MI) {
37+
return (SIInstrInfo::isDS(*MI) || SIInstrInfo::isVMEM(*MI) ||
38+
SIInstrInfo::isSMRD(*MI)) &&
39+
MI->mayLoad();
40+
};
41+
42+
if (MI->isBundle()) {
43+
auto I = std::next(MI->getIterator());
44+
return I != MI->getParent()->instr_end() && I->isInsideBundle() &&
45+
isLoad(&*I);
46+
}
47+
48+
return isLoad(MI);
49+
}
50+
51+
/// Adds a large synthetic latency to every barrier edge that runs from a
/// memory load (see isMemLoad) into an ATOMIC_FENCE scheduling unit of
/// \p DAG.
///
/// Both copies of the edge are updated: the one stored in the fence's
/// predecessor list and the matching one in the load's successor list.
void BarrierLatency::apply(ScheduleDAGInstrs *DAG) {
  // Extra latency added to each qualifying load -> fence barrier edge.
  constexpr unsigned SyntheticLatency = 2000;

  for (SUnit &SU : DAG->SUnits) {
    const MachineInstr *MI = SU.getInstr();
    if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
      continue;

    // Update latency on barrier edges of ATOMIC_FENCE whose source is a
    // memory load. The scope of the fence is not considered.
    for (SDep &PredDep : SU.Preds) {
      if (!PredDep.isBarrier())
        continue;
      SUnit *PredSU = PredDep.getSUnit();
      if (!isMemLoad(PredSU->getInstr()))
        continue;

      // Locate the mirror of this edge in the predecessor's successor list.
      // The lookup key is built from PredDep *before* its latency is
      // changed, so the forward edge must be matched and updated first.
      SDep ForwardD = PredDep;
      ForwardD.setSUnit(&SU);
      for (SDep &SuccDep : PredSU->Succs) {
        if (SuccDep == ForwardD) {
          SuccDep.setLatency(SuccDep.getLatency() + SyntheticLatency);
          break;
        }
      }
      PredDep.setLatency(PredDep.getLatency() + SyntheticLatency);

      // The edge PredSU -> SU got longer: that invalidates the cached height
      // of the predecessor (it depends on outgoing edges) and the cached
      // depth of the fence (it depends on incoming edges).
      PredSU->setHeightDirty();
      SU.setDepthDirty();
    }
  }
}
81+
82+
} // end namespace
83+
84+
std::unique_ptr<ScheduleDAGMutation>
85+
llvm::createAMDGPUBarrierLatencyDAGMutation() {
86+
return std::make_unique<BarrierLatency>();
87+
}
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
//===- AMDGPUBarrierLatency.h - AMDGPU Barrier Latency ----------*- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUBARRIERLATENCY_H
10+
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUBARRIERLATENCY_H
11+
12+
#include "llvm/CodeGen/ScheduleDAGMutation.h"
13+
#include <memory>
14+
15+
namespace llvm {
16+
17+
/// Creates the scheduler DAG mutation that adds latency to barrier edges
/// between ATOMIC_FENCE instructions and preceding memory loads.
std::unique_ptr<ScheduleDAGMutation> createAMDGPUBarrierLatencyDAGMutation();
18+
19+
} // namespace llvm
20+
21+
#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUBARRIERLATENCY_H

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include "AMDGPUTargetMachine.h"
1818
#include "AMDGPU.h"
1919
#include "AMDGPUAliasAnalysis.h"
20+
#include "AMDGPUBarrierLatency.h"
2021
#include "AMDGPUCtorDtorLowering.h"
2122
#include "AMDGPUExportClustering.h"
2223
#include "AMDGPUExportKernelRuntimeHandles.h"
@@ -588,6 +589,7 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
588589
DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
589590
DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
590591
DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
592+
DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation());
591593
return DAG;
592594
}
593595

@@ -608,6 +610,7 @@ createGCNMaxMemoryClauseMachineScheduler(MachineSchedContext *C) {
608610
if (ST.shouldClusterStores())
609611
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
610612
DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
613+
DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation());
611614
return DAG;
612615
}
613616

@@ -1158,6 +1161,7 @@ GCNTargetMachine::createPostMachineScheduler(MachineSchedContext *C) const {
11581161
EnableVOPD)
11591162
DAG->addMutation(createVOPDPairingMutation());
11601163
DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
1164+
DAG->addMutation(createAMDGPUBarrierLatencyDAGMutation());
11611165
return DAG;
11621166
}
11631167
//===----------------------------------------------------------------------===//

llvm/lib/Target/AMDGPU/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ add_llvm_target(AMDGPUCodeGen
4949
AMDGPUAsmPrinter.cpp
5050
AMDGPUAtomicOptimizer.cpp
5151
AMDGPUAttributor.cpp
52+
AMDGPUBarrierLatency.cpp
5253
AMDGPUCallLowering.cpp
5354
AMDGPUCodeGenPrepare.cpp
5455
AMDGPUCombinerHelper.cpp

llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll

Lines changed: 24 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1528,9 +1528,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m
15281528
; GFX942-NEXT: buffer_wbl2 sc1
15291529
; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[0:3], 0 offen sc0
15301530
; GFX942-NEXT: s_waitcnt vmcnt(0)
1531-
; GFX942-NEXT: buffer_inv sc1
15321531
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
15331532
; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1533+
; GFX942-NEXT: buffer_inv sc1
15341534
; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5]
15351535
; GFX942-NEXT: s_cbranch_execnz .LBB12_1
15361536
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1576,9 +1576,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m
15761576
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
15771577
; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[16:19], 0 offen glc
15781578
; GFX90A-NEXT: s_waitcnt vmcnt(0)
1579-
; GFX90A-NEXT: buffer_wbinvl1
15801579
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
15811580
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1581+
; GFX90A-NEXT: buffer_wbinvl1
15821582
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
15831583
; GFX90A-NEXT: s_cbranch_execnz .LBB12_1
15841584
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1603,9 +1603,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m
16031603
; GFX908-NEXT: v_mov_b32_e32 v1, v5
16041604
; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[16:19], 0 offen glc
16051605
; GFX908-NEXT: s_waitcnt vmcnt(0)
1606-
; GFX908-NEXT: buffer_wbinvl1
16071606
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
16081607
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1608+
; GFX908-NEXT: buffer_wbinvl1
16091609
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
16101610
; GFX908-NEXT: s_cbranch_execnz .LBB12_1
16111611
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1630,9 +1630,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m
16301630
; GFX8-NEXT: v_mov_b32_e32 v1, v5
16311631
; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[16:19], 0 offen glc
16321632
; GFX8-NEXT: s_waitcnt vmcnt(0)
1633-
; GFX8-NEXT: buffer_wbinvl1
16341633
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
16351634
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1635+
; GFX8-NEXT: buffer_wbinvl1
16361636
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
16371637
; GFX8-NEXT: s_cbranch_execnz .LBB12_1
16381638
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1683,10 +1683,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_
16831683
; GFX942-NEXT: buffer_wbl2 sc1
16841684
; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0
16851685
; GFX942-NEXT: s_waitcnt vmcnt(0)
1686-
; GFX942-NEXT: buffer_inv sc1
16871686
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
1688-
; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
16891687
; GFX942-NEXT: v_mov_b32_e32 v1, v4
1688+
; GFX942-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1689+
; GFX942-NEXT: buffer_inv sc1
16901690
; GFX942-NEXT: s_andn2_b64 exec, exec, s[4:5]
16911691
; GFX942-NEXT: s_cbranch_execnz .LBB13_1
16921692
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1730,10 +1730,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_
17301730
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
17311731
; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
17321732
; GFX90A-NEXT: s_waitcnt vmcnt(0)
1733-
; GFX90A-NEXT: buffer_wbinvl1
17341733
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
1735-
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
17361734
; GFX90A-NEXT: v_mov_b32_e32 v1, v4
1735+
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1736+
; GFX90A-NEXT: buffer_wbinvl1
17371737
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
17381738
; GFX90A-NEXT: s_cbranch_execnz .LBB13_1
17391739
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1756,10 +1756,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_
17561756
; GFX908-NEXT: v_mov_b32_e32 v4, v0
17571757
; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
17581758
; GFX908-NEXT: s_waitcnt vmcnt(0)
1759-
; GFX908-NEXT: buffer_wbinvl1
17601759
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
1761-
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
17621760
; GFX908-NEXT: v_mov_b32_e32 v1, v4
1761+
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1762+
; GFX908-NEXT: buffer_wbinvl1
17631763
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
17641764
; GFX908-NEXT: s_cbranch_execnz .LBB13_1
17651765
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1782,10 +1782,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_
17821782
; GFX8-NEXT: v_mov_b32_e32 v4, v0
17831783
; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
17841784
; GFX8-NEXT: s_waitcnt vmcnt(0)
1785-
; GFX8-NEXT: buffer_wbinvl1
17861785
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
1787-
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
17881786
; GFX8-NEXT: v_mov_b32_e32 v1, v4
1787+
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1788+
; GFX8-NEXT: buffer_wbinvl1
17891789
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
17901790
; GFX8-NEXT: s_cbranch_execnz .LBB13_1
17911791
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1830,9 +1830,9 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
18301830
; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
18311831
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
18321832
; GFX12-NEXT: s_wait_loadcnt 0x0
1833-
; GFX12-NEXT: global_inv scope:SCOPE_DEV
18341833
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
18351834
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
1835+
; GFX12-NEXT: global_inv scope:SCOPE_DEV
18361836
; GFX12-NEXT: s_wait_alu 0xfffe
18371837
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
18381838
; GFX12-NEXT: s_cbranch_execnz .LBB14_1
@@ -1872,11 +1872,10 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
18721872
; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
18731873
; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
18741874
; GFX11-NEXT: s_waitcnt vmcnt(0)
1875-
; GFX11-NEXT: buffer_gl1_inv
1876-
; GFX11-NEXT: buffer_gl0_inv
18771875
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
18781876
; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
1879-
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1877+
; GFX11-NEXT: buffer_gl1_inv
1878+
; GFX11-NEXT: buffer_gl0_inv
18801879
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
18811880
; GFX11-NEXT: s_cbranch_execnz .LBB14_1
18821881
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1925,9 +1924,9 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
19251924
; GFX908-NEXT: v_mov_b32_e32 v3, v10
19261925
; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
19271926
; GFX908-NEXT: s_waitcnt vmcnt(0)
1928-
; GFX908-NEXT: buffer_wbinvl1
19291927
; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
19301928
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1929+
; GFX908-NEXT: buffer_wbinvl1
19311930
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
19321931
; GFX908-NEXT: s_cbranch_execnz .LBB14_1
19331932
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1956,9 +1955,9 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
19561955
; GFX8-NEXT: v_mov_b32_e32 v3, v10
19571956
; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
19581957
; GFX8-NEXT: s_waitcnt vmcnt(0)
1959-
; GFX8-NEXT: buffer_wbinvl1
19601958
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
19611959
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1960+
; GFX8-NEXT: buffer_wbinvl1
19621961
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
19631962
; GFX8-NEXT: s_cbranch_execnz .LBB14_1
19641963
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2000,10 +1999,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
20001999
; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
20012000
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
20022001
; GFX12-NEXT: s_wait_loadcnt 0x0
2003-
; GFX12-NEXT: global_inv scope:SCOPE_DEV
20042002
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
20052003
; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
20062004
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
2005+
; GFX12-NEXT: global_inv scope:SCOPE_DEV
20072006
; GFX12-NEXT: s_wait_alu 0xfffe
20082007
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
20092008
; GFX12-NEXT: s_cbranch_execnz .LBB15_1
@@ -2040,12 +2039,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
20402039
; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
20412040
; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc
20422041
; GFX11-NEXT: s_waitcnt vmcnt(0)
2043-
; GFX11-NEXT: buffer_gl1_inv
2044-
; GFX11-NEXT: buffer_gl0_inv
20452042
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
20462043
; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
20472044
; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
2048-
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2045+
; GFX11-NEXT: buffer_gl1_inv
2046+
; GFX11-NEXT: buffer_gl0_inv
20492047
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
20502048
; GFX11-NEXT: s_cbranch_execnz .LBB15_1
20512049
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2090,11 +2088,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
20902088
; GFX908-NEXT: v_mov_b32_e32 v7, v0
20912089
; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
20922090
; GFX908-NEXT: s_waitcnt vmcnt(0)
2093-
; GFX908-NEXT: buffer_wbinvl1
20942091
; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
20952092
; GFX908-NEXT: v_mov_b32_e32 v2, v7
2096-
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
20972093
; GFX908-NEXT: v_mov_b32_e32 v3, v8
2094+
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
2095+
; GFX908-NEXT: buffer_wbinvl1
20982096
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
20992097
; GFX908-NEXT: s_cbranch_execnz .LBB15_1
21002098
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2119,11 +2117,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
21192117
; GFX8-NEXT: v_mov_b32_e32 v7, v0
21202118
; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
21212119
; GFX8-NEXT: s_waitcnt vmcnt(0)
2122-
; GFX8-NEXT: buffer_wbinvl1
21232120
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
21242121
; GFX8-NEXT: v_mov_b32_e32 v2, v7
2125-
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
21262122
; GFX8-NEXT: v_mov_b32_e32 v3, v8
2123+
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
2124+
; GFX8-NEXT: buffer_wbinvl1
21272125
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
21282126
; GFX8-NEXT: s_cbranch_execnz .LBB15_1
21292127
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end

0 commit comments

Comments
 (0)