Skip to content

Commit 163ae0d

Browse files
committed
Checking for targets with native 64-bit add/sub support
1 parent 991f9b6 commit 163ae0d

File tree

3 files changed: 506 additions and 25 deletions.

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 37 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -5552,31 +5552,43 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
55525552
}
55535553
case AMDGPU::S_ADD_U64_PSEUDO:
55545554
case AMDGPU::S_SUB_U64_PSEUDO: {
5555-
unsigned NewOpc1 = Opc == AMDGPU::S_ADD_U64_PSEUDO ? AMDGPU::S_ADD_U32
5556-
: AMDGPU::S_SUB_U32;
5557-
unsigned NewOpc2 = Opc == AMDGPU::S_ADD_U64_PSEUDO ? AMDGPU::S_ADDC_U32
5558-
: AMDGPU::S_SUBB_U32;
5559-
Register DestLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5560-
Register DestHi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5561-
MachineOperand Accumlo = TII->buildExtractSubRegOrImm(
5562-
MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub0,
5563-
&AMDGPU::SReg_32RegClass);
5564-
MachineOperand Accumhi = TII->buildExtractSubRegOrImm(
5565-
MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub1,
5566-
&AMDGPU::SReg_32RegClass);
5567-
BuildMI(*ComputeLoop, I, DL, TII->get(NewOpc1), DestLo)
5568-
.add(Accumlo)
5569-
.addReg(LaneValueLo->getOperand(0).getReg());
5570-
BuildMI(*ComputeLoop, I, DL, TII->get(NewOpc2), DestHi)
5571-
.add(Accumhi)
5572-
.addReg(LaneValueHi->getOperand(0).getReg())
5573-
.setOperandDead(3); // Dead scc
5574-
NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5575-
TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5576-
.addReg(DestLo)
5577-
.addImm(AMDGPU::sub0)
5578-
.addReg(DestHi)
5579-
.addImm(AMDGPU::sub1);
5555+
if (ST.hasScalarAddSub64()) {
5556+
NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5557+
TII->get(Opc == AMDGPU::S_ADD_U64_PSEUDO
5558+
? AMDGPU::S_ADD_U64
5559+
: AMDGPU::S_SUB_U64),
5560+
DstReg)
5561+
.addReg(Accumulator->getOperand(0).getReg())
5562+
.addReg(LaneValue->getOperand(0).getReg());
5563+
} else {
5564+
unsigned NewOpc1 = Opc == AMDGPU::S_ADD_U64_PSEUDO
5565+
? AMDGPU::S_ADD_U32
5566+
: AMDGPU::S_SUB_U32;
5567+
unsigned NewOpc2 = Opc == AMDGPU::S_ADD_U64_PSEUDO
5568+
? AMDGPU::S_ADDC_U32
5569+
: AMDGPU::S_SUBB_U32;
5570+
Register DestLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5571+
Register DestHi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5572+
MachineOperand Accumlo = TII->buildExtractSubRegOrImm(
5573+
MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub0,
5574+
&AMDGPU::SReg_32RegClass);
5575+
MachineOperand Accumhi = TII->buildExtractSubRegOrImm(
5576+
MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub1,
5577+
&AMDGPU::SReg_32RegClass);
5578+
BuildMI(*ComputeLoop, I, DL, TII->get(NewOpc1), DestLo)
5579+
.add(Accumlo)
5580+
.addReg(LaneValueLo->getOperand(0).getReg());
5581+
BuildMI(*ComputeLoop, I, DL, TII->get(NewOpc2), DestHi)
5582+
.add(Accumhi)
5583+
.addReg(LaneValueHi->getOperand(0).getReg())
5584+
.setOperandDead(3); // Dead scc
5585+
NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5586+
TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5587+
.addReg(DestLo)
5588+
.addImm(AMDGPU::sub0)
5589+
.addReg(DestHi)
5590+
.addImm(AMDGPU::sub1);
5591+
}
55805592
break;
55815593
}
55825594
}

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll

Lines changed: 219 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s
1212
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s
1313
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s
14+
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -global-isel=0 < %s | FileCheck -check-prefixes=GFX12DAGISEL %s
1415

1516
define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
1617
; GFX8DAGISEL-LABEL: uniform_value:
@@ -181,6 +182,18 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
181182
; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2
182183
; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
183184
; GFX1132GISEL-NEXT: s_endpgm
185+
;
186+
; GFX12DAGISEL-LABEL: uniform_value:
187+
; GFX12DAGISEL: ; %bb.0: ; %entry
188+
; GFX12DAGISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
189+
; GFX12DAGISEL-NEXT: s_mov_b32 s3, exec_lo
190+
; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
191+
; GFX12DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3
192+
; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
193+
; GFX12DAGISEL-NEXT: s_mul_i32 s2, s2, s3
194+
; GFX12DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
195+
; GFX12DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
196+
; GFX12DAGISEL-NEXT: s_endpgm
184197
entry:
185198
%result = call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 %in, i32 1)
186199
store i32 %result, ptr addrspace(1) %out
@@ -337,6 +350,19 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
337350
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
338351
; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
339352
; GFX1132GISEL-NEXT: s_endpgm
353+
;
354+
; GFX12DAGISEL-LABEL: const_value:
355+
; GFX12DAGISEL: ; %bb.0: ; %entry
356+
; GFX12DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
357+
; GFX12DAGISEL-NEXT: s_mov_b32 s2, exec_lo
358+
; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
359+
; GFX12DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
360+
; GFX12DAGISEL-NEXT: s_mulk_i32 s2, 0x7b
361+
; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
362+
; GFX12DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
363+
; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
364+
; GFX12DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
365+
; GFX12DAGISEL-NEXT: s_endpgm
340366
entry:
341367
%result = call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 123, i32 1)
342368
store i32 %result, ptr addrspace(1) %out
@@ -492,6 +518,18 @@ define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) {
492518
; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2
493519
; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
494520
; GFX1132GISEL-NEXT: s_endpgm
521+
;
522+
; GFX12DAGISEL-LABEL: poison_value:
523+
; GFX12DAGISEL: ; %bb.0: ; %entry
524+
; GFX12DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
525+
; GFX12DAGISEL-NEXT: s_mov_b32 s2, exec_lo
526+
; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
527+
; GFX12DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
528+
; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
529+
; GFX12DAGISEL-NEXT: s_mul_i32 s2, s0, s2
530+
; GFX12DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
531+
; GFX12DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
532+
; GFX12DAGISEL-NEXT: s_endpgm
495533
entry:
496534
%result = call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 poison, i32 1)
497535
store i32 %result, ptr addrspace(1) %out
@@ -734,6 +772,26 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
734772
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
735773
; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
736774
; GFX1132GISEL-NEXT: s_endpgm
775+
;
776+
; GFX12DAGISEL-LABEL: divergent_value:
777+
; GFX12DAGISEL: ; %bb.0: ; %entry
778+
; GFX12DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
779+
; GFX12DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
780+
; GFX12DAGISEL-NEXT: s_mov_b32 s3, exec_lo
781+
; GFX12DAGISEL-NEXT: s_mov_b32 s2, 0
782+
; GFX12DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
783+
; GFX12DAGISEL-NEXT: s_ctz_i32_b32 s4, s3
784+
; GFX12DAGISEL-NEXT: s_wait_alu 0xfffe
785+
; GFX12DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
786+
; GFX12DAGISEL-NEXT: s_bitset0_b32 s3, s4
787+
; GFX12DAGISEL-NEXT: s_add_co_i32 s2, s2, s5
788+
; GFX12DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
789+
; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
790+
; GFX12DAGISEL-NEXT: ; %bb.2:
791+
; GFX12DAGISEL-NEXT: v_mov_b32_e32 v0, s2
792+
; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
793+
; GFX12DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
794+
; GFX12DAGISEL-NEXT: s_endpgm
737795
entry:
738796
%id.x = call i32 @llvm.amdgcn.workitem.id.x()
739797
%result = call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 %id.x, i32 1)
@@ -1208,6 +1266,50 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
12081266
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
12091267
; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
12101268
; GFX1132GISEL-NEXT: s_endpgm
1269+
;
1270+
; GFX12DAGISEL-LABEL: divergent_cfg:
1271+
; GFX12DAGISEL: ; %bb.0: ; %entry
1272+
; GFX12DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1273+
; GFX12DAGISEL-NEXT: s_mov_b32 s0, exec_lo
1274+
; GFX12DAGISEL-NEXT: ; implicit-def: $sgpr1
1275+
; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
1276+
; GFX12DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
1277+
; GFX12DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0
1278+
; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB4_2
1279+
; GFX12DAGISEL-NEXT: ; %bb.1: ; %else
1280+
; GFX12DAGISEL-NEXT: s_load_b32 s1, s[4:5], 0x2c
1281+
; GFX12DAGISEL-NEXT: s_mov_b32 s2, exec_lo
1282+
; GFX12DAGISEL-NEXT: ; implicit-def: $vgpr0
1283+
; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1284+
; GFX12DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
1285+
; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
1286+
; GFX12DAGISEL-NEXT: s_mul_i32 s1, s1, s2
1287+
; GFX12DAGISEL-NEXT: .LBB4_2: ; %Flow
1288+
; GFX12DAGISEL-NEXT: s_or_saveexec_b32 s0, s0
1289+
; GFX12DAGISEL-NEXT: v_mov_b32_e32 v1, s1
1290+
; GFX12DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0
1291+
; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB4_6
1292+
; GFX12DAGISEL-NEXT: ; %bb.3: ; %if
1293+
; GFX12DAGISEL-NEXT: s_mov_b32 s2, exec_lo
1294+
; GFX12DAGISEL-NEXT: s_mov_b32 s1, 0
1295+
; GFX12DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1
1296+
; GFX12DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
1297+
; GFX12DAGISEL-NEXT: s_wait_alu 0xfffe
1298+
; GFX12DAGISEL-NEXT: v_readlane_b32 s6, v0, s3
1299+
; GFX12DAGISEL-NEXT: s_bitset0_b32 s2, s3
1300+
; GFX12DAGISEL-NEXT: s_add_co_i32 s1, s1, s6
1301+
; GFX12DAGISEL-NEXT: s_wait_alu 0xfffe
1302+
; GFX12DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
1303+
; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4
1304+
; GFX12DAGISEL-NEXT: ; %bb.5:
1305+
; GFX12DAGISEL-NEXT: v_mov_b32_e32 v1, s1
1306+
; GFX12DAGISEL-NEXT: .LBB4_6: ; %endif
1307+
; GFX12DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
1308+
; GFX12DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
1309+
; GFX12DAGISEL-NEXT: v_mov_b32_e32 v0, 0
1310+
; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
1311+
; GFX12DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
1312+
; GFX12DAGISEL-NEXT: s_endpgm
12111313
entry:
12121314
%tid = call i32 @llvm.amdgcn.workitem.id.x()
12131315
%d_cmp = icmp ult i32 %tid, 16
@@ -1421,6 +1523,22 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) {
14211523
; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
14221524
; GFX1132GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
14231525
; GFX1132GISEL-NEXT: s_endpgm
1526+
;
1527+
; GFX12DAGISEL-LABEL: uniform_value_i64:
1528+
; GFX12DAGISEL: ; %bb.0: ; %entry
1529+
; GFX12DAGISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
1530+
; GFX12DAGISEL-NEXT: s_mov_b32 s4, exec_lo
1531+
; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1532+
; GFX12DAGISEL-NEXT: s_bcnt1_i32_b32 s4, s4
1533+
; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
1534+
; GFX12DAGISEL-NEXT: s_mul_hi_u32 s5, s2, s4
1535+
; GFX12DAGISEL-NEXT: s_mul_i32 s3, s3, s4
1536+
; GFX12DAGISEL-NEXT: s_mul_i32 s2, s2, s4
1537+
; GFX12DAGISEL-NEXT: s_add_co_u32 s3, s5, s3
1538+
; GFX12DAGISEL-NEXT: v_mov_b32_e32 v0, s2
1539+
; GFX12DAGISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
1540+
; GFX12DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
1541+
; GFX12DAGISEL-NEXT: s_endpgm
14241542
entry:
14251543
%result = call i64 @llvm.amdgcn.wave.reduce.add.i64(i64 %in, i32 1)
14261544
store i64 %result, ptr addrspace(1) %out
@@ -1623,6 +1741,22 @@ define amdgpu_kernel void @const_value_i64(ptr addrspace(1) %out) {
16231741
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
16241742
; GFX1132GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
16251743
; GFX1132GISEL-NEXT: s_endpgm
1744+
;
1745+
; GFX12DAGISEL-LABEL: const_value_i64:
1746+
; GFX12DAGISEL: ; %bb.0: ; %entry
1747+
; GFX12DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
1748+
; GFX12DAGISEL-NEXT: s_mov_b32 s2, exec_lo
1749+
; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1750+
; GFX12DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
1751+
; GFX12DAGISEL-NEXT: s_mul_hi_u32 s3, 0x7b, s2
1752+
; GFX12DAGISEL-NEXT: s_mul_i32 s4, s2, 0
1753+
; GFX12DAGISEL-NEXT: s_mulk_i32 s2, 0x7b
1754+
; GFX12DAGISEL-NEXT: s_add_co_u32 s3, s3, s4
1755+
; GFX12DAGISEL-NEXT: v_mov_b32_e32 v0, s2
1756+
; GFX12DAGISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
1757+
; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
1758+
; GFX12DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
1759+
; GFX12DAGISEL-NEXT: s_endpgm
16261760
entry:
16271761
%result = call i64 @llvm.amdgcn.wave.reduce.add.i64(i64 123, i32 1)
16281762
store i64 %result, ptr addrspace(1) %out
@@ -1823,6 +1957,22 @@ define amdgpu_kernel void @poison_value_i64(ptr addrspace(1) %out, i64 %in) {
18231957
; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
18241958
; GFX1132GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
18251959
; GFX1132GISEL-NEXT: s_endpgm
1960+
;
1961+
; GFX12DAGISEL-LABEL: poison_value_i64:
1962+
; GFX12DAGISEL: ; %bb.0: ; %entry
1963+
; GFX12DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
1964+
; GFX12DAGISEL-NEXT: s_mov_b32 s2, exec_lo
1965+
; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1966+
; GFX12DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2
1967+
; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
1968+
; GFX12DAGISEL-NEXT: s_mul_hi_u32 s3, s0, s2
1969+
; GFX12DAGISEL-NEXT: s_mul_i32 s4, s1, s2
1970+
; GFX12DAGISEL-NEXT: s_mul_i32 s2, s0, s2
1971+
; GFX12DAGISEL-NEXT: s_add_co_u32 s3, s3, s4
1972+
; GFX12DAGISEL-NEXT: v_mov_b32_e32 v0, s2
1973+
; GFX12DAGISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
1974+
; GFX12DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
1975+
; GFX12DAGISEL-NEXT: s_endpgm
18261976
entry:
18271977
%result = call i64 @llvm.amdgcn.wave.reduce.add.i64(i64 poison, i32 1)
18281978
store i64 %result, ptr addrspace(1) %out
@@ -2075,6 +2225,32 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
20752225
; GFX1132GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
20762226
; GFX1132GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
20772227
; GFX1132GISEL-NEXT: s_setpc_b64 s[30:31]
2228+
;
2229+
; GFX12DAGISEL-LABEL: divergent_value_i64:
2230+
; GFX12DAGISEL: ; %bb.0: ; %entry
2231+
; GFX12DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
2232+
; GFX12DAGISEL-NEXT: s_wait_expcnt 0x0
2233+
; GFX12DAGISEL-NEXT: s_wait_samplecnt 0x0
2234+
; GFX12DAGISEL-NEXT: s_wait_bvhcnt 0x0
2235+
; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
2236+
; GFX12DAGISEL-NEXT: s_mov_b64 s[0:1], 0
2237+
; GFX12DAGISEL-NEXT: s_mov_b32 s2, exec_lo
2238+
; GFX12DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
2239+
; GFX12DAGISEL-NEXT: s_wait_alu 0xfffe
2240+
; GFX12DAGISEL-NEXT: s_ctz_i32_b32 s3, s2
2241+
; GFX12DAGISEL-NEXT: s_wait_alu 0xfffe
2242+
; GFX12DAGISEL-NEXT: v_readlane_b32 s4, v2, s3
2243+
; GFX12DAGISEL-NEXT: v_readlane_b32 s5, v3, s3
2244+
; GFX12DAGISEL-NEXT: s_bitset0_b32 s2, s3
2245+
; GFX12DAGISEL-NEXT: s_wait_alu 0xfffe
2246+
; GFX12DAGISEL-NEXT: s_cmp_lg_u32 s2, 0
2247+
; GFX12DAGISEL-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
2248+
; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1
2249+
; GFX12DAGISEL-NEXT: ; %bb.2:
2250+
; GFX12DAGISEL-NEXT: s_wait_alu 0xfffe
2251+
; GFX12DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
2252+
; GFX12DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
2253+
; GFX12DAGISEL-NEXT: s_setpc_b64 s[30:31]
20782254
entry:
20792255
%result = call i64 @llvm.amdgcn.wave.reduce.add.i64(i64 %id.x, i32 1)
20802256
store i64 %result, ptr addrspace(1) %out
@@ -2552,6 +2728,49 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
25522728
; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, 0
25532729
; GFX1132GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
25542730
; GFX1132GISEL-NEXT: s_endpgm
2731+
;
2732+
; GFX12DAGISEL-LABEL: divergent_cfg_i64:
2733+
; GFX12DAGISEL: ; %bb.0: ; %entry
2734+
; GFX12DAGISEL-NEXT: s_clause 0x1
2735+
; GFX12DAGISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
2736+
; GFX12DAGISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
2737+
; GFX12DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2738+
; GFX12DAGISEL-NEXT: s_mov_b32 s8, exec_lo
2739+
; GFX12DAGISEL-NEXT: ; implicit-def: $sgpr6_sgpr7
2740+
; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
2741+
; GFX12DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0
2742+
; GFX12DAGISEL-NEXT: s_xor_b32 s8, exec_lo, s8
2743+
; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB9_2
2744+
; GFX12DAGISEL-NEXT: ; %bb.1: ; %else
2745+
; GFX12DAGISEL-NEXT: s_mov_b32 s6, exec_lo
2746+
; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2747+
; GFX12DAGISEL-NEXT: s_bcnt1_i32_b32 s6, s6
2748+
; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
2749+
; GFX12DAGISEL-NEXT: s_mul_hi_u32 s7, s2, s6
2750+
; GFX12DAGISEL-NEXT: s_mul_i32 s3, s3, s6
2751+
; GFX12DAGISEL-NEXT: s_mul_i32 s6, s2, s6
2752+
; GFX12DAGISEL-NEXT: s_add_co_u32 s7, s7, s3
2753+
; GFX12DAGISEL-NEXT: .LBB9_2: ; %Flow
2754+
; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0
2755+
; GFX12DAGISEL-NEXT: s_or_saveexec_b32 s2, s8
2756+
; GFX12DAGISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
2757+
; GFX12DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2
2758+
; GFX12DAGISEL-NEXT: ; %bb.3: ; %if
2759+
; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
2760+
; GFX12DAGISEL-NEXT: s_mov_b32 s3, exec_lo
2761+
; GFX12DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3
2762+
; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
2763+
; GFX12DAGISEL-NEXT: s_mul_hi_u32 s6, s4, s3
2764+
; GFX12DAGISEL-NEXT: s_mul_i32 s5, s5, s3
2765+
; GFX12DAGISEL-NEXT: s_mul_i32 s4, s4, s3
2766+
; GFX12DAGISEL-NEXT: s_wait_alu 0xfffe
2767+
; GFX12DAGISEL-NEXT: s_add_co_u32 s5, s6, s5
2768+
; GFX12DAGISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
2769+
; GFX12DAGISEL-NEXT: ; %bb.4: ; %endif
2770+
; GFX12DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
2771+
; GFX12DAGISEL-NEXT: v_mov_b32_e32 v2, 0
2772+
; GFX12DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
2773+
; GFX12DAGISEL-NEXT: s_endpgm
25552774
entry:
25562775
%tid = call i32 @llvm.amdgcn.workitem.id.x()
25572776
%d_cmp = icmp ult i32 %tid, 16

0 commit comments

Comments
 (0)