|
11 | 11 | ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s |
12 | 12 | ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s |
13 | 13 | ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s |
| 14 | +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -global-isel=0 < %s | FileCheck -check-prefixes=GFX12DAGISEL %s |
14 | 15 |
|
15 | 16 | define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { |
16 | 17 | ; GFX8DAGISEL-LABEL: uniform_value: |
@@ -181,6 +182,18 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { |
181 | 182 | ; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2 |
182 | 183 | ; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] |
183 | 184 | ; GFX1132GISEL-NEXT: s_endpgm |
| 185 | +; |
| 186 | +; GFX12DAGISEL-LABEL: uniform_value: |
| 187 | +; GFX12DAGISEL: ; %bb.0: ; %entry |
| 188 | +; GFX12DAGISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 |
| 189 | +; GFX12DAGISEL-NEXT: s_mov_b32 s3, exec_lo |
| 190 | +; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) |
| 191 | +; GFX12DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 |
| 192 | +; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 |
| 193 | +; GFX12DAGISEL-NEXT: s_mul_i32 s2, s2, s3 |
| 194 | +; GFX12DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 |
| 195 | +; GFX12DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] |
| 196 | +; GFX12DAGISEL-NEXT: s_endpgm |
184 | 197 | entry: |
185 | 198 | %result = call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 %in, i32 1) |
186 | 199 | store i32 %result, ptr addrspace(1) %out |
@@ -337,6 +350,19 @@ define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { |
337 | 350 | ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) |
338 | 351 | ; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] |
339 | 352 | ; GFX1132GISEL-NEXT: s_endpgm |
| 353 | +; |
| 354 | +; GFX12DAGISEL-LABEL: const_value: |
| 355 | +; GFX12DAGISEL: ; %bb.0: ; %entry |
| 356 | +; GFX12DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 |
| 357 | +; GFX12DAGISEL-NEXT: s_mov_b32 s2, exec_lo |
| 358 | +; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) |
| 359 | +; GFX12DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 |
| 360 | +; GFX12DAGISEL-NEXT: s_mulk_i32 s2, 0x7b |
| 361 | +; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| 362 | +; GFX12DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 |
| 363 | +; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 |
| 364 | +; GFX12DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] |
| 365 | +; GFX12DAGISEL-NEXT: s_endpgm |
340 | 366 | entry: |
341 | 367 | %result = call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 123, i32 1) |
342 | 368 | store i32 %result, ptr addrspace(1) %out |
@@ -492,6 +518,18 @@ define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) { |
492 | 518 | ; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2 |
493 | 519 | ; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] |
494 | 520 | ; GFX1132GISEL-NEXT: s_endpgm |
| 521 | +; |
| 522 | +; GFX12DAGISEL-LABEL: poison_value: |
| 523 | +; GFX12DAGISEL: ; %bb.0: ; %entry |
| 524 | +; GFX12DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 |
| 525 | +; GFX12DAGISEL-NEXT: s_mov_b32 s2, exec_lo |
| 526 | +; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) |
| 527 | +; GFX12DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 |
| 528 | +; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 |
| 529 | +; GFX12DAGISEL-NEXT: s_mul_i32 s2, s0, s2 |
| 530 | +; GFX12DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 |
| 531 | +; GFX12DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] |
| 532 | +; GFX12DAGISEL-NEXT: s_endpgm |
495 | 533 | entry: |
496 | 534 | %result = call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 poison, i32 1) |
497 | 535 | store i32 %result, ptr addrspace(1) %out |
@@ -734,6 +772,26 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) { |
734 | 772 | ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) |
735 | 773 | ; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] |
736 | 774 | ; GFX1132GISEL-NEXT: s_endpgm |
| 775 | +; |
| 776 | +; GFX12DAGISEL-LABEL: divergent_value: |
| 777 | +; GFX12DAGISEL: ; %bb.0: ; %entry |
| 778 | +; GFX12DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 |
| 779 | +; GFX12DAGISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 |
| 780 | +; GFX12DAGISEL-NEXT: s_mov_b32 s3, exec_lo |
| 781 | +; GFX12DAGISEL-NEXT: s_mov_b32 s2, 0 |
| 782 | +; GFX12DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 |
| 783 | +; GFX12DAGISEL-NEXT: s_ctz_i32_b32 s4, s3 |
| 784 | +; GFX12DAGISEL-NEXT: s_wait_alu 0xfffe |
| 785 | +; GFX12DAGISEL-NEXT: v_readlane_b32 s5, v0, s4 |
| 786 | +; GFX12DAGISEL-NEXT: s_bitset0_b32 s3, s4 |
| 787 | +; GFX12DAGISEL-NEXT: s_add_co_i32 s2, s2, s5 |
| 788 | +; GFX12DAGISEL-NEXT: s_cmp_lg_u32 s3, 0 |
| 789 | +; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1 |
| 790 | +; GFX12DAGISEL-NEXT: ; %bb.2: |
| 791 | +; GFX12DAGISEL-NEXT: v_mov_b32_e32 v0, s2 |
| 792 | +; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 |
| 793 | +; GFX12DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] |
| 794 | +; GFX12DAGISEL-NEXT: s_endpgm |
737 | 795 | entry: |
738 | 796 | %id.x = call i32 @llvm.amdgcn.workitem.id.x() |
739 | 797 | %result = call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 %id.x, i32 1) |
@@ -1208,6 +1266,50 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { |
1208 | 1266 | ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) |
1209 | 1267 | ; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3] |
1210 | 1268 | ; GFX1132GISEL-NEXT: s_endpgm |
| 1269 | +; |
| 1270 | +; GFX12DAGISEL-LABEL: divergent_cfg: |
| 1271 | +; GFX12DAGISEL: ; %bb.0: ; %entry |
| 1272 | +; GFX12DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| 1273 | +; GFX12DAGISEL-NEXT: s_mov_b32 s0, exec_lo |
| 1274 | +; GFX12DAGISEL-NEXT: ; implicit-def: $sgpr1 |
| 1275 | +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| 1276 | +; GFX12DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 |
| 1277 | +; GFX12DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 |
| 1278 | +; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB4_2 |
| 1279 | +; GFX12DAGISEL-NEXT: ; %bb.1: ; %else |
| 1280 | +; GFX12DAGISEL-NEXT: s_load_b32 s1, s[4:5], 0x2c |
| 1281 | +; GFX12DAGISEL-NEXT: s_mov_b32 s2, exec_lo |
| 1282 | +; GFX12DAGISEL-NEXT: ; implicit-def: $vgpr0 |
| 1283 | +; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| 1284 | +; GFX12DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 |
| 1285 | +; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 |
| 1286 | +; GFX12DAGISEL-NEXT: s_mul_i32 s1, s1, s2 |
| 1287 | +; GFX12DAGISEL-NEXT: .LBB4_2: ; %Flow |
| 1288 | +; GFX12DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 |
| 1289 | +; GFX12DAGISEL-NEXT: v_mov_b32_e32 v1, s1 |
| 1290 | +; GFX12DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 |
| 1291 | +; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB4_6 |
| 1292 | +; GFX12DAGISEL-NEXT: ; %bb.3: ; %if |
| 1293 | +; GFX12DAGISEL-NEXT: s_mov_b32 s2, exec_lo |
| 1294 | +; GFX12DAGISEL-NEXT: s_mov_b32 s1, 0 |
| 1295 | +; GFX12DAGISEL-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 |
| 1296 | +; GFX12DAGISEL-NEXT: s_ctz_i32_b32 s3, s2 |
| 1297 | +; GFX12DAGISEL-NEXT: s_wait_alu 0xfffe |
| 1298 | +; GFX12DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 |
| 1299 | +; GFX12DAGISEL-NEXT: s_bitset0_b32 s2, s3 |
| 1300 | +; GFX12DAGISEL-NEXT: s_add_co_i32 s1, s1, s6 |
| 1301 | +; GFX12DAGISEL-NEXT: s_wait_alu 0xfffe |
| 1302 | +; GFX12DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 |
| 1303 | +; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB4_4 |
| 1304 | +; GFX12DAGISEL-NEXT: ; %bb.5: |
| 1305 | +; GFX12DAGISEL-NEXT: v_mov_b32_e32 v1, s1 |
| 1306 | +; GFX12DAGISEL-NEXT: .LBB4_6: ; %endif |
| 1307 | +; GFX12DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 |
| 1308 | +; GFX12DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 |
| 1309 | +; GFX12DAGISEL-NEXT: v_mov_b32_e32 v0, 0 |
| 1310 | +; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 |
| 1311 | +; GFX12DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] |
| 1312 | +; GFX12DAGISEL-NEXT: s_endpgm |
1211 | 1313 | entry: |
1212 | 1314 | %tid = call i32 @llvm.amdgcn.workitem.id.x() |
1213 | 1315 | %d_cmp = icmp ult i32 %tid, 16 |
@@ -1421,6 +1523,22 @@ define amdgpu_kernel void @uniform_value_i64(ptr addrspace(1) %out, i64 %in) { |
1421 | 1523 | ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 |
1422 | 1524 | ; GFX1132GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] |
1423 | 1525 | ; GFX1132GISEL-NEXT: s_endpgm |
| 1526 | +; |
| 1527 | +; GFX12DAGISEL-LABEL: uniform_value_i64: |
| 1528 | +; GFX12DAGISEL: ; %bb.0: ; %entry |
| 1529 | +; GFX12DAGISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| 1530 | +; GFX12DAGISEL-NEXT: s_mov_b32 s4, exec_lo |
| 1531 | +; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| 1532 | +; GFX12DAGISEL-NEXT: s_bcnt1_i32_b32 s4, s4 |
| 1533 | +; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 |
| 1534 | +; GFX12DAGISEL-NEXT: s_mul_hi_u32 s5, s2, s4 |
| 1535 | +; GFX12DAGISEL-NEXT: s_mul_i32 s3, s3, s4 |
| 1536 | +; GFX12DAGISEL-NEXT: s_mul_i32 s2, s2, s4 |
| 1537 | +; GFX12DAGISEL-NEXT: s_add_co_u32 s3, s5, s3 |
| 1538 | +; GFX12DAGISEL-NEXT: v_mov_b32_e32 v0, s2 |
| 1539 | +; GFX12DAGISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 |
| 1540 | +; GFX12DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] |
| 1541 | +; GFX12DAGISEL-NEXT: s_endpgm |
1424 | 1542 | entry: |
1425 | 1543 | %result = call i64 @llvm.amdgcn.wave.reduce.add.i64(i64 %in, i32 1) |
1426 | 1544 | store i64 %result, ptr addrspace(1) %out |
@@ -1623,6 +1741,22 @@ define amdgpu_kernel void @const_value_i64(ptr addrspace(1) %out) { |
1623 | 1741 | ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) |
1624 | 1742 | ; GFX1132GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] |
1625 | 1743 | ; GFX1132GISEL-NEXT: s_endpgm |
| 1744 | +; |
| 1745 | +; GFX12DAGISEL-LABEL: const_value_i64: |
| 1746 | +; GFX12DAGISEL: ; %bb.0: ; %entry |
| 1747 | +; GFX12DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 |
| 1748 | +; GFX12DAGISEL-NEXT: s_mov_b32 s2, exec_lo |
| 1749 | +; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) |
| 1750 | +; GFX12DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 |
| 1751 | +; GFX12DAGISEL-NEXT: s_mul_hi_u32 s3, 0x7b, s2 |
| 1752 | +; GFX12DAGISEL-NEXT: s_mul_i32 s4, s2, 0 |
| 1753 | +; GFX12DAGISEL-NEXT: s_mulk_i32 s2, 0x7b |
| 1754 | +; GFX12DAGISEL-NEXT: s_add_co_u32 s3, s3, s4 |
| 1755 | +; GFX12DAGISEL-NEXT: v_mov_b32_e32 v0, s2 |
| 1756 | +; GFX12DAGISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 |
| 1757 | +; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 |
| 1758 | +; GFX12DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] |
| 1759 | +; GFX12DAGISEL-NEXT: s_endpgm |
1626 | 1760 | entry: |
1627 | 1761 | %result = call i64 @llvm.amdgcn.wave.reduce.add.i64(i64 123, i32 1) |
1628 | 1762 | store i64 %result, ptr addrspace(1) %out |
@@ -1823,6 +1957,22 @@ define amdgpu_kernel void @poison_value_i64(ptr addrspace(1) %out, i64 %in) { |
1823 | 1957 | ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 |
1824 | 1958 | ; GFX1132GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] |
1825 | 1959 | ; GFX1132GISEL-NEXT: s_endpgm |
| 1960 | +; |
| 1961 | +; GFX12DAGISEL-LABEL: poison_value_i64: |
| 1962 | +; GFX12DAGISEL: ; %bb.0: ; %entry |
| 1963 | +; GFX12DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 |
| 1964 | +; GFX12DAGISEL-NEXT: s_mov_b32 s2, exec_lo |
| 1965 | +; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| 1966 | +; GFX12DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 |
| 1967 | +; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 |
| 1968 | +; GFX12DAGISEL-NEXT: s_mul_hi_u32 s3, s0, s2 |
| 1969 | +; GFX12DAGISEL-NEXT: s_mul_i32 s4, s1, s2 |
| 1970 | +; GFX12DAGISEL-NEXT: s_mul_i32 s2, s0, s2 |
| 1971 | +; GFX12DAGISEL-NEXT: s_add_co_u32 s3, s3, s4 |
| 1972 | +; GFX12DAGISEL-NEXT: v_mov_b32_e32 v0, s2 |
| 1973 | +; GFX12DAGISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 |
| 1974 | +; GFX12DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] |
| 1975 | +; GFX12DAGISEL-NEXT: s_endpgm |
1826 | 1976 | entry: |
1827 | 1977 | %result = call i64 @llvm.amdgcn.wave.reduce.add.i64(i64 poison, i32 1) |
1828 | 1978 | store i64 %result, ptr addrspace(1) %out |
@@ -2075,6 +2225,32 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { |
2075 | 2225 | ; GFX1132GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 |
2076 | 2226 | ; GFX1132GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off |
2077 | 2227 | ; GFX1132GISEL-NEXT: s_setpc_b64 s[30:31] |
| 2228 | +; |
| 2229 | +; GFX12DAGISEL-LABEL: divergent_value_i64: |
| 2230 | +; GFX12DAGISEL: ; %bb.0: ; %entry |
| 2231 | +; GFX12DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 |
| 2232 | +; GFX12DAGISEL-NEXT: s_wait_expcnt 0x0 |
| 2233 | +; GFX12DAGISEL-NEXT: s_wait_samplecnt 0x0 |
| 2234 | +; GFX12DAGISEL-NEXT: s_wait_bvhcnt 0x0 |
| 2235 | +; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 |
| 2236 | +; GFX12DAGISEL-NEXT: s_mov_b64 s[0:1], 0 |
| 2237 | +; GFX12DAGISEL-NEXT: s_mov_b32 s2, exec_lo |
| 2238 | +; GFX12DAGISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 |
| 2239 | +; GFX12DAGISEL-NEXT: s_wait_alu 0xfffe |
| 2240 | +; GFX12DAGISEL-NEXT: s_ctz_i32_b32 s3, s2 |
| 2241 | +; GFX12DAGISEL-NEXT: s_wait_alu 0xfffe |
| 2242 | +; GFX12DAGISEL-NEXT: v_readlane_b32 s4, v2, s3 |
| 2243 | +; GFX12DAGISEL-NEXT: v_readlane_b32 s5, v3, s3 |
| 2244 | +; GFX12DAGISEL-NEXT: s_bitset0_b32 s2, s3 |
| 2245 | +; GFX12DAGISEL-NEXT: s_wait_alu 0xfffe |
| 2246 | +; GFX12DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 |
| 2247 | +; GFX12DAGISEL-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] |
| 2248 | +; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB8_1 |
| 2249 | +; GFX12DAGISEL-NEXT: ; %bb.2: |
| 2250 | +; GFX12DAGISEL-NEXT: s_wait_alu 0xfffe |
| 2251 | +; GFX12DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 |
| 2252 | +; GFX12DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off |
| 2253 | +; GFX12DAGISEL-NEXT: s_setpc_b64 s[30:31] |
2078 | 2254 | entry: |
2079 | 2255 | %result = call i64 @llvm.amdgcn.wave.reduce.add.i64(i64 %id.x, i32 1) |
2080 | 2256 | store i64 %result, ptr addrspace(1) %out |
@@ -2552,6 +2728,49 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 |
2552 | 2728 | ; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, 0 |
2553 | 2729 | ; GFX1132GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] |
2554 | 2730 | ; GFX1132GISEL-NEXT: s_endpgm |
| 2731 | +; |
| 2732 | +; GFX12DAGISEL-LABEL: divergent_cfg_i64: |
| 2733 | +; GFX12DAGISEL: ; %bb.0: ; %entry |
| 2734 | +; GFX12DAGISEL-NEXT: s_clause 0x1 |
| 2735 | +; GFX12DAGISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 |
| 2736 | +; GFX12DAGISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 |
| 2737 | +; GFX12DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 |
| 2738 | +; GFX12DAGISEL-NEXT: s_mov_b32 s8, exec_lo |
| 2739 | +; GFX12DAGISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 |
| 2740 | +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| 2741 | +; GFX12DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 |
| 2742 | +; GFX12DAGISEL-NEXT: s_xor_b32 s8, exec_lo, s8 |
| 2743 | +; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB9_2 |
| 2744 | +; GFX12DAGISEL-NEXT: ; %bb.1: ; %else |
| 2745 | +; GFX12DAGISEL-NEXT: s_mov_b32 s6, exec_lo |
| 2746 | +; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| 2747 | +; GFX12DAGISEL-NEXT: s_bcnt1_i32_b32 s6, s6 |
| 2748 | +; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 |
| 2749 | +; GFX12DAGISEL-NEXT: s_mul_hi_u32 s7, s2, s6 |
| 2750 | +; GFX12DAGISEL-NEXT: s_mul_i32 s3, s3, s6 |
| 2751 | +; GFX12DAGISEL-NEXT: s_mul_i32 s6, s2, s6 |
| 2752 | +; GFX12DAGISEL-NEXT: s_add_co_u32 s7, s7, s3 |
| 2753 | +; GFX12DAGISEL-NEXT: .LBB9_2: ; %Flow |
| 2754 | +; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 |
| 2755 | +; GFX12DAGISEL-NEXT: s_or_saveexec_b32 s2, s8 |
| 2756 | +; GFX12DAGISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 |
| 2757 | +; GFX12DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2 |
| 2758 | +; GFX12DAGISEL-NEXT: ; %bb.3: ; %if |
| 2759 | +; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) |
| 2760 | +; GFX12DAGISEL-NEXT: s_mov_b32 s3, exec_lo |
| 2761 | +; GFX12DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 |
| 2762 | +; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) |
| 2763 | +; GFX12DAGISEL-NEXT: s_mul_hi_u32 s6, s4, s3 |
| 2764 | +; GFX12DAGISEL-NEXT: s_mul_i32 s5, s5, s3 |
| 2765 | +; GFX12DAGISEL-NEXT: s_mul_i32 s4, s4, s3 |
| 2766 | +; GFX12DAGISEL-NEXT: s_wait_alu 0xfffe |
| 2767 | +; GFX12DAGISEL-NEXT: s_add_co_u32 s5, s6, s5 |
| 2768 | +; GFX12DAGISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 |
| 2769 | +; GFX12DAGISEL-NEXT: ; %bb.4: ; %endif |
| 2770 | +; GFX12DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 |
| 2771 | +; GFX12DAGISEL-NEXT: v_mov_b32_e32 v2, 0 |
| 2772 | +; GFX12DAGISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] |
| 2773 | +; GFX12DAGISEL-NEXT: s_endpgm |
2555 | 2774 | entry: |
2556 | 2775 | %tid = call i32 @llvm.amdgcn.workitem.id.x() |
2557 | 2776 | %d_cmp = icmp ult i32 %tid, 16 |
|
0 commit comments