From 01f93ba57e1f9808d631db2da29a5cb36960e946 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 25 Feb 2025 15:47:56 +0700 Subject: [PATCH 1/2] AMDGPU: Mark v_mov_b64_pseudo as a VOP1 instruction This is mostly true, and it tricks the rematerialization code into handling this without special casing it. --- llvm/lib/Target/AMDGPU/SIInstructions.td | 1 + llvm/test/CodeGen/AMDGPU/remat-sop.mir | 22 +++++- .../AMDGPU/tuple-allocation-failure.ll | 78 ++++++++----------- llvm/test/CodeGen/AMDGPU/vgpr-remat.mir | 48 +++++++++++- 4 files changed, 102 insertions(+), 47 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index cca49ee80a60e..6f80dbcfe5e71 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -136,6 +136,7 @@ def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst), let isMoveImm = 1; let SchedRW = [Write64Bit]; let Size = 4; + let VOP1 = 1; // Not entirely correct, but close enough. let UseNamedOperandTable = 1; } diff --git a/llvm/test/CodeGen/AMDGPU/remat-sop.mir b/llvm/test/CodeGen/AMDGPU/remat-sop.mir index 81aa3a39de42f..1da55cf535449 100644 --- a/llvm/test/CodeGen/AMDGPU/remat-sop.mir +++ b/llvm/test/CodeGen/AMDGPU/remat-sop.mir @@ -653,4 +653,24 @@ body: | S_ENDPGM 0 ... - +--- +name: test_remat_s_mov_b64_imm_pseudo +tracksRegLiveness: true +body: | + bb.0: + ; GCN-LABEL: name: test_remat_s_mov_b64_imm_pseudo + ; GCN: renamable $sgpr0_sgpr1 = S_MOV_B64_IMM_PSEUDO 1 + ; GCN-NEXT: renamable $sgpr2_sgpr3 = S_MOV_B64_IMM_PSEUDO 2 + ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0_sgpr1 + ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr2_sgpr3 + ; GCN-NEXT: renamable $sgpr0_sgpr1 = S_MOV_B64_IMM_PSEUDO 3 + ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0_sgpr1 + ; GCN-NEXT: S_ENDPGM 0 + %0:sgpr_64 = S_MOV_B64_IMM_PSEUDO 1 + %1:sgpr_64 = S_MOV_B64_IMM_PSEUDO 2 + %2:sgpr_64 = S_MOV_B64_IMM_PSEUDO 3 + S_NOP 0, implicit %0 + S_NOP 0, implicit %1 + S_NOP 0, implicit %2 + S_ENDPGM 0 +... diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll index dd78c2f46dde8..a6e6341914ed0 100644 --- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll +++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll @@ -34,10 +34,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_mov_b64 s[36:37], s[6:7] ; GLOBALNESS1-NEXT: s_load_dwordx4 s[76:79], s[8:9], 0x0 ; GLOBALNESS1-NEXT: s_load_dword s6, s[8:9], 0x14 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v0 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v42, 0 -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0 -; GLOBALNESS1-NEXT: global_store_dword v[0:1], v42, off +; GLOBALNESS1-NEXT: v_pk_mov_b32 v[44:45], 0, 0 +; GLOBALNESS1-NEXT: global_store_dword v[44:45], v42, off ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) ; GLOBALNESS1-NEXT: global_load_dword v2, v42, s[76:77] ; GLOBALNESS1-NEXT: s_mov_b64 s[40:41], s[4:5] @@ -46,6 +45,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GLOBALNESS1-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GLOBALNESS1-NEXT: s_add_u32 s0, s0, s17 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v0 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0 ; GLOBALNESS1-NEXT: s_addc_u32 s1, s1, 0 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v1, 0x40994400 @@ -73,13 +73,15 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[52:53], 1, v0 ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[44:45], 1, v1 ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[46:47], 1, v3 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v46, 0x80 ; GLOBALNESS1-NEXT: s_mov_b32 s70, s16 ; GLOBALNESS1-NEXT: s_mov_b64 s[38:39], s[8:9] ; GLOBALNESS1-NEXT: s_mov_b32 s71, s15 ; GLOBALNESS1-NEXT: s_mov_b32 s72, s14 ; GLOBALNESS1-NEXT: s_mov_b64 s[34:35], s[10:11] +; GLOBALNESS1-NEXT: v_mov_b32_e32 v47, 0 ; GLOBALNESS1-NEXT: s_mov_b32 s32, 0 -; GLOBALNESS1-NEXT: ; implicit-def: $vgpr44_vgpr45 +; GLOBALNESS1-NEXT: ; implicit-def: $vgpr56_vgpr57 ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) ; GLOBALNESS1-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 ; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -106,17 +108,15 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: .LBB1_3: ; %Flow28 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[44:45], v[0:1], v[0:1] op_sel:[0,1] +; GLOBALNESS1-NEXT: v_pk_mov_b32 v[56:57], v[0:1], v[0:1] op_sel:[0,1] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_30 ; GLOBALNESS1-NEXT: .LBB1_4: ; %bb5 ; GLOBALNESS1-NEXT: ; =>This Loop Header: Depth=1 ; GLOBALNESS1-NEXT: ; Child Loop BB1_16 Depth 2 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0x80 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v1, 0 -; GLOBALNESS1-NEXT: flat_load_dword v40, v[0:1] +; GLOBALNESS1-NEXT: flat_load_dword v40, v[46:47] ; GLOBALNESS1-NEXT: s_add_u32 s8, s38, 40 ; GLOBALNESS1-NEXT: buffer_store_dword v42, off, s[0:3], 0 -; GLOBALNESS1-NEXT: flat_load_dword v46, v[0:1] +; GLOBALNESS1-NEXT: flat_load_dword v58, v[46:47] ; GLOBALNESS1-NEXT: s_addc_u32 s9, s39, 0 ; GLOBALNESS1-NEXT: s_getpc_b64 s[4:5] ; GLOBALNESS1-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4 @@ -160,8 +160,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_24 ; GLOBALNESS1-NEXT: ; %bb.10: ; %baz.exit.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], 0, 0 -; GLOBALNESS1-NEXT: flat_load_dword v0, v[2:3] +; GLOBALNESS1-NEXT: flat_load_dword v0, v[44:45] ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[62:63], 0, v0 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v0, 0 @@ -170,17 +169,16 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_26 ; GLOBALNESS1-NEXT: ; %bb.11: ; %bb33.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: global_load_dwordx2 v[0:1], v[2:3], off +; GLOBALNESS1-NEXT: global_load_dwordx2 v[0:1], v[44:45], off ; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[54:55] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_13 ; GLOBALNESS1-NEXT: ; %bb.12: ; %bb39.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42 -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], 0, 0 -; GLOBALNESS1-NEXT: global_store_dwordx2 v[2:3], v[42:43], off +; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[42:43], off ; GLOBALNESS1-NEXT: .LBB1_13: ; %bb44.lr.ph.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v46 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v58 ; GLOBALNESS1-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) ; GLOBALNESS1-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1] @@ -237,7 +235,6 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) ; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[76:77] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[46:47], 0, 0 ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[68:69] @@ -246,14 +243,14 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_mov_b32 s13, s71 ; GLOBALNESS1-NEXT: s_mov_b32 s14, s70 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 -; GLOBALNESS1-NEXT: global_store_dwordx2 v[46:47], v[44:45], off +; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[56:57], off ; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[76:77] ; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[64:65] ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_14 ; GLOBALNESS1-NEXT: ; %bb.23: ; %bb62.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42 -; GLOBALNESS1-NEXT: global_store_dwordx2 v[46:47], v[42:43], off +; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[42:43], off ; GLOBALNESS1-NEXT: s_branch .LBB1_14 ; GLOBALNESS1-NEXT: .LBB1_24: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], -1 @@ -274,14 +271,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: ; %bb.28: ; %bb69.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42 -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], 0, 0 -; GLOBALNESS1-NEXT: global_store_dwordx2 v[2:3], v[42:43], off +; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[42:43], off ; GLOBALNESS1-NEXT: s_branch .LBB1_1 ; GLOBALNESS1-NEXT: .LBB1_29: ; %bb73.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42 -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], 0, 0 -; GLOBALNESS1-NEXT: global_store_dwordx2 v[2:3], v[42:43], off +; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[42:43], off ; GLOBALNESS1-NEXT: s_branch .LBB1_2 ; GLOBALNESS1-NEXT: .LBB1_30: ; %loop.exit.guard ; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[4:5] @@ -326,10 +321,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_mov_b64 s[36:37], s[6:7] ; GLOBALNESS0-NEXT: s_load_dwordx4 s[72:75], s[8:9], 0x0 ; GLOBALNESS0-NEXT: s_load_dword s6, s[8:9], 0x14 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v0 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v42, 0 -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0 -; GLOBALNESS0-NEXT: global_store_dword v[0:1], v42, off +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[44:45], 0, 0 +; GLOBALNESS0-NEXT: global_store_dword v[44:45], v42, off ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) ; GLOBALNESS0-NEXT: global_load_dword v2, v42, s[72:73] ; GLOBALNESS0-NEXT: s_mov_b64 s[40:41], s[4:5] @@ -338,6 +332,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GLOBALNESS0-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GLOBALNESS0-NEXT: s_add_u32 s0, s0, s17 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v0 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0 ; GLOBALNESS0-NEXT: s_addc_u32 s1, s1, 0 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v1, 0x40994400 @@ -365,13 +360,15 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[52:53], 1, v0 ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[44:45], 1, v1 ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[46:47], 1, v3 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v46, 0x80 ; GLOBALNESS0-NEXT: s_mov_b32 s68, s16 ; GLOBALNESS0-NEXT: s_mov_b64 s[38:39], s[8:9] ; GLOBALNESS0-NEXT: s_mov_b32 s69, s15 ; GLOBALNESS0-NEXT: s_mov_b32 s70, s14 ; GLOBALNESS0-NEXT: s_mov_b64 s[34:35], s[10:11] +; GLOBALNESS0-NEXT: v_mov_b32_e32 v47, 0 ; GLOBALNESS0-NEXT: s_mov_b32 s32, 0 -; GLOBALNESS0-NEXT: ; implicit-def: $vgpr44_vgpr45 +; GLOBALNESS0-NEXT: ; implicit-def: $vgpr56_vgpr57 ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) ; GLOBALNESS0-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 ; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -398,17 +395,15 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: .LBB1_3: ; %Flow28 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[44:45], v[0:1], v[0:1] op_sel:[0,1] +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[56:57], v[0:1], v[0:1] op_sel:[0,1] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_30 ; GLOBALNESS0-NEXT: .LBB1_4: ; %bb5 ; GLOBALNESS0-NEXT: ; =>This Loop Header: Depth=1 ; GLOBALNESS0-NEXT: ; Child Loop BB1_16 Depth 2 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0x80 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v1, 0 -; GLOBALNESS0-NEXT: flat_load_dword v40, v[0:1] +; GLOBALNESS0-NEXT: flat_load_dword v40, v[46:47] ; GLOBALNESS0-NEXT: s_add_u32 s8, s38, 40 ; GLOBALNESS0-NEXT: buffer_store_dword v42, off, s[0:3], 0 -; GLOBALNESS0-NEXT: flat_load_dword v46, v[0:1] +; GLOBALNESS0-NEXT: flat_load_dword v58, v[46:47] ; GLOBALNESS0-NEXT: s_addc_u32 s9, s39, 0 ; GLOBALNESS0-NEXT: s_getpc_b64 s[4:5] ; GLOBALNESS0-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4 @@ -452,8 +447,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_24 ; GLOBALNESS0-NEXT: ; %bb.10: ; %baz.exit.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0 -; GLOBALNESS0-NEXT: flat_load_dword v0, v[2:3] +; GLOBALNESS0-NEXT: flat_load_dword v0, v[44:45] ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[62:63], 0, v0 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v0, 0 @@ -462,17 +456,16 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_26 ; GLOBALNESS0-NEXT: ; %bb.11: ; %bb33.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: global_load_dwordx2 v[0:1], v[2:3], off +; GLOBALNESS0-NEXT: global_load_dwordx2 v[0:1], v[44:45], off ; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[54:55] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_13 ; GLOBALNESS0-NEXT: ; %bb.12: ; %bb39.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42 -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0 -; GLOBALNESS0-NEXT: global_store_dwordx2 v[2:3], v[42:43], off +; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[42:43], off ; GLOBALNESS0-NEXT: .LBB1_13: ; %bb44.lr.ph.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v46 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v58 ; GLOBALNESS0-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) ; GLOBALNESS0-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1] @@ -529,7 +522,6 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[78:79] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[46:47], 0, 0 ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[72:73] @@ -538,14 +530,14 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_mov_b32 s13, s69 ; GLOBALNESS0-NEXT: s_mov_b32 s14, s68 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 -; GLOBALNESS0-NEXT: global_store_dwordx2 v[46:47], v[44:45], off +; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[56:57], off ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[78:79] ; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[64:65] ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_14 ; GLOBALNESS0-NEXT: ; %bb.23: ; %bb62.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_16 Depth=2 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42 -; GLOBALNESS0-NEXT: global_store_dwordx2 v[46:47], v[42:43], off +; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[42:43], off ; GLOBALNESS0-NEXT: s_branch .LBB1_14 ; GLOBALNESS0-NEXT: .LBB1_24: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], -1 @@ -566,14 +558,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: ; %bb.28: ; %bb69.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42 -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0 -; GLOBALNESS0-NEXT: global_store_dwordx2 v[2:3], v[42:43], off +; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[42:43], off ; GLOBALNESS0-NEXT: s_branch .LBB1_1 ; GLOBALNESS0-NEXT: .LBB1_29: ; %bb73.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42 -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0 -; GLOBALNESS0-NEXT: global_store_dwordx2 v[2:3], v[42:43], off +; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[42:43], off ; GLOBALNESS0-NEXT: s_branch .LBB1_2 ; GLOBALNESS0-NEXT: .LBB1_30: ; %loop.exit.guard ; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-remat.mir b/llvm/test/CodeGen/AMDGPU/vgpr-remat.mir index 08f5550f3b08a..4b967969366f4 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-remat.mir +++ b/llvm/test/CodeGen/AMDGPU/vgpr-remat.mir @@ -4,10 +4,10 @@ # Check that we get two move-immediates into %1 and %2, instead of a copy from # %1 to %2, because that would introduce a dependency and maybe a stall. --- -name: f +name: remat_v_mov_b32_e32 tracksRegLiveness: true body: | - ; CHECK-LABEL: name: f + ; CHECK-LABEL: name: remat_v_mov_b32_e32 ; CHECK: bb.0: ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) ; CHECK-NEXT: liveins: $sgpr0 @@ -46,3 +46,47 @@ body: | %4.sub1:vreg_96 = COPY %2:vgpr_32 S_ENDPGM 0, implicit %4 ... + +--- +name: remat_v_mov_b64_pseduo +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: remat_v_mov_b64_pseduo + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: liveins: $sgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: undef [[V_MOV_B:%[0-9]+]].sub0_sub1:vreg_192_align2 = V_MOV_B64_PSEUDO 0, implicit $exec + ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]].sub2_sub3:vreg_192_align2 = V_MOV_B64_PSEUDO 0, implicit $exec + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 + ; CHECK-NEXT: $exec = S_MOV_B64_term [[COPY]] + ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]].sub0_sub1:vreg_192_align2 = V_MUL_F64_e64 0, [[V_MOV_B]].sub0_sub1, 0, [[V_MOV_B]].sub0_sub1, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]].sub2_sub3:vreg_192_align2 = V_MUL_F64_e64 0, [[V_MOV_B]].sub2_sub3, 0, [[V_MOV_B]].sub2_sub3, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_MOV_B]] + bb.0: + liveins: $sgpr0 + %0:vreg_64_align2 = V_MOV_B64_PSEUDO 0, implicit $exec + %1:vreg_64_align2 = COPY %0:vreg_64_align2 + %2:vreg_64_align2 = COPY %0:vreg_64_align2 + %3:sreg_64 = COPY $sgpr0_sgpr1 + $exec = S_MOV_B64_term %3:sreg_64 + S_CBRANCH_EXECZ %bb.2, implicit $exec + S_BRANCH %bb.1 + + bb.1: + %1:vreg_64_align2 = V_MUL_F64_e64 0, %1:vreg_64_align2, 0, %1:vreg_64_align2, 0, 0, implicit $mode, implicit $exec + %2:vreg_64_align2 = V_MUL_F64_e64 0, %2:vreg_64_align2, 0, %2:vreg_64_align2, 0, 0, implicit $mode, implicit $exec + + bb.2: + undef %4.sub0_sub1:vreg_192 = COPY %1:vreg_64_align2 + %4.sub2_sub3:vreg_192 = COPY %2:vreg_64_align2 + S_ENDPGM 0, implicit %4 +... From cfb36e5aad60d56a62a919c605b66fb762c049a0 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 25 Feb 2025 17:23:53 +0700 Subject: [PATCH 2/2] Update llvm/test/CodeGen/AMDGPU/vgpr-remat.mir --- llvm/test/CodeGen/AMDGPU/vgpr-remat.mir | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-remat.mir b/llvm/test/CodeGen/AMDGPU/vgpr-remat.mir index 4b967969366f4..0e7e4b9b10d44 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-remat.mir +++ b/llvm/test/CodeGen/AMDGPU/vgpr-remat.mir @@ -48,10 +48,10 @@ body: | ... --- -name: remat_v_mov_b64_pseduo +name: remat_v_mov_b64_pseudo tracksRegLiveness: true body: | - ; CHECK-LABEL: name: remat_v_mov_b64_pseduo + ; CHECK-LABEL: name: remat_v_mov_b64_pseudo ; CHECK: bb.0: ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) ; CHECK-NEXT: liveins: $sgpr0