diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index dbed042776e2c..36288d43443ca 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -682,11 +682,20 @@ bool SIFoldOperandsImpl::tryAddToFoldList(
     if (!CanCommute)
       return false;
 
+    MachineOperand &Op = MI->getOperand(OpNo);
+    MachineOperand &CommutedOp = MI->getOperand(CommuteOpNo);
+
     // One of operands might be an Imm operand, and OpNo may refer to it after
     // the call of commuteInstruction() below. Such situations are avoided
     // here explicitly as OpNo must be a register operand to be a candidate
     // for memory folding.
-    if (!MI->getOperand(OpNo).isReg() || !MI->getOperand(CommuteOpNo).isReg())
+    if (!Op.isReg() || !CommutedOp.isReg())
+      return false;
+
+    // The same situation with an immediate can arise again if both inputs
+    // are the same register.
+    if (Op.getReg() == CommutedOp.getReg() &&
+        Op.getSubReg() == CommutedOp.getSubReg())
       return false;
 
     if (!TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo))
diff --git a/llvm/test/CodeGen/AMDGPU/dag-divergence.ll b/llvm/test/CodeGen/AMDGPU/dag-divergence.ll
index dfc28539ea814..0f573fcc6deaa 100644
--- a/llvm/test/CodeGen/AMDGPU/dag-divergence.ll
+++ b/llvm/test/CodeGen/AMDGPU/dag-divergence.ll
@@ -37,8 +37,8 @@ define amdgpu_kernel void @flat_load_maybe_divergent(ptr addrspace(4) %k, ptr %f
 ; GCN-LABEL: {{^}}wide_carry_divergence_error:
 ; GCN: v_sub_u32_e32
 ; GCN: v_subb_u32_e32
-; GCN: v_subbrev_u32_e32
-; GCN: v_subbrev_u32_e32
+; GCN: v_subb_u32_e32
+; GCN: v_subb_u32_e32
 define <2 x i128> @wide_carry_divergence_error(i128 %arg) {
   %i = call i128 @llvm.ctlz.i128(i128 %arg, i1 false)
   %i1 = sub i128 0, %i
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll
index 59bc7f332bf1e..3d9043d30c1ce 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll
@@ -65,8 +65,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, 0, vcc
 ; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, v2, v3
 ; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v4, v7, vcc
-; GFX9-NEXT:    v_subbrev_co_u32_e32 v4, vcc, 0, v5, vcc
-; GFX9-NEXT:    v_subbrev_co_u32_e32 v5, vcc, 0, v5, vcc
+; GFX9-NEXT:    v_subb_co_u32_e32 v4, vcc, 0, v5, vcc
+; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, 0, v5, vcc
 ; GFX9-NEXT:    s_mov_b64 s[6:7], 0x7f
 ; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v18, v16
@@ -2355,8 +2355,8 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-NEXT:    v_sub_co_u32_e32 v12, vcc, v8, v9
 ; GFX9-NEXT:    v_subb_co_u32_e32 v13, vcc, v10, v13, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0
-; GFX9-NEXT:    v_subbrev_co_u32_e32 v14, vcc, 0, v8, vcc
-; GFX9-NEXT:    v_subbrev_co_u32_e32 v15, vcc, 0, v8, vcc
+; GFX9-NEXT:    v_subb_co_u32_e32 v14, vcc, 0, v8, vcc
+; GFX9-NEXT:    v_subb_co_u32_e32 v15, vcc, 0, v8, vcc
 ; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[12:13]
 ; GFX9-NEXT:    v_or_b32_e32 v10, v13, v15
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
index 8b1d803b32aa6..e01e540a8c8aa 100644
--- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
@@ -66,10 +66,10 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_sub_i32_e32 v2, vcc, v2, v10
 ; SDAG-NEXT:    v_subb_u32_e32 v3, vcc, v8, v9, vcc
 ; SDAG-NEXT:    v_xor_b32_e32 v8, 0x7f, v2
-; SDAG-NEXT:    v_subbrev_u32_e32 v10, vcc, 0, v18, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v10, vcc, 0, v18, vcc
 ; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[10:11], v[2:3]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[4:5]
-; SDAG-NEXT:    v_subbrev_u32_e32 v11, vcc, 0, v18, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v11, vcc, 0, v18, vcc
 ; SDAG-NEXT:    v_or_b32_e32 v8, v8, v10
 ; SDAG-NEXT:    v_or_b32_e32 v9, v3, v11
 ; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
@@ -263,10 +263,10 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_sub_i32_e32 v4, vcc, v4, v13
 ; SDAG-NEXT:    v_subb_u32_e32 v5, vcc, v9, v12, vcc
 ; SDAG-NEXT:    v_xor_b32_e32 v9, 0x7f, v4
-; SDAG-NEXT:    v_subbrev_u32_e32 v10, vcc, 0, v8, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v10, vcc, 0, v8, vcc
 ; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[10:11], v[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; SDAG-NEXT:    v_subbrev_u32_e32 v11, vcc, 0, v8, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v11, vcc, 0, v8, vcc
 ; SDAG-NEXT:    v_or_b32_e32 v8, v9, v10
 ; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
@@ -872,10 +872,10 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_sub_i32_e32 v22, vcc, v16, v18
 ; SDAG-NEXT:    v_subb_u32_e32 v23, vcc, v20, v17, vcc
 ; SDAG-NEXT:    v_xor_b32_e32 v16, 0x7f, v22
-; SDAG-NEXT:    v_subbrev_u32_e32 v24, vcc, 0, v28, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v24, vcc, 0, v28, vcc
 ; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[8:9], v[22:23]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
-; SDAG-NEXT:    v_subbrev_u32_e32 v25, vcc, 0, v28, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v25, vcc, 0, v28, vcc
 ; SDAG-NEXT:    v_or_b32_e32 v16, v16, v24
 ; SDAG-NEXT:    v_or_b32_e32 v17, v23, v25
 ; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[24:25]
@@ -1047,10 +1047,10 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; SDAG-NEXT:    v_subb_u32_e32 v1, vcc, v8, v1, vcc
 ; SDAG-NEXT:    v_xor_b32_e32 v2, 0x7f, v0
-; SDAG-NEXT:    v_subbrev_u32_e32 v20, vcc, 0, v24, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v20, vcc, 0, v24, vcc
 ; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[8:9], v[0:1]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
-; SDAG-NEXT:    v_subbrev_u32_e32 v21, vcc, 0, v24, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v21, vcc, 0, v24, vcc
 ; SDAG-NEXT:    v_or_b32_e32 v2, v2, v20
 ; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[20:21]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
@@ -1619,10 +1619,10 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_sub_i32_e32 v10, vcc, v8, v10
 ; SDAG-NEXT:    v_subb_u32_e32 v11, vcc, v11, v18, vcc
 ; SDAG-NEXT:    v_xor_b32_e32 v8, 0x7f, v10
-; SDAG-NEXT:    v_subbrev_u32_e32 v18, vcc, 0, v19, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v18, vcc, 0, v19, vcc
 ; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[10:11], v[10:11]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[4:5]
-; SDAG-NEXT:    v_subbrev_u32_e32 v19, vcc, 0, v19, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v19, vcc, 0, v19, vcc
 ; SDAG-NEXT:    v_or_b32_e32 v8, v8, v18
 ; SDAG-NEXT:    v_or_b32_e32 v9, v11, v19
 ; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[18:19]
@@ -1814,10 +1814,10 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_sub_i32_e32 v10, vcc, v10, v19
 ; SDAG-NEXT:    v_subb_u32_e32 v11, vcc, v13, v12, vcc
 ; SDAG-NEXT:    v_xor_b32_e32 v14, 0x7f, v10
-; SDAG-NEXT:    v_subbrev_u32_e32 v12, vcc, 0, v18, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v12, vcc, 0, v18, vcc
 ; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[10:11], v[10:11]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[4:5]
-; SDAG-NEXT:    v_subbrev_u32_e32 v13, vcc, 0, v18, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v13, vcc, 0, v18, vcc
 ; SDAG-NEXT:    v_or_b32_e32 v14, v14, v12
 ; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[12:13]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
@@ -2502,10 +2502,10 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_sub_i32_e32 v18, vcc, v16, v18
 ; SDAG-NEXT:    v_subb_u32_e32 v19, vcc, v20, v17, vcc
 ; SDAG-NEXT:    v_xor_b32_e32 v16, 0x7f, v18
-; SDAG-NEXT:    v_subbrev_u32_e32 v20, vcc, 0, v28, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v20, vcc, 0, v28, vcc
 ; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[8:9], v[18:19]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v22, 0, 1, s[4:5]
-; SDAG-NEXT:    v_subbrev_u32_e32 v21, vcc, 0, v28, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v21, vcc, 0, v28, vcc
 ; SDAG-NEXT:    v_or_b32_e32 v16, v16, v20
 ; SDAG-NEXT:    v_or_b32_e32 v17, v19, v21
 ; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[20:21]
@@ -2677,10 +2677,10 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_sub_i32_e32 v16, vcc, v16, v18
 ; SDAG-NEXT:    v_subb_u32_e32 v17, vcc, v20, v17, vcc
 ; SDAG-NEXT:    v_xor_b32_e32 v18, 0x7f, v16
-; SDAG-NEXT:    v_subbrev_u32_e32 v20, vcc, 0, v28, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v20, vcc, 0, v28, vcc
 ; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[8:9], v[16:17]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v22, 0, 1, s[4:5]
-; SDAG-NEXT:    v_subbrev_u32_e32 v21, vcc, 0, v28, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v21, vcc, 0, v28, vcc
 ; SDAG-NEXT:    v_or_b32_e32 v18, v18, v20
 ; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[20:21]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
index afe1f33d15e42..d25226d15a029 100644
--- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
@@ -66,8 +66,8 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-NEXT:    v_cndmask_b32_e64 v11, v11, 0, vcc
 ; GFX9-NEXT:    v_sub_co_u32_e32 v6, vcc, v6, v7
 ; GFX9-NEXT:    v_subb_co_u32_e32 v7, vcc, v8, v11, vcc
-; GFX9-NEXT:    v_subbrev_co_u32_e32 v8, vcc, 0, v9, vcc
-; GFX9-NEXT:    v_subbrev_co_u32_e32 v9, vcc, 0, v9, vcc
+; GFX9-NEXT:    v_subb_co_u32_e32 v8, vcc, 0, v9, vcc
+; GFX9-NEXT:    v_subb_co_u32_e32 v9, vcc, 0, v9, vcc
 ; GFX9-NEXT:    s_mov_b64 s[6:7], 0x7f
 ; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[6:7]
 ; GFX9-NEXT:    v_or_b32_e32 v12, v7, v9
@@ -1541,8 +1541,8 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-NEXT:    v_sub_co_u32_e32 v8, vcc, v8, v9
 ; GFX9-NEXT:    v_subb_co_u32_e32 v9, vcc, v10, v12, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v11, 0
-; GFX9-NEXT:    v_subbrev_co_u32_e32 v10, vcc, 0, v11, vcc
-; GFX9-NEXT:    v_subbrev_co_u32_e32 v11, vcc, 0, v11, vcc
+; GFX9-NEXT:    v_subb_co_u32_e32 v10, vcc, 0, v11, vcc
+; GFX9-NEXT:    v_subb_co_u32_e32 v11, vcc, 0, v11, vcc
 ; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
diff --git a/llvm/test/CodeGen/AMDGPU/si-fold-operands-commute-same-operands-assert.mir b/llvm/test/CodeGen/AMDGPU/si-fold-operands-commute-same-operands-assert.mir
index 18fbf3eaf9c8d..a5c0ead28f618 100644
--- a/llvm/test/CodeGen/AMDGPU/si-fold-operands-commute-same-operands-assert.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-fold-operands-commute-same-operands-assert.mir
@@ -47,8 +47,8 @@ body:             |
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; CHECK-NEXT: [[V_SUB_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_SUB_CO_U32_e32 0, killed [[COPY]], implicit-def $vcc, implicit $exec
     ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; CHECK-NEXT: [[V_SUBBREV_U32_e32_:%[0-9]+]]:vgpr_32 = V_SUBBREV_U32_e32 0, [[V_MOV_B32_e32_]], implicit-def $vcc, implicit $vcc, implicit $exec
-    ; CHECK-NEXT: $vgpr4 = COPY [[V_SUBBREV_U32_e32_]]
+    ; CHECK-NEXT: [[V_SUBB_U32_e32_:%[0-9]+]]:vgpr_32 = V_SUBB_U32_e32 0, [[V_MOV_B32_e32_]], implicit-def $vcc, implicit $vcc, implicit $exec
+    ; CHECK-NEXT: $vgpr4 = COPY [[V_SUBB_U32_e32_]]
     ; CHECK-NEXT: SI_RETURN implicit $vgpr4
     %0:vgpr_32 = COPY $vgpr0
    %1:sreg_64 = S_MOV_B64 0
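
Note: read in isolation, the added bail-out reduces to a small predicate. The sketch below restates it with an invented helper name (commuteCannotHelpFold), purely for illustration; in the patch itself the logic is inline in SIFoldOperandsImpl::tryAddToFoldList, immediately before the commuteInstruction() call.

    #include "llvm/CodeGen/MachineOperand.h"

    using namespace llvm;

    // Illustrative restatement of the bail-out: if the two commutable inputs
    // are the same register (including the same subregister), commuting only
    // swaps identical operands, so a later immediate fold can reproduce the
    // same "OpNo refers to an Imm operand" situation that the preceding
    // isReg() check already guards against.
    static bool commuteCannotHelpFold(const MachineOperand &Op,
                                      const MachineOperand &CommutedOp) {
      return Op.isReg() && CommutedOp.isReg() &&
             Op.getReg() == CommutedOp.getReg() &&
             Op.getSubReg() == CommutedOp.getSubReg();
    }

When this predicate holds, tryAddToFoldList now returns false instead of attempting the commute, which is what the regenerated checks above reflect: the borrow chains keep v_subb*/V_SUBB_U32_e32 with the immediate in src0 rather than being commuted to the v_subbrev* forms.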