Skip to content

Commit 4cd8361

Browse files
[AMDGPU] Lower S_ABSDIFF_I32 to VALU instructions (#167691)
Added support for lowering the scalar S_ABSDIFF_I32 instruction to equivalent VALU operations.
1 parent 7b7a422 commit 4cd8361

File tree

4 files changed

+107
-0
lines changed

4 files changed

+107
-0
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7824,6 +7824,11 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
78247824
Inst.eraseFromParent();
78257825
return;
78267826

7827+
case AMDGPU::S_ABSDIFF_I32:
7828+
lowerScalarAbsDiff(Worklist, Inst);
7829+
Inst.eraseFromParent();
7830+
return;
7831+
78277832
case AMDGPU::S_CBRANCH_SCC0:
78287833
case AMDGPU::S_CBRANCH_SCC1: {
78297834
// Clear unused bits of vcc
@@ -8473,6 +8478,37 @@ void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
84738478
addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
84748479
}
84758480

8481+
/// Rewrite a scalar S_ABSDIFF_I32 as three VALU instructions placed
/// immediately before \p Inst: a subtract of operand 2 from operand 1, a
/// subtract of that difference from zero, and a signed max of the two
/// results. Every use of the original destination register is redirected to
/// the max result, and those users are queued on \p Worklist.
///
/// \p Inst is not erased here.
void SIInstrInfo::lowerScalarAbsDiff(SIInstrWorklist &Worklist,
                                     MachineInstr &Inst) const {
  MachineBasicBlock &Block = *Inst.getParent();
  MachineRegisterInfo &RegInfo = Block.getParent()->getRegInfo();
  MachineBasicBlock::iterator InsertPt = Inst;
  const DebugLoc &Loc = Inst.getDebugLoc();

  MachineOperand &DstOp = Inst.getOperand(0);
  MachineOperand &LHSOp = Inst.getOperand(1);
  MachineOperand &RHSOp = Inst.getOperand(2);

  // One fresh 32-bit VGPR for the result of each emitted instruction.
  const TargetRegisterClass *VGPR32 = &AMDGPU::VGPR_32RegClass;
  Register DiffReg = RegInfo.createVirtualRegister(VGPR32);
  Register NegDiffReg = RegInfo.createVirtualRegister(VGPR32);
  Register AbsReg = RegInfo.createVirtualRegister(VGPR32);

  // Pick the subtract opcode according to ST.hasAddNoCarry().
  unsigned SubOpc = AMDGPU::V_SUB_CO_U32_e32;
  if (ST.hasAddNoCarry())
    SubOpc = AMDGPU::V_SUB_U32_e32;

  // DiffReg = LHS - RHS
  MachineInstrBuilder Diff =
      BuildMI(Block, InsertPt, Loc, get(SubOpc), DiffReg);
  Diff.addReg(LHSOp.getReg());
  Diff.addReg(RHSOp.getReg());

  // NegDiffReg = 0 - DiffReg
  MachineInstrBuilder NegDiff =
      BuildMI(Block, InsertPt, Loc, get(SubOpc), NegDiffReg);
  NegDiff.addImm(0);
  NegDiff.addReg(DiffReg);

  // AbsReg = max(DiffReg, NegDiffReg), compared as signed integers.
  MachineInstrBuilder Abs =
      BuildMI(Block, InsertPt, Loc, get(AMDGPU::V_MAX_I32_e64), AbsReg);
  Abs.addReg(DiffReg);
  Abs.addReg(NegDiffReg);

  RegInfo.replaceRegWith(DstOp.getReg(), AbsReg);
  addUsersToMoveToVALUWorklist(AbsReg, RegInfo, Worklist);
}
8511+
84768512
void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
84778513
MachineInstr &Inst) const {
84788514
MachineBasicBlock &MBB = *Inst.getParent();

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
136136

137137
void lowerScalarAbs(SIInstrWorklist &Worklist, MachineInstr &Inst) const;
138138

139+
/// Lower the scalar S_ABSDIFF_I32 instruction \p Inst to equivalent VALU
/// instructions.
void lowerScalarAbsDiff(SIInstrWorklist &Worklist, MachineInstr &Inst) const;
140+
139141
void lowerScalarXnor(SIInstrWorklist &Worklist, MachineInstr &Inst) const;
140142

141143
void splitScalarNotBinop(SIInstrWorklist &Worklist, MachineInstr &Inst,

llvm/test/CodeGen/AMDGPU/absdiff.ll

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,44 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
22
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck %s
33

4+
5+
; Regression test: abs(0 - umin(smax(%i, 0), 1)) is computed after the loop.
; The checks expect the clamped value to be produced by a VALU instruction
; (v_med3_i32) and the abs-of-difference to be emitted as two v_sub_u32
; instructions followed by v_max_i32.
define amdgpu_gs float @absdiff_valu_input_regression() {
6+
; CHECK-LABEL: absdiff_valu_input_regression:
7+
; CHECK: ; %bb.0: ; %bb
8+
; CHECK-NEXT: s_mov_b32 s0, 0
9+
; CHECK-NEXT: .LBB0_1: ; %bb1
10+
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
11+
; CHECK-NEXT: s_mov_b32 s1, s0
12+
; CHECK-NEXT: s_or_b32 s0, s0, 1
13+
; CHECK-NEXT: s_cmp_gt_i32 s1, 0
14+
; CHECK-NEXT: s_cbranch_scc1 .LBB0_1
15+
; CHECK-NEXT: ; %bb.2: ; %bb11
16+
; CHECK-NEXT: v_med3_i32 v0, s1, 0, 1
17+
; CHECK-NEXT: v_sub_u32_e32 v0, 0, v0
18+
; CHECK-NEXT: v_sub_u32_e32 v1, 0, v0
19+
; CHECK-NEXT: v_max_i32_e32 v0, v0, v1
20+
; CHECK-NEXT: ; return to shader part epilog
21+
bb:
22+
br label %bb1
23+
24+
bb1: ; preds = %bb1, %bb
25+
%i = phi i32 [ 0, %bb ], [ %i9, %bb1 ]
26+
%i2 = phi i32 [ 0, %bb ], [ %i5, %bb1 ]
27+
%i3 = or i32 %i2, 1
28+
%i4 = or i32 %i3, 0
29+
%i5 = call i32 @llvm.smax.i32(i32 %i, i32 0)
30+
%i6 = call i32 @llvm.umin.i32(i32 %i5, i32 1)
31+
%i7 = sub i32 0, %i6
32+
%i8 = call i32 @llvm.abs.i32(i32 %i7, i1 false)
33+
%i9 = or i32 %i, 1
34+
%i10 = icmp sgt i32 %i, 0
35+
br i1 %i10, label %bb1, label %bb11
36+
37+
bb11: ; preds = %bb1
38+
%i12 = bitcast i32 %i8 to float
39+
ret float %i12
40+
}
41+
442
define amdgpu_ps i16 @absdiff_i16_false(i16 inreg %arg0, i16 inreg %arg1) {
543
; CHECK-LABEL: absdiff_i16_false:
644
; CHECK: ; %bb.0:
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
2+
# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=si-fix-sgpr-copies -o - %s | FileCheck --check-prefix=GFX8 %s
3+
# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=si-fix-sgpr-copies -o - %s | FileCheck --check-prefix=GFX12 %s
4+
5+
---
# S_ABSDIFF_I32 whose second source is an SGPR copy of a VGPR value. The
# checks expect the instruction to be replaced by a subtract, a subtract from
# zero, and V_MAX_I32_e64. The subtract opcode is V_SUB_CO_U32_e32 under the
# GFX8 prefix and V_SUB_U32_e32 under the GFX12 prefix.
6+
name: absdiff_i32
7+
body: |
8+
bb.0:
9+
liveins: $vgpr0, $vgpr1, $vgpr2
10+
; GFX8-LABEL: name: absdiff_i32
11+
; GFX8: liveins: $vgpr0, $vgpr1, $vgpr2
12+
; GFX8-NEXT: {{ $}}
13+
; GFX8-NEXT: [[V_LSHL_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_ADD_U32_e64 $vgpr0, $vgpr1, $vgpr2, implicit $exec
14+
; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 10
15+
; GFX8-NEXT: [[V_SUB_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_SUB_CO_U32_e32 [[S_MOV_B32_]], [[V_LSHL_ADD_U32_e64_]], implicit-def $vcc, implicit $exec
16+
; GFX8-NEXT: [[V_SUB_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_SUB_CO_U32_e32 0, [[V_SUB_CO_U32_e32_]], implicit-def $vcc, implicit $exec
17+
; GFX8-NEXT: [[V_MAX_I32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_I32_e64 [[V_SUB_CO_U32_e32_]], [[V_SUB_CO_U32_e32_1]], implicit $exec
18+
;
19+
; GFX12-LABEL: name: absdiff_i32
20+
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2
21+
; GFX12-NEXT: {{ $}}
22+
; GFX12-NEXT: [[V_LSHL_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_ADD_U32_e64 $vgpr0, $vgpr1, $vgpr2, implicit $exec
23+
; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 10
24+
; GFX12-NEXT: [[V_SUB_U32_e32_:%[0-9]+]]:vgpr_32 = V_SUB_U32_e32 [[S_MOV_B32_]], [[V_LSHL_ADD_U32_e64_]], implicit $exec
25+
; GFX12-NEXT: [[V_SUB_U32_e32_1:%[0-9]+]]:vgpr_32 = V_SUB_U32_e32 0, [[V_SUB_U32_e32_]], implicit $exec
26+
; GFX12-NEXT: [[V_MAX_I32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_I32_e64 [[V_SUB_U32_e32_]], [[V_SUB_U32_e32_1]], implicit $exec
27+
%0:vgpr_32 = V_LSHL_ADD_U32_e64 $vgpr0, $vgpr1, $vgpr2, implicit $exec
28+
%1:sreg_32 = COPY %0:vgpr_32
29+
%2:sreg_32 = S_MOV_B32 10
30+
%3:sreg_32 = S_ABSDIFF_I32 killed %2:sreg_32, %1:sreg_32, implicit-def dead $scc
31+
...

0 commit comments

Comments
 (0)