Skip to content

Commit 278fa03

Browse files
committed
Introduce scalarization of V2S16 for G_ADD/SUB
1 parent c3b719e commit 278fa03

File tree

8 files changed

+40
-10
lines changed

8 files changed

+40
-10
lines changed

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -500,6 +500,20 @@ void RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) {
500500
MI.eraseFromParent();
501501
}
502502

503+
/// Lower a V2S16 binary operation on the SGPR bank by scalarizing it.
///
/// Both V2S16 source operands of \p MI are unmerged into their S16 elements,
/// the original opcode of \p MI (G_ADD or G_SUB for the rules that currently
/// select this lowering) is applied to each pair of elements, and the two S16
/// results are rebuilt into the V2S16 destination register. \p MI is erased
/// once the replacement sequence has been emitted.
void RegBankLegalizeHelper::lowerScalarizeV2S16(MachineInstr &MI) {
  const unsigned Opc = MI.getOpcode();
  Register Dst = MI.getOperand(0).getReg();

  // Unpack each V2S16 source operand into its two S16 elements.
  auto Src1 = B.buildUnmerge(SgprRB_S16, MI.getOperand(1).getReg());
  auto Src2 = B.buildUnmerge(SgprRB_S16, MI.getOperand(2).getReg());

  // Re-issue the original operation once per element; the opcode is taken
  // from MI so the same lowering serves both addition and subtraction.
  Register Lo =
      B.buildInstr(Opc, {SgprRB_S16}, {Src1.getReg(0), Src2.getReg(0)})
          .getReg(0);
  Register Hi =
      B.buildInstr(Opc, {SgprRB_S16}, {Src1.getReg(1), Src2.getReg(1)})
          .getReg(0);

  // Pack the S16 results back into V2S16. The elements already have the
  // destination element width, so a plain G_BUILD_VECTOR is emitted and no
  // truncation is involved.
  B.buildBuildVector(Dst, {Lo, Hi});
  MI.eraseFromParent();
}
516+
503517
static bool isSignedBFE(MachineInstr &MI) {
504518
if (GIntrinsic *GI = dyn_cast<GIntrinsic>(&MI))
505519
return (GI->is(Intrinsic::amdgcn_sbfe));
@@ -804,6 +818,8 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
804818
}
805819
break;
806820
}
821+
case ScalarizeV2S16:
822+
return lowerScalarizeV2S16(MI);
807823
case WidenMMOToS32:
808824
return widenMMOToS32(cast<GAnyLoad>(MI));
809825
}

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ class RegBankLegalizeHelper {
7474
MachineRegisterInfo::VRegAttrs SgprRB_S32 = {SgprRB, S32};
7575
MachineRegisterInfo::VRegAttrs VgprRB_S32 = {VgprRB, S32};
7676
MachineRegisterInfo::VRegAttrs VccRB_S1 = {VccRB, S1};
77+
MachineRegisterInfo::VRegAttrs SgprRB_S16 = {SgprRB, S16};
7778

7879
public:
7980
RegBankLegalizeHelper(MachineIRBuilder &B, const MachineUniformityInfo &MUI,
@@ -124,6 +125,7 @@ class RegBankLegalizeHelper {
124125
void lowerSplitTo32Select(MachineInstr &MI);
125126
void lowerSplitTo32SExtInReg(MachineInstr &MI);
126127
void lowerUnpackMinMax(MachineInstr &MI);
128+
void lowerScalarizeV2S16(MachineInstr &MI);
127129
};
128130

129131
} // end namespace AMDGPU

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -471,13 +471,11 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
471471
.Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
472472
.Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
473473
.Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
474-
/// TODO: SALU does not support packed math addition. Scalarize into two S16 additions.
475-
.Uni(V2S16, {{SgprV2S16}, {Sgpr32AExt, Sgpr32AExt}})
474+
.Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, ScalarizeV2S16})
476475
.Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
477476
.Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr64}})
478477
.Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}});
479478

480-
/// TODO: Correct these rules, related to overflow detection.
481479
addRulesForGOpcs({G_UADDO, G_USUBO}, Standard)
482480
.Uni(S32, {{Sgpr32, Sgpr32Trunc}, {Sgpr32, Sgpr32}})
483481
.Div(S32, {{Vgpr32, Vcc}, {Vgpr32, Vgpr32}});

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -223,7 +223,8 @@ enum LoweringMethodID {
223223
UniCstExt,
224224
SplitLoad,
225225
WidenLoad,
226-
WidenMMOToS32
226+
WidenMMOToS32,
227+
ScalarizeV2S16
227228
};
228229

229230
enum FastRulesTypes {

llvm/test/CodeGen/AMDGPU/GlobalISel/add.ll

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -196,7 +196,9 @@ define i32 @v_add_i32(i32 %a, i32 %b) {
196196
ret i32 %c
197197
}
198198

199-
; TODO: Add test for s_add_v2i16
199+
; TODO: Add test for s_add_v2i16. Instruction selector currently fails
200+
; to handle G_UNMERGE_VALUES. Same in GlobalISel/sub.ll.
201+
200202
define <2 x i16> @v_add_v2i16(<2 x i16> %a, <2 x i16> %b) {
201203
; GFX7-LABEL: v_add_v2i16:
202204
; GFX7: ; %bb.0:

llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-add.v2s16.mir

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,12 @@ body: |
1414
; CHECK-NEXT: {{ $}}
1515
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr0
1616
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr1
17-
; CHECK-NEXT: [[ADD:%[0-9]+]]:sgpr(<2 x s16>) = G_ADD [[COPY]], [[COPY1]]
18-
; CHECK-NEXT: S_ENDPGM 0, implicit [[ADD]](<2 x s16>)
17+
; CHECK-NEXT: [[UV:%[0-9]+]]:sgpr(s16), [[UV1:%[0-9]+]]:sgpr(s16) = G_UNMERGE_VALUES [[COPY]](<2 x s16>)
18+
; CHECK-NEXT: [[UV2:%[0-9]+]]:sgpr(s16), [[UV3:%[0-9]+]]:sgpr(s16) = G_UNMERGE_VALUES [[COPY1]](<2 x s16>)
19+
; CHECK-NEXT: [[ADD:%[0-9]+]]:sgpr(s16) = G_ADD [[UV]], [[UV2]]
20+
; CHECK-NEXT: [[ADD1:%[0-9]+]]:sgpr(s16) = G_ADD [[UV1]], [[UV3]]
21+
; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR [[ADD]](s16), [[ADD1]](s16)
22+
; CHECK-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<2 x s16>)
1923
%0:_(<2 x s16>) = COPY $sgpr0
2024
%1:_(<2 x s16>) = COPY $sgpr1
2125
%2:_(<2 x s16>) = G_ADD %0, %1

llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sub.v2s16.mir

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
12
# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=amdgpu-regbankselect,amdgpu-regbanklegalize %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s
23
# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=amdgpu-regbankselect,amdgpu-regbanklegalize %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s
34

@@ -13,8 +14,12 @@ body: |
1314
; CHECK-NEXT: {{ $}}
1415
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr0
1516
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr1
16-
; CHECK-NEXT: [[SUB:%[0-9]+]]:sgpr(<2 x s16>) = G_SUB [[COPY]], [[COPY1]]
17-
; CHECK-NEXT: S_ENDPGM 0, implicit [[SUB]](<2 x s16>)
17+
; CHECK-NEXT: [[UV:%[0-9]+]]:sgpr(s16), [[UV1:%[0-9]+]]:sgpr(s16) = G_UNMERGE_VALUES [[COPY]](<2 x s16>)
18+
; CHECK-NEXT: [[UV2:%[0-9]+]]:sgpr(s16), [[UV3:%[0-9]+]]:sgpr(s16) = G_UNMERGE_VALUES [[COPY1]](<2 x s16>)
19+
; CHECK-NEXT: [[SUB:%[0-9]+]]:sgpr(s16) = G_SUB [[UV]], [[UV2]]
20+
; CHECK-NEXT: [[SUB1:%[0-9]+]]:sgpr(s16) = G_SUB [[UV1]], [[UV3]]
21+
; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR [[SUB]](s16), [[SUB1]](s16)
22+
; CHECK-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR]](<2 x s16>)
1823
%0:_(<2 x s16>) = COPY $sgpr0
1924
%1:_(<2 x s16>) = COPY $sgpr1
2025
%2:_(<2 x s16>) = G_SUB %0, %1

llvm/test/CodeGen/AMDGPU/GlobalISel/sub.ll

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -196,7 +196,9 @@ define i32 @v_sub_i32(i32 %a, i32 %b) {
196196
ret i32 %c
197197
}
198198

199-
; TODO: sub test for s_sub_v2i16
199+
; TODO: Add test for s_sub_v2i16. Instruction selector currently fails
200+
; to handle G_UNMERGE_VALUES.
201+
200202
define <2 x i16> @v_sub_v2i16(<2 x i16> %a, <2 x i16> %b) {
201203
; GFX7-LABEL: v_sub_v2i16:
202204
; GFX7: ; %bb.0:

0 commit comments

Comments
 (0)