Skip to content

Commit 09ffcc9

Browse files
committed
Revise lowerV2S16ViaS32Widening
1 parent 789eaad commit 09ffcc9

File tree

4 files changed

+97
-37
lines changed

4 files changed

+97
-37
lines changed

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp

Lines changed: 6 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -501,23 +501,11 @@ void RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) {
501501
}
502502

503503
void RegBankLegalizeHelper::lowerV2S16ViaS32Widening(MachineInstr &MI) {
504-
auto Op1 = B.buildUnmerge({SgprRB, S16}, MI.getOperand(1).getReg());
505-
auto Hi1 = Op1.getReg(0);
506-
auto Lo1 = Op1.getReg(1);
507-
auto Op2 = B.buildUnmerge({SgprRB, S16}, MI.getOperand(2).getReg());
508-
auto Hi2 = Op2.getReg(0);
509-
auto Lo2 = Op2.getReg(1);
510-
511-
auto CastHi1 = B.buildZExt(SgprRB_S32, Hi1);
512-
auto CastLo1 = B.buildZExt(SgprRB_S32, Lo1);
513-
auto CastHi2 = B.buildZExt(SgprRB_S32, Hi2);
514-
auto CastLo2 = B.buildZExt(SgprRB_S32, Lo2);
515-
516-
auto ResHi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {CastHi1, CastHi2});
517-
auto ResLo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {CastLo1, CastLo2});
518-
519-
B.buildBuildVectorTrunc(MI.getOperand(0).getReg(),
520-
{ResHi.getReg(0), ResLo.getReg(0)});
504+
auto [Op1Lo, Op1Hi] = unpackAExt(MI.getOperand(1).getReg());
505+
auto [Op2Lo, Op2Hi] = unpackAExt(MI.getOperand(2).getReg());
506+
auto ResLo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Lo, Op2Lo});
507+
auto ResHi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Op1Hi, Op2Hi});
508+
B.buildBuildVectorTrunc(MI.getOperand(0).getReg(),{ResLo.getReg(0), ResHi.getReg(0)});
521509
MI.eraseFromParent();
522510
}
523511

@@ -825,7 +813,7 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
825813
}
826814
break;
827815
}
828-
case ScalarizeV2S16:
816+
case WidenV2S16ToS32:
829817
return lowerV2S16ViaS32Widening(MI);
830818
case WidenMMOToS32:
831819
return widenMMOToS32(cast<GAnyLoad>(MI));

llvm/test/CodeGen/AMDGPU/GlobalISel/add.ll

Lines changed: 77 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -196,9 +196,83 @@ define i32 @v_add_i32(i32 %a, i32 %b) {
196196
ret i32 %c
197197
}
198198

199-
; TODO: Add test for s_add_v2i16. The current test does not work
200-
; due to a bug associated with
201-
; AMDGPURegBankLegalizeCombiner::tryEliminateReadAnyLane(..)
199+
define <2 x i16> @s_add_v2i16(<2 x i16> inreg %a, <2 x i16> inreg %b) {
200+
; GFX7-LABEL: s_add_v2i16:
201+
; GFX7: ; %bb.0:
202+
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
203+
; GFX7-NEXT: s_add_i32 s16, s16, s18
204+
; GFX7-NEXT: s_add_i32 s17, s17, s19
205+
; GFX7-NEXT: v_mov_b32_e32 v0, s16
206+
; GFX7-NEXT: v_mov_b32_e32 v1, s17
207+
; GFX7-NEXT: s_setpc_b64 s[30:31]
208+
;
209+
; GFX9-LABEL: s_add_v2i16:
210+
; GFX9: ; %bb.0:
211+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
212+
; GFX9-NEXT: s_lshr_b32 s4, s16, 16
213+
; GFX9-NEXT: s_lshr_b32 s5, s17, 16
214+
; GFX9-NEXT: s_add_i32 s16, s16, s17
215+
; GFX9-NEXT: s_add_i32 s4, s4, s5
216+
; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s4
217+
; GFX9-NEXT: v_mov_b32_e32 v0, s4
218+
; GFX9-NEXT: s_setpc_b64 s[30:31]
219+
;
220+
; GFX8-LABEL: s_add_v2i16:
221+
; GFX8: ; %bb.0:
222+
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
223+
; GFX8-NEXT: s_lshr_b32 s4, s16, 16
224+
; GFX8-NEXT: s_lshr_b32 s5, s17, 16
225+
; GFX8-NEXT: s_add_i32 s4, s4, s5
226+
; GFX8-NEXT: s_add_i32 s16, s16, s17
227+
; GFX8-NEXT: s_and_b32 s4, 0xffff, s4
228+
; GFX8-NEXT: s_and_b32 s5, 0xffff, s16
229+
; GFX8-NEXT: s_lshl_b32 s4, s4, 16
230+
; GFX8-NEXT: s_or_b32 s4, s5, s4
231+
; GFX8-NEXT: v_mov_b32_e32 v0, s4
232+
; GFX8-NEXT: s_setpc_b64 s[30:31]
233+
;
234+
; GFX10-LABEL: s_add_v2i16:
235+
; GFX10: ; %bb.0:
236+
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
237+
; GFX10-NEXT: s_lshr_b32 s4, s16, 16
238+
; GFX10-NEXT: s_lshr_b32 s5, s17, 16
239+
; GFX10-NEXT: s_add_i32 s16, s16, s17
240+
; GFX10-NEXT: s_add_i32 s4, s4, s5
241+
; GFX10-NEXT: s_pack_ll_b32_b16 s4, s16, s4
242+
; GFX10-NEXT: v_mov_b32_e32 v0, s4
243+
; GFX10-NEXT: s_setpc_b64 s[30:31]
244+
;
245+
; GFX11-LABEL: s_add_v2i16:
246+
; GFX11: ; %bb.0:
247+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
248+
; GFX11-NEXT: s_lshr_b32 s2, s0, 16
249+
; GFX11-NEXT: s_lshr_b32 s3, s1, 16
250+
; GFX11-NEXT: s_add_i32 s0, s0, s1
251+
; GFX11-NEXT: s_add_i32 s2, s2, s3
252+
; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s2
253+
; GFX11-NEXT: v_mov_b32_e32 v0, s0
254+
; GFX11-NEXT: s_setpc_b64 s[30:31]
255+
;
256+
; GFX12-LABEL: s_add_v2i16:
257+
; GFX12: ; %bb.0:
258+
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
259+
; GFX12-NEXT: s_wait_expcnt 0x0
260+
; GFX12-NEXT: s_wait_samplecnt 0x0
261+
; GFX12-NEXT: s_wait_bvhcnt 0x0
262+
; GFX12-NEXT: s_wait_kmcnt 0x0
263+
; GFX12-NEXT: s_lshr_b32 s2, s0, 16
264+
; GFX12-NEXT: s_lshr_b32 s3, s1, 16
265+
; GFX12-NEXT: s_add_co_i32 s0, s0, s1
266+
; GFX12-NEXT: s_wait_alu 0xfffe
267+
; GFX12-NEXT: s_add_co_i32 s2, s2, s3
268+
; GFX12-NEXT: s_wait_alu 0xfffe
269+
; GFX12-NEXT: s_pack_ll_b32_b16 s0, s0, s2
270+
; GFX12-NEXT: s_wait_alu 0xfffe
271+
; GFX12-NEXT: v_mov_b32_e32 v0, s0
272+
; GFX12-NEXT: s_setpc_b64 s[30:31]
273+
%c = add <2 x i16> %a, %b
274+
ret <2 x i16> %c
275+
}
202276

203277
define <2 x i16> @v_add_v2i16(<2 x i16> %a, <2 x i16> %b) {
204278
; GFX7-LABEL: v_add_v2i16:

llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-add.mir

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -188,14 +188,13 @@ body: |
188188
; CHECK-NEXT: {{ $}}
189189
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr0
190190
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr1
191-
; CHECK-NEXT: [[UV:%[0-9]+]]:sgpr(s16), [[UV1:%[0-9]+]]:sgpr(s16) = G_UNMERGE_VALUES [[COPY]](<2 x s16>)
192-
; CHECK-NEXT: [[UV2:%[0-9]+]]:sgpr(s16), [[UV3:%[0-9]+]]:sgpr(s16) = G_UNMERGE_VALUES [[COPY1]](<2 x s16>)
193-
; CHECK-NEXT: [[ZEXT:%[0-9]+]]:sgpr(s32) = G_ZEXT [[UV]](s16)
194-
; CHECK-NEXT: [[ZEXT1:%[0-9]+]]:sgpr(s32) = G_ZEXT [[UV1]](s16)
195-
; CHECK-NEXT: [[ZEXT2:%[0-9]+]]:sgpr(s32) = G_ZEXT [[UV2]](s16)
196-
; CHECK-NEXT: [[ZEXT3:%[0-9]+]]:sgpr(s32) = G_ZEXT [[UV3]](s16)
197-
; CHECK-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[ZEXT]], [[ZEXT2]]
198-
; CHECK-NEXT: [[ADD1:%[0-9]+]]:sgpr(s32) = G_ADD [[ZEXT1]], [[ZEXT3]]
191+
; CHECK-NEXT: [[BITCAST:%[0-9]+]]:sgpr(s32) = G_BITCAST [[COPY]](<2 x s16>)
192+
; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
193+
; CHECK-NEXT: [[LSHR:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST]], [[C]](s32)
194+
; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:sgpr(s32) = G_BITCAST [[COPY1]](<2 x s16>)
195+
; CHECK-NEXT: [[LSHR1:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
196+
; CHECK-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[BITCAST]], [[BITCAST1]]
197+
; CHECK-NEXT: [[ADD1:%[0-9]+]]:sgpr(s32) = G_ADD [[LSHR]], [[LSHR1]]
199198
; CHECK-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ADD]](s32), [[ADD1]](s32)
200199
; CHECK-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR_TRUNC]](<2 x s16>)
201200
%0:_(<2 x s16>) = COPY $sgpr0

llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-sub.mir

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -189,14 +189,13 @@ body: |
189189
; CHECK-NEXT: {{ $}}
190190
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr0
191191
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr1
192-
; CHECK-NEXT: [[UV:%[0-9]+]]:sgpr(s16), [[UV1:%[0-9]+]]:sgpr(s16) = G_UNMERGE_VALUES [[COPY]](<2 x s16>)
193-
; CHECK-NEXT: [[UV2:%[0-9]+]]:sgpr(s16), [[UV3:%[0-9]+]]:sgpr(s16) = G_UNMERGE_VALUES [[COPY1]](<2 x s16>)
194-
; CHECK-NEXT: [[ZEXT:%[0-9]+]]:sgpr(s32) = G_ZEXT [[UV]](s16)
195-
; CHECK-NEXT: [[ZEXT1:%[0-9]+]]:sgpr(s32) = G_ZEXT [[UV1]](s16)
196-
; CHECK-NEXT: [[ZEXT2:%[0-9]+]]:sgpr(s32) = G_ZEXT [[UV2]](s16)
197-
; CHECK-NEXT: [[ZEXT3:%[0-9]+]]:sgpr(s32) = G_ZEXT [[UV3]](s16)
198-
; CHECK-NEXT: [[SUB:%[0-9]+]]:sgpr(s32) = G_SUB [[ZEXT]], [[ZEXT2]]
199-
; CHECK-NEXT: [[SUB1:%[0-9]+]]:sgpr(s32) = G_SUB [[ZEXT1]], [[ZEXT3]]
192+
; CHECK-NEXT: [[BITCAST:%[0-9]+]]:sgpr(s32) = G_BITCAST [[COPY]](<2 x s16>)
193+
; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
194+
; CHECK-NEXT: [[LSHR:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST]], [[C]](s32)
195+
; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:sgpr(s32) = G_BITCAST [[COPY1]](<2 x s16>)
196+
; CHECK-NEXT: [[LSHR1:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
197+
; CHECK-NEXT: [[SUB:%[0-9]+]]:sgpr(s32) = G_SUB [[BITCAST]], [[BITCAST1]]
198+
; CHECK-NEXT: [[SUB1:%[0-9]+]]:sgpr(s32) = G_SUB [[LSHR]], [[LSHR1]]
200199
; CHECK-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[SUB]](s32), [[SUB1]](s32)
201200
; CHECK-NEXT: S_ENDPGM 0, implicit [[BUILD_VECTOR_TRUNC]](<2 x s16>)
202201
%0:_(<2 x s16>) = COPY $sgpr0

0 commit comments

Comments
 (0)