Skip to content

Commit b2c7a76

Browse files
abhigargrepo authored and broxigarchen committed
CodeGen using True16 D16 LDS ld/st pseudo instructions
Implement new pseudo instructions with the suffix _t16, which use VGPR_16 as the store source or load destination. This affects 8-bit and 16-bit LDS loads and stores. The pseudos are lowered in the MCInst layer to the existing real instructions with a VGPR_32 src or dst (which keeps them consistent with the hardware encoding). This patch reduces VGPR usage by making the hi halves of VGPRs available for other values. Lit tests are updated accordingly.
1 parent 5265412 commit b2c7a76

File tree

8 files changed

+1043
-305
lines changed

8 files changed

+1043
-305
lines changed

llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,47 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
187187
OutMI.addOperand(Dest);
188188
OutMI.addOperand(Src);
189189
return;
190+
} else if (const auto *Info = AMDGPU::getT16D16Helper(Opcode)) {
191+
uint16_t OpName = AMDGPU::OpName::OPERAND_LAST;
192+
if (TII->isDS(Opcode)) {
193+
if (MI->mayLoad())
194+
OpName = llvm::AMDGPU::OpName::vdst;
195+
else if (MI->mayStore())
196+
OpName = llvm::AMDGPU::OpName::data0;
197+
else
198+
llvm_unreachable("LDS load or store expected");
199+
} else {
200+
OpName = AMDGPU::hasNamedOperand(Opcode, llvm::AMDGPU::OpName::vdata)
201+
? llvm::AMDGPU::OpName::vdata
202+
: llvm::AMDGPU::OpName::vdst;
203+
}
204+
int VDstOrVDataIdx = AMDGPU::getNamedOperandIdx(Opcode, OpName);
205+
MachineOperand MIVDstOrVData = MI->getOperand(VDstOrVDataIdx);
206+
bool IsHi = AMDGPU::isHi16Reg(MIVDstOrVData.getReg(), TRI);
207+
Opcode = IsHi ? Info->HiOp : Info->LoOp;
208+
MIVDstOrVData.clearParent(); // Avoid use list error in setReg call
209+
MIVDstOrVData.setReg(TRI.get32BitRegister(MIVDstOrVData.getReg()));
210+
211+
int MCOpcode = TII->pseudoToMCOpcode(Opcode);
212+
assert(MCOpcode != -1 &&
213+
"Pseudo instruction doesn't have a target-specific version");
214+
OutMI.setOpcode(MCOpcode);
215+
for (int I = 0, E = MI->getNumExplicitOperands(); I < E; I++) {
216+
const MachineOperand &MO = MI->getOperand(I);
217+
MCOperand MCOp;
218+
if (I == VDstOrVDataIdx)
219+
lowerOperand(MIVDstOrVData, MCOp);
220+
else
221+
lowerOperand(MO, MCOp);
222+
OutMI.addOperand(MCOp);
223+
}
224+
225+
if (AMDGPU::hasNamedOperand(MCOpcode, AMDGPU::OpName::vdst_in)) {
226+
MCOperand MCOp;
227+
lowerOperand(MIVDstOrVData, MCOp);
228+
OutMI.addOperand(MCOp);
229+
}
230+
return;
190231
} else if (Opcode == AMDGPU::SI_TCRETURN ||
191232
Opcode == AMDGPU::SI_TCRETURN_GFX) {
192233
// TODO: How to use branch immediate and avoid register+add?

llvm/lib/Target/AMDGPU/DSInstructions.td

Lines changed: 70 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,15 @@ multiclass DS_1A1D_NORET_mc<string opName, RegisterClass rc = VGPR_32> {
127127
}
128128
}
129129

130+
multiclass DS_1A1D_NORET_t16<string opName, RegisterClass rc = VGPR_32>
131+
: DS_1A1D_NORET_mc<opName, rc> {
132+
let has_m0_read = 0 in {
133+
let True16Predicate = UseRealTrue16Insts in {
134+
def "_t16" : DS_1A1D_NORET<opName#"_t16", VGPR_16>, True16D16Table<NAME#"_D16_HI", NAME>;
135+
}
136+
}
137+
}
138+
130139
multiclass DS_1A1D_NORET_mc_gfx9<string opName, RegisterClass rc = VGPR_32> {
131140
let has_m0_read = 0 in {
132141
def "" : DS_1A1D_NORET<opName, rc>;
@@ -294,6 +303,15 @@ multiclass DS_1A_RET_mc<string opName, RegisterClass rc = VGPR_32, bit HasTiedOu
294303
}
295304
}
296305

306+
multiclass DS_1A_RET_t16<string opName, RegisterClass rc = VGPR_32, bit HasTiedOutput = 0, Operand ofs = Offset>
307+
: DS_1A_RET_mc<opName, rc, HasTiedOutput, ofs> {
308+
let has_m0_read = 0 in {
309+
let True16Predicate = UseRealTrue16Insts in {
310+
def "_t16" : DS_1A_RET<opName#"_t16", VGPR_16, HasTiedOutput, ofs>, True16D16Table<NAME#"_D16_HI", NAME#"_D16">;
311+
}
312+
}
313+
}
314+
297315
multiclass DS_1A_RET_NoM0<string opName, RegisterClass rc = VGPR_32> {
298316
let has_m0_read = 0 in {
299317
def "" : DS_1A_RET<opName, rc>;
@@ -457,8 +475,6 @@ defm DS_MIN_F32 : DS_1A1D_NORET_mc<"ds_min_f32">;
457475
defm DS_MAX_F32 : DS_1A1D_NORET_mc<"ds_max_f32">;
458476

459477
let mayLoad = 0 in {
460-
defm DS_WRITE_B8 : DS_1A1D_NORET_mc<"ds_write_b8">;
461-
defm DS_WRITE_B16 : DS_1A1D_NORET_mc<"ds_write_b16">;
462478
defm DS_WRITE_B32 : DS_1A1D_NORET_mc<"ds_write_b32">;
463479
defm DS_WRITE2_B32 : DS_1A2D_Off8_NORET_mc<"ds_write2_b32">;
464480
defm DS_WRITE2ST64_B32: DS_1A2D_Off8_NORET_mc<"ds_write2st64_b32">;
@@ -473,6 +489,9 @@ def DS_WRITE_B16_D16_HI : DS_1A1D_NORET<"ds_write_b16_d16_hi">;
473489

474490
} // End has_m0_read = 0
475491

492+
defm DS_WRITE_B8 : DS_1A1D_NORET_t16<"ds_write_b8">;
493+
defm DS_WRITE_B16 : DS_1A1D_NORET_t16<"ds_write_b16">;
494+
476495
let SubtargetPredicate = HasDSAddTid in {
477496
def DS_WRITE_ADDTID_B32 : DS_0A1D_NORET<"ds_write_addtid_b32">;
478497
}
@@ -625,10 +644,7 @@ def DS_SWIZZLE_B32 : DS_1A_RET <"ds_swizzle_b32", VGPR_32, 0, Swizzle>;
625644
}
626645

627646
let mayStore = 0 in {
628-
defm DS_READ_I8 : DS_1A_RET_mc<"ds_read_i8">;
629-
defm DS_READ_U8 : DS_1A_RET_mc<"ds_read_u8">;
630647
defm DS_READ_I16 : DS_1A_RET_mc<"ds_read_i16">;
631-
defm DS_READ_U16 : DS_1A_RET_mc<"ds_read_u16">;
632648
defm DS_READ_B32 : DS_1A_RET_mc<"ds_read_b32">;
633649
defm DS_READ_B64 : DS_1A_RET_mc<"ds_read_b64", VReg_64>;
634650

@@ -649,6 +665,10 @@ def DS_READ_U16_D16_HI : DS_1A_RET_Tied<"ds_read_u16_d16_hi">;
649665
}
650666
} // End has_m0_read = 0
651667

668+
defm DS_READ_I8 : DS_1A_RET_t16<"ds_read_i8">;
669+
defm DS_READ_U8 : DS_1A_RET_t16<"ds_read_u8">;
670+
defm DS_READ_U16 : DS_1A_RET_t16<"ds_read_u16">;
671+
652672
let SubtargetPredicate = HasDSAddTid in {
653673
def DS_READ_ADDTID_B32 : DS_0A_RET<"ds_read_addtid_b32">;
654674
}
@@ -784,34 +804,51 @@ multiclass DSReadPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
784804
}
785805
}
786806

807+
multiclass DSReadPat_t16<DS_Pseudo inst, ValueType vt, string frag> {
808+
809+
let OtherPredicates = [LDSRequiresM0Init] in {
810+
def : DSReadPat<inst, vt, !cast<PatFrag>(frag#"_m0")>;
811+
}
812+
813+
let OtherPredicates = [NotLDSRequiresM0Init] in {
814+
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
815+
let True16Predicate = p in {
816+
def : DSReadPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, !cast<PatFrag>(frag)>;
817+
}
818+
let True16Predicate = UseRealTrue16Insts in {
819+
def : DSReadPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_t16"), vt, !cast<PatFrag>(frag)>;
820+
}
821+
}
822+
}
823+
787824
class DSReadPat_D16 <DS_Pseudo inst, PatFrag frag, ValueType vt> : GCNPat <
788825
(frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$in),
789826
(inst $ptr, Offset:$offset, (i1 0), $in)
790827
>;
791828

792829
defm : DSReadPat_mc <DS_READ_I8, i32, "sextloadi8_local">;
793-
defm : DSReadPat_mc <DS_READ_I8, i16, "sextloadi8_local">;
794830
defm : DSReadPat_mc <DS_READ_U8, i32, "extloadi8_local">;
795831
defm : DSReadPat_mc <DS_READ_U8, i32, "zextloadi8_local">;
796-
defm : DSReadPat_mc <DS_READ_U8, i16, "extloadi8_local">;
797-
defm : DSReadPat_mc <DS_READ_U8, i16, "zextloadi8_local">;
798832
defm : DSReadPat_mc <DS_READ_I16, i32, "sextloadi16_local">;
799833
defm : DSReadPat_mc <DS_READ_I16, i32, "sextloadi16_local">;
800834
defm : DSReadPat_mc <DS_READ_U16, i32, "extloadi16_local">;
801835
defm : DSReadPat_mc <DS_READ_U16, i32, "zextloadi16_local">;
802-
defm : DSReadPat_mc <DS_READ_U16, i16, "load_local">;
836+
defm : DSReadPat_t16 <DS_READ_I8, i16, "sextloadi8_local">;
837+
defm : DSReadPat_t16 <DS_READ_U8, i16, "extloadi8_local">;
838+
defm : DSReadPat_t16 <DS_READ_U8, i16, "zextloadi8_local">;
839+
defm : DSReadPat_t16 <DS_READ_U16, i16, "load_local">;
803840

804841
foreach vt = Reg32Types.types in {
805842
defm : DSReadPat_mc <DS_READ_B32, vt, "load_local">;
806843
}
807844

808-
defm : DSReadPat_mc <DS_READ_U8, i16, "atomic_load_8_local">;
845+
defm : DSReadPat_t16 <DS_READ_U8, i16, "atomic_load_8_local">;
809846
defm : DSReadPat_mc <DS_READ_U8, i32, "atomic_load_8_local">;
810-
defm : DSReadPat_mc <DS_READ_U8, i16, "atomic_load_zext_8_local">;
847+
defm : DSReadPat_t16 <DS_READ_U8, i16, "atomic_load_zext_8_local">;
811848
defm : DSReadPat_mc <DS_READ_U8, i32, "atomic_load_zext_8_local">;
812-
defm : DSReadPat_mc <DS_READ_I8, i16, "atomic_load_sext_8_local">;
849+
defm : DSReadPat_t16 <DS_READ_I8, i16, "atomic_load_sext_8_local">;
813850
defm : DSReadPat_mc <DS_READ_I8, i32, "atomic_load_sext_8_local">;
814-
defm : DSReadPat_mc <DS_READ_U16, i16, "atomic_load_16_local">;
851+
defm : DSReadPat_t16 <DS_READ_U16, i16, "atomic_load_16_local">;
815852
defm : DSReadPat_mc <DS_READ_U16, i32, "atomic_load_16_local">;
816853
defm : DSReadPat_mc <DS_READ_U16, i32, "atomic_load_zext_16_local">;
817854
defm : DSReadPat_mc <DS_READ_I16, i32, "atomic_load_sext_16_local">;
@@ -850,18 +887,34 @@ multiclass DSWritePat_mc <DS_Pseudo inst, ValueType vt, string frag> {
850887
}
851888
}
852889

890+
multiclass DSWritePat_t16 <DS_Pseudo inst, ValueType vt, string frag> {
891+
let OtherPredicates = [LDSRequiresM0Init] in {
892+
def : DSWritePat<inst, vt, !cast<PatFrag>(frag#"_m0")>;
893+
}
894+
895+
let OtherPredicates = [NotLDSRequiresM0Init] in {
896+
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
897+
let True16Predicate = p in {
898+
def : DSWritePat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, !cast<PatFrag>(frag)>;
899+
}
900+
let True16Predicate = UseRealTrue16Insts in {
901+
def : DSWritePat<!cast<DS_Pseudo>(!cast<string>(inst)#"_t16"), vt, !cast<PatFrag>(frag)>;
902+
}
903+
}
904+
}
905+
853906
defm : DSWritePat_mc <DS_WRITE_B8, i32, "truncstorei8_local">;
854907
defm : DSWritePat_mc <DS_WRITE_B16, i32, "truncstorei16_local">;
855-
defm : DSWritePat_mc <DS_WRITE_B8, i16, "truncstorei8_local">;
856-
defm : DSWritePat_mc <DS_WRITE_B16, i16, "store_local">;
908+
defm : DSWritePat_t16 <DS_WRITE_B8, i16, "truncstorei8_local">;
909+
defm : DSWritePat_t16 <DS_WRITE_B16, i16, "store_local">;
857910

858911
foreach vt = Reg32Types.types in {
859912
defm : DSWritePat_mc <DS_WRITE_B32, vt, "store_local">;
860913
}
861914

862-
defm : DSWritePat_mc <DS_WRITE_B8, i16, "atomic_store_8_local">;
915+
defm : DSWritePat_t16 <DS_WRITE_B8, i16, "atomic_store_8_local">;
863916
defm : DSWritePat_mc <DS_WRITE_B8, i32, "atomic_store_8_local">;
864-
defm : DSWritePat_mc <DS_WRITE_B16, i16, "atomic_store_16_local">;
917+
defm : DSWritePat_t16 <DS_WRITE_B16, i16, "atomic_store_16_local">;
865918
defm : DSWritePat_mc <DS_WRITE_B16, i32, "atomic_store_16_local">;
866919
defm : DSWritePat_mc <DS_WRITE_B32, i32, "atomic_store_32_local">;
867920
defm : DSWritePat_mc <DS_WRITE_B64, i64, "atomic_store_64_local">;

llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll

Lines changed: 33 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -239,48 +239,53 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3
239239
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
240240
; GFX11-NEXT: s_and_b32 s6, 0xffff, s0
241241
; GFX11-NEXT: s_lshr_b32 s5, s0, 16
242-
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s4
242+
; GFX11-NEXT: v_mov_b16_e32 v0.l, s0
243+
; GFX11-NEXT: v_mov_b32_e32 v5, s4
243244
; GFX11-NEXT: s_lshr_b32 s0, s1, 16
244245
; GFX11-NEXT: s_and_b32 s4, 0xffff, s1
245-
; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s2
246+
; GFX11-NEXT: v_mov_b16_e32 v0.h, s1
246247
; GFX11-NEXT: s_lshr_b32 s1, s2, 16
247248
; GFX11-NEXT: s_and_b32 s7, 0xffff, s2
249+
; GFX11-NEXT: v_mov_b16_e32 v1.l, s2
248250
; GFX11-NEXT: s_lshr_b32 s2, s6, 8
249251
; GFX11-NEXT: s_lshr_b32 s6, s5, 8
250-
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
251-
; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s6
252-
; GFX11-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s0
252+
; GFX11-NEXT: v_mov_b16_e32 v2.h, s2
253+
; GFX11-NEXT: v_mov_b16_e32 v1.h, s5
253254
; GFX11-NEXT: s_lshr_b32 s4, s4, 8
254255
; GFX11-NEXT: s_lshr_b32 s5, s0, 8
256+
; GFX11-NEXT: v_mov_b16_e32 v2.l, s0
255257
; GFX11-NEXT: s_lshr_b32 s0, s7, 8
256-
; GFX11-NEXT: v_dual_mov_b32 v8, s4 :: v_dual_mov_b32 v9, s5
257-
; GFX11-NEXT: ds_store_b8 v1, v0
258-
; GFX11-NEXT: ds_store_b8 v1, v6 offset:1
259-
; GFX11-NEXT: ds_store_b8 v1, v4 offset:2
260-
; GFX11-NEXT: ds_store_b8 v1, v7 offset:3
261-
; GFX11-NEXT: ds_store_b8 v1, v2 offset:4
262-
; GFX11-NEXT: ds_store_b8 v1, v8 offset:5
263-
; GFX11-NEXT: ds_store_b8 v1, v5 offset:6
264-
; GFX11-NEXT: ds_store_b8 v1, v9 offset:7
265-
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s3
258+
; GFX11-NEXT: v_mov_b16_e32 v3.l, s6
259+
; GFX11-NEXT: v_mov_b16_e32 v3.h, s4
260+
; GFX11-NEXT: v_mov_b16_e32 v4.l, s5
261+
; GFX11-NEXT: ds_store_b8 v5, v0
262+
; GFX11-NEXT: ds_store_b8_d16_hi v5, v2 offset:1
263+
; GFX11-NEXT: ds_store_b8_d16_hi v5, v1 offset:2
264+
; GFX11-NEXT: ds_store_b8 v5, v3 offset:3
265+
; GFX11-NEXT: ds_store_b8_d16_hi v5, v0 offset:4
266+
; GFX11-NEXT: ds_store_b8_d16_hi v5, v3 offset:5
267+
; GFX11-NEXT: ds_store_b8 v5, v2 offset:6
268+
; GFX11-NEXT: ds_store_b8 v5, v4 offset:7
269+
; GFX11-NEXT: v_mov_b16_e32 v0.l, s0
266270
; GFX11-NEXT: s_lshr_b32 s0, s1, 8
267-
; GFX11-NEXT: v_mov_b32_e32 v2, s1
268-
; GFX11-NEXT: v_mov_b32_e32 v4, s0
271+
; GFX11-NEXT: v_mov_b16_e32 v0.h, s1
272+
; GFX11-NEXT: v_mov_b16_e32 v1.h, s0
269273
; GFX11-NEXT: s_and_b32 s0, 0xffff, s3
270274
; GFX11-NEXT: s_lshr_b32 s1, s3, 16
271275
; GFX11-NEXT: s_lshr_b32 s0, s0, 8
272-
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
273-
; GFX11-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
276+
; GFX11-NEXT: v_mov_b16_e32 v2.l, s3
277+
; GFX11-NEXT: v_mov_b16_e32 v2.h, s0
274278
; GFX11-NEXT: s_lshr_b32 s0, s1, 8
275-
; GFX11-NEXT: v_mov_b32_e32 v8, s0
276-
; GFX11-NEXT: ds_store_b8 v1, v3 offset:8
277-
; GFX11-NEXT: ds_store_b8 v1, v0 offset:9
278-
; GFX11-NEXT: ds_store_b8 v1, v2 offset:10
279-
; GFX11-NEXT: ds_store_b8 v1, v4 offset:11
280-
; GFX11-NEXT: ds_store_b8 v1, v5 offset:12
281-
; GFX11-NEXT: ds_store_b8 v1, v6 offset:13
282-
; GFX11-NEXT: ds_store_b8 v1, v7 offset:14
283-
; GFX11-NEXT: ds_store_b8 v1, v8 offset:15
279+
; GFX11-NEXT: v_mov_b16_e32 v3.l, s1
280+
; GFX11-NEXT: v_mov_b16_e32 v3.h, s0
281+
; GFX11-NEXT: ds_store_b8 v5, v1 offset:8
282+
; GFX11-NEXT: ds_store_b8 v5, v0 offset:9
283+
; GFX11-NEXT: ds_store_b8_d16_hi v5, v0 offset:10
284+
; GFX11-NEXT: ds_store_b8_d16_hi v5, v1 offset:11
285+
; GFX11-NEXT: ds_store_b8 v5, v2 offset:12
286+
; GFX11-NEXT: ds_store_b8_d16_hi v5, v2 offset:13
287+
; GFX11-NEXT: ds_store_b8 v5, v3 offset:14
288+
; GFX11-NEXT: ds_store_b8_d16_hi v5, v3 offset:15
284289
; GFX11-NEXT: s_endpgm
285290
store <4 x i32> %x, ptr addrspace(3) %out, align 1
286291
ret void

llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll

Lines changed: 25 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -207,36 +207,42 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3
207207
; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0
208208
; GFX11-NEXT: s_and_b32 s5, 0xffff, s0
209209
; GFX11-NEXT: s_lshr_b32 s4, s0, 16
210+
; GFX11-NEXT: v_mov_b16_e32 v0.l, s0
210211
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
211-
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s3
212+
; GFX11-NEXT: v_mov_b32_e32 v6, s3
212213
; GFX11-NEXT: s_lshr_b32 s0, s1, 16
213214
; GFX11-NEXT: s_and_b32 s3, 0xffff, s1
214-
; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s2
215+
; GFX11-NEXT: v_mov_b16_e32 v0.h, s1
215216
; GFX11-NEXT: s_lshr_b32 s1, s2, 16
216217
; GFX11-NEXT: s_and_b32 s6, 0xffff, s2
218+
; GFX11-NEXT: v_mov_b16_e32 v1.l, s2
217219
; GFX11-NEXT: s_lshr_b32 s2, s5, 8
218220
; GFX11-NEXT: s_lshr_b32 s5, s4, 8
219-
; GFX11-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v7, s2
220-
; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s0
221+
; GFX11-NEXT: v_mov_b16_e32 v3.l, s2
222+
; GFX11-NEXT: v_mov_b16_e32 v1.h, s4
221223
; GFX11-NEXT: s_lshr_b32 s3, s3, 8
222224
; GFX11-NEXT: s_lshr_b32 s4, s0, 8
225+
; GFX11-NEXT: v_mov_b16_e32 v2.l, s0
223226
; GFX11-NEXT: s_lshr_b32 s0, s6, 8
224227
; GFX11-NEXT: s_lshr_b32 s6, s1, 8
225-
; GFX11-NEXT: v_dual_mov_b32 v8, s5 :: v_dual_mov_b32 v9, s3
226-
; GFX11-NEXT: v_dual_mov_b32 v10, s4 :: v_dual_mov_b32 v11, s0
227-
; GFX11-NEXT: v_mov_b32_e32 v12, s6
228-
; GFX11-NEXT: ds_store_b8 v1, v0
229-
; GFX11-NEXT: ds_store_b8 v1, v7 offset:1
230-
; GFX11-NEXT: ds_store_b8 v1, v4 offset:2
231-
; GFX11-NEXT: ds_store_b8 v1, v8 offset:3
232-
; GFX11-NEXT: ds_store_b8 v1, v2 offset:4
233-
; GFX11-NEXT: ds_store_b8 v1, v9 offset:5
234-
; GFX11-NEXT: ds_store_b8 v1, v5 offset:6
235-
; GFX11-NEXT: ds_store_b8 v1, v10 offset:7
236-
; GFX11-NEXT: ds_store_b8 v1, v3 offset:8
237-
; GFX11-NEXT: ds_store_b8 v1, v11 offset:9
238-
; GFX11-NEXT: ds_store_b8 v1, v6 offset:10
239-
; GFX11-NEXT: ds_store_b8 v1, v12 offset:11
228+
; GFX11-NEXT: v_mov_b16_e32 v3.h, s5
229+
; GFX11-NEXT: v_mov_b16_e32 v2.h, s1
230+
; GFX11-NEXT: v_mov_b16_e32 v4.l, s3
231+
; GFX11-NEXT: v_mov_b16_e32 v4.h, s4
232+
; GFX11-NEXT: v_mov_b16_e32 v5.l, s0
233+
; GFX11-NEXT: v_mov_b16_e32 v5.h, s6
234+
; GFX11-NEXT: ds_store_b8 v6, v0
235+
; GFX11-NEXT: ds_store_b8 v6, v3 offset:1
236+
; GFX11-NEXT: ds_store_b8_d16_hi v6, v1 offset:2
237+
; GFX11-NEXT: ds_store_b8_d16_hi v6, v3 offset:3
238+
; GFX11-NEXT: ds_store_b8_d16_hi v6, v0 offset:4
239+
; GFX11-NEXT: ds_store_b8 v6, v4 offset:5
240+
; GFX11-NEXT: ds_store_b8 v6, v2 offset:6
241+
; GFX11-NEXT: ds_store_b8_d16_hi v6, v4 offset:7
242+
; GFX11-NEXT: ds_store_b8 v6, v1 offset:8
243+
; GFX11-NEXT: ds_store_b8 v6, v5 offset:9
244+
; GFX11-NEXT: ds_store_b8_d16_hi v6, v2 offset:10
245+
; GFX11-NEXT: ds_store_b8_d16_hi v6, v5 offset:11
240246
; GFX11-NEXT: s_endpgm
241247
store <3 x i32> %x, ptr addrspace(3) %out, align 1
242248
ret void

0 commit comments

Comments
 (0)