Skip to content

Commit af80984

Browse files
authored
Merge branch 'main' into users/rampitec/07-30-_amdgpu_add_v_ashr_pk_i8_i32_and_v_ashr_pk_u8_i32_on_gfx1250
2 parents b0f03f1 + 7d23323 commit af80984

File tree

13 files changed

+960
-372
lines changed

13 files changed

+960
-372
lines changed

llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp

Lines changed: 116 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -413,6 +413,116 @@ static bool isTlsAddressCode(uint8_t DW_OP_Code) {
413413
DW_OP_Code == dwarf::DW_OP_GNU_push_tls_address;
414414
}
415415

416+
/// Populate \p SeqOffToOrigRow with a mapping from line-table sequence offsets
/// (the values referenced by the unit's stmt-sequence attributes) to the index
/// of the first row of that sequence in the original line table \p LT.
///
/// Two sources of information are merged:
///  1. The sequences the DWARF parser identified (LT.Sequences). These are
///     treated as ground truth.
///  2. A manual reconstruction that pairs the sorted stmt-sequence attribute
///     values of \p Unit with the sequence start rows derived from the
///     EndSequence flags in LT.Rows. This covers sequences the parser might
///     have missed.
///
/// \param Unit            Compile unit whose stmt-sequence attributes are read.
/// \param LT              Original (input) line table of the unit.
/// \param SeqOffToOrigRow Output map; entries are added or overwritten. The key
///                        type is uint64_t so that it matches both the offsets
///                        stored in the line table and the map owned by the
///                        caller on every host, including those where size_t
///                        is not the same type as uint64_t.
static void constructSeqOffsettoOrigRowMapping(
    CompileUnit &Unit, const DWARFDebugLine::LineTable &LT,
    DenseMap<uint64_t, unsigned> &SeqOffToOrigRow) {
  // Without rows there are no sequence boundaries to reconstruct. Bailing out
  // here also keeps drop_back() below from being applied to an empty range.
  if (LT.Rows.empty())
    return;

  // Use std::map for ordered iteration.
  std::map<uint64_t, unsigned> LineTableMapping;

  // First, trust the sequences that the DWARF parser did identify.
  for (const DWARFDebugLine::Sequence &Seq : LT.Sequences)
    LineTableMapping[Seq.StmtSeqOffset] = Seq.FirstRowIndex;

  // Second, manually find sequence boundaries and match them to the
  // sorted attributes to handle sequences the parser might have missed.
  auto StmtAttrs = Unit.getStmtSeqListAttributes();
  llvm::sort(StmtAttrs, [](const PatchLocation &A, const PatchLocation &B) {
    return A.get() < B.get();
  });

  // Row 0 starts the first sequence; every row that follows an EndSequence row
  // starts another one. The last row is excluded because nothing follows it.
  std::vector<size_t> SeqStartRows;
  SeqStartRows.push_back(0);
  for (auto [I, Row] : llvm::enumerate(ArrayRef(LT.Rows).drop_back()))
    if (Row.EndSequence)
      SeqStartRows.push_back(I + 1);

  // While SeqOffToOrigRow parsed from CU could be the ground truth,
  // e.g.
  //
  // SeqOff     Row
  // 0x08        9
  // 0x14       15
  //
  // The StmtAttrs and SeqStartRows may not match perfectly, e.g.
  //
  // StmtAttrs  SeqStartRows
  // 0x04        3
  // 0x08        5
  // 0x10        9
  // 0x12       11
  // 0x14       15
  //
  // In this case, we don't want to assign 5 to 0x08, since we know 0x08
  // maps to 9. If we do a dummy 1:1 mapping 0x10 will be mapped to 9
  // which is incorrect. The expected behavior is to ignore 5 and realign the
  // table based on the result from the line table:
  //
  // StmtAttrs  SeqStartRows
  // 0x04        3
  //   --        5
  // 0x08        9 <- LineTableMapping ground truth
  // 0x10       11
  // 0x12       --
  // 0x14       15 <- LineTableMapping ground truth

  ArrayRef StmtAttrsRef(StmtAttrs);
  ArrayRef SeqStartRowsRef(SeqStartRows);

  // Dummy last element to make sure StmtAttrsRef and SeqStartRowsRef always
  // run out first. The key is declared with the map's key type (uint64_t)
  // rather than size_t so the sentinel is not truncated on 32-bit hosts.
  constexpr uint64_t DummyKey = UINT64_MAX;
  constexpr unsigned DummyVal = UINT32_MAX;
  LineTableMapping[DummyKey] = DummyVal;

  for (const auto &Entry : LineTableMapping) {
    // Plain locals (instead of structured bindings) so that the lambdas below
    // can capture them in C++17.
    const uint64_t NextSeqOff = Entry.first;
    const unsigned NextRow = Entry.second;

    auto StmtAttrSmallerThanNext = [NextSeqOff](const PatchLocation &SA) {
      return SA.get() < NextSeqOff;
    };
    auto SeqStartSmallerThanNext = [NextRow](size_t Row) {
      return Row < NextRow;
    };

    // If both StmtAttrs and SeqStartRows point to values not yet covered by
    // the LineTableMapping, we do a dummy one-to-one mapping and move both
    // pointers.
    while (!StmtAttrsRef.empty() && !SeqStartRowsRef.empty() &&
           StmtAttrSmallerThanNext(StmtAttrsRef.front()) &&
           SeqStartSmallerThanNext(SeqStartRowsRef.front())) {
      SeqOffToOrigRow[StmtAttrsRef.consume_front().get()] =
          SeqStartRowsRef.consume_front();
    }

    // One of the pointers points to a value at or past Next in the
    // LineTableMapping. Move the other one forward to re-align with the
    // LineTableMapping.
    StmtAttrsRef = StmtAttrsRef.drop_while(StmtAttrSmallerThanNext);
    SeqStartRowsRef = SeqStartRowsRef.drop_while(SeqStartSmallerThanNext);

    // Use the LineTableMapping's result as the ground truth and move on. The
    // sentinel entry itself must never leak into the output map.
    if (NextSeqOff != DummyKey)
      SeqOffToOrigRow[NextSeqOff] = NextRow;

    // Move the pointers if they are pointed at Next.
    // It is possible that they point to later entries in LineTableMapping.
    // Therefore we only increment the pointers after we validate they are
    // pointing to the `Next` entry. e.g.
    //
    // LineTableMapping
    // SeqOff     Row
    // 0x08        9    <- NextSeqOff/NextRow
    // 0x14       15
    //
    // StmtAttrs  SeqStartRows
    // 0x14       13    <- StmtAttrsRef.front() / SeqStartRowsRef.front()
    // 0x16       15
    //  --        17
    if (!StmtAttrsRef.empty() && StmtAttrsRef.front().get() == NextSeqOff)
      StmtAttrsRef.consume_front();
    if (!SeqStartRowsRef.empty() && SeqStartRowsRef.front() == NextRow)
      SeqStartRowsRef.consume_front();
  }
}
525+
416526
std::pair<bool, std::optional<int64_t>>
417527
DWARFLinker::getVariableRelocAdjustment(AddressesMap &RelocMgr,
418528
const DWARFDie &DIE) {
@@ -2297,8 +2407,12 @@ void DWARFLinker::DIECloner::generateLineTableForUnit(CompileUnit &Unit) {
22972407

22982408
// Create a map of stmt sequence offsets to original row indices.
22992409
DenseMap<uint64_t, unsigned> SeqOffToOrigRow;
2300-
for (const DWARFDebugLine::Sequence &Seq : LT->Sequences)
2301-
SeqOffToOrigRow[Seq.StmtSeqOffset] = Seq.FirstRowIndex;
2410+
// The DWARF parser's discovery of sequences can be incomplete. To
2411+
// ensure all DW_AT_LLVM_stmt_sequence attributes can be patched, we
2412+
// build a map from both the parser's results and a manual
2413+
// reconstruction.
2414+
if (!LT->Rows.empty())
2415+
constructSeqOffsettoOrigRowMapping(Unit, *LT, SeqOffToOrigRow);
23022416

23032417
// Create a map of original row indices to new row indices.
23042418
DenseMap<size_t, size_t> OrigRowToNewRow;

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7038,13 +7038,13 @@ void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_2_0(
70387038
MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
70397039
assert(OpIdx >= 0 && "expected to match an immediate operand");
70407040
MIB.addImm(
7041-
(MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
7041+
(MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
70427042
}
70437043

70447044
void AMDGPUInstructionSelector::renderDstSelToOpSel3XFormXForm(
70457045
MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
70467046
assert(OpIdx >= 0 && "expected to match an immediate operand");
7047-
MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1)
7047+
MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
70487048
? (int64_t)SISrcMods::DST_OP_SEL
70497049
: 0);
70507050
}

llvm/lib/Target/AMDGPU/VOP3Instructions.td

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1015,8 +1015,10 @@ class SrcAndDstSelToOpSelXForm<int modifier_idx, bit dest_sel> : SDNodeXForm<tim
10151015
if (}] # modifier_idx # [{ == 0) {
10161016
New = (}] # dest_sel # [{ == 1) ? ((Val & 0x1) ? (SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL) : SISrcMods::DST_OP_SEL)
10171017
: ((Val & 0x1) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE);
1018-
} else if (}] # modifier_idx # [{== 1 || }] # modifier_idx # [{ == 2) {
1019-
New = (Val & 0x2) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
1018+
} else if (}] # modifier_idx # [{== 1) {
1019+
New = (Val & 0x2) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
1020+
} if (}] # modifier_idx # [{== 2) {
1021+
New = (Val & 0x1) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
10201022
}
10211023
return CurDAG->getTargetConstant(New, SDLoc(N), MVT::i32);
10221024
}]>;
@@ -1060,7 +1062,7 @@ def gi_SrcSelToOpSelXForm : GICustomOperandRenderer<"renderSrcSelToOpSelXForm">,
10601062
def DstSelToOpSel3XForm : SDNodeXForm<timm, [{
10611063
uint32_t V = N->getZExtValue();
10621064
return CurDAG->getTargetConstant(
1063-
(V & 0x1) ? SISrcMods::DST_OP_SEL : SISrcMods::NONE,
1065+
(V & 0x2) ? SISrcMods::DST_OP_SEL : SISrcMods::NONE,
10641066
SDLoc(N), MVT::i32);
10651067
}]>;
10661068
def gi_DstSelToOpSel3XForm : GICustomOperandRenderer<"renderDstSelToOpSel3XFormXForm">,

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -813,7 +813,7 @@ define i32 @test_cvt_scale_fp4_f32_byte1(i32 %old, float %src0, float %src1, flo
813813
; GCN-LABEL: test_cvt_scale_fp4_f32_byte1:
814814
; GCN: ; %bb.0:
815815
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
816-
; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, v1, v2, v3 op_sel:[0,0,0,1]
816+
; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, v1, v2, v3 op_sel:[0,0,1,0]
817817
; GCN-NEXT: s_setpc_b64 s[30:31]
818818
%ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f32(i32 %old, float %src0, float %src1, float %scale, i32 1)
819819
ret i32 %ret
@@ -823,7 +823,7 @@ define i32 @test_cvt_scale_fp4_f32_byte2(i32 %old, float %src0, float %src1, flo
823823
; GCN-LABEL: test_cvt_scale_fp4_f32_byte2:
824824
; GCN: ; %bb.0:
825825
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
826-
; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, v1, v2, v3 op_sel:[0,0,1,0]
826+
; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, v1, v2, v3 op_sel:[0,0,0,1]
827827
; GCN-NEXT: s_setpc_b64 s[30:31]
828828
%ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f32(i32 %old, float %src0, float %src1, float %scale, i32 2)
829829
ret i32 %ret
@@ -1302,7 +1302,7 @@ define i32 @test_cvt_scalef32_fp4_f16_byte1(<2 x half> %src0, float %scale, i32
13021302
; GCN-LABEL: test_cvt_scalef32_fp4_f16_byte1:
13031303
; GCN: ; %bb.0:
13041304
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1305-
; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v2, v0, v1 op_sel:[0,0,0,1]
1305+
; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v2, v0, v1 op_sel:[0,0,1,0]
13061306
; GCN-NEXT: s_nop 0
13071307
; GCN-NEXT: v_mov_b32_e32 v0, v2
13081308
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -1314,7 +1314,7 @@ define i32 @test_cvt_scalef32_fp4_f16_byte2(<2 x half> %src0, float %scale, i32
13141314
; GCN-LABEL: test_cvt_scalef32_fp4_f16_byte2:
13151315
; GCN: ; %bb.0:
13161316
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1317-
; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v2, v0, v1 op_sel:[0,0,1,0]
1317+
; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v2, v0, v1 op_sel:[0,0,0,1]
13181318
; GCN-NEXT: s_nop 0
13191319
; GCN-NEXT: v_mov_b32_e32 v0, v2
13201320
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -1380,7 +1380,7 @@ define i32 @test_cvt_scalef32_fp4_bf16_byte1(<2 x bfloat> %src0, float %scale, i
13801380
; GCN-LABEL: test_cvt_scalef32_fp4_bf16_byte1:
13811381
; GCN: ; %bb.0:
13821382
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1383-
; GCN-NEXT: v_cvt_scalef32_pk_fp4_bf16 v2, v0, v1 op_sel:[0,0,0,1]
1383+
; GCN-NEXT: v_cvt_scalef32_pk_fp4_bf16 v2, v0, v1 op_sel:[0,0,1,0]
13841384
; GCN-NEXT: s_nop 0
13851385
; GCN-NEXT: v_mov_b32_e32 v0, v2
13861386
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -1392,7 +1392,7 @@ define i32 @test_cvt_scalef32_fp4_bf16_byte2(<2 x bfloat> %src0, float %scale, i
13921392
; GCN-LABEL: test_cvt_scalef32_fp4_bf16_byte2:
13931393
; GCN: ; %bb.0:
13941394
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1395-
; GCN-NEXT: v_cvt_scalef32_pk_fp4_bf16 v2, v0, v1 op_sel:[0,0,1,0]
1395+
; GCN-NEXT: v_cvt_scalef32_pk_fp4_bf16 v2, v0, v1 op_sel:[0,0,0,1]
13961396
; GCN-NEXT: s_nop 0
13971397
; GCN-NEXT: v_mov_b32_e32 v0, v2
13981398
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -2072,7 +2072,7 @@ define i32 @test_cvt_scale_fp4_f32_byte1_inreg_src(i32 %old, float inreg %src0,
20722072
; GCN-LABEL: test_cvt_scale_fp4_f32_byte1_inreg_src:
20732073
; GCN: ; %bb.0:
20742074
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2075-
; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, s0, v1, v2 op_sel:[0,0,0,1]
2075+
; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, s0, v1, v2 op_sel:[0,0,1,0]
20762076
; GCN-NEXT: s_setpc_b64 s[30:31]
20772077
%ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f32(i32 %old, float %src0, float %src1, float %scale, i32 1)
20782078
ret i32 %ret
@@ -2082,7 +2082,7 @@ define i32 @test_cvt_scale_fp4_f32_byte2_inreg_src(i32 %old, float inreg %src0,
20822082
; GCN-LABEL: test_cvt_scale_fp4_f32_byte2_inreg_src:
20832083
; GCN: ; %bb.0:
20842084
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2085-
; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, s0, v1, v2 op_sel:[0,0,1,0]
2085+
; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, s0, v1, v2 op_sel:[0,0,0,1]
20862086
; GCN-NEXT: s_setpc_b64 s[30:31]
20872087
%ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f32(i32 %old, float %src0, float %src1, float %scale, i32 2)
20882088
ret i32 %ret
@@ -2515,7 +2515,7 @@ define i32 @test_cvt_scalef32_fp4_f16_byte1_inreg_src(<2 x half> inreg %src0, fl
25152515
; GCN-LABEL: test_cvt_scalef32_fp4_f16_byte1_inreg_src:
25162516
; GCN: ; %bb.0:
25172517
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2518-
; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v1, s0, v0 op_sel:[0,0,0,1]
2518+
; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v1, s0, v0 op_sel:[0,0,1,0]
25192519
; GCN-NEXT: s_nop 0
25202520
; GCN-NEXT: v_mov_b32_e32 v0, v1
25212521
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -2527,7 +2527,7 @@ define i32 @test_cvt_scalef32_fp4_f16_byte2_inreg_src(<2 x half> inreg %src0, fl
25272527
; GCN-LABEL: test_cvt_scalef32_fp4_f16_byte2_inreg_src:
25282528
; GCN: ; %bb.0:
25292529
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2530-
; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v1, s0, v0 op_sel:[0,0,1,0]
2530+
; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v1, s0, v0 op_sel:[0,0,0,1]
25312531
; GCN-NEXT: s_nop 0
25322532
; GCN-NEXT: v_mov_b32_e32 v0, v1
25332533
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -2562,7 +2562,7 @@ define i32 @test_cvt_scalef32_fp4_bf16_byte1_inreg_src(<2 x bfloat> inreg %src0,
25622562
; GCN-LABEL: test_cvt_scalef32_fp4_bf16_byte1_inreg_src:
25632563
; GCN: ; %bb.0:
25642564
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2565-
; GCN-NEXT: v_cvt_scalef32_pk_fp4_bf16 v1, s0, v0 op_sel:[0,0,0,1]
2565+
; GCN-NEXT: v_cvt_scalef32_pk_fp4_bf16 v1, s0, v0 op_sel:[0,0,1,0]
25662566
; GCN-NEXT: s_nop 0
25672567
; GCN-NEXT: v_mov_b32_e32 v0, v1
25682568
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -2574,7 +2574,7 @@ define i32 @test_cvt_scalef32_fp4_bf16_byte2_inreg_src(<2 x bfloat> inreg %src0,
25742574
; GCN-LABEL: test_cvt_scalef32_fp4_bf16_byte2_inreg_src:
25752575
; GCN: ; %bb.0:
25762576
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2577-
; GCN-NEXT: v_cvt_scalef32_pk_fp4_bf16 v1, s0, v0 op_sel:[0,0,1,0]
2577+
; GCN-NEXT: v_cvt_scalef32_pk_fp4_bf16 v1, s0, v0 op_sel:[0,0,0,1]
25782578
; GCN-NEXT: s_nop 0
25792579
; GCN-NEXT: v_mov_b32_e32 v0, v1
25802580
; GCN-NEXT: s_setpc_b64 s[30:31]

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.ll

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ define amdgpu_ps void @test_cvt_scalef32_sr_bf8_bf16_dst_sel_1(ptr addrspace(1)
2828
; GFX950: ; %bb.0:
2929
; GFX950-NEXT: global_load_dword v5, v[0:1], off
3030
; GFX950-NEXT: s_waitcnt vmcnt(0)
31-
; GFX950-NEXT: v_cvt_scalef32_sr_bf8_bf16 v5, v2, v3, v4 op_sel:[0,0,0,1]
31+
; GFX950-NEXT: v_cvt_scalef32_sr_bf8_bf16 v5, v2, v3, v4 op_sel:[0,0,1,0]
3232
; GFX950-NEXT: global_store_dword v[0:1], v5, off
3333
; GFX950-NEXT: s_endpgm
3434
%old = load i32, ptr addrspace(1) %out, align 4
@@ -42,7 +42,7 @@ define amdgpu_ps void @test_cvt_scalef32_sr_bf8_bf16_dst_sel_2(ptr addrspace(1)
4242
; GFX950: ; %bb.0:
4343
; GFX950-NEXT: global_load_dword v5, v[0:1], off
4444
; GFX950-NEXT: s_waitcnt vmcnt(0)
45-
; GFX950-NEXT: v_cvt_scalef32_sr_bf8_bf16 v5, v2, v3, v4 op_sel:[0,0,1,0]
45+
; GFX950-NEXT: v_cvt_scalef32_sr_bf8_bf16 v5, v2, v3, v4 op_sel:[0,0,0,1]
4646
; GFX950-NEXT: global_store_dword v[0:1], v5, off
4747
; GFX950-NEXT: s_endpgm
4848
%old = load i32, ptr addrspace(1) %out, align 4
@@ -84,7 +84,7 @@ define amdgpu_ps void @test_cvt_scalef32_sr_bf8_f16_dst_sel_1(ptr addrspace(1) %
8484
; GFX950: ; %bb.0:
8585
; GFX950-NEXT: global_load_dword v5, v[0:1], off
8686
; GFX950-NEXT: s_waitcnt vmcnt(0)
87-
; GFX950-NEXT: v_cvt_scalef32_sr_bf8_f16 v5, v2, v3, v4 op_sel:[0,0,0,1]
87+
; GFX950-NEXT: v_cvt_scalef32_sr_bf8_f16 v5, v2, v3, v4 op_sel:[0,0,1,0]
8888
; GFX950-NEXT: global_store_dword v[0:1], v5, off
8989
; GFX950-NEXT: s_endpgm
9090
%old = load i32, ptr addrspace(1) %out, align 4
@@ -98,7 +98,7 @@ define amdgpu_ps void @test_cvt_scalef32_sr_bf8_f16_dst_sel_2(ptr addrspace(1) %
9898
; GFX950: ; %bb.0:
9999
; GFX950-NEXT: global_load_dword v5, v[0:1], off
100100
; GFX950-NEXT: s_waitcnt vmcnt(0)
101-
; GFX950-NEXT: v_cvt_scalef32_sr_bf8_f16 v5, v2, v3, v4 op_sel:[0,0,1,0]
101+
; GFX950-NEXT: v_cvt_scalef32_sr_bf8_f16 v5, v2, v3, v4 op_sel:[0,0,0,1]
102102
; GFX950-NEXT: global_store_dword v[0:1], v5, off
103103
; GFX950-NEXT: s_endpgm
104104
%old = load i32, ptr addrspace(1) %out, align 4
@@ -140,7 +140,7 @@ define amdgpu_ps void @test_cvt_scalef32_sr_bf8_f32_dst_sel_1(ptr addrspace(1) %
140140
; GFX950: ; %bb.0:
141141
; GFX950-NEXT: global_load_dword v5, v[0:1], off
142142
; GFX950-NEXT: s_waitcnt vmcnt(0)
143-
; GFX950-NEXT: v_cvt_scalef32_sr_bf8_f32 v5, v2, v3, v4 op_sel:[0,0,0,1]
143+
; GFX950-NEXT: v_cvt_scalef32_sr_bf8_f32 v5, v2, v3, v4 op_sel:[0,0,1,0]
144144
; GFX950-NEXT: global_store_dword v[0:1], v5, off
145145
; GFX950-NEXT: s_endpgm
146146
%old = load i32, ptr addrspace(1) %out, align 4
@@ -154,7 +154,7 @@ define amdgpu_ps void @test_cvt_scalef32_sr_bf8_f32_dst_sel_2(ptr addrspace(1) %
154154
; GFX950: ; %bb.0:
155155
; GFX950-NEXT: global_load_dword v5, v[0:1], off
156156
; GFX950-NEXT: s_waitcnt vmcnt(0)
157-
; GFX950-NEXT: v_cvt_scalef32_sr_bf8_f32 v5, v2, v3, v4 op_sel:[0,0,1,0]
157+
; GFX950-NEXT: v_cvt_scalef32_sr_bf8_f32 v5, v2, v3, v4 op_sel:[0,0,0,1]
158158
; GFX950-NEXT: global_store_dword v[0:1], v5, off
159159
; GFX950-NEXT: s_endpgm
160160
%old = load i32, ptr addrspace(1) %out, align 4
@@ -196,7 +196,7 @@ define amdgpu_ps void @test_cvt_scalef32_sr_fp8_bf16_dst_sel_1(ptr addrspace(1)
196196
; GFX950: ; %bb.0:
197197
; GFX950-NEXT: global_load_dword v5, v[0:1], off
198198
; GFX950-NEXT: s_waitcnt vmcnt(0)
199-
; GFX950-NEXT: v_cvt_scalef32_sr_fp8_bf16 v5, v2, v3, v4 op_sel:[0,0,0,1]
199+
; GFX950-NEXT: v_cvt_scalef32_sr_fp8_bf16 v5, v2, v3, v4 op_sel:[0,0,1,0]
200200
; GFX950-NEXT: global_store_dword v[0:1], v5, off
201201
; GFX950-NEXT: s_endpgm
202202
%old = load i32, ptr addrspace(1) %out, align 4
@@ -210,7 +210,7 @@ define amdgpu_ps void @test_cvt_scalef32_sr_fp8_bf16_dst_sel_2(ptr addrspace(1)
210210
; GFX950: ; %bb.0:
211211
; GFX950-NEXT: global_load_dword v5, v[0:1], off
212212
; GFX950-NEXT: s_waitcnt vmcnt(0)
213-
; GFX950-NEXT: v_cvt_scalef32_sr_fp8_bf16 v5, v2, v3, v4 op_sel:[0,0,1,0]
213+
; GFX950-NEXT: v_cvt_scalef32_sr_fp8_bf16 v5, v2, v3, v4 op_sel:[0,0,0,1]
214214
; GFX950-NEXT: global_store_dword v[0:1], v5, off
215215
; GFX950-NEXT: s_endpgm
216216
%old = load i32, ptr addrspace(1) %out, align 4
@@ -252,7 +252,7 @@ define amdgpu_ps void @test_cvt_scalef32_sr_fp8_f16_dst_sel_1(ptr addrspace(1) %
252252
; GFX950: ; %bb.0:
253253
; GFX950-NEXT: global_load_dword v5, v[0:1], off
254254
; GFX950-NEXT: s_waitcnt vmcnt(0)
255-
; GFX950-NEXT: v_cvt_scalef32_sr_fp8_f16 v5, v2, v3, v4 op_sel:[0,0,0,1]
255+
; GFX950-NEXT: v_cvt_scalef32_sr_fp8_f16 v5, v2, v3, v4 op_sel:[0,0,1,0]
256256
; GFX950-NEXT: global_store_dword v[0:1], v5, off
257257
; GFX950-NEXT: s_endpgm
258258
%old = load i32, ptr addrspace(1) %out, align 4
@@ -266,7 +266,7 @@ define amdgpu_ps void @test_cvt_scalef32_sr_fp8_f16_dst_sel_2(ptr addrspace(1) %
266266
; GFX950: ; %bb.0:
267267
; GFX950-NEXT: global_load_dword v5, v[0:1], off
268268
; GFX950-NEXT: s_waitcnt vmcnt(0)
269-
; GFX950-NEXT: v_cvt_scalef32_sr_fp8_f16 v5, v2, v3, v4 op_sel:[0,0,1,0]
269+
; GFX950-NEXT: v_cvt_scalef32_sr_fp8_f16 v5, v2, v3, v4 op_sel:[0,0,0,1]
270270
; GFX950-NEXT: global_store_dword v[0:1], v5, off
271271
; GFX950-NEXT: s_endpgm
272272
%old = load i32, ptr addrspace(1) %out, align 4
@@ -308,7 +308,7 @@ define amdgpu_ps void @test_cvt_scalef32_sr_fp8_f32_dst_sel_1(ptr addrspace(1) %
308308
; GFX950: ; %bb.0:
309309
; GFX950-NEXT: global_load_dword v5, v[0:1], off
310310
; GFX950-NEXT: s_waitcnt vmcnt(0)
311-
; GFX950-NEXT: v_cvt_scalef32_sr_fp8_f32 v5, v2, v3, v4 op_sel:[0,0,0,1]
311+
; GFX950-NEXT: v_cvt_scalef32_sr_fp8_f32 v5, v2, v3, v4 op_sel:[0,0,1,0]
312312
; GFX950-NEXT: global_store_dword v[0:1], v5, off
313313
; GFX950-NEXT: s_endpgm
314314
%old = load i32, ptr addrspace(1) %out, align 4
@@ -322,7 +322,7 @@ define amdgpu_ps void @test_cvt_scalef32_sr_fp8_f32_dst_sel_2(ptr addrspace(1) %
322322
; GFX950: ; %bb.0:
323323
; GFX950-NEXT: global_load_dword v5, v[0:1], off
324324
; GFX950-NEXT: s_waitcnt vmcnt(0)
325-
; GFX950-NEXT: v_cvt_scalef32_sr_fp8_f32 v5, v2, v3, v4 op_sel:[0,0,1,0]
325+
; GFX950-NEXT: v_cvt_scalef32_sr_fp8_f32 v5, v2, v3, v4 op_sel:[0,0,0,1]
326326
; GFX950-NEXT: global_store_dword v[0:1], v5, off
327327
; GFX950-NEXT: s_endpgm
328328
%old = load i32, ptr addrspace(1) %out, align 4

0 commit comments

Comments
 (0)