-
Notifications
You must be signed in to change notification settings - Fork 15.3k
[AMDGPU] Fix op_sel settings for v_cvt_scale32_* and v_cvt_sr_* #151286
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
For OPF_OPSEL_SRCBYTE: Vector instruction uses OPSEL[1:0] to specify a byte select for the first source operand. So op_sel [0, 0], [1, 0], [0, 1] and [1, 1] should map to byte 0, 1, 2 and 3, respectively. For OPF_OPSEL_DSTBYTE: OPSEL is used as a destination byte select. OPSEL[2:3] specify which byte of the destination to write to. Note that the order of the bits is different from that of OPF_OPSEL_SRCBYTE. So the mapping should be: op_sel [0, 0], [0, 1], [1, 0] and [1, 1] map to byte 0, 1, 2 and 3, respectively. Fixes: SWDEV-544901
|
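@llvm/pr-subscribers-backend-amdgpu Author: Changpeng Fang (changpeng) Changes: For OPF_OPSEL_SRCBYTE: Vector instruction uses OPSEL[1:0] to specify a byte select for the first source operand. So op_sel [0, 0], [1, 0], [0, 1] and [1, 1] should map to byte 0, 1, 2 and 3, respectively. For OPF_OPSEL_DSTBYTE: OPSEL is used as a destination byte select. OPSEL[2:3] specify which byte of the destination to write to. Note that the order of the bits is different from that of OPF_OPSEL_SRCBYTE. So the mapping should be: op_sel [0, 0], [0, 1], [1, 0] and [1, 1] map to byte 0, 1, 2 and 3, respectively. Fixes: SWDEV-544901 Patch is 38.26 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/151286.diff 5 Files Affected: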
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index cd9c2ec20c560..9c31462bb7fa8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -6994,13 +6994,13 @@ void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_0(
MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
assert(OpIdx >= 0 && "expected to match an immediate operand");
MIB.addImm(
- (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
+ (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
}
void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_1(
MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
assert(OpIdx >= 0 && "expected to match an immediate operand");
- MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
+ MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1)
? (int64_t)(SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL)
: (int64_t)SISrcMods::DST_OP_SEL);
}
@@ -7009,13 +7009,13 @@ void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_0(
MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
assert(OpIdx >= 0 && "expected to match an immediate operand");
MIB.addImm(
- (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
+ (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
}
void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_1(
MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
assert(OpIdx >= 0 && "expected to match an immediate operand");
- MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1)
+ MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
? (int64_t)(SISrcMods::OP_SEL_0)
: 0);
}
@@ -7038,14 +7038,14 @@ void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_2_0(
MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
assert(OpIdx >= 0 && "expected to match an immediate operand");
MIB.addImm(
- (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
+ (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
}
void AMDGPUInstructionSelector::renderDstSelToOpSel3XFormXForm(
MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
assert(OpIdx >= 0 && "expected to match an immediate operand");
MIB.addImm(
- (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::DST_OP_SEL : 0);
+ (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::DST_OP_SEL : 0);
}
void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 5586dd872fef5..9f1e53d1db644 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -999,10 +999,10 @@ class SrcAndDstSelToOpSelXForm<int modifier_idx, bit dest_sel> : SDNodeXForm<tim
unsigned Val = N->getZExtValue();
unsigned New = 0;
if (}] # modifier_idx # [{ == 0) {
- New = (}] # dest_sel # [{ == 1) ? ((Val & 0x2) ? (SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL) : SISrcMods::DST_OP_SEL)
- : ((Val & 0x2) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE);
+ New = (}] # dest_sel # [{ == 1) ? ((Val & 0x1) ? (SISrcMods::OP_SEL_0 | SISrcMods::DST_OP_SEL) : SISrcMods::DST_OP_SEL)
+ : ((Val & 0x1) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE);
} else if (}] # modifier_idx # [{== 1 || }] # modifier_idx # [{ == 2) {
- New = (Val & 0x1) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
+ New = (Val & 0x2) ? SISrcMods::OP_SEL_0 : SISrcMods::NONE;
}
return CurDAG->getTargetConstant(New, SDLoc(N), MVT::i32);
}]>;
@@ -1046,7 +1046,7 @@ def gi_SrcSelToOpSelXForm : GICustomOperandRenderer<"renderSrcSelToOpSelXForm">,
def DstSelToOpSel3XForm : SDNodeXForm<timm, [{
uint32_t V = N->getZExtValue();
return CurDAG->getTargetConstant(
- (V & 0x2) ? SISrcMods::DST_OP_SEL : SISrcMods::NONE,
+ (V & 0x1) ? SISrcMods::DST_OP_SEL : SISrcMods::NONE,
SDLoc(N), MVT::i32);
}]>;
def gi_DstSelToOpSel3XForm : GICustomOperandRenderer<"renderDstSelToOpSel3XFormXForm">,
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll
index 291a4e2d39a37..788a9b2518128 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll
@@ -168,7 +168,7 @@ define <2 x half> @test_cvt_scalef32_f16_fp8_byte1_dst_lo(i32 %src, float %scale
; GCN-LABEL: test_cvt_scalef32_f16_fp8_byte1_dst_lo:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_scalef32_f16_fp8 v2, v0, v1 op_sel:[0,1,0]
+; GCN-NEXT: v_cvt_scalef32_f16_fp8 v2, v0, v1 op_sel:[1,0,0]
; GCN-NEXT: v_mov_b32_e32 v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
%ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> %old, i32 %src, float %scale, i32 1, i1 false)
@@ -179,7 +179,7 @@ define <2 x half> @test_cvt_scalef32_f16_fp8_byte2_dst_lo(i32 %src, float %scale
; GCN-LABEL: test_cvt_scalef32_f16_fp8_byte2_dst_lo:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_scalef32_f16_fp8 v2, v0, v1 op_sel:[1,0,0]
+; GCN-NEXT: v_cvt_scalef32_f16_fp8 v2, v0, v1 op_sel:[0,1,0]
; GCN-NEXT: v_mov_b32_e32 v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
%ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> %old, i32 %src, float %scale, i32 2, i1 false)
@@ -213,7 +213,7 @@ define <2 x half> @test_cvt_scalef32_f16_fp8_byte1_dst_hi(i32 %src, float %scale
; GCN-LABEL: test_cvt_scalef32_f16_fp8_byte1_dst_hi:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_scalef32_f16_fp8 v2, v0, v1 op_sel:[0,1,1]
+; GCN-NEXT: v_cvt_scalef32_f16_fp8 v2, v0, v1 op_sel:[1,0,1]
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -225,7 +225,7 @@ define <2 x half> @test_cvt_scalef32_f16_fp8_byte2_dst_hi(i32 %src, float %scale
; GCN-LABEL: test_cvt_scalef32_f16_fp8_byte2_dst_hi:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_scalef32_f16_fp8 v2, v0, v1 op_sel:[1,0,1]
+; GCN-NEXT: v_cvt_scalef32_f16_fp8 v2, v0, v1 op_sel:[0,1,1]
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -259,7 +259,7 @@ define float @test_cvt_scalef32_f32_fp8_byte1(i32 %src, float %scale) {
; GCN-LABEL: test_cvt_scalef32_f32_fp8_byte1:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_scalef32_f32_fp8 v0, v0, v1 op_sel:[0,1,0]
+; GCN-NEXT: v_cvt_scalef32_f32_fp8 v0, v0, v1 op_sel:[1,0,0]
; GCN-NEXT: s_setpc_b64 s[30:31]
%ret = tail call float @llvm.amdgcn.cvt.scalef32.f32.fp8(i32 %src, float %scale, i32 1)
ret float %ret
@@ -269,7 +269,7 @@ define float @test_cvt_scalef32_f32_fp8_byte2(i32 %src, float %scale) {
; GCN-LABEL: test_cvt_scalef32_f32_fp8_byte2:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_scalef32_f32_fp8 v0, v0, v1 op_sel:[1,0,0]
+; GCN-NEXT: v_cvt_scalef32_f32_fp8 v0, v0, v1 op_sel:[0,1,0]
; GCN-NEXT: s_setpc_b64 s[30:31]
%ret = tail call float @llvm.amdgcn.cvt.scalef32.f32.fp8(i32 %src, float %scale, i32 2)
ret float %ret
@@ -300,7 +300,7 @@ define <2 x half> @test_cvt_scalef32_f16_bf8_byte1_dst_lo(i32 %src, float %scale
; GCN-LABEL: test_cvt_scalef32_f16_bf8_byte1_dst_lo:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_scalef32_f16_bf8 v2, v0, v1 op_sel:[0,1,0]
+; GCN-NEXT: v_cvt_scalef32_f16_bf8 v2, v0, v1 op_sel:[1,0,0]
; GCN-NEXT: v_mov_b32_e32 v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
%ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> %old, i32 %src, float %scale, i32 1, i1 false)
@@ -311,7 +311,7 @@ define <2 x half> @test_cvt_scalef32_f16_bf8_byte2_dst_lo(i32 %src, float %scale
; GCN-LABEL: test_cvt_scalef32_f16_bf8_byte2_dst_lo:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_scalef32_f16_bf8 v2, v0, v1 op_sel:[1,0,0]
+; GCN-NEXT: v_cvt_scalef32_f16_bf8 v2, v0, v1 op_sel:[0,1,0]
; GCN-NEXT: v_mov_b32_e32 v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
%ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> %old, i32 %src, float %scale, i32 2, i1 false)
@@ -345,7 +345,7 @@ define <2 x half> @test_cvt_scalef32_f16_bf8_byte1_dst_hi(i32 %src, float %scale
; GCN-LABEL: test_cvt_scalef32_f16_bf8_byte1_dst_hi:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_scalef32_f16_bf8 v2, v0, v1 op_sel:[0,1,1]
+; GCN-NEXT: v_cvt_scalef32_f16_bf8 v2, v0, v1 op_sel:[1,0,1]
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -357,7 +357,7 @@ define <2 x half> @test_cvt_scalef32_f16_bf8_byte2_dst_hi(i32 %src, float %scale
; GCN-LABEL: test_cvt_scalef32_f16_bf8_byte2_dst_hi:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_scalef32_f16_bf8 v2, v0, v1 op_sel:[1,0,1]
+; GCN-NEXT: v_cvt_scalef32_f16_bf8 v2, v0, v1 op_sel:[0,1,1]
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -391,7 +391,7 @@ define float @test_cvt_scalef32_f32_bf8_byte1(i32 %src, float %scale) {
; GCN-LABEL: test_cvt_scalef32_f32_bf8_byte1:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_scalef32_f32_bf8 v0, v0, v1 op_sel:[0,1,0]
+; GCN-NEXT: v_cvt_scalef32_f32_bf8 v0, v0, v1 op_sel:[1,0,0]
; GCN-NEXT: s_setpc_b64 s[30:31]
%ret = tail call float @llvm.amdgcn.cvt.scalef32.f32.bf8(i32 %src, float %scale, i32 1)
ret float %ret
@@ -401,7 +401,7 @@ define float @test_cvt_scalef32_f32_bf8_byte2(i32 %src, float %scale) {
; GCN-LABEL: test_cvt_scalef32_f32_bf8_byte2:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_scalef32_f32_bf8 v0, v0, v1 op_sel:[1,0,0]
+; GCN-NEXT: v_cvt_scalef32_f32_bf8 v0, v0, v1 op_sel:[0,1,0]
; GCN-NEXT: s_setpc_b64 s[30:31]
%ret = tail call float @llvm.amdgcn.cvt.scalef32.f32.bf8(i32 %src, float %scale, i32 2)
ret float %ret
@@ -773,7 +773,7 @@ define <2 x float> @test_cvt_scale_f32_fp4_byte1(i32 %src, float %scale) {
; GCN-LABEL: test_cvt_scale_f32_fp4_byte1:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_scalef32_pk_f32_fp4 v[0:1], v0, v1 op_sel:[0,1,0]
+; GCN-NEXT: v_cvt_scalef32_pk_f32_fp4 v[0:1], v0, v1 op_sel:[1,0,0]
; GCN-NEXT: s_setpc_b64 s[30:31]
%ret = tail call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp4(i32 %src, float %scale, i32 1)
ret <2 x float> %ret
@@ -783,7 +783,7 @@ define <2 x float> @test_cvt_scale_f32_fp4_byte2(i32 %src, float %scale) {
; GCN-LABEL: test_cvt_scale_f32_fp4_byte2:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_scalef32_pk_f32_fp4 v[0:1], v0, v1 op_sel:[1,0,0]
+; GCN-NEXT: v_cvt_scalef32_pk_f32_fp4 v[0:1], v0, v1 op_sel:[0,1,0]
; GCN-NEXT: s_setpc_b64 s[30:31]
%ret = tail call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp4(i32 %src, float %scale, i32 2)
ret <2 x float> %ret
@@ -813,7 +813,7 @@ define i32 @test_cvt_scale_fp4_f32_byte1(i32 %old, float %src0, float %src1, flo
; GCN-LABEL: test_cvt_scale_fp4_f32_byte1:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, v1, v2, v3 op_sel:[0,0,1,0]
+; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, v1, v2, v3 op_sel:[0,0,0,1]
; GCN-NEXT: s_setpc_b64 s[30:31]
%ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f32(i32 %old, float %src0, float %src1, float %scale, i32 1)
ret i32 %ret
@@ -823,7 +823,7 @@ define i32 @test_cvt_scale_fp4_f32_byte2(i32 %old, float %src0, float %src1, flo
; GCN-LABEL: test_cvt_scale_fp4_f32_byte2:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, v1, v2, v3 op_sel:[0,0,0,1]
+; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, v1, v2, v3 op_sel:[0,0,1,0]
; GCN-NEXT: s_setpc_b64 s[30:31]
%ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f32(i32 %old, float %src0, float %src1, float %scale, i32 2)
ret i32 %ret
@@ -895,7 +895,7 @@ define <2 x half> @test_cvt_scale_f16_fp4_byte1(i32 %src, float %scale) {
; GCN-LABEL: test_cvt_scale_f16_fp4_byte1:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_scalef32_pk_f16_fp4 v0, v0, v1 op_sel:[0,1,0]
+; GCN-NEXT: v_cvt_scalef32_pk_f16_fp4 v0, v0, v1 op_sel:[1,0,0]
; GCN-NEXT: s_setpc_b64 s[30:31]
%ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp4(i32 %src, float %scale, i32 1)
ret <2 x half> %ret
@@ -905,7 +905,7 @@ define <2 x half> @test_cvt_scale_f16_fp4_byte2(i32 %src, float %scale) {
; GCN-LABEL: test_cvt_scale_f16_fp4_byte2:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_scalef32_pk_f16_fp4 v0, v0, v1 op_sel:[1,0,0]
+; GCN-NEXT: v_cvt_scalef32_pk_f16_fp4 v0, v0, v1 op_sel:[0,1,0]
; GCN-NEXT: s_setpc_b64 s[30:31]
%ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp4(i32 %src, float %scale, i32 2)
ret <2 x half> %ret
@@ -935,7 +935,7 @@ define <2 x bfloat> @test_cvt_scale_bf16_fp4_byte1(i32 %src, float %scale) {
; GCN-LABEL: test_cvt_scale_bf16_fp4_byte1:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_scalef32_pk_bf16_fp4 v0, v0, v1 op_sel:[0,1,0]
+; GCN-NEXT: v_cvt_scalef32_pk_bf16_fp4 v0, v0, v1 op_sel:[1,0,0]
; GCN-NEXT: s_setpc_b64 s[30:31]
%ret = tail call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp4(i32 %src, float %scale, i32 1)
ret <2 x bfloat> %ret
@@ -945,7 +945,7 @@ define <2 x bfloat> @test_cvt_scale_bf16_fp4_byte2(i32 %src, float %scale) {
; GCN-LABEL: test_cvt_scale_bf16_fp4_byte2:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_scalef32_pk_bf16_fp4 v0, v0, v1 op_sel:[1,0,0]
+; GCN-NEXT: v_cvt_scalef32_pk_bf16_fp4 v0, v0, v1 op_sel:[0,1,0]
; GCN-NEXT: s_setpc_b64 s[30:31]
%ret = tail call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp4(i32 %src, float %scale, i32 2)
ret <2 x bfloat> %ret
@@ -1302,7 +1302,7 @@ define i32 @test_cvt_scalef32_fp4_f16_byte1(<2 x half> %src0, float %scale, i32
; GCN-LABEL: test_cvt_scalef32_fp4_f16_byte1:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v2, v0, v1 op_sel:[0,0,1,0]
+; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v2, v0, v1 op_sel:[0,0,0,1]
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -1314,7 +1314,7 @@ define i32 @test_cvt_scalef32_fp4_f16_byte2(<2 x half> %src0, float %scale, i32
; GCN-LABEL: test_cvt_scalef32_fp4_f16_byte2:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v2, v0, v1 op_sel:[0,0,0,1]
+; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v2, v0, v1 op_sel:[0,0,1,0]
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -1380,7 +1380,7 @@ define i32 @test_cvt_scalef32_fp4_bf16_byte1(<2 x bfloat> %src0, float %scale, i
; GCN-LABEL: test_cvt_scalef32_fp4_bf16_byte1:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_scalef32_pk_fp4_bf16 v2, v0, v1 op_sel:[0,0,1,0]
+; GCN-NEXT: v_cvt_scalef32_pk_fp4_bf16 v2, v0, v1 op_sel:[0,0,0,1]
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -1392,7 +1392,7 @@ define i32 @test_cvt_scalef32_fp4_bf16_byte2(<2 x bfloat> %src0, float %scale, i
; GCN-LABEL: test_cvt_scalef32_fp4_bf16_byte2:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_scalef32_pk_fp4_bf16 v2, v0, v1 op_sel:[0,0,0,1]
+; GCN-NEXT: v_cvt_scalef32_pk_fp4_bf16 v2, v0, v1 op_sel:[0,0,1,0]
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -1602,7 +1602,7 @@ define <2 x half> @test_cvt_scalef32_f16_fp8_byte1_dst_lo_inreg_src(i32 inreg %s
; GCN-LABEL: test_cvt_scalef32_f16_fp8_byte1_dst_lo_inreg_src:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_scalef32_f16_fp8 v1, s0, v0 op_sel:[0,1,0]
+; GCN-NEXT: v_cvt_scalef32_f16_fp8 v1, s0, v0 op_sel:[1,0,0]
; GCN-NEXT: v_mov_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
%ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> %old, i32 %src, float %scale, i32 1, i1 false)
@@ -1613,7 +1613,7 @@ define <2 x half> @test_cvt_scalef32_f16_fp8_byte2_dst_lo_inreg_src(i32 inreg %s
; GCN-LABEL: test_cvt_scalef32_f16_fp8_byte2_dst_lo_inreg_src:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_scalef32_f16_fp8 v1, s0, v0 op_sel:[1,0,0]
+; GCN-NEXT: v_cvt_scalef32_f16_fp8 v1, s0, v0 op_sel:[0,1,0]
; GCN-NEXT: v_mov_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
%ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> %old, i32 %src, float %scale, i32 2, i1 false)
@@ -1647,7 +1647,7 @@ define <2 x half> @test_cvt_scalef32_f16_fp8_byte1_dst_hi_inreg_src(i32 inreg %s
; GCN-LABEL: test_cvt_scalef32_f16_fp8_byte1_dst_hi_inreg_src:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_scalef32_f16_fp8 v1, s0, v0 op_sel:[0,1,1]
+; GCN-NEXT: v_cvt_scalef32_f16_fp8 v1, s0, v0 op_sel:[1,0,1]
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -1659,7 +1659,7 @@ define <2 x half> @test_cvt_scalef32_f16_fp8_byte2_dst_hi_inreg_src(i32 inreg %s
; GCN-LABEL: test_cvt_scalef32_f16_fp8_byte2_dst_hi_inreg_src:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_scalef32_f16_fp8 v1, s0, v0 op_sel:[1,0,1]
+; GCN-NEXT: v_cvt_scalef32_f16_fp8 v1, s0, v0 op_sel:[0,1,1]
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -1693,7 +1693,7 @@ define float @test_cvt_scalef32_f32_fp8_byte1_inreg_src(i32 inreg %src, float %s
; GCN-LABEL: test_cvt_scalef32_f32_fp8_byte1_inreg_src:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_scalef32_f32_fp8 v0, s0, v0 op_sel:[0,1,0]
+; GCN-NEXT: v_cvt_scalef32_f32_fp8 v0, s0, v0 op_sel:[1,0,0]
; GCN-NEXT: s_setpc_b64 s[30:31]
%ret = tail call float @llvm.amdgcn.cvt.scalef32.f32.fp8(i32 %src, float %scale, i32 1)
ret float %ret
@@ -1703,7 +1703,7 @@ define float @test_cvt_scalef32_f32_fp8_byte2_inreg_src(i32 inreg %src, float %s
; GCN-LABEL: test_cvt_scalef32_f32_fp8_byte2_inreg_src:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_scalef32_f32_fp8 v0, s0, v0 op_sel:[1,0,0]
+; GCN-NEXT: v_cvt_scalef32_f32_fp8 v0, s0, v0 op_sel:[0,1,0]
; GCN-NEXT: s_setpc_b64 s[30:31]
%ret = tail call float @llvm.amdgcn.cvt.scalef32.f32.fp8(i32 %src, float %scale, i32 2)
ret float %ret
@@ -1734,7 +1734,7 @@ define <2 x half> @test_cvt_scalef32_f16_bf8_byte1_dst_lo_inreg_src(i32 inreg %s
; GCN-LABEL: test_cvt_scalef32_f16_bf8_byte1_dst_lo_inreg_src:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_scalef32_f16_bf8 v1, s0, v0 op_sel:[0,1,0]
+; GCN-NEXT: v_cvt_scalef32_f16_bf8 v1, s0, v0 op_sel:[1,0,0]
; GCN-NEXT: v_mov_b32_e32 v0, v1
; GCN-NEXT: ...
[truncated]
|
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
rampitec
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Even if that is so for gfx950, you still need to check gfx1250.
rampitec
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks like it is target dependent.
|
Ideally it should backport the byte_sel modifier from gfx1250 and deal with encoding peculiarities at the MC layer. |
rampitec
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
OK, it is confirmed: bit order is different between targets. But since gfx1250 uses bytesel operand the implementation is also separate and does not interfere with this.
LGTM
…cvt_sr_* (#151411) GFX950 uses OP_SEL[MSB:LSB] for both src reads and dest writes. So this patch essentially reverts the work from llvm/llvm-project#151286 regarding dest writes.
Pick up the fix to op_sel settings of the v_cvt_scalef32_pk* instructions: llvm/llvm-project#151411 and llvm/llvm-project#151286
…#151286) (llvm#3379) For OPF_OPSEL_SRCBYTE: Vector instruction uses OPSEL[1:0] to specify a byte select for the first source operand. So op_sel [0, 0], [1, 0], [0, 1] and [1, 1] should map to byte 0, 1, 2 and 3, respectively. NOTE: This cherry-pick also includes llvm#151411 Fixes: SWDEV-544901
Pick up the fix to op_sel settings of the v_cvt_scalef32_pk* instructions: llvm/llvm-project#151411 and llvm/llvm-project#151286
…#151286) For OPF_OPSEL_SRCBYTE: Vector instruction uses OPSEL[1:0] to specify a byte select for the first source operand. So op_sel [0, 0], [1, 0], [0, 1] and [1, 1] should map to byte 0, 1, 2 and 3, respectively. NOTE: This cherry-pick also includes llvm#151411 Fixes: SWDEV-544901
Pick up the fix to op_sel settings of the v_cvt_scalef32_pk* instructions: llvm/llvm-project#151411 and llvm/llvm-project#151286
Pick up the fix to op_sel settings of the v_cvt_scalef32_pk* instructions: llvm/llvm-project#151411 and llvm/llvm-project#151286
Pick up the fix to op_sel settings of the v_cvt_scalef32_pk* instructions: llvm/llvm-project#151411 and llvm/llvm-project#151286
Pick up the fix to op_sel settings of the v_cvt_scalef32_pk* instructions: llvm/llvm-project#151411 and llvm/llvm-project#151286
…#151286) For OPF_OPSEL_SRCBYTE: Vector instruction uses OPSEL[1:0] to specify a byte select for the first source operand. So op_sel [0, 0], [1, 0], [0, 1] and [1, 1] should map to byte 0, 1, 2 and 3, respectively. For OPF_OPSEL_DSTBYTE: OPSEL is used as a destination byte select. OPSEL[2:3] specify which byte of the destination to write to. Note that the order of the bits is different from that of OPF_OPSEL_SRCBYTE. So the mapping should be: op_sel [0, 0], [0, 1], [1, 0] and [1, 1] map to byte 0, 1, 2 and 3, respectively. Fixes: SWDEV-544901
Pick up the fix to op_sel settings of the v_cvt_scalef32_pk* instructions: llvm/llvm-project#151411 and llvm/llvm-project#151286
Pick up the fix to op_sel settings of the v_cvt_scalef32_pk* instructions: llvm/llvm-project#151411 and llvm/llvm-project#151286
Pick up the fix to op_sel settings of the v_cvt_scalef32_pk* instructions: llvm/llvm-project#151411 and llvm/llvm-project#151286
Pick up the fix to op_sel settings of the v_cvt_scalef32_pk* instructions: llvm/llvm-project#151411 and llvm/llvm-project#151286
Pick up the fix to op_sel settings of the v_cvt_scalef32_pk* instructions: llvm/llvm-project#151411 and llvm/llvm-project#151286
For OPF_OPSEL_SRCBYTE: Vector instruction uses OPSEL[1:0] to specify a byte
select for the first source operand. So op_sel [0, 0], [1, 0], [0, 1] and [1, 1] should map
to byte 0, 1, 2 and 3, respectively.
For OPF_OPSEL_DSTBYTE: OPSEL is used as a destination byte select. OPSEL[2:3]
specify which byte of the destination to write to. Note that the order of the bits is different
from that of OPF_OPSEL_SRCBYTE. So the mapping should be: op_sel [0, 0], [0, 1], [1, 0]
and [1, 1] map to byte 0, 1, 2 and 3, respectively.
Fixes: SWDEV-544901