diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 5c39b2a4fc96a..8c59f783749db 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -7361,14 +7361,25 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, const DebugLoc &DL = Inst.getDebugLoc(); Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg) - .addImm(16) - .add(Inst.getOperand(1)); - BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst) - .addImm(0) // src0_modifiers - .addReg(TmpReg) - .addImm(0) // clamp - .addImm(0); // omod + if (ST.useRealTrue16Insts()) { + BuildMI(*MBB, Inst, DL, get(AMDGPU::COPY), TmpReg) + .add(Inst.getOperand(1)); + BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst) + .addImm(0) // src0_modifiers + .addReg(TmpReg, 0, AMDGPU::hi16) + .addImm(0) // clamp + .addImm(0) // omod + .addImm(0); // op_sel0 + } else { + BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg) + .addImm(16) + .add(Inst.getOperand(1)); + BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst) + .addImm(0) // src0_modifiers + .addReg(TmpReg) + .addImm(0) // clamp + .addImm(0); // omod + } MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst); addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist); diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 8073aca7f197f..faa0b6d6c3f50 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1094,7 +1094,7 @@ def : Pat < // VOP1 Patterns //===----------------------------------------------------------------------===// -multiclass f16_fp_Pats { +multiclass f16_to_fp_Pats { // f16_to_fp patterns def : GCNPat < (f32 (any_f16_to_fp i32:$src0)), @@ -1121,25 +1121,42 @@ multiclass f16_fp_Pats; + // fp_to_fp16 patterns def : GCNPat < - 
(f64 (any_fpextend f16:$src)), - (V_CVT_F64_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, $src)) + (i32 (AMDGPUfp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))), + (cvt_f16_f32_inst_e64 $src0_modifiers, f32:$src0) >; - // fp_to_fp16 patterns + // This is only used on targets without half support + // TODO: Introduce strict variant of AMDGPUfp_to_f16 and share custom lowering def : GCNPat < - (i32 (AMDGPUfp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))), + (i32 (strict_fp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))), (cvt_f16_f32_inst_e64 $src0_modifiers, f32:$src0) >; +} + +let True16Predicate = NotHasTrue16BitInsts in +defm : f16_to_fp_Pats; + +let True16Predicate = UseFakeTrue16Insts in +defm : f16_to_fp_Pats; + +multiclass f16_fp_Pats { + def : GCNPat < + (f64 (any_fpextend f16:$src)), + (V_CVT_F64_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, $src)) + >; def : GCNPat < (i32 (fp_to_sint f16:$src)), - (V_CVT_I32_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, VSrc_b32:$src)) + (V_CVT_I32_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, VSrc:$src)) >; def : GCNPat < (i32 (fp_to_uint f16:$src)), - (V_CVT_U32_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, VSrc_b32:$src)) + (V_CVT_U32_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, VSrc:$src)) >; def : GCNPat < @@ -1151,20 +1168,16 @@ multiclass f16_fp_Pats; - - // This is only used on targets without half support - // TODO: Introduce strict variant of AMDGPUfp_to_f16 and share custom lowering - def : GCNPat < - (i32 (strict_fp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))), - (cvt_f16_f32_inst_e64 $src0_modifiers, f32:$src0) - >; } let True16Predicate = NotHasTrue16BitInsts in -defm : f16_fp_Pats; +defm : f16_fp_Pats; + +let True16Predicate = UseRealTrue16Insts in +defm : f16_fp_Pats; let True16Predicate = UseFakeTrue16Insts in -defm : f16_fp_Pats; +defm : f16_fp_Pats; //===----------------------------------------------------------------------===// // VOP2 Patterns @@ -2774,16 +2787,27 @@ def : 
GCNPat < SSrc_i1:$src)) >; -let SubtargetPredicate = HasTrue16BitInsts in +let True16Predicate = UseRealTrue16Insts in def : GCNPat < (f16 (sint_to_fp i1:$src)), - (V_CVT_F16_F32_fake16_e32 ( - V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), + (V_CVT_F16_F32_t16_e64 /*src0_modifiers*/ 0, + (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE), - SSrc_i1:$src)) + SSrc_i1:$src), + /*clamp*/ 0, /*omod*/ 0, /*op_sel*/ 0) >; -let SubtargetPredicate = NotHasTrue16BitInsts in +let True16Predicate = UseFakeTrue16Insts in +def : GCNPat < + (f16 (sint_to_fp i1:$src)), + (V_CVT_F16_F32_fake16_e64 /*src0_modifiers*/ 0, + (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), + /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE), + SSrc_i1:$src), + /*clamp*/ 0, /*omod*/ 0) +>; + +let True16Predicate = NotHasTrue16BitInsts in def : GCNPat < (f16 (uint_to_fp i1:$src)), (V_CVT_F16_F32_e32 ( @@ -2791,13 +2815,25 @@ def : GCNPat < /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE), SSrc_i1:$src)) >; -let SubtargetPredicate = HasTrue16BitInsts in + +let True16Predicate = UseRealTrue16Insts in def : GCNPat < (f16 (uint_to_fp i1:$src)), - (V_CVT_F16_F32_fake16_e32 ( - V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), + (V_CVT_F16_F32_t16_e64 /*src0_modifiers*/ 0, + (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE), - SSrc_i1:$src)) + SSrc_i1:$src), + /*clamp*/ 0, /*omod*/ 0, /*op_sel*/ 0) +>; + +let True16Predicate = UseFakeTrue16Insts in +def : GCNPat < + (f16 (uint_to_fp i1:$src)), + (V_CVT_F16_F32_fake16_e64 /*src0_modifiers*/ 0, + (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), + /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE), + SSrc_i1:$src), + /*clamp*/ 0, /*omod*/ 0) >; def : GCNPat < diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 871a7c3c2579e..ca1f9f14937fb 100644 --- 
a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -503,7 +503,7 @@ let FPDPRounding = 1 in { defm V_FRACT_F16 : VOP1Inst_t16 <"v_fract_f16", VOP_F16_F16, AMDGPUfract>; } // End FPDPRounding = 1 -let OtherPredicates = [Has16BitInsts, NotHasTrue16BitInsts] in { +let OtherPredicates = [Has16BitInsts], True16Predicate = NotHasTrue16BitInsts in { def : GCNPat< (f32 (f16_to_fp i16:$src)), (V_CVT_F32_F16_e32 $src) @@ -513,7 +513,7 @@ def : GCNPat< (V_CVT_F16_F32_e32 $src) >; } -let OtherPredicates = [HasTrue16BitInsts] in { +let True16Predicate = UseRealTrue16Insts in { def : GCNPat< (f32 (f16_to_fp i16:$src)), (V_CVT_F32_F16_t16_e32 $src) @@ -523,6 +523,16 @@ def : GCNPat< (V_CVT_F16_F32_t16_e32 $src) >; } +let True16Predicate = UseFakeTrue16Insts in { +def : GCNPat< + (f32 (f16_to_fp i16:$src)), + (V_CVT_F32_F16_fake16_e32 $src) +>; +def : GCNPat< + (i16 (AMDGPUfp_to_f16 f32:$src)), + (V_CVT_F16_F32_fake16_e32 $src) +>; +} def VOP_SWAP_I32 : VOPProfile<[i32, i32, untyped, untyped]> { let Outs32 = (outs VGPR_32:$vdst, VRegSrc_32:$vdst1); @@ -1414,15 +1424,14 @@ def : GCNPat < } // End OtherPredicates = [isGFX8Plus, p] -let OtherPredicates = [UseFakeTrue16Insts] in { +let True16Predicate = UseFakeTrue16Insts in { def : GCNPat< (i32 (DivergentUnaryFrag i16:$src)), (COPY $src) >; -} // End OtherPredicates = [UseFakeTrue16Insts] - +} // End True16Predicate = UseFakeTrue16Insts -let OtherPredicates = [UseRealTrue16Insts] in { +let True16Predicate = UseRealTrue16Insts in { def : GCNPat< (i32 (UniformUnaryFrag (i16 SReg_32:$src))), (COPY $src) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir index 17cdab46c3b93..b5f91b6b86083 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir @@ -1,5 +1,6 
@@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s --- name: fcmp_false_f16 @@ -10,15 +11,27 @@ tracksRegLiveness: true body: | bb.0: liveins: $vgpr0, $vgpr1 - ; CHECK-LABEL: name: fcmp_false_f16 - ; CHECK: liveins: $vgpr0, $vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_CVT_F16_F32_fake16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_CMP_F_F16_t16_e64_:%[0-9]+]]:sreg_32 = V_CMP_F_F16_t16_e64 0, [[V_CVT_F16_F32_fake16_e64_]], 0, [[V_CVT_F16_F32_fake16_e64_1]], 0, implicit $mode, implicit $exec - ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F16_t16_e64_]] + ; GFX11-TRUE16-LABEL: name: fcmp_false_f16 + ; GFX11-TRUE16: liveins: $vgpr0, $vgpr1 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_]] + ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_1:%[0-9]+]]:vgpr_16 = nofpexcept 
V_CVT_F16_F32_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_1]] + ; GFX11-TRUE16-NEXT: [[V_CMP_F_F16_t16_e64_:%[0-9]+]]:sreg_32 = V_CMP_F_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F16_t16_e64_]] + ; + ; GFX11-FAKE16-LABEL: name: fcmp_false_f16 + ; GFX11-FAKE16: liveins: $vgpr0, $vgpr1 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-FAKE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-FAKE16-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_F16_F32_fake16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CMP_F_F16_t16_e64_:%[0-9]+]]:sreg_32 = V_CMP_F_F16_t16_e64 0, [[V_CVT_F16_F32_fake16_e64_]], 0, [[V_CVT_F16_F32_fake16_e64_1]], 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F16_t16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_FPTRUNC %0 @@ -36,15 +49,27 @@ tracksRegLiveness: true body: | bb.0: liveins: $vgpr0, $vgpr1 - ; CHECK-LABEL: name: fcmp_true_f16 - ; CHECK: liveins: $vgpr0, $vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_CVT_F16_F32_fake16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_CMP_TRU_F16_t16_e64_:%[0-9]+]]:sreg_32 = V_CMP_TRU_F16_t16_e64 0, [[V_CVT_F16_F32_fake16_e64_]], 0, 
[[V_CVT_F16_F32_fake16_e64_1]], 0, implicit $mode, implicit $exec - ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F16_t16_e64_]] + ; GFX11-TRUE16-LABEL: name: fcmp_true_f16 + ; GFX11-TRUE16: liveins: $vgpr0, $vgpr1 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_]] + ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_1:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_1]] + ; GFX11-TRUE16-NEXT: [[V_CMP_TRU_F16_t16_e64_:%[0-9]+]]:sreg_32 = V_CMP_TRU_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F16_t16_e64_]] + ; + ; GFX11-FAKE16-LABEL: name: fcmp_true_f16 + ; GFX11-FAKE16: liveins: $vgpr0, $vgpr1 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-FAKE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-FAKE16-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_F16_F32_fake16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CMP_TRU_F16_t16_e64_:%[0-9]+]]:sreg_32 = V_CMP_TRU_F16_t16_e64 0, [[V_CVT_F16_F32_fake16_e64_]], 0, [[V_CVT_F16_F32_fake16_e64_1]], 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F16_t16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_FPTRUNC %0 @@ -62,13 +87,13 @@ 
tracksRegLiveness: true body: | bb.0: liveins: $vgpr0, $vgpr1 - ; CHECK-LABEL: name: fcmp_false_f32 - ; CHECK: liveins: $vgpr0, $vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[V_CMP_F_F32_e64_:%[0-9]+]]:sreg_32 = V_CMP_F_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F32_e64_]] + ; GFX11-LABEL: name: fcmp_false_f32 + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[V_CMP_F_F32_e64_:%[0-9]+]]:sreg_32 = V_CMP_F_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %4:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.fcmp), %0, %1, 0 @@ -84,13 +109,13 @@ tracksRegLiveness: true body: | bb.0: liveins: $vgpr0, $vgpr1 - ; CHECK-LABEL: name: fcmp_true_f32 - ; CHECK: liveins: $vgpr0, $vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[V_CMP_TRU_F32_e64_:%[0-9]+]]:sreg_32 = V_CMP_TRU_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F32_e64_]] + ; GFX11-LABEL: name: fcmp_true_f32 + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[V_CMP_TRU_F32_e64_:%[0-9]+]]:sreg_32 = V_CMP_TRU_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %4:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.fcmp), 
%0, %1, 15 @@ -106,15 +131,15 @@ tracksRegLiveness: true body: | bb.0: liveins: $vgpr0, $vgpr1 - ; CHECK-LABEL: name: fcmp_false_f64 - ; CHECK: liveins: $vgpr0, $vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[V_CVT_F64_F32_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_CVT_F64_F32_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_CMP_F_F64_e64_:%[0-9]+]]:sreg_32 = V_CMP_F_F64_e64 0, [[V_CVT_F64_F32_e64_]], 0, [[V_CVT_F64_F32_e64_1]], 0, implicit $mode, implicit $exec - ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F64_e64_]] + ; GFX11-LABEL: name: fcmp_false_f64 + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[V_CVT_F64_F32_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_CVT_F64_F32_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_CMP_F_F64_e64_:%[0-9]+]]:sreg_32 = V_CMP_F_F64_e64 0, [[V_CVT_F64_F32_e64_]], 0, [[V_CVT_F64_F32_e64_1]], 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F64_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s64) = G_FPEXT %0 @@ -132,15 +157,15 @@ tracksRegLiveness: true body: | bb.0: liveins: $vgpr0, $vgpr1 - ; CHECK-LABEL: name: fcmp_true_f64 - ; CHECK: liveins: $vgpr0, $vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[V_CVT_F64_F32_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec 
- ; CHECK-NEXT: [[V_CVT_F64_F32_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_CMP_TRU_F64_e64_:%[0-9]+]]:sreg_32 = V_CMP_TRU_F64_e64 0, [[V_CVT_F64_F32_e64_]], 0, [[V_CVT_F64_F32_e64_1]], 0, implicit $mode, implicit $exec - ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F64_e64_]] + ; GFX11-LABEL: name: fcmp_true_f64 + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[V_CVT_F64_F32_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_CVT_F64_F32_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_CMP_TRU_F64_e64_:%[0-9]+]]:sreg_32 = V_CMP_TRU_F64_e64 0, [[V_CVT_F64_F32_e64_]], 0, [[V_CVT_F64_F32_e64_1]], 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F64_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s64) = G_FPEXT %0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w64.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w64.mir index 158076a3b74a2..a67a0b6455fac 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w64.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w64.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr="+wavefrontsize64" -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr="+wavefrontsize64",+real-true16 -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +# RUN: llc 
-mtriple=amdgcn -mcpu=gfx1100 -mattr="+wavefrontsize64",-real-true16 -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s --- name: fcmp_false_f16 @@ -10,15 +11,27 @@ tracksRegLiveness: true body: | bb.0: liveins: $vgpr0, $vgpr1 - ; CHECK-LABEL: name: fcmp_false_f16 - ; CHECK: liveins: $vgpr0, $vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_CVT_F16_F32_fake16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_CMP_F_F16_t16_e64_:%[0-9]+]]:sreg_64 = V_CMP_F_F16_t16_e64 0, [[V_CVT_F16_F32_fake16_e64_]], 0, [[V_CVT_F16_F32_fake16_e64_1]], 0, implicit $mode, implicit $exec - ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F16_t16_e64_]] + ; GFX11-TRUE16-LABEL: name: fcmp_false_f16 + ; GFX11-TRUE16: liveins: $vgpr0, $vgpr1 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_]] + ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_1:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_1]] + ; GFX11-TRUE16-NEXT: [[V_CMP_F_F16_t16_e64_:%[0-9]+]]:sreg_64 = V_CMP_F_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F16_t16_e64_]] + ; + ; 
GFX11-FAKE16-LABEL: name: fcmp_false_f16 + ; GFX11-FAKE16: liveins: $vgpr0, $vgpr1 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-FAKE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-FAKE16-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_F16_F32_fake16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CMP_F_F16_t16_e64_:%[0-9]+]]:sreg_64 = V_CMP_F_F16_t16_e64 0, [[V_CVT_F16_F32_fake16_e64_]], 0, [[V_CVT_F16_F32_fake16_e64_1]], 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F16_t16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_FPTRUNC %0 @@ -36,15 +49,27 @@ tracksRegLiveness: true body: | bb.0: liveins: $vgpr0, $vgpr1 - ; CHECK-LABEL: name: fcmp_true_f16 - ; CHECK: liveins: $vgpr0, $vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_CVT_F16_F32_fake16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_CMP_TRU_F16_t16_e64_:%[0-9]+]]:sreg_64 = V_CMP_TRU_F16_t16_e64 0, [[V_CVT_F16_F32_fake16_e64_]], 0, [[V_CVT_F16_F32_fake16_e64_1]], 0, implicit $mode, implicit $exec - ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F16_t16_e64_]] + ; GFX11-TRUE16-LABEL: name: fcmp_true_f16 + ; GFX11-TRUE16: liveins: $vgpr0, $vgpr1 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-TRUE16-NEXT: 
[[V_CVT_F16_F32_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_]] + ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_1:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_1]] + ; GFX11-TRUE16-NEXT: [[V_CMP_TRU_F16_t16_e64_:%[0-9]+]]:sreg_64 = V_CMP_TRU_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F16_t16_e64_]] + ; + ; GFX11-FAKE16-LABEL: name: fcmp_true_f16 + ; GFX11-FAKE16: liveins: $vgpr0, $vgpr1 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-FAKE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-FAKE16-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_F16_F32_fake16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CMP_TRU_F16_t16_e64_:%[0-9]+]]:sreg_64 = V_CMP_TRU_F16_t16_e64 0, [[V_CVT_F16_F32_fake16_e64_]], 0, [[V_CVT_F16_F32_fake16_e64_1]], 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F16_t16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_FPTRUNC %0 @@ -62,13 +87,13 @@ tracksRegLiveness: true body: | bb.0: liveins: $vgpr0, $vgpr1 - ; CHECK-LABEL: name: fcmp_false_f32 - ; CHECK: liveins: $vgpr0, $vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[V_CMP_F_F32_e64_:%[0-9]+]]:sreg_64 = V_CMP_F_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit 
$exec - ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F32_e64_]] + ; GFX11-LABEL: name: fcmp_false_f32 + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[V_CMP_F_F32_e64_:%[0-9]+]]:sreg_64 = V_CMP_F_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %4:sgpr(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.fcmp), %0, %1, 0 @@ -84,13 +109,13 @@ tracksRegLiveness: true body: | bb.0: liveins: $vgpr0, $vgpr1 - ; CHECK-LABEL: name: fcmp_true_f32 - ; CHECK: liveins: $vgpr0, $vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[V_CMP_TRU_F32_e64_:%[0-9]+]]:sreg_64 = V_CMP_TRU_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F32_e64_]] + ; GFX11-LABEL: name: fcmp_true_f32 + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[V_CMP_TRU_F32_e64_:%[0-9]+]]:sreg_64 = V_CMP_TRU_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %4:sgpr(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.fcmp), %0, %1, 15 @@ -106,15 +131,15 @@ tracksRegLiveness: true body: | bb.0: liveins: $vgpr0, $vgpr1 - ; CHECK-LABEL: name: fcmp_false_f64 - ; CHECK: liveins: $vgpr0, $vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[V_CVT_F64_F32_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, 
[[COPY]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_CVT_F64_F32_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_CMP_F_F64_e64_:%[0-9]+]]:sreg_64 = V_CMP_F_F64_e64 0, [[V_CVT_F64_F32_e64_]], 0, [[V_CVT_F64_F32_e64_1]], 0, implicit $mode, implicit $exec - ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F64_e64_]] + ; GFX11-LABEL: name: fcmp_false_f64 + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[V_CVT_F64_F32_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_CVT_F64_F32_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_CMP_F_F64_e64_:%[0-9]+]]:sreg_64 = V_CMP_F_F64_e64 0, [[V_CVT_F64_F32_e64_]], 0, [[V_CVT_F64_F32_e64_1]], 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F64_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s64) = G_FPEXT %0 @@ -132,15 +157,15 @@ tracksRegLiveness: true body: | bb.0: liveins: $vgpr0, $vgpr1 - ; CHECK-LABEL: name: fcmp_true_f64 - ; CHECK: liveins: $vgpr0, $vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[V_CVT_F64_F32_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_CVT_F64_F32_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_CMP_TRU_F64_e64_:%[0-9]+]]:sreg_64 = V_CMP_TRU_F64_e64 0, [[V_CVT_F64_F32_e64_]], 0, [[V_CVT_F64_F32_e64_1]], 0, implicit $mode, implicit $exec - ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F64_e64_]] + ; GFX11-LABEL: name: 
fcmp_true_f64 + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[V_CVT_F64_F32_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_CVT_F64_F32_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_CMP_TRU_F64_e64_:%[0-9]+]]:sreg_64 = V_CMP_TRU_F64_e64 0, [[V_CVT_F64_F32_e64_]], 0, [[V_CVT_F64_F32_e64_1]], 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F64_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s64) = G_FPEXT %0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir index 0ff633fb4d8be..df2f390124ebd 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX8 %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX11 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX11-TRUE16 %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX11-FAKE16 %s --- @@ -45,15 +45,15 @@ body: | ; GFX8-NEXT: [[V_CEIL_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept 
V_CEIL_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY [[V_CEIL_F16_e64_]] ; - ; GFX11-LABEL: name: fceil_s16_vv - ; GFX11: liveins: $vgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 - ; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_CEIL_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16 - ; GFX11-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]] + ; GFX11-TRUE16-LABEL: name: fceil_s16_vv + ; GFX11-TRUE16: liveins: $vgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-TRUE16-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; GFX11-TRUE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_CEIL_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16 + ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]] ; ; GFX11-FAKE16-LABEL: name: fceil_s16_vv ; GFX11-FAKE16: liveins: $vgpr0 @@ -85,14 +85,14 @@ body: | ; GFX8-NEXT: [[V_CEIL_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CEIL_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY [[V_CEIL_F16_e64_]] ; - ; GFX11-LABEL: name: fceil_s16_vs - ; GFX11: liveins: $sgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = 
REG_SEQUENCE [[V_CEIL_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16 - ; GFX11-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]] + ; GFX11-TRUE16-LABEL: name: fceil_s16_vs + ; GFX11-TRUE16: liveins: $sgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX11-TRUE16-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; GFX11-TRUE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_CEIL_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16 + ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]] ; ; GFX11-FAKE16-LABEL: name: fceil_s16_vs ; GFX11-FAKE16: liveins: $sgpr0 @@ -124,15 +124,15 @@ body: | ; GFX8-NEXT: [[V_CEIL_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CEIL_F16_e64 1, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY [[V_CEIL_F16_e64_]] ; - ; GFX11-LABEL: name: fceil_fneg_s16_vv - ; GFX11: liveins: $vgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 - ; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 1, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_CEIL_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16 - ; GFX11-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]] + ; GFX11-TRUE16-LABEL: name: fceil_fneg_s16_vv + ; GFX11-TRUE16: liveins: $vgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-TRUE16-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 1, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF 
+ ; GFX11-TRUE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_CEIL_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16 + ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]] ; ; GFX11-FAKE16-LABEL: name: fceil_fneg_s16_vv ; GFX11-FAKE16: liveins: $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir index fc8a6aaa17512..df62806b61918 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefix=VI %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX11 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX11-TRUE16 %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX11-FAKE16 %s --- @@ -54,15 +54,15 @@ body: | ; VI-NEXT: [[V_FLOOR_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_FLOOR_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; VI-NEXT: $vgpr0 = COPY [[V_FLOOR_F16_e64_]] ; - ; GFX11-LABEL: name: ffloor_s16_vv - ; GFX11: liveins: $vgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 - ; GFX11-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = 
IMPLICIT_DEF - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_FLOOR_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16 - ; GFX11-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]] + ; GFX11-TRUE16-LABEL: name: ffloor_s16_vv + ; GFX11-TRUE16: liveins: $vgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-TRUE16-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; GFX11-TRUE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_FLOOR_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16 + ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]] ; ; GFX11-FAKE16-LABEL: name: ffloor_s16_vv ; GFX11-FAKE16: liveins: $vgpr0 @@ -94,14 +94,14 @@ body: | ; VI-NEXT: [[V_FLOOR_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_FLOOR_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; VI-NEXT: $vgpr0 = COPY [[V_FLOOR_F16_e64_]] ; - ; GFX11-LABEL: name: ffloor_s16_vs - ; GFX11: liveins: $sgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX11-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_FLOOR_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16 - ; GFX11-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]] + ; GFX11-TRUE16-LABEL: name: ffloor_s16_vs + ; GFX11-TRUE16: liveins: $sgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX11-TRUE16-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF 
+ ; GFX11-TRUE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_FLOOR_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16 + ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]] ; ; GFX11-FAKE16-LABEL: name: ffloor_s16_vs ; GFX11-FAKE16: liveins: $sgpr0 @@ -133,15 +133,15 @@ body: | ; VI-NEXT: [[V_FLOOR_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_FLOOR_F16_e64 1, [[COPY]], 0, 0, implicit $mode, implicit $exec ; VI-NEXT: $vgpr0 = COPY [[V_FLOOR_F16_e64_]] ; - ; GFX11-LABEL: name: ffloor_fneg_s16_vv - ; GFX11: liveins: $vgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 - ; GFX11-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 1, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_FLOOR_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16 - ; GFX11-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]] + ; GFX11-TRUE16-LABEL: name: ffloor_fneg_s16_vv + ; GFX11-TRUE16: liveins: $vgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-TRUE16-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 1, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; GFX11-TRUE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_FLOOR_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16 + ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]] ; ; GFX11-FAKE16-LABEL: name: ffloor_fneg_s16_vv ; GFX11-FAKE16: liveins: $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptosi.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptosi.mir index 32a73bc4e24a5..03cb907f82a16 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptosi.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptosi.mir @@ -1,7 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=GCN # RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=VI -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=GFX11 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GFX11,GFX11-TRUE16 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16 --- name: fptosi_s32_to_s32_vv @@ -135,13 +136,22 @@ body: | ; VI-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec ; VI-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]] ; - ; GFX11-LABEL: name: fptosi_s16_to_s32_vv - ; GFX11: liveins: $vgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec - ; GFX11-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]] + ; GFX11-TRUE16-LABEL: name: fptosi_s16_to_s32_vv + ; GFX11-TRUE16: liveins: $vgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-TRUE16-NEXT: 
[[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_t16_e64_]], implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]] + ; + ; GFX11-FAKE16-LABEL: name: fptosi_s16_to_s32_vv + ; GFX11-FAKE16: liveins: $vgpr0 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-FAKE16-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s16) = G_TRUNC %0 %2:vgpr(s32) = G_FPTOSI %1 @@ -174,13 +184,21 @@ body: | ; VI-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec ; VI-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]] ; - ; GFX11-LABEL: name: fptosi_s16_to_s32_vs - ; GFX11: liveins: $sgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX11-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec - ; GFX11-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]] + ; GFX11-TRUE16-LABEL: name: fptosi_s16_to_s32_vs + ; GFX11-TRUE16: liveins: $sgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX11-TRUE16-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[COPY]], 0, 0, 0, 
implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_t16_e64_]], implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]] + ; + ; GFX11-FAKE16-LABEL: name: fptosi_s16_to_s32_vs + ; GFX11-FAKE16: liveins: $sgpr0 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX11-FAKE16-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]] %0:sgpr(s32) = COPY $sgpr0 %1:sgpr(s16) = G_TRUNC %0 %2:vgpr(s32) = G_FPTOSI %1 @@ -217,15 +235,25 @@ body: | ; VI-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec ; VI-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]] ; - ; GFX11-LABEL: name: fptosi_s16_to_s32_fneg_vv - ; GFX11: liveins: $vgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32768 - ; GFX11-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[S_MOV_B32_]], [[COPY]], implicit $exec - ; GFX11-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[V_XOR_B32_e64_]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec - ; GFX11-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]] + ; GFX11-TRUE16-LABEL: name: fptosi_s16_to_s32_fneg_vv + ; GFX11-TRUE16: liveins: $vgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: 
[[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-TRUE16-NEXT: [[V_XOR_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_XOR_B16_t16_e64 0, 32768, 0, [[COPY1]], 0, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[V_XOR_B16_t16_e64_]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_t16_e64_]], implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]] + ; + ; GFX11-FAKE16-LABEL: name: fptosi_s16_to_s32_fneg_vv + ; GFX11-FAKE16: liveins: $vgpr0 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-FAKE16-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32768 + ; GFX11-FAKE16-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[S_MOV_B32_]], [[COPY]], implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[V_XOR_B32_e64_]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s16) = G_TRUNC %0 %2:vgpr(s16) = G_FNEG %1 @@ -259,13 +287,23 @@ body: | ; VI-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec ; VI-NEXT: S_ENDPGM 0, implicit [[V_CVT_I32_F32_e32_]] ; - ; GFX11-LABEL: name: fptosi_s16_to_s1_vv - ; GFX11: liveins: $vgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 
[[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CVT_I32_F32_e32_]] + ; GFX11-TRUE16-LABEL: name: fptosi_s16_to_s1_vv + ; GFX11-TRUE16: liveins: $vgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-TRUE16-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_t16_e64_]], implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[V_CVT_I32_F32_e32_]].lo16 + ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[COPY2]] + ; + ; GFX11-FAKE16-LABEL: name: fptosi_s16_to_s1_vv + ; GFX11-FAKE16: liveins: $vgpr0 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-FAKE16-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: S_ENDPGM 0, implicit [[V_CVT_I32_F32_e32_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s16) = G_TRUNC %0 %2:vgpr(s32) = G_FPTOSI %1 @@ -299,13 +337,22 @@ body: | ; VI-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec ; VI-NEXT: S_ENDPGM 0, implicit [[V_CVT_I32_F32_e32_]] ; - ; GFX11-LABEL: name: fptosi_s16_to_s1_vs - ; GFX11: liveins: $sgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX11-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: 
[[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CVT_I32_F32_e32_]] + ; GFX11-TRUE16-LABEL: name: fptosi_s16_to_s1_vs + ; GFX11-TRUE16: liveins: $sgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX11-TRUE16-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_t16_e64_]], implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[V_CVT_I32_F32_e32_]].lo16 + ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[COPY1]] + ; + ; GFX11-FAKE16-LABEL: name: fptosi_s16_to_s1_vs + ; GFX11-FAKE16: liveins: $sgpr0 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX11-FAKE16-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: S_ENDPGM 0, implicit [[V_CVT_I32_F32_e32_]] %0:sgpr(s32) = COPY $sgpr0 %1:sgpr(s16) = G_TRUNC %0 %2:vgpr(s32) = G_FPTOSI %1 @@ -343,15 +390,26 @@ body: | ; VI-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec ; VI-NEXT: S_ENDPGM 0, implicit [[V_CVT_I32_F32_e32_]] ; - ; GFX11-LABEL: name: fptosi_s16_to_s1_fneg_vv - ; GFX11: liveins: $vgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32768 - ; GFX11-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[S_MOV_B32_]], [[COPY]], implicit $exec 
- ; GFX11-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[V_XOR_B32_e64_]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CVT_I32_F32_e32_]] + ; GFX11-TRUE16-LABEL: name: fptosi_s16_to_s1_fneg_vv + ; GFX11-TRUE16: liveins: $vgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-TRUE16-NEXT: [[V_XOR_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_XOR_B16_t16_e64 0, 32768, 0, [[COPY1]], 0, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[V_XOR_B16_t16_e64_]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_t16_e64_]], implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[V_CVT_I32_F32_e32_]].lo16 + ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[COPY2]] + ; + ; GFX11-FAKE16-LABEL: name: fptosi_s16_to_s1_fneg_vv + ; GFX11-FAKE16: liveins: $vgpr0 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-FAKE16-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32768 + ; GFX11-FAKE16-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[S_MOV_B32_]], [[COPY]], implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[V_XOR_B32_e64_]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: S_ENDPGM 0, implicit [[V_CVT_I32_F32_e32_]] %0:vgpr(s32) = COPY $vgpr0 
%1:vgpr(s16) = G_TRUNC %0 %2:vgpr(s16) = G_FNEG %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptoui.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptoui.mir index 47a091804ce0a..521a0e8a2a796 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptoui.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptoui.mir @@ -1,7 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=GCN # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=VI -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=GFX11 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GFX11,GFX11-TRUE16 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16 --- @@ -85,13 +86,22 @@ body: | ; VI-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec ; VI-NEXT: $vgpr0 = COPY [[V_CVT_U32_F32_e32_]] ; - ; GFX11-LABEL: name: fptoui_s16_to_s32_vv - ; GFX11: liveins: $vgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec - ; GFX11-NEXT: $vgpr0 = COPY [[V_CVT_U32_F32_e32_]] + ; GFX11-TRUE16-LABEL: name: 
fptoui_s16_to_s32_vv + ; GFX11-TRUE16: liveins: $vgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-TRUE16-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_t16_e64_]], implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[V_CVT_U32_F32_e32_]] + ; + ; GFX11-FAKE16-LABEL: name: fptoui_s16_to_s32_vv + ; GFX11-FAKE16: liveins: $vgpr0 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-FAKE16-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: $vgpr0 = COPY [[V_CVT_U32_F32_e32_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s16) = G_TRUNC %0 %2:vgpr(s32) = G_FPTOUI %1 @@ -124,13 +134,21 @@ body: | ; VI-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec ; VI-NEXT: $vgpr0 = COPY [[V_CVT_U32_F32_e32_]] ; - ; GFX11-LABEL: name: fptoui_s16_to_s32_vs - ; GFX11: liveins: $sgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX11-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec - ; GFX11-NEXT: $vgpr0 = COPY [[V_CVT_U32_F32_e32_]] + ; GFX11-TRUE16-LABEL: name: fptoui_s16_to_s32_vs + ; 
GFX11-TRUE16: liveins: $sgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX11-TRUE16-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_t16_e64_]], implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[V_CVT_U32_F32_e32_]] + ; + ; GFX11-FAKE16-LABEL: name: fptoui_s16_to_s32_vs + ; GFX11-FAKE16: liveins: $sgpr0 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX11-FAKE16-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: $vgpr0 = COPY [[V_CVT_U32_F32_e32_]] %0:sgpr(s32) = COPY $sgpr0 %1:sgpr(s16) = G_TRUNC %0 %2:vgpr(s32) = G_FPTOUI %1 @@ -167,15 +185,25 @@ body: | ; VI-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec ; VI-NEXT: $vgpr0 = COPY [[V_CVT_U32_F32_e32_]] ; - ; GFX11-LABEL: name: fptoui_s16_to_s32_fneg_vv - ; GFX11: liveins: $vgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32768 - ; GFX11-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[S_MOV_B32_]], [[COPY]], implicit $exec - ; GFX11-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[V_XOR_B32_e64_]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec - ; GFX11-NEXT: $vgpr0 = 
COPY [[V_CVT_U32_F32_e32_]] + ; GFX11-TRUE16-LABEL: name: fptoui_s16_to_s32_fneg_vv + ; GFX11-TRUE16: liveins: $vgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-TRUE16-NEXT: [[V_XOR_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_XOR_B16_t16_e64 0, 32768, 0, [[COPY1]], 0, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[V_XOR_B16_t16_e64_]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_t16_e64_]], implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[V_CVT_U32_F32_e32_]] + ; + ; GFX11-FAKE16-LABEL: name: fptoui_s16_to_s32_fneg_vv + ; GFX11-FAKE16: liveins: $vgpr0 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-FAKE16-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32768 + ; GFX11-FAKE16-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[S_MOV_B32_]], [[COPY]], implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[V_XOR_B32_e64_]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: $vgpr0 = COPY [[V_CVT_U32_F32_e32_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s16) = G_TRUNC %0 %2:vgpr(s16) = G_FNEG %1 @@ -209,13 +237,23 @@ body: | ; VI-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec ; VI-NEXT: S_ENDPGM 0, implicit [[V_CVT_U32_F32_e32_]] ; - ; GFX11-LABEL: name: fptoui_s16_to_s1_vv - ; GFX11: liveins: $vgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; 
GFX11-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CVT_U32_F32_e32_]] + ; GFX11-TRUE16-LABEL: name: fptoui_s16_to_s1_vv + ; GFX11-TRUE16: liveins: $vgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-TRUE16-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_t16_e64_]], implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[V_CVT_U32_F32_e32_]].lo16 + ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[COPY2]] + ; + ; GFX11-FAKE16-LABEL: name: fptoui_s16_to_s1_vv + ; GFX11-FAKE16: liveins: $vgpr0 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-FAKE16-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: S_ENDPGM 0, implicit [[V_CVT_U32_F32_e32_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s16) = G_TRUNC %0 %2:vgpr(s32) = G_FPTOUI %1 @@ -249,13 +287,22 @@ body: | ; VI-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec ; VI-NEXT: S_ENDPGM 0, implicit [[V_CVT_U32_F32_e32_]] ; - ; GFX11-LABEL: name: fptoui_s16_to_s1_vs - ; GFX11: liveins: $sgpr0 - ; GFX11-NEXT: {{ $}} 
- ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX11-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CVT_U32_F32_e32_]] + ; GFX11-TRUE16-LABEL: name: fptoui_s16_to_s1_vs + ; GFX11-TRUE16: liveins: $sgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX11-TRUE16-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_t16_e64_]], implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[V_CVT_U32_F32_e32_]].lo16 + ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[COPY1]] + ; + ; GFX11-FAKE16-LABEL: name: fptoui_s16_to_s1_vs + ; GFX11-FAKE16: liveins: $sgpr0 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX11-FAKE16-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: S_ENDPGM 0, implicit [[V_CVT_U32_F32_e32_]] %0:sgpr(s32) = COPY $sgpr0 %1:sgpr(s16) = G_TRUNC %0 %2:vgpr(s32) = G_FPTOUI %1 @@ -293,15 +340,26 @@ body: | ; VI-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec ; VI-NEXT: S_ENDPGM 0, implicit [[V_CVT_U32_F32_e32_]] ; - ; GFX11-LABEL: name: fptoui_s16_to_s1_fneg_vv - ; GFX11: liveins: $vgpr0 - ; GFX11-NEXT: {{ $}} - ; 
GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32768 - ; GFX11-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[S_MOV_B32_]], [[COPY]], implicit $exec - ; GFX11-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[V_XOR_B32_e64_]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CVT_U32_F32_e32_]] + ; GFX11-TRUE16-LABEL: name: fptoui_s16_to_s1_fneg_vv + ; GFX11-TRUE16: liveins: $vgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-TRUE16-NEXT: [[V_XOR_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_XOR_B16_t16_e64 0, 32768, 0, [[COPY1]], 0, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[V_XOR_B16_t16_e64_]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_t16_e64_]], implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[V_CVT_U32_F32_e32_]].lo16 + ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[COPY2]] + ; + ; GFX11-FAKE16-LABEL: name: fptoui_s16_to_s1_fneg_vv + ; GFX11-FAKE16: liveins: $vgpr0 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-FAKE16-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32768 + ; GFX11-FAKE16-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[S_MOV_B32_]], [[COPY]], implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[V_XOR_B32_e64_]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: 
[[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: S_ENDPGM 0, implicit [[V_CVT_U32_F32_e32_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s16) = G_TRUNC %0 %2:vgpr(s16) = G_FNEG %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sitofp.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sitofp.mir index 938bb58bafc93..3888ce87b46fd 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sitofp.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sitofp.mir @@ -1,7 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=WAVE64 %s # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=WAVE32 %s -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX11 %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s --- @@ -85,13 +86,23 @@ body: | ; WAVE32-NEXT: [[V_CVT_F16_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, [[V_CVT_F32_I32_e32_]], 0, 0, implicit $mode, implicit $exec ; WAVE32-NEXT: $vgpr0 = COPY [[V_CVT_F16_F32_e64_]] ; - ; GFX11-LABEL: name: sitofp_s32_to_s16_vv - ; GFX11: liveins: $vgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[V_CVT_F32_I32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e32 [[COPY]], implicit $mode, implicit $exec - ; GFX11-NEXT: 
[[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[V_CVT_F32_I32_e32_]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: $vgpr0 = COPY [[V_CVT_F16_F32_fake16_e64_]] + ; GFX11-TRUE16-LABEL: name: sitofp_s32_to_s16_vv + ; GFX11-TRUE16: liveins: $vgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[V_CVT_F32_I32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e32 [[COPY]], implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[V_CVT_F32_I32_e32_]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; GFX11-TRUE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_CVT_F16_F32_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16 + ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]] + ; + ; GFX11-FAKE16-LABEL: name: sitofp_s32_to_s16_vv + ; GFX11-FAKE16: liveins: $vgpr0 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-FAKE16-NEXT: [[V_CVT_F32_I32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e32 [[COPY]], implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[V_CVT_F32_I32_e32_]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: $vgpr0 = COPY [[V_CVT_F16_F32_fake16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s16) = G_SITOFP %0 %2:vgpr(s32) = G_ANYEXT %1 @@ -124,13 +135,23 @@ body: | ; WAVE32-NEXT: [[V_CVT_F16_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, [[V_CVT_F32_I32_e32_]], 0, 0, implicit $mode, implicit $exec ; WAVE32-NEXT: $vgpr0 = COPY [[V_CVT_F16_F32_e64_]] ; - ; GFX11-LABEL: name: sitofp_s32_to_s16_vs - ; GFX11: liveins: $sgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX11-NEXT: [[V_CVT_F32_I32_e32_:%[0-9]+]]:vgpr_32 = 
V_CVT_F32_I32_e32 [[COPY]], implicit $mode, implicit $exec - ; GFX11-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[V_CVT_F32_I32_e32_]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: $vgpr0 = COPY [[V_CVT_F16_F32_fake16_e64_]] + ; GFX11-TRUE16-LABEL: name: sitofp_s32_to_s16_vs + ; GFX11-TRUE16: liveins: $sgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX11-TRUE16-NEXT: [[V_CVT_F32_I32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e32 [[COPY]], implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[V_CVT_F32_I32_e32_]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; GFX11-TRUE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_CVT_F16_F32_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16 + ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]] + ; + ; GFX11-FAKE16-LABEL: name: sitofp_s32_to_s16_vs + ; GFX11-FAKE16: liveins: $sgpr0 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX11-FAKE16-NEXT: [[V_CVT_F32_I32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e32 [[COPY]], implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[V_CVT_F32_I32_e32_]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: $vgpr0 = COPY [[V_CVT_F16_F32_fake16_e64_]] %0:sgpr(s32) = COPY $sgpr0 %1:vgpr(s16) = G_SITOFP %0 %2:vgpr(s32) = G_ANYEXT %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-uitofp.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-uitofp.mir index 9c6fded0d1425..35d622dc57d18 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-uitofp.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-uitofp.mir @@ -1,7 +1,8 @@ # NOTE: Assertions have been autogenerated by 
utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=WAVE64 %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=WAVE32 %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX11 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s --- name: uitofp_s32_to_s32_vv @@ -99,13 +100,23 @@ body: | ; WAVE32-NEXT: [[V_CVT_F16_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, [[V_CVT_F32_U32_e32_]], 0, 0, implicit $mode, implicit $exec ; WAVE32-NEXT: $vgpr0 = COPY [[V_CVT_F16_F32_e64_]] ; - ; GFX11-LABEL: name: uitofp_s32_to_s16_vv - ; GFX11: liveins: $vgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[V_CVT_F32_U32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e32 [[COPY]], implicit $mode, implicit $exec - ; GFX11-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[V_CVT_F32_U32_e32_]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: $vgpr0 = COPY [[V_CVT_F16_F32_fake16_e64_]] + ; GFX11-TRUE16-LABEL: name: uitofp_s32_to_s16_vv + ; GFX11-TRUE16: liveins: $vgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[V_CVT_F32_U32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e32 [[COPY]], implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[V_CVT_F32_U32_e32_]], 0, 0, 0, implicit $mode, implicit 
$exec + ; GFX11-TRUE16-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; GFX11-TRUE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_CVT_F16_F32_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16 + ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]] + ; + ; GFX11-FAKE16-LABEL: name: uitofp_s32_to_s16_vv + ; GFX11-FAKE16: liveins: $vgpr0 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-FAKE16-NEXT: [[V_CVT_F32_U32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e32 [[COPY]], implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[V_CVT_F32_U32_e32_]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: $vgpr0 = COPY [[V_CVT_F16_F32_fake16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s16) = G_UITOFP %0 %2:vgpr(s32) = G_ANYEXT %1 @@ -138,13 +149,23 @@ body: | ; WAVE32-NEXT: [[V_CVT_F16_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, [[V_CVT_F32_U32_e32_]], 0, 0, implicit $mode, implicit $exec ; WAVE32-NEXT: $vgpr0 = COPY [[V_CVT_F16_F32_e64_]] ; - ; GFX11-LABEL: name: uitofp_s32_to_s16_vs - ; GFX11: liveins: $sgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX11-NEXT: [[V_CVT_F32_U32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e32 [[COPY]], implicit $mode, implicit $exec - ; GFX11-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[V_CVT_F32_U32_e32_]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: $vgpr0 = COPY [[V_CVT_F16_F32_fake16_e64_]] + ; GFX11-TRUE16-LABEL: name: uitofp_s32_to_s16_vs + ; GFX11-TRUE16: liveins: $sgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX11-TRUE16-NEXT: [[V_CVT_F32_U32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e32 [[COPY]], implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept 
V_CVT_F16_F32_t16_e64 0, [[V_CVT_F32_U32_e32_]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; GFX11-TRUE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_CVT_F16_F32_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16 + ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]] + ; + ; GFX11-FAKE16-LABEL: name: uitofp_s32_to_s16_vs + ; GFX11-FAKE16: liveins: $sgpr0 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX11-FAKE16-NEXT: [[V_CVT_F32_U32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e32 [[COPY]], implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[V_CVT_F32_U32_e32_]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: $vgpr0 = COPY [[V_CVT_F16_F32_fake16_e64_]] %0:sgpr(s32) = COPY $sgpr0 %1:vgpr(s16) = G_UITOFP %0 %2:vgpr(s32) = G_ANYEXT %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll index 9d586e3e4a09a..eeb7b138fde31 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck -check-prefixes=GFX78,GFX7 %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GFX78,GFX8 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define i16 @v_powi_f16(i16 %l, i32 %r) { ; GFX7-LABEL: 
v_powi_f16: @@ -36,21 +37,37 @@ define i16 @v_powi_f16(i16 %l, i32 %r) { ; GFX8-NEXT: v_exp_f16_e32 v0, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_powi_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_log_f16_e32 v0, v0 -; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: v_exp_f16_e32 v0, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_powi_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_log_f16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1 +; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v0.l +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.h +; GFX11-TRUE16-NEXT: v_mul_dx9_zero_f32_e32 v0, v1, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX11-TRUE16-NEXT: v_exp_f16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_powi_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_log_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-FAKE16-NEXT: 
s_waitcnt_depctr 0xfff +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX11-FAKE16-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: v_exp_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %l.cast = bitcast i16 %l to half %res = call half @llvm.powi.f16.i32(half %l.cast, i32 %r) %res.cast = bitcast half %res to i16 diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-fake16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-fake16.mir index 265bdd0cf2f48..30a24c675a76b 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-fake16.mir +++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-fake16.mir @@ -1,6 +1,29 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3 # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN %s +# V_CMP_LT_F16 will be replaced with fake16 when its true16/fake16 profile is corrected + +--- +name: cmp_f16 +body: | + bb.0.entry: + ; GCN-LABEL: name: cmp_f16 + ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GCN-NEXT: [[V_CVT_F16_U16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F16_U16_fake16_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GCN-NEXT: [[V_CMP_LT_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LT_F16_t16_e64 0, [[V_CVT_F16_U16_fake16_e64_]], 0, [[DEF1]], 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed [[V_CMP_LT_F16_t16_e64_]], implicit $exec + %0:vgpr_32 = IMPLICIT_DEF + %1:sreg_32 = IMPLICIT_DEF + %2:vgpr_32 = V_CVT_F16_U16_fake16_e64 %0:vgpr_32, 0, 0, implicit $mode, implicit
$exec + %3:sreg_32 = COPY %2:vgpr_32 + nofpexcept S_CMP_LT_F16 killed %3:sreg_32, %1:sreg_32, implicit-def $scc, implicit $mode + %4:sreg_32_xm0_xexec = COPY $scc + %5:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed %4, implicit $exec +... + +# Needs extra shift instruction to select hi 16 bits --- name: cvt_hi_f32_f16 body: | diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir index 03a77dc2b8b5e..4604518d71c96 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir +++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir @@ -1,20 +1,39 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3 # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN %s -# XFAIL: * -# FIXME-TRUE16. reenable after CVT_F16_U16_t16 is supported in CodeGen +# + +--- +name: cmp_f16 +body: | + bb.0.entry: + ; GCN-LABEL: name: cmp_f16 + ; GCN: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GCN-NEXT: [[V_CVT_F16_U16_t16_e64_:%[0-9]+]]:vgpr_16 = V_CVT_F16_U16_t16_e64 0, [[DEF]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_U16_t16_e64_]] + ; GCN-NEXT: [[V_CMP_LT_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LT_F16_t16_e64 0, killed [[COPY]], 0, [[DEF1]], 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed [[V_CMP_LT_F16_t16_e64_]], implicit $exec + %0:vgpr_16 = IMPLICIT_DEF + %1:sreg_32 = IMPLICIT_DEF + %2:vgpr_16 = V_CVT_F16_U16_t16_e64 0, %0:vgpr_16, 0, 0, 0, implicit $mode, implicit $exec + %3:sreg_32 = COPY %2:vgpr_16 + nofpexcept S_CMP_LT_F16 killed %3:sreg_32, %1:sreg_32, implicit-def $scc, implicit $mode + %4:sreg_32_xm0_xexec = COPY $scc + %5:vgpr_32 = 
V_CNDMASK_B32_e64 0, 0, 0, -1, killed %4, implicit $exec +... --- name: cvt_hi_f32_f16 body: | bb.0: ; GCN-LABEL: name: cvt_hi_f32_f16 - ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GCN-NEXT: [[V_CVT_F16_U16_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F16_U16_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF - ; GCN-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[V_CVT_F16_U16_e64_]], implicit $exec - ; GCN-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_F16_t16_e64 0, [[V_LSHRREV_B32_e64_]], 0, 0, implicit $mode, implicit $exec + ; GCN: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; GCN-NEXT: [[V_CVT_F16_U16_t16_e64_:%[0-9]+]]:vgpr_16 = V_CVT_F16_U16_t16_e64 0, [[DEF]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_U16_t16_e64_]] + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; GCN-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_F16_t16_e64 0, [[COPY1]].hi16, 0, 0, 0, implicit $mode, implicit $exec %0:vgpr_16 = IMPLICIT_DEF - %1:vgpr_16 = V_CVT_F16_U16_t16_e64 %0:vgpr_16, 0, 0, 0, implicit $mode, implicit $exec + %1:vgpr_16 = V_CVT_F16_U16_t16_e64 0, %0:vgpr_16, 0, 0, 0, implicit $mode, implicit $exec %2:sreg_32 = COPY %1:vgpr_16 %3:sreg_32 = S_CVT_HI_F32_F16 %2:sreg_32, implicit $mode ... 
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir index 9a727a321d786..e8291f7ab8f72 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir +++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir @@ -2,26 +2,6 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,REAL16 %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,FAKE16 %s ---- -name: cmp_f16 -body: | - bb.0.entry: - ; GCN-LABEL: name: cmp_f16 - ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF - ; GCN-NEXT: [[V_CVT_F16_U16_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F16_U16_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF - ; GCN-NEXT: [[V_CMP_LT_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LT_F16_t16_e64 0, [[V_CVT_F16_U16_e64_]], 0, [[DEF1]], 0, implicit $mode, implicit $exec - ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed [[V_CMP_LT_F16_t16_e64_]], implicit $exec - %0:vgpr_32 = IMPLICIT_DEF - %1:sreg_32 = IMPLICIT_DEF - %2:vgpr_32 = V_CVT_F16_U16_e64 %0:vgpr_32, 0, 0, implicit $mode, implicit $exec - %3:sreg_32 = COPY %2:vgpr_32 - nofpexcept S_CMP_LT_F16 killed %3:sreg_32, %1:sreg_32, implicit-def $scc, implicit $mode - %4:sreg_32_xm0_xexec = COPY $scc - %5:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed %4, implicit $exec -... 
- --- name: fmac_f16 body: | diff --git a/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll b/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll index 81859dce04889..064e88873a175 100644 --- a/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll +++ b/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6 %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s ; RUN: llc -mtriple=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=CYPRESS %s ; RUN: llc -mtriple=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefixes=CAYMAN %s @@ -44,25 +45,45 @@ define amdgpu_kernel void @test_convert_fp16_to_fp32(ptr addrspace(1) noalias %o ; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX8-NEXT: s_endpgm ; -; GFX11-LABEL: test_convert_fp16_to_fp32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-NEXT: buffer_store_b32 v0, off, 
s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: test_convert_fp16_to_fp32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: test_convert_fp16_to_fp32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ; ; CYPRESS-LABEL: test_convert_fp16_to_fp32: ; CYPRESS: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll b/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll index c17be87834aeb..6c9f451167b7b 100644 --- 
a/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll +++ b/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll @@ -1,7 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6 %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s + declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone @@ -44,27 +46,49 @@ define amdgpu_kernel void @test_convert_fp16_to_fp64(ptr addrspace(1) noalias %o ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm ; -; GFX11-LABEL: test_convert_fp16_to_fp64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: test_convert_fp16_to_fp64: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: 
s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: test_convert_fp16_to_fp64: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm %val = load i16, ptr addrspace(1) %in, align 2 %cvt = call double @llvm.convert.from.fp16.f64(i16 %val) nounwind readnone store double %cvt, ptr addrspace(1) %out, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll 
b/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll index d8a726f251a01..5bac710070477 100644 --- a/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll +++ b/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=CYPRESS %s declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone @@ -43,25 +44,45 @@ define amdgpu_kernel void @test_convert_fp32_to_fp16(ptr addrspace(1) noalias %o ; GFX8-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX8-NEXT: s_endpgm ; -; GFX11-LABEL: test_convert_fp32_to_fp16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: 
s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: test_convert_fp32_to_fp16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: test_convert_fp32_to_fp16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ; ; CYPRESS-LABEL: test_convert_fp32_to_fp16: ; CYPRESS: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll 
b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll index 75f4dff14fcbd..a40d678e84d72 100644 --- a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll @@ -2,7 +2,8 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=SI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX89,VI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX89,GFX9 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX11 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX11-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX11-FAKE16 %s define amdgpu_kernel void @fpext_f16_to_f32( ; SI-LABEL: fpext_f16_to_f32: @@ -59,25 +60,45 @@ define amdgpu_kernel void @fpext_f16_to_f32( ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; -; GFX11-LABEL: fpext_f16_to_f32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 
s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fpext_f16_to_f32: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fpext_f16_to_f32: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: 
v_cvt_f32_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) #0 { entry: @@ -145,27 +166,49 @@ define amdgpu_kernel void @fpext_f16_to_f64( ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; -; GFX11-LABEL: fpext_f16_to_f64: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fpext_f16_to_f64: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 
+; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fpext_f16_to_f64: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) #0 { entry: @@ -234,28 +277,51 @@ define amdgpu_kernel void @fpext_v2f16_to_v2f32( ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; -; GFX11-LABEL: fpext_v2f16_to_v2f32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; GFX11-NEXT: 
s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fpext_v2f16_to_v2f32: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v1.l +; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fpext_v2f16_to_v2f32: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) #0 { entry: @@ -330,31 +396,57 @@ define amdgpu_kernel void @fpext_v2f16_to_v2f64( ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; -; GFX11-LABEL: fpext_v2f16_to_v2f64: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fpext_v2f16_to_v2f64: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v1.l +; GFX11-TRUE16-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX11-TRUE16-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fpext_v2f16_to_v2f64: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX11-FAKE16-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX11-FAKE16-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -387,19 +479,35 @@ define amdgpu_kernel void @s_fneg_fpext_f16_to_f32(ptr addrspace(1) %r, i32 %a) ; GFX89-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX89-NEXT: s_endpgm ; -; GFX11-LABEL: s_fneg_fpext_f16_to_f32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; 
GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s4 -; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: s_fneg_fpext_f16_to_f32: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: s_fneg_fpext_f16_to_f32: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, s4 +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm entry: %a.trunc = trunc i32 %a to i16 %a.val = bitcast i16 %a.trunc to half @@ -463,25 +571,45 @@ define amdgpu_kernel void @fneg_fpext_f16_to_f32( ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; -; GFX11-LABEL: fneg_fpext_f16_to_f32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 
0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e64 v0, -v0 -; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fneg_fpext_f16_to_f32: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v0, -v0.l +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fneg_fpext_f16_to_f32: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; 
GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -547,25 +675,45 @@ define amdgpu_kernel void @fabs_fpext_f16_to_f32( ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; -; GFX11-LABEL: fabs_fpext_f16_to_f32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fabs_fpext_f16_to_f32: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v0, |v0.l| +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fabs_fpext_f16_to_f32: +; GFX11-FAKE16: ; %bb.0: ; %entry +; 
GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -631,25 +779,45 @@ define amdgpu_kernel void @fneg_fabs_fpext_f16_to_f32( ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; -; GFX11-LABEL: fneg_fabs_fpext_f16_to_f32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e64 v0, -|v0| -; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fneg_fabs_fpext_f16_to_f32: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v0, -|v0.l| +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fneg_fabs_fpext_f16_to_f32: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v0, -|v0| +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -730,29 +898,55 @@ define amdgpu_kernel void @fneg_multi_use_fpext_f16_to_f32( ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; -; GFX11-LABEL: fneg_multi_use_fpext_f16_to_f32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: 
s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e64 v1, -v0 -; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0 -; GFX11-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fneg_multi_use_fpext_f16_to_f32: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX11-TRUE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v0, -v0.l +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 dlc +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: buffer_store_b16 v1, off, s[4:7], 0 dlc +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fneg_multi_use_fpext_f16_to_f32: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; 
GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v1, -v0 +; GFX11-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX11-FAKE16-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -833,29 +1027,55 @@ define amdgpu_kernel void @fneg_multi_foldable_use_fpext_f16_to_f32( ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; -; GFX11-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e64 v1, -v0 -; GFX11-NEXT: v_mul_f16_e64 v0, -v0, v0 -; GFX11-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; 
GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mul_f16_e64 v0.h, -v0.l, v0.l +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v1, -v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v1, -v0 +; GFX11-FAKE16-NEXT: v_mul_f16_e64 v0, -v0, v0 +; GFX11-FAKE16-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -935,29 
+1155,55 @@ define amdgpu_kernel void @fabs_multi_use_fpext_f16_to_f32( ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; -; GFX11-LABEL: fabs_multi_use_fpext_f16_to_f32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e64 v1, |v0| -; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX11-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fabs_multi_use_fpext_f16_to_f32: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v0, |v0.l| +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 dlc +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: buffer_store_b16 v1, off, s[4:7], 0 
dlc +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fabs_multi_use_fpext_f16_to_f32: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v1, |v0| +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX11-FAKE16-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -1038,29 +1284,55 @@ define amdgpu_kernel void @fabs_multi_foldable_use_fpext_f16_to_f32( ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; -; GFX11-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e64 v1, |v0| -; 
GFX11-NEXT: v_mul_f16_e64 v0, |v0|, v0 -; GFX11-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mul_f16_e64 v0.h, |v0.l|, v0.l +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v1, |v0.l| +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; 
GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v1, |v0| +; GFX11-FAKE16-NEXT: v_mul_f16_e64 v0, |v0|, v0 +; GFX11-FAKE16-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -1140,29 +1412,55 @@ define amdgpu_kernel void @fabs_fneg_multi_use_fpext_f16_to_f32( ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; -; GFX11-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e64 v1, -|v0| -; GFX11-NEXT: v_or_b32_e32 v0, 0x8000, v0 -; GFX11-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; 
GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v0, -|v0.l| +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 dlc +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: buffer_store_b16 v1, off, s[4:7], 0 dlc +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v1, -|v0| +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, 0x8000, v0 +; GFX11-FAKE16-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -1244,29 +1542,55 @@ define amdgpu_kernel void 
@fabs_fneg_multi_foldable_use_fpext_f16_to_f32( ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; -; GFX11-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e64 v1, -|v0| -; GFX11-NEXT: v_mul_f16_e64 v0, -|v0|, v0 -; GFX11-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mul_f16_e64 v0.h, -|v0.l|, v0.l +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v1, -|v0.l| +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, 
s[4:7], 0 dlc +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v1, -|v0| +; GFX11-FAKE16-NEXT: v_mul_f16_e64 v0, -|v0|, v0 +; GFX11-FAKE16-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll index 0e12cca1900ce..327f2653c4746 100644 --- a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll @@ -1,7 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI %s -; RUN: llc 
-amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s + define amdgpu_kernel void @fptosi_f16_to_i16( ; SI-LABEL: fptosi_f16_to_i16: @@ -41,25 +43,45 @@ define amdgpu_kernel void @fptosi_f16_to_i16( ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fptosi_f16_to_i16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_i16_f16_e32 v0, v0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fptosi_f16_to_i16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: 
buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_i16_f16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fptosi_f16_to_i16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_i16_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -108,27 +130,49 @@ define amdgpu_kernel void @fptosi_f16_to_i32( ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fptosi_f16_to_i32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX11-NEXT: 
buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fptosi_f16_to_i32: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fptosi_f16_to_i32: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr 
addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -182,28 +226,51 @@ define amdgpu_kernel void @fptosi_f16_to_i64( ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fptosi_f16_to_i64: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fptosi_f16_to_i64: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX11-TRUE16-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; 
GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fptosi_f16_to_i64: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -259,31 +326,60 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i16( ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fptosi_v2f16_to_v2i16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: v_cvt_i16_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: 
v_cvt_i16_f16_e32 v1, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fptosi_v2f16_to_v2i16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-TRUE16-NEXT: v_cvt_i16_f16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_i16_f16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fptosi_v2f16_to_v2i16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; 
GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-FAKE16-NEXT: v_cvt_i16_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cvt_i16_f16_e32 v1, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -337,31 +433,57 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i32( ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fptosi_v2f16_to_v2i32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fptosi_v2f16_to_v2i32: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v1.l +; GFX11-TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fptosi_v2f16_to_v2i32: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX11-FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -422,34 +544,63 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i64( ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fptosi_v2f16_to_v2i64: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v1 -; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fptosi_v2f16_to_v2i64: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 
0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v1.l +; GFX11-TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_i32_f32_e32 v2, v1 +; GFX11-TRUE16-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX11-TRUE16-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fptosi_v2f16_to_v2i64: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, v1 
+; GFX11-FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cvt_i32_f32_e32 v2, v1 +; GFX11-FAKE16-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX11-FAKE16-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -485,21 +636,38 @@ define amdgpu_kernel void @fptosi_f16_to_i1(ptr addrspace(1) %out, half %in) { ; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fptosi_f16_to_i1: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_f16_e64 s2, -1.0, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 -; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fptosi_f16_to_i1: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_eq_f16_e32 vcc_lo, -1.0, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-TRUE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fptosi_f16_to_i1: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_eq_f16_e64 s2, -1.0, s4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 +; GFX11-FAKE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm entry: %conv = fptosi half %in to i1 store i1 %conv, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll index abc5c7af13b0c..ba540f4948b50 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll @@ -1,7 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 
-mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s + define amdgpu_kernel void @fptoui_f16_to_i16( ; SI-LABEL: fptoui_f16_to_i16: @@ -41,25 +43,45 @@ define amdgpu_kernel void @fptoui_f16_to_i16( ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fptoui_f16_to_i16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_u16_f16_e32 v0, v0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fptoui_f16_to_i16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_u16_f16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fptoui_f16_to_i16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 
0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_u16_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -108,27 +130,49 @@ define amdgpu_kernel void @fptoui_f16_to_i32( ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fptoui_f16_to_i32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fptoui_f16_to_i32: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) 
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fptoui_f16_to_i32: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -182,28 +226,51 @@ define amdgpu_kernel void @fptoui_f16_to_i64( ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fptoui_f16_to_i64: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; 
GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fptoui_f16_to_i64: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fptoui_f16_to_i64: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 
+; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -258,31 +325,60 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i16( ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fptoui_v2f16_to_v2i16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: v_cvt_u16_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cvt_u16_f16_e32 v1, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fptoui_v2f16_to_v2i16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-TRUE16-NEXT: v_cvt_u16_f16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_u16_f16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fptoui_v2f16_to_v2i16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-FAKE16-NEXT: v_cvt_u16_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cvt_u16_f16_e32 v1, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -336,31 +432,57 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i32( ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fptoui_v2f16_to_v2i32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fptoui_v2f16_to_v2i32: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: 
s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v1.l +; GFX11-TRUE16-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fptoui_v2f16_to_v2i32: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX11-FAKE16-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -421,33 +543,61 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i64( ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: 
fptoui_v2f16_to_v2i64: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX11-NEXT: v_mov_b32_e32 v3, v1 -; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fptoui_v2f16_to_v2i64: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v1.l +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: 
v_cvt_u32_f32_e32 v0, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-TRUE16-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fptoui_v2f16_to_v2i64: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-FAKE16-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-FAKE16-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -484,21 +634,38 @@ define amdgpu_kernel void @fptoui_f16_to_i1(ptr addrspace(1) %out, half %in) { ; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fptoui_f16_to_i1: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 
0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_f16_e64 s2, 1.0, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 -; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fptoui_f16_to_i1: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_eq_f16_e32 vcc_lo, 1.0, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-TRUE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fptoui_f16_to_i1: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_eq_f16_e64 s2, 1.0, s4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 +; GFX11-FAKE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm entry: %conv = fptoui half %in to i1 store i1 %conv, ptr addrspace(1) %out diff --git 
a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll index b08a35ab80732..9169433cdca56 100644 --- a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s define amdgpu_kernel void @sitofp_i16_to_f16( ; SI-LABEL: sitofp_i16_to_f16: @@ -41,25 +42,45 @@ define amdgpu_kernel void @sitofp_i16_to_f16( ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: sitofp_i16_to_f16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; 
GFX11-NEXT: v_cvt_f16_i16_e32 v0, v0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: sitofp_i16_to_f16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: sitofp_i16_to_f16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v0, v0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -108,27 +129,49 @@ define amdgpu_kernel void @sitofp_i32_to_f16( ; VI-NEXT: buffer_store_short v0, off, 
s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: sitofp_i32_to_f16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: sitofp_i32_to_f16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: sitofp_i32_to_f16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; 
GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -186,29 +229,56 @@ define amdgpu_kernel void @sitofp_v2i16_to_v2f16( ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: sitofp_v2i16_to_v2f16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: v_cvt_f16_i16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f16_i16_e32 v1, v1 -; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: sitofp_v2i16_to_v2f16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; 
GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: sitofp_v2i16_to_v2f16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v1, v1 +; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; 
GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -264,31 +334,60 @@ define amdgpu_kernel void @sitofp_v2i32_to_v2f16( ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: sitofp_v2i32_to_v2f16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: sitofp_v2i32_to_v2f16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX11-TRUE16-NEXT: v_cvt_f32_i32_e32 v1, v1 
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: sitofp_v2i32_to_v2f16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX11-FAKE16-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -353,37 +452,69 @@ define amdgpu_kernel void @s_sint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add ; VI-NEXT: 
buffer_store_short v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: s_sint_to_fp_i1_to_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: s_mov_b32 s10, -1 -; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, s10 -; GFX11-NEXT: s_mov_b32 s3, s11 -; GFX11-NEXT: s_mov_b32 s14, s10 -; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s6 -; GFX11-NEXT: s_mov_b32 s13, s7 -; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 -; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 -; GFX11-NEXT: s_mov_b32 s8, s4 -; GFX11-NEXT: s_mov_b32 s9, s5 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_le_f32_e64 s0, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, s0, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: s_sint_to_fp_i1_to_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s7 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s4 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s5 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_le_f32_e64 s0, 0, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_xor_b32 s0, s0, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: s_sint_to_fp_i1_to_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s3, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s7 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s4 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s5 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_le_f32_e64 s0, 0, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, s0, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: 
v_cvt_f16_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm %a = load float, ptr addrspace(1) %in0 %b = load float, ptr addrspace(1) %in1 %acmp = fcmp oge float %a, 0.000000e+00 diff --git a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll index c21ae434f4470..c4268c15d9db6 100644 --- a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s define amdgpu_kernel void @uitofp_i16_to_f16( ; SI-LABEL: uitofp_i16_to_f16: @@ -41,25 +42,45 @@ define amdgpu_kernel void @uitofp_i16_to_f16( ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: uitofp_i16_to_f16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 
s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f16_u16_e32 v0, v0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: uitofp_i16_to_f16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f16_u16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: uitofp_i16_to_f16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f16_u16_e32 v0, v0 +; 
GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -108,27 +129,49 @@ define amdgpu_kernel void @uitofp_i32_to_f16( ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: uitofp_i32_to_f16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: uitofp_i32_to_f16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: uitofp_i32_to_f16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -186,29 +229,56 @@ define amdgpu_kernel void @uitofp_v2i16_to_v2f16( ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: uitofp_v2i16_to_v2f16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: v_cvt_f16_u16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f16_u16_e32 v1, v1 -; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; 
GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: uitofp_v2i16_to_v2f16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-TRUE16-NEXT: v_cvt_f16_u16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_f16_u16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: uitofp_v2i16_to_v2f16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 
v1, 16, v0 +; GFX11-FAKE16-NEXT: v_cvt_f16_u16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cvt_f16_u16_e32 v1, v1 +; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -264,31 +334,60 @@ define amdgpu_kernel void @uitofp_v2i32_to_v2f16( ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: uitofp_v2i32_to_v2f16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: uitofp_v2i32_to_v2f16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; 
GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX11-TRUE16-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: uitofp_v2i32_to_v2f16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX11-FAKE16-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-FAKE16-NEXT: buffer_store_b32 
v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -353,37 +452,69 @@ define amdgpu_kernel void @s_uint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add ; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: s_uint_to_fp_i1_to_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: s_mov_b32 s10, -1 -; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, s10 -; GFX11-NEXT: s_mov_b32 s3, s11 -; GFX11-NEXT: s_mov_b32 s14, s10 -; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s6 -; GFX11-NEXT: s_mov_b32 s13, s7 -; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 -; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 -; GFX11-NEXT: s_mov_b32 s8, s4 -; GFX11-NEXT: s_mov_b32 s9, s5 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_le_f32_e64 s0, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, s0, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: s_uint_to_fp_i1_to_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s11 +; 
GFX11-TRUE16-NEXT: s_mov_b32 s14, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s7 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s4 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s5 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_le_f32_e64 s0, 0, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_xor_b32 s0, s0, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: s_uint_to_fp_i1_to_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s3, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s7 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s4 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s5 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; 
GFX11-FAKE16-NEXT: v_cmp_le_f32_e64 s0, 0, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, s0, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm %a = load float, ptr addrspace(1) %in0 %b = load float, ptr addrspace(1) %in1 %acmp = fcmp oge float %a, 0.000000e+00