diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 2d46cf3b70a34..e6fa79a70c0bc 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -5518,6 +5518,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(UMIN3) NODE_NAME_CASE(FMED3) NODE_NAME_CASE(SMED3) + NODE_NAME_CASE(SAT_PK_CAST) NODE_NAME_CASE(UMED3) NODE_NAME_CASE(FMAXIMUM3) NODE_NAME_CASE(FMINIMUM3) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index c74dc7942f52c..6df4066c0fe6b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -461,6 +461,7 @@ enum NodeType : unsigned { FMED3, SMED3, UMED3, + SAT_PK_CAST, FMAXIMUM3, FMINIMUM3, FDOT2, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td index bec294a945d2f..8babc86effbba 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -332,6 +332,9 @@ def AMDGPUumed3 : SDNode<"AMDGPUISD::UMED3", AMDGPUDTIntTernaryOp, [] >; +// Saturate v2i16 lanes to u8 and pack them into an i16 (v_sat_pk_u8_i16). 
+def AMDGPUsat_pk_cast : SDNode<"AMDGPUISD::SAT_PK_CAST", SDTUnaryOp, []>; + def AMDGPUfmed3_impl : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>; def AMDGPUfdot2_impl : SDNode<"AMDGPUISD::FDOT2", diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 909ad07782fc6..7d310d27e654d 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -824,6 +824,25 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32}, Custom); } + + // Real True16 instructions are not supported by this lowering yet. + // FIXME: Add support for real True16 instructions. + if (!Subtarget->hasTrue16BitInsts() || !Subtarget->useRealTrue16Insts()) { + // MVT::vNi16 satisfies the source type check in foldToSaturated. + // MVT::vNi8 satisfies the result type check in CustomLowerNode. + // FIXME: Only N = 2, 4, 8 is handled so far; the target-independent + // checks should change, e.g. split wider vectors into legal small ones. + setOperationAction(ISD::TRUNCATE_SSAT_U, + { + MVT::v2i16, + MVT::v4i16, + MVT::v8i16, + MVT::v2i8, + MVT::v4i8, + MVT::v8i8, + }, + Custom); + } } setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v4f16, Custom); @@ -1983,6 +2002,12 @@ bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const { if (VT == MVT::i1 && Op == ISD::SETCC) return false; + // TRUNCATE_SSAT_U is desirable on vNi8 whenever N is even. + if (Op == ISD::TRUNCATE_SSAT_U && VT.isVector() && + VT.getVectorElementType() == MVT::i8 && + ((VT.getVectorNumElements() & 1) == 0)) + return true; + return TargetLowering::isTypeDesirableForOp(Op, VT); } @@ -6615,6 +6640,45 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG)); break; } + case ISD::TRUNCATE_SSAT_U: { + SDLoc SL(N); + SDValue Src = N->getOperand(0); + EVT SrcVT = Src.getValueType(); + EVT DstVT = N->getValueType(0); + + assert(SrcVT.isVector() && DstVT.isVector()); + 
assert(DstVT.getVectorElementType() == MVT::i8); + assert(SrcVT.getVectorElementType() == MVT::i16); + + unsigned EleNo = SrcVT.getVectorNumElements(); + assert(EleNo == DstVT.getVectorNumElements()); + + if (EleNo == 2) { + SDValue Op = DAG.getNode(AMDGPUISD::SAT_PK_CAST, SL, MVT::i16, Src); + Op = DAG.getNode(ISD::BITCAST, SL, N->getValueType(0), Op); + Results.push_back(Op); + break; + } + + // Wider vectors are lowered pair by pair, so the count must be even. + assert((EleNo & 1) == 0); + SmallVector DstPairs; + EVT SrcEleVT = SrcVT.getVectorElementType(); + EVT DstEleVT = DstVT.getVectorElementType(); + EVT SrcPairVT = EVT::getVectorVT(*DAG.getContext(), SrcEleVT, 2); + EVT DstPairVT = EVT::getVectorVT(*DAG.getContext(), DstEleVT, 2); + for (unsigned i = 0; i != EleNo; i += 2) { + SDValue SrcPair = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SrcPairVT, Src, + DAG.getConstant(i, SL, MVT::i32)); + SDValue SatPk = + DAG.getNode(AMDGPUISD::SAT_PK_CAST, SL, MVT::i16, SrcPair); + SDValue DstPair = DAG.getNode(ISD::BITCAST, SL, DstPairVT, SatPk); + DstPairs.push_back(DstPair); + } + SDValue Op = DAG.getNode(ISD::CONCAT_VECTORS, SL, DstVT, DstPairs); + Results.push_back(Op); + break; + } default: AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG); break; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 598475763d02d..2421ae1aa2f04 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -3382,6 +3382,21 @@ def : GCNPat < (v2i16 (V_LSHL_OR_B32_e64 $src1, (i32 16), (i32 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), $src0)))) >; +multiclass V_SAT_PK_Pat { + def : GCNPat< + (i16 (AMDGPUsat_pk_cast v2i16:$src)), + (inst VRegSrc_32:$src) + >; +} + +let OtherPredicates = [NotHasTrue16BitInsts] in { + defm : V_SAT_PK_Pat; +} // End OtherPredicates = [NotHasTrue16BitInsts] + +let True16Predicate = UseFakeTrue16Insts in { + defm : V_SAT_PK_Pat; +} // End True16Predicate = UseFakeTrue16Insts + 
// With multiple uses of the shift, this will duplicate the shift and // increase register pressure. def : GCNPat < diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll index 934d9efba4656..f37cdff6a6c06 100644 --- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-VI %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,SDAG-GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-GFX11 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-GFX12 %s ; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GISEL-VI %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GISEL-GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GFX11,GISEL-GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GISEL-GFX11 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GISEL-GFX12 %s ; @llvm.smin.v2i16(<2 x i16>, <2 x i16>) declare <2 x i16> @llvm.smax.v2i16(<2 x i16>, <2 x i16>) +declare <4 x i16> @llvm.smin.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.smax.v4i16(<4 x i16>, <4 x i16>) +declare <3 x i16> @llvm.smin.v3i16(<3 x i16>, <3 x i16>) +declare <3 x i16> @llvm.smax.v3i16(<3 x i16>, <3 x i16>) +declare <8 x i16> @llvm.smin.v8i16(<8 x 
i16>, <8 x i16>) +declare <8 x i16> @llvm.smax.v8i16(<8 x i16>, <8 x i16>) define <2 x i16> @basic_smax_smin(i16 %src0, i16 %src1) { ; SDAG-VI-LABEL: basic_smax_smin: @@ -815,15 +821,15 @@ define i16 @basic_smax_smin_bit_or(i16 %src0, i16 %src1) { ; SDAG-GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: basic_smax_smin_bit_or: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff -; GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 -; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX11-LABEL: basic_smax_smin_bit_or: +; SDAG-GFX11: ; %bb.0: +; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff +; SDAG-GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff +; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 +; SDAG-GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX12-LABEL: basic_smax_smin_bit_or: ; SDAG-GFX12: ; %bb.0: @@ -860,6 +866,16 @@ define i16 @basic_smax_smin_bit_or(i16 %src0, i16 %src1) { ; GISEL-GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-GFX11-LABEL: basic_smax_smin_bit_or: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff +; GISEL-GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 +; GISEL-GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-GFX12-LABEL: basic_smax_smin_bit_or: ; GISEL-GFX12: ; %bb.0: ; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -873,6 +889,15 @@ 
define i16 @basic_smax_smin_bit_or(i16 %src0, i16 %src1) { ; GISEL-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1 ; GISEL-GFX12-NEXT: v_or_b32_e32 v0, v0, v1 ; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: basic_smax_smin_bit_or: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff +; GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %src0.max = call i16 @llvm.smax.i16(i16 %src0, i16 0) %src0.clamp = call i16 @llvm.smin.i16(i16 %src0.max, i16 255) @@ -902,15 +927,15 @@ define i16 @basic_umax_umin_bit_or(i16 %src0, i16 %src1) { ; SDAG-GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: basic_umax_umin_bit_or: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_min_u16 v1, 0xff, v1 -; GFX11-NEXT: v_min_u16 v0, 0xff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 -; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX11-LABEL: basic_umax_umin_bit_or: +; SDAG-GFX11: ; %bb.0: +; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-NEXT: v_min_u16 v1, 0xff, v1 +; SDAG-GFX11-NEXT: v_min_u16 v0, 0xff, v0 +; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 +; SDAG-GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX12-LABEL: basic_umax_umin_bit_or: ; SDAG-GFX12: ; %bb.0: @@ -944,6 +969,16 @@ define i16 @basic_umax_umin_bit_or(i16 %src0, i16 %src1) { ; GISEL-GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-GFX11-LABEL: basic_umax_umin_bit_or: +; GISEL-GFX11: ; %bb.0: +; 
GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: v_min_u16 v1, 0xff, v1 +; GISEL-GFX11-NEXT: v_min_u16 v0, 0xff, v0 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 +; GISEL-GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-GFX12-LABEL: basic_umax_umin_bit_or: ; GISEL-GFX12: ; %bb.0: ; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -957,6 +992,15 @@ define i16 @basic_umax_umin_bit_or(i16 %src0, i16 %src1) { ; GISEL-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1 ; GISEL-GFX12-NEXT: v_or_b32_e32 v0, v0, v1 ; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: basic_umax_umin_bit_or: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_min_u16 v1, 0xff, v1 +; GFX11-NEXT: v_min_u16 v0, 0xff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %src0.max = call i16 @llvm.umax.i16(i16 %src0, i16 0) %src0.clamp = call i16 @llvm.umin.i16(i16 %src0.max, i16 255) @@ -1093,15 +1137,15 @@ define i16 @basic_smax_smin_bit_shl(i16 %src0, i16 %src1) { ; SDAG-GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: basic_smax_smin_bit_shl: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_i16 v1, v1, 0 -; GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 -; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX11-LABEL: basic_smax_smin_bit_shl: +; SDAG-GFX11: ; %bb.0: +; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-NEXT: v_max_i16 v1, v1, 0 +; SDAG-GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff +; SDAG-GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 +; SDAG-GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX12-LABEL: basic_smax_smin_bit_shl: ; SDAG-GFX12: ; %bb.0: @@ -1137,6 +1181,16 @@ define i16 @basic_smax_smin_bit_shl(i16 %src0, i16 %src1) { ; GISEL-GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-GFX11-LABEL: basic_smax_smin_bit_shl: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: v_max_i16 v1, v1, 0 +; GISEL-GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 +; GISEL-GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-GFX12-LABEL: basic_smax_smin_bit_shl: ; GISEL-GFX12: ; %bb.0: ; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -1150,6 +1204,15 @@ define i16 @basic_smax_smin_bit_shl(i16 %src0, i16 %src1) { ; GISEL-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1 ; GISEL-GFX12-NEXT: v_or_b32_e32 v0, v0, v1 ; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: basic_smax_smin_bit_shl: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_i16 v1, v1, 0 +; GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %src0.max = call i16 @llvm.smax.i16(i16 %src0, i16 0) %src0.clamp = call i16 @llvm.smin.i16(i16 %src0.max, i16 255) @@ -1174,24 +1237,13 @@ define i16 @basic_smax_smin_vec_input(<2 x i16> %src) { ; SDAG-GFX9-LABEL: basic_smax_smin_vec_input: ; SDAG-GFX9: ; %bb.0: ; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX9-NEXT: s_movk_i32 s4, 0xff -; SDAG-GFX9-NEXT: v_pk_min_i16 
v0, v0, s4 op_sel_hi:[1,0] -; SDAG-GFX9-NEXT: v_pk_max_i16 v0, v0, 0 -; SDAG-GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SDAG-GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; SDAG-GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX9-NEXT: v_sat_pk_u8_i16_e32 v0, v0 ; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX11-LABEL: basic_smax_smin_vec_input: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX11-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1] -; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX11-NEXT: v_pk_max_i16 v0, v0, 0 -; SDAG-GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 -; SDAG-GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX11-NEXT: v_sat_pk_u8_i16_e32 v0, v0 ; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX12-LABEL: basic_smax_smin_vec_input: @@ -1201,13 +1253,7 @@ define i16 @basic_smax_smin_vec_input(<2 x i16> %src) { ; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 ; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 ; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 -; SDAG-GFX12-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1] -; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX12-NEXT: v_pk_max_i16 v0, v0, 0 -; SDAG-GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1 -; SDAG-GFX12-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX12-NEXT: v_sat_pk_u8_i16_e32 v0, v0 ; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-VI-LABEL: basic_smax_smin_vec_input: @@ -1291,24 +1337,13 @@ define i16 @basic_smax_smin_vec_input_rev(<2 x i16> %src) { ; SDAG-GFX9-LABEL: basic_smax_smin_vec_input_rev: ; SDAG-GFX9: ; %bb.0: ; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX9-NEXT: v_pk_max_i16 v0, v0, 0 -; 
SDAG-GFX9-NEXT: s_movk_i32 s4, 0xff -; SDAG-GFX9-NEXT: v_pk_min_i16 v0, v0, s4 op_sel_hi:[1,0] -; SDAG-GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SDAG-GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; SDAG-GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX9-NEXT: v_sat_pk_u8_i16_e32 v0, v0 ; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX11-LABEL: basic_smax_smin_vec_input_rev: ; SDAG-GFX11: ; %bb.0: ; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX11-NEXT: v_pk_max_i16 v0, v0, 0 -; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX11-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1] -; SDAG-GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 -; SDAG-GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX11-NEXT: v_sat_pk_u8_i16_e32 v0, v0 ; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX12-LABEL: basic_smax_smin_vec_input_rev: @@ -1318,13 +1353,7 @@ define i16 @basic_smax_smin_vec_input_rev(<2 x i16> %src) { ; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 ; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 ; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 -; SDAG-GFX12-NEXT: v_pk_max_i16 v0, v0, 0 -; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX12-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1] -; SDAG-GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1 -; SDAG-GFX12-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX12-NEXT: v_sat_pk_u8_i16_e32 v0, v0 ; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-VI-LABEL: basic_smax_smin_vec_input_rev: @@ -1392,3 +1421,1198 @@ define i16 @basic_smax_smin_vec_input_rev(<2 x i16> %src) { ret i16 %cast } +define <4 x i16> @basic_smax_smin_v8i16_input_1(<8 x i16> %src) { +; SDAG-VI-LABEL: basic_smax_smin_v8i16_input_1: +; 
SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_mov_b32_e32 v4, 0xff +; SDAG-VI-NEXT: v_min_i16_sdwa v5, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; SDAG-VI-NEXT: v_min_i16_e32 v0, 0xff, v0 +; SDAG-VI-NEXT: v_min_i16_sdwa v6, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; SDAG-VI-NEXT: v_min_i16_e32 v1, 0xff, v1 +; SDAG-VI-NEXT: v_mov_b32_e32 v8, 0 +; SDAG-VI-NEXT: v_min_i16_sdwa v7, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; SDAG-VI-NEXT: v_min_i16_e32 v2, 0xff, v2 +; SDAG-VI-NEXT: v_min_i16_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; SDAG-VI-NEXT: v_min_i16_e32 v3, 0xff, v3 +; SDAG-VI-NEXT: v_max_i16_e32 v1, 0, v1 +; SDAG-VI-NEXT: v_max_i16_sdwa v6, v6, v8 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-VI-NEXT: v_max_i16_e32 v0, 0, v0 +; SDAG-VI-NEXT: v_max_i16_sdwa v5, v5, v8 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-VI-NEXT: v_max_i16_e32 v3, 0, v3 +; SDAG-VI-NEXT: v_max_i16_sdwa v4, v4, v8 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-VI-NEXT: v_max_i16_e32 v2, 0, v2 +; SDAG-VI-NEXT: v_max_i16_sdwa v7, v7, v8 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-VI-NEXT: v_or_b32_e32 v0, v0, v5 +; SDAG-VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; SDAG-VI-NEXT: v_or_b32_e32 v1, v2, v7 +; SDAG-VI-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX9-LABEL: basic_smax_smin_v8i16_input_1: +; SDAG-GFX9: ; 
%bb.0: +; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX9-NEXT: v_sat_pk_u8_i16_e32 v0, v0 +; SDAG-GFX9-NEXT: v_sat_pk_u8_i16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; SDAG-GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; SDAG-GFX9-NEXT: v_sat_pk_u8_i16_e32 v1, v2 +; SDAG-GFX9-NEXT: v_sat_pk_u8_i16_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; SDAG-GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX11-LABEL: basic_smax_smin_v8i16_input_1: +; SDAG-GFX11: ; %bb.0: +; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-NEXT: v_sat_pk_u8_i16_e32 v0, v0 +; SDAG-GFX11-NEXT: v_sat_pk_u8_i16_e32 v1, v1 +; SDAG-GFX11-NEXT: v_sat_pk_u8_i16_e32 v2, v2 +; SDAG-GFX11-NEXT: v_sat_pk_u8_i16_e32 v3, v3 +; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; SDAG-GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SDAG-GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; SDAG-GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SDAG-GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX12-LABEL: basic_smax_smin_v8i16_input_1: +; SDAG-GFX12: ; %bb.0: +; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-NEXT: v_sat_pk_u8_i16_e32 v0, v0 +; SDAG-GFX12-NEXT: v_sat_pk_u8_i16_e32 v1, v1 +; SDAG-GFX12-NEXT: v_sat_pk_u8_i16_e32 v2, v2 +; SDAG-GFX12-NEXT: 
v_sat_pk_u8_i16_e32 v3, v3 +; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; SDAG-GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SDAG-GFX12-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; SDAG-GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SDAG-GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX12-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX12-NEXT: v_or_b32_e32 v1, v2, v3 +; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: basic_smax_smin_v8i16_input_1: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_mov_b32_e32 v4, 0xff +; GISEL-VI-NEXT: v_min_i16_e32 v6, 0xff, v1 +; GISEL-VI-NEXT: v_min_i16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GISEL-VI-NEXT: v_max_i16_e32 v1, 0, v1 +; GISEL-VI-NEXT: v_min_i16_e32 v5, 0xff, v0 +; GISEL-VI-NEXT: v_min_i16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GISEL-VI-NEXT: v_min_i16_e32 v8, 0xff, v3 +; GISEL-VI-NEXT: v_min_i16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GISEL-VI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GISEL-VI-NEXT: v_min_i16_e32 v7, 0xff, v2 +; GISEL-VI-NEXT: v_min_i16_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GISEL-VI-NEXT: v_max_i16_e32 v4, 0, v5 +; GISEL-VI-NEXT: v_max_i16_e32 v0, 0, v0 +; GISEL-VI-NEXT: v_max_i16_e32 v5, 0, v6 +; GISEL-VI-NEXT: v_max_i16_e32 v3, 0, v3 +; GISEL-VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GISEL-VI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GISEL-VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GISEL-VI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GISEL-VI-NEXT: v_max_i16_e32 v6, 0, v7 +; GISEL-VI-NEXT: v_max_i16_e32 
v2, 0, v2 +; GISEL-VI-NEXT: v_max_i16_e32 v7, 0, v8 +; GISEL-VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GISEL-VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GISEL-VI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GISEL-VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GISEL-VI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GISEL-VI-NEXT: v_or_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GISEL-VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GISEL-VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GISEL-VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GISEL-VI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GISEL-VI-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GISEL-VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GISEL-VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX9-LABEL: basic_smax_smin_v8i16_input_1: +; GISEL-GFX9: ; %bb.0: +; GISEL-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX9-NEXT: v_mov_b32_e32 v4, 0xff00ff +; GISEL-GFX9-NEXT: v_pk_min_i16 v0, v4, v0 +; GISEL-GFX9-NEXT: v_pk_min_i16 v1, v4, v1 +; GISEL-GFX9-NEXT: v_pk_min_i16 v2, v4, v2 +; GISEL-GFX9-NEXT: v_pk_min_i16 v3, v4, v3 +; GISEL-GFX9-NEXT: v_pk_max_i16 v0, 0, v0 +; GISEL-GFX9-NEXT: v_mov_b32_e32 v4, 0xff +; GISEL-GFX9-NEXT: v_and_b32_sdwa v5, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GISEL-GFX9-NEXT: v_pk_max_i16 v1, 0, v1 +; GISEL-GFX9-NEXT: v_lshlrev_b16_e32 v5, 8, v5 +; GISEL-GFX9-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GISEL-GFX9-NEXT: v_and_b32_sdwa v5, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GISEL-GFX9-NEXT: v_pk_max_i16 v2, 0, v2 +; GISEL-GFX9-NEXT: v_lshlrev_b16_e32 v5, 8, v5 +; GISEL-GFX9-NEXT: 
v_pk_max_i16 v3, 0, v3 +; GISEL-GFX9-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GISEL-GFX9-NEXT: v_and_b32_sdwa v5, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GISEL-GFX9-NEXT: v_lshlrev_b16_e32 v5, 8, v5 +; GISEL-GFX9-NEXT: v_and_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GISEL-GFX9-NEXT: v_or_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GISEL-GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v4 +; GISEL-GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GISEL-GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GISEL-GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GISEL-GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GISEL-GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX11-LABEL: basic_smax_smin_v8i16_input_1: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: v_pk_min_i16 v0, 0xff00ff, v0 +; GISEL-GFX11-NEXT: v_pk_min_i16 v2, 0xff00ff, v2 +; GISEL-GFX11-NEXT: v_pk_min_i16 v1, 0xff00ff, v1 +; GISEL-GFX11-NEXT: v_pk_min_i16 v3, 0xff00ff, v3 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GISEL-GFX11-NEXT: v_pk_max_i16 v0, 0, v0 +; GISEL-GFX11-NEXT: v_pk_max_i16 v2, 0, v2 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GISEL-GFX11-NEXT: v_pk_max_i16 v1, 0, v1 +; GISEL-GFX11-NEXT: v_pk_max_i16 v3, 0, v3 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GISEL-GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GISEL-GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GISEL-GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GISEL-GFX11-NEXT: v_lshrrev_b32_e32 
v7, 16, v3 +; GISEL-GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GISEL-GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GISEL-GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GISEL-GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GISEL-GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GISEL-GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GISEL-GFX11-NEXT: v_lshlrev_b16 v4, 8, v4 +; GISEL-GFX11-NEXT: v_lshlrev_b16 v5, 8, v5 +; GISEL-GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GISEL-GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GISEL-GFX11-NEXT: v_lshlrev_b16 v6, 8, v6 +; GISEL-GFX11-NEXT: v_lshlrev_b16 v7, 8, v7 +; GISEL-GFX11-NEXT: v_or_b32_e32 v0, v0, v4 +; GISEL-GFX11-NEXT: v_or_b32_e32 v2, v2, v5 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GISEL-GFX11-NEXT: v_or_b32_e32 v1, v1, v6 +; GISEL-GFX11-NEXT: v_or_b32_e32 v3, v3, v7 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GISEL-GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GISEL-GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GISEL-GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GISEL-GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v2 +; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX12-LABEL: basic_smax_smin_v8i16_input_1: +; GISEL-GFX12: ; %bb.0: +; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-NEXT: v_pk_min_i16 v0, 0xff00ff, v0 +; GISEL-GFX12-NEXT: v_pk_min_i16 v2, 0xff00ff, v2 +; GISEL-GFX12-NEXT: v_pk_min_i16 v1, 0xff00ff, v1 +; GISEL-GFX12-NEXT: v_pk_min_i16 v3, 0xff00ff, v3 +; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GISEL-GFX12-NEXT: v_pk_max_i16 v0, 0, v0 +; GISEL-GFX12-NEXT: v_pk_max_i16 v2, 0, v2 +; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | 
instskip(NEXT) | instid1(VALU_DEP_4) +; GISEL-GFX12-NEXT: v_pk_max_i16 v1, 0, v1 +; GISEL-GFX12-NEXT: v_pk_max_i16 v3, 0, v3 +; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GISEL-GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GISEL-GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GISEL-GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GISEL-GFX12-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GISEL-GFX12-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GISEL-GFX12-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GISEL-GFX12-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GISEL-GFX12-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GISEL-GFX12-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GISEL-GFX12-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GISEL-GFX12-NEXT: v_lshlrev_b16 v4, 8, v4 +; GISEL-GFX12-NEXT: v_lshlrev_b16 v5, 8, v5 +; GISEL-GFX12-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GISEL-GFX12-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GISEL-GFX12-NEXT: v_lshlrev_b16 v6, 8, v6 +; GISEL-GFX12-NEXT: v_lshlrev_b16 v7, 8, v7 +; GISEL-GFX12-NEXT: v_or_b32_e32 v0, v0, v4 +; GISEL-GFX12-NEXT: v_or_b32_e32 v2, v2, v5 +; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GISEL-GFX12-NEXT: v_or_b32_e32 v1, v1, v6 +; GISEL-GFX12-NEXT: v_or_b32_e32 v3, v3, v7 +; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GISEL-GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GISEL-GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GISEL-GFX12-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GISEL-GFX12-NEXT: v_lshl_or_b32 v1, v3, 16, v2 +; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] + %smin = call <8 x i16> @llvm.smin.v8i16(<8 x i16> , <8 x i16> %src) + %smed = call <8 x i16> @llvm.smax.v8i16(<8 x i16> , <8 x i16> %smin) + %vec.trunc = trunc <8 x i16> %smed to <8 x i8> + %cast = bitcast <8 x i8> %vec.trunc 
to <4 x i16> + ret <4 x i16> %cast +} + +define <2 x i32> @basic_smax_smin_v8i16_input_2(<8 x i16> %src) { +; SDAG-VI-LABEL: basic_smax_smin_v8i16_input_2: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_mov_b32_e32 v4, 0xff +; SDAG-VI-NEXT: v_min_i16_sdwa v5, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; SDAG-VI-NEXT: v_min_i16_e32 v0, 0xff, v0 +; SDAG-VI-NEXT: v_min_i16_sdwa v6, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; SDAG-VI-NEXT: v_min_i16_e32 v1, 0xff, v1 +; SDAG-VI-NEXT: v_mov_b32_e32 v8, 0 +; SDAG-VI-NEXT: v_min_i16_sdwa v7, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; SDAG-VI-NEXT: v_min_i16_e32 v2, 0xff, v2 +; SDAG-VI-NEXT: v_min_i16_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; SDAG-VI-NEXT: v_min_i16_e32 v3, 0xff, v3 +; SDAG-VI-NEXT: v_max_i16_e32 v1, 0, v1 +; SDAG-VI-NEXT: v_max_i16_sdwa v6, v6, v8 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-VI-NEXT: v_max_i16_e32 v0, 0, v0 +; SDAG-VI-NEXT: v_max_i16_sdwa v5, v5, v8 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-VI-NEXT: v_max_i16_e32 v3, 0, v3 +; SDAG-VI-NEXT: v_max_i16_sdwa v4, v4, v8 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-VI-NEXT: v_max_i16_e32 v2, 0, v2 +; SDAG-VI-NEXT: v_max_i16_sdwa v7, v7, v8 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-VI-NEXT: v_or_b32_e32 v0, v0, v5 +; SDAG-VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; SDAG-VI-NEXT: v_or_b32_e32 v1, v2, v7 +; SDAG-VI-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX9-LABEL: basic_smax_smin_v8i16_input_2: +; SDAG-GFX9: ; %bb.0: +; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX9-NEXT: v_sat_pk_u8_i16_e32 v0, v0 +; SDAG-GFX9-NEXT: v_sat_pk_u8_i16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; SDAG-GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; SDAG-GFX9-NEXT: v_sat_pk_u8_i16_e32 v1, v2 +; SDAG-GFX9-NEXT: v_sat_pk_u8_i16_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; SDAG-GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX11-LABEL: basic_smax_smin_v8i16_input_2: +; SDAG-GFX11: ; %bb.0: +; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-NEXT: v_sat_pk_u8_i16_e32 v0, v0 +; SDAG-GFX11-NEXT: v_sat_pk_u8_i16_e32 v1, v1 +; SDAG-GFX11-NEXT: v_sat_pk_u8_i16_e32 v2, v2 +; SDAG-GFX11-NEXT: v_sat_pk_u8_i16_e32 v3, v3 +; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; SDAG-GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SDAG-GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; SDAG-GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SDAG-GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX12-LABEL: basic_smax_smin_v8i16_input_2: +; SDAG-GFX12: ; %bb.0: +; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 +; 
SDAG-GFX12-NEXT: v_sat_pk_u8_i16_e32 v0, v0 +; SDAG-GFX12-NEXT: v_sat_pk_u8_i16_e32 v1, v1 +; SDAG-GFX12-NEXT: v_sat_pk_u8_i16_e32 v2, v2 +; SDAG-GFX12-NEXT: v_sat_pk_u8_i16_e32 v3, v3 +; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; SDAG-GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SDAG-GFX12-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; SDAG-GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SDAG-GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX12-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX12-NEXT: v_or_b32_e32 v1, v2, v3 +; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: basic_smax_smin_v8i16_input_2: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_mov_b32_e32 v4, 0xff +; GISEL-VI-NEXT: v_min_i16_e32 v5, 0xff, v0 +; GISEL-VI-NEXT: v_min_i16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GISEL-VI-NEXT: v_min_i16_e32 v6, 0xff, v1 +; GISEL-VI-NEXT: v_min_i16_e32 v7, 0xff, v2 +; GISEL-VI-NEXT: v_min_i16_e32 v8, 0xff, v3 +; GISEL-VI-NEXT: v_min_i16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GISEL-VI-NEXT: v_min_i16_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GISEL-VI-NEXT: v_min_i16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GISEL-VI-NEXT: v_max_i16_e32 v4, 0, v5 +; GISEL-VI-NEXT: v_max_i16_e32 v0, 0, v0 +; GISEL-VI-NEXT: v_max_i16_e32 v5, 0, v6 +; GISEL-VI-NEXT: v_max_i16_e32 v6, 0, v7 +; GISEL-VI-NEXT: v_max_i16_e32 v7, 0, v8 +; GISEL-VI-NEXT: v_mov_b32_e32 v8, 8 +; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GISEL-VI-NEXT: v_max_i16_e32 v1, 0, v1 +; 
GISEL-VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GISEL-VI-NEXT: v_and_b32_e32 v4, 0xff, v5 +; GISEL-VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GISEL-VI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GISEL-VI-NEXT: v_max_i16_e32 v2, 0, v2 +; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v4 +; GISEL-VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v1, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GISEL-VI-NEXT: v_and_b32_e32 v2, 0xff, v7 +; GISEL-VI-NEXT: v_max_i16_e32 v3, 0, v3 +; GISEL-VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GISEL-VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GISEL-VI-NEXT: v_or_b32_e32 v1, v1, v2 +; GISEL-VI-NEXT: v_and_b32_e32 v2, 0xff, v3 +; GISEL-VI-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GISEL-VI-NEXT: v_or_b32_e32 v1, v1, v2 +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX9-LABEL: basic_smax_smin_v8i16_input_2: +; GISEL-GFX9: ; %bb.0: +; GISEL-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX9-NEXT: v_mov_b32_e32 v4, 0xff00ff +; GISEL-GFX9-NEXT: v_pk_min_i16 v0, v4, v0 +; GISEL-GFX9-NEXT: v_pk_min_i16 v1, v4, v1 +; GISEL-GFX9-NEXT: v_pk_max_i16 v0, 0, v0 +; GISEL-GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GISEL-GFX9-NEXT: v_pk_min_i16 v2, v4, v2 +; GISEL-GFX9-NEXT: v_pk_min_i16 v3, v4, v3 +; GISEL-GFX9-NEXT: v_pk_max_i16 v1, 0, v1 +; GISEL-GFX9-NEXT: v_mov_b32_e32 v4, 0xff +; GISEL-GFX9-NEXT: v_lshlrev_b32_sdwa v6, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GISEL-GFX9-NEXT: v_and_or_b32 v0, v0, v4, v6 +; GISEL-GFX9-NEXT: v_and_b32_e32 v6, 0xff, v1 +; GISEL-GFX9-NEXT: v_mov_b32_e32 v7, 24 +; GISEL-GFX9-NEXT: v_pk_max_i16 v2, 0, v2 +; GISEL-GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GISEL-GFX9-NEXT: v_lshlrev_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; 
GISEL-GFX9-NEXT: v_pk_max_i16 v3, 0, v3 +; GISEL-GFX9-NEXT: v_or3_b32 v0, v0, v6, v1 +; GISEL-GFX9-NEXT: v_lshlrev_b32_sdwa v1, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GISEL-GFX9-NEXT: v_and_or_b32 v1, v2, v4, v1 +; GISEL-GFX9-NEXT: v_and_b32_e32 v2, 0xff, v3 +; GISEL-GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GISEL-GFX9-NEXT: v_lshlrev_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GISEL-GFX9-NEXT: v_or3_b32 v1, v1, v2, v3 +; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX11-LABEL: basic_smax_smin_v8i16_input_2: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: v_pk_min_i16 v0, 0xff00ff, v0 +; GISEL-GFX11-NEXT: v_pk_min_i16 v2, 0xff00ff, v2 +; GISEL-GFX11-NEXT: v_pk_min_i16 v1, 0xff00ff, v1 +; GISEL-GFX11-NEXT: v_pk_min_i16 v3, 0xff00ff, v3 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GISEL-GFX11-NEXT: v_pk_max_i16 v0, 0, v0 +; GISEL-GFX11-NEXT: v_pk_max_i16 v2, 0, v2 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GISEL-GFX11-NEXT: v_pk_max_i16 v1, 0, v1 +; GISEL-GFX11-NEXT: v_pk_max_i16 v3, 0, v3 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GISEL-GFX11-NEXT: v_bfe_u32 v4, v0, 16, 8 +; GISEL-GFX11-NEXT: v_bfe_u32 v6, v2, 16, 8 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GISEL-GFX11-NEXT: v_and_b32_e32 v5, 0xff, v1 +; GISEL-GFX11-NEXT: v_bfe_u32 v1, v1, 16, 8 +; GISEL-GFX11-NEXT: v_and_b32_e32 v7, 0xff, v3 +; GISEL-GFX11-NEXT: v_bfe_u32 v3, v3, 16, 8 +; GISEL-GFX11-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GISEL-GFX11-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GISEL-GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GISEL-GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GISEL-GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GISEL-GFX11-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GISEL-GFX11-NEXT: 
v_and_or_b32 v0, 0xff, v0, v4 +; GISEL-GFX11-NEXT: v_and_or_b32 v2, 0xff, v2, v6 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GISEL-GFX11-NEXT: v_or3_b32 v0, v0, v5, v1 +; GISEL-GFX11-NEXT: v_or3_b32 v1, v2, v7, v3 +; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX12-LABEL: basic_smax_smin_v8i16_input_2: +; GISEL-GFX12: ; %bb.0: +; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-NEXT: v_pk_min_i16 v0, 0xff00ff, v0 +; GISEL-GFX12-NEXT: v_pk_min_i16 v2, 0xff00ff, v2 +; GISEL-GFX12-NEXT: v_pk_min_i16 v1, 0xff00ff, v1 +; GISEL-GFX12-NEXT: v_pk_min_i16 v3, 0xff00ff, v3 +; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GISEL-GFX12-NEXT: v_pk_max_i16 v0, 0, v0 +; GISEL-GFX12-NEXT: v_pk_max_i16 v2, 0, v2 +; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GISEL-GFX12-NEXT: v_pk_max_i16 v1, 0, v1 +; GISEL-GFX12-NEXT: v_pk_max_i16 v3, 0, v3 +; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GISEL-GFX12-NEXT: v_bfe_u32 v4, v0, 16, 8 +; GISEL-GFX12-NEXT: v_bfe_u32 v6, v2, 16, 8 +; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GISEL-GFX12-NEXT: v_and_b32_e32 v5, 0xff, v1 +; GISEL-GFX12-NEXT: v_bfe_u32 v1, v1, 16, 8 +; GISEL-GFX12-NEXT: v_and_b32_e32 v7, 0xff, v3 +; GISEL-GFX12-NEXT: v_bfe_u32 v3, v3, 16, 8 +; GISEL-GFX12-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GISEL-GFX12-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GISEL-GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GISEL-GFX12-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GISEL-GFX12-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GISEL-GFX12-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GISEL-GFX12-NEXT: v_and_or_b32 v0, 0xff, v0, v4 +; GISEL-GFX12-NEXT: v_and_or_b32 v2, 0xff, v2, v6 +; GISEL-GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GISEL-GFX12-NEXT: v_or3_b32 v0, v0, v5, v1 +; GISEL-GFX12-NEXT: v_or3_b32 v1, v2, v7, v3 +; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] + %smin = call <8 x i16> @llvm.smin.v8i16(<8 x i16> , <8 x i16> %src) + %smed = call <8 x i16> @llvm.smax.v8i16(<8 x i16> , <8 x i16> %smin) + %vec.trunc = trunc <8 x i16> %smed to <8 x i8> + %cast = bitcast <8 x i8> %vec.trunc to <2 x i32> + ret <2 x i32> %cast +} + +define i64 @basic_smax_smin_v8i16_input_3(<8 x i16> %src) { +; SDAG-VI-LABEL: basic_smax_smin_v8i16_input_3: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_mov_b32_e32 v4, 0xff +; SDAG-VI-NEXT: v_min_i16_sdwa v5, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; SDAG-VI-NEXT: v_min_i16_e32 v0, 0xff, v0 +; SDAG-VI-NEXT: v_min_i16_sdwa v6, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; SDAG-VI-NEXT: v_min_i16_e32 v1, 0xff, v1 +; SDAG-VI-NEXT: v_mov_b32_e32 v8, 0 +; SDAG-VI-NEXT: v_min_i16_sdwa v7, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; SDAG-VI-NEXT: v_min_i16_e32 v2, 0xff, v2 +; SDAG-VI-NEXT: v_min_i16_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; SDAG-VI-NEXT: v_min_i16_e32 v3, 0xff, v3 +; SDAG-VI-NEXT: v_max_i16_e32 v1, 0, v1 +; SDAG-VI-NEXT: v_max_i16_sdwa v6, v6, v8 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-VI-NEXT: v_max_i16_e32 v0, 0, v0 +; SDAG-VI-NEXT: v_max_i16_sdwa v5, v5, v8 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-VI-NEXT: v_max_i16_e32 v3, 0, v3 +; SDAG-VI-NEXT: v_max_i16_sdwa v4, v4, v8 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-VI-NEXT: v_max_i16_e32 v2, 0, v2 +; SDAG-VI-NEXT: v_max_i16_sdwa v7, v7, v8 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-VI-NEXT: v_or_b32_e32 v0, v0, v5 
+; SDAG-VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; SDAG-VI-NEXT: v_or_b32_e32 v1, v2, v7 +; SDAG-VI-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX9-LABEL: basic_smax_smin_v8i16_input_3: +; SDAG-GFX9: ; %bb.0: +; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX9-NEXT: v_sat_pk_u8_i16_e32 v0, v0 +; SDAG-GFX9-NEXT: v_sat_pk_u8_i16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; SDAG-GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; SDAG-GFX9-NEXT: v_sat_pk_u8_i16_e32 v1, v2 +; SDAG-GFX9-NEXT: v_sat_pk_u8_i16_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; SDAG-GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX11-LABEL: basic_smax_smin_v8i16_input_3: +; SDAG-GFX11: ; %bb.0: +; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-NEXT: v_sat_pk_u8_i16_e32 v0, v0 +; SDAG-GFX11-NEXT: v_sat_pk_u8_i16_e32 v1, v1 +; SDAG-GFX11-NEXT: v_sat_pk_u8_i16_e32 v2, v2 +; SDAG-GFX11-NEXT: v_sat_pk_u8_i16_e32 v3, v3 +; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; SDAG-GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SDAG-GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; SDAG-GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SDAG-GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 
+; SDAG-GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX12-LABEL: basic_smax_smin_v8i16_input_3: +; SDAG-GFX12: ; %bb.0: +; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-NEXT: v_sat_pk_u8_i16_e32 v0, v0 +; SDAG-GFX12-NEXT: v_sat_pk_u8_i16_e32 v1, v1 +; SDAG-GFX12-NEXT: v_sat_pk_u8_i16_e32 v2, v2 +; SDAG-GFX12-NEXT: v_sat_pk_u8_i16_e32 v3, v3 +; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; SDAG-GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SDAG-GFX12-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; SDAG-GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SDAG-GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX12-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX12-NEXT: v_or_b32_e32 v1, v2, v3 +; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: basic_smax_smin_v8i16_input_3: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_mov_b32_e32 v4, 0xff +; GISEL-VI-NEXT: v_min_i16_e32 v5, 0xff, v0 +; GISEL-VI-NEXT: v_min_i16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GISEL-VI-NEXT: v_min_i16_e32 v6, 0xff, v1 +; GISEL-VI-NEXT: v_min_i16_e32 v7, 0xff, v2 +; GISEL-VI-NEXT: v_min_i16_e32 v8, 0xff, v3 +; GISEL-VI-NEXT: v_min_i16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GISEL-VI-NEXT: v_min_i16_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GISEL-VI-NEXT: v_min_i16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; 
GISEL-VI-NEXT: v_max_i16_e32 v4, 0, v5 +; GISEL-VI-NEXT: v_max_i16_e32 v0, 0, v0 +; GISEL-VI-NEXT: v_max_i16_e32 v5, 0, v6 +; GISEL-VI-NEXT: v_max_i16_e32 v6, 0, v7 +; GISEL-VI-NEXT: v_max_i16_e32 v7, 0, v8 +; GISEL-VI-NEXT: v_mov_b32_e32 v8, 8 +; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GISEL-VI-NEXT: v_max_i16_e32 v1, 0, v1 +; GISEL-VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GISEL-VI-NEXT: v_and_b32_e32 v4, 0xff, v5 +; GISEL-VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GISEL-VI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GISEL-VI-NEXT: v_max_i16_e32 v2, 0, v2 +; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v4 +; GISEL-VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v1, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GISEL-VI-NEXT: v_and_b32_e32 v2, 0xff, v7 +; GISEL-VI-NEXT: v_max_i16_e32 v3, 0, v3 +; GISEL-VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GISEL-VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GISEL-VI-NEXT: v_or_b32_e32 v1, v1, v2 +; GISEL-VI-NEXT: v_and_b32_e32 v2, 0xff, v3 +; GISEL-VI-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GISEL-VI-NEXT: v_or_b32_e32 v1, v1, v2 +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX9-LABEL: basic_smax_smin_v8i16_input_3: +; GISEL-GFX9: ; %bb.0: +; GISEL-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX9-NEXT: v_mov_b32_e32 v4, 0xff00ff +; GISEL-GFX9-NEXT: v_pk_min_i16 v0, v4, v0 +; GISEL-GFX9-NEXT: v_pk_min_i16 v1, v4, v1 +; GISEL-GFX9-NEXT: v_pk_max_i16 v0, 0, v0 +; GISEL-GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GISEL-GFX9-NEXT: v_pk_min_i16 v2, v4, v2 +; GISEL-GFX9-NEXT: v_pk_min_i16 v3, v4, v3 +; GISEL-GFX9-NEXT: v_pk_max_i16 v1, 0, v1 +; GISEL-GFX9-NEXT: v_mov_b32_e32 v4, 0xff +; GISEL-GFX9-NEXT: v_lshlrev_b32_sdwa v6, v5, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GISEL-GFX9-NEXT: v_and_or_b32 v0, v0, v4, v6 +; GISEL-GFX9-NEXT: v_and_b32_e32 v6, 0xff, v1 +; GISEL-GFX9-NEXT: v_mov_b32_e32 v7, 24 +; GISEL-GFX9-NEXT: v_pk_max_i16 v2, 0, v2 +; GISEL-GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GISEL-GFX9-NEXT: v_lshlrev_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GISEL-GFX9-NEXT: v_pk_max_i16 v3, 0, v3 +; GISEL-GFX9-NEXT: v_or3_b32 v0, v0, v6, v1 +; GISEL-GFX9-NEXT: v_lshlrev_b32_sdwa v1, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GISEL-GFX9-NEXT: v_and_or_b32 v1, v2, v4, v1 +; GISEL-GFX9-NEXT: v_and_b32_e32 v2, 0xff, v3 +; GISEL-GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GISEL-GFX9-NEXT: v_lshlrev_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GISEL-GFX9-NEXT: v_or3_b32 v1, v1, v2, v3 +; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX11-LABEL: basic_smax_smin_v8i16_input_3: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: v_pk_min_i16 v0, 0xff00ff, v0 +; GISEL-GFX11-NEXT: v_pk_min_i16 v2, 0xff00ff, v2 +; GISEL-GFX11-NEXT: v_pk_min_i16 v1, 0xff00ff, v1 +; GISEL-GFX11-NEXT: v_pk_min_i16 v3, 0xff00ff, v3 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GISEL-GFX11-NEXT: v_pk_max_i16 v0, 0, v0 +; GISEL-GFX11-NEXT: v_pk_max_i16 v2, 0, v2 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GISEL-GFX11-NEXT: v_pk_max_i16 v1, 0, v1 +; GISEL-GFX11-NEXT: v_pk_max_i16 v3, 0, v3 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GISEL-GFX11-NEXT: v_bfe_u32 v4, v0, 16, 8 +; GISEL-GFX11-NEXT: v_bfe_u32 v6, v2, 16, 8 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GISEL-GFX11-NEXT: v_and_b32_e32 v5, 0xff, v1 +; GISEL-GFX11-NEXT: v_bfe_u32 v1, v1, 16, 8 +; 
GISEL-GFX11-NEXT: v_and_b32_e32 v7, 0xff, v3 +; GISEL-GFX11-NEXT: v_bfe_u32 v3, v3, 16, 8 +; GISEL-GFX11-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GISEL-GFX11-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GISEL-GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GISEL-GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GISEL-GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GISEL-GFX11-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GISEL-GFX11-NEXT: v_and_or_b32 v0, 0xff, v0, v4 +; GISEL-GFX11-NEXT: v_and_or_b32 v2, 0xff, v2, v6 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GISEL-GFX11-NEXT: v_or3_b32 v0, v0, v5, v1 +; GISEL-GFX11-NEXT: v_or3_b32 v1, v2, v7, v3 +; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX12-LABEL: basic_smax_smin_v8i16_input_3: +; GISEL-GFX12: ; %bb.0: +; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-NEXT: v_pk_min_i16 v0, 0xff00ff, v0 +; GISEL-GFX12-NEXT: v_pk_min_i16 v2, 0xff00ff, v2 +; GISEL-GFX12-NEXT: v_pk_min_i16 v1, 0xff00ff, v1 +; GISEL-GFX12-NEXT: v_pk_min_i16 v3, 0xff00ff, v3 +; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GISEL-GFX12-NEXT: v_pk_max_i16 v0, 0, v0 +; GISEL-GFX12-NEXT: v_pk_max_i16 v2, 0, v2 +; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GISEL-GFX12-NEXT: v_pk_max_i16 v1, 0, v1 +; GISEL-GFX12-NEXT: v_pk_max_i16 v3, 0, v3 +; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GISEL-GFX12-NEXT: v_bfe_u32 v4, v0, 16, 8 +; GISEL-GFX12-NEXT: v_bfe_u32 v6, v2, 16, 8 +; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GISEL-GFX12-NEXT: v_and_b32_e32 v5, 0xff, v1 +; GISEL-GFX12-NEXT: v_bfe_u32 v1, v1, 16, 8 +; GISEL-GFX12-NEXT: v_and_b32_e32 v7, 0xff, v3 +; GISEL-GFX12-NEXT: v_bfe_u32 v3, v3, 16, 8 +; 
GISEL-GFX12-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GISEL-GFX12-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GISEL-GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GISEL-GFX12-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GISEL-GFX12-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GISEL-GFX12-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GISEL-GFX12-NEXT: v_and_or_b32 v0, 0xff, v0, v4 +; GISEL-GFX12-NEXT: v_and_or_b32 v2, 0xff, v2, v6 +; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GISEL-GFX12-NEXT: v_or3_b32 v0, v0, v5, v1 +; GISEL-GFX12-NEXT: v_or3_b32 v1, v2, v7, v3 +; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] + %smin = call <8 x i16> @llvm.smin.v8i16(<8 x i16> , <8 x i16> %src) + %smed = call <8 x i16> @llvm.smax.v8i16(<8 x i16> , <8 x i16> %smin) + %vec.trunc = trunc <8 x i16> %smed to <8 x i8> + %cast = bitcast <8 x i8> %vec.trunc to i64 + ret i64 %cast +} + +define <2 x i16> @basic_smax_smin_v4i16_input_1(<4 x i16> %src) { +; SDAG-VI-LABEL: basic_smax_smin_v4i16_input_1: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_mov_b32_e32 v2, 0xff +; SDAG-VI-NEXT: v_min_i16_sdwa v3, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; SDAG-VI-NEXT: v_min_i16_e32 v0, 0xff, v0 +; SDAG-VI-NEXT: v_min_i16_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; SDAG-VI-NEXT: v_min_i16_e32 v1, 0xff, v1 +; SDAG-VI-NEXT: v_mov_b32_e32 v4, 0 +; SDAG-VI-NEXT: v_max_i16_e32 v1, 0, v1 +; SDAG-VI-NEXT: v_max_i16_sdwa v2, v2, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-VI-NEXT: v_max_i16_e32 v0, 0, v0 +; SDAG-VI-NEXT: v_max_i16_sdwa v3, v3, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-VI-NEXT: v_or_b32_e32 v0, v0, v3 +; SDAG-VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX9-LABEL: basic_smax_smin_v4i16_input_1: +; SDAG-GFX9: ; %bb.0: +; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX9-NEXT: v_sat_pk_u8_i16_e32 v0, v0 +; SDAG-GFX9-NEXT: v_sat_pk_u8_i16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; SDAG-GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX11-LABEL: basic_smax_smin_v4i16_input_1: +; SDAG-GFX11: ; %bb.0: +; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-NEXT: v_sat_pk_u8_i16_e32 v0, v0 +; SDAG-GFX11-NEXT: v_sat_pk_u8_i16_e32 v1, v1 +; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SDAG-GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX12-LABEL: basic_smax_smin_v4i16_input_1: +; SDAG-GFX12: ; %bb.0: +; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-NEXT: v_sat_pk_u8_i16_e32 v0, v0 +; SDAG-GFX12-NEXT: v_sat_pk_u8_i16_e32 v1, v1 +; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SDAG-GFX12-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX12-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: basic_smax_smin_v4i16_input_1: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_mov_b32_e32 v2, 0xff +; GISEL-VI-NEXT: v_min_i16_e32 v4, 0xff, v1 +; GISEL-VI-NEXT: 
v_min_i16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GISEL-VI-NEXT: v_max_i16_e32 v1, 0, v1 +; GISEL-VI-NEXT: v_min_i16_e32 v3, 0xff, v0 +; GISEL-VI-NEXT: v_min_i16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GISEL-VI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GISEL-VI-NEXT: v_max_i16_e32 v2, 0, v3 +; GISEL-VI-NEXT: v_max_i16_e32 v0, 0, v0 +; GISEL-VI-NEXT: v_max_i16_e32 v3, 0, v4 +; GISEL-VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GISEL-VI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GISEL-VI-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GISEL-VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GISEL-VI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GISEL-VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GISEL-VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GISEL-VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX9-LABEL: basic_smax_smin_v4i16_input_1: +; GISEL-GFX9: ; %bb.0: +; GISEL-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX9-NEXT: v_mov_b32_e32 v2, 0xff00ff +; GISEL-GFX9-NEXT: v_pk_min_i16 v0, v2, v0 +; GISEL-GFX9-NEXT: v_pk_min_i16 v1, v2, v1 +; GISEL-GFX9-NEXT: v_pk_max_i16 v0, 0, v0 +; GISEL-GFX9-NEXT: v_mov_b32_e32 v2, 0xff +; GISEL-GFX9-NEXT: v_pk_max_i16 v1, 0, v1 +; GISEL-GFX9-NEXT: v_and_b32_sdwa v3, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GISEL-GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3 +; GISEL-GFX9-NEXT: v_and_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GISEL-GFX9-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GISEL-GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GISEL-GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD +; GISEL-GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GISEL-GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX11-LABEL: basic_smax_smin_v4i16_input_1: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: v_pk_min_i16 v0, 0xff00ff, v0 +; GISEL-GFX11-NEXT: v_pk_min_i16 v1, 0xff00ff, v1 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GISEL-GFX11-NEXT: v_pk_max_i16 v0, 0, v0 +; GISEL-GFX11-NEXT: v_pk_max_i16 v1, 0, v1 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GISEL-GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GISEL-GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GISEL-GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GISEL-GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GISEL-GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GISEL-GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GISEL-GFX11-NEXT: v_lshlrev_b16 v2, 8, v2 +; GISEL-GFX11-NEXT: v_lshlrev_b16 v3, 8, v3 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GISEL-GFX11-NEXT: v_or_b32_e32 v0, v0, v2 +; GISEL-GFX11-NEXT: v_or_b32_e32 v1, v1, v3 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GISEL-GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX12-LABEL: basic_smax_smin_v4i16_input_1: +; GISEL-GFX12: ; %bb.0: +; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-NEXT: v_pk_min_i16 v0, 0xff00ff, v0 +; 
GISEL-GFX12-NEXT: v_pk_min_i16 v1, 0xff00ff, v1 +; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GISEL-GFX12-NEXT: v_pk_max_i16 v0, 0, v0 +; GISEL-GFX12-NEXT: v_pk_max_i16 v1, 0, v1 +; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GISEL-GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GISEL-GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GISEL-GFX12-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GISEL-GFX12-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GISEL-GFX12-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GISEL-GFX12-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GISEL-GFX12-NEXT: v_lshlrev_b16 v2, 8, v2 +; GISEL-GFX12-NEXT: v_lshlrev_b16 v3, 8, v3 +; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GISEL-GFX12-NEXT: v_or_b32_e32 v0, v0, v2 +; GISEL-GFX12-NEXT: v_or_b32_e32 v1, v1, v3 +; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GISEL-GFX12-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] + %smin = call <4 x i16> @llvm.smin.v4i16(<4 x i16> , <4 x i16> %src) + %smed = call <4 x i16> @llvm.smax.v4i16(<4 x i16> , <4 x i16> %smin) + %vec.trunc = trunc <4 x i16> %smed to <4 x i8> + %cast = bitcast <4 x i8> %vec.trunc to <2 x i16> + ret <2 x i16> %cast +} + +define i32 @basic_smax_smin_v4i16_input_2(<4 x i16> %src) { +; SDAG-VI-LABEL: basic_smax_smin_v4i16_input_2: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_mov_b32_e32 v2, 0xff +; SDAG-VI-NEXT: v_min_i16_sdwa v3, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; SDAG-VI-NEXT: v_min_i16_e32 v0, 0xff, v0 +; SDAG-VI-NEXT: v_min_i16_sdwa v2, v1, v2 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; SDAG-VI-NEXT: v_min_i16_e32 v1, 0xff, v1 +; SDAG-VI-NEXT: v_mov_b32_e32 v4, 0 +; SDAG-VI-NEXT: v_max_i16_e32 v1, 0, v1 +; SDAG-VI-NEXT: v_max_i16_sdwa v2, v2, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-VI-NEXT: v_max_i16_e32 v0, 0, v0 +; SDAG-VI-NEXT: v_max_i16_sdwa v3, v3, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-VI-NEXT: v_or_b32_e32 v0, v0, v3 +; SDAG-VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX9-LABEL: basic_smax_smin_v4i16_input_2: +; SDAG-GFX9: ; %bb.0: +; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX9-NEXT: v_sat_pk_u8_i16_e32 v0, v0 +; SDAG-GFX9-NEXT: v_sat_pk_u8_i16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; SDAG-GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX11-LABEL: basic_smax_smin_v4i16_input_2: +; SDAG-GFX11: ; %bb.0: +; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-NEXT: v_sat_pk_u8_i16_e32 v0, v0 +; SDAG-GFX11-NEXT: v_sat_pk_u8_i16_e32 v1, v1 +; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SDAG-GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX12-LABEL: basic_smax_smin_v4i16_input_2: +; SDAG-GFX12: ; %bb.0: +; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-NEXT: 
s_wait_kmcnt 0x0 +; SDAG-GFX12-NEXT: v_sat_pk_u8_i16_e32 v0, v0 +; SDAG-GFX12-NEXT: v_sat_pk_u8_i16_e32 v1, v1 +; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SDAG-GFX12-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX12-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: basic_smax_smin_v4i16_input_2: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_mov_b32_e32 v2, 0xff +; GISEL-VI-NEXT: v_min_i16_e32 v3, 0xff, v0 +; GISEL-VI-NEXT: v_min_i16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GISEL-VI-NEXT: v_min_i16_e32 v4, 0xff, v1 +; GISEL-VI-NEXT: v_min_i16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GISEL-VI-NEXT: v_max_i16_e32 v2, 0, v3 +; GISEL-VI-NEXT: v_max_i16_e32 v0, 0, v0 +; GISEL-VI-NEXT: v_max_i16_e32 v3, 0, v4 +; GISEL-VI-NEXT: v_mov_b32_e32 v4, 8 +; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GISEL-VI-NEXT: v_max_i16_e32 v1, 0, v1 +; GISEL-VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GISEL-VI-NEXT: v_and_b32_e32 v2, 0xff, v3 +; GISEL-VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GISEL-VI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v2 +; GISEL-VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX9-LABEL: basic_smax_smin_v4i16_input_2: +; GISEL-GFX9: ; %bb.0: +; GISEL-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX9-NEXT: v_mov_b32_e32 v2, 0xff00ff +; GISEL-GFX9-NEXT: v_pk_min_i16 v0, v2, v0 +; GISEL-GFX9-NEXT: v_pk_min_i16 v1, v2, v1 +; GISEL-GFX9-NEXT: v_pk_max_i16 v0, 0, v0 +; GISEL-GFX9-NEXT: 
v_mov_b32_e32 v3, 8 +; GISEL-GFX9-NEXT: v_pk_max_i16 v1, 0, v1 +; GISEL-GFX9-NEXT: v_mov_b32_e32 v2, 0xff +; GISEL-GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GISEL-GFX9-NEXT: v_and_or_b32 v0, v0, v2, v3 +; GISEL-GFX9-NEXT: v_and_b32_e32 v2, 0xff, v1 +; GISEL-GFX9-NEXT: v_mov_b32_e32 v3, 24 +; GISEL-GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GISEL-GFX9-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GISEL-GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 +; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX11-LABEL: basic_smax_smin_v4i16_input_2: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: v_pk_min_i16 v0, 0xff00ff, v0 +; GISEL-GFX11-NEXT: v_pk_min_i16 v1, 0xff00ff, v1 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GISEL-GFX11-NEXT: v_pk_max_i16 v0, 0, v0 +; GISEL-GFX11-NEXT: v_pk_max_i16 v1, 0, v1 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GISEL-GFX11-NEXT: v_bfe_u32 v2, v0, 16, 8 +; GISEL-GFX11-NEXT: v_and_b32_e32 v3, 0xff, v1 +; GISEL-GFX11-NEXT: v_bfe_u32 v1, v1, 16, 8 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GISEL-GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GISEL-GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GISEL-GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GISEL-GFX11-NEXT: v_and_or_b32 v0, 0xff, v0, v2 +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX11-NEXT: v_or3_b32 v0, v0, v3, v1 +; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX12-LABEL: basic_smax_smin_v4i16_input_2: +; GISEL-GFX12: ; %bb.0: +; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0 
+; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0
+; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0
+; GISEL-GFX12-NEXT: v_pk_min_i16 v0, 0xff00ff, v0
+; GISEL-GFX12-NEXT: v_pk_min_i16 v1, 0xff00ff, v1
+; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GISEL-GFX12-NEXT: v_pk_max_i16 v0, 0, v0
+; GISEL-GFX12-NEXT: v_pk_max_i16 v1, 0, v1
+; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GISEL-GFX12-NEXT: v_bfe_u32 v2, v0, 16, 8
+; GISEL-GFX12-NEXT: v_and_b32_e32 v3, 0xff, v1
+; GISEL-GFX12-NEXT: v_bfe_u32 v1, v1, 16, 8
+; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GISEL-GFX12-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GISEL-GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GISEL-GFX12-NEXT: v_lshlrev_b32_e32 v1, 24, v1
+; GISEL-GFX12-NEXT: v_and_or_b32 v0, 0xff, v0, v2
+; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX12-NEXT: v_or3_b32 v0, v0, v3, v1
+; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31]
+  %smin = call <4 x i16> @llvm.smin.v4i16(<4 x i16> <i16 255, i16 255, i16 255, i16 255>, <4 x i16> %src)
+  %smed = call <4 x i16> @llvm.smax.v4i16(<4 x i16> <i16 0, i16 0, i16 0, i16 0>, <4 x i16> %smin)
+  %vec.trunc = trunc <4 x i16> %smed to <4 x i8>
+  %cast = bitcast <4 x i8> %vec.trunc to i32
+  ret i32 %cast
+}
+
+; Odd element count: <3 x i16> is not one of the v2/v4/v8 vector types that
+; the lowering marks Custom for TRUNCATE_SSAT_U, so the expected output below
+; keeps the separate signed min/max clamp to [0, 255] followed by manual byte
+; packing, and contains no v_sat_pk_u8_i16.
+define i24 @basic_smax_smin_vec_v3i16(<3 x i16> %src) {
+; SDAG-VI-LABEL: basic_smax_smin_vec_v3i16:
+; SDAG-VI: ; %bb.0:
+; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s4
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, 0xff
+; SDAG-VI-NEXT: v_min_i16_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; SDAG-VI-NEXT: v_mov_b32_e32 v4, s4
+; SDAG-VI-NEXT: v_min_i16_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; SDAG-VI-NEXT: v_min_i16_e32 v0, 0xff, v0
+; SDAG-VI-NEXT: v_min_i16_e32 v1, 0xff, v1
+; SDAG-VI-NEXT: v_max_i16_sdwa v3, v3, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; SDAG-VI-NEXT: v_mov_b32_e32 v4, 0
+; SDAG-VI-NEXT: v_max_i16_e32 v1, 0, v1
+; SDAG-VI-NEXT: v_max_i16_e32 v0, 0, v0
+; SDAG-VI-NEXT: v_max_i16_sdwa v2, v2, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; SDAG-VI-NEXT: v_or_b32_e32 v0, v0, v2
+; SDAG-VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; SDAG-VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-VI-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX9-LABEL: basic_smax_smin_vec_v3i16:
+; SDAG-GFX9: ; %bb.0:
+; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX9-NEXT: s_movk_i32 s4, 0xff
+; SDAG-GFX9-NEXT: v_pk_min_i16 v0, v0, s4 op_sel_hi:[1,0]
+; SDAG-GFX9-NEXT: v_pk_max_i16 v0, v0, 0
+; SDAG-GFX9-NEXT: v_pk_min_i16 v1, v1, s4
+; SDAG-GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; SDAG-GFX9-NEXT: v_pk_max_i16 v1, v1, 0
+; SDAG-GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; SDAG-GFX9-NEXT: v_or_b32_e32 v0, v0, v2
+; SDAG-GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; SDAG-GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; SDAG-GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; SDAG-GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX11-LABEL: basic_smax_smin_vec_v3i16:
+; SDAG-GFX11: ; %bb.0:
+; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1]
+; SDAG-GFX11-NEXT: v_pk_min_i16 v1, 0xff, v1
+; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX11-NEXT: v_pk_max_i16 v0, v0, 0
+; SDAG-GFX11-NEXT: v_pk_max_i16 v1, v1, 0
+; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; SDAG-GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX11-NEXT: v_lshlrev_b16 v2, 8, v2
+; SDAG-GFX11-NEXT: v_lshlrev_b16 v3, 8, v3
+; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX11-NEXT: v_or_b32_e32 v0, v0, v2
+; SDAG-GFX11-NEXT: v_or_b32_e32 v1, v1, v3
+; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SDAG-GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX11-NEXT: v_or_b32_e32 v0, v0, v1
+; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX12-LABEL: basic_smax_smin_vec_v3i16:
+; SDAG-GFX12: ; %bb.0:
+; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; SDAG-GFX12-NEXT: s_wait_expcnt 0x0
+; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0
+; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0
+; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0
+; SDAG-GFX12-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1]
+; SDAG-GFX12-NEXT: v_pk_min_i16 v1, 0xff, v1
+; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX12-NEXT: v_pk_max_i16 v0, v0, 0
+; SDAG-GFX12-NEXT: v_pk_max_i16 v1, v1, 0
+; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; SDAG-GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX12-NEXT: v_lshlrev_b16 v2, 8, v2
+; SDAG-GFX12-NEXT: v_lshlrev_b16 v3, 8, v3
+; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX12-NEXT: v_or_b32_e32 v0, v0, v2
+; SDAG-GFX12-NEXT: v_or_b32_e32 v1, v1, v3
+; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SDAG-GFX12-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX12-NEXT: v_or_b32_e32 v0, v0, v1
+; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-VI-LABEL: basic_smax_smin_vec_v3i16:
+; GISEL-VI: ; %bb.0:
+; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, 0xff
+; GISEL-VI-NEXT: v_min_i16_e32 v3, 0xff, v0
+; GISEL-VI-NEXT: v_min_i16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GISEL-VI-NEXT: v_max_i16_e32 v0, 0, v0
+; GISEL-VI-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GISEL-VI-NEXT: v_min_i16_e32 v1, 0xff, v1
+; GISEL-VI-NEXT: v_max_i16_e32 v2, 0, v3
+; GISEL-VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GISEL-VI-NEXT: v_max_i16_e32 v1, 0, v1
+; GISEL-VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, 0xffff
+; GISEL-VI-NEXT: v_and_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GISEL-VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GISEL-VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX9-LABEL: basic_smax_smin_vec_v3i16:
+; GISEL-GFX9: ; %bb.0:
+; GISEL-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX9-NEXT: v_mov_b32_e32 v2, 0xff00ff
+; GISEL-GFX9-NEXT: v_pk_min_i16 v0, v2, v0
+; GISEL-GFX9-NEXT: v_mov_b32_e32 v2, 0xff
+; GISEL-GFX9-NEXT: v_pk_max_i16 v0, 0, v0
+; GISEL-GFX9-NEXT: v_med3_i16 v1, v1, 0, v2
+; GISEL-GFX9-NEXT: v_and_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GISEL-GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GISEL-GFX9-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GISEL-GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GISEL-GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GISEL-GFX9-NEXT: v_and_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GISEL-GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX11-LABEL: basic_smax_smin_vec_v3i16:
+; GISEL-GFX11: ; %bb.0:
+; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-NEXT: v_pk_min_i16 v0, 0xff00ff, v0
+; GISEL-GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff
+; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GISEL-GFX11-NEXT: v_pk_max_i16 v0, 0, v0
+; GISEL-GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GISEL-GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GISEL-GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GISEL-GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GISEL-GFX11-NEXT: v_lshlrev_b16 v2, 8, v2
+; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-GFX11-NEXT: v_or_b32_e32 v0, v0, v2
+; GISEL-GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX12-LABEL: basic_smax_smin_vec_v3i16:
+; GISEL-GFX12: ; %bb.0:
+; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-GFX12-NEXT: s_wait_expcnt 0x0
+; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0
+; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0
+; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0
+; GISEL-GFX12-NEXT: v_pk_min_i16 v0, 0xff00ff, v0
+; GISEL-GFX12-NEXT: v_med3_i16 v1, v1, 0, 0xff
+; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GISEL-GFX12-NEXT: v_pk_max_i16 v0, 0, v0
+; GISEL-GFX12-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GISEL-GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GISEL-GFX12-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GISEL-GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-GFX12-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GISEL-GFX12-NEXT: v_lshlrev_b16 v2, 8, v2
+; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-GFX12-NEXT: v_or_b32_e32 v0, v0, v2
+; GISEL-GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX12-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31]
+  %smin = call <3 x i16> @llvm.smin.v3i16(<3 x i16> <i16 255, i16 255, i16 255>, <3 x i16> %src)
+  %smed = call <3 x i16> @llvm.smax.v3i16(<3 x i16> <i16 0, i16 0, i16 0>, <3 x i16> %smin)
+  %vec.trunc = trunc <3 x i16> %smed to <3 x i8>
+  %cast = bitcast <3 x i8> %vec.trunc to i24
+  ret i24 %cast
+}