Skip to content
Closed
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions clang/include/clang/Basic/BuiltinsAMDGPU.def
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,8 @@ BUILTIN(__builtin_amdgcn_ds_bpermute, "iii", "nc")
BUILTIN(__builtin_amdgcn_readfirstlane, "ii", "nc")
BUILTIN(__builtin_amdgcn_readlane, "iii", "nc")
BUILTIN(__builtin_amdgcn_fmed3f, "ffff", "nc")
BUILTIN(__builtin_amdgcn_smed3, "iiii", "nc")
BUILTIN(__builtin_amdgcn_umed3, "UiUiUiUi", "nc")
BUILTIN(__builtin_amdgcn_ds_faddf, "ff*3fIiIiIb", "n")
BUILTIN(__builtin_amdgcn_ds_fminf, "ff*3fIiIiIb", "n")
BUILTIN(__builtin_amdgcn_ds_fmaxf, "ff*3fIiIiIb", "n")
Expand Down Expand Up @@ -265,6 +267,8 @@ TARGET_BUILTIN(__builtin_amdgcn_perm, "UiUiUiUi", "nc", "gfx8-insts")
//===----------------------------------------------------------------------===//

TARGET_BUILTIN(__builtin_amdgcn_fmed3h, "hhhh", "nc", "gfx9-insts")
TARGET_BUILTIN(__builtin_amdgcn_smed3h, "ssss", "nc", "gfx9-insts")
TARGET_BUILTIN(__builtin_amdgcn_umed3h, "UsUsUsUs", "nc", "gfx9-insts")

TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_f64, "dd*1d", "t", "gfx90a-insts")
TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_f32, "ff*1f", "t", "atomic-fadd-rtn-insts")
Expand Down
8 changes: 8 additions & 0 deletions clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -606,6 +606,14 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_fmed3h:
return emitBuiltinWithOneOverloadedType<3>(*this, E,
Intrinsic::amdgcn_fmed3);
case AMDGPU::BI__builtin_amdgcn_smed3:
case AMDGPU::BI__builtin_amdgcn_smed3h:
return emitBuiltinWithOneOverloadedType<3>(*this, E,
Intrinsic::amdgcn_smed3);
case AMDGPU::BI__builtin_amdgcn_umed3:
case AMDGPU::BI__builtin_amdgcn_umed3h:
return emitBuiltinWithOneOverloadedType<3>(*this, E,
Intrinsic::amdgcn_umed3);
case AMDGPU::BI__builtin_amdgcn_ds_append:
case AMDGPU::BI__builtin_amdgcn_ds_consume: {
Intrinsic::ID Intrin = BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_append ?
Expand Down
12 changes: 12 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -543,6 +543,18 @@ def int_amdgcn_fmed3 :
[IntrNoMem, IntrSpeculatable]
>;

// llvm.amdgcn.smed3: signed median of three integers. Overloaded on one
// integer type; all three operands and the result share that type. Marked
// as not accessing memory and as speculatable.
def int_amdgcn_smed3 :
DefaultAttrsIntrinsic<[llvm_anyint_ty],
[LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
[IntrNoMem, IntrSpeculatable]
>;

// llvm.amdgcn.umed3: unsigned median of three integers. Overloaded on one
// integer type; all three operands and the result share that type. Marked
// as not accessing memory and as speculatable.
def int_amdgcn_umed3 :
DefaultAttrsIntrinsic<[llvm_anyint_ty],
[LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
[IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_cubeid : ClangBuiltin<"__builtin_amdgcn_cubeid">,
DefaultAttrsIntrinsic<[llvm_float_ty],
[llvm_float_ty, llvm_float_ty, llvm_float_ty],
Expand Down
4 changes: 2 additions & 2 deletions llvm/lib/Target/AMDGPU/AMDGPUGISel.td
Original file line number Diff line number Diff line change
Expand Up @@ -256,8 +256,8 @@ def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE2, AMDGPUcvt_f32_ubyte2>;
def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE3, AMDGPUcvt_f32_ubyte3>;

def : GINodeEquiv<G_AMDGPU_CVT_PK_I16_I32, AMDGPUpk_i16_i32_impl>;
def : GINodeEquiv<G_AMDGPU_SMED3, AMDGPUsmed3>;
def : GINodeEquiv<G_AMDGPU_UMED3, AMDGPUumed3>;
def : GINodeEquiv<G_AMDGPU_SMED3, AMDGPUsmed3_impl>;
def : GINodeEquiv<G_AMDGPU_UMED3, AMDGPUumed3_impl>;
def : GINodeEquiv<G_AMDGPU_FMED3, AMDGPUfmed3_impl>;
def : GINodeEquiv<G_AMDGPU_CLAMP, AMDGPUclamp>;

Expand Down
172 changes: 172 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,26 @@ static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
return maxnum(Src0, Src1);
}

// Constant fold llvm.amdgcn.smed3: returns the median of the three operands
// under signed comparison. All operands must have the same bit width.
static APInt smed3AMDGCN(const APInt &Src0, const APInt &Src1,
                         const APInt &Src2) {
  // Order the first two operands, then place the third relative to them.
  const bool FirstIsGreater = Src0.sgt(Src1);
  const auto &Lo = FirstIsGreater ? Src1 : Src0;
  const auto &Hi = FirstIsGreater ? Src0 : Src1;

  // Src2 is the maximum: the median is the larger of the ordered pair.
  if (Src2.sgt(Hi))
    return Hi;
  // Src2 is the minimum: the median is the smaller of the ordered pair.
  if (Lo.sgt(Src2))
    return Lo;
  // Otherwise Lo <= Src2 <= Hi, so Src2 itself is the median.
  return Src2;
}

// Constant fold llvm.amdgcn.umed3: returns the median of the three operands
// under unsigned comparison. All operands must have the same bit width.
static APInt umed3AMDGCN(const APInt &Src0, const APInt &Src1,
                         const APInt &Src2) {
  // Order the first two operands, then place the third relative to them.
  const bool FirstIsGreater = Src0.ugt(Src1);
  const auto &Lo = FirstIsGreater ? Src1 : Src0;
  const auto &Hi = FirstIsGreater ? Src0 : Src1;

  // Src2 is the maximum: the median is the larger of the ordered pair.
  if (Src2.ugt(Hi))
    return Hi;
  // Src2 is the minimum: the median is the smaller of the ordered pair.
  if (Lo.ugt(Src2))
    return Lo;
  // Otherwise Lo <= Src2 <= Hi, so Src2 itself is the median.
  return Src2;
}

// Check if a value can be converted to a 16-bit value without losing
// precision.
// The value is expected to be either a float (IsFloat = true) or an unsigned
Expand Down Expand Up @@ -427,6 +447,36 @@ static Value *matchFPExtFromF16(Value *Arg) {
return nullptr;
}

/// Match a single-use sext whose source is i16, or an integer constant that
/// is representable as a signed 16-bit value.
///
/// \returns the i16 source of the extension, or an i16 constant with the
/// same signed value as \p Arg, or nullptr if \p Arg matches neither form.
static Value *matchSExtFromI16(Value *Arg) {
  Value *Src = nullptr;
  ConstantInt *CInt = nullptr;
  if (match(Arg, m_OneUse(m_SExt(m_Value(Src))))) {
    if (Src->getType()->isIntegerTy(16))
      return Src;
  } else if (match(Arg, m_ConstantInt(CInt))) {
    // The narrowed constant is sign-extended again after the fold, so it must
    // round-trip through i16 as a *signed* value. An unsigned check
    // (getActiveBits() <= 16) would accept e.g. i32 65535, which truncates to
    // i16 -1 and re-extends to -1, and would reject every negative constant.
    if (CInt->getValue().isSignedIntN(16))
      return ConstantInt::get(Type::getInt16Ty(Arg->getContext()),
                              CInt->getValue().trunc(16));
  }
  return nullptr;
}

/// Match a single-use zext whose source is i16, or an integer constant whose
/// value needs at most 16 bits when treated as unsigned.
///
/// \returns the i16 source of the extension, or the constant truncated to
/// i16, or nullptr if \p Arg matches neither form.
static Value *matchZExtFromI16(Value *Arg) {
  // A matched zext is only usable when it widens from exactly i16.
  Value *Narrow = nullptr;
  if (match(Arg, m_OneUse(m_ZExt(m_Value(Narrow)))))
    return Narrow->getType()->isIntegerTy(16) ? Narrow : nullptr;

  ConstantInt *Const = nullptr;
  if (!match(Arg, m_ConstantInt(Const)))
    return nullptr;

  // Reject constants that would lose set bits when truncated to 16 bits.
  const auto &Bits = Const->getValue();
  if (Bits.getActiveBits() > 16)
    return nullptr;

  return ConstantInt::get(Type::getInt16Ty(Arg->getContext()),
                          Bits.trunc(16));
}

// Trim all zero components from the end of the vector \p UseV and return
// an appropriate bitset with known elements.
static APInt trimTrailingZerosInVector(InstCombiner &IC, Value *UseV,
Expand Down Expand Up @@ -1174,6 +1224,128 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {

break;
}
case Intrinsic::amdgcn_smed3: {
  // Simplify llvm.amdgcn.smed3 (signed median of three integers).
  //
  // NOTE(review): a reviewer commented on this hunk that the code is untested
  // and that these intrinsics are not universally supported.
  Value *Src0 = II.getArgOperand(0);
  Value *Src1 = II.getArgOperand(1);
  Value *Src2 = II.getArgOperand(2);

  // Propagate poison values: any poison operand replaces the whole call.
  for (Value *Src : {Src0, Src1, Src2}) {
    if (isa<PoisonValue>(Src))
      return IC.replaceInstUsesWith(II, Src);
  }

  bool Swap = false;
  // Canonicalize constants to RHS operands.
  //
  // smed3(c0, x, c1) -> smed3(x, c0, c1)
  //
  // The three conditional swaps below move constants to the right of
  // non-constants; the operand order does not affect the median.
  if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
    std::swap(Src0, Src1);
    Swap = true;
  }

  if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
    std::swap(Src1, Src2);
    Swap = true;
  }

  if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
    std::swap(Src0, Src1);
    Swap = true;
  }

  // If anything moved, rewrite the call's operands in place and return it.
  if (Swap) {
    II.setArgOperand(0, Src0);
    II.setArgOperand(1, Src1);
    II.setArgOperand(2, Src2);
    return &II;
  }

  // Constant fold smed3 with constant operands.
  if (const ConstantInt *C0 = dyn_cast<ConstantInt>(Src0)) {
    if (const ConstantInt *C1 = dyn_cast<ConstantInt>(Src1)) {
      if (const ConstantInt *C2 = dyn_cast<ConstantInt>(Src2)) {
        APInt Result =
            smed3AMDGCN(C0->getValue(), C1->getValue(), C2->getValue());
        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), Result));
      }
    }
  }

  // Width reduction for integer extensions.
  // smed3((sext X), (sext Y), (sext Z)) -> sext (smed3(X, Y, Z))
  //
  // NOTE(review): this emits the intrinsic at i16 without any subtarget
  // check, while the i16 Clang builtins in this change require gfx9-insts.
  // Confirm whether the narrowing should be gated on 16-bit med3 support.
  if (Value *X = matchSExtFromI16(Src0)) {
    if (Value *Y = matchSExtFromI16(Src1)) {
      if (Value *Z = matchSExtFromI16(Src2)) {
        Value *NewCall = IC.Builder.CreateIntrinsic(
            IID, {X->getType()}, {X, Y, Z}, &II, II.getName());
        return new SExtInst(NewCall, II.getType());
      }
    }
  }

  break;
}
case Intrinsic::amdgcn_umed3: {
// Simplify llvm.amdgcn.umed3 (unsigned median of three integers): propagate
// poison, move constant operands to the right, fold all-constant calls, and
// narrow calls whose operands are all zero-extended from i16.
Value *Src0 = II.getArgOperand(0);
Value *Src1 = II.getArgOperand(1);
Value *Src2 = II.getArgOperand(2);

// Propagate poison values: any poison operand replaces the whole call.
for (Value *Src : {Src0, Src1, Src2}) {
if (isa<PoisonValue>(Src))
return IC.replaceInstUsesWith(II, Src);
}

bool Swap = false;
// Canonicalize constants to RHS operands.
//
// umed3(c0, x, c1) -> umed3(x, c0, c1)
//
// The three conditional swaps below move constants to the right of
// non-constants; the operand order does not affect the median.
if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
std::swap(Src0, Src1);
Swap = true;
}

if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
std::swap(Src1, Src2);
Swap = true;
}

if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
std::swap(Src0, Src1);
Swap = true;
}

// If anything moved, rewrite the call's operands in place and return it.
if (Swap) {
II.setArgOperand(0, Src0);
II.setArgOperand(1, Src1);
II.setArgOperand(2, Src2);
return &II;
}

// Constant fold umed3 with constant operands.
if (const ConstantInt *C0 = dyn_cast<ConstantInt>(Src0)) {
if (const ConstantInt *C1 = dyn_cast<ConstantInt>(Src1)) {
if (const ConstantInt *C2 = dyn_cast<ConstantInt>(Src2)) {
APInt Result = umed3AMDGCN(C0->getValue(), C1->getValue(), C2->getValue());
return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Result));
}
}
}

// Width reduction for integer extensions.
// umed3((zext X), (zext Y), (zext Z)) -> zext (umed3(X, Y, Z))
//
// NOTE(review): this emits the intrinsic at i16 without any subtarget check,
// while the i16 Clang builtins in this change require gfx9-insts. Confirm
// whether the narrowing should be gated on 16-bit med3 support.
if (Value *X = matchZExtFromI16(Src0)) {
if (Value *Y = matchZExtFromI16(Src1)) {
if (Value *Z = matchZExtFromI16(Src2)) {
Value *NewCall = IC.Builder.CreateIntrinsic(
IID, {X->getType()}, {X, Y, Z}, &II, II.getName());
return new ZExtInst(NewCall, II.getType());
}
}
}

break;
}
case Intrinsic::amdgcn_icmp:
case Intrinsic::amdgcn_fcmp: {
const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
Expand Down
19 changes: 12 additions & 7 deletions llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -334,16 +334,13 @@ def AMDGPUmad_i24 : SDNode<"AMDGPUISD::MAD_I24", AMDGPUDTIntTernaryOp,
[]
>;

def AMDGPUsmed3 : SDNode<"AMDGPUISD::SMED3", AMDGPUDTIntTernaryOp,
[]
>;

def AMDGPUumed3 : SDNode<"AMDGPUISD::UMED3", AMDGPUDTIntTernaryOp,
[]
>;

def AMDGPUfmed3_impl : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>;

def AMDGPUsmed3_impl : SDNode<"AMDGPUISD::SMED3", AMDGPUDTIntTernaryOp, []>;

def AMDGPUumed3_impl : SDNode<"AMDGPUISD::UMED3", AMDGPUDTIntTernaryOp, []>;

def AMDGPUfdot2_impl : SDNode<"AMDGPUISD::FDOT2",
SDTypeProfile<1, 4, [SDTCisSameAs<0, 3>, SDTCisSameAs<1, 2>,
SDTCisFP<0>, SDTCisVec<1>,
Expand Down Expand Up @@ -448,6 +445,14 @@ def AMDGPUfmed3 : PatFrags<(ops node:$src0, node:$src1, node:$src2),
[(int_amdgcn_fmed3 node:$src0, node:$src1, node:$src2),
(AMDGPUfmed3_impl node:$src0, node:$src1, node:$src2)]>;

// Pattern fragment for a three-operand signed med3: matches either the
// llvm.amdgcn.smed3 intrinsic or the AMDGPUsmed3_impl node with the same
// operands.
def AMDGPUsmed3 : PatFrags<(ops node:$src0, node:$src1, node:$src2),
[(int_amdgcn_smed3 node:$src0, node:$src1, node:$src2),
(AMDGPUsmed3_impl node:$src0, node:$src1, node:$src2)]>;

// Pattern fragment for a three-operand unsigned med3: matches either the
// llvm.amdgcn.umed3 intrinsic or the AMDGPUumed3_impl node with the same
// operands.
def AMDGPUumed3 : PatFrags<(ops node:$src0, node:$src1, node:$src2),
[(int_amdgcn_umed3 node:$src0, node:$src1, node:$src2),
(AMDGPUumed3_impl node:$src0, node:$src1, node:$src2)]>;

def AMDGPUdiv_fixup : PatFrags<(ops node:$src0, node:$src1, node:$src2),
[(int_amdgcn_div_fixup node:$src0, node:$src1, node:$src2),
(AMDGPUdiv_fixup_impl node:$src0, node:$src1, node:$src2)]>;
Expand Down
22 changes: 22 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7798,6 +7798,28 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
Observer.changedInstr(MI);
return true;
}
case Intrinsic::amdgcn_smed3: {
// Rewrite the llvm.amdgcn.smed3 intrinsic instruction in place into the
// generic G_AMDGPU_SMED3 opcode, notifying the change observer around the
// mutation.
GISelChangeObserver &Observer = Helper.Observer;

// FIXME: This is to workaround the inability of tablegen match combiners to
// match intrinsics in patterns.
Observer.changingInstr(MI);
MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_SMED3));
// Presumably operand 1 is the intrinsic ID, which the generic opcode does
// not carry — TODO confirm.
MI.removeOperand(1);
Observer.changedInstr(MI);
return true;
}
case Intrinsic::amdgcn_umed3: {
// Rewrite the llvm.amdgcn.umed3 intrinsic instruction in place into the
// generic G_AMDGPU_UMED3 opcode, notifying the change observer around the
// mutation.
GISelChangeObserver &Observer = Helper.Observer;

// FIXME: This is to workaround the inability of tablegen match combiners to
// match intrinsics in patterns.
Observer.changingInstr(MI);
MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_UMED3));
// Presumably operand 1 is the intrinsic ID, which the generic opcode does
// not carry — TODO confirm.
MI.removeOperand(1);
Observer.changedInstr(MI);
return true;
}
case Intrinsic::amdgcn_readlane:
case Intrinsic::amdgcn_writelane:
case Intrinsic::amdgcn_readfirstlane:
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4136,6 +4136,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
case AMDGPU::G_AMDGPU_CVT_PK_I16_I32:
case AMDGPU::G_AMDGPU_SMED3:
case AMDGPU::G_AMDGPU_UMED3:
case AMDGPU::G_AMDGPU_FMED3:
return getDefaultMappingVOP(MI);
case AMDGPU::G_UMULH:
Expand Down Expand Up @@ -4660,6 +4661,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_bf6_bf16:
case Intrinsic::amdgcn_sat_pk4_i4_i8:
case Intrinsic::amdgcn_sat_pk4_u4_u8:
case Intrinsic::amdgcn_smed3:
case Intrinsic::amdgcn_umed3:
case Intrinsic::amdgcn_fmed3:
case Intrinsic::amdgcn_cubeid:
case Intrinsic::amdgcn_cubema:
Expand Down
27 changes: 27 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smed3.i16.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s

; Three i32 kernel arguments are truncated to i16 and passed to
; llvm.amdgcn.smed3.i16; the i16 result is stored to %out. The check expects
; a single v_med3_i16 whose first source is a scalar register.
; GCN-LABEL: {{^}}test_smed3_i16:
; GCN: v_med3_i16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @test_smed3_i16(ptr addrspace(1) %out, i32 %src0.arg, i32 %src1.arg, i32 %src2.arg) #1 {
%src0.i16 = trunc i32 %src0.arg to i16
%src1.i16 = trunc i32 %src1.arg to i16
%src2.i16 = trunc i32 %src2.arg to i16
%med3 = call i16 @llvm.amdgcn.smed3.i16(i16 %src0.i16, i16 %src1.i16, i16 %src2.i16)
store i16 %med3, ptr addrspace(1) %out
ret void
}

; Same as the three-register case, but the third operand is the constant 0;
; the check expects it to appear as a literal 0 in the last source position.
; GCN-LABEL: {{^}}test_smed3_zero_i16:
; GCN: v_med3_i16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 0
define amdgpu_kernel void @test_smed3_zero_i16(ptr addrspace(1) %out, i32 %src0.arg, i32 %src1.arg) #1 {
%src0.i16 = trunc i32 %src0.arg to i16
%src1.i16 = trunc i32 %src1.arg to i16
%med3 = call i16 @llvm.amdgcn.smed3.i16(i16 %src0.i16, i16 %src1.i16, i16 0)
store i16 %med3, ptr addrspace(1) %out
ret void
}

declare i16 @llvm.amdgcn.smed3.i16(i16, i16, i16) #0

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
Loading