Skip to content
Closed
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -2206,11 +2206,18 @@ class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;

// i32 llvm.amdgcn.readfirstlane(i32)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should just upgrade the intrinsic instead of introducing a new copy

// Fixed-type form: one i32 operand, one i32 result. Bound to the Clang builtin
// __builtin_amdgcn_readfirstlane via ClangBuiltin.
def int_amdgcn_readfirstlane :
ClangBuiltin<"__builtin_amdgcn_readfirstlane">,
Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// ty llvm.amdgcn.readfirstlane2(ty)
// A type-generic version of readfirstlane.
// The result type is overloaded (llvm_any_ty) and the single operand is tied
// to that same type through LLVMMatchType<0>. The attribute list is identical
// to int_amdgcn_readfirstlane; no ClangBuiltin binding is declared here.
def int_amdgcn_readfirstlane2 :
Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// The lane argument must be uniform across the currently active threads of the
// current wave. Otherwise, the result is undefined.
def int_amdgcn_readlane :
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5453,6 +5453,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(LDS)
NODE_NAME_CASE(FPTRUNC_ROUND_UPWARD)
NODE_NAME_CASE(FPTRUNC_ROUND_DOWNWARD)
NODE_NAME_CASE(READFIRSTLANE)
NODE_NAME_CASE(DUMMY_CHAIN)
case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
NODE_NAME_CASE(LOAD_D16_HI)
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -541,6 +541,8 @@ enum NodeType : unsigned {
FPTRUNC_ROUND_UPWARD,
FPTRUNC_ROUND_DOWNWARD,

READFIRSTLANE,

DUMMY_CHAIN,
FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
LOAD_D16_HI,
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,8 @@ def AMDGPUfdot2_impl : SDNode<"AMDGPUISD::FDOT2",

def AMDGPUperm_impl : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>;

// Target DAG node for AMDGPUISD::READFIRSTLANE, declared with the
// SDTIntUnaryOp type profile and no node properties.
def AMDGPUreadfirstlane_impl : SDNode<"AMDGPUISD::READFIRSTLANE", SDTIntUnaryOp>;

// SI+ export
def AMDGPUExportOp : SDTypeProfile<0, 8, [
SDTCisInt<0>, // i8 tgt
Expand Down Expand Up @@ -504,3 +506,7 @@ def AMDGPUdiv_fmas : PatFrags<(ops node:$src0, node:$src1, node:$src2, node:$vcc
def AMDGPUperm : PatFrags<(ops node:$src0, node:$src1, node:$src2),
[(int_amdgcn_perm node:$src0, node:$src1, node:$src2),
(AMDGPUperm_impl node:$src0, node:$src1, node:$src2)]>;

// Pattern fragment matching a readfirstlane of $src in either of two forms:
// the int_amdgcn_readfirstlane intrinsic or the AMDGPUreadfirstlane_impl node.
def AMDGPUreadfirstlane : PatFrags<(ops node:$src),
[(int_amdgcn_readfirstlane node:$src),
(AMDGPUreadfirstlane_impl node:$src)]>;
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
Original file line number Diff line number Diff line change
Expand Up @@ -393,6 +393,7 @@ def UniformIntrinsics : GenericTable {
}

def : AlwaysUniform<int_amdgcn_readfirstlane>;
def : AlwaysUniform<int_amdgcn_readfirstlane2>;
def : AlwaysUniform<int_amdgcn_readlane>;
def : AlwaysUniform<int_amdgcn_icmp>;
def : AlwaysUniform<int_amdgcn_fcmp>;
Expand Down
21 changes: 20 additions & 1 deletion llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,6 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Promote);
AddPromotedToType(ISD::BUILD_VECTOR, MVT::v2bf16, MVT::v2i16);
}

setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
Expand Down Expand Up @@ -8452,6 +8451,26 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
}
case Intrinsic::amdgcn_addrspacecast_nonnull:
return lowerADDRSPACECAST(Op, DAG);
case Intrinsic::amdgcn_readfirstlane2:
if (VT.getSizeInBits() <= 32) {
MVT IntVT = MVT::getIntegerVT(VT.getSizeInBits());
return DAG.getBitcast(
VT, DAG.getAnyExtOrTrunc(
DAG.getNode(AMDGPUISD::READFIRSTLANE, DL, MVT::i32,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe directly accept legal 32-bit types and avoid the bitcast?

DAG.getAnyExtOrTrunc(
DAG.getBitcast(IntVT, Op.getOperand(1)), DL,
MVT::i32)),
DL, IntVT));
}
if (VT.getSizeInBits() % 32 == 0) {
MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
return DAG.getBitcast(
VT, DAG.UnrollVectorOp(
DAG.getNode(AMDGPUISD::READFIRSTLANE, DL, VecVT,
DAG.getBitcast(VecVT, Op.getOperand(1)))
.getNode()));
}
return SDValue();
default:
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/SIInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -3405,7 +3405,7 @@ def : GCNPat<
// FIXME: Should also do this for readlane, but tablegen crashes on
// the ignored src1.
def : GCNPat<
(int_amdgcn_readfirstlane (i32 imm:$src)),
(AMDGPUreadfirstlane (i32 imm:$src)),
(S_MOV_B32 SReg_32:$src)
>;

Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/VOP1Instructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,7 @@ def VOP_READFIRSTLANE : VOPProfile <[i32, i32, untyped, untyped]> {
// FIXME: Specify SchedRW for READFIRSTLANE_B32
// TODO: There is VOP3 encoding also
def V_READFIRSTLANE_B32 : VOP1_Pseudo <"v_readfirstlane_b32", VOP_READFIRSTLANE,
getVOP1Pat<int_amdgcn_readfirstlane,
getVOP1Pat<AMDGPUreadfirstlane,
VOP_READFIRSTLANE>.ret, 1> {
let isConvergent = 1;
}
Expand Down
175 changes: 140 additions & 35 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
Original file line number Diff line number Diff line change
@@ -1,71 +1,176 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope %s
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add global-isel checks.


declare i32 @llvm.amdgcn.readfirstlane(i32) #0

; CHECK-LABEL: {{^}}test_readfirstlane:
; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v2
define void @test_readfirstlane(ptr addrspace(1) %out, i32 %src) #1 {
define void @test_readfirstlane(ptr addrspace(1) %out, i32 %src) {
; CHECK-LABEL: test_readfirstlane:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_readfirstlane_b32 s4, v2
; CHECK-NEXT: v_mov_b32_e32 v2, s4
; CHECK-NEXT: flat_store_dword v[0:1], v2
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
%readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %src)
store i32 %readfirstlane, ptr addrspace(1) %out, align 4
ret void
}

; CHECK-LABEL: {{^}}test_readfirstlane_imm:
; CHECK: s_mov_b32 [[SGPR_VAL:s[0-9]]], 32
; CHECK-NOT: [[SGPR_VAL]]
; CHECK: ; use [[SGPR_VAL]]
define amdgpu_kernel void @test_readfirstlane_imm(ptr addrspace(1) %out) #1 {
define amdgpu_kernel void @test_readfirstlane_imm(ptr addrspace(1) %out) {
; CHECK-LABEL: test_readfirstlane_imm:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_mov_b32 s0, 32
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s0
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_endpgm
%readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 32)
call void asm sideeffect "; use $0", "s"(i32 %readfirstlane)
ret void
}

; CHECK-LABEL: {{^}}test_readfirstlane_imm_fold:
; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], 32
; CHECK-NOT: [[VVAL]]
; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VVAL]]
define amdgpu_kernel void @test_readfirstlane_imm_fold(ptr addrspace(1) %out) #1 {
define amdgpu_kernel void @test_readfirstlane_imm_fold(ptr addrspace(1) %out) {
; CHECK-LABEL: test_readfirstlane_imm_fold:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CHECK-NEXT: v_mov_b32_e32 v2, 32
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v0, s0
; CHECK-NEXT: v_mov_b32_e32 v1, s1
; CHECK-NEXT: flat_store_dword v[0:1], v2
; CHECK-NEXT: s_endpgm
%readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 32)
store i32 %readfirstlane, ptr addrspace(1) %out, align 4
ret void
}

; CHECK-LABEL: {{^}}test_readfirstlane_m0:
; CHECK: s_mov_b32 m0, -1
; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], m0
; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VVAL]]
define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) #1 {
define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) {
; CHECK-LABEL: test_readfirstlane_m0:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: s_mov_b32 m0, -1
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_mov_b32_e32 v2, m0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v0, s0
; CHECK-NEXT: v_mov_b32_e32 v1, s1
; CHECK-NEXT: flat_store_dword v[0:1], v2
; CHECK-NEXT: s_endpgm
%m0 = call i32 asm "s_mov_b32 m0, -1", "={m0}"()
%readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %m0)
store i32 %readfirstlane, ptr addrspace(1) %out, align 4
ret void
}

; CHECK-LABEL: {{^}}test_readfirstlane_copy_from_sgpr:
; CHECK: ;;#ASMSTART
; CHECK-NEXT: s_mov_b32 [[SGPR:s[0-9]+]]
; CHECK: ;;#ASMEND
; CHECK-NOT: [[SGPR]]
; CHECK-NOT: readfirstlane
; CHECK: v_mov_b32_e32 [[VCOPY:v[0-9]+]], [[SGPR]]
; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VCOPY]]
define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr(ptr addrspace(1) %out) #1 {
define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr(ptr addrspace(1) %out) {
; CHECK-LABEL: test_readfirstlane_copy_from_sgpr:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: s_mov_b32 s2, 0
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_mov_b32_e32 v2, s2
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v0, s0
; CHECK-NEXT: v_mov_b32_e32 v1, s1
; CHECK-NEXT: flat_store_dword v[0:1], v2
; CHECK-NEXT: s_endpgm
%sgpr = call i32 asm "s_mov_b32 $0, 0", "=s"()
%readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %sgpr)
store i32 %readfirstlane, ptr addrspace(1) %out, align 4
ret void
}

; Make sure this doesn't crash.
; CHECK-LABEL: {{^}}test_readfirstlane_fi:
; CHECK: s_mov_b32 [[FIVAL:s[0-9]]], 0
define amdgpu_kernel void @test_readfirstlane_fi(ptr addrspace(1) %out) #1 {
define amdgpu_kernel void @test_readfirstlane_fi(ptr addrspace(1) %out) {
; CHECK-LABEL: test_readfirstlane_fi:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s0, s0, s9
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_mov_b32 s4, 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s4
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_endpgm
%alloca = alloca i32, addrspace(5)
%int = ptrtoint ptr addrspace(5) %alloca to i32
%readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %int)
call void asm sideeffect "; use $0", "s"(i32 %readfirstlane)
ret void
}

attributes #0 = { nounwind readnone convergent }
attributes #1 = { nounwind }
; readfirstlane2 on an i32 operand. The checks expect a single
; v_readfirstlane_b32 whose SGPR result is consumed by the inline asm "s" use.
define void @test_readfirstlane2_i32(ptr addrspace(1) %out, i32 %src) {
; CHECK-LABEL: test_readfirstlane2_i32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_readfirstlane_b32 s4, v2
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s4
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_setpc_b64 s[30:31]
%x = call i32 @llvm.amdgcn.readfirstlane2.i32(i32 %src)
call void asm sideeffect "; use $0", "s"(i32 %x)
ret void
}

; readfirstlane2 on an i64 operand. The checks expect two v_readfirstlane_b32
; instructions (one per 32-bit register of the input pair) producing the
; SGPR pair s[4:5] used by the inline asm.
define void @test_readfirstlane2_i64(ptr addrspace(1) %out, i64 %src) {
; CHECK-LABEL: test_readfirstlane2_i64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_readfirstlane_b32 s5, v3
; CHECK-NEXT: v_readfirstlane_b32 s4, v2
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[4:5]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_setpc_b64 s[30:31]
%x = call i64 @llvm.amdgcn.readfirstlane2.i64(i64 %src)
call void asm sideeffect "; use $0", "s"(i64 %x)
ret void
}

; readfirstlane2 on a <7 x i32> operand. The checks expect seven
; v_readfirstlane_b32 instructions, one per element, producing s[4:10].
define void @test_readfirstlane2_v7i32(ptr addrspace(1) %out, <7 x i32> %src) {
; CHECK-LABEL: test_readfirstlane2_v7i32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_readfirstlane_b32 s10, v8
; CHECK-NEXT: v_readfirstlane_b32 s9, v7
; CHECK-NEXT: v_readfirstlane_b32 s8, v6
; CHECK-NEXT: v_readfirstlane_b32 s7, v5
; CHECK-NEXT: v_readfirstlane_b32 s6, v4
; CHECK-NEXT: v_readfirstlane_b32 s5, v3
; CHECK-NEXT: v_readfirstlane_b32 s4, v2
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[4:10]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_setpc_b64 s[30:31]
%x = call <7 x i32> @llvm.amdgcn.readfirstlane2.v7i32(<7 x i32> %src)
call void asm sideeffect "; use $0", "s"(<7 x i32> %x)
ret void
}

; readfirstlane2 on a half operand (narrower than 32 bits). The checks expect
; a single v_readfirstlane_b32 on the register holding the value.
define void @test_readfirstlane2_f16(ptr addrspace(1) %out, half %src) {
; CHECK-LABEL: test_readfirstlane2_f16:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_readfirstlane_b32 s4, v2
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s4
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_setpc_b64 s[30:31]
%x = call half @llvm.amdgcn.readfirstlane2.f16(half %src)
call void asm sideeffect "; use $0", "s"(half %x)
ret void
}

; readfirstlane2 on a float operand. The checks expect a single
; v_readfirstlane_b32 whose SGPR result is consumed by the inline asm "s" use.
define void @test_readfirstlane2_float(ptr addrspace(1) %out, float %src) {
; CHECK-LABEL: test_readfirstlane2_float:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_readfirstlane_b32 s4, v2
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s4
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_setpc_b64 s[30:31]
%x = call float @llvm.amdgcn.readfirstlane2.f32(float %src)
call void asm sideeffect "; use $0", "s"(float %x)
ret void
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Check bfloat, vector bfloat/half, float2, some pointers/pointer vectors

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is float2? I have done the others.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

<2 x float>