Skip to content

Commit 627614e

Browse files
committed
[AMDGPU] Add type-generic llvm.amdgcn.readfirstlane2 intrinsic
1 parent fff34d4 commit 627614e

File tree

9 files changed

+116
-3
lines changed

9 files changed

+116
-3
lines changed

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2206,11 +2206,18 @@ class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
22062206
def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
22072207
def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;
22082208

2209+
// i32 llvm.amdgcn.readfirstlane(i32)
22092210
def int_amdgcn_readfirstlane :
22102211
ClangBuiltin<"__builtin_amdgcn_readfirstlane">,
22112212
Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
22122213
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
22132214

2215+
// ty llvm.amdgcn.readfirstlane2(ty)
2216+
// A type-generic version of readfirstlane: the result is overloaded on any type and the single operand must have that same type (LLVMMatchType<0>).
2217+
def int_amdgcn_readfirstlane2 :
2218+
Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
2219+
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
2220+
22142221
// The lane argument must be uniform across the currently active threads of the
22152222
// current wave. Otherwise, the result is undefined.
22162223
def int_amdgcn_readlane :

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5453,6 +5453,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
54535453
NODE_NAME_CASE(LDS)
54545454
NODE_NAME_CASE(FPTRUNC_ROUND_UPWARD)
54555455
NODE_NAME_CASE(FPTRUNC_ROUND_DOWNWARD)
5456+
NODE_NAME_CASE(READFIRSTLANE)
54565457
NODE_NAME_CASE(DUMMY_CHAIN)
54575458
case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
54585459
NODE_NAME_CASE(LOAD_D16_HI)

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -541,6 +541,8 @@ enum NodeType : unsigned {
541541
FPTRUNC_ROUND_UPWARD,
542542
FPTRUNC_ROUND_DOWNWARD,
543543

544+
READFIRSTLANE,
545+
544546
DUMMY_CHAIN,
545547
FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
546548
LOAD_D16_HI,

llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -342,6 +342,8 @@ def AMDGPUfdot2_impl : SDNode<"AMDGPUISD::FDOT2",
342342

343343
def AMDGPUperm_impl : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>;
344344

345+
def AMDGPUreadfirstlane_impl : SDNode<"AMDGPUISD::READFIRSTLANE", SDTIntUnaryOp>; // DAG node for AMDGPUISD::READFIRSTLANE, typed as an integer unary op.
346+
345347
// SI+ export
346348
def AMDGPUExportOp : SDTypeProfile<0, 8, [
347349
SDTCisInt<0>, // i8 tgt
@@ -504,3 +506,7 @@ def AMDGPUdiv_fmas : PatFrags<(ops node:$src0, node:$src1, node:$src2, node:$vcc
504506
def AMDGPUperm : PatFrags<(ops node:$src0, node:$src1, node:$src2),
505507
[(int_amdgcn_perm node:$src0, node:$src1, node:$src2),
506508
(AMDGPUperm_impl node:$src0, node:$src1, node:$src2)]>;
509+
510+
def AMDGPUreadfirstlane : PatFrags<(ops node:$src), // Matches either the int_amdgcn_readfirstlane intrinsic or the AMDGPUreadfirstlane_impl node.
511+
[(int_amdgcn_readfirstlane node:$src),
512+
(AMDGPUreadfirstlane_impl node:$src)]>;

llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -393,6 +393,7 @@ def UniformIntrinsics : GenericTable {
393393
}
394394

395395
def : AlwaysUniform<int_amdgcn_readfirstlane>;
396+
def : AlwaysUniform<int_amdgcn_readfirstlane2>;
396397
def : AlwaysUniform<int_amdgcn_readlane>;
397398
def : AlwaysUniform<int_amdgcn_icmp>;
398399
def : AlwaysUniform<int_amdgcn_fcmp>;

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -238,7 +238,6 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
238238
setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Promote);
239239
AddPromotedToType(ISD::BUILD_VECTOR, MVT::v2bf16, MVT::v2i16);
240240
}
241-
242241
setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
243242
setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
244243
setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
@@ -8452,6 +8451,26 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
84528451
}
84538452
case Intrinsic::amdgcn_addrspacecast_nonnull:
84548453
return lowerADDRSPACECAST(Op, DAG);
8454+
case Intrinsic::amdgcn_readfirstlane2: // Lower the type-generic intrinsic onto AMDGPUISD::READFIRSTLANE nodes.
8455+
if (VT.getSizeInBits() <= 32) { // At most 32 bits: bitcast to an integer, any-extend to i32, read, then truncate and bitcast back to VT.
8456+
MVT IntVT = MVT::getIntegerVT(VT.getSizeInBits()); // NOTE(review): presumably assumes the bit width has a matching integer MVT (e.g. not 24) - confirm.
8457+
return DAG.getBitcast(
8458+
VT, DAG.getAnyExtOrTrunc(
8459+
DAG.getNode(AMDGPUISD::READFIRSTLANE, DL, MVT::i32,
8460+
DAG.getAnyExtOrTrunc(
8461+
DAG.getBitcast(IntVT, Op.getOperand(1)), DL,
8462+
MVT::i32)),
8463+
DL, IntVT));
8464+
}
8465+
if (VT.getSizeInBits() % 32 == 0) { // Multiple of 32 bits: reinterpret as a vector of i32 and unroll the vector node into per-element operations.
8466+
MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
8467+
return DAG.getBitcast(
8468+
VT, DAG.UnrollVectorOp(
8469+
DAG.getNode(AMDGPUISD::READFIRSTLANE, DL, VecVT,
8470+
DAG.getBitcast(VecVT, Op.getOperand(1)))
8471+
.getNode()));
8472+
}
8473+
return SDValue(); // Wider sizes that are not a multiple of 32 bits are not lowered here. NOTE(review): confirm how the caller treats the empty SDValue.
84558474
default:
84568475
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
84578476
AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3405,7 +3405,7 @@ def : GCNPat<
34053405
// FIXME: Should also do this for readlane, but tablegen crashes on
34063406
// the ignored src1.
34073407
def : GCNPat<
3408-
(int_amdgcn_readfirstlane (i32 imm:$src)),
3408+
(AMDGPUreadfirstlane (i32 imm:$src)),
34093409
(S_MOV_B32 SReg_32:$src)
34103410
>;
34113411

llvm/lib/Target/AMDGPU/VOP1Instructions.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -243,7 +243,7 @@ def VOP_READFIRSTLANE : VOPProfile <[i32, i32, untyped, untyped]> {
243243
// FIXME: Specify SchedRW for READFIRSTLANE_B32
244244
// TODO: There is VOP3 encoding also
245245
def V_READFIRSTLANE_B32 : VOP1_Pseudo <"v_readfirstlane_b32", VOP_READFIRSTLANE,
246-
getVOP1Pat<int_amdgcn_readfirstlane,
246+
getVOP1Pat<AMDGPUreadfirstlane,
247247
VOP_READFIRSTLANE>.ret, 1> {
248248
let isConvergent = 1;
249249
}

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,3 +97,80 @@ define amdgpu_kernel void @test_readfirstlane_fi(ptr addrspace(1) %out) {
9797
call void asm sideeffect "; use $0", "s"(i32 %readfirstlane)
9898
ret void
9999
}
100+
101+
define void @test_readfirstlane2_i32(ptr addrspace(1) %out, i32 %src) { ; i32 operand: a single v_readfirstlane_b32 is expected.
102+
; CHECK-LABEL: test_readfirstlane2_i32:
103+
; CHECK: ; %bb.0:
104+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
105+
; CHECK-NEXT: v_readfirstlane_b32 s4, v2
106+
; CHECK-NEXT: ;;#ASMSTART
107+
; CHECK-NEXT: ; use s4
108+
; CHECK-NEXT: ;;#ASMEND
109+
; CHECK-NEXT: s_setpc_b64 s[30:31]
110+
%x = call i32 @llvm.amdgcn.readfirstlane2.i32(i32 %src)
111+
call void asm sideeffect "; use $0", "s"(i32 %x)
112+
ret void
113+
}
114+
115+
define void @test_readfirstlane2_i64(ptr addrspace(1) %out, i64 %src) { ; i64 operand: two v_readfirstlane_b32 are expected, one per 32-bit half.
116+
; CHECK-LABEL: test_readfirstlane2_i64:
117+
; CHECK: ; %bb.0:
118+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
119+
; CHECK-NEXT: v_readfirstlane_b32 s5, v3
120+
; CHECK-NEXT: v_readfirstlane_b32 s4, v2
121+
; CHECK-NEXT: ;;#ASMSTART
122+
; CHECK-NEXT: ; use s[4:5]
123+
; CHECK-NEXT: ;;#ASMEND
124+
; CHECK-NEXT: s_setpc_b64 s[30:31]
125+
%x = call i64 @llvm.amdgcn.readfirstlane2.i64(i64 %src)
126+
call void asm sideeffect "; use $0", "s"(i64 %x)
127+
ret void
128+
}
129+
130+
define void @test_readfirstlane2_v7i32(ptr addrspace(1) %out, <7 x i32> %src) { ; <7 x i32> operand: seven v_readfirstlane_b32 are expected, one per element.
131+
; CHECK-LABEL: test_readfirstlane2_v7i32:
132+
; CHECK: ; %bb.0:
133+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
134+
; CHECK-NEXT: v_readfirstlane_b32 s10, v8
135+
; CHECK-NEXT: v_readfirstlane_b32 s9, v7
136+
; CHECK-NEXT: v_readfirstlane_b32 s8, v6
137+
; CHECK-NEXT: v_readfirstlane_b32 s7, v5
138+
; CHECK-NEXT: v_readfirstlane_b32 s6, v4
139+
; CHECK-NEXT: v_readfirstlane_b32 s5, v3
140+
; CHECK-NEXT: v_readfirstlane_b32 s4, v2
141+
; CHECK-NEXT: ;;#ASMSTART
142+
; CHECK-NEXT: ; use s[4:10]
143+
; CHECK-NEXT: ;;#ASMEND
144+
; CHECK-NEXT: s_setpc_b64 s[30:31]
145+
%x = call <7 x i32> @llvm.amdgcn.readfirstlane2.v7i32(<7 x i32> %src)
146+
call void asm sideeffect "; use $0", "s"(<7 x i32> %x)
147+
ret void
148+
}
149+
150+
define void @test_readfirstlane2_f16(ptr addrspace(1) %out, half %src) { ; half operand: a single v_readfirstlane_b32 is expected.
151+
; CHECK-LABEL: test_readfirstlane2_f16:
152+
; CHECK: ; %bb.0:
153+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
154+
; CHECK-NEXT: v_readfirstlane_b32 s4, v2
155+
; CHECK-NEXT: ;;#ASMSTART
156+
; CHECK-NEXT: ; use s4
157+
; CHECK-NEXT: ;;#ASMEND
158+
; CHECK-NEXT: s_setpc_b64 s[30:31]
159+
%x = call half @llvm.amdgcn.readfirstlane2.f16(half %src)
160+
call void asm sideeffect "; use $0", "s"(half %x)
161+
ret void
162+
}
163+
164+
define void @test_readfirstlane2_float(ptr addrspace(1) %out, float %src) { ; float operand: a single v_readfirstlane_b32 is expected.
165+
; CHECK-LABEL: test_readfirstlane2_float:
166+
; CHECK: ; %bb.0:
167+
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
168+
; CHECK-NEXT: v_readfirstlane_b32 s4, v2
169+
; CHECK-NEXT: ;;#ASMSTART
170+
; CHECK-NEXT: ; use s4
171+
; CHECK-NEXT: ;;#ASMEND
172+
; CHECK-NEXT: s_setpc_b64 s[30:31]
173+
%x = call float @llvm.amdgcn.readfirstlane2.f32(float %src)
174+
call void asm sideeffect "; use $0", "s"(float %x)
175+
ret void
176+
}

0 commit comments

Comments
 (0)