Skip to content

Commit 9deb7f6

Browse files
authored
[AMDGPU] gfx1250 vmem prefetch target intrinsics and builtins (#150466)
1 parent 2cb6be2 commit 9deb7f6

File tree

13 files changed

+309
-8
lines changed

13 files changed

+309
-8
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -642,6 +642,9 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_f16_f32, "V2hV2hfUiIb", "nc", "f32-to-f16
642642
// GFX1250+ only builtins.
643643
//===----------------------------------------------------------------------===//
644644

645+
// VMEM prefetch builtins, gated on the "vmem-pref-insts" target feature.
// Both return void and take (pointer, cache-policy). The flat form takes a
// generic pointer and the global form a global (addrspace(1)) pointer. The
// cache-policy argument must be an integer constant expression.
TARGET_BUILTIN(__builtin_amdgcn_flat_prefetch, "vvC*0Ii", "nc", "vmem-pref-insts")
646+
TARGET_BUILTIN(__builtin_amdgcn_global_prefetch, "vvC*1Ii", "nc", "vmem-pref-insts")
647+
645648
TARGET_BUILTIN(__builtin_amdgcn_tensor_load_to_lds, "vV4iV8iV4iV4iIi", "nc", "gfx1250-insts")
646649
TARGET_BUILTIN(__builtin_amdgcn_tensor_load_to_lds_d2, "vV4iV8iIi", "nc", "gfx1250-insts")
647650
TARGET_BUILTIN(__builtin_amdgcn_tensor_store_from_lds, "vV4iV8iV4iV4iIi", "nc", "gfx1250-insts")

clang/test/CodeGenOpenCL/amdgpu-features.cl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@
108108
// GFX1153: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
109109
// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
110110
// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
111-
// GFX1250: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf16-trans-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+transpose-load-f4f6-insts,+wavefrontsize32
111+
// GFX1250: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf16-trans-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32"
112112

113113
// GFX1103-W64: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize64"
114114

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -440,6 +440,25 @@ void test_permlane16_swap(global uint2* out, uint old, uint src) {
440440
*out = __builtin_amdgcn_permlane16_swap(old, src, false, true);
441441
}
442442

443+
// CHECK-LABEL: @test_prefetch(
444+
// CHECK-NEXT: entry:
445+
// CHECK-NEXT: [[FPTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
446+
// CHECK-NEXT: [[GPTR_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
447+
// CHECK-NEXT: [[FPTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[FPTR_ADDR]] to ptr
448+
// CHECK-NEXT: [[GPTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[GPTR_ADDR]] to ptr
449+
// CHECK-NEXT: store ptr [[FPTR:%.*]], ptr [[FPTR_ADDR_ASCAST]], align 8
450+
// CHECK-NEXT: store ptr addrspace(1) [[GPTR:%.*]], ptr [[GPTR_ADDR_ASCAST]], align 8
451+
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[FPTR_ADDR_ASCAST]], align 8
452+
// CHECK-NEXT: call void @llvm.amdgcn.flat.prefetch(ptr [[TMP0]], i32 0)
453+
// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[GPTR_ADDR_ASCAST]], align 8
454+
// CHECK-NEXT: call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) [[TMP1]], i32 8)
455+
// CHECK-NEXT: ret void
456+
//
457+
void test_prefetch(generic void *fptr, global void *gptr) {
458+
// Flat prefetch through a generic pointer with cache-policy immediate 0.
__builtin_amdgcn_flat_prefetch(fptr, 0);
459+
// Global prefetch through an addrspace(1) pointer with cache-policy
// immediate 8; the immediate is passed through unchanged as the i32 operand.
__builtin_amdgcn_global_prefetch(gptr, 8);
460+
}
461+
443462
// CHECK-LABEL: @test_cvt_f32_fp8_e5m3(
444463
// CHECK-NEXT: entry:
445464
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)

clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
// REQUIRES: amdgpu-registered-target
2-
// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx1250 -verify -S -o - %s
2+
// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-- -target-cpu gfx1250 -verify -S -o - %s
33

44
typedef int v4i __attribute__((ext_vector_type(4)));
55
typedef int v8i __attribute__((ext_vector_type(8)));
@@ -36,6 +36,11 @@ void test_amdgcn_tensor_load_store(v4i sg0, v8i sg1, v4i sg2, v4i sg3, int cpol)
3636
__builtin_amdgcn_tensor_store_from_lds_d2(sg0, sg1, cpol); // expected-error {{'__builtin_amdgcn_tensor_store_from_lds_d2' must be a constant integer}}
3737
}
3838

39+
// Negative test: the cache-policy argument of both prefetch builtins has to be
// a compile-time constant, so passing the runtime value `cpol` is diagnosed.
void test_prefetch(generic void *fptr, global void *gptr, int cpol) {
40+
__builtin_amdgcn_flat_prefetch(fptr, cpol); // expected-error {{'__builtin_amdgcn_flat_prefetch' must be a constant integer}}
41+
__builtin_amdgcn_global_prefetch(gptr, cpol); // expected-error {{'__builtin_amdgcn_global_prefetch' must be a constant integer}}
42+
}
43+
3944
void test_cvt_f32_fp8_e5m3(global int* out, int a)
4045
{
4146
*out = __builtin_amdgcn_cvt_f32_fp8_e5m3(a, a); // expected-error {{'__builtin_amdgcn_cvt_f32_fp8_e5m3' must be a constant integer}}

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3045,6 +3045,24 @@ def int_amdgcn_ds_bpermute_fi_b32 :
30453045
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
30463046
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
30473047

3048+
// llvm.amdgcn.flat.prefetch: prefetch hint on a flat (generic address space)
// pointer. Produces no result. Exposed to Clang as
// __builtin_amdgcn_flat_prefetch. Operand 1 is an immediate cache policy.
def int_amdgcn_flat_prefetch : ClangBuiltin<"__builtin_amdgcn_flat_prefetch">,
3049+
Intrinsic<[],
3050+
[llvm_ptr_ty, // Pointer
3051+
llvm_i32_ty], // cachepolicy(imm), bits [0-2] = th, bits [3-4] = scope
3052+
[IntrInaccessibleMemOrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>,
3053+
IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>],
3054+
"", [SDNPMemOperand]
3055+
>;
3056+
3057+
// llvm.amdgcn.global.prefetch: prefetch hint on a global (addrspace(1))
// pointer. Produces no result. Exposed to Clang as
// __builtin_amdgcn_global_prefetch. Operand 1 is an immediate cache policy.
def int_amdgcn_global_prefetch : ClangBuiltin<"__builtin_amdgcn_global_prefetch">,
3058+
Intrinsic<[],
3059+
[LLVMQualPointerType<1>, // Pointer
3060+
llvm_i32_ty], // cachepolicy(imm), bits [0-2] = th, bits [3-4] = scope
3061+
[IntrInaccessibleMemOrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>,
3062+
IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>],
3063+
"", [SDNPMemOperand]
3064+
>;
3065+
30483066
//===----------------------------------------------------------------------===//
30493067
// Deep learning intrinsics.
30503068
//===----------------------------------------------------------------------===//

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5437,6 +5437,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
54375437
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
54385438
break;
54395439
}
5440+
case Intrinsic::amdgcn_flat_prefetch:
5441+
case Intrinsic::amdgcn_global_prefetch:
5442+
return getDefaultMappingVOP(MI);
54405443
default:
54415444
return getInvalidInstructionMapping();
54425445
}

llvm/lib/Target/AMDGPU/FLATInstructions.td

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2174,6 +2174,25 @@ defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SHORT_D16, load_d16_lo_private, v2f
21742174

21752175
} // End OtherPredicates = [HasFlatScratchInsts,EnableFlatScratch]
21762176

2177+
// Selection patterns for a prefetch intrinsic.
//   inst - name of the FLAT_Pseudo to select; the SGPR-base variant is looked
//          up as inst#"_SADDR".
//   intr - the intrinsic operator being matched; its second operand is the
//          target-immediate cache policy ($cpol), forwarded unchanged.
multiclass FlatIntrPrefetchPats<string inst, SDPatternOperator intr> {
2178+
// Address matched by FlatOffset: 64-bit vaddr plus an immediate offset.
def : GCNPat <
2179+
(intr (FlatOffset i64:$vaddr, i32:$offset), timm:$cpol),
2180+
(!cast<FLAT_Pseudo>(inst) $vaddr, $offset, $cpol)
2181+
>;
2182+
2183+
// Address matched by GlobalSAddr: 64-bit SGPR base, 32-bit VGPR offset and an
// immediate offset; selects the _SADDR form of the pseudo.
def : GCNPat <
2184+
(intr (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), timm:$cpol),
2185+
(!cast<FLAT_Pseudo>(inst#"_SADDR") $saddr, $voffset, $offset, $cpol)> {
2186+
// NOTE(review): presumably raised so this pattern is tried ahead of the
// vaddr-only pattern above when both could match - confirm.
let AddedComplexity = 11;
2187+
}
2188+
}
2189+
2190+
let SubtargetPredicate = HasVmemPrefInsts in {
2191+
// Patterns for the flat/global prefetch target intrinsics. Each defm
// instantiates both the vaddr pattern and the _SADDR pattern for the named
// pseudo.
2192+
defm : FlatIntrPrefetchPats<"FLAT_PREFETCH_B8", int_amdgcn_flat_prefetch>;
2193+
defm : FlatIntrPrefetchPats<"GLOBAL_PREFETCH_B8", int_amdgcn_global_prefetch>;
2194+
} // End SubtargetPredicate = HasVmemPrefInsts
2195+
21772196
//===----------------------------------------------------------------------===//
21782197
// Target
21792198
//===----------------------------------------------------------------------===//

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1550,7 +1550,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
15501550
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
15511551
return true;
15521552
}
1553-
case Intrinsic::amdgcn_s_prefetch_data: {
1553+
case Intrinsic::amdgcn_s_prefetch_data:
1554+
case Intrinsic::amdgcn_flat_prefetch:
1555+
case Intrinsic::amdgcn_global_prefetch: {
15541556
Info.opc = ISD::INTRINSIC_VOID;
15551557
Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
15561558
Info.ptrVal = CI.getArgOperand(0);

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2108,8 +2108,9 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
21082108
bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
21092109
assert(TII->isFLAT(MI));
21102110

2111-
// All flat instructions use the VMEM counter.
2112-
assert(TII->usesVM_CNT(MI));
2111+
// All flat instructions use the VMEM counter except prefetch.
2112+
if (!TII->usesVM_CNT(MI))
2113+
return false;
21132114

21142115
// If there are no memory operands then conservatively assume the flat
21152116
// operation may access VMEM.
@@ -2295,9 +2296,6 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
22952296
ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
22962297
}
22972298

2298-
// A Flat memory operation must access at least one address space.
2299-
assert(FlatASCount);
2300-
23012299
// This is a flat memory operation that access both VMEM and LDS, so note it
23022300
// - it will require that both the VM and LGKM be flushed to zero if it is
23032301
// pending when a VM or LGKM dependency occurs.

llvm/lib/TargetParser/TargetParser.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -451,6 +451,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
451451
Features["permlane16-swap"] = true;
452452
Features["ashr-pk-insts"] = true;
453453
Features["atomic-buffer-pk-add-bf16-inst"] = true;
454+
Features["vmem-pref-insts"] = true;
454455
Features["atomic-fadd-rtn-insts"] = true;
455456
Features["atomic-buffer-global-pk-add-f16-insts"] = true;
456457
Features["atomic-flat-pk-add-16-insts"] = true;

0 commit comments

Comments
 (0)