Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions clang/include/clang/Basic/BuiltinsAMDGPU.def
Original file line number Diff line number Diff line change
Expand Up @@ -642,6 +642,9 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_f16_f32, "V2hV2hfUiIb", "nc", "f32-to-f16
// GFX1250+ only builtins.
//===----------------------------------------------------------------------===//

TARGET_BUILTIN(__builtin_amdgcn_flat_prefetch, "vvC*0Ii", "nc", "vmem-pref-insts")
TARGET_BUILTIN(__builtin_amdgcn_global_prefetch, "vvC*1Ii", "nc", "vmem-pref-insts")

TARGET_BUILTIN(__builtin_amdgcn_tensor_load_to_lds, "vV4iV8iV4iV4iIi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_tensor_load_to_lds_d2, "vV4iV8iIi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_tensor_store_from_lds, "vV4iV8iV4iV4iIi", "nc", "gfx1250-insts")
Expand Down
2 changes: 1 addition & 1 deletion clang/test/CodeGenOpenCL/amdgpu-features.cl
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@
// GFX1153: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
// GFX1250: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf16-trans-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+transpose-load-f4f6-insts,+wavefrontsize32
// GFX1250: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf16-trans-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32

// GFX1103-W64: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize64"

Expand Down
19 changes: 19 additions & 0 deletions clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
Original file line number Diff line number Diff line change
Expand Up @@ -440,6 +440,25 @@ void test_permlane16_swap(global uint2* out, uint old, uint src) {
*out = __builtin_amdgcn_permlane16_swap(old, src, false, true);
}

// CHECK-LABEL: @test_prefetch(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FPTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[GPTR_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[FPTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[FPTR_ADDR]] to ptr
// CHECK-NEXT: [[GPTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[GPTR_ADDR]] to ptr
// CHECK-NEXT: store ptr [[FPTR:%.*]], ptr [[FPTR_ADDR_ASCAST]], align 8
// CHECK-NEXT: store ptr addrspace(1) [[GPTR:%.*]], ptr [[GPTR_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[FPTR_ADDR_ASCAST]], align 8
// CHECK-NEXT: call void @llvm.amdgcn.flat.prefetch(ptr [[TMP0]], i32 0)
// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[GPTR_ADDR_ASCAST]], align 8
// CHECK-NEXT: call void @llvm.amdgcn.global.prefetch(ptr addrspace(1) [[TMP1]], i32 8)
// CHECK-NEXT: ret void
//
void test_prefetch(generic void *fptr, global void *gptr) {
__builtin_amdgcn_flat_prefetch(fptr, 0);
__builtin_amdgcn_global_prefetch(gptr, 8);
}

// CHECK-LABEL: @test_cvt_f32_fp8_e5m3(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
Expand Down
7 changes: 6 additions & 1 deletion clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx1250 -verify -S -o - %s
// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-- -target-cpu gfx1250 -verify -S -o - %s

typedef int v4i __attribute__((ext_vector_type(4)));
typedef int v8i __attribute__((ext_vector_type(8)));
Expand Down Expand Up @@ -36,6 +36,11 @@ void test_amdgcn_tensor_load_store(v4i sg0, v8i sg1, v4i sg2, v4i sg3, int cpol)
__builtin_amdgcn_tensor_store_from_lds_d2(sg0, sg1, cpol); // expected-error {{'__builtin_amdgcn_tensor_store_from_lds_d2' must be a constant integer}}
}

void test_prefetch(generic void *fptr, global void *gptr, int cpol) {
__builtin_amdgcn_flat_prefetch(fptr, cpol); // expected-error {{'__builtin_amdgcn_flat_prefetch' must be a constant integer}}
__builtin_amdgcn_global_prefetch(gptr, cpol); // expected-error {{'__builtin_amdgcn_global_prefetch' must be a constant integer}}
}

void test_cvt_f32_fp8_e5m3(global int* out, int a)
{
*out = __builtin_amdgcn_cvt_f32_fp8_e5m3(a, a); // expected-error {{'__builtin_amdgcn_cvt_f32_fp8_e5m3' must be a constant integer}}
Expand Down
18 changes: 18 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -3045,6 +3045,24 @@ def int_amdgcn_ds_bpermute_fi_b32 :
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

def int_amdgcn_flat_prefetch : ClangBuiltin<"__builtin_amdgcn_flat_prefetch">,
Intrinsic<[],
[llvm_ptr_ty, // Pointer
llvm_i32_ty], // cachepolicy(imm), bits [0-2] = th, bits [3-4] = scope
[IntrInaccessibleMemOrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>,
IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>],
"", [SDNPMemOperand]
>;

def int_amdgcn_global_prefetch : ClangBuiltin<"__builtin_amdgcn_global_prefetch">,
Intrinsic<[],
[LLVMQualPointerType<1>, // Pointer
llvm_i32_ty], // cachepolicy(imm), bits [0-2] = th, bits [3-4] = scope
[IntrInaccessibleMemOrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>,
IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>],
"", [SDNPMemOperand]
>;

//===----------------------------------------------------------------------===//
// Deep learning intrinsics.
//===----------------------------------------------------------------------===//
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5437,6 +5437,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
break;
}
case Intrinsic::amdgcn_flat_prefetch:
case Intrinsic::amdgcn_global_prefetch:
return getDefaultMappingVOP(MI);
default:
return getInvalidInstructionMapping();
}
Expand Down
19 changes: 19 additions & 0 deletions llvm/lib/Target/AMDGPU/FLATInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -2174,6 +2174,25 @@ defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SHORT_D16, load_d16_lo_private, v2f

} // End OtherPredicates = [HasFlatScratchInsts,EnableFlatScratch]

multiclass FlatIntrPrefetchPats<string inst, SDPatternOperator intr> {
def : GCNPat <
(intr (FlatOffset i64:$vaddr, i32:$offset), timm:$cpol),
(!cast<FLAT_Pseudo>(inst) $vaddr, $offset, $cpol)
>;

def : GCNPat <
(intr (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), timm:$cpol),
(!cast<FLAT_Pseudo>(inst#"_SADDR") $saddr, $voffset, $offset, $cpol)> {
let AddedComplexity = 11;
}
}

let SubtargetPredicate = HasVmemPrefInsts in {
// Patterns for target intrinsics
defm : FlatIntrPrefetchPats<"FLAT_PREFETCH_B8", int_amdgcn_flat_prefetch>;
defm : FlatIntrPrefetchPats<"GLOBAL_PREFETCH_B8", int_amdgcn_global_prefetch>;
} // End SubtargetPredicate = HasVmemPrefInsts

//===----------------------------------------------------------------------===//
// Target
//===----------------------------------------------------------------------===//
Expand Down
4 changes: 3 additions & 1 deletion llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1550,7 +1550,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
return true;
}
case Intrinsic::amdgcn_s_prefetch_data: {
case Intrinsic::amdgcn_s_prefetch_data:
case Intrinsic::amdgcn_flat_prefetch:
case Intrinsic::amdgcn_global_prefetch: {
Info.opc = ISD::INTRINSIC_VOID;
Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
Info.ptrVal = CI.getArgOperand(0);
Expand Down
8 changes: 3 additions & 5 deletions llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2108,8 +2108,9 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
assert(TII->isFLAT(MI));

// All flat instructions use the VMEM counter.
assert(TII->usesVM_CNT(MI));
// All flat instructions use the VMEM counter except prefetch.
if (!TII->usesVM_CNT(MI))
return false;

// If there are no memory operands then conservatively assume the flat
// operation may access VMEM.
Expand Down Expand Up @@ -2295,9 +2296,6 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
}

// A Flat memory operation must access at least one address space.
assert(FlatASCount);

// This is a flat memory operation that access both VMEM and LDS, so note it
// - it will require that both the VM and LGKM be flushed to zero if it is
// pending when a VM or LGKM dependency occurs.
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/TargetParser/TargetParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -451,6 +451,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
Features["permlane16-swap"] = true;
Features["ashr-pk-insts"] = true;
Features["atomic-buffer-pk-add-bf16-inst"] = true;
Features["vmem-pref-insts"] = true;
Features["atomic-fadd-rtn-insts"] = true;
Features["atomic-buffer-global-pk-add-f16-insts"] = true;
Features["atomic-flat-pk-add-16-insts"] = true;
Expand Down
33 changes: 33 additions & 0 deletions llvm/test/CodeGen/AMDGPU/hard-clauses-gfx1250.mir
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass si-insert-hard-clauses %s -o - | FileCheck %s -check-prefixes=GFX12
# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass si-insert-hard-clauses %s -o - | FileCheck %s -check-prefixes=GFX12

---
name: flat_prefetch_flat_load
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0_vgpr1
; GFX12-LABEL: name: flat_prefetch_flat_load
; GFX12: liveins: $vgpr0_vgpr1
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: FLAT_PREFETCH_B8 $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
; GFX12-NEXT: $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
FLAT_PREFETCH_B8 $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
$vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
...

---
name: global_prefetch_flat_load
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0_vgpr1
; GFX12-LABEL: name: global_prefetch_flat_load
; GFX12: liveins: $vgpr0_vgpr1
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: GLOBAL_PREFETCH_B8 $vgpr0_vgpr1, 0, 0, implicit $exec
; GFX12-NEXT: $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
GLOBAL_PREFETCH_B8 $vgpr0_vgpr1, 0, 0, implicit $exec
$vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
...
100 changes: 100 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.flat.prefetch.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN %s

declare void @llvm.amdgcn.flat.prefetch(ptr %ptr, i32 %col)

define amdgpu_ps void @flat_prefetch(ptr %ptr) {
; GCN-LABEL: flat_prefetch:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: flat_prefetch_b8 v[0:1]
; GCN-NEXT: s_endpgm
entry:
tail call void @llvm.amdgcn.flat.prefetch(ptr %ptr, i32 0)
ret void
}

define amdgpu_ps void @flat_prefetch_sgpr(ptr inreg %ptr) {
; GCN-LABEL: flat_prefetch_sgpr:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: flat_prefetch_b8 v0, s[0:1]
; GCN-NEXT: s_endpgm
entry:
tail call void @llvm.amdgcn.flat.prefetch(ptr %ptr, i32 0)
ret void
}

define amdgpu_ps void @flat_prefetch_offset(ptr %ptr) {
; GCN-LABEL: flat_prefetch_offset:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: flat_prefetch_b8 v[0:1] offset:512
; GCN-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %ptr, i32 128
tail call void @llvm.amdgcn.flat.prefetch(ptr %gep, i32 0)
ret void
}

define amdgpu_ps void @flat_prefetch_sgpr_voffset(ptr inreg %ptr, i32 %offset) {
; GCN-LABEL: flat_prefetch_sgpr_voffset:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: flat_prefetch_b8 v0, s[0:1]
; GCN-NEXT: s_endpgm
entry:
%gep = getelementptr i8, ptr %ptr, i32 %offset
tail call void @llvm.amdgcn.flat.prefetch(ptr %gep, i32 0)
ret void
}

define amdgpu_ps void @flat_prefetch_sgpr_voffset_offset(ptr inreg %ptr, i32 %offset) {
; GCN-LABEL: flat_prefetch_sgpr_voffset_offset:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: flat_prefetch_b8 v0, s[0:1] offset:128
; GCN-NEXT: s_endpgm
entry:
%gep1 = getelementptr i8, ptr %ptr, i32 %offset
%gep2 = getelementptr i8, ptr %gep1, i32 128
tail call void @llvm.amdgcn.flat.prefetch(ptr %gep2, i32 0)
ret void
}

define amdgpu_ps void @flat_prefetch_se(ptr %ptr) {
; GCN-LABEL: flat_prefetch_se:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: flat_prefetch_b8 v[0:1] scope:SCOPE_SE
; GCN-NEXT: s_endpgm
entry:
tail call void @llvm.amdgcn.flat.prefetch(ptr %ptr, i32 8)
ret void
}

define amdgpu_ps void @flat_prefetch_se_nt(ptr %ptr) {
; GCN-LABEL: flat_prefetch_se_nt:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: flat_prefetch_b8 v[0:1] th:TH_LOAD_NT scope:SCOPE_SE
; GCN-NEXT: s_endpgm
entry:
tail call void @llvm.amdgcn.flat.prefetch(ptr %ptr, i32 9)
ret void
}

define amdgpu_ps void @flat_prefetch_dev_ht(ptr %ptr) {
; GCN-LABEL: flat_prefetch_dev_ht:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: flat_prefetch_b8 v[0:1] th:TH_LOAD_HT scope:SCOPE_DEV
; GCN-NEXT: s_endpgm
entry:
tail call void @llvm.amdgcn.flat.prefetch(ptr %ptr, i32 18)
ret void
}

define amdgpu_ps void @flat_prefetch_sys_lu(ptr %ptr) {
; GCN-LABEL: flat_prefetch_sys_lu:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: flat_prefetch_b8 v[0:1] th:TH_LOAD_BYPASS scope:SCOPE_SYS
; GCN-NEXT: s_endpgm
entry:
tail call void @llvm.amdgcn.flat.prefetch(ptr %ptr, i32 27)
ret void
}
Loading