Skip to content
Closed
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -588,14 +588,17 @@ class AMDGPULowerModuleLDS {
return OrderedKernels;
}

static void partitionVariablesIntoIndirectStrategies(
void partitionVariablesIntoIndirectStrategies(
Module &M, LDSUsesInfoTy const &LDSUsesInfo,
VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly,
DenseSet<GlobalVariable *> &ModuleScopeVariables,
DenseSet<GlobalVariable *> &TableLookupVariables,
DenseSet<GlobalVariable *> &KernelAccessVariables,
DenseSet<GlobalVariable *> &DynamicVariables) {

if (TM.getOptLevel() == CodeGenOptLevel::None)
LoweringKindLoc = LoweringKind::table;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should not override the explicit flag. This also seems like a dubious way to avoid going over the limit; we can't rely on other optimizations without -O0 either. Is it possible to compute the size usage with the different strategies before committing to one?


GlobalVariable *HybridModuleRoot =
LoweringKindLoc != LoweringKind::hybrid
? nullptr
Expand Down Expand Up @@ -1188,13 +1191,17 @@ class AMDGPULowerModuleLDS {
// Allocated at zero, recorded once on construction, not once per
// kernel
Offset += DL.getTypeAllocSize(MaybeModuleScopeStruct->getValueType());
LLVM_DEBUG(dbgs() << "amdgpu-lds-size after ModuleScopeStruct"
<< Offset << "\n");
}

if (AllocateKernelScopeStruct) {
GlobalVariable *KernelStruct = Replacement->second.SGV;
Offset = alignTo(Offset, AMDGPU::getAlign(DL, KernelStruct));
recordLDSAbsoluteAddress(&M, KernelStruct, Offset);
Offset += DL.getTypeAllocSize(KernelStruct->getValueType());
LLVM_DEBUG(dbgs()
<< "amdgpu-lds-size after KernelStruct" << Offset << "\n");
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
<< "amdgpu-lds-size after KernelStruct" << Offset << "\n");
<< "amdgpu-lds-size after KernelStruct" << Offset << '\n');

}

// If there is dynamic allocation, the alignment needed is included in
Expand All @@ -1205,6 +1212,8 @@ class AMDGPULowerModuleLDS {
GlobalVariable *DynamicVariable = KernelToCreatedDynamicLDS[&Func];
Offset = alignTo(Offset, AMDGPU::getAlign(DL, DynamicVariable));
recordLDSAbsoluteAddress(&M, DynamicVariable, Offset);
LLVM_DEBUG(dbgs() << "amdgpu-lds-size after DynamicVariable" << Offset
<< "\n");
}

if (Offset != 0) {
Expand Down
92 changes: 92 additions & 0 deletions llvm/test/CodeGen/AMDGPU/lower-module-lds-force-table-O0.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
; RUN: not llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=null < %s 2>&1 | FileCheck --check-prefix=CHECK %s
; CHECK-NOT: error: <unknown>:0:0: local memory (98304) exceeds limit (65536) in function 'k2'
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CHECK-NOT should be avoided; this should check the actual output. Not erroring is sufficient.


@gA = internal addrspace(3) global [32768 x i8] undef, align 4
@gB = internal addrspace(3) global [32768 x i8] undef, align 4
@gC = internal addrspace(3) global [32768 x i8] undef, align 4
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
@gA = internal addrspace(3) global [32768 x i8] undef, align 4
@gB = internal addrspace(3) global [32768 x i8] undef, align 4
@gC = internal addrspace(3) global [32768 x i8] undef, align 4
@gA = internal addrspace(3) global [32768 x i8] poison, align 4
@gB = internal addrspace(3) global [32768 x i8] poison, align 4
@gC = internal addrspace(3) global [32768 x i8] poison, align 4


; ---- Helpers ----

define internal void @helperA() inlinehint {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remove all the inlinehint, they aren't doing anything

entry:
%p = getelementptr [32768 x i8], ptr addrspace(3) @gA, i32 0, i32 0
store i8 1, ptr addrspace(3) %p
ret void
}

; helperB: stores the byte 2 to element 0 of the addrspace(3) array @gB.
; Any function that can reach this helper therefore uses @gB.
define internal void @helperB() {
entry:
; Zero-offset GEP: %p is the address of the first byte of @gB.
%p = getelementptr [32768 x i8], ptr addrspace(3) @gB, i32 0, i32 0
store i8 2, ptr addrspace(3) %p
ret void
}

; helperC: stores the byte 3 to element 0 of the addrspace(3) array @gC.
; Any function that can reach this helper therefore uses @gC.
define internal void @helperC() {
entry:
; Zero-offset GEP: %p is the address of the first byte of @gC.
%p = getelementptr [32768 x i8], ptr addrspace(3) @gC, i32 0, i32 0
store i8 3, ptr addrspace(3) %p
ret void
}

; ---------------------------------------------------------------------------
; Dispatch: takes an index and calls the appropriate helper.
; If dispatch is NOT inlined, a backend lowering pass that conservatively
; examines call targets may think all helpers (and thus all globals) are
; potentially referenced by every kernel that calls dispatch.
; ---------------------------------------------------------------------------

; dispatch(%idx): compare-and-branch chain that selects one helper call.
;   %idx == 1 -> @helperA
;   %idx == 2 -> @helperB
;   %idx == 3 -> @helperC
;   otherwise -> @helperA
; The function body contains calls to all three helpers, so every caller of
; @dispatch has all of them in its call graph regardless of the constant it
; passes for %idx.
define void @dispatch(i32 %idx) {
entry:
%cmp1 = icmp eq i32 %idx, 1
br i1 %cmp1, label %case1, label %check2

check2:
%cmp2 = icmp eq i32 %idx, 2
br i1 %cmp2, label %case2, label %check3

check3:
%cmp3 = icmp eq i32 %idx, 3
br i1 %cmp3, label %case3, label %default

case1:
call void @helperA()
br label %done

case2:
call void @helperB()
br label %done

case3:
call void @helperC()
br label %done

default:
; fallthrough: call helperA to have a default behaviour
call void @helperA()
br label %done

; Single exit block shared by every case.
done:
ret void
}

; ---- Kernels ----

; Kernel k0: calls @dispatch twice, with the constant selectors 1 and then 2.
; @dispatch itself contains calls to @helperA, @helperB and @helperC, so all
; three helpers are in this kernel's call graph even though only two
; selector values are passed.
define amdgpu_kernel void @k0() {
entry:
call void @dispatch(i32 1)
call void @dispatch(i32 2)
ret void
}

; Kernel k1: same calls as k0 but in the opposite order (selector 2, then 1).
; As with k0, every helper called from @dispatch is in this kernel's call
; graph.
define amdgpu_kernel void @k1() {
entry:
call void @dispatch(i32 2)
call void @dispatch(i32 1)
ret void
}

; Kernel k2: calls @helperC directly and never goes through @dispatch, so
; the only helper in its call graph is @helperC (which stores to @gC).
; This is the kernel named in the RUN line's CHECK-NOT error message.
define amdgpu_kernel void @k2() {
entry:
call void @helperC()
ret void
}