Skip to content

Commit 013790c

Browse files
committed
Tweak handling potential AS mismatches.
1 parent 99e03a2 commit 013790c

File tree

4 files changed

+34
-29
lines changed

4 files changed

+34
-29
lines changed

clang/lib/CodeGen/CGCall.cpp

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5168,7 +5168,7 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
51685168
// alloca was AS casted to the default as, so we ensure the cast is
51695169
// stripped before binding to the sret arg, which is in the allocaAS.
51705170
IRCallArgs[IRFunctionArgs.getSRetArgNo()] =
5171-
getAsNaturalPointerTo(SRetPtr, RetTy)->stripPointerCasts();
5171+
getAsNaturalPointerTo(SRetPtr, RetTy);
51725172
} else if (RetAI.isInAlloca()) {
51735173
Address Addr =
51745174
Builder.CreateStructGEP(ArgMemory, RetAI.getInAllocaFieldIndex());
@@ -5390,18 +5390,20 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
53905390
V->getType()->isIntegerTy())
53915391
V = Builder.CreateZExt(V, ArgInfo.getCoerceToType());
53925392

5393-
// If the argument doesn't match, we are either trying to pass an
5394-
// alloca-ed sret argument directly, and the alloca AS does not match
5395-
// the default AS, case in which we AS cast it, or we have a trivial
5396-
// type mismatch, and thus perform a bitcast to coerce it.
5393+
// The only plausible mismatch here would be for pointer address spaces,
5394+
// which can happen e.g. when passing a sret arg that is in the AllocaAS
5395+
// to a function that takes a pointer to an argument in the DefaultAS.
5396+
// We assume that the target has a reasonable mapping for the DefaultAS
5397+
// (it can be cast to from incoming specific ASes), and insert an AS
5398+
// cast to address the mismatch.
53975399
if (FirstIRArg < IRFuncTy->getNumParams() &&
53985400
V->getType() != IRFuncTy->getParamType(FirstIRArg)) {
5399-
auto IRTy = IRFuncTy->getParamType(FirstIRArg);
5400-
auto MaybeSRetArg = dyn_cast_or_null<llvm::Argument>(V);
5401-
if (MaybeSRetArg && MaybeSRetArg->hasStructRetAttr())
5402-
V = Builder.CreateAddrSpaceCast(V, IRTy);
5403-
else
5404-
V = Builder.CreateBitCast(V, IRTy);
5401+
assert(V->getType()->isPointerTy() && "Only pointers can mismatch!");
5402+
auto FormalAS =
5403+
CallInfo.arguments()[ArgNo].type.getQualifiers().getAddressSpace();
5404+
auto ActualAS = I->Ty.getAddressSpace();
5405+
V = getTargetHooks().performAddrSpaceCast(
5406+
*this, V, ActualAS, FormalAS, IRFuncTy->getParamType(FirstIRArg));
54055407
}
54065408

54075409
if (ArgHasMaybeUndefAttr)

clang/lib/CodeGen/CGExprAgg.cpp

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -296,18 +296,25 @@ void AggExprEmitter::withReturnValueSlot(
296296
(RequiresDestruction && Dest.isIgnored());
297297

298298
Address RetAddr = Address::invalid();
299-
RawAddress RetAllocaAddr = RawAddress::invalid();
300299

301300
EHScopeStack::stable_iterator LifetimeEndBlock;
302301
llvm::Value *LifetimeSizePtr = nullptr;
303302
llvm::IntrinsicInst *LifetimeStartInst = nullptr;
304303
if (!UseTemp) {
305-
RetAddr = Dest.getAddress();
304+
// It is possible for the existing slot we are using directly to have been
305+
// allocated in the correct AS for an indirect return, and then cast to
306+
// the default AS (this is the behaviour of CreateMemTemp), however we know
307+
// that the return address is expected to point to the uncasted AS, hence we
308+
// strip possible pointer casts here.
309+
if (Dest.getAddress().isValid())
310+
RetAddr = Dest.getAddress().withPointer(
311+
Dest.getAddress().getBasePointer()->stripPointerCasts(),
312+
Dest.getAddress().isKnownNonNull());
306313
} else {
307-
RetAddr = CGF.CreateMemTemp(RetTy, "tmp", &RetAllocaAddr);
314+
RetAddr = CGF.CreateMemTempWithoutCast(RetTy, "tmp");
308315
llvm::TypeSize Size =
309316
CGF.CGM.getDataLayout().getTypeAllocSize(CGF.ConvertTypeForMem(RetTy));
310-
LifetimeSizePtr = CGF.EmitLifetimeStart(Size, RetAllocaAddr.getPointer());
317+
LifetimeSizePtr = CGF.EmitLifetimeStart(Size, RetAddr.getBasePointer());
311318
if (LifetimeSizePtr) {
312319
LifetimeStartInst =
313320
cast<llvm::IntrinsicInst>(std::prev(Builder.GetInsertPoint()));
@@ -316,7 +323,7 @@ void AggExprEmitter::withReturnValueSlot(
316323
"Last insertion wasn't a lifetime.start?");
317324

318325
CGF.pushFullExprCleanup<CodeGenFunction::CallLifetimeEnd>(
319-
NormalEHLifetimeMarker, RetAllocaAddr, LifetimeSizePtr);
326+
NormalEHLifetimeMarker, RetAddr, LifetimeSizePtr);
320327
LifetimeEndBlock = CGF.EHStack.stable_begin();
321328
}
322329
}
@@ -337,7 +344,7 @@ void AggExprEmitter::withReturnValueSlot(
337344
// Since we're not guaranteed to be in an ExprWithCleanups, clean up
338345
// eagerly.
339346
CGF.DeactivateCleanupBlock(LifetimeEndBlock, LifetimeStartInst);
340-
CGF.EmitLifetimeEnd(LifetimeSizePtr, RetAllocaAddr.getPointer());
347+
CGF.EmitLifetimeEnd(LifetimeSizePtr, RetAddr.getBasePointer());
341348
}
342349
}
343350

clang/test/CodeGenOpenCL/addr-space-struct-arg.cl

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,6 @@ Mat4X4 __attribute__((noinline)) foo(Mat3X3 in) {
154154
// AMDGCN20-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5)
155155
// AMDGCN20-NEXT: [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr
156156
// AMDGCN20-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
157-
// AMDGCN20-NEXT: [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr
158157
// AMDGCN20-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR_ASCAST]], align 8
159158
// AMDGCN20-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
160159
// AMDGCN20-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
@@ -164,10 +163,10 @@ Mat4X4 __attribute__((noinline)) foo(Mat3X3 in) {
164163
// AMDGCN20-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(1) [[ARRAYIDX1]], i32 0, i32 0
165164
// AMDGCN20-NEXT: [[TMP3:%.*]] = load [9 x i32], ptr addrspace(1) [[TMP2]], align 4
166165
// AMDGCN20-NEXT: [[CALL:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP3]]) #[[ATTR3:[0-9]+]]
167-
// AMDGCN20-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT4X4]], ptr [[TMP_ASCAST]], i32 0, i32 0
166+
// AMDGCN20-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT4X4]], ptr addrspace(5) [[TMP]], i32 0, i32 0
168167
// AMDGCN20-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL]], 0
169-
// AMDGCN20-NEXT: store [16 x i32] [[TMP5]], ptr [[TMP4]], align 4
170-
// AMDGCN20-NEXT: call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr align 4 [[TMP_ASCAST]], i64 64, i1 false)
168+
// AMDGCN20-NEXT: store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP4]], align 4
169+
// AMDGCN20-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 64, i1 false)
171170
// AMDGCN20-NEXT: ret void
172171
//
173172
// SPIR-LABEL: define dso_local spir_kernel void @ker(
@@ -327,7 +326,6 @@ Mat64X64 __attribute__((noinline)) foo_large(Mat32X32 in) {
327326
// AMDGCN20-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5)
328327
// AMDGCN20-NEXT: [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr
329328
// AMDGCN20-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
330-
// AMDGCN20-NEXT: [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr
331329
// AMDGCN20-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR_ASCAST]], align 8
332330
// AMDGCN20-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
333331
// AMDGCN20-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
@@ -336,7 +334,7 @@ Mat64X64 __attribute__((noinline)) foo_large(Mat32X32 in) {
336334
// AMDGCN20-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i64 1
337335
// AMDGCN20-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i64 4096, i1 false)
338336
// AMDGCN20-NEXT: call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]]
339-
// AMDGCN20-NEXT: call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr align 4 [[TMP_ASCAST]], i64 16384, i1 false)
337+
// AMDGCN20-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 16384, i1 false)
340338
// AMDGCN20-NEXT: ret void
341339
//
342340
// SPIR-LABEL: define dso_local spir_kernel void @ker_large(

clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,6 @@ Mat4X4 __attribute__((noinline)) foo(Mat3X3 in) {
7070
// AMDGCN-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5)
7171
// AMDGCN-NEXT: [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr
7272
// AMDGCN-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
73-
// AMDGCN-NEXT: [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr
7473
// AMDGCN-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR_ASCAST]], align 8
7574
// AMDGCN-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
7675
// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
@@ -80,10 +79,10 @@ Mat4X4 __attribute__((noinline)) foo(Mat3X3 in) {
8079
// AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(1) [[ARRAYIDX1]], i32 0, i32 0
8180
// AMDGCN-NEXT: [[TMP3:%.*]] = load [9 x i32], ptr addrspace(1) [[TMP2]], align 4
8281
// AMDGCN-NEXT: [[CALL:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP3]]) #[[ATTR3:[0-9]+]]
83-
// AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT4X4]], ptr [[TMP_ASCAST]], i32 0, i32 0
82+
// AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT4X4]], ptr addrspace(5) [[TMP]], i32 0, i32 0
8483
// AMDGCN-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL]], 0
85-
// AMDGCN-NEXT: store [16 x i32] [[TMP5]], ptr [[TMP4]], align 4
86-
// AMDGCN-NEXT: call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr align 4 [[TMP_ASCAST]], i64 64, i1 false)
84+
// AMDGCN-NEXT: store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP4]], align 4
85+
// AMDGCN-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 64, i1 false)
8786
// AMDGCN-NEXT: ret void
8887
//
8988
kernel void ker(global Mat3X3 *in, global Mat4X4 *out) {
@@ -112,7 +111,6 @@ Mat64X64 __attribute__((noinline)) foo_large(Mat32X32 in) {
112111
// AMDGCN-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5)
113112
// AMDGCN-NEXT: [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr
114113
// AMDGCN-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
115-
// AMDGCN-NEXT: [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr
116114
// AMDGCN-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR_ASCAST]], align 8
117115
// AMDGCN-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
118116
// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
@@ -121,7 +119,7 @@ Mat64X64 __attribute__((noinline)) foo_large(Mat32X32 in) {
121119
// AMDGCN-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i64 1
122120
// AMDGCN-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i64 4096, i1 false)
123121
// AMDGCN-NEXT: call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]]
124-
// AMDGCN-NEXT: call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr align 4 [[TMP_ASCAST]], i64 16384, i1 false)
122+
// AMDGCN-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 16384, i1 false)
125123
// AMDGCN-NEXT: ret void
126124
//
127125
kernel void ker_large(global Mat32X32 *in, global Mat64X64 *out) {

0 commit comments

Comments
 (0)