Skip to content

Commit 078f02d

Browse files
author
anikelal
committed
Removed always-inline attribute from stubs and made required lit test changes
1 parent 9ef951d commit 078f02d

18 files changed

+1983
-1822
lines changed

clang/lib/AST/MicrosoftMangle.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1163,7 +1163,7 @@ void MicrosoftCXXNameMangler::mangleUnqualifiedName(GlobalDecl GD,
11631163
->hasAttr<CUDAGlobalAttr>())) &&
11641164
GD.getKernelReferenceKind() == KernelReferenceKind::Stub;
11651165
bool IsOCLDeviceStub =
1166-
ND && (isa<FunctionDecl>(ND) && ND->hasAttr<OpenCLKernelAttr>()) &&
1166+
ND && isa<FunctionDecl>(ND) && ND->hasAttr<OpenCLKernelAttr>() &&
11671167
GD.getKernelReferenceKind() == KernelReferenceKind::Stub;
11681168
if (IsDeviceStub)
11691169
mangleSourceName(

clang/lib/CodeGen/CGCall.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2504,7 +2504,12 @@ void CodeGenModule::ConstructAttributeList(StringRef Name,
25042504
NumElemsParam);
25052505
}
25062506

2507-
if (TargetDecl->hasAttr<OpenCLKernelAttr>()) {
2507+
if (TargetDecl->hasAttr<OpenCLKernelAttr>() &&
2508+
CallingConv != CallingConv::CC_C &&
2509+
CallingConv !=
2510+
CallingConv::CC_SpirFunction) { // Check CallingConv to avoid adding
2511+
// uniform-work-group-size attribute
2512+
// to OpenCL Kernel Stub
25082513
if (getLangOpts().OpenCLVersion <= 120) {
25092514
// OpenCL v1.2 Work groups are always uniform
25102515
FuncAttrs.addAttribute("uniform-work-group-size", "true");

clang/lib/CodeGen/CGExpr.cpp

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6144,11 +6144,9 @@ RValue CodeGenFunction::EmitCall(QualType CalleeType,
61446144

61456145
const auto *FnType = cast<FunctionType>(PointeeType);
61466146

6147-
if (auto FD = dyn_cast_or_null<FunctionDecl>(TargetDecl)) {
6148-
if (FD->hasAttr<OpenCLKernelAttr>()) {
6149-
CGM.getTargetCodeGenInfo().setOCLKernelStubCallingConvention(FnType);
6150-
}
6151-
}
6147+
if (const auto *FD = dyn_cast_or_null<FunctionDecl>(TargetDecl);
6148+
FD && FD->hasAttr<OpenCLKernelAttr>())
6149+
CGM.getTargetCodeGenInfo().setOCLKernelStubCallingConvention(FnType);
61526150

61536151
// If we are checking indirect calls and this call is indirect, check that the
61546152
// function pointer is a member of the bit set for the function type.

clang/lib/CodeGen/CodeGenModule.cpp

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6162,16 +6162,6 @@ void CodeGenModule::EmitGlobalFunctionDefinition(GlobalDecl GD,
61626162

61636163
setNonAliasAttributes(GD, Fn);
61646164

6165-
if (D->hasAttr<OpenCLKernelAttr>()) {
6166-
if (GD.getKernelReferenceKind() == KernelReferenceKind::Stub) {
6167-
if (Fn->hasFnAttribute(llvm::Attribute::NoInline))
6168-
Fn->removeFnAttr(llvm::Attribute::NoInline);
6169-
if (Fn->hasFnAttribute(llvm::Attribute::InlineHint))
6170-
Fn->removeFnAttr(llvm::Attribute::InlineHint);
6171-
Fn->addFnAttr(llvm::Attribute::AlwaysInline);
6172-
}
6173-
}
6174-
61756165
SetLLVMFunctionAttributesForDefinition(D, Fn);
61766166

61776167
if (const ConstructorAttr *CA = D->getAttr<ConstructorAttr>())

clang/test/CodeGenOpenCL/addr-space-struct-arg.cl

Lines changed: 1533 additions & 1082 deletions
Large diffs are not rendered by default.

clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl

Lines changed: 206 additions & 127 deletions
Large diffs are not rendered by default.

clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl

Lines changed: 11 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ kernel void test_target_features_kernel(global int *i) {
106106
// NOCPU-NEXT: ret void
107107
//
108108
//
109-
// NOCPU: Function Attrs: alwaysinline convergent norecurse nounwind
109+
// NOCPU: Function Attrs: convergent noinline norecurse nounwind optnone
110110
// NOCPU-LABEL: define dso_local void @__clang_ocl_kern_imp_test(
111111
// NOCPU-SAME: ptr addrspace(1) noundef align 1 [[A:%.*]], i8 noundef signext [[B:%.*]], ptr addrspace(1) noundef align 8 [[C:%.*]], i64 noundef [[D:%.*]]) #[[ATTR3:[0-9]+]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META5]] !kernel_arg_base_type [[META5]] !kernel_arg_type_qual [[META6]] {
112112
// NOCPU-NEXT: [[ENTRY:.*:]]
@@ -230,7 +230,7 @@ kernel void test_target_features_kernel(global int *i) {
230230
// NOCPU-NEXT: ret void
231231
//
232232
//
233-
// NOCPU: Function Attrs: alwaysinline convergent norecurse nounwind
233+
// NOCPU: Function Attrs: convergent noinline norecurse nounwind optnone
234234
// NOCPU-LABEL: define dso_local void @__clang_ocl_kern_imp_test_target_features_kernel(
235235
// NOCPU-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR5:[0-9]+]] !kernel_arg_addr_space [[META7]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META10]] {
236236
// NOCPU-NEXT: [[ENTRY:.*:]]
@@ -362,7 +362,7 @@ kernel void test_target_features_kernel(global int *i) {
362362
// NOCPU-NEXT: [[TMP0:%.*]] = load i64, ptr [[BLOCK_CAPTURE_ADDR]], align 8
363363
// NOCPU-NEXT: [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4
364364
// NOCPU-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR1]], align 8
365-
// NOCPU-NEXT: call void @callee(i64 noundef [[TMP0]], ptr addrspace(1) noundef [[TMP1]]) #[[ATTR11:[0-9]+]]
365+
// NOCPU-NEXT: call void @callee(i64 noundef [[TMP0]], ptr addrspace(1) noundef [[TMP1]]) #[[ATTR10]]
366366
// NOCPU-NEXT: ret void
367367
//
368368
//
@@ -451,7 +451,7 @@ kernel void test_target_features_kernel(global int *i) {
451451
// GFX900-NEXT: ret void
452452
//
453453
//
454-
// GFX900: Function Attrs: alwaysinline convergent norecurse nounwind
454+
// GFX900: Function Attrs: convergent norecurse nounwind
455455
// GFX900-LABEL: define dso_local void @__clang_ocl_kern_imp_test(
456456
// GFX900-SAME: ptr addrspace(1) noundef align 1 [[A:%.*]], i8 noundef signext [[B:%.*]], ptr addrspace(1) noundef align 8 [[C:%.*]], i64 noundef [[D:%.*]]) #[[ATTR3:[0-9]+]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META12]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13]] {
457457
// GFX900-NEXT: [[ENTRY:.*:]]
@@ -585,7 +585,7 @@ kernel void test_target_features_kernel(global int *i) {
585585
// GFX900-NEXT: ret void
586586
//
587587
//
588-
// GFX900: Function Attrs: alwaysinline convergent norecurse nounwind
588+
// GFX900: Function Attrs: convergent norecurse nounwind
589589
// GFX900-LABEL: define dso_local void @__clang_ocl_kern_imp_test_target_features_kernel(
590590
// GFX900-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META22]] !kernel_arg_access_qual [[META23]] !kernel_arg_type [[META24]] !kernel_arg_base_type [[META24]] !kernel_arg_type_qual [[META25]] {
591591
// GFX900-NEXT: [[ENTRY:.*:]]
@@ -715,7 +715,7 @@ kernel void test_target_features_kernel(global int *i) {
715715
// GFX900-NEXT: [[TMP0:%.*]] = load i64, ptr [[BLOCK_CAPTURE_ADDR]], align 8, !tbaa [[TBAA3]]
716716
// GFX900-NEXT: [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4
717717
// GFX900-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR1]], align 8, !tbaa [[TBAA7]]
718-
// GFX900-NEXT: call void @callee(i64 noundef [[TMP0]], ptr addrspace(1) noundef [[TMP1]]) #[[ATTR10:[0-9]+]]
718+
// GFX900-NEXT: call void @callee(i64 noundef [[TMP0]], ptr addrspace(1) noundef [[TMP1]]) #[[ATTR8]]
719719
// GFX900-NEXT: ret void
720720
//
721721
//
@@ -754,27 +754,25 @@ kernel void test_target_features_kernel(global int *i) {
754754
// NOCPU: attributes #[[ATTR0:[0-9]+]] = { "objc_arc_inert" }
755755
// NOCPU: attributes #[[ATTR1]] = { convergent noinline norecurse nounwind optnone "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
756756
// NOCPU: attributes #[[ATTR2]] = { convergent noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" }
757-
// NOCPU: attributes #[[ATTR3]] = { alwaysinline convergent norecurse nounwind "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" }
757+
// NOCPU: attributes #[[ATTR3]] = { convergent noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
758758
// NOCPU: attributes #[[ATTR4]] = { convergent noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+s-memtime-inst" "uniform-work-group-size"="false" }
759-
// NOCPU: attributes #[[ATTR5]] = { alwaysinline convergent norecurse nounwind "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+s-memtime-inst" "uniform-work-group-size"="false" }
759+
// NOCPU: attributes #[[ATTR5]] = { convergent noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+s-memtime-inst" }
760760
// NOCPU: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
761761
// NOCPU: attributes #[[ATTR7]] = { convergent noinline nounwind optnone "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
762762
// NOCPU: attributes #[[ATTR8]] = { convergent nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
763763
// NOCPU: attributes #[[ATTR9:[0-9]+]] = { nocallback nofree nosync nounwind willreturn }
764-
// NOCPU: attributes #[[ATTR10]] = { convergent nounwind "uniform-work-group-size"="false" }
765-
// NOCPU: attributes #[[ATTR11]] = { convergent nounwind }
764+
// NOCPU: attributes #[[ATTR10]] = { convergent nounwind }
766765
//.
767766
// GFX900: attributes #[[ATTR0:[0-9]+]] = { "objc_arc_inert" }
768767
// GFX900: attributes #[[ATTR1]] = { convergent norecurse nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" }
769768
// GFX900: attributes #[[ATTR2]] = { convergent norecurse nounwind "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" "uniform-work-group-size"="false" }
770-
// GFX900: attributes #[[ATTR3]] = { alwaysinline convergent norecurse nounwind "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" "uniform-work-group-size"="false" }
769+
// GFX900: attributes #[[ATTR3]] = { convergent norecurse nounwind "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" }
771770
// GFX900: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
772771
// GFX900: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
773772
// GFX900: attributes #[[ATTR6]] = { convergent nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" }
774773
// GFX900: attributes #[[ATTR7:[0-9]+]] = { nocallback nofree nosync nounwind willreturn }
775-
// GFX900: attributes #[[ATTR8]] = { convergent nounwind "uniform-work-group-size"="false" }
774+
// GFX900: attributes #[[ATTR8]] = { convergent nounwind }
776775
// GFX900: attributes #[[ATTR9]] = { nounwind }
777-
// GFX900: attributes #[[ATTR10]] = { convergent nounwind }
778776
//.
779777
// NOCPU: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600}
780778
// NOCPU: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}

clang/test/CodeGenOpenCL/cl-uniform-wg-size.cl

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,9 @@
55

66
kernel void ker() {};
77
// CHECK: define{{.*}}@ker() #[[ATTR0:[0-9]+]]
8+
// CHECK: call void @__clang_ocl_kern_imp_ker() #[[ATTR2:[0-9]+]]
9+
10+
// CHECK: define{{.*}}@__clang_ocl_kern_imp_ker() #[[ATTR1:[0-9]+]]
811

912
void foo() {};
1013
// CHECK: define{{.*}}@foo() #[[ATTR1:[0-9]+]]
@@ -15,3 +18,6 @@ void foo() {};
1518

1619
// CHECK: attributes #[[ATTR1]]
1720
// CHECK-NOT: uniform-work-group-size
21+
22+
// CHECK: attributes #[[ATTR2]]
23+
// CHECK-NOT: uniform-work-group-size

0 commit comments

Comments
 (0)