diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 9091fdd5c959f..1ab4458bafcc3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -977,6 +977,10 @@ bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS, } unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const { + if (auto *Arg = dyn_cast<Argument>(V); + Arg && AMDGPU::isKernelCC(Arg->getParent()) && !Arg->hasByRefAttr()) + return AMDGPUAS::GLOBAL_ADDRESS; + const auto *LD = dyn_cast<LoadInst>(V); if (!LD) // TODO: Handle invariant load like constant. return AMDGPUAS::UNKNOWN_ADDRESS_SPACE; diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index 470c5308edca4..3ce03a4b96f61 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -12592,29 +12592,18 @@ struct AAAddressSpaceImpl : public AAAddressSpace { } ChangeStatus updateImpl(Attributor &A) override { - unsigned FlatAS = A.getInfoCache().getFlatAddressSpace().value(); uint32_t OldAddressSpace = AssumedAddressSpace; auto CheckAddressSpace = [&](Value &Obj) { if (isa<UndefValue>(&Obj)) return true; - // If an argument in flat address space only has addrspace cast uses, and - // those casts are same, then we take the dst addrspace. 
if (auto *Arg = dyn_cast<Argument>(&Obj)) { - if (Arg->getType()->getPointerAddressSpace() == FlatAS) { - unsigned CastAddrSpace = FlatAS; - for (auto *U : Arg->users()) { - auto *ASCI = dyn_cast<AddrSpaceCastInst>(U); - if (!ASCI) - return takeAddressSpace(Obj.getType()->getPointerAddressSpace()); - if (CastAddrSpace != FlatAS && - CastAddrSpace != ASCI->getDestAddressSpace()) - return false; - CastAddrSpace = ASCI->getDestAddressSpace(); - } - if (CastAddrSpace != FlatAS) - return takeAddressSpace(CastAddrSpace); - } + auto *TTI = + A.getInfoCache().getAnalysisResultForFunction<TargetIRAnalysis>( + *Arg->getParent()); + unsigned AssumedAS = TTI->getAssumedAddrSpace(Arg); + if (AssumedAS != ~0U) + return takeAddressSpace(AssumedAS); } return takeAddressSpace(Obj.getType()->getPointerAddressSpace()); }; diff --git a/llvm/test/CodeGen/AMDGPU/aa-as-infer.ll b/llvm/test/CodeGen/AMDGPU/aa-as-infer.ll index d1a6414fe49ae..cc2c80060231c 100644 --- a/llvm/test/CodeGen/AMDGPU/aa-as-infer.ll +++ b/llvm/test/CodeGen/AMDGPU/aa-as-infer.ll @@ -246,8 +246,7 @@ define void @foo(ptr addrspace(3) %val) { define void @kernel_argument_promotion_pattern_intra_procedure(ptr %p, i32 %val) { ; CHECK-LABEL: define void @kernel_argument_promotion_pattern_intra_procedure( ; CHECK-SAME: ptr [[P:%.*]], i32 [[VAL:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[P_CAST_0:%.*]] = addrspacecast ptr [[P]] to ptr addrspace(1) -; CHECK-NEXT: store i32 [[VAL]], ptr addrspace(1) [[P_CAST_0]], align 4 +; CHECK-NEXT: store i32 [[VAL]], ptr [[P]], align 4 ; CHECK-NEXT: ret void ; %p.cast.0 = addrspacecast ptr %p to ptr addrspace(1) @@ -259,8 +258,7 @@ define void @kernel_argument_promotion_pattern_intra_procedure(ptr %p, i32 %val) define internal void @use_argument_after_promotion(ptr %p, i32 %val) { ; CHECK-LABEL: define internal void @use_argument_after_promotion( ; CHECK-SAME: ptr [[P:%.*]], i32 [[VAL:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[P]] to ptr addrspace(1) -; CHECK-NEXT: store i32 [[VAL]], ptr addrspace(1) [[TMP1]], 
align 4 +; CHECK-NEXT: store i32 [[VAL]], ptr [[P]], align 4 ; CHECK-NEXT: ret void ; store i32 %val, ptr %p diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll index 99fe986cf6378..60bb38f863e8e 100644 --- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll +++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll @@ -66,7 +66,9 @@ define amdgpu_kernel void @store_global_from_flat(ptr %generic_scalar) #0 { define amdgpu_kernel void @store_group_from_flat(ptr %generic_scalar) #0 { ; CHECK-LABEL: define amdgpu_kernel void @store_group_from_flat( ; CHECK-SAME: ptr [[GENERIC_SCALAR:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[_TMP0:%.*]] = addrspacecast ptr [[GENERIC_SCALAR]] to ptr addrspace(3) +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[GENERIC_SCALAR]] to ptr addrspace(1) +; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(1) [[TMP1]] to ptr +; CHECK-NEXT: [[_TMP0:%.*]] = addrspacecast ptr [[TMP2]] to ptr addrspace(3) ; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(3) [[_TMP0]], align 4 ; CHECK-NEXT: ret void ; @@ -78,7 +80,9 @@ define amdgpu_kernel void @store_group_from_flat(ptr %generic_scalar) #0 { define amdgpu_kernel void @store_private_from_flat(ptr %generic_scalar) #0 { ; CHECK-LABEL: define amdgpu_kernel void @store_private_from_flat( ; CHECK-SAME: ptr [[GENERIC_SCALAR:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[_TMP0:%.*]] = addrspacecast ptr [[GENERIC_SCALAR]] to ptr addrspace(5) +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[GENERIC_SCALAR]] to ptr addrspace(1) +; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(1) [[TMP1]] to ptr +; CHECK-NEXT: [[_TMP0:%.*]] = addrspacecast ptr [[TMP2]] to ptr addrspace(5) ; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[_TMP0]], align 4 ; CHECK-NEXT: ret void ; @@ -136,8 +140,10 @@ define amdgpu_kernel void @load_store_private(ptr addrspace(5) nocapture %input, define amdgpu_kernel void @load_store_flat(ptr 
nocapture %input, ptr nocapture %output) #0 { ; CHECK-LABEL: define amdgpu_kernel void @load_store_flat( ; CHECK-SAME: ptr captures(none) [[INPUT:%.*]], ptr captures(none) [[OUTPUT:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[INPUT]], align 4 -; CHECK-NEXT: store i32 [[VAL]], ptr [[OUTPUT]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(1) +; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[OUTPUT]] to ptr addrspace(1) +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(1) [[TMP1]], align 4 +; CHECK-NEXT: store i32 [[VAL]], ptr addrspace(1) [[TMP2]], align 4 ; CHECK-NEXT: ret void ; %val = load i32, ptr %input, align 4 diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll index 57453d63d7e8a..1c317786d1c20 100644 --- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll +++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll @@ -48,7 +48,8 @@ define amdgpu_kernel void @memset_global_to_flat_no_md(ptr addrspace(1) %global. 
define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group(ptr %dest, ptr addrspace(3) %src.group.ptr, i64 %size) #0 { ; CHECK-LABEL: define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group( ; CHECK-SAME: ptr [[DEST:%.*]], ptr addrspace(3) [[SRC_GROUP_PTR:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: call void @llvm.memcpy.p0.p3.i64(ptr align 4 [[DEST]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false), !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META6]] +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DEST]] to ptr addrspace(1) +; CHECK-NEXT: call void @llvm.memcpy.p1.p3.i64(ptr addrspace(1) align 4 [[TMP1]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false), !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META6]] ; CHECK-NEXT: ret void ; %cast.src = addrspacecast ptr addrspace(3) %src.group.ptr to ptr @@ -59,7 +60,8 @@ define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group(ptr %dest, define amdgpu_kernel void @memcpy_inline_flat_to_flat_replace_src_with_group(ptr %dest, ptr addrspace(3) %src.group.ptr) #0 { ; CHECK-LABEL: define amdgpu_kernel void @memcpy_inline_flat_to_flat_replace_src_with_group( ; CHECK-SAME: ptr [[DEST:%.*]], ptr addrspace(3) [[SRC_GROUP_PTR:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: call void @llvm.memcpy.inline.p0.p3.i64(ptr align 4 [[DEST]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 42, i1 false), !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META6]] +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DEST]] to ptr addrspace(1) +; CHECK-NEXT: call void @llvm.memcpy.inline.p1.p3.i64(ptr addrspace(1) align 4 [[TMP1]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 42, i1 false), !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META6]] ; CHECK-NEXT: ret void ; %cast.src = addrspacecast ptr addrspace(3) %src.group.ptr to ptr @@ -70,7 +72,8 @@ define amdgpu_kernel void @memcpy_inline_flat_to_flat_replace_src_with_group(ptr define amdgpu_kernel 
void @memcpy_flat_to_flat_replace_dest_with_group(ptr addrspace(3) %dest.group.ptr, ptr %src.ptr, i64 %size) #0 { ; CHECK-LABEL: define amdgpu_kernel void @memcpy_flat_to_flat_replace_dest_with_group( ; CHECK-SAME: ptr addrspace(3) [[DEST_GROUP_PTR:%.*]], ptr [[SRC_PTR:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: call void @llvm.memcpy.p3.p0.i64(ptr addrspace(3) align 4 [[DEST_GROUP_PTR]], ptr align 4 [[SRC_PTR]], i64 [[SIZE]], i1 false), !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META6]] +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[SRC_PTR]] to ptr addrspace(1) +; CHECK-NEXT: call void @llvm.memcpy.p3.p1.i64(ptr addrspace(3) align 4 [[DEST_GROUP_PTR]], ptr addrspace(1) align 4 [[TMP1]], i64 [[SIZE]], i1 false), !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META6]] ; CHECK-NEXT: ret void ; %cast.dest = addrspacecast ptr addrspace(3) %dest.group.ptr to ptr @@ -116,7 +119,8 @@ define amdgpu_kernel void @memcpy_group_to_flat_replace_dest_global(ptr addrspac define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group_tbaa_struct(ptr %dest, ptr addrspace(3) %src.group.ptr, i64 %size) #0 { ; CHECK-LABEL: define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group_tbaa_struct( ; CHECK-SAME: ptr [[DEST:%.*]], ptr addrspace(3) [[SRC_GROUP_PTR:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: call void @llvm.memcpy.p0.p3.i64(ptr align 4 [[DEST]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false), !tbaa.struct [[TBAA_STRUCT8:![0-9]+]] +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DEST]] to ptr addrspace(1) +; CHECK-NEXT: call void @llvm.memcpy.p1.p3.i64(ptr addrspace(1) align 4 [[TMP1]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false), !tbaa.struct [[TBAA_STRUCT8:![0-9]+]] ; CHECK-NEXT: ret void ; %cast.src = addrspacecast ptr addrspace(3) %src.group.ptr to ptr @@ -127,7 +131,8 @@ define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group_tbaa_struc define amdgpu_kernel 
void @memcpy_flat_to_flat_replace_src_with_group_no_md(ptr %dest, ptr addrspace(3) %src.group.ptr, i64 %size) #0 { ; CHECK-LABEL: define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group_no_md( ; CHECK-SAME: ptr [[DEST:%.*]], ptr addrspace(3) [[SRC_GROUP_PTR:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: call void @llvm.memcpy.p0.p3.i64(ptr align 4 [[DEST]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DEST]] to ptr addrspace(1) +; CHECK-NEXT: call void @llvm.memcpy.p1.p3.i64(ptr addrspace(1) align 4 [[TMP1]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false) ; CHECK-NEXT: ret void ; %cast.src = addrspacecast ptr addrspace(3) %src.group.ptr to ptr @@ -138,8 +143,10 @@ define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group_no_md(ptr define amdgpu_kernel void @multiple_memcpy_flat_to_flat_replace_src_with_group_no_md(ptr %dest0, ptr %dest1, ptr addrspace(3) %src.group.ptr, i64 %size) #0 { ; CHECK-LABEL: define amdgpu_kernel void @multiple_memcpy_flat_to_flat_replace_src_with_group_no_md( ; CHECK-SAME: ptr [[DEST0:%.*]], ptr [[DEST1:%.*]], ptr addrspace(3) [[SRC_GROUP_PTR:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: call void @llvm.memcpy.p0.p3.i64(ptr align 4 [[DEST0]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false) -; CHECK-NEXT: call void @llvm.memcpy.p0.p3.i64(ptr align 4 [[DEST1]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DEST0]] to ptr addrspace(1) +; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[DEST1]] to ptr addrspace(1) +; CHECK-NEXT: call void @llvm.memcpy.p1.p3.i64(ptr addrspace(1) align 4 [[TMP1]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p1.p3.i64(ptr addrspace(1) align 4 [[TMP2]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false) ; CHECK-NEXT: 
ret void ; %cast.src = addrspacecast ptr addrspace(3) %src.group.ptr to ptr @@ -162,7 +169,8 @@ define amdgpu_kernel void @memcpy_group_flat_to_flat_self(ptr addrspace(3) %grou define amdgpu_kernel void @memmove_flat_to_flat_replace_src_with_group(ptr %dest, ptr addrspace(3) %src.group.ptr, i64 %size) #0 { ; CHECK-LABEL: define amdgpu_kernel void @memmove_flat_to_flat_replace_src_with_group( ; CHECK-SAME: ptr [[DEST:%.*]], ptr addrspace(3) [[SRC_GROUP_PTR:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: call void @llvm.memmove.p0.p3.i64(ptr align 4 [[DEST]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false), !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META6]] +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DEST]] to ptr addrspace(1) +; CHECK-NEXT: call void @llvm.memmove.p1.p3.i64(ptr addrspace(1) align 4 [[TMP1]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false), !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META6]] ; CHECK-NEXT: ret void ; %cast.src = addrspacecast ptr addrspace(3) %src.group.ptr to ptr diff --git a/llvm/test/Transforms/OpenMP/barrier_removal.ll b/llvm/test/Transforms/OpenMP/barrier_removal.ll index f662d5dd85b2b..56f730ccb4189 100644 --- a/llvm/test/Transforms/OpenMP/barrier_removal.ll +++ b/llvm/test/Transforms/OpenMP/barrier_removal.ll @@ -682,11 +682,18 @@ m: } define internal void @write_then_barrier0(ptr %p) { -; CHECK-LABEL: define {{[^@]+}}@write_then_barrier0 -; CHECK-SAME: (ptr [[P:%.*]]) { -; CHECK-NEXT: store i32 0, ptr [[P]], align 4 -; CHECK-NEXT: call void @aligned_barrier() -; CHECK-NEXT: ret void +; MODULE-LABEL: define {{[^@]+}}@write_then_barrier0 +; MODULE-SAME: (ptr [[P:%.*]]) { +; MODULE-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[P]] to ptr addrspace(1) +; MODULE-NEXT: store i32 0, ptr addrspace(1) [[TMP1]], align 4 +; MODULE-NEXT: call void @aligned_barrier() +; MODULE-NEXT: ret void +; +; CGSCC-LABEL: define {{[^@]+}}@write_then_barrier0 +; CGSCC-SAME: (ptr [[P:%.*]]) 
{ +; CGSCC-NEXT: store i32 0, ptr [[P]], align 4 +; CGSCC-NEXT: call void @aligned_barrier() +; CGSCC-NEXT: ret void ; store i32 0, ptr %p call void @aligned_barrier() @@ -695,7 +702,8 @@ define internal void @write_then_barrier0(ptr %p) { define internal void @barrier_then_write0(ptr %p) { ; MODULE-LABEL: define {{[^@]+}}@barrier_then_write0 ; MODULE-SAME: (ptr [[P:%.*]]) { -; MODULE-NEXT: store i32 0, ptr [[P]], align 4 +; MODULE-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[P]] to ptr addrspace(1) +; MODULE-NEXT: store i32 0, ptr addrspace(1) [[TMP1]], align 4 ; MODULE-NEXT: ret void ; ; CGSCC-LABEL: define {{[^@]+}}@barrier_then_write0 @@ -711,7 +719,8 @@ define internal void @barrier_then_write0(ptr %p) { define internal void @barrier_then_write_then_barrier0(ptr %p) { ; MODULE-LABEL: define {{[^@]+}}@barrier_then_write_then_barrier0 ; MODULE-SAME: (ptr [[P:%.*]]) { -; MODULE-NEXT: store i32 0, ptr [[P]], align 4 +; MODULE-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[P]] to ptr addrspace(1) +; MODULE-NEXT: store i32 0, ptr addrspace(1) [[TMP1]], align 4 ; MODULE-NEXT: call void @aligned_barrier() ; MODULE-NEXT: ret void ; diff --git a/llvm/test/Transforms/OpenMP/spmdization_guarding.ll b/llvm/test/Transforms/OpenMP/spmdization_guarding.ll index 2f1aadc073142..81e11e048dfd0 100644 --- a/llvm/test/Transforms/OpenMP/spmdization_guarding.ll +++ b/llvm/test/Transforms/OpenMP/spmdization_guarding.ll @@ -85,8 +85,10 @@ define weak ptx_kernel void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr % ; CHECK-NEXT: br i1 [[TMP3]], label [[REGION_GUARDED:%.*]], label [[REGION_BARRIER:%.*]] ; CHECK: region.guarded: ; CHECK-NEXT: store i32 0, ptr [[X]], align 4, !noalias [[META7:![0-9]+]] -; CHECK-NEXT: store i32 1, ptr [[ARRAYIDX1_I]], align 4, !noalias [[META7]] -; CHECK-NEXT: store i32 [[N_ADDR_SROA_0_0_EXTRACT_TRUNC]], ptr [[ARRAYIDX2_I]], align 4, !noalias [[META7]] +; CHECK-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[ARRAYIDX1_I]] to ptr addrspace(1) +; CHECK-NEXT: store i32 1, 
ptr addrspace(1) [[TMP4]], align 4, !noalias [[META7]] +; CHECK-NEXT: [[TMP5:%.*]] = addrspacecast ptr [[ARRAYIDX2_I]] to ptr addrspace(1) +; CHECK-NEXT: store i32 [[N_ADDR_SROA_0_0_EXTRACT_TRUNC]], ptr addrspace(1) [[TMP5]], align 4, !noalias [[META7]] ; CHECK-NEXT: br label [[REGION_GUARDED_END:%.*]] ; CHECK: region.guarded.end: ; CHECK-NEXT: br label [[REGION_BARRIER]] @@ -107,16 +109,17 @@ define weak ptx_kernel void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr % ; CHECK-NEXT: [[ARRAYIDX5_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM4_I]] ; CHECK-NEXT: br label [[REGION_CHECK_TID5:%.*]] ; CHECK: region.check.tid5: -; CHECK-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[REGION_GUARDED4:%.*]], label [[REGION_BARRIER2:%.*]] +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 0 +; CHECK-NEXT: br i1 [[TMP7]], label [[REGION_GUARDED4:%.*]], label [[REGION_BARRIER2:%.*]] ; CHECK: region.guarded4: -; CHECK-NEXT: store i32 [[SUB3_I]], ptr [[ARRAYIDX5_I]], align 4, !noalias [[META7]] +; CHECK-NEXT: [[TMP8:%.*]] = addrspacecast ptr [[ARRAYIDX5_I]] to ptr addrspace(1) +; CHECK-NEXT: store i32 [[SUB3_I]], ptr addrspace(1) [[TMP8]], align 4, !noalias [[META7]] ; CHECK-NEXT: br label [[REGION_GUARDED_END1:%.*]] ; CHECK: region.guarded.end1: ; CHECK-NEXT: br label [[REGION_BARRIER2]] ; CHECK: region.barrier2: -; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP4]]) +; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP6]]) ; CHECK-NEXT: br label [[REGION_EXIT3]] ; CHECK: region.exit3: ; CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 @@ -128,16 +131,17 @@ define weak ptx_kernel void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr % ; CHECK-NEXT: [[ARRAYIDX7_I:%.*]] = getelementptr inbounds i32, ptr 
[[X]], i64 [[IDXPROM6_I]] ; CHECK-NEXT: br label [[REGION_CHECK_TID10:%.*]] ; CHECK: region.check.tid10: -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 0 -; CHECK-NEXT: br i1 [[TMP7]], label [[REGION_GUARDED9:%.*]], label [[REGION_BARRIER7:%.*]] +; CHECK-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[TMP9]], 0 +; CHECK-NEXT: br i1 [[TMP10]], label [[REGION_GUARDED9:%.*]], label [[REGION_BARRIER7:%.*]] ; CHECK: region.guarded9: -; CHECK-NEXT: store i32 [[CALL_I]], ptr [[ARRAYIDX7_I]], align 4, !noalias [[META7]] +; CHECK-NEXT: [[TMP11:%.*]] = addrspacecast ptr [[ARRAYIDX7_I]] to ptr addrspace(1) +; CHECK-NEXT: store i32 [[CALL_I]], ptr addrspace(1) [[TMP11]], align 4, !noalias [[META7]] ; CHECK-NEXT: br label [[REGION_GUARDED_END6:%.*]] ; CHECK: region.guarded.end6: ; CHECK-NEXT: br label [[REGION_BARRIER7]] ; CHECK: region.barrier7: -; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP6]]) +; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP9]]) ; CHECK-NEXT: br label [[REGION_EXIT8:%.*]] ; CHECK: region.exit8: ; CHECK-NEXT: [[CALL8_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]] @@ -145,16 +149,17 @@ define weak ptx_kernel void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr % ; CHECK-NEXT: [[ARRAYIDX10_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM9_I]] ; CHECK-NEXT: br label [[REGION_CHECK_TID15:%.*]] ; CHECK: region.check.tid15: -; CHECK-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0 -; CHECK-NEXT: br i1 [[TMP9]], label [[REGION_GUARDED14:%.*]], label [[REGION_BARRIER12:%.*]] +; CHECK-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 0 +; CHECK-NEXT: br 
i1 [[TMP13]], label [[REGION_GUARDED14:%.*]], label [[REGION_BARRIER12:%.*]] ; CHECK: region.guarded14: -; CHECK-NEXT: store i32 [[CALL8_I]], ptr [[ARRAYIDX10_I]], align 4, !noalias [[META7]] +; CHECK-NEXT: [[TMP14:%.*]] = addrspacecast ptr [[ARRAYIDX10_I]] to ptr addrspace(1) +; CHECK-NEXT: store i32 [[CALL8_I]], ptr addrspace(1) [[TMP14]], align 4, !noalias [[META7]] ; CHECK-NEXT: br label [[REGION_GUARDED_END11:%.*]] ; CHECK: region.guarded.end11: ; CHECK-NEXT: br label [[REGION_BARRIER12]] ; CHECK: region.barrier12: -; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP8]]) +; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP12]]) ; CHECK-NEXT: br label [[REGION_EXIT13:%.*]] ; CHECK: region.exit13: ; CHECK-NEXT: [[CALL11_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]] @@ -162,16 +167,17 @@ define weak ptx_kernel void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr % ; CHECK-NEXT: [[ARRAYIDX13_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM12_I]] ; CHECK-NEXT: br label [[REGION_CHECK_TID20:%.*]] ; CHECK: region.check.tid20: -; CHECK-NEXT: [[TMP10:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() -; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 0 -; CHECK-NEXT: br i1 [[TMP11]], label [[REGION_GUARDED19:%.*]], label [[REGION_BARRIER17:%.*]] +; CHECK-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 0 +; CHECK-NEXT: br i1 [[TMP16]], label [[REGION_GUARDED19:%.*]], label [[REGION_BARRIER17:%.*]] ; CHECK: region.guarded19: -; CHECK-NEXT: store i32 [[CALL11_I]], ptr [[ARRAYIDX13_I]], align 4, !noalias [[META7]] +; CHECK-NEXT: [[TMP17:%.*]] = addrspacecast ptr [[ARRAYIDX13_I]] to ptr addrspace(1) +; CHECK-NEXT: store i32 [[CALL11_I]], ptr addrspace(1) [[TMP17]], align 4, !noalias [[META7]] ; CHECK-NEXT: br label [[REGION_GUARDED_END16:%.*]] ; CHECK: region.guarded.end16: ; 
CHECK-NEXT: br label [[REGION_BARRIER17]] ; CHECK: region.barrier17: -; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP10]]) +; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP15]]) ; CHECK-NEXT: br label [[REGION_EXIT18:%.*]] ; CHECK: region.exit18: ; CHECK-NEXT: [[CALL14_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]] @@ -232,11 +238,13 @@ define weak ptx_kernel void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr % ; CHECK-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr nonnull @[[GLOB1]]) #[[ATTR6]] ; CHECK-DISABLED-NEXT: store i32 0, ptr [[X]], align 4, !noalias [[META7:![0-9]+]] ; CHECK-DISABLED-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 1 -; CHECK-DISABLED-NEXT: store i32 1, ptr [[ARRAYIDX1_I]], align 4, !noalias [[META7]] +; CHECK-DISABLED-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[ARRAYIDX1_I]] to ptr addrspace(1) +; CHECK-DISABLED-NEXT: store i32 1, ptr addrspace(1) [[TMP2]], align 4, !noalias [[META7]] ; CHECK-DISABLED-NEXT: [[SEXT:%.*]] = shl i64 [[N]], 32 ; CHECK-DISABLED-NEXT: [[IDXPROM_I:%.*]] = ashr exact i64 [[SEXT]], 32 ; CHECK-DISABLED-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM_I]] -; CHECK-DISABLED-NEXT: store i32 [[N_ADDR_SROA_0_0_EXTRACT_TRUNC]], ptr [[ARRAYIDX2_I]], align 4, !noalias [[META7]] +; CHECK-DISABLED-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[ARRAYIDX2_I]] to ptr addrspace(1) +; CHECK-DISABLED-NEXT: store i32 [[N_ADDR_SROA_0_0_EXTRACT_TRUNC]], ptr addrspace(1) [[TMP3]], align 4, !noalias [[META7]] ; CHECK-DISABLED-NEXT: call void @usei8ptr(ptr captures(none) [[HEAP2STACK_H2S]]) #[[ATTR9:[0-9]+]] ; CHECK-DISABLED-NEXT: br label [[FOR_COND_I:%.*]] ; CHECK-DISABLED: for.cond.i: @@ -248,7 +256,8 @@ define weak ptx_kernel void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr % ; CHECK-DISABLED-NEXT: [[SUB3_I:%.*]] = add nsw i32 [[I_0_I]], -1 ; CHECK-DISABLED-NEXT: 
[[IDXPROM4_I:%.*]] = zext i32 [[I_0_I]] to i64 ; CHECK-DISABLED-NEXT: [[ARRAYIDX5_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM4_I]] -; CHECK-DISABLED-NEXT: store i32 [[SUB3_I]], ptr [[ARRAYIDX5_I]], align 4, !noalias [[META7]] +; CHECK-DISABLED-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[ARRAYIDX5_I]] to ptr addrspace(1) +; CHECK-DISABLED-NEXT: store i32 [[SUB3_I]], ptr addrspace(1) [[TMP4]], align 4, !noalias [[META7]] ; CHECK-DISABLED-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 ; CHECK-DISABLED-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK-DISABLED: __omp_outlined__.exit: @@ -256,15 +265,18 @@ define weak ptx_kernel void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr % ; CHECK-DISABLED-NEXT: [[CALL_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10:[0-9]+]], !noalias [[META7]] ; CHECK-DISABLED-NEXT: [[IDXPROM6_I:%.*]] = sext i32 [[CALL_I]] to i64 ; CHECK-DISABLED-NEXT: [[ARRAYIDX7_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM6_I]] -; CHECK-DISABLED-NEXT: store i32 [[CALL_I]], ptr [[ARRAYIDX7_I]], align 4, !noalias [[META7]] +; CHECK-DISABLED-NEXT: [[TMP5:%.*]] = addrspacecast ptr [[ARRAYIDX7_I]] to ptr addrspace(1) +; CHECK-DISABLED-NEXT: store i32 [[CALL_I]], ptr addrspace(1) [[TMP5]], align 4, !noalias [[META7]] ; CHECK-DISABLED-NEXT: [[CALL8_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]] ; CHECK-DISABLED-NEXT: [[IDXPROM9_I:%.*]] = sext i32 [[CALL8_I]] to i64 ; CHECK-DISABLED-NEXT: [[ARRAYIDX10_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM9_I]] -; CHECK-DISABLED-NEXT: store i32 [[CALL8_I]], ptr [[ARRAYIDX10_I]], align 4, !noalias [[META7]] +; CHECK-DISABLED-NEXT: [[TMP6:%.*]] = addrspacecast ptr [[ARRAYIDX10_I]] to ptr addrspace(1) +; CHECK-DISABLED-NEXT: store i32 [[CALL8_I]], ptr addrspace(1) [[TMP6]], align 4, !noalias [[META7]] ; CHECK-DISABLED-NEXT: [[CALL11_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]] 
; CHECK-DISABLED-NEXT: [[IDXPROM12_I:%.*]] = sext i32 [[CALL11_I]] to i64 ; CHECK-DISABLED-NEXT: [[ARRAYIDX13_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM12_I]] -; CHECK-DISABLED-NEXT: store i32 [[CALL11_I]], ptr [[ARRAYIDX13_I]], align 4, !noalias [[META7]] +; CHECK-DISABLED-NEXT: [[TMP7:%.*]] = addrspacecast ptr [[ARRAYIDX13_I]] to ptr addrspace(1) +; CHECK-DISABLED-NEXT: store i32 [[CALL11_I]], ptr addrspace(1) [[TMP7]], align 4, !noalias [[META7]] ; CHECK-DISABLED-NEXT: [[CALL14_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]] ; CHECK-DISABLED-NEXT: [[CALL15_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]] ; CHECK-DISABLED-NEXT: [[CALL16_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]]