From a45b5f404e422e3523c9c01369ecd0ecea076216 Mon Sep 17 00:00:00 2001 From: Leon Clark Date: Tue, 30 Sep 2025 14:07:34 +0100 Subject: [PATCH 01/11] [AMDGPU] Propagate alias information in AMDGPULowerKernelArguments. --- .../CodeGen/AMDGPU/lower-noalias-kernargs.ll | 620 ++++++++++++++++++ 1 file changed, 620 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/lower-noalias-kernargs.ll diff --git a/llvm/test/CodeGen/AMDGPU/lower-noalias-kernargs.ll b/llvm/test/CodeGen/AMDGPU/lower-noalias-kernargs.ll new file mode 100644 index 0000000000000..313ae3b883e56 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lower-noalias-kernargs.ll @@ -0,0 +1,620 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -mtriple=amdgcn-- -S -o - -passes=amdgpu-lower-kernel-arguments %s | FileCheck %s + +define amdgpu_kernel void @aliasinfo_2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { +; CHECK-LABEL: define amdgpu_kernel void @aliasinfo_2i32( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[IN:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[ALIASINFO_2I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; CHECK-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_2I32_KERNARG_SEGMENT]], i64 36 +; CHECK-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 4, !invariant.load [[META0:![0-9]+]] +; CHECK-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_2I32_KERNARG_SEGMENT]], i64 44 +; CHECK-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 4, !invariant.load [[META0]] +; CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[IN_GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[IN_LOAD]], i32 [[TID]] +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(1) [[IN_GEP]], align 4 +; CHECK-NEXT: [[CTLZ:%.*]] = call i32 @llvm.ctlz.i32(i32 [[VAL]], i1 false) #[[ATTR5:[0-9]+]] +; CHECK-NEXT: store i32 [[CTLZ]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; CHECK-NEXT: ret void +; +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %in.gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid + %val = load i32, ptr addrspace(1) %in.gep, align 4 + %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone + store i32 %ctlz, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @aliasinfo_2i32_NA(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { +; CHECK-LABEL: define amdgpu_kernel void @aliasinfo_2i32_NA( +; CHECK-SAME: ptr addrspace(1) noalias [[OUT:%.*]], ptr addrspace(1) noalias [[IN:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[ALIASINFO_2I32_NA_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[IN_GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[IN]], i32 [[TID]] +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(1) [[IN_GEP]], align 4 +; CHECK-NEXT: [[CTLZ:%.*]] = call i32 @llvm.ctlz.i32(i32 [[VAL]], i1 false) #[[ATTR5]] +; CHECK-NEXT: store i32 [[CTLZ]], ptr addrspace(1) [[OUT]], align 4 +; CHECK-NEXT: ret void +; +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %in.gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid + 
%val = load i32, ptr addrspace(1) %in.gep, align 4 + %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone + store i32 %ctlz, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @aliasinfo_2i32_AS(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { +; CHECK-LABEL: define amdgpu_kernel void @aliasinfo_2i32_AS( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[IN:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[ALIASINFO_2I32_AS_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; CHECK-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_2I32_AS_KERNARG_SEGMENT]], i64 36 +; CHECK-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 4, !invariant.load [[META0]] +; CHECK-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_2I32_AS_KERNARG_SEGMENT]], i64 44 +; CHECK-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 4, !invariant.load [[META0]] +; CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[IN_GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[IN_LOAD]], i32 [[TID]] +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(1) [[IN_GEP]], align 4, !alias.scope [[META1:![0-9]+]], !noalias [[META4:![0-9]+]] +; CHECK-NEXT: [[CTLZ:%.*]] = call i32 @llvm.ctlz.i32(i32 [[VAL]], i1 false) #[[ATTR5]] +; CHECK-NEXT: store i32 [[CTLZ]], ptr addrspace(1) [[OUT_LOAD]], align 4, !alias.scope [[META4]], !noalias [[META1]] +; CHECK-NEXT: ret void +; +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %in.gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid + %val = load i32, ptr addrspace(1) %in.gep, align 4, !alias.scope !4, !noalias !2 + %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone + store i32 %ctlz, ptr addrspace(1) %out, align 4, !alias.scope !2, !noalias !4 + ret void +} + +define amdgpu_kernel void @aliasinfo_2i32_NA_AS(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { +; CHECK-LABEL: define amdgpu_kernel void @aliasinfo_2i32_NA_AS( +; CHECK-SAME: ptr addrspace(1) noalias [[OUT:%.*]], ptr addrspace(1) noalias [[IN:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[ALIASINFO_2I32_NA_AS_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[IN_GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[IN]], i32 [[TID]] +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(1) [[IN_GEP]], align 4, !alias.scope [[META1]], !noalias [[META4]] +; CHECK-NEXT: [[CTLZ:%.*]] = call i32 @llvm.ctlz.i32(i32 [[VAL]], i1 false) #[[ATTR5]] +; CHECK-NEXT: store i32 [[CTLZ]], ptr addrspace(1) [[OUT]], align 4, !alias.scope [[META4]], !noalias [[META1]] +; CHECK-NEXT: ret void +; +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %in.gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid + %val = load i32, ptr addrspace(1) %in.gep, align 4, !alias.scope !4, !noalias !2 + %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone + store i32 %ctlz, ptr addrspace(1) %out, align 4, !alias.scope !2, !noalias !4 + ret void +} + +define amdgpu_kernel void @aliasinfo_v4f32_3v4i8(ptr addrspace(1) %out, ptr addrspace(1) %out1, ptr addrspace(1) %in, ptr addrspace(1) %in1) nounwind { +; 
CHECK-LABEL: define amdgpu_kernel void @aliasinfo_v4f32_3v4i8( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]], ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[ALIASINFO_V4F32_3V4I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(288) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; CHECK-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_V4F32_3V4I8_KERNARG_SEGMENT]], i64 36 +; CHECK-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 4, !invariant.load [[META0]] +; CHECK-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_V4F32_3V4I8_KERNARG_SEGMENT]], i64 44 +; CHECK-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 4, !invariant.load [[META0]] +; CHECK-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_V4F32_3V4I8_KERNARG_SEGMENT]], i64 52 +; CHECK-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 4, !invariant.load [[META0]] +; CHECK-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_V4F32_3V4I8_KERNARG_SEGMENT]], i64 60 +; CHECK-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 4, !invariant.load [[META0]] +; CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[GEP:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[IN_LOAD]], i32 [[TID]] +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[IN1_LOAD]], i32 [[TID]] +; CHECK-NEXT: [[LOAD:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP]], align 1 +; CHECK-NEXT: [[LOAD1:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP1]], align 1 +; CHECK-NEXT: [[SHUFFLE0_0:%.*]] = shufflevector <4 x i8> [[LOAD]], <4 x i8> [[LOAD1]], <4 x i32> +; CHECK-NEXT: [[CVT:%.*]] = uitofp <4 x i8> [[SHUFFLE0_0]] to <4 x float> +; CHECK-NEXT: store <4 x float> [[CVT]], ptr addrspace(1) [[OUT_LOAD]], align 16 +; CHECK-NEXT: store <4 x i8> [[SHUFFLE0_0]], ptr addrspace(1) [[OUT1_LOAD]], align 4 +; CHECK-NEXT: ret void +; +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid + %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid + %load = load <4 x i8>, ptr addrspace(1) %gep, align 1 + %load1 = load <4 x i8>, ptr addrspace(1) %gep1, align 1 + %shuffle0_0 = shufflevector <4 x i8> %load, <4 x i8> %load1, <4 x i32> + %cvt = uitofp <4 x i8> %shuffle0_0 to <4 x float> + store <4 x float> %cvt, ptr addrspace(1) %out, align 16 + store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1, align 4 + ret void +} + +define amdgpu_kernel void @aliasinfo_v4f32_3v4i8_NA(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %out1, ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %in1) nounwind { +; CHECK-LABEL: define amdgpu_kernel void @aliasinfo_v4f32_3v4i8_NA( +; CHECK-SAME: ptr addrspace(1) noalias [[OUT:%.*]], ptr addrspace(1) noalias [[OUT1:%.*]], ptr addrspace(1) noalias [[IN:%.*]], ptr addrspace(1) noalias [[IN1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[ALIASINFO_V4F32_3V4I8_NA_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(288) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[GEP:%.*]] = 
getelementptr <4 x i8>, ptr addrspace(1) [[IN]], i32 [[TID]] +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[IN1]], i32 [[TID]] +; CHECK-NEXT: [[LOAD:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP]], align 1 +; CHECK-NEXT: [[LOAD1:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP1]], align 1 +; CHECK-NEXT: [[SHUFFLE0_0:%.*]] = shufflevector <4 x i8> [[LOAD]], <4 x i8> [[LOAD1]], <4 x i32> +; CHECK-NEXT: [[CVT:%.*]] = uitofp <4 x i8> [[SHUFFLE0_0]] to <4 x float> +; CHECK-NEXT: store <4 x float> [[CVT]], ptr addrspace(1) [[OUT]], align 16 +; CHECK-NEXT: store <4 x i8> [[SHUFFLE0_0]], ptr addrspace(1) [[OUT1]], align 4 +; CHECK-NEXT: ret void +; +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid + %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid + %load = load <4 x i8>, ptr addrspace(1) %gep, align 1 + %load1 = load <4 x i8>, ptr addrspace(1) %gep1, align 1 + %shuffle0_0 = shufflevector <4 x i8> %load, <4 x i8> %load1, <4 x i32> + %cvt = uitofp <4 x i8> %shuffle0_0 to <4 x float> + store <4 x float> %cvt, ptr addrspace(1) %out, align 16 + store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1, align 4 + ret void +} + +define amdgpu_kernel void @aliasinfo_v4f32_3v4i8_AS(ptr addrspace(1) %out, ptr addrspace(1) %out1, ptr addrspace(1) %in, ptr addrspace(1) %in1) nounwind { +; CHECK-LABEL: define amdgpu_kernel void @aliasinfo_v4f32_3v4i8_AS( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]], ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[ALIASINFO_V4F32_3V4I8_AS_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(288) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; CHECK-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_V4F32_3V4I8_AS_KERNARG_SEGMENT]], i64 36 +; CHECK-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 4, !invariant.load [[META0]] +; CHECK-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_V4F32_3V4I8_AS_KERNARG_SEGMENT]], i64 44 +; CHECK-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 4, !invariant.load [[META0]] +; CHECK-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_V4F32_3V4I8_AS_KERNARG_SEGMENT]], i64 52 +; CHECK-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 4, !invariant.load [[META0]] +; CHECK-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_V4F32_3V4I8_AS_KERNARG_SEGMENT]], i64 60 +; CHECK-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 4, !invariant.load [[META0]] +; CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[GEP:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[IN_LOAD]], i32 [[TID]] +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[IN1_LOAD]], i32 [[TID]] +; CHECK-NEXT: [[LOAD:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP]], align 1, !alias.scope [[META1]], !noalias [[META4]] +; CHECK-NEXT: [[LOAD1:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP1]], align 1, !alias.scope [[META1]], !noalias [[META4]] +; CHECK-NEXT: [[SHUFFLE0_0:%.*]] = shufflevector <4 x i8> [[LOAD]], <4 x i8> [[LOAD1]], <4 x i32> +; CHECK-NEXT: [[CVT:%.*]] = uitofp <4 x i8> 
[[SHUFFLE0_0]] to <4 x float> +; CHECK-NEXT: store <4 x float> [[CVT]], ptr addrspace(1) [[OUT_LOAD]], align 16, !alias.scope [[META4]], !noalias [[META1]] +; CHECK-NEXT: store <4 x i8> [[SHUFFLE0_0]], ptr addrspace(1) [[OUT1_LOAD]], align 4, !alias.scope [[META4]], !noalias [[META1]] +; CHECK-NEXT: ret void +; +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid + %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid + %load = load <4 x i8>, ptr addrspace(1) %gep, align 1, !alias.scope !4, !noalias !2 + %load1 = load <4 x i8>, ptr addrspace(1) %gep1, align 1, !alias.scope !4, !noalias !2 + %shuffle0_0 = shufflevector <4 x i8> %load, <4 x i8> %load1, <4 x i32> + %cvt = uitofp <4 x i8> %shuffle0_0 to <4 x float> + store <4 x float> %cvt, ptr addrspace(1) %out, align 16, !alias.scope !2, !noalias !4 + store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1, align 4, !alias.scope !2, !noalias !4 + ret void +} + +define amdgpu_kernel void @aliasinfo_v4f32_3v4i8_NA_AS(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %out1, ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %in1) nounwind { +; CHECK-LABEL: define amdgpu_kernel void @aliasinfo_v4f32_3v4i8_NA_AS( +; CHECK-SAME: ptr addrspace(1) noalias [[OUT:%.*]], ptr addrspace(1) noalias [[OUT1:%.*]], ptr addrspace(1) noalias [[IN:%.*]], ptr addrspace(1) noalias [[IN1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[ALIASINFO_V4F32_3V4I8_NA_AS_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(288) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[GEP:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[IN]], i32 [[TID]] +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[IN1]], i32 [[TID]] +; CHECK-NEXT: [[LOAD:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP]], align 1, !alias.scope [[META1]], !noalias [[META4]] +; CHECK-NEXT: [[LOAD1:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP1]], align 1, !alias.scope [[META1]], !noalias [[META4]] +; CHECK-NEXT: [[SHUFFLE0_0:%.*]] = shufflevector <4 x i8> [[LOAD]], <4 x i8> [[LOAD1]], <4 x i32> +; CHECK-NEXT: [[CVT:%.*]] = uitofp <4 x i8> [[SHUFFLE0_0]] to <4 x float> +; CHECK-NEXT: store <4 x float> [[CVT]], ptr addrspace(1) [[OUT]], align 16, !alias.scope [[META4]], !noalias [[META1]] +; CHECK-NEXT: store <4 x i8> [[SHUFFLE0_0]], ptr addrspace(1) [[OUT1]], align 4, !alias.scope [[META4]], !noalias [[META1]] +; CHECK-NEXT: ret void +; +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid + %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid + %load = load <4 x i8>, ptr addrspace(1) %gep, align 1, !alias.scope !4, !noalias !2 + %load1 = load <4 x i8>, ptr addrspace(1) %gep1, align 1, !alias.scope !4, !noalias !2 + %shuffle0_0 = shufflevector <4 x i8> %load, <4 x i8> %load1, <4 x i32> + %cvt = uitofp <4 x i8> %shuffle0_0 to <4 x float> + store <4 x float> %cvt, ptr addrspace(1) %out, align 16, !alias.scope !2, !noalias !4 + store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1, align 4, !alias.scope !2, !noalias !4 + ret void +} + +define amdgpu_kernel void @aliasinfo_10v16f16(ptr addrspace(3) %in, ptr addrspace(3) %out) #0 { +; CHECK-LABEL: define amdgpu_kernel void @aliasinfo_10v16f16( +; CHECK-SAME: ptr addrspace(3) [[IN:%.*]], ptr addrspace(3) [[OUT:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: 
[[ALIASINFO_10V16F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; CHECK-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[LOAD_0_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[IN]], i32 [[IDX]] +; CHECK-NEXT: [[LOAD_0:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_0_ADDR]], align 32 +; CHECK-NEXT: [[LOAD_1_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_0_ADDR]], i32 64 +; CHECK-NEXT: [[LOAD_1:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_1_ADDR]], align 32 +; CHECK-NEXT: [[LOAD_2_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_1_ADDR]], i32 128 +; CHECK-NEXT: [[LOAD_2:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_2_ADDR]], align 32 +; CHECK-NEXT: [[LOAD_3_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_2_ADDR]], i32 192 +; CHECK-NEXT: [[LOAD_3:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_3_ADDR]], align 32 +; CHECK-NEXT: [[LOAD_4_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_3_ADDR]], i32 256 +; CHECK-NEXT: [[LOAD_4:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_4_ADDR]], align 32 +; CHECK-NEXT: [[MAI_0:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_0]], <16 x half> [[LOAD_0]], <16 x half> [[LOAD_0]], i1 false) +; CHECK-NEXT: [[MAI_1:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_1]], <16 x half> [[LOAD_1]], <16 x half> [[LOAD_1]], i1 false) +; CHECK-NEXT: [[MAI_2:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_2]], <16 x half> [[LOAD_2]], <16 x half> [[LOAD_2]], i1 false) +; CHECK-NEXT: [[MAI_3:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_3]], <16 x half> [[LOAD_3]], <16 x half> [[LOAD_3]], i1 false) +; CHECK-NEXT: [[MAI_4:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_4]], <16 x half> [[LOAD_4]], <16 x half> [[LOAD_4]], i1 false) +; CHECK-NEXT: [[STORE_0_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 [[IDX]] +; CHECK-NEXT: store <16 x half> [[MAI_0]], ptr addrspace(3) [[STORE_0_ADDR]], align 32 +; CHECK-NEXT: [[STORE_1_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 64 +; CHECK-NEXT: store <16 x half> [[MAI_1]], ptr addrspace(3) [[STORE_1_ADDR]], align 32 +; CHECK-NEXT: [[STORE_2_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 128 +; CHECK-NEXT: store <16 x half> [[MAI_2]], ptr addrspace(3) [[STORE_2_ADDR]], align 32 +; CHECK-NEXT: [[STORE_3_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 192 +; CHECK-NEXT: store <16 x half> [[MAI_3]], ptr addrspace(3) [[STORE_3_ADDR]], align 32 +; CHECK-NEXT: [[STORE_4_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 256 +; CHECK-NEXT: store <16 x half> [[MAI_4]], ptr addrspace(3) [[STORE_4_ADDR]], align 32 +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) +; CHECK-NEXT: call void 
@llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) +; CHECK-NEXT: ret void +; +entry: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %load.0.addr = getelementptr <16 x half>, ptr addrspace(3) %in, i32 %idx + %load.0 = load <16 x half>, ptr addrspace(3) %load.0.addr + %load.1.addr = getelementptr <16 x half>, ptr addrspace(3) %load.0.addr, i32 64 + %load.1 = load <16 x half>, ptr addrspace(3) %load.1.addr + %load.2.addr = getelementptr <16 x half>, ptr addrspace(3) %load.1.addr, i32 128 + %load.2 = load <16 x half>, ptr addrspace(3) %load.2.addr + %load.3.addr = getelementptr <16 x half>, ptr addrspace(3) %load.2.addr, i32 192 + %load.3 = load <16 x half>, ptr addrspace(3) %load.3.addr + %load.4.addr = getelementptr <16 x half>, ptr addrspace(3) %load.3.addr, i32 256 + %load.4 = load <16 x half>, ptr addrspace(3) %load.4.addr + %mai.0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.0, <16 x half> %load.0, <16 x half> %load.0, i1 0) + %mai.1 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.1, <16 x half> %load.1, <16 x half> %load.1, i1 0) + %mai.2 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.2, <16 x half> %load.2, <16 x half> %load.2, i1 0) + %mai.3 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.3, <16 x half> %load.3, <16 x half> %load.3, i1 0) + %mai.4 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.4, <16 x half> %load.4, <16 x half> %load.4, i1 0) + %store.0.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 %idx + store <16 x half> %mai.0, ptr addrspace(3) %store.0.addr + %store.1.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 64 + store <16 x half> %mai.1, ptr addrspace(3) %store.1.addr + %store.2.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 128 + store <16 x half> %mai.2, ptr addrspace(3) %store.2.addr + %store.3.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 192 + store <16 x half> %mai.3, ptr addrspace(3) %store.3.addr + %store.4.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 256 + store <16 x half> %mai.4, ptr addrspace(3) %store.4.addr + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, 
i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) + ret void +} + +define amdgpu_kernel void @aliasinfo_10v16f16_NA(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { +; CHECK-LABEL: define amdgpu_kernel void @aliasinfo_10v16f16_NA( +; CHECK-SAME: ptr addrspace(3) noalias [[IN:%.*]], ptr addrspace(3) noalias [[OUT:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[ALIASINFO_10V16F16_NA_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; CHECK-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[LOAD_0_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[IN]], i32 [[IDX]] +; CHECK-NEXT: [[LOAD_0:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_0_ADDR]], align 32 +; CHECK-NEXT: [[LOAD_1_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_0_ADDR]], i32 64 +; CHECK-NEXT: [[LOAD_1:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_1_ADDR]], align 32 +; CHECK-NEXT: [[LOAD_2_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_1_ADDR]], i32 128 +; CHECK-NEXT: [[LOAD_2:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_2_ADDR]], align 32 +; CHECK-NEXT: [[LOAD_3_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_2_ADDR]], i32 192 +; CHECK-NEXT: [[LOAD_3:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_3_ADDR]], align 32 +; CHECK-NEXT: [[LOAD_4_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_3_ADDR]], i32 256 +; CHECK-NEXT: [[LOAD_4:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_4_ADDR]], align 32 +; CHECK-NEXT: [[MAI_0:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_0]], <16 x half> [[LOAD_0]], <16 x half> [[LOAD_0]], i1 false) +; CHECK-NEXT: [[MAI_1:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_1]], <16 x half> [[LOAD_1]], <16 x half> [[LOAD_1]], i1 false) +; CHECK-NEXT: [[MAI_2:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_2]], <16 x half> [[LOAD_2]], <16 x half> [[LOAD_2]], i1 false) +; CHECK-NEXT: [[MAI_3:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_3]], <16 x half> [[LOAD_3]], <16 x half> [[LOAD_3]], i1 false) +; CHECK-NEXT: [[MAI_4:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_4]], <16 x half> [[LOAD_4]], <16 x half> [[LOAD_4]], i1 false) +; CHECK-NEXT: [[STORE_0_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 [[IDX]] +; CHECK-NEXT: store <16 x half> [[MAI_0]], ptr addrspace(3) [[STORE_0_ADDR]], align 32 +; CHECK-NEXT: [[STORE_1_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 64 +; CHECK-NEXT: store <16 x half> [[MAI_1]], ptr addrspace(3) [[STORE_1_ADDR]], align 32 +; CHECK-NEXT: [[STORE_2_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 128 +; CHECK-NEXT: store <16 x half> [[MAI_2]], ptr addrspace(3) [[STORE_2_ADDR]], align 32 +; CHECK-NEXT: [[STORE_3_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 192 +; CHECK-NEXT: store <16 x half> [[MAI_3]], ptr addrspace(3) [[STORE_3_ADDR]], align 32 +; CHECK-NEXT: [[STORE_4_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) 
[[OUT]], i32 256 +; CHECK-NEXT: store <16 x half> [[MAI_4]], ptr addrspace(3) [[STORE_4_ADDR]], align 32 +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) +; CHECK-NEXT: ret void +; +entry: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %load.0.addr = getelementptr <16 x half>, ptr addrspace(3) %in, i32 %idx + %load.0 = load <16 x half>, ptr addrspace(3) %load.0.addr + %load.1.addr = getelementptr <16 x half>, ptr addrspace(3) %load.0.addr, i32 64 + %load.1 = load <16 x half>, ptr addrspace(3) %load.1.addr + %load.2.addr = getelementptr <16 x half>, ptr addrspace(3) %load.1.addr, i32 128 + %load.2 = load <16 x half>, ptr addrspace(3) %load.2.addr + %load.3.addr = getelementptr <16 x half>, ptr addrspace(3) %load.2.addr, i32 192 + %load.3 = load <16 x half>, ptr addrspace(3) %load.3.addr + %load.4.addr = getelementptr <16 x half>, ptr addrspace(3) %load.3.addr, i32 256 + %load.4 = load <16 x half>, ptr addrspace(3) %load.4.addr + %mai.0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.0, <16 x half> %load.0, <16 x half> %load.0, i1 0) + %mai.1 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.1, <16 x half> %load.1, <16 x half> %load.1, i1 0) + %mai.2 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.2, <16 x half> %load.2, <16 x half> %load.2, i1 0) + %mai.3 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.3, <16 x half> %load.3, <16 x half> %load.3, i1 0) + %mai.4 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.4, <16 x half> %load.4, <16 x half> %load.4, i1 0) + %store.0.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 %idx + store <16 x half> %mai.0, ptr addrspace(3) %store.0.addr + %store.1.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 64 + store <16 x half> %mai.1, ptr addrspace(3) %store.1.addr + %store.2.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 128 + store <16 x half> %mai.2, ptr addrspace(3) %store.2.addr + %store.3.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 192 + store <16 x half> %mai.3, ptr addrspace(3) %store.3.addr + %store.4.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 256 + store <16 x half> %mai.4, ptr addrspace(3) %store.4.addr + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 8, 
i32 1, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) + ret void +} + +define amdgpu_kernel void @aliasinfo_10v16f16_AS(ptr addrspace(3) %in, ptr addrspace(3) %out) #0 { +; CHECK-LABEL: define amdgpu_kernel void @aliasinfo_10v16f16_AS( +; CHECK-SAME: ptr addrspace(3) [[IN:%.*]], ptr addrspace(3) [[OUT:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[ALIASINFO_10V16F16_AS_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; CHECK-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[LOAD_0_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[IN]], i32 [[IDX]] +; CHECK-NEXT: [[LOAD_0:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_0_ADDR]], align 32, !alias.scope [[META1]], !noalias [[META4]] +; CHECK-NEXT: [[LOAD_1_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_0_ADDR]], i32 64 +; CHECK-NEXT: [[LOAD_1:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_1_ADDR]], align 32, !alias.scope [[META1]], !noalias [[META4]] +; CHECK-NEXT: [[LOAD_2_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_1_ADDR]], i32 128 +; CHECK-NEXT: [[LOAD_2:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_2_ADDR]], align 32, !alias.scope [[META1]], !noalias [[META4]] +; CHECK-NEXT: [[LOAD_3_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_2_ADDR]], i32 192 +; CHECK-NEXT: [[LOAD_3:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_3_ADDR]], align 32, !alias.scope [[META1]], !noalias [[META4]] +; CHECK-NEXT: [[LOAD_4_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_3_ADDR]], i32 256 +; CHECK-NEXT: [[LOAD_4:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_4_ADDR]], align 32, !alias.scope [[META1]], !noalias [[META4]] +; CHECK-NEXT: [[MAI_0:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_0]], <16 x half> [[LOAD_0]], <16 x half> [[LOAD_0]], i1 false) +; CHECK-NEXT: [[MAI_1:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_1]], <16 x half> [[LOAD_1]], <16 x half> [[LOAD_1]], i1 false) +; CHECK-NEXT: [[MAI_2:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_2]], <16 x half> [[LOAD_2]], <16 x half> [[LOAD_2]], i1 false) +; CHECK-NEXT: [[MAI_3:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_3]], <16 x half> [[LOAD_3]], <16 x half> [[LOAD_3]], i1 false) +; CHECK-NEXT: [[MAI_4:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_4]], <16 x half> [[LOAD_4]], <16 x half> [[LOAD_4]], i1 false) +; CHECK-NEXT: [[STORE_0_ADDR:%.*]] = 
getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 [[IDX]] +; CHECK-NEXT: store <16 x half> [[MAI_0]], ptr addrspace(3) [[STORE_0_ADDR]], align 32, !alias.scope [[META4]], !noalias [[META1]] +; CHECK-NEXT: [[STORE_1_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 64 +; CHECK-NEXT: store <16 x half> [[MAI_1]], ptr addrspace(3) [[STORE_1_ADDR]], align 32, !alias.scope [[META4]], !noalias [[META1]] +; CHECK-NEXT: [[STORE_2_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 128 +; CHECK-NEXT: store <16 x half> [[MAI_2]], ptr addrspace(3) [[STORE_2_ADDR]], align 32, !alias.scope [[META4]], !noalias [[META1]] +; CHECK-NEXT: [[STORE_3_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 192 +; CHECK-NEXT: store <16 x half> [[MAI_3]], ptr addrspace(3) [[STORE_3_ADDR]], align 32, !alias.scope [[META4]], !noalias [[META1]] +; CHECK-NEXT: [[STORE_4_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 256 +; CHECK-NEXT: store <16 x half> [[MAI_4]], ptr addrspace(3) [[STORE_4_ADDR]], align 32, !alias.scope [[META4]], !noalias [[META1]] +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) +; CHECK-NEXT: ret void +; +entry: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %load.0.addr = getelementptr <16 x half>, ptr addrspace(3) %in, i32 %idx + %load.0 = load <16 x half>, ptr addrspace(3) %load.0.addr, !alias.scope !4, !noalias !2 + %load.1.addr = getelementptr <16 x half>, ptr addrspace(3) %load.0.addr, i32 64 + %load.1 = load <16 x half>, ptr addrspace(3) %load.1.addr, !alias.scope !4, !noalias !2 + %load.2.addr = getelementptr <16 x half>, ptr addrspace(3) %load.1.addr, i32 128 + %load.2 = load <16 x half>, ptr addrspace(3) %load.2.addr, !alias.scope !4, !noalias !2 + %load.3.addr = getelementptr <16 x half>, ptr addrspace(3) %load.2.addr, i32 192 + %load.3 = load <16 x half>, ptr addrspace(3) %load.3.addr, !alias.scope !4, !noalias !2 + %load.4.addr = getelementptr <16 x half>, ptr addrspace(3) %load.3.addr, i32 256 + %load.4 = load <16 x half>, ptr addrspace(3) %load.4.addr, !alias.scope !4, !noalias !2 + %mai.0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.0, <16 x half> %load.0, <16 x half> %load.0, i1 0) + %mai.1 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.1, <16 x half> %load.1, <16 x half> %load.1, i1 0) + %mai.2 = call <16 x half> 
@llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.2, <16 x half> %load.2, <16 x half> %load.2, i1 0) + %mai.3 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.3, <16 x half> %load.3, <16 x half> %load.3, i1 0) + %mai.4 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.4, <16 x half> %load.4, <16 x half> %load.4, i1 0) + %store.0.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 %idx + store <16 x half> %mai.0, ptr addrspace(3) %store.0.addr, !alias.scope !2, !noalias !4 + %store.1.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 64 + store <16 x half> %mai.1, ptr addrspace(3) %store.1.addr, !alias.scope !2, !noalias !4 + %store.2.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 128 + store <16 x half> %mai.2, ptr addrspace(3) %store.2.addr, !alias.scope !2, !noalias !4 + %store.3.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 192 + store <16 x half> %mai.3, ptr addrspace(3) %store.3.addr, !alias.scope !2, !noalias !4 + %store.4.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 256 + store <16 x half> %mai.4, ptr addrspace(3) %store.4.addr, !alias.scope !2, !noalias !4 + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) + ret void +} + +define amdgpu_kernel void @aliasinfo_10v16f16_NA_AS(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { +; CHECK-LABEL: define amdgpu_kernel void @aliasinfo_10v16f16_NA_AS( +; CHECK-SAME: ptr addrspace(3) noalias [[IN:%.*]], ptr addrspace(3) noalias [[OUT:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[ALIASINFO_10V16F16_NA_AS_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; CHECK-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[LOAD_0_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[IN]], i32 [[IDX]] +; CHECK-NEXT: [[LOAD_0:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_0_ADDR]], align 32, !alias.scope [[META1]], !noalias [[META4]] +; CHECK-NEXT: [[LOAD_1_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_0_ADDR]], i32 64 +; CHECK-NEXT: [[LOAD_1:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_1_ADDR]], align 32, !alias.scope [[META1]], !noalias [[META4]] +; CHECK-NEXT: [[LOAD_2_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_1_ADDR]], i32 128 +; CHECK-NEXT: [[LOAD_2:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_2_ADDR]], align 32, !alias.scope [[META1]], !noalias [[META4]] +; CHECK-NEXT: [[LOAD_3_ADDR:%.*]] = getelementptr <16 x half>, 
ptr addrspace(3) [[LOAD_2_ADDR]], i32 192 +; CHECK-NEXT: [[LOAD_3:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_3_ADDR]], align 32, !alias.scope [[META1]], !noalias [[META4]] +; CHECK-NEXT: [[LOAD_4_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_3_ADDR]], i32 256 +; CHECK-NEXT: [[LOAD_4:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_4_ADDR]], align 32, !alias.scope [[META1]], !noalias [[META4]] +; CHECK-NEXT: [[MAI_0:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_0]], <16 x half> [[LOAD_0]], <16 x half> [[LOAD_0]], i1 false) +; CHECK-NEXT: [[MAI_1:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_1]], <16 x half> [[LOAD_1]], <16 x half> [[LOAD_1]], i1 false) +; CHECK-NEXT: [[MAI_2:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_2]], <16 x half> [[LOAD_2]], <16 x half> [[LOAD_2]], i1 false) +; CHECK-NEXT: [[MAI_3:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_3]], <16 x half> [[LOAD_3]], <16 x half> [[LOAD_3]], i1 false) +; CHECK-NEXT: [[MAI_4:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_4]], <16 x half> [[LOAD_4]], <16 x half> [[LOAD_4]], i1 false) +; CHECK-NEXT: [[STORE_0_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 [[IDX]] +; CHECK-NEXT: store <16 x half> [[MAI_0]], ptr addrspace(3) [[STORE_0_ADDR]], align 32, !alias.scope [[META4]], !noalias [[META1]] +; CHECK-NEXT: [[STORE_1_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 64 +; CHECK-NEXT: store <16 x half> [[MAI_1]], ptr addrspace(3) [[STORE_1_ADDR]], align 32, !alias.scope [[META4]], !noalias [[META1]] +; CHECK-NEXT: [[STORE_2_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 128 +; CHECK-NEXT: store <16 x half> [[MAI_2]], ptr addrspace(3) [[STORE_2_ADDR]], align 32, !alias.scope [[META4]], !noalias [[META1]] +; CHECK-NEXT: [[STORE_3_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 192 +; CHECK-NEXT: store <16 x half> [[MAI_3]], ptr addrspace(3) [[STORE_3_ADDR]], align 32, !alias.scope [[META4]], !noalias [[META1]] +; CHECK-NEXT: [[STORE_4_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 256 +; CHECK-NEXT: store <16 x half> [[MAI_4]], ptr addrspace(3) [[STORE_4_ADDR]], align 32, !alias.scope [[META4]], !noalias [[META1]] +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) +; CHECK-NEXT: call void 
@llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) +; CHECK-NEXT: ret void +; +entry: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %load.0.addr = getelementptr <16 x half>, ptr addrspace(3) %in, i32 %idx + %load.0 = load <16 x half>, ptr addrspace(3) %load.0.addr, !alias.scope !4, !noalias !2 + %load.1.addr = getelementptr <16 x half>, ptr addrspace(3) %load.0.addr, i32 64 + %load.1 = load <16 x half>, ptr addrspace(3) %load.1.addr, !alias.scope !4, !noalias !2 + %load.2.addr = getelementptr <16 x half>, ptr addrspace(3) %load.1.addr, i32 128 + %load.2 = load <16 x half>, ptr addrspace(3) %load.2.addr, !alias.scope !4, !noalias !2 + %load.3.addr = getelementptr <16 x half>, ptr addrspace(3) %load.2.addr, i32 192 + %load.3 = load <16 x half>, ptr addrspace(3) %load.3.addr, !alias.scope !4, !noalias !2 + %load.4.addr = getelementptr <16 x half>, ptr addrspace(3) %load.3.addr, i32 256 + %load.4 = load <16 x half>, ptr addrspace(3) %load.4.addr, !alias.scope !4, !noalias !2 + %mai.0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.0, <16 x half> %load.0, <16 x half> %load.0, i1 0) + %mai.1 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.1, <16 x half> %load.1, <16 x half> %load.1, i1 0) + %mai.2 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.2, <16 x half> %load.2, <16 x half> %load.2, i1 0) + %mai.3 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.3, <16 x half> %load.3, <16 x half> %load.3, i1 0) + %mai.4 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.4, <16 x half> %load.4, <16 x half> %load.4, i1 0) + %store.0.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 %idx + store <16 x half> %mai.0, ptr addrspace(3) %store.0.addr, !alias.scope !2, !noalias !4 + %store.1.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 64 + store <16 x half> %mai.1, ptr addrspace(3) %store.1.addr, !alias.scope !2, !noalias !4 + %store.2.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 128 + store <16 x half> %mai.2, ptr addrspace(3) %store.2.addr, !alias.scope !2, !noalias !4 + %store.3.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 192 + store <16 x half> %mai.3, ptr addrspace(3) %store.3.addr, !alias.scope !2, !noalias !4 + %store.4.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 256 + store <16 x half> %mai.4, ptr addrspace(3) %store.4.addr, !alias.scope !2, !noalias !4 + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + call void @llvm.amdgcn.sched.group.barrier(i32 512, 
i32 2, i32 0)
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #2
+declare void @llvm.amdgcn.sched.group.barrier(i32, i32, i32) #1
+declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half>, <16 x half>, i1 immarg) #1
+
+attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1,32" }
+attributes #1 = { nounwind }
+attributes #2 = { nounwind readnone speculatable }
+
+!0 = distinct !{!0, !"alias_scope_0"}
+!1 = distinct !{!1, !0, !"alias_scope_1"}
+!2 = !{!1}
+!3 = distinct !{!3, !0, !"alias_scope_3"}
+!4 = !{!3}
+;.
+; CHECK: [[META0]] = !{}
+; CHECK: [[META1]] = !{[[META2:![0-9]+]]}
+; CHECK: [[META2]] = distinct !{[[META2]], [[META3:![0-9]+]], !"alias_scope_3"}
+; CHECK: [[META3]] = distinct !{[[META3]], !"alias_scope_0"}
+; CHECK: [[META4]] = !{[[META5:![0-9]+]]}
+; CHECK: [[META5]] = distinct !{[[META5]], [[META3]], !"alias_scope_1"}
+;.

From 7396ea13449c4f0d6bc2a66d589ebcee387725a4 Mon Sep 17 00:00:00 2001
From: Leon Clark
Date: Fri, 14 Nov 2025 15:04:50 +0000
Subject: [PATCH 02/11] Update kernarg lowering and tests.

---
 .../AMDGPU/AMDGPULowerKernelArguments.cpp     | 157 +++++-
 llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll   | 198 ++++----
 llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll   | 280 +++++------
 llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll     |  78 +--
 llvm/test/CodeGen/AMDGPU/lower-kernargs.ll    |  55 ++-
 .../CodeGen/AMDGPU/lower-noalias-kernargs.ll  | 246 ++++++----
 llvm/test/CodeGen/AMDGPU/mad-combine.ll       | 448 ++++++++----------
 llvm/test/CodeGen/AMDGPU/sub.ll               |  58 +--
 llvm/test/CodeGen/AMDGPU/valu-i1.ll           |  26 +-
 9 files changed, 876 insertions(+), 670 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
index 755b44c0ca93a..e1b613cc233bc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -13,13 +13,21 @@
 
 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/ScopedNoAliasAA.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Argument.h"
 #include "llvm/IR/Attributes.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/Target/TargetMachine.h"
+#include <string>
 
 #define DEBUG_TYPE "amdgpu-lower-kernel-arguments"
 
@@ -58,6 +66,151 @@ static BasicBlock::iterator getInsertPt(BasicBlock &BB) {
   return InsPt;
 }
 
+static void addAliasScopeMetadata(Function &F, DataLayout const &DL) {
+  // Collect noalias arguments.
+  auto NoAliasArgs = SmallVector<const Argument *>();
+
+  for (auto &Arg : F.args())
+    if (Arg.hasNoAliasAttr() && !Arg.use_empty())
+      NoAliasArgs.push_back(&Arg);
+
+  if (NoAliasArgs.empty())
+    return;
+
+  // Add alias scopes for each noalias argument.
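+  // One anonymous alias-scope domain is created per function, and each
+  // noalias argument gets its own scope within that domain, mirroring the
+  // scheme used by the inliner's AddAliasScopeMetadata.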
+  auto MDB = MDBuilder(F.getContext());
+  auto NewScopes = DenseMap<const Argument *, MDNode *>();
+  auto *NewDomain = MDB.createAnonymousAliasScopeDomain(F.getName());
+
+  for (auto I = 0u; I < NoAliasArgs.size(); ++I) {
+    auto *Arg = NoAliasArgs[I];
+    auto Name = std::string(F.getName());
+
+    if (Arg->hasName())
+      Name += std::string(": %") + std::string(Arg->getName());
+    else
+      Name += std::string(": argument ") + std::to_string(I);
+
+    auto *NewScope = MDB.createAnonymousAliasScope(NewDomain, Name);
+    NewScopes.insert(std::make_pair(Arg, NewScope));
+  }
+
+  // Iterate over all instructions.
+  auto DT = DominatorTree();
+  DT.recalculate(F);
+
+  for (auto Inst = inst_begin(F); Inst != inst_end(F); ++Inst) {
+    // If instruction accesses memory, collect its pointer arguments.
+    auto *I = &(*Inst);
+    auto IsFuncCall = false;
+    auto PtrArgs = SmallVector<const Value *>();
+
+    if (auto *LI = dyn_cast<LoadInst>(I))
+      PtrArgs.push_back(LI->getPointerOperand());
+    else if (auto *SI = dyn_cast<StoreInst>(I))
+      PtrArgs.push_back(SI->getPointerOperand());
+    else if (auto *VAAI = dyn_cast<VAArgInst>(I))
+      PtrArgs.push_back(VAAI->getPointerOperand());
+    else if (auto *CXI = dyn_cast<AtomicCmpXchgInst>(I))
+      PtrArgs.push_back(CXI->getPointerOperand());
+    else if (auto *RMWI = dyn_cast<AtomicRMWInst>(I))
+      PtrArgs.push_back(RMWI->getPointerOperand());
+    else if (auto *Call = dyn_cast<CallBase>(I)) {
+      if (Call->doesNotAccessMemory())
+        continue;
+
+      IsFuncCall = true;
+
+      for (auto &Arg : Call->args()) {
+        if (!Arg->getType()->isPointerTy())
+          continue;
+
+        PtrArgs.push_back(Arg);
+      }
+    }
+
+    if (PtrArgs.empty() && !IsFuncCall)
+      continue;
+
+    // Collect underlying objects of pointer arguments.
+    auto Scopes = SmallVector<Metadata *>();
+    auto ObjSet = SmallPtrSet<const Value *, 4>();
+    auto NoAliases = SmallVector<Metadata *>();
+
+    for (auto &Ptr : PtrArgs) {
+      auto Objects = SmallVector<const Value *>();
+      getUnderlyingObjects(Ptr, Objects);
+      ObjSet.insert_range(Objects);
+    }
+
+    auto RequiresNoCaptureBefore = false;
+    auto UsesUnknownObject = false;
+    auto UsesAliasingPtr = false;
+
+    for (auto *Val : ObjSet) {
+      if (isa<ConstantInt>(Val) || isa<ConstantFP>(Val) ||
+          isa<ConstantPointerNull>(Val) || isa<ConstantDataVector>(Val) ||
+          isa<UndefValue>(Val))
+        continue;
+
+      if (auto *Arg = dyn_cast<Argument>(Val)) {
+        if (!Arg->hasAttribute(Attribute::NoAlias))
+          UsesAliasingPtr = true;
+      }
+      else
+        UsesAliasingPtr = true;
+
+      if (isEscapeSource(Val))
+        RequiresNoCaptureBefore = true;
+      else if (!isa<Argument>(Val) && !isIdentifiedObject(Val))
+        UsesUnknownObject = true;
+    }
+
+    if (UsesUnknownObject)
+      continue;
+
+    // Collect noalias scopes for instruction.
+    for (auto *Arg : NoAliasArgs) {
+      if (ObjSet.contains(Arg))
+        continue;
+
+      if (!RequiresNoCaptureBefore ||
+          !capturesAnything(PointerMayBeCapturedBefore(
+              Arg, false, I, &DT, false, CaptureComponents::Provenance)))
+        NoAliases.push_back(NewScopes[Arg]);
+    }
+
+    // Add noalias metadata to instruction.
+    if (!NoAliases.empty()) {
+      auto *NewMD = MDNode::concatenate(
+          Inst->getMetadata(LLVMContext::MD_noalias),
+          MDNode::get(F.getContext(), NoAliases));
+      Inst->setMetadata(LLVMContext::MD_noalias, NewMD);
+    }
+
+    // Collect scopes for alias.scope metadata.
+    if (!UsesAliasingPtr)
+      for (auto *Arg : NoAliasArgs) {
+        if (ObjSet.count(Arg))
+          Scopes.push_back(NewScopes[Arg]);
+      }
+
+    // Add alias.scope metadata to instruction.
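+    // Scopes is non-empty only when every underlying object of the accessed
+    // pointers is itself a noalias argument, so the access is known to stay
+    // within the scopes listed here.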
+    if (!Scopes.empty()) {
+      auto *NewMD = MDNode::concatenate(
+          Inst->getMetadata(LLVMContext::MD_alias_scope),
+          MDNode::get(F.getContext(), Scopes));
+      Inst->setMetadata(LLVMContext::MD_alias_scope, NewMD);
+    }
+  }
+}
+
 static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
   CallingConv::ID CC = F.getCallingConv();
   if (CC != CallingConv::AMDGPU_KERNEL || F.arg_empty())
@@ -86,6 +239,9 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
       Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize));
 
   uint64_t ExplicitArgOffset = 0;
+
+  addAliasScopeMetadata(F, F.getParent()->getDataLayout());
+
   for (Argument &Arg : F.args()) {
     const bool IsByRef = Arg.hasByRefAttr();
     Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
@@ -124,11 +280,6 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
           PT->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) &&
          !ST.hasUsableDSOffset())
         continue;
-
-      // FIXME: We can replace this with equivalent alias.scope/noalias
-      // metadata, but this appears to be a lot of work.
-      if (Arg.hasNoAliasAttr())
-        continue;
     }
 
     auto *VT = dyn_cast<FixedVectorType>(ArgTy);
@@ -215,8 +366,6 @@
       }
     }
 
-    // TODO: Convert noalias arg to !noalias
-
     if (DoShiftOpt) {
       Value *ExtractBits = OffsetDiff == 0 ?
        Load : Builder.CreateLShr(Load, OffsetDiff * 8);
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 07e6a76d14cf9..cb05b5978c384 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -105,11 +105,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out,
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_dword v0, v[0:1]
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_ffbh_u32_e32 v2, v0
+; VI-NEXT:    flat_load_dword v2, v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_ffbh_u32_e32 v2, v2
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -181,8 +181,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(ptr addrspace(1) noalias %out
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v1, v1
 ; VI-NEXT:    v_ffbh_u32_e32 v0, v0
@@ -261,8 +261,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(ptr addrspace(1) noalias %out
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v5, s1
 ; VI-NEXT:    v_mov_b32_e32 v4, s0
+; VI-NEXT:    v_mov_b32_e32 v5, s1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbh_u32_e32 v3, v3
 ; VI-NEXT:    v_ffbh_u32_e32 v2, v2
@@ -534,13 +534,13 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no
 ; VI-LABEL: s_ctlz_zero_undef_i64_with_select:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    v_mov_b32_e32 v3, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_flbit_i32_b64 s2, s[2:3]
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_flbit_i32_b64 s0, s[2:3]
+; VI-NEXT:
v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; EG-LABEL: s_ctlz_zero_undef_i64_with_select: @@ -605,15 +605,15 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_load_ubyte v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v0 -; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; VI-NEXT: v_ffbh_u32_e32 v1, v1 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc +; VI-NEXT: flat_load_ubyte v2, v[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v3, 24, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; VI-NEXT: v_ffbh_u32_e32 v3, v3 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, 32, v3, vcc ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -706,20 +706,20 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_load_ubyte v2, v[2:3] -; VI-NEXT: flat_load_ubyte v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_readfirstlane_b32 s2, v2 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_readfirstlane_b32 s3, v0 -; VI-NEXT: s_lshl_b32 s2, s2, 8 -; VI-NEXT: s_or_b32 s2, s2, s3 -; VI-NEXT: s_lshl_b32 s3, s2, 16 -; VI-NEXT: s_flbit_i32_b32 s3, s3 -; VI-NEXT: s_and_b32 s2, s2, 0xffff -; VI-NEXT: s_cselect_b32 s2, s3, 32 +; VI-NEXT: flat_load_ubyte v3, v[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_readfirstlane_b32 s0, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_readfirstlane_b32 s1, v3 +; VI-NEXT: s_lshl_b32 s0, s0, 8 +; VI-NEXT: s_or_b32 s0, s0, s1 +; VI-NEXT: s_lshl_b32 s1, s0, 16 +; VI-NEXT: s_flbit_i32_b32 s1, s1 +; VI-NEXT: s_and_b32 s0, s0, 0xffff +; VI-NEXT: s_cselect_b32 s0, s1, 32 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -813,37 +813,37 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s4, s2, 3 +; VI-NEXT: s_add_u32 s4, s2, 1 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: s_add_u32 s4, s2, 2 +; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: s_add_u32 s2, s2, 1 -; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: s_add_u32 s2, s2, 2 ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v7, s3 +; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v7, s3 ; VI-NEXT: v_mov_b32_e32 v6, s2 ; VI-NEXT: flat_load_ubyte v2, v[2:3] -; VI-NEXT: flat_load_ubyte v3, v[4:5] -; VI-NEXT: flat_load_ubyte v4, v[6:7] -; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: flat_load_ubyte v3, v[0:1] +; VI-NEXT: flat_load_ubyte v4, v[4:5] +; VI-NEXT: flat_load_ubyte v5, v[6:7] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: 
v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_e32 v0, v2, v0 -; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_ffbh_u32_e32 v0, v0 -; VI-NEXT: v_min_u32_e32 v2, 32, v0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_ffbh_u32_e32 v2, v2 +; VI-NEXT: v_min_u32_e32 v2, 32, v2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -964,29 +964,30 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v7, s5 ; VI-NEXT: v_mov_b32_e32 v6, s4 -; VI-NEXT: s_add_u32 s4, s2, 3 +; VI-NEXT: s_add_u32 s4, s2, 1 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v9, s5 ; VI-NEXT: v_mov_b32_e32 v8, s4 -; VI-NEXT: s_add_u32 s4, s2, 2 +; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: flat_load_ubyte v10, v[0:1] ; VI-NEXT: flat_load_ubyte v11, v[2:3] ; VI-NEXT: flat_load_ubyte v12, v[4:5] ; VI-NEXT: flat_load_ubyte v6, v[6:7] ; VI-NEXT: flat_load_ubyte v7, v[8:9] +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: s_add_u32 s4, s2, 1 -; VI-NEXT: s_addc_u32 s5, s3, 0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_add_u32 s2, s2, 2 ; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: flat_load_ubyte v8, v[0:1] ; VI-NEXT: flat_load_ubyte v2, v[2:3] ; VI-NEXT: flat_load_ubyte v3, v[4:5] -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v10 ; VI-NEXT: s_waitcnt vmcnt(6) @@ -1000,19 +1001,18 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 ; VI-NEXT: v_ffbh_u32_e32 v4, v4 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v5, v8 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: v_or_b32_e32 v0, v0, v2 -; VI-NEXT: v_ffbh_u32_e32 v0, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; VI-NEXT: v_min_u32_e32 v0, v0, v4 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_min_u32_e32 v0, 64, v0 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v5 +; VI-NEXT: v_ffbh_u32_e32 v2, v2 +; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v2 +; VI-NEXT: v_min_u32_e32 v2, v2, v4 +; VI-NEXT: v_min_u32_e32 v2, 64, v2 +; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctlz_zero_undef_i64_with_select: @@ -1118,12 +1118,12 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: 
v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_ubyte v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; VI-NEXT: v_ffbh_u32_e32 v2, v0 +; VI-NEXT: flat_load_ubyte v2, v[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; VI-NEXT: v_ffbh_u32_e32 v2, v2 ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1258,10 +1258,10 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_flbit_i32_b64 s2, s[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_flbit_i32_b64 s0, s[2:3] ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1504,11 +1504,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) no ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_ffbh_u32_e32 v2, v0 +; VI-NEXT: flat_load_dword v2, v[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_ffbh_u32_e32 v2, v2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1583,11 +1583,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(ptr addrspace(1) no ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_ffbh_u32_e32 v2, v0 +; VI-NEXT: flat_load_dword v2, v[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_ffbh_u32_e32 v2, v2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1660,11 +1660,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_ubyte v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_ffbh_u32_e32 v2, v0 +; VI-NEXT: flat_load_ubyte v2, v[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_ffbh_u32_e32 v2, v2 ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1857,13 +1857,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(ptr addrspace(1) noali ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_ffbh_u32_e32 v1, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc +; VI-NEXT: flat_load_dword v2, v[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_ffbh_u32_e32 v3, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1941,13 +1941,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(ptr addrspace(1) noali ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: 
flat_load_dword v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_ffbh_u32_e32 v1, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc +; VI-NEXT: flat_load_dword v2, v[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_ffbh_u32_e32 v3, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -2025,13 +2025,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(ptr addrspace(1 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_ffbh_u32_e32 v1, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 -; VI-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc +; VI-NEXT: flat_load_dword v2, v[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_ffbh_u32_e32 v3, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -2110,13 +2110,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(ptr addrspace(1 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_ffbh_u32_e32 v1, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 -; VI-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc +; VI-NEXT: flat_load_dword v2, v[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_ffbh_u32_e32 v3, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll index 137acd34ecc2a..cbfe99a4e7faf 100644 --- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll @@ -92,11 +92,11 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_ffbl_b32_e32 v2, v0 +; VI-NEXT: flat_load_dword v2, v[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_ffbl_b32_e32 v2, v2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -168,8 +168,8 @@ define amdgpu_kernel void @v_cttz_zero_undef_v2i32(ptr addrspace(1) noalias %out ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbl_b32_e32 v1, v1 ; VI-NEXT: v_ffbl_b32_e32 v0, v0 @@ -248,8 +248,8 @@ define amdgpu_kernel void @v_cttz_zero_undef_v4i32(ptr addrspace(1) noalias %out ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbl_b32_e32 
v3, v3 ; VI-NEXT: v_ffbl_b32_e32 v2, v2 @@ -511,13 +511,13 @@ define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; VI-LABEL: s_cttz_zero_undef_i64_with_select: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ff1_i32_b64 s2, s[2:3] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_ff1_i32_b64 s0, s[2:3] +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; EG-LABEL: s_cttz_zero_undef_i64_with_select: @@ -581,14 +581,14 @@ define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_load_ubyte v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_ffbl_b32_e32 v1, v0 -; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc +; VI-NEXT: flat_load_ubyte v2, v[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_ffbl_b32_e32 v3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, 32, v3, vcc ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -677,17 +677,17 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_load_ubyte v2, v[2:3] -; VI-NEXT: flat_load_ubyte v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_ffbl_b32_e32 v1, v0 -; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc +; VI-NEXT: flat_load_ubyte v3, v[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: v_ffbl_b32_e32 v3, v2 +; VI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, 32, v3, vcc ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -778,37 +778,37 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s4, s2, 3 +; VI-NEXT: s_add_u32 s4, s2, 1 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: s_add_u32 s4, s2, 2 +; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: s_add_u32 s2, s2, 1 -; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: s_add_u32 s2, s2, 2 ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v7, s3 +; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v7, s3 ; VI-NEXT: v_mov_b32_e32 v6, s2 ; VI-NEXT: flat_load_ubyte v2, v[2:3] -; VI-NEXT: flat_load_ubyte v3, v[4:5] -; VI-NEXT: flat_load_ubyte v4, v[6:7] -; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: flat_load_ubyte v3, v[0:1] +; VI-NEXT: 
flat_load_ubyte v4, v[4:5] +; VI-NEXT: flat_load_ubyte v5, v[6:7] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_e32 v0, v2, v0 -; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_ffbl_b32_e32 v0, v0 -; VI-NEXT: v_min_u32_e32 v2, 32, v0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_ffbl_b32_e32 v2, v2 +; VI-NEXT: v_min_u32_e32 v2, 32, v2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -929,55 +929,55 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v7, s5 ; VI-NEXT: v_mov_b32_e32 v6, s4 -; VI-NEXT: s_add_u32 s4, s2, 3 +; VI-NEXT: s_add_u32 s4, s2, 1 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v9, s5 ; VI-NEXT: v_mov_b32_e32 v8, s4 -; VI-NEXT: s_add_u32 s4, s2, 2 -; VI-NEXT: flat_load_ubyte v10, v[0:1] -; VI-NEXT: flat_load_ubyte v11, v[2:3] -; VI-NEXT: flat_load_ubyte v12, v[4:5] -; VI-NEXT: flat_load_ubyte v6, v[6:7] -; VI-NEXT: flat_load_ubyte v7, v[8:9] +; VI-NEXT: s_add_u32 s4, s2, 3 +; VI-NEXT: v_mov_b32_e32 v11, s3 ; VI-NEXT: s_addc_u32 s5, s3, 0 +; VI-NEXT: v_mov_b32_e32 v10, s2 +; VI-NEXT: flat_load_ubyte v12, v[0:1] +; VI-NEXT: flat_load_ubyte v13, v[2:3] +; VI-NEXT: flat_load_ubyte v4, v[4:5] +; VI-NEXT: flat_load_ubyte v5, v[6:7] +; VI-NEXT: flat_load_ubyte v6, v[8:9] +; VI-NEXT: flat_load_ubyte v7, v[10:11] ; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_add_u32 s2, s2, 2 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: s_add_u32 s4, s2, 1 -; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_mov_b32_e32 v5, s3 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: flat_load_ubyte v8, v[0:1] ; VI-NEXT: flat_load_ubyte v2, v[2:3] -; VI-NEXT: flat_load_ubyte v3, v[4:5] -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v10 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v12 ; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_or_b32_e32 v4, v4, v11 +; VI-NEXT: v_or_b32_e32 v3, v3, v13 ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v12 +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v4, v3 ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 -; VI-NEXT: v_ffbl_b32_e32 v4, v4 -; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v4 +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v6 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v4, v7 +; VI-NEXT: v_ffbl_b32_e32 v3, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v8 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: v_or_b32_e32 v0, v0, v2 -; VI-NEXT: v_ffbl_b32_e32 v0, v0 -; VI-NEXT: v_min_u32_e32 v0, v4, v0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_min_u32_e32 v0, 64, v0 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v4 +; VI-NEXT: v_ffbl_b32_e32 v2, v2 +; VI-NEXT: v_min_u32_e32 v2, v3, v2 +; VI-NEXT: v_min_u32_e32 v2, 64, v2 +; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_cttz_zero_undef_i64_with_select: @@ -1091,36 +1091,36 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s4, s2, 3 +; VI-NEXT: s_add_u32 s4, s2, 1 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: s_add_u32 s4, s2, 2 +; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: s_add_u32 s2, s2, 1 -; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: s_add_u32 s2, s2, 2 ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v7, s3 +; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v7, s3 ; VI-NEXT: v_mov_b32_e32 v6, s2 ; VI-NEXT: flat_load_ubyte v2, v[2:3] -; VI-NEXT: flat_load_ubyte v3, v[4:5] -; VI-NEXT: flat_load_ubyte v4, v[6:7] -; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: flat_load_ubyte v3, v[0:1] +; VI-NEXT: flat_load_ubyte v4, v[4:5] +; VI-NEXT: flat_load_ubyte v5, v[6:7] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_e32 v0, v2, v0 -; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_ffbl_b32_e32 v2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_ffbl_b32_e32 v2, v2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1213,36 +1213,36 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s4, s2, 3 +; VI-NEXT: s_add_u32 s4, s2, 1 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: s_add_u32 s4, s2, 2 +; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: s_add_u32 s2, s2, 1 -; VI-NEXT: s_addc_u32 s3, s3, 0 +; 
VI-NEXT: s_add_u32 s2, s2, 2 ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v7, s3 +; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v7, s3 ; VI-NEXT: v_mov_b32_e32 v6, s2 ; VI-NEXT: flat_load_ubyte v2, v[2:3] -; VI-NEXT: flat_load_ubyte v3, v[4:5] -; VI-NEXT: flat_load_ubyte v4, v[6:7] -; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: flat_load_ubyte v3, v[0:1] +; VI-NEXT: flat_load_ubyte v4, v[4:5] +; VI-NEXT: flat_load_ubyte v5, v[6:7] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_e32 v0, v2, v0 -; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_ffbl_b32_e32 v2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_ffbl_b32_e32 v2, v2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1338,39 +1338,39 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s4, s2, 3 +; VI-NEXT: s_add_u32 s4, s2, 1 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: s_add_u32 s4, s2, 2 +; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: s_add_u32 s2, s2, 1 -; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: s_add_u32 s2, s2, 2 ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v7, s3 +; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v7, s3 ; VI-NEXT: v_mov_b32_e32 v6, s2 ; VI-NEXT: flat_load_ubyte v2, v[2:3] -; VI-NEXT: flat_load_ubyte v3, v[4:5] -; VI-NEXT: flat_load_ubyte v4, v[6:7] -; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: flat_load_ubyte v3, v[0:1] +; VI-NEXT: flat_load_ubyte v4, v[4:5] +; VI-NEXT: flat_load_ubyte v5, v[6:7] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_e32 v0, v2, v0 -; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_ffbl_b32_e32 v0, v0 -; VI-NEXT: v_min_u32_e32 v0, 32, v0 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0 -; VI-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_ffbl_b32_e32 v2, v2 +; VI-NEXT: v_min_u32_e32 v2, 32, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc ; 
VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1455,11 +1455,11 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_load_ubyte v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_ffbl_b32_e32 v2, v0 +; VI-NEXT: flat_load_ubyte v2, v[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_ffbl_b32_e32 v2, v2 ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1541,19 +1541,19 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 1 ; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: flat_load_ubyte v4, v[0:1] ; VI-NEXT: flat_load_ubyte v2, v[2:3] -; VI-NEXT: flat_load_ubyte v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_ffbl_b32_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_ffbl_b32_e32 v2, v2 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll index 14897b68bf57b..95cd249f84413 100644 --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -1466,10 +1466,12 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1 ; SI-NEXT: s_mov_b64 s[12:13], s[6:7] ; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[12:15], 0 addr64 offset:2 ; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s8, s2 -; SI-NEXT: s_mov_b32 s9, s3 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v2 ; SI-NEXT: s_waitcnt vmcnt(1) @@ -1485,15 +1487,15 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_alignbit_b32 v4, v4, v5, 24 ; SI-NEXT: v_or_b32_e32 v4, v4, v6 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: load_v4i8_to_v4f32_unaligned_multiuse: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s8, 0x4000405 +; VI-NEXT: s_mov_b32 s12, 0x4000405 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v0 @@ -1515,10 +1517,12 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1 ; VI-NEXT: flat_load_ubyte v4, v[0:1] ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_mov_b32 s4, s2 -; VI-NEXT: s_mov_b32 
s5, s3 -; VI-NEXT: s_mov_b32 s2, s6 -; VI-NEXT: s_mov_b32 s3, s7 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v6 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v6 @@ -1531,9 +1535,9 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1 ; VI-NEXT: v_or_b32_e32 v4, v5, v4 ; VI-NEXT: v_or_b32_e32 v5, v7, v3 ; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: v_perm_b32 v4, v4, v5, s8 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 +; VI-NEXT: v_perm_b32 v4, v4, v5, s12 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: load_v4i8_to_v4f32_unaligned_multiuse: @@ -1628,21 +1632,23 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1 define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %out2, ptr addrspace(1) noalias %in) nounwind { ; SI-LABEL: load_v4i8_to_v4f32_2_uses: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_mov_b32 s1, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; SI-NEXT: v_and_b32_e32 v6, 0xff00, v4 @@ -1664,29 +1670,31 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x9000000, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: load_v4i8_to_v4f32_2_uses: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v5, 0xffffff00 +; VI-NEXT: v_mov_b32_e32 v6, 9 +; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: v_mov_b32_e32 v6, 9 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v7, 0x900 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s4, s2 -; VI-NEXT: s_mov_b32 s5, s3 -; VI-NEXT: s_mov_b32 s2, s6 -; VI-NEXT: s_mov_b32 s3, s7 +; VI-NEXT: s_mov_b32 s4, s0 +; 
VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v4 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v4 @@ -1696,14 +1704,14 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; VI-NEXT: v_add_u16_e32 v9, 9, v4 ; VI-NEXT: v_and_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_sdwa v4, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_nop 0 ; VI-NEXT: v_or_b32_sdwa v0, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v1, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v0, 0x900, v0 ; VI-NEXT: v_add_u16_sdwa v1, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: load_v4i8_to_v4f32_2_uses: diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernargs.ll b/llvm/test/CodeGen/AMDGPU/lower-kernargs.ll index 190384255bf23..efece9d02950d 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-kernargs.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-kernargs.ll @@ -1119,21 +1119,44 @@ define amdgpu_kernel void @kern_align32_global_ptr(ptr addrspace(1) align 1024 % } define amdgpu_kernel void @kern_noalias_global_ptr(ptr addrspace(1) noalias %ptr) #0 { -; GCN-LABEL: @kern_noalias_global_ptr( -; GCN-NEXT: [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; GCN-NEXT: store volatile ptr addrspace(1) [[PTR:%.*]], ptr addrspace(1) poison, align 8 -; GCN-NEXT: ret void +; HSA-LABEL: @kern_noalias_global_ptr( +; HSA-NEXT: [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT]], i64 0 +; HSA-NEXT: [[PTR_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[PTR_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; HSA-NEXT: store volatile ptr addrspace(1) [[PTR_LOAD]], ptr addrspace(1) poison, align 8, !noalias [[META5:![0-9]+]] +; HSA-NEXT: ret void +; +; MESA-LABEL: @kern_noalias_global_ptr( +; MESA-NEXT: [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT]], i64 36 +; MESA-NEXT: [[PTR_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[PTR_KERNARG_OFFSET]], align 4, !invariant.load [[META0]] +; MESA-NEXT: store volatile ptr addrspace(1) [[PTR_LOAD]], ptr addrspace(1) poison, align 8, !noalias [[META5:![0-9]+]] +; MESA-NEXT: ret void ; store volatile ptr addrspace(1) %ptr, ptr addrspace(1) poison ret void } define amdgpu_kernel void @kern_noalias_global_ptr_x2(ptr addrspace(1) noalias %ptr0, ptr addrspace(1) noalias %ptr1) #0 { -; GCN-LABEL: @kern_noalias_global_ptr_x2( -; GCN-NEXT: [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr 
addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; GCN-NEXT: store volatile ptr addrspace(1) [[PTR0:%.*]], ptr addrspace(1) poison, align 8
-; GCN-NEXT: store volatile ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(1) poison, align 8
-; GCN-NEXT: ret void
+; HSA-LABEL: @kern_noalias_global_ptr_x2(
+; HSA-NEXT: [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; HSA-NEXT: [[PTR0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT]], i64 0
+; HSA-NEXT: [[PTR0_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[PTR0_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; HSA-NEXT: [[PTR1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT]], i64 8
+; HSA-NEXT: [[PTR1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[PTR1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
+; HSA-NEXT: store volatile ptr addrspace(1) [[PTR0_LOAD]], ptr addrspace(1) poison, align 8, !noalias [[META8:![0-9]+]]
+; HSA-NEXT: store volatile ptr addrspace(1) [[PTR1_LOAD]], ptr addrspace(1) poison, align 8, !noalias [[META8]]
+; HSA-NEXT: ret void
+;
+; MESA-LABEL: @kern_noalias_global_ptr_x2(
+; MESA-NEXT: [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; MESA-NEXT: [[PTR0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT]], i64 36
+; MESA-NEXT: [[PTR0_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[PTR0_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
+; MESA-NEXT: [[PTR1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT]], i64 44
+; MESA-NEXT: [[PTR1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[PTR1_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
+; MESA-NEXT: store volatile ptr addrspace(1) [[PTR0_LOAD]], ptr addrspace(1) poison, align 8, !noalias [[META8:![0-9]+]]
+; MESA-NEXT: store volatile ptr addrspace(1) [[PTR1_LOAD]], ptr addrspace(1) poison, align 8, !noalias [[META8]]
+; MESA-NEXT: ret void
 ;
 store volatile ptr addrspace(1) %ptr0, ptr addrspace(1) poison
 store volatile ptr addrspace(1) %ptr1, ptr addrspace(1) poison
@@ -1855,10 +1878,24 @@ attributes #2 = { nounwind "target-cpu"="tahiti" }
 ; HSA: [[META2]] = !{i64 42}
 ; HSA: [[META3]] = !{i64 128}
 ; HSA: [[META4]] = !{i64 1024}
+; HSA: [[META5]] = !{[[META6:![0-9]+]]}
+; HSA: [[META6]] = distinct !{[[META6]], [[META7:![0-9]+]], !"kern_noalias_global_ptr: %ptr"}
+; HSA: [[META7]] = distinct !{[[META7]], !"kern_noalias_global_ptr"}
+; HSA: [[META8]] = !{[[META9:![0-9]+]], [[META11:![0-9]+]]}
+; HSA: [[META9]] = distinct !{[[META9]], [[META10:![0-9]+]], !"kern_noalias_global_ptr_x2: %ptr0"}
+; HSA: [[META10]] = distinct !{[[META10]], !"kern_noalias_global_ptr_x2"}
+; HSA: [[META11]] = distinct !{[[META11]], [[META10]], !"kern_noalias_global_ptr_x2: %ptr1"}
 ;.
 ; MESA: [[META0]] = !{}
 ; MESA: [[RNG1]] = !{i32 0, i32 8}
 ; MESA: [[META2]] = !{i64 42}
 ; MESA: [[META3]] = !{i64 128}
 ; MESA: [[META4]] = !{i64 1024}
+; MESA: [[META5]] = !{[[META6:![0-9]+]]}
+; MESA: [[META6]] = distinct !{[[META6]], [[META7:![0-9]+]], !"kern_noalias_global_ptr: %ptr"}
+; MESA: [[META7]] = distinct !{[[META7]], !"kern_noalias_global_ptr"}
+; MESA: [[META8]] = !{[[META9:![0-9]+]], [[META11:![0-9]+]]}
+; MESA: [[META9]] = distinct !{[[META9]], [[META10:![0-9]+]], !"kern_noalias_global_ptr_x2: %ptr0"}
+; MESA: [[META10]] = distinct !{[[META10]], !"kern_noalias_global_ptr_x2"}
+; MESA: [[META11]] = distinct !{[[META11]], [[META10]], !"kern_noalias_global_ptr_x2: %ptr1"}
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/lower-noalias-kernargs.ll b/llvm/test/CodeGen/AMDGPU/lower-noalias-kernargs.ll
index 313ae3b883e56..11bf238a1b13f 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-noalias-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-noalias-kernargs.ll
@@ -31,11 +31,15 @@ define amdgpu_kernel void @aliasinfo_2i32_NA(ptr addrspace(1) noalias %out, ptr
 ; CHECK-SAME: ptr addrspace(1) noalias [[OUT:%.*]], ptr addrspace(1) noalias [[IN:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[ENTRY:.*:]]
 ; CHECK-NEXT: [[ALIASINFO_2I32_NA_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; CHECK-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_2I32_NA_KERNARG_SEGMENT]], i64 36
+; CHECK-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
+; CHECK-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_2I32_NA_KERNARG_SEGMENT]], i64 44
+; CHECK-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
 ; CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; CHECK-NEXT: [[IN_GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[IN]], i32 [[TID]]
-; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(1) [[IN_GEP]], align 4
+; CHECK-NEXT: [[IN_GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[IN_LOAD]], i32 [[TID]]
+; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(1) [[IN_GEP]], align 4, !alias.scope [[META1:![0-9]+]], !noalias [[META4:![0-9]+]]
 ; CHECK-NEXT: [[CTLZ:%.*]] = call i32 @llvm.ctlz.i32(i32 [[VAL]], i1 false) #[[ATTR5]]
-; CHECK-NEXT: store i32 [[CTLZ]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: store i32 [[CTLZ]], ptr addrspace(1) [[OUT_LOAD]], align 4, !alias.scope [[META4]], !noalias [[META1]]
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -58,9 +62,9 @@ define amdgpu_kernel void @aliasinfo_2i32_AS(ptr addrspace(1) %out, ptr addrspac
 ; CHECK-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 4, !invariant.load [[META0]]
 ; CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
 ; CHECK-NEXT: [[IN_GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[IN_LOAD]], i32 [[TID]]
-; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(1) [[IN_GEP]], align 4, !alias.scope [[META1:![0-9]+]], !noalias [[META4:![0-9]+]]
+; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(1) [[IN_GEP]], align 4, !alias.scope [[META6:![0-9]+]], !noalias [[META9:![0-9]+]]
 ; CHECK-NEXT: [[CTLZ:%.*]] = call i32 @llvm.ctlz.i32(i32 [[VAL]], i1 false) #[[ATTR5]]
-; CHECK-NEXT: store i32 [[CTLZ]], ptr
addrspace(1) [[OUT_LOAD]], align 4, !alias.scope [[META9]], !noalias [[META6]] ; CHECK-NEXT: ret void ; entry: @@ -77,11 +81,15 @@ define amdgpu_kernel void @aliasinfo_2i32_NA_AS(ptr addrspace(1) noalias %out, p ; CHECK-SAME: ptr addrspace(1) noalias [[OUT:%.*]], ptr addrspace(1) noalias [[IN:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[ALIASINFO_2I32_NA_AS_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; CHECK-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_2I32_NA_AS_KERNARG_SEGMENT]], i64 36 +; CHECK-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 4, !invariant.load [[META0]] +; CHECK-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_2I32_NA_AS_KERNARG_SEGMENT]], i64 44 +; CHECK-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 4, !invariant.load [[META0]] ; CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() -; CHECK-NEXT: [[IN_GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[IN]], i32 [[TID]] -; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(1) [[IN_GEP]], align 4, !alias.scope [[META1]], !noalias [[META4]] +; CHECK-NEXT: [[IN_GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[IN_LOAD]], i32 [[TID]] +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(1) [[IN_GEP]], align 4, !alias.scope [[META11:![0-9]+]], !noalias [[META14:![0-9]+]] ; CHECK-NEXT: [[CTLZ:%.*]] = call i32 @llvm.ctlz.i32(i32 [[VAL]], i1 false) #[[ATTR5]] -; CHECK-NEXT: store i32 [[CTLZ]], ptr addrspace(1) [[OUT]], align 4, !alias.scope [[META4]], !noalias [[META1]] +; CHECK-NEXT: store i32 [[CTLZ]], ptr addrspace(1) [[OUT_LOAD]], align 4, !alias.scope [[META14]], !noalias [[META11]] ; CHECK-NEXT: ret void ; entry: @@ -135,15 +143,23 @@ define amdgpu_kernel void @aliasinfo_v4f32_3v4i8_NA(ptr addrspace(1) noalias %ou ; CHECK-SAME: ptr addrspace(1) noalias [[OUT:%.*]], ptr addrspace(1) noalias [[OUT1:%.*]], ptr addrspace(1) noalias [[IN:%.*]], ptr addrspace(1) noalias [[IN1:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[ALIASINFO_V4F32_3V4I8_NA_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(288) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; CHECK-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_V4F32_3V4I8_NA_KERNARG_SEGMENT]], i64 36 +; CHECK-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 4, !invariant.load [[META0]] +; CHECK-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_V4F32_3V4I8_NA_KERNARG_SEGMENT]], i64 44 +; CHECK-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 4, !invariant.load [[META0]] +; CHECK-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_V4F32_3V4I8_NA_KERNARG_SEGMENT]], i64 52 +; CHECK-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 4, !invariant.load [[META0]] +; CHECK-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_V4F32_3V4I8_NA_KERNARG_SEGMENT]], i64 60 +; CHECK-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 4, !invariant.load [[META0]] ; CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() -; CHECK-NEXT: [[GEP:%.*]] = 
getelementptr <4 x i8>, ptr addrspace(1) [[IN]], i32 [[TID]] -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[IN1]], i32 [[TID]] -; CHECK-NEXT: [[LOAD:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP]], align 1 -; CHECK-NEXT: [[LOAD1:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP1]], align 1 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[IN_LOAD]], i32 [[TID]] +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[IN1_LOAD]], i32 [[TID]] +; CHECK-NEXT: [[LOAD:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP]], align 1, !alias.scope [[META16:![0-9]+]], !noalias [[META19:![0-9]+]] +; CHECK-NEXT: [[LOAD1:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP1]], align 1, !alias.scope [[META23:![0-9]+]], !noalias [[META24:![0-9]+]] ; CHECK-NEXT: [[SHUFFLE0_0:%.*]] = shufflevector <4 x i8> [[LOAD]], <4 x i8> [[LOAD1]], <4 x i32> ; CHECK-NEXT: [[CVT:%.*]] = uitofp <4 x i8> [[SHUFFLE0_0]] to <4 x float> -; CHECK-NEXT: store <4 x float> [[CVT]], ptr addrspace(1) [[OUT]], align 16 -; CHECK-NEXT: store <4 x i8> [[SHUFFLE0_0]], ptr addrspace(1) [[OUT1]], align 4 +; CHECK-NEXT: store <4 x float> [[CVT]], ptr addrspace(1) [[OUT_LOAD]], align 16, !alias.scope [[META25:![0-9]+]], !noalias [[META26:![0-9]+]] +; CHECK-NEXT: store <4 x i8> [[SHUFFLE0_0]], ptr addrspace(1) [[OUT1_LOAD]], align 4, !alias.scope [[META27:![0-9]+]], !noalias [[META28:![0-9]+]] ; CHECK-NEXT: ret void ; entry: @@ -175,12 +191,12 @@ define amdgpu_kernel void @aliasinfo_v4f32_3v4i8_AS(ptr addrspace(1) %out, ptr a ; CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: [[GEP:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[IN_LOAD]], i32 [[TID]] ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[IN1_LOAD]], i32 [[TID]] -; CHECK-NEXT: [[LOAD:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP]], align 1, !alias.scope [[META1]], !noalias [[META4]] -; CHECK-NEXT: [[LOAD1:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP1]], align 1, !alias.scope [[META1]], !noalias [[META4]] +; CHECK-NEXT: [[LOAD:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP]], align 1, !alias.scope [[META6]], !noalias [[META9]] +; CHECK-NEXT: [[LOAD1:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP1]], align 1, !alias.scope [[META6]], !noalias [[META9]] ; CHECK-NEXT: [[SHUFFLE0_0:%.*]] = shufflevector <4 x i8> [[LOAD]], <4 x i8> [[LOAD1]], <4 x i32> ; CHECK-NEXT: [[CVT:%.*]] = uitofp <4 x i8> [[SHUFFLE0_0]] to <4 x float> -; CHECK-NEXT: store <4 x float> [[CVT]], ptr addrspace(1) [[OUT_LOAD]], align 16, !alias.scope [[META4]], !noalias [[META1]] -; CHECK-NEXT: store <4 x i8> [[SHUFFLE0_0]], ptr addrspace(1) [[OUT1_LOAD]], align 4, !alias.scope [[META4]], !noalias [[META1]] +; CHECK-NEXT: store <4 x float> [[CVT]], ptr addrspace(1) [[OUT_LOAD]], align 16, !alias.scope [[META9]], !noalias [[META6]] +; CHECK-NEXT: store <4 x i8> [[SHUFFLE0_0]], ptr addrspace(1) [[OUT1_LOAD]], align 4, !alias.scope [[META9]], !noalias [[META6]] ; CHECK-NEXT: ret void ; entry: @@ -201,15 +217,23 @@ define amdgpu_kernel void @aliasinfo_v4f32_3v4i8_NA_AS(ptr addrspace(1) noalias ; CHECK-SAME: ptr addrspace(1) noalias [[OUT:%.*]], ptr addrspace(1) noalias [[OUT1:%.*]], ptr addrspace(1) noalias [[IN:%.*]], ptr addrspace(1) noalias [[IN1:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[ALIASINFO_V4F32_3V4I8_NA_AS_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(288) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; CHECK-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds 
i8, ptr addrspace(4) [[ALIASINFO_V4F32_3V4I8_NA_AS_KERNARG_SEGMENT]], i64 36 +; CHECK-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 4, !invariant.load [[META0]] +; CHECK-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_V4F32_3V4I8_NA_AS_KERNARG_SEGMENT]], i64 44 +; CHECK-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 4, !invariant.load [[META0]] +; CHECK-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_V4F32_3V4I8_NA_AS_KERNARG_SEGMENT]], i64 52 +; CHECK-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 4, !invariant.load [[META0]] +; CHECK-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_V4F32_3V4I8_NA_AS_KERNARG_SEGMENT]], i64 60 +; CHECK-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 4, !invariant.load [[META0]] ; CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() -; CHECK-NEXT: [[GEP:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[IN]], i32 [[TID]] -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[IN1]], i32 [[TID]] -; CHECK-NEXT: [[LOAD:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP]], align 1, !alias.scope [[META1]], !noalias [[META4]] -; CHECK-NEXT: [[LOAD1:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP1]], align 1, !alias.scope [[META1]], !noalias [[META4]] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[IN_LOAD]], i32 [[TID]] +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[IN1_LOAD]], i32 [[TID]] +; CHECK-NEXT: [[LOAD:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP]], align 1, !alias.scope [[META29:![0-9]+]], !noalias [[META32:![0-9]+]] +; CHECK-NEXT: [[LOAD1:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP1]], align 1, !alias.scope [[META36:![0-9]+]], !noalias [[META37:![0-9]+]] ; CHECK-NEXT: [[SHUFFLE0_0:%.*]] = shufflevector <4 x i8> [[LOAD]], <4 x i8> [[LOAD1]], <4 x i32> ; CHECK-NEXT: [[CVT:%.*]] = uitofp <4 x i8> [[SHUFFLE0_0]] to <4 x float> -; CHECK-NEXT: store <4 x float> [[CVT]], ptr addrspace(1) [[OUT]], align 16, !alias.scope [[META4]], !noalias [[META1]] -; CHECK-NEXT: store <4 x i8> [[SHUFFLE0_0]], ptr addrspace(1) [[OUT1]], align 4, !alias.scope [[META4]], !noalias [[META1]] +; CHECK-NEXT: store <4 x float> [[CVT]], ptr addrspace(1) [[OUT_LOAD]], align 16, !alias.scope [[META38:![0-9]+]], !noalias [[META39:![0-9]+]] +; CHECK-NEXT: store <4 x i8> [[SHUFFLE0_0]], ptr addrspace(1) [[OUT1_LOAD]], align 4, !alias.scope [[META40:![0-9]+]], !noalias [[META41:![0-9]+]] ; CHECK-NEXT: ret void ; entry: @@ -325,45 +349,45 @@ define amdgpu_kernel void @aliasinfo_10v16f16_NA(ptr addrspace(3) noalias %in, p ; CHECK-NEXT: [[ALIASINFO_10V16F16_NA_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() ; CHECK-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: [[LOAD_0_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[IN]], i32 [[IDX]] -; CHECK-NEXT: [[LOAD_0:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_0_ADDR]], align 32 +; CHECK-NEXT: [[LOAD_0:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_0_ADDR]], align 32, !alias.scope [[META42:![0-9]+]], !noalias [[META45:![0-9]+]] ; CHECK-NEXT: [[LOAD_1_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_0_ADDR]], i32 64 -; CHECK-NEXT: 
[[LOAD_1:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_1_ADDR]], align 32 +; CHECK-NEXT: [[LOAD_1:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_1_ADDR]], align 32, !alias.scope [[META42]], !noalias [[META45]] ; CHECK-NEXT: [[LOAD_2_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_1_ADDR]], i32 128 -; CHECK-NEXT: [[LOAD_2:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_2_ADDR]], align 32 +; CHECK-NEXT: [[LOAD_2:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_2_ADDR]], align 32, !alias.scope [[META42]], !noalias [[META45]] ; CHECK-NEXT: [[LOAD_3_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_2_ADDR]], i32 192 -; CHECK-NEXT: [[LOAD_3:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_3_ADDR]], align 32 +; CHECK-NEXT: [[LOAD_3:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_3_ADDR]], align 32, !alias.scope [[META42]], !noalias [[META45]] ; CHECK-NEXT: [[LOAD_4_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_3_ADDR]], i32 256 -; CHECK-NEXT: [[LOAD_4:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_4_ADDR]], align 32 +; CHECK-NEXT: [[LOAD_4:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_4_ADDR]], align 32, !alias.scope [[META42]], !noalias [[META45]] ; CHECK-NEXT: [[MAI_0:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_0]], <16 x half> [[LOAD_0]], <16 x half> [[LOAD_0]], i1 false) ; CHECK-NEXT: [[MAI_1:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_1]], <16 x half> [[LOAD_1]], <16 x half> [[LOAD_1]], i1 false) ; CHECK-NEXT: [[MAI_2:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_2]], <16 x half> [[LOAD_2]], <16 x half> [[LOAD_2]], i1 false) ; CHECK-NEXT: [[MAI_3:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_3]], <16 x half> [[LOAD_3]], <16 x half> [[LOAD_3]], i1 false) ; CHECK-NEXT: [[MAI_4:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_4]], <16 x half> [[LOAD_4]], <16 x half> [[LOAD_4]], i1 false) ; CHECK-NEXT: [[STORE_0_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 [[IDX]] -; CHECK-NEXT: store <16 x half> [[MAI_0]], ptr addrspace(3) [[STORE_0_ADDR]], align 32 +; CHECK-NEXT: store <16 x half> [[MAI_0]], ptr addrspace(3) [[STORE_0_ADDR]], align 32, !alias.scope [[META45]], !noalias [[META42]] ; CHECK-NEXT: [[STORE_1_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 64 -; CHECK-NEXT: store <16 x half> [[MAI_1]], ptr addrspace(3) [[STORE_1_ADDR]], align 32 +; CHECK-NEXT: store <16 x half> [[MAI_1]], ptr addrspace(3) [[STORE_1_ADDR]], align 32, !alias.scope [[META45]], !noalias [[META42]] ; CHECK-NEXT: [[STORE_2_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 128 -; CHECK-NEXT: store <16 x half> [[MAI_2]], ptr addrspace(3) [[STORE_2_ADDR]], align 32 +; CHECK-NEXT: store <16 x half> [[MAI_2]], ptr addrspace(3) [[STORE_2_ADDR]], align 32, !alias.scope [[META45]], !noalias [[META42]] ; CHECK-NEXT: [[STORE_3_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 192 -; CHECK-NEXT: store <16 x half> [[MAI_3]], ptr addrspace(3) [[STORE_3_ADDR]], align 32 +; CHECK-NEXT: store <16 x half> [[MAI_3]], ptr addrspace(3) [[STORE_3_ADDR]], align 32, !alias.scope [[META45]], !noalias [[META42]] ; CHECK-NEXT: [[STORE_4_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 256 -; CHECK-NEXT: store <16 x half> [[MAI_4]], ptr 
addrspace(3) [[STORE_4_ADDR]], align 32 -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) +; CHECK-NEXT: store <16 x half> [[MAI_4]], ptr addrspace(3) [[STORE_4_ADDR]], align 32, !alias.scope [[META45]], !noalias [[META42]] +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0), !noalias [[META47:![0-9]+]] +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0), !noalias [[META47]] +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0), !noalias [[META47]] +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0), !noalias [[META47]] +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0), !noalias [[META47]] +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0), !noalias [[META47]] +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0), !noalias [[META47]] +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0), !noalias [[META47]] +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0), !noalias [[META47]] +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0), !noalias [[META47]] +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0), !noalias [[META47]] +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0), !noalias [[META47]] +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0), !noalias [[META47]] +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0), !noalias [[META47]] +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0), !noalias [[META47]] ; CHECK-NEXT: ret void ; entry: @@ -418,30 +442,30 @@ define amdgpu_kernel void @aliasinfo_10v16f16_AS(ptr addrspace(3) %in, ptr addrs ; CHECK-NEXT: [[ALIASINFO_10V16F16_AS_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() ; CHECK-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: [[LOAD_0_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[IN]], i32 [[IDX]] -; CHECK-NEXT: [[LOAD_0:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_0_ADDR]], align 32, !alias.scope [[META1]], !noalias [[META4]] +; CHECK-NEXT: [[LOAD_0:%.*]] = load <16 x half>, 
ptr addrspace(3) [[LOAD_0_ADDR]], align 32, !alias.scope [[META6]], !noalias [[META9]] ; CHECK-NEXT: [[LOAD_1_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_0_ADDR]], i32 64 -; CHECK-NEXT: [[LOAD_1:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_1_ADDR]], align 32, !alias.scope [[META1]], !noalias [[META4]] +; CHECK-NEXT: [[LOAD_1:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_1_ADDR]], align 32, !alias.scope [[META6]], !noalias [[META9]] ; CHECK-NEXT: [[LOAD_2_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_1_ADDR]], i32 128 -; CHECK-NEXT: [[LOAD_2:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_2_ADDR]], align 32, !alias.scope [[META1]], !noalias [[META4]] +; CHECK-NEXT: [[LOAD_2:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_2_ADDR]], align 32, !alias.scope [[META6]], !noalias [[META9]] ; CHECK-NEXT: [[LOAD_3_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_2_ADDR]], i32 192 -; CHECK-NEXT: [[LOAD_3:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_3_ADDR]], align 32, !alias.scope [[META1]], !noalias [[META4]] +; CHECK-NEXT: [[LOAD_3:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_3_ADDR]], align 32, !alias.scope [[META6]], !noalias [[META9]] ; CHECK-NEXT: [[LOAD_4_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_3_ADDR]], i32 256 -; CHECK-NEXT: [[LOAD_4:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_4_ADDR]], align 32, !alias.scope [[META1]], !noalias [[META4]] +; CHECK-NEXT: [[LOAD_4:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_4_ADDR]], align 32, !alias.scope [[META6]], !noalias [[META9]] ; CHECK-NEXT: [[MAI_0:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_0]], <16 x half> [[LOAD_0]], <16 x half> [[LOAD_0]], i1 false) ; CHECK-NEXT: [[MAI_1:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_1]], <16 x half> [[LOAD_1]], <16 x half> [[LOAD_1]], i1 false) ; CHECK-NEXT: [[MAI_2:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_2]], <16 x half> [[LOAD_2]], <16 x half> [[LOAD_2]], i1 false) ; CHECK-NEXT: [[MAI_3:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_3]], <16 x half> [[LOAD_3]], <16 x half> [[LOAD_3]], i1 false) ; CHECK-NEXT: [[MAI_4:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_4]], <16 x half> [[LOAD_4]], <16 x half> [[LOAD_4]], i1 false) ; CHECK-NEXT: [[STORE_0_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 [[IDX]] -; CHECK-NEXT: store <16 x half> [[MAI_0]], ptr addrspace(3) [[STORE_0_ADDR]], align 32, !alias.scope [[META4]], !noalias [[META1]] +; CHECK-NEXT: store <16 x half> [[MAI_0]], ptr addrspace(3) [[STORE_0_ADDR]], align 32, !alias.scope [[META9]], !noalias [[META6]] ; CHECK-NEXT: [[STORE_1_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 64 -; CHECK-NEXT: store <16 x half> [[MAI_1]], ptr addrspace(3) [[STORE_1_ADDR]], align 32, !alias.scope [[META4]], !noalias [[META1]] +; CHECK-NEXT: store <16 x half> [[MAI_1]], ptr addrspace(3) [[STORE_1_ADDR]], align 32, !alias.scope [[META9]], !noalias [[META6]] ; CHECK-NEXT: [[STORE_2_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 128 -; CHECK-NEXT: store <16 x half> [[MAI_2]], ptr addrspace(3) [[STORE_2_ADDR]], align 32, !alias.scope [[META4]], !noalias [[META1]] +; CHECK-NEXT: store <16 x half> [[MAI_2]], ptr addrspace(3) [[STORE_2_ADDR]], align 32, !alias.scope 
[[META9]], !noalias [[META6]] ; CHECK-NEXT: [[STORE_3_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 192 -; CHECK-NEXT: store <16 x half> [[MAI_3]], ptr addrspace(3) [[STORE_3_ADDR]], align 32, !alias.scope [[META4]], !noalias [[META1]] +; CHECK-NEXT: store <16 x half> [[MAI_3]], ptr addrspace(3) [[STORE_3_ADDR]], align 32, !alias.scope [[META9]], !noalias [[META6]] ; CHECK-NEXT: [[STORE_4_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 256 -; CHECK-NEXT: store <16 x half> [[MAI_4]], ptr addrspace(3) [[STORE_4_ADDR]], align 32, !alias.scope [[META4]], !noalias [[META1]] +; CHECK-NEXT: store <16 x half> [[MAI_4]], ptr addrspace(3) [[STORE_4_ADDR]], align 32, !alias.scope [[META9]], !noalias [[META6]] ; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) ; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) ; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) @@ -511,45 +535,45 @@ define amdgpu_kernel void @aliasinfo_10v16f16_NA_AS(ptr addrspace(3) noalias %in ; CHECK-NEXT: [[ALIASINFO_10V16F16_NA_AS_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() ; CHECK-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: [[LOAD_0_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[IN]], i32 [[IDX]] -; CHECK-NEXT: [[LOAD_0:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_0_ADDR]], align 32, !alias.scope [[META1]], !noalias [[META4]] +; CHECK-NEXT: [[LOAD_0:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_0_ADDR]], align 32, !alias.scope [[META48:![0-9]+]], !noalias [[META51:![0-9]+]] ; CHECK-NEXT: [[LOAD_1_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_0_ADDR]], i32 64 -; CHECK-NEXT: [[LOAD_1:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_1_ADDR]], align 32, !alias.scope [[META1]], !noalias [[META4]] +; CHECK-NEXT: [[LOAD_1:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_1_ADDR]], align 32, !alias.scope [[META48]], !noalias [[META51]] ; CHECK-NEXT: [[LOAD_2_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_1_ADDR]], i32 128 -; CHECK-NEXT: [[LOAD_2:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_2_ADDR]], align 32, !alias.scope [[META1]], !noalias [[META4]] +; CHECK-NEXT: [[LOAD_2:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_2_ADDR]], align 32, !alias.scope [[META48]], !noalias [[META51]] ; CHECK-NEXT: [[LOAD_3_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_2_ADDR]], i32 192 -; CHECK-NEXT: [[LOAD_3:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_3_ADDR]], align 32, !alias.scope [[META1]], !noalias [[META4]] +; CHECK-NEXT: [[LOAD_3:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_3_ADDR]], align 32, !alias.scope [[META48]], !noalias [[META51]] ; CHECK-NEXT: [[LOAD_4_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_3_ADDR]], i32 256 -; CHECK-NEXT: [[LOAD_4:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_4_ADDR]], align 32, !alias.scope [[META1]], !noalias [[META4]] +; CHECK-NEXT: [[LOAD_4:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_4_ADDR]], align 32, !alias.scope [[META48]], !noalias [[META51]] ; CHECK-NEXT: [[MAI_0:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_0]], <16 x half> [[LOAD_0]], <16 x half> [[LOAD_0]], i1 false) ; CHECK-NEXT: [[MAI_1:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_1]], <16 x half> 
[[LOAD_1]], <16 x half> [[LOAD_1]], i1 false) ; CHECK-NEXT: [[MAI_2:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_2]], <16 x half> [[LOAD_2]], <16 x half> [[LOAD_2]], i1 false) ; CHECK-NEXT: [[MAI_3:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_3]], <16 x half> [[LOAD_3]], <16 x half> [[LOAD_3]], i1 false) ; CHECK-NEXT: [[MAI_4:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_4]], <16 x half> [[LOAD_4]], <16 x half> [[LOAD_4]], i1 false) ; CHECK-NEXT: [[STORE_0_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 [[IDX]] -; CHECK-NEXT: store <16 x half> [[MAI_0]], ptr addrspace(3) [[STORE_0_ADDR]], align 32, !alias.scope [[META4]], !noalias [[META1]] +; CHECK-NEXT: store <16 x half> [[MAI_0]], ptr addrspace(3) [[STORE_0_ADDR]], align 32, !alias.scope [[META51]], !noalias [[META48]] ; CHECK-NEXT: [[STORE_1_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 64 -; CHECK-NEXT: store <16 x half> [[MAI_1]], ptr addrspace(3) [[STORE_1_ADDR]], align 32, !alias.scope [[META4]], !noalias [[META1]] +; CHECK-NEXT: store <16 x half> [[MAI_1]], ptr addrspace(3) [[STORE_1_ADDR]], align 32, !alias.scope [[META51]], !noalias [[META48]] ; CHECK-NEXT: [[STORE_2_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 128 -; CHECK-NEXT: store <16 x half> [[MAI_2]], ptr addrspace(3) [[STORE_2_ADDR]], align 32, !alias.scope [[META4]], !noalias [[META1]] +; CHECK-NEXT: store <16 x half> [[MAI_2]], ptr addrspace(3) [[STORE_2_ADDR]], align 32, !alias.scope [[META51]], !noalias [[META48]] ; CHECK-NEXT: [[STORE_3_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 192 -; CHECK-NEXT: store <16 x half> [[MAI_3]], ptr addrspace(3) [[STORE_3_ADDR]], align 32, !alias.scope [[META4]], !noalias [[META1]] +; CHECK-NEXT: store <16 x half> [[MAI_3]], ptr addrspace(3) [[STORE_3_ADDR]], align 32, !alias.scope [[META51]], !noalias [[META48]] ; CHECK-NEXT: [[STORE_4_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 256 -; CHECK-NEXT: store <16 x half> [[MAI_4]], ptr addrspace(3) [[STORE_4_ADDR]], align 32, !alias.scope [[META4]], !noalias [[META1]] -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) +; CHECK-NEXT: store <16 x half> [[MAI_4]], ptr addrspace(3) [[STORE_4_ADDR]], 
align 32, !alias.scope [[META51]], !noalias [[META48]] +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0), !noalias [[META53:![0-9]+]] +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0), !noalias [[META53]] +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0), !noalias [[META53]] +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0), !noalias [[META53]] +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0), !noalias [[META53]] +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0), !noalias [[META53]] +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0), !noalias [[META53]] +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0), !noalias [[META53]] +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0), !noalias [[META53]] +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0), !noalias [[META53]] +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0), !noalias [[META53]] +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0), !noalias [[META53]] +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0), !noalias [[META53]] +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0), !noalias [[META53]] +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0), !noalias [[META53]] ; CHECK-NEXT: ret void ; entry: @@ -613,8 +637,56 @@ attributes #2 = { nounwind readnone speculatable } ;. ; CHECK: [[META0]] = !{} ; CHECK: [[META1]] = !{[[META2:![0-9]+]]} -; CHECK: [[META2]] = distinct !{[[META2]], [[META3:![0-9]+]], !"alias_scope_3"} -; CHECK: [[META3]] = distinct !{[[META3]], !"alias_scope_0"} +; CHECK: [[META2]] = distinct !{[[META2]], [[META3:![0-9]+]], !"aliasinfo_2i32_NA: %in"} +; CHECK: [[META3]] = distinct !{[[META3]], !"aliasinfo_2i32_NA"} ; CHECK: [[META4]] = !{[[META5:![0-9]+]]} -; CHECK: [[META5]] = distinct !{[[META5]], [[META3]], !"alias_scope_1"} +; CHECK: [[META5]] = distinct !{[[META5]], [[META3]], !"aliasinfo_2i32_NA: %out"} +; CHECK: [[META6]] = !{[[META7:![0-9]+]]} +; CHECK: [[META7]] = distinct !{[[META7]], [[META8:![0-9]+]], !"alias_scope_3"} +; CHECK: [[META8]] = distinct !{[[META8]], !"alias_scope_0"} +; CHECK: [[META9]] = !{[[META10:![0-9]+]]} +; CHECK: [[META10]] = distinct !{[[META10]], [[META8]], !"alias_scope_1"} +; CHECK: [[META11]] = !{[[META7]], [[META12:![0-9]+]]} +; CHECK: [[META12]] = distinct !{[[META12]], [[META13:![0-9]+]], !"aliasinfo_2i32_NA_AS: %in"} +; CHECK: [[META13]] = distinct !{[[META13]], !"aliasinfo_2i32_NA_AS"} +; CHECK: [[META14]] = !{[[META10]], [[META15:![0-9]+]]} +; CHECK: [[META15]] = distinct !{[[META15]], [[META13]], !"aliasinfo_2i32_NA_AS: %out"} +; CHECK: [[META16]] = !{[[META17:![0-9]+]]} +; CHECK: [[META17]] = distinct !{[[META17]], [[META18:![0-9]+]], !"aliasinfo_v4f32_3v4i8_NA: %in"} +; CHECK: [[META18]] = distinct !{[[META18]], !"aliasinfo_v4f32_3v4i8_NA"} +; CHECK: [[META19]] = !{[[META20:![0-9]+]], [[META21:![0-9]+]], [[META22:![0-9]+]]} +; CHECK: [[META20]] = distinct !{[[META20]], [[META18]], !"aliasinfo_v4f32_3v4i8_NA: %out"} +; CHECK: [[META21]] = distinct !{[[META21]], [[META18]], !"aliasinfo_v4f32_3v4i8_NA: %out1"} +; CHECK: [[META22]] = distinct !{[[META22]], [[META18]], !"aliasinfo_v4f32_3v4i8_NA: %in1"} +; CHECK: [[META23]] = !{[[META22]]} +; 
CHECK: [[META24]] = !{[[META20]], [[META21]], [[META17]]} +; CHECK: [[META25]] = !{[[META20]]} +; CHECK: [[META26]] = !{[[META21]], [[META17]], [[META22]]} +; CHECK: [[META27]] = !{[[META21]]} +; CHECK: [[META28]] = !{[[META20]], [[META17]], [[META22]]} +; CHECK: [[META29]] = !{[[META7]], [[META30:![0-9]+]]} +; CHECK: [[META30]] = distinct !{[[META30]], [[META31:![0-9]+]], !"aliasinfo_v4f32_3v4i8_NA_AS: %in"} +; CHECK: [[META31]] = distinct !{[[META31]], !"aliasinfo_v4f32_3v4i8_NA_AS"} +; CHECK: [[META32]] = !{[[META10]], [[META33:![0-9]+]], [[META34:![0-9]+]], [[META35:![0-9]+]]} +; CHECK: [[META33]] = distinct !{[[META33]], [[META31]], !"aliasinfo_v4f32_3v4i8_NA_AS: %out"} +; CHECK: [[META34]] = distinct !{[[META34]], [[META31]], !"aliasinfo_v4f32_3v4i8_NA_AS: %out1"} +; CHECK: [[META35]] = distinct !{[[META35]], [[META31]], !"aliasinfo_v4f32_3v4i8_NA_AS: %in1"} +; CHECK: [[META36]] = !{[[META7]], [[META35]]} +; CHECK: [[META37]] = !{[[META10]], [[META33]], [[META34]], [[META30]]} +; CHECK: [[META38]] = !{[[META10]], [[META33]]} +; CHECK: [[META39]] = !{[[META7]], [[META34]], [[META30]], [[META35]]} +; CHECK: [[META40]] = !{[[META10]], [[META34]]} +; CHECK: [[META41]] = !{[[META7]], [[META33]], [[META30]], [[META35]]} +; CHECK: [[META42]] = !{[[META43:![0-9]+]]} +; CHECK: [[META43]] = distinct !{[[META43]], [[META44:![0-9]+]], !"aliasinfo_10v16f16_NA: %in"} +; CHECK: [[META44]] = distinct !{[[META44]], !"aliasinfo_10v16f16_NA"} +; CHECK: [[META45]] = !{[[META46:![0-9]+]]} +; CHECK: [[META46]] = distinct !{[[META46]], [[META44]], !"aliasinfo_10v16f16_NA: %out"} +; CHECK: [[META47]] = !{[[META43]], [[META46]]} +; CHECK: [[META48]] = !{[[META7]], [[META49:![0-9]+]]} +; CHECK: [[META49]] = distinct !{[[META49]], [[META50:![0-9]+]], !"aliasinfo_10v16f16_NA_AS: %in"} +; CHECK: [[META50]] = distinct !{[[META50]], !"aliasinfo_10v16f16_NA_AS"} +; CHECK: [[META51]] = !{[[META10]], [[META52:![0-9]+]]} +; CHECK: [[META52]] = distinct !{[[META52]], [[META50]], !"aliasinfo_10v16f16_NA_AS: %out"} +; CHECK: [[META53]] = !{[[META49]], [[META52]]} ;. 
diff --git a/llvm/test/CodeGen/AMDGPU/mad-combine.ll b/llvm/test/CodeGen/AMDGPU/mad-combine.ll index 320d3c77a6d9f..cf6732d30d080 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-combine.ll @@ -1047,46 +1047,40 @@ define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_mul(ptr addrspace(1) define amdgpu_kernel void @aggressive_combine_to_mad_fsub_0_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i1 %is_aggressive) #1 { ; SI-STD-LABEL: aggressive_combine_to_mad_fsub_0_f32: ; SI-STD: ; %bb.0: -; SI-STD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb -; SI-STD-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-STD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-STD-NEXT: s_load_dword s8, s[4:5], 0xd ; SI-STD-NEXT: v_mov_b32_e32 v1, 0 -; SI-STD-NEXT: s_mov_b32 s3, 0xf000 -; SI-STD-NEXT: s_mov_b32 s2, 0 -; SI-STD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-STD-NEXT: s_mov_b32 s7, 0xf000 +; SI-STD-NEXT: s_mov_b32 s6, 0 ; SI-STD-NEXT: s_waitcnt lgkmcnt(0) -; SI-STD-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-STD-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-STD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-STD-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc ; SI-STD-NEXT: s_waitcnt vmcnt(0) -; SI-STD-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-STD-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc ; SI-STD-NEXT: s_waitcnt vmcnt(0) -; SI-STD-NEXT: buffer_load_dword v5, v[0:1], s[0:3], 0 addr64 offset:8 glc +; SI-STD-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:8 glc ; SI-STD-NEXT: s_waitcnt vmcnt(0) -; SI-STD-NEXT: buffer_load_dword v6, v[0:1], s[0:3], 0 addr64 offset:12 glc +; SI-STD-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 offset:12 glc ; SI-STD-NEXT: s_waitcnt vmcnt(0) -; SI-STD-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:16 glc +; SI-STD-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-STD-NEXT: s_waitcnt vmcnt(0) -; SI-STD-NEXT: s_bitcmp1_b32 s6, 0 -; SI-STD-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-STD-NEXT: s_and_b64 vcc, exec, s[0:1] +; SI-STD-NEXT: s_bitcmp1_b32 s8, 0 +; SI-STD-NEXT: s_cselect_b64 s[2:3], -1, 0 +; SI-STD-NEXT: s_and_b64 vcc, exec, s[2:3] ; SI-STD-NEXT: s_cbranch_vccnz .LBB12_2 ; SI-STD-NEXT: ; %bb.1: ; %normal ; SI-STD-NEXT: v_mul_f32_e32 v4, v6, v1 ; SI-STD-NEXT: v_fma_f32 v4, v2, v3, v4 ; SI-STD-NEXT: v_sub_f32_e32 v4, v4, v5 -; SI-STD-NEXT: s_mov_b64 s[2:3], 0 -; SI-STD-NEXT: s_branch .LBB12_3 +; SI-STD-NEXT: s_cbranch_execz .LBB12_3 +; SI-STD-NEXT: s_branch .LBB12_4 ; SI-STD-NEXT: .LBB12_2: -; SI-STD-NEXT: s_mov_b64 s[2:3], -1 ; SI-STD-NEXT: ; implicit-def: $vgpr4 -; SI-STD-NEXT: .LBB12_3: ; %Flow -; SI-STD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; SI-STD-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; SI-STD-NEXT: s_waitcnt lgkmcnt(0) -; SI-STD-NEXT: s_mov_b64 vcc, vcc -; SI-STD-NEXT: s_cbranch_vccnz .LBB12_5 -; SI-STD-NEXT: ; %bb.4: ; %aggressive +; SI-STD-NEXT: .LBB12_3: ; %aggressive ; SI-STD-NEXT: v_mad_f32 v4, v6, v1, -v5 ; SI-STD-NEXT: v_mac_f32_e32 v4, v2, v3 -; SI-STD-NEXT: .LBB12_5: ; %exit +; SI-STD-NEXT: .LBB12_4: ; %exit ; SI-STD-NEXT: s_mov_b32 s3, 0xf000 ; SI-STD-NEXT: s_mov_b32 s2, 0 ; SI-STD-NEXT: v_mov_b32_e32 v1, 0 @@ -1095,93 +1089,81 @@ define amdgpu_kernel void @aggressive_combine_to_mad_fsub_0_f32(ptr addrspace(1) ; ; SI-DENORM-FASTFMAF-LABEL: aggressive_combine_to_mad_fsub_0_f32: ; SI-DENORM-FASTFMAF: ; %bb.0: -; SI-DENORM-FASTFMAF-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb -; 
SI-DENORM-FASTFMAF-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-DENORM-FASTFMAF-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-DENORM-FASTFMAF-NEXT: s_load_dword s8, s[4:5], 0xd ; SI-DENORM-FASTFMAF-NEXT: v_mov_b32_e32 v1, 0 -; SI-DENORM-FASTFMAF-NEXT: s_mov_b32 s3, 0xf000 -; SI-DENORM-FASTFMAF-NEXT: s_mov_b32 s2, 0 -; SI-DENORM-FASTFMAF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMAF-NEXT: s_mov_b32 s7, 0xf000 +; SI-DENORM-FASTFMAF-NEXT: s_mov_b32 s6, 0 ; SI-DENORM-FASTFMAF-NEXT: s_waitcnt lgkmcnt(0) -; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-DENORM-FASTFMAF-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-DENORM-FASTFMAF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc ; SI-DENORM-FASTFMAF-NEXT: s_waitcnt vmcnt(0) -; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc ; SI-DENORM-FASTFMAF-NEXT: s_waitcnt vmcnt(0) -; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v4, v[0:1], s[0:3], 0 addr64 offset:8 glc +; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:8 glc ; SI-DENORM-FASTFMAF-NEXT: s_waitcnt vmcnt(0) -; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v5, v[0:1], s[0:3], 0 addr64 offset:12 glc +; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 offset:12 glc ; SI-DENORM-FASTFMAF-NEXT: s_waitcnt vmcnt(0) -; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:16 glc +; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-DENORM-FASTFMAF-NEXT: s_waitcnt vmcnt(0) -; SI-DENORM-FASTFMAF-NEXT: s_bitcmp1_b32 s6, 0 -; SI-DENORM-FASTFMAF-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-DENORM-FASTFMAF-NEXT: s_and_b64 vcc, exec, s[0:1] +; SI-DENORM-FASTFMAF-NEXT: s_bitcmp1_b32 s8, 0 +; SI-DENORM-FASTFMAF-NEXT: s_cselect_b64 s[2:3], -1, 0 +; SI-DENORM-FASTFMAF-NEXT: s_and_b64 vcc, exec, s[2:3] ; SI-DENORM-FASTFMAF-NEXT: s_cbranch_vccnz .LBB12_2 ; SI-DENORM-FASTFMAF-NEXT: ; %bb.1: ; %normal -; SI-DENORM-FASTFMAF-NEXT: v_mul_f32_e32 v6, v5, v1 -; SI-DENORM-FASTFMAF-NEXT: v_fma_f32 v6, v2, v3, v6 -; SI-DENORM-FASTFMAF-NEXT: v_sub_f32_e32 v6, v6, v4 -; SI-DENORM-FASTFMAF-NEXT: s_mov_b64 s[2:3], 0 -; SI-DENORM-FASTFMAF-NEXT: s_branch .LBB12_3 +; SI-DENORM-FASTFMAF-NEXT: v_mul_f32_e32 v4, v6, v1 +; SI-DENORM-FASTFMAF-NEXT: v_fma_f32 v4, v2, v3, v4 +; SI-DENORM-FASTFMAF-NEXT: v_sub_f32_e32 v4, v4, v5 +; SI-DENORM-FASTFMAF-NEXT: s_cbranch_execz .LBB12_3 +; SI-DENORM-FASTFMAF-NEXT: s_branch .LBB12_4 ; SI-DENORM-FASTFMAF-NEXT: .LBB12_2: -; SI-DENORM-FASTFMAF-NEXT: s_mov_b64 s[2:3], -1 -; SI-DENORM-FASTFMAF-NEXT: ; implicit-def: $vgpr6 -; SI-DENORM-FASTFMAF-NEXT: .LBB12_3: ; %Flow -; SI-DENORM-FASTFMAF-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; SI-DENORM-FASTFMAF-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; SI-DENORM-FASTFMAF-NEXT: s_waitcnt lgkmcnt(0) -; SI-DENORM-FASTFMAF-NEXT: s_mov_b64 vcc, vcc -; SI-DENORM-FASTFMAF-NEXT: s_cbranch_vccnz .LBB12_5 -; SI-DENORM-FASTFMAF-NEXT: ; %bb.4: ; %aggressive -; SI-DENORM-FASTFMAF-NEXT: v_fma_f32 v1, v5, v1, -v4 -; SI-DENORM-FASTFMAF-NEXT: v_fma_f32 v6, v2, v3, v1 -; SI-DENORM-FASTFMAF-NEXT: .LBB12_5: ; %exit +; SI-DENORM-FASTFMAF-NEXT: ; implicit-def: $vgpr4 +; SI-DENORM-FASTFMAF-NEXT: .LBB12_3: ; %aggressive +; SI-DENORM-FASTFMAF-NEXT: v_fma_f32 v1, v6, v1, -v5 +; SI-DENORM-FASTFMAF-NEXT: v_fma_f32 v4, v2, v3, v1 +; SI-DENORM-FASTFMAF-NEXT: .LBB12_4: ; 
%exit ; SI-DENORM-FASTFMAF-NEXT: s_mov_b32 s3, 0xf000 ; SI-DENORM-FASTFMAF-NEXT: s_mov_b32 s2, 0 ; SI-DENORM-FASTFMAF-NEXT: v_mov_b32_e32 v1, 0 -; SI-DENORM-FASTFMAF-NEXT: buffer_store_dword v6, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMAF-NEXT: buffer_store_dword v4, v[0:1], s[0:3], 0 addr64 ; SI-DENORM-FASTFMAF-NEXT: s_endpgm ; ; SI-DENORM-SLOWFMAF-LABEL: aggressive_combine_to_mad_fsub_0_f32: ; SI-DENORM-SLOWFMAF: ; %bb.0: -; SI-DENORM-SLOWFMAF-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb -; SI-DENORM-SLOWFMAF-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-DENORM-SLOWFMAF-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-DENORM-SLOWFMAF-NEXT: s_load_dword s8, s[4:5], 0xd ; SI-DENORM-SLOWFMAF-NEXT: v_mov_b32_e32 v1, 0 -; SI-DENORM-SLOWFMAF-NEXT: s_mov_b32 s3, 0xf000 -; SI-DENORM-SLOWFMAF-NEXT: s_mov_b32 s2, 0 -; SI-DENORM-SLOWFMAF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-SLOWFMAF-NEXT: s_mov_b32 s7, 0xf000 +; SI-DENORM-SLOWFMAF-NEXT: s_mov_b32 s6, 0 ; SI-DENORM-SLOWFMAF-NEXT: s_waitcnt lgkmcnt(0) -; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 glc +; SI-DENORM-SLOWFMAF-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-DENORM-SLOWFMAF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc ; SI-DENORM-SLOWFMAF-NEXT: s_waitcnt vmcnt(0) -; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v4, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:4 glc ; SI-DENORM-SLOWFMAF-NEXT: s_waitcnt vmcnt(0) -; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 offset:8 glc +; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 offset:8 glc ; SI-DENORM-SLOWFMAF-NEXT: s_waitcnt vmcnt(0) -; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v5, v[0:1], s[0:3], 0 addr64 offset:12 glc +; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:12 glc ; SI-DENORM-SLOWFMAF-NEXT: s_waitcnt vmcnt(0) -; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:16 glc +; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-DENORM-SLOWFMAF-NEXT: s_waitcnt vmcnt(0) -; SI-DENORM-SLOWFMAF-NEXT: s_bitcmp1_b32 s6, 0 -; SI-DENORM-SLOWFMAF-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-DENORM-SLOWFMAF-NEXT: s_and_b64 vcc, exec, s[0:1] +; SI-DENORM-SLOWFMAF-NEXT: s_bitcmp1_b32 s8, 0 +; SI-DENORM-SLOWFMAF-NEXT: s_cselect_b64 s[2:3], -1, 0 +; SI-DENORM-SLOWFMAF-NEXT: s_and_b64 vcc, exec, s[2:3] ; SI-DENORM-SLOWFMAF-NEXT: v_mul_f32_e32 v1, v5, v1 ; SI-DENORM-SLOWFMAF-NEXT: v_fma_f32 v1, v3, v4, v1 ; SI-DENORM-SLOWFMAF-NEXT: s_cbranch_vccnz .LBB12_2 ; SI-DENORM-SLOWFMAF-NEXT: ; %bb.1: ; %normal ; SI-DENORM-SLOWFMAF-NEXT: v_sub_f32_e32 v3, v1, v2 -; SI-DENORM-SLOWFMAF-NEXT: s_mov_b64 s[2:3], 0 -; SI-DENORM-SLOWFMAF-NEXT: s_branch .LBB12_3 +; SI-DENORM-SLOWFMAF-NEXT: s_cbranch_execz .LBB12_3 +; SI-DENORM-SLOWFMAF-NEXT: s_branch .LBB12_4 ; SI-DENORM-SLOWFMAF-NEXT: .LBB12_2: -; SI-DENORM-SLOWFMAF-NEXT: s_mov_b64 s[2:3], -1 ; SI-DENORM-SLOWFMAF-NEXT: ; implicit-def: $vgpr3 -; SI-DENORM-SLOWFMAF-NEXT: .LBB12_3: ; %Flow -; SI-DENORM-SLOWFMAF-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; SI-DENORM-SLOWFMAF-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; SI-DENORM-SLOWFMAF-NEXT: s_waitcnt lgkmcnt(0) -; SI-DENORM-SLOWFMAF-NEXT: s_mov_b64 vcc, vcc -; SI-DENORM-SLOWFMAF-NEXT: s_cbranch_vccnz .LBB12_5 -; SI-DENORM-SLOWFMAF-NEXT: ; %bb.4: ; %aggressive +; SI-DENORM-SLOWFMAF-NEXT: .LBB12_3: ; %aggressive ; 
SI-DENORM-SLOWFMAF-NEXT: v_sub_f32_e32 v3, v1, v2 -; SI-DENORM-SLOWFMAF-NEXT: .LBB12_5: ; %exit +; SI-DENORM-SLOWFMAF-NEXT: .LBB12_4: ; %exit ; SI-DENORM-SLOWFMAF-NEXT: s_mov_b32 s3, 0xf000 ; SI-DENORM-SLOWFMAF-NEXT: s_mov_b32 s2, 0 ; SI-DENORM-SLOWFMAF-NEXT: v_mov_b32_e32 v1, 0 @@ -1275,142 +1257,124 @@ define amdgpu_kernel void @aggressive_combine_to_mad_fsub_1_f32(ptr addrspace(1) define amdgpu_kernel void @aggressive_combine_to_mad_fsub_2_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i1 %is_aggressive) #1 { ; SI-STD-LABEL: aggressive_combine_to_mad_fsub_2_f32: ; SI-STD: ; %bb.0: -; SI-STD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb -; SI-STD-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-STD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-STD-NEXT: s_load_dword s8, s[4:5], 0xd ; SI-STD-NEXT: v_mov_b32_e32 v1, 0 -; SI-STD-NEXT: s_mov_b32 s3, 0xf000 -; SI-STD-NEXT: s_mov_b32 s2, 0 -; SI-STD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-STD-NEXT: s_mov_b32 s7, 0xf000 +; SI-STD-NEXT: s_mov_b32 s6, 0 ; SI-STD-NEXT: s_waitcnt lgkmcnt(0) -; SI-STD-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-STD-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-STD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-STD-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc ; SI-STD-NEXT: s_waitcnt vmcnt(0) -; SI-STD-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-STD-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc ; SI-STD-NEXT: s_waitcnt vmcnt(0) -; SI-STD-NEXT: buffer_load_dword v4, v[0:1], s[0:3], 0 addr64 offset:8 glc +; SI-STD-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:8 glc ; SI-STD-NEXT: s_waitcnt vmcnt(0) -; SI-STD-NEXT: buffer_load_dword v6, v[0:1], s[0:3], 0 addr64 offset:12 glc +; SI-STD-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 offset:12 glc ; SI-STD-NEXT: s_waitcnt vmcnt(0) -; SI-STD-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:16 glc +; SI-STD-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-STD-NEXT: s_waitcnt vmcnt(0) -; SI-STD-NEXT: s_bitcmp1_b32 s6, 0 -; SI-STD-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-STD-NEXT: s_and_b64 vcc, exec, s[0:1] +; SI-STD-NEXT: s_bitcmp1_b32 s8, 0 +; SI-STD-NEXT: s_cselect_b64 s[2:3], -1, 0 +; SI-STD-NEXT: s_and_b64 vcc, exec, s[2:3] ; SI-STD-NEXT: s_cbranch_vccnz .LBB14_2 ; SI-STD-NEXT: ; %bb.1: ; %normal -; SI-STD-NEXT: v_mul_f32_e32 v5, v6, v1 -; SI-STD-NEXT: v_mac_f32_e32 v5, v2, v3 -; SI-STD-NEXT: v_sub_f32_e32 v5, v5, v4 -; SI-STD-NEXT: s_mov_b64 s[2:3], 0 -; SI-STD-NEXT: s_branch .LBB14_3 +; SI-STD-NEXT: v_mul_f32_e32 v4, v6, v1 +; SI-STD-NEXT: v_mac_f32_e32 v4, v2, v3 +; SI-STD-NEXT: v_sub_f32_e32 v4, v4, v5 +; SI-STD-NEXT: s_cbranch_execz .LBB14_3 +; SI-STD-NEXT: s_branch .LBB14_4 ; SI-STD-NEXT: .LBB14_2: -; SI-STD-NEXT: s_mov_b64 s[2:3], -1 -; SI-STD-NEXT: ; implicit-def: $vgpr5 -; SI-STD-NEXT: .LBB14_3: ; %Flow -; SI-STD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; SI-STD-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; SI-STD-NEXT: s_waitcnt lgkmcnt(0) -; SI-STD-NEXT: s_mov_b64 vcc, vcc -; SI-STD-NEXT: s_cbranch_vccnz .LBB14_5 -; SI-STD-NEXT: ; %bb.4: ; %aggressive -; SI-STD-NEXT: v_mad_f32 v5, v6, v1, -v4 -; SI-STD-NEXT: v_mac_f32_e32 v5, v2, v3 -; SI-STD-NEXT: .LBB14_5: ; %exit +; SI-STD-NEXT: ; implicit-def: $vgpr4 +; SI-STD-NEXT: .LBB14_3: ; %aggressive +; SI-STD-NEXT: v_mad_f32 v4, v6, v1, -v5 +; SI-STD-NEXT: v_mac_f32_e32 v4, v2, v3 +; SI-STD-NEXT: .LBB14_4: ; %exit ; SI-STD-NEXT: s_mov_b32 s3, 0xf000 ; SI-STD-NEXT: s_mov_b32 s2, 0 ; SI-STD-NEXT: 
v_mov_b32_e32 v1, 0 -; SI-STD-NEXT: buffer_store_dword v5, v[0:1], s[0:3], 0 addr64 +; SI-STD-NEXT: buffer_store_dword v4, v[0:1], s[0:3], 0 addr64 ; SI-STD-NEXT: s_endpgm ; ; SI-DENORM-FASTFMAF-LABEL: aggressive_combine_to_mad_fsub_2_f32: ; SI-DENORM-FASTFMAF: ; %bb.0: -; SI-DENORM-FASTFMAF-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb -; SI-DENORM-FASTFMAF-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-DENORM-FASTFMAF-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-DENORM-FASTFMAF-NEXT: s_load_dword s8, s[4:5], 0xd ; SI-DENORM-FASTFMAF-NEXT: v_mov_b32_e32 v1, 0 -; SI-DENORM-FASTFMAF-NEXT: s_mov_b32 s3, 0xf000 -; SI-DENORM-FASTFMAF-NEXT: s_mov_b32 s2, 0 -; SI-DENORM-FASTFMAF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMAF-NEXT: s_mov_b32 s7, 0xf000 +; SI-DENORM-FASTFMAF-NEXT: s_mov_b32 s6, 0 ; SI-DENORM-FASTFMAF-NEXT: s_waitcnt lgkmcnt(0) -; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-DENORM-FASTFMAF-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-DENORM-FASTFMAF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc ; SI-DENORM-FASTFMAF-NEXT: s_waitcnt vmcnt(0) -; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc ; SI-DENORM-FASTFMAF-NEXT: s_waitcnt vmcnt(0) -; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v4, v[0:1], s[0:3], 0 addr64 offset:8 glc +; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:8 glc ; SI-DENORM-FASTFMAF-NEXT: s_waitcnt vmcnt(0) -; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v5, v[0:1], s[0:3], 0 addr64 offset:12 glc +; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 offset:12 glc ; SI-DENORM-FASTFMAF-NEXT: s_waitcnt vmcnt(0) -; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:16 glc +; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-DENORM-FASTFMAF-NEXT: s_waitcnt vmcnt(0) -; SI-DENORM-FASTFMAF-NEXT: s_bitcmp1_b32 s6, 0 -; SI-DENORM-FASTFMAF-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-DENORM-FASTFMAF-NEXT: s_and_b64 vcc, exec, s[0:1] +; SI-DENORM-FASTFMAF-NEXT: s_bitcmp1_b32 s8, 0 +; SI-DENORM-FASTFMAF-NEXT: s_cselect_b64 s[2:3], -1, 0 +; SI-DENORM-FASTFMAF-NEXT: s_and_b64 vcc, exec, s[2:3] ; SI-DENORM-FASTFMAF-NEXT: s_cbranch_vccnz .LBB14_2 ; SI-DENORM-FASTFMAF-NEXT: ; %bb.1: ; %normal -; SI-DENORM-FASTFMAF-NEXT: v_mul_f32_e32 v6, v5, v1 -; SI-DENORM-FASTFMAF-NEXT: v_fma_f32 v6, v2, v3, v6 -; SI-DENORM-FASTFMAF-NEXT: v_sub_f32_e32 v6, v6, v4 -; SI-DENORM-FASTFMAF-NEXT: s_mov_b64 s[2:3], 0 -; SI-DENORM-FASTFMAF-NEXT: s_branch .LBB14_3 +; SI-DENORM-FASTFMAF-NEXT: v_mul_f32_e32 v4, v6, v1 +; SI-DENORM-FASTFMAF-NEXT: v_fma_f32 v4, v2, v3, v4 +; SI-DENORM-FASTFMAF-NEXT: v_sub_f32_e32 v4, v4, v5 +; SI-DENORM-FASTFMAF-NEXT: s_cbranch_execz .LBB14_3 +; SI-DENORM-FASTFMAF-NEXT: s_branch .LBB14_4 ; SI-DENORM-FASTFMAF-NEXT: .LBB14_2: -; SI-DENORM-FASTFMAF-NEXT: s_mov_b64 s[2:3], -1 -; SI-DENORM-FASTFMAF-NEXT: ; implicit-def: $vgpr6 -; SI-DENORM-FASTFMAF-NEXT: .LBB14_3: ; %Flow -; SI-DENORM-FASTFMAF-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; SI-DENORM-FASTFMAF-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; SI-DENORM-FASTFMAF-NEXT: s_waitcnt lgkmcnt(0) -; SI-DENORM-FASTFMAF-NEXT: s_mov_b64 vcc, vcc -; SI-DENORM-FASTFMAF-NEXT: s_cbranch_vccnz .LBB14_5 -; SI-DENORM-FASTFMAF-NEXT: ; %bb.4: ; %aggressive -; SI-DENORM-FASTFMAF-NEXT: v_fma_f32 v1, v5, v1, -v4 -; 
SI-DENORM-FASTFMAF-NEXT: v_fma_f32 v6, v2, v3, v1 -; SI-DENORM-FASTFMAF-NEXT: .LBB14_5: ; %exit +; SI-DENORM-FASTFMAF-NEXT: ; implicit-def: $vgpr4 +; SI-DENORM-FASTFMAF-NEXT: .LBB14_3: ; %aggressive +; SI-DENORM-FASTFMAF-NEXT: v_fma_f32 v1, v6, v1, -v5 +; SI-DENORM-FASTFMAF-NEXT: v_fma_f32 v4, v2, v3, v1 +; SI-DENORM-FASTFMAF-NEXT: .LBB14_4: ; %exit ; SI-DENORM-FASTFMAF-NEXT: s_mov_b32 s3, 0xf000 ; SI-DENORM-FASTFMAF-NEXT: s_mov_b32 s2, 0 ; SI-DENORM-FASTFMAF-NEXT: v_mov_b32_e32 v1, 0 -; SI-DENORM-FASTFMAF-NEXT: buffer_store_dword v6, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMAF-NEXT: buffer_store_dword v4, v[0:1], s[0:3], 0 addr64 ; SI-DENORM-FASTFMAF-NEXT: s_endpgm ; ; SI-DENORM-SLOWFMAF-LABEL: aggressive_combine_to_mad_fsub_2_f32: ; SI-DENORM-SLOWFMAF: ; %bb.0: -; SI-DENORM-SLOWFMAF-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb -; SI-DENORM-SLOWFMAF-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-DENORM-SLOWFMAF-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-DENORM-SLOWFMAF-NEXT: s_load_dword s8, s[4:5], 0xd ; SI-DENORM-SLOWFMAF-NEXT: v_mov_b32_e32 v1, 0 -; SI-DENORM-SLOWFMAF-NEXT: s_mov_b32 s3, 0xf000 -; SI-DENORM-SLOWFMAF-NEXT: s_mov_b32 s2, 0 -; SI-DENORM-SLOWFMAF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-SLOWFMAF-NEXT: s_mov_b32 s7, 0xf000 +; SI-DENORM-SLOWFMAF-NEXT: s_mov_b32 s6, 0 ; SI-DENORM-SLOWFMAF-NEXT: s_waitcnt lgkmcnt(0) -; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 glc +; SI-DENORM-SLOWFMAF-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-DENORM-SLOWFMAF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc ; SI-DENORM-SLOWFMAF-NEXT: s_waitcnt vmcnt(0) -; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v4, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:4 glc ; SI-DENORM-SLOWFMAF-NEXT: s_waitcnt vmcnt(0) -; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 offset:8 glc +; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 offset:8 glc ; SI-DENORM-SLOWFMAF-NEXT: s_waitcnt vmcnt(0) -; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v5, v[0:1], s[0:3], 0 addr64 offset:12 glc +; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:12 glc ; SI-DENORM-SLOWFMAF-NEXT: s_waitcnt vmcnt(0) -; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:16 glc +; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-DENORM-SLOWFMAF-NEXT: s_waitcnt vmcnt(0) -; SI-DENORM-SLOWFMAF-NEXT: s_bitcmp1_b32 s6, 0 -; SI-DENORM-SLOWFMAF-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-DENORM-SLOWFMAF-NEXT: s_and_b64 vcc, exec, s[0:1] +; SI-DENORM-SLOWFMAF-NEXT: s_bitcmp1_b32 s8, 0 +; SI-DENORM-SLOWFMAF-NEXT: s_cselect_b64 s[2:3], -1, 0 +; SI-DENORM-SLOWFMAF-NEXT: s_and_b64 vcc, exec, s[2:3] ; SI-DENORM-SLOWFMAF-NEXT: v_mul_f32_e32 v3, v3, v4 ; SI-DENORM-SLOWFMAF-NEXT: v_mul_f32_e32 v1, v5, v1 ; SI-DENORM-SLOWFMAF-NEXT: v_add_f32_e32 v1, v3, v1 ; SI-DENORM-SLOWFMAF-NEXT: s_cbranch_vccnz .LBB14_2 ; SI-DENORM-SLOWFMAF-NEXT: ; %bb.1: ; %normal ; SI-DENORM-SLOWFMAF-NEXT: v_sub_f32_e32 v3, v1, v2 -; SI-DENORM-SLOWFMAF-NEXT: s_mov_b64 s[2:3], 0 -; SI-DENORM-SLOWFMAF-NEXT: s_branch .LBB14_3 +; SI-DENORM-SLOWFMAF-NEXT: s_cbranch_execz .LBB14_3 +; SI-DENORM-SLOWFMAF-NEXT: s_branch .LBB14_4 ; SI-DENORM-SLOWFMAF-NEXT: .LBB14_2: -; SI-DENORM-SLOWFMAF-NEXT: s_mov_b64 s[2:3], -1 ; SI-DENORM-SLOWFMAF-NEXT: ; implicit-def: $vgpr3 -; SI-DENORM-SLOWFMAF-NEXT: .LBB14_3: ; 
%Flow -; SI-DENORM-SLOWFMAF-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; SI-DENORM-SLOWFMAF-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; SI-DENORM-SLOWFMAF-NEXT: s_waitcnt lgkmcnt(0) -; SI-DENORM-SLOWFMAF-NEXT: s_mov_b64 vcc, vcc -; SI-DENORM-SLOWFMAF-NEXT: s_cbranch_vccnz .LBB14_5 -; SI-DENORM-SLOWFMAF-NEXT: ; %bb.4: ; %aggressive +; SI-DENORM-SLOWFMAF-NEXT: .LBB14_3: ; %aggressive ; SI-DENORM-SLOWFMAF-NEXT: v_sub_f32_e32 v3, v1, v2 -; SI-DENORM-SLOWFMAF-NEXT: .LBB14_5: ; %exit +; SI-DENORM-SLOWFMAF-NEXT: .LBB14_4: ; %exit ; SI-DENORM-SLOWFMAF-NEXT: s_mov_b32 s3, 0xf000 ; SI-DENORM-SLOWFMAF-NEXT: s_mov_b32 s2, 0 ; SI-DENORM-SLOWFMAF-NEXT: v_mov_b32_e32 v1, 0 @@ -1455,142 +1419,124 @@ exit: define amdgpu_kernel void @aggressive_combine_to_mad_fsub_3_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i1 %is_aggressive) #1 { ; SI-STD-LABEL: aggressive_combine_to_mad_fsub_3_f32: ; SI-STD: ; %bb.0: -; SI-STD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb -; SI-STD-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-STD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-STD-NEXT: s_load_dword s8, s[4:5], 0xd ; SI-STD-NEXT: v_mov_b32_e32 v1, 0 -; SI-STD-NEXT: s_mov_b32 s3, 0xf000 -; SI-STD-NEXT: s_mov_b32 s2, 0 -; SI-STD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-STD-NEXT: s_mov_b32 s7, 0xf000 +; SI-STD-NEXT: s_mov_b32 s6, 0 ; SI-STD-NEXT: s_waitcnt lgkmcnt(0) -; SI-STD-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-STD-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-STD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-STD-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc ; SI-STD-NEXT: s_waitcnt vmcnt(0) -; SI-STD-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-STD-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc ; SI-STD-NEXT: s_waitcnt vmcnt(0) -; SI-STD-NEXT: buffer_load_dword v4, v[0:1], s[0:3], 0 addr64 offset:8 glc +; SI-STD-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc ; SI-STD-NEXT: s_waitcnt vmcnt(0) -; SI-STD-NEXT: buffer_load_dword v5, v[0:1], s[0:3], 0 addr64 offset:12 glc +; SI-STD-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 offset:12 glc ; SI-STD-NEXT: s_waitcnt vmcnt(0) -; SI-STD-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:16 glc +; SI-STD-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-STD-NEXT: s_waitcnt vmcnt(0) -; SI-STD-NEXT: s_bitcmp1_b32 s6, 0 -; SI-STD-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-STD-NEXT: s_and_b64 vcc, exec, s[0:1] +; SI-STD-NEXT: s_bitcmp1_b32 s8, 0 +; SI-STD-NEXT: s_cselect_b64 s[2:3], -1, 0 +; SI-STD-NEXT: s_and_b64 vcc, exec, s[2:3] ; SI-STD-NEXT: s_cbranch_vccnz .LBB15_2 ; SI-STD-NEXT: ; %bb.1: ; %normal -; SI-STD-NEXT: v_mul_f32_e32 v6, v5, v1 -; SI-STD-NEXT: v_mac_f32_e32 v6, v3, v4 -; SI-STD-NEXT: v_sub_f32_e32 v6, v2, v6 -; SI-STD-NEXT: s_mov_b64 s[2:3], 0 -; SI-STD-NEXT: s_branch .LBB15_3 +; SI-STD-NEXT: v_mul_f32_e32 v5, v6, v1 +; SI-STD-NEXT: v_mac_f32_e32 v5, v3, v4 +; SI-STD-NEXT: v_sub_f32_e32 v5, v2, v5 +; SI-STD-NEXT: s_cbranch_execz .LBB15_3 +; SI-STD-NEXT: s_branch .LBB15_4 ; SI-STD-NEXT: .LBB15_2: -; SI-STD-NEXT: s_mov_b64 s[2:3], -1 -; SI-STD-NEXT: ; implicit-def: $vgpr6 -; SI-STD-NEXT: .LBB15_3: ; %Flow -; SI-STD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; SI-STD-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; SI-STD-NEXT: s_waitcnt lgkmcnt(0) -; SI-STD-NEXT: s_mov_b64 vcc, vcc -; SI-STD-NEXT: s_cbranch_vccnz .LBB15_5 -; SI-STD-NEXT: ; %bb.4: ; %aggressive -; SI-STD-NEXT: v_mad_f32 v1, -v5, v1, v2 -; SI-STD-NEXT: v_mad_f32 v6, -v3, v4, v1 -; 
SI-STD-NEXT: .LBB15_5: ; %exit +; SI-STD-NEXT: ; implicit-def: $vgpr5 +; SI-STD-NEXT: .LBB15_3: ; %aggressive +; SI-STD-NEXT: v_mad_f32 v1, -v6, v1, v2 +; SI-STD-NEXT: v_mad_f32 v5, -v3, v4, v1 +; SI-STD-NEXT: .LBB15_4: ; %exit ; SI-STD-NEXT: s_mov_b32 s3, 0xf000 ; SI-STD-NEXT: s_mov_b32 s2, 0 ; SI-STD-NEXT: v_mov_b32_e32 v1, 0 -; SI-STD-NEXT: buffer_store_dword v6, v[0:1], s[0:3], 0 addr64 +; SI-STD-NEXT: buffer_store_dword v5, v[0:1], s[0:3], 0 addr64 ; SI-STD-NEXT: s_endpgm ; ; SI-DENORM-FASTFMAF-LABEL: aggressive_combine_to_mad_fsub_3_f32: ; SI-DENORM-FASTFMAF: ; %bb.0: -; SI-DENORM-FASTFMAF-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb -; SI-DENORM-FASTFMAF-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-DENORM-FASTFMAF-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-DENORM-FASTFMAF-NEXT: s_load_dword s8, s[4:5], 0xd ; SI-DENORM-FASTFMAF-NEXT: v_mov_b32_e32 v1, 0 -; SI-DENORM-FASTFMAF-NEXT: s_mov_b32 s3, 0xf000 -; SI-DENORM-FASTFMAF-NEXT: s_mov_b32 s2, 0 -; SI-DENORM-FASTFMAF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMAF-NEXT: s_mov_b32 s7, 0xf000 +; SI-DENORM-FASTFMAF-NEXT: s_mov_b32 s6, 0 ; SI-DENORM-FASTFMAF-NEXT: s_waitcnt lgkmcnt(0) -; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-DENORM-FASTFMAF-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-DENORM-FASTFMAF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc ; SI-DENORM-FASTFMAF-NEXT: s_waitcnt vmcnt(0) -; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc ; SI-DENORM-FASTFMAF-NEXT: s_waitcnt vmcnt(0) -; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v4, v[0:1], s[0:3], 0 addr64 offset:8 glc +; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc ; SI-DENORM-FASTFMAF-NEXT: s_waitcnt vmcnt(0) -; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v5, v[0:1], s[0:3], 0 addr64 offset:12 glc +; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64 offset:12 glc ; SI-DENORM-FASTFMAF-NEXT: s_waitcnt vmcnt(0) -; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:16 glc +; SI-DENORM-FASTFMAF-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-DENORM-FASTFMAF-NEXT: s_waitcnt vmcnt(0) -; SI-DENORM-FASTFMAF-NEXT: s_bitcmp1_b32 s6, 0 -; SI-DENORM-FASTFMAF-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-DENORM-FASTFMAF-NEXT: s_and_b64 vcc, exec, s[0:1] +; SI-DENORM-FASTFMAF-NEXT: s_bitcmp1_b32 s8, 0 +; SI-DENORM-FASTFMAF-NEXT: s_cselect_b64 s[2:3], -1, 0 +; SI-DENORM-FASTFMAF-NEXT: s_and_b64 vcc, exec, s[2:3] ; SI-DENORM-FASTFMAF-NEXT: s_cbranch_vccnz .LBB15_2 ; SI-DENORM-FASTFMAF-NEXT: ; %bb.1: ; %normal -; SI-DENORM-FASTFMAF-NEXT: v_mul_f32_e32 v6, v5, v1 -; SI-DENORM-FASTFMAF-NEXT: v_fma_f32 v6, v3, v4, v6 -; SI-DENORM-FASTFMAF-NEXT: v_sub_f32_e32 v6, v2, v6 -; SI-DENORM-FASTFMAF-NEXT: s_mov_b64 s[2:3], 0 -; SI-DENORM-FASTFMAF-NEXT: s_branch .LBB15_3 +; SI-DENORM-FASTFMAF-NEXT: v_mul_f32_e32 v5, v6, v1 +; SI-DENORM-FASTFMAF-NEXT: v_fma_f32 v5, v3, v4, v5 +; SI-DENORM-FASTFMAF-NEXT: v_sub_f32_e32 v5, v2, v5 +; SI-DENORM-FASTFMAF-NEXT: s_cbranch_execz .LBB15_3 +; SI-DENORM-FASTFMAF-NEXT: s_branch .LBB15_4 ; SI-DENORM-FASTFMAF-NEXT: .LBB15_2: -; SI-DENORM-FASTFMAF-NEXT: s_mov_b64 s[2:3], -1 -; SI-DENORM-FASTFMAF-NEXT: ; implicit-def: $vgpr6 -; SI-DENORM-FASTFMAF-NEXT: .LBB15_3: ; %Flow -; SI-DENORM-FASTFMAF-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; 
SI-DENORM-FASTFMAF-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; SI-DENORM-FASTFMAF-NEXT: s_waitcnt lgkmcnt(0) -; SI-DENORM-FASTFMAF-NEXT: s_mov_b64 vcc, vcc -; SI-DENORM-FASTFMAF-NEXT: s_cbranch_vccnz .LBB15_5 -; SI-DENORM-FASTFMAF-NEXT: ; %bb.4: ; %aggressive -; SI-DENORM-FASTFMAF-NEXT: v_fma_f32 v1, -v5, v1, v2 -; SI-DENORM-FASTFMAF-NEXT: v_fma_f32 v6, -v3, v4, v1 -; SI-DENORM-FASTFMAF-NEXT: .LBB15_5: ; %exit +; SI-DENORM-FASTFMAF-NEXT: ; implicit-def: $vgpr5 +; SI-DENORM-FASTFMAF-NEXT: .LBB15_3: ; %aggressive +; SI-DENORM-FASTFMAF-NEXT: v_fma_f32 v1, -v6, v1, v2 +; SI-DENORM-FASTFMAF-NEXT: v_fma_f32 v5, -v3, v4, v1 +; SI-DENORM-FASTFMAF-NEXT: .LBB15_4: ; %exit ; SI-DENORM-FASTFMAF-NEXT: s_mov_b32 s3, 0xf000 ; SI-DENORM-FASTFMAF-NEXT: s_mov_b32 s2, 0 ; SI-DENORM-FASTFMAF-NEXT: v_mov_b32_e32 v1, 0 -; SI-DENORM-FASTFMAF-NEXT: buffer_store_dword v6, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMAF-NEXT: buffer_store_dword v5, v[0:1], s[0:3], 0 addr64 ; SI-DENORM-FASTFMAF-NEXT: s_endpgm ; ; SI-DENORM-SLOWFMAF-LABEL: aggressive_combine_to_mad_fsub_3_f32: ; SI-DENORM-SLOWFMAF: ; %bb.0: -; SI-DENORM-SLOWFMAF-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb -; SI-DENORM-SLOWFMAF-NEXT: s_load_dword s6, s[4:5], 0xd +; SI-DENORM-SLOWFMAF-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-DENORM-SLOWFMAF-NEXT: s_load_dword s8, s[4:5], 0xd ; SI-DENORM-SLOWFMAF-NEXT: v_mov_b32_e32 v1, 0 -; SI-DENORM-SLOWFMAF-NEXT: s_mov_b32 s3, 0xf000 -; SI-DENORM-SLOWFMAF-NEXT: s_mov_b32 s2, 0 -; SI-DENORM-SLOWFMAF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-SLOWFMAF-NEXT: s_mov_b32 s7, 0xf000 +; SI-DENORM-SLOWFMAF-NEXT: s_mov_b32 s6, 0 ; SI-DENORM-SLOWFMAF-NEXT: s_waitcnt lgkmcnt(0) -; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-DENORM-SLOWFMAF-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-DENORM-SLOWFMAF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc ; SI-DENORM-SLOWFMAF-NEXT: s_waitcnt vmcnt(0) -; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc ; SI-DENORM-SLOWFMAF-NEXT: s_waitcnt vmcnt(0) -; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v4, v[0:1], s[0:3], 0 addr64 offset:8 glc +; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc ; SI-DENORM-SLOWFMAF-NEXT: s_waitcnt vmcnt(0) -; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v5, v[0:1], s[0:3], 0 addr64 offset:12 glc +; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:12 glc ; SI-DENORM-SLOWFMAF-NEXT: s_waitcnt vmcnt(0) -; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:16 glc +; SI-DENORM-SLOWFMAF-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-DENORM-SLOWFMAF-NEXT: s_waitcnt vmcnt(0) -; SI-DENORM-SLOWFMAF-NEXT: s_bitcmp1_b32 s6, 0 -; SI-DENORM-SLOWFMAF-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-DENORM-SLOWFMAF-NEXT: s_and_b64 vcc, exec, s[0:1] +; SI-DENORM-SLOWFMAF-NEXT: s_bitcmp1_b32 s8, 0 +; SI-DENORM-SLOWFMAF-NEXT: s_cselect_b64 s[2:3], -1, 0 +; SI-DENORM-SLOWFMAF-NEXT: s_and_b64 vcc, exec, s[2:3] ; SI-DENORM-SLOWFMAF-NEXT: v_mul_f32_e32 v3, v3, v4 ; SI-DENORM-SLOWFMAF-NEXT: v_mul_f32_e32 v1, v5, v1 ; SI-DENORM-SLOWFMAF-NEXT: v_add_f32_e32 v1, v3, v1 ; SI-DENORM-SLOWFMAF-NEXT: s_cbranch_vccnz .LBB15_2 ; SI-DENORM-SLOWFMAF-NEXT: ; %bb.1: ; %normal ; SI-DENORM-SLOWFMAF-NEXT: v_sub_f32_e32 v3, v2, v1 -; SI-DENORM-SLOWFMAF-NEXT: s_mov_b64 s[2:3], 0 -; 
SI-DENORM-SLOWFMAF-NEXT: s_branch .LBB15_3 +; SI-DENORM-SLOWFMAF-NEXT: s_cbranch_execz .LBB15_3 +; SI-DENORM-SLOWFMAF-NEXT: s_branch .LBB15_4 ; SI-DENORM-SLOWFMAF-NEXT: .LBB15_2: -; SI-DENORM-SLOWFMAF-NEXT: s_mov_b64 s[2:3], -1 ; SI-DENORM-SLOWFMAF-NEXT: ; implicit-def: $vgpr3 -; SI-DENORM-SLOWFMAF-NEXT: .LBB15_3: ; %Flow -; SI-DENORM-SLOWFMAF-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; SI-DENORM-SLOWFMAF-NEXT: s_andn2_b64 vcc, exec, s[2:3] -; SI-DENORM-SLOWFMAF-NEXT: s_waitcnt lgkmcnt(0) -; SI-DENORM-SLOWFMAF-NEXT: s_mov_b64 vcc, vcc -; SI-DENORM-SLOWFMAF-NEXT: s_cbranch_vccnz .LBB15_5 -; SI-DENORM-SLOWFMAF-NEXT: ; %bb.4: ; %aggressive +; SI-DENORM-SLOWFMAF-NEXT: .LBB15_3: ; %aggressive ; SI-DENORM-SLOWFMAF-NEXT: v_sub_f32_e32 v3, v2, v1 -; SI-DENORM-SLOWFMAF-NEXT: .LBB15_5: ; %exit +; SI-DENORM-SLOWFMAF-NEXT: .LBB15_4: ; %exit ; SI-DENORM-SLOWFMAF-NEXT: s_mov_b32 s3, 0xf000 ; SI-DENORM-SLOWFMAF-NEXT: s_mov_b32 s2, 0 ; SI-DENORM-SLOWFMAF-NEXT: v_mov_b32_e32 v1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll index 0a5160145fbd8..287839eba695b 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.ll @@ -646,13 +646,15 @@ define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1 define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64 %b) nounwind { ; GFX6-LABEL: s_sub_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_sub_u32 s0, s0, s2 -; GFX6-NEXT: s_subb_u32 s1, s1, s3 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_sub_u32 s0, s2, s8 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: s_subb_u32 s1, s3, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -660,41 +662,41 @@ define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64 ; ; GFX8-LABEL: s_sub_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sub_u32 s0, s0, s2 -; GFX8-NEXT: s_subb_u32 s1, s1, s3 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_sub_u32 s0, s2, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_subb_u32 s1, s3, s5 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: s_sub_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_u32 s0, s0, s2 -; GFX9-NEXT: s_subb_u32 s1, s1, s3 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: s_sub_u32 s2, s2, s6 +; GFX9-NEXT: s_subb_u32 s3, s3, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: 
s_endpgm ; ; GFX12-LABEL: s_sub_i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_sub_nc_u64 s[0:1], s[0:1], s[2:3] +; GFX12-NEXT: s_sub_nc_u64 s[2:3], s[2:3], s[4:5] ; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm %result = sub i64 %a, %b store i64 %result, ptr addrspace(1) %out, align 8 @@ -739,12 +741,12 @@ define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspac ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: flat_store_dwordx2 v[4:5], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: v_sub_i64: @@ -831,14 +833,14 @@ define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v8, s0 +; GFX8-NEXT: v_mov_b32_e32 v9, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v6 ; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v4 ; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_sub_v2i64: diff --git a/llvm/test/CodeGen/AMDGPU/valu-i1.ll b/llvm/test/CodeGen/AMDGPU/valu-i1.ll index d254b7effbfc6..cd4522ee49c21 100644 --- a/llvm/test/CodeGen/AMDGPU/valu-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/valu-i1.ll @@ -298,35 +298,33 @@ exit: define amdgpu_kernel void @multi_vcond_loop(ptr addrspace(1) noalias nocapture %arg, ptr addrspace(1) noalias nocapture readonly %arg1, ptr addrspace(1) noalias nocapture readonly %arg2, ptr addrspace(1) noalias nocapture readonly %arg3) #1 { ; SI-LABEL: multi_vcond_loop: ; SI: ; %bb.0: ; %bb -; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xf +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-NEXT: v_mov_b32_e32 v7, 0 ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-NEXT: buffer_load_dword v0, v[6:7], s[8:11], 0 addr64 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_lt_i32_e32 vcc, 0, v0 -; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc +; SI-NEXT: s_and_saveexec_b64 s[6:7], vcc ; SI-NEXT: s_cbranch_execz .LBB5_5 ; SI-NEXT: ; %bb.1: ; %bb10.preheader -; SI-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; SI-NEXT: v_mov_b32_e32 v3, s1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s0, v6 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: 
v_mov_b32_e32 v5, s5
+; SI-NEXT: v_add_i32_e32 v4, vcc, s4, v6
+; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; SI-NEXT: v_mov_b32_e32 v7, s3
+; SI-NEXT: v_add_i32_e32 v6, vcc, s2, v6
+; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
 ; SI-NEXT: s_mov_b64 s[2:3], 0
 ; SI-NEXT: s_mov_b32 s8, s10
 ; SI-NEXT: s_mov_b32 s9, s10
 ; SI-NEXT: ; implicit-def: $sgpr4_sgpr5
-; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v3, s13
-; SI-NEXT: v_add_i32_e32 v2, vcc, s12, v6
-; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; SI-NEXT: v_mov_b32_e32 v5, s1
-; SI-NEXT: v_add_i32_e32 v4, vcc, s0, v6
-; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; SI-NEXT: v_mov_b32_e32 v7, s15
-; SI-NEXT: v_add_i32_e32 v6, vcc, s14, v6
-; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
 ; SI-NEXT: s_mov_b64 s[6:7], 0
 ; SI-NEXT: .LBB5_2: ; %bb10
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1

From d105d31fc689ee0ec1dc962f87157538ddc531e6 Mon Sep 17 00:00:00 2001
From: Leon Clark
Date: Mon, 24 Nov 2025 12:00:03 +0000
Subject: [PATCH 03/11] Formatting.

---
 .../AMDGPU/AMDGPULowerKernelArguments.cpp     | 22 +++++++++----------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
index e1b613cc233bc..f863c28424397 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -134,7 +134,7 @@ static void addAliasScopeMetadata(Function &F, DataLayout const &DL) {
 
     // Collect underlying objects of pointer arguments.
     auto Scopes = SmallVector<Metadata *>();
-    auto ObjSet = SmallPtrSet<Value const*, 4>();
+    auto ObjSet = SmallPtrSet<Value const *, 4>();
    auto NoAliases = SmallVector<Metadata *>();
 
     for (auto &Ptr : PtrArgs) {
@@ -149,15 +149,13 @@ static void addAliasScopeMetadata(Function &F, DataLayout const &DL) {
 
     for (auto *Val : ObjSet) {
       if (isa(Val) || isa(Val) ||
-          isa(Val) || isa(Val) ||
-          isa(Val))
+          isa(Val) || isa(Val) || isa(Val))
         continue;
 
       if (auto *Arg = dyn_cast<Argument>(Val)) {
         if (!Arg->hasAttribute(Attribute::NoAlias))
           UsesAliasingPtr = true;
-      }
-      else
+      } else
         UsesAliasingPtr = true;
 
       if (isEscapeSource(Val))
@@ -174,7 +172,7 @@ static void addAliasScopeMetadata(Function &F, DataLayout const &DL) {
       if (ObjSet.contains(Arg))
         continue;
 
-      if (!RequiresNoCaptureBefore ||
+      if (!RequiresNoCaptureBefore ||
           !capturesAnything(PointerMayBeCapturedBefore(
               Arg, false, I, &DT, false, CaptureComponents::Provenance)))
         NoAliases.push_back(NewScopes[Arg]);
@@ -182,9 +180,9 @@ static void addAliasScopeMetadata(Function &F, DataLayout const &DL) {
 
     // Add noalias metadata to instruction.
     if (!NoAliases.empty()) {
-      auto *NewMD = MDNode::concatenate(
-          Inst->getMetadata(LLVMContext::MD_noalias),
-          MDNode::get(F.getContext(), NoAliases));
+      auto *NewMD =
+          MDNode::concatenate(Inst->getMetadata(LLVMContext::MD_noalias),
+                              MDNode::get(F.getContext(), NoAliases));
       Inst->setMetadata(LLVMContext::MD_noalias, NewMD);
     }
 
@@ -197,9 +195,9 @@ static void addAliasScopeMetadata(Function &F, DataLayout const &DL) {
 
     // Add alias.scope metadata to instruction.
    if (!Scopes.empty()) {
-      auto *NewMD = MDNode::concatenate(
-          Inst->getMetadata(LLVMContext::MD_alias_scope),
-          MDNode::get(F.getContext(), Scopes));
+      auto *NewMD =
+          MDNode::concatenate(Inst->getMetadata(LLVMContext::MD_alias_scope),
+                              MDNode::get(F.getContext(), Scopes));
       Inst->setMetadata(LLVMContext::MD_alias_scope, NewMD);
     }
   }

From aca295a56dda912461bad0f03f5857ea26eb8309 Mon Sep 17 00:00:00 2001
From: Leon Clark
Date: Tue, 25 Nov 2025 17:00:53 +0000
Subject: [PATCH 04/11] Remove auto.

---
 .../AMDGPU/AMDGPULowerKernelArguments.cpp     | 71 ++++++++++---------
 1 file changed, 36 insertions(+), 35 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
index f863c28424397..60fea4f6a9de1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -23,6 +23,7 @@
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instruction.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/MDBuilder.h"
@@ -68,9 +69,9 @@ static BasicBlock::iterator getInsertPt(BasicBlock &BB) {
 
 static void addAliasScopeMetadata(Function &F, DataLayout const &DL) {
   // Collect noalias arguments.
-  auto NoAliasArgs = SmallVector<Argument const *>();
+  SmallVector<Argument const *> NoAliasArgs;
 
-  for (auto &Arg : F.args())
+  for (Argument &Arg : F.args())
     if (Arg.hasNoAliasAttr() && !Arg.use_empty())
       NoAliasArgs.push_back(&Arg);
 
@@ -78,50 +79,50 @@ static void addAliasScopeMetadata(Function &F, DataLayout const &DL) {
     return;
 
   // Add alias scopes for each noalias argument.
-  auto MDB = MDBuilder(F.getContext());
-  auto NewScopes = DenseMap<Argument const *, MDNode *>();
-  auto *NewDomain = MDB.createAnonymousAliasScopeDomain(F.getName());
+  MDBuilder MDB(F.getContext());
+  DenseMap<Argument const *, MDNode *> NewScopes;
+  MDNode *NewDomain = MDB.createAnonymousAliasScopeDomain(F.getName());
 
-  for (auto I = 0u; I < NoAliasArgs.size(); ++I) {
-    auto *Arg = NoAliasArgs[I];
-    auto Name = std::string(F.getName());
+  for (unsigned I = 0u; I < NoAliasArgs.size(); ++I) {
+    Argument const *Arg = NoAliasArgs[I];
+    std::string Name(F.getName());
 
     if (Arg->hasName())
       Name += std::string(": %") + std::string(Arg->getName());
     else
       Name += std::string(": argument ") + std::to_string(I);
 
-    auto *NewScope = MDB.createAnonymousAliasScope(NewDomain, Name);
+    MDNode *NewScope = MDB.createAnonymousAliasScope(NewDomain, Name);
     NewScopes.insert(std::make_pair(Arg, NewScope));
   }
 
   // Iterate over all instructions.
-  auto DT = DominatorTree();
+  DominatorTree DT;
   DT.recalculate(F);
 
-  for (auto Inst = inst_begin(F); Inst != inst_end(F); ++Inst) {
+  for (inst_iterator Inst = inst_begin(F); Inst != inst_end(F); ++Inst) {
     // If instruction accesses memory, collect its pointer arguments.
-    auto *I = &(*Inst);
-    auto IsFuncCall = false;
-    auto PtrArgs = SmallVector<Value const *>();
+    Instruction *I = &(*Inst);
+    bool IsFuncCall = false;
+    SmallVector<Value const *> PtrArgs;
 
-    if (auto *LI = dyn_cast<LoadInst>(I))
+    if (LoadInst *LI = dyn_cast<LoadInst>(I))
       PtrArgs.push_back(LI->getPointerOperand());
-    else if (auto *SI = dyn_cast<StoreInst>(I))
+    else if (StoreInst *SI = dyn_cast<StoreInst>(I))
       PtrArgs.push_back(SI->getPointerOperand());
-    else if (auto *VAAI = dyn_cast<VAArgInst>(I))
+    else if (VAArgInst *VAAI = dyn_cast<VAArgInst>(I))
       PtrArgs.push_back(VAAI->getPointerOperand());
-    else if (auto *CXI = dyn_cast<AtomicCmpXchgInst>(I))
+    else if (AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(I))
       PtrArgs.push_back(CXI->getPointerOperand());
-    else if (auto *RMWI = dyn_cast<AtomicRMWInst>(I))
+    else if (AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(I))
       PtrArgs.push_back(RMWI->getPointerOperand());
-    else if (auto *Call = dyn_cast<CallBase>(I)) {
+    else if (CallBase *Call = dyn_cast<CallBase>(I)) {
       if (Call->doesNotAccessMemory())
         continue;
 
       IsFuncCall = true;
 
-      for (auto &Arg : Call->args()) {
+      for (Use &Arg : Call->args()) {
         if (!Arg->getType()->isPointerTy())
           continue;
 
         PtrArgs.push_back(Arg);
@@ -133,26 +134,26 @@ static void addAliasScopeMetadata(Function &F, DataLayout const &DL) {
       continue;
 
     // Collect underlying objects of pointer arguments.
-    auto Scopes = SmallVector<Metadata *>();
-    auto ObjSet = SmallPtrSet<Value const *, 4>();
-    auto NoAliases = SmallVector<Metadata *>();
+    SmallVector<Metadata *> Scopes;
+    SmallPtrSet<Value const *, 4> ObjSet;
+    SmallVector<Metadata *> NoAliases;
 
-    for (auto &Ptr : PtrArgs) {
-      auto Objects = SmallVector<Value const *>();
+    for (Value const *&Ptr : PtrArgs) {
+      SmallVector<Value const *> Objects;
       getUnderlyingObjects(Ptr, Objects);
       ObjSet.insert_range(Objects);
     }
 
-    auto RequiresNoCaptureBefore = false;
-    auto UsesUnknownObject = false;
-    auto UsesAliasingPtr = false;
+    bool RequiresNoCaptureBefore = false;
+    bool UsesUnknownObject = false;
+    bool UsesAliasingPtr = false;
 
-    for (auto *Val : ObjSet) {
+    for (Value const *Val : ObjSet) {
       if (isa(Val) || isa(Val) ||
           isa(Val) || isa(Val) || isa(Val))
         continue;
 
-      if (auto *Arg = dyn_cast<Argument>(Val)) {
+      if (Argument const *Arg = dyn_cast<Argument>(Val)) {
         if (!Arg->hasAttribute(Attribute::NoAlias))
           UsesAliasingPtr = true;
       } else
@@ -168,7 +169,7 @@ static void addAliasScopeMetadata(Function &F, DataLayout const &DL) {
       continue;
 
     // Collect noalias scopes for instruction.
-    for (auto *Arg : NoAliasArgs) {
+    for (Argument const *Arg : NoAliasArgs) {
       if (ObjSet.contains(Arg))
         continue;
 
@@ -180,7 +181,7 @@ static void addAliasScopeMetadata(Function &F, DataLayout const &DL) {
 
     // Add noalias metadata to instruction.
     if (!NoAliases.empty()) {
-      auto *NewMD =
+      MDNode *NewMD =
           MDNode::concatenate(Inst->getMetadata(LLVMContext::MD_noalias),
                               MDNode::get(F.getContext(), NoAliases));
       Inst->setMetadata(LLVMContext::MD_noalias, NewMD);
@@ -188,14 +189,14 @@ static void addAliasScopeMetadata(Function &F, DataLayout const &DL) {
 
     // Collect scopes for alias.scope metadata.
     if (!UsesAliasingPtr)
-      for (auto *Arg : NoAliasArgs) {
+      for (Argument const *Arg : NoAliasArgs) {
         if (ObjSet.count(Arg))
           Scopes.push_back(NewScopes[Arg]);
       }
 
     // Add alias.scope metadata to instruction.
     if (!Scopes.empty()) {
-      auto *NewMD =
+      MDNode *NewMD =
           MDNode::concatenate(Inst->getMetadata(LLVMContext::MD_alias_scope),
                               MDNode::get(F.getContext(), Scopes));
       Inst->setMetadata(LLVMContext::MD_alias_scope, NewMD);

From 96cb1429bf0c8d144a42bf91eb4091345773c662 Mon Sep 17 00:00:00 2001
From: Leon Clark
Date: Wed, 26 Nov 2025 06:20:16 +0000
Subject: [PATCH 05/11] Address review comments.
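
Switch the pointer-operand walk over to AMDGPU::getInterestingMemoryOperands
and always label scopes with a stable "func: argument N" string instead of
the argument's IR name.

For illustration only (a hand-written sketch with made-up names, not
autogenerated CHECK output from the test suite), a kernel with two noalias
pointer arguments now comes out of the pass in roughly this shape:

    define amdgpu_kernel void @k(ptr addrspace(1) noalias %out,
                                 ptr addrspace(1) noalias %in) {
      %v = load i32, ptr addrspace(1) %in, align 4, !alias.scope !1, !noalias !3
      store i32 %v, ptr addrspace(1) %out, align 4, !alias.scope !3, !noalias !1
      ret void
    }

    !0 = distinct !{!0, !"k"}                 ; anonymous alias domain for @k
    !1 = !{!2}
    !2 = distinct !{!2, !0, !"k: argument 1"} ; scope for %in
    !3 = !{!4}
    !4 = distinct !{!4, !0, !"k: argument 0"} ; scope for %out

Each access is tagged with its own argument's scope via !alias.scope and
with the scopes of the other noalias arguments via !noalias, which is what
lets AA disambiguate the load and the store above.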
---
 .../AMDGPU/AMDGPULowerKernelArguments.cpp     |  54 ++------
 llvm/test/CodeGen/AMDGPU/lower-kernargs.ll    |  12 +-
 .../CodeGen/AMDGPU/lower-noalias-kernargs.ll  | 120 +++++++++---------
 3 files changed, 79 insertions(+), 107 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
index 60fea4f6a9de1..2b7535297af29 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPU.h"
+#include "AMDGPUAsanInstrumentation.h"
 #include "GCNSubtarget.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/CaptureTracking.h"
@@ -86,12 +87,7 @@ static void addAliasScopeMetadata(Function &F, DataLayout const &DL) {
   for (unsigned I = 0u; I < NoAliasArgs.size(); ++I) {
     Argument const *Arg = NoAliasArgs[I];
     std::string Name(F.getName());
-
-    if (Arg->hasName())
-      Name += std::string(": %") + std::string(Arg->getName());
-    else
-      Name += std::string(": argument ") + std::to_string(I);
-
+    Name += std::string(": argument ") + std::to_string(I);
     MDNode *NewScope = MDB.createAnonymousAliasScope(NewDomain, Name);
     NewScopes.insert(std::make_pair(Arg, NewScope));
   }
@@ -100,37 +96,14 @@ static void addAliasScopeMetadata(Function &F, DataLayout const &DL) {
   DominatorTree DT;
   DT.recalculate(F);
 
-  for (inst_iterator Inst = inst_begin(F); Inst != inst_end(F); ++Inst) {
+  for (inst_iterator Inst = inst_begin(F), InstEnd = inst_end(F);
+       Inst != InstEnd; ++Inst) {
     // If instruction accesses memory, collect its pointer arguments.
     Instruction *I = &(*Inst);
-    bool IsFuncCall = false;
-    SmallVector<Value const *> PtrArgs;
-
-    if (LoadInst *LI = dyn_cast<LoadInst>(I))
-      PtrArgs.push_back(LI->getPointerOperand());
-    else if (StoreInst *SI = dyn_cast<StoreInst>(I))
-      PtrArgs.push_back(SI->getPointerOperand());
-    else if (VAArgInst *VAAI = dyn_cast<VAArgInst>(I))
-      PtrArgs.push_back(VAAI->getPointerOperand());
-    else if (AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(I))
-      PtrArgs.push_back(CXI->getPointerOperand());
-    else if (AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(I))
-      PtrArgs.push_back(RMWI->getPointerOperand());
-    else if (CallBase *Call = dyn_cast<CallBase>(I)) {
-      if (Call->doesNotAccessMemory())
-        continue;
-
-      IsFuncCall = true;
-
-      for (Use &Arg : Call->args()) {
-        if (!Arg->getType()->isPointerTy())
-          continue;
-
-        PtrArgs.push_back(Arg);
-      }
-    }
+    SmallVector<InterestingMemoryOperand> MemOps;
+    llvm::AMDGPU::getInterestingMemoryOperands(*F.getParent(), I, MemOps);
 
-    if (PtrArgs.empty() && !IsFuncCall)
+    if (MemOps.empty())
       continue;
 
     // Collect underlying objects of pointer arguments.
@@ -138,9 +111,9 @@ static void addAliasScopeMetadata(Function &F, DataLayout const &DL) {
     SmallVector<Metadata *> Scopes;
     SmallPtrSet<Value const *, 4> ObjSet;
     SmallVector<Metadata *> NoAliases;
 
-    for (Value const *&Ptr : PtrArgs) {
+    for (InterestingMemoryOperand &MO : MemOps) {
       SmallVector<Value const *> Objects;
-      getUnderlyingObjects(Ptr, Objects);
+      getUnderlyingObjects(MO.getPtr(), Objects);
       ObjSet.insert_range(Objects);
     }
@@ -148,12 +121,11 @@ static void addAliasScopeMetadata(Function &F, DataLayout const &DL) {
     bool RequiresNoCaptureBefore = false;
     bool UsesUnknownObject = false;
     bool UsesAliasingPtr = false;
 
-    for (Value const *Val : ObjSet) {
-      if (isa(Val) || isa(Val) ||
-          isa(Val) || isa(Val) || isa(Val))
+    for (const Value *Val : ObjSet) {
+      if (isa(Val) || isa(Val))
         continue;
 
-      if (Argument const *Arg = dyn_cast<Argument>(Val)) {
+      if (const Argument *Arg = dyn_cast<Argument>(Val)) {
         if (!Arg->hasAttribute(Attribute::NoAlias))
           UsesAliasingPtr = true;
       } else
@@ -387,6 +359,7 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
 bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
   auto &TPC = getAnalysis<TargetPassConfig>();
   const TargetMachine &TM = TPC.getTM<TargetMachine>();
+  // DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   return lowerKernelArguments(F, TM);
 }
 
@@ -403,6 +376,7 @@ FunctionPass *llvm::createAMDGPULowerKernelArgumentsPass() {
 
 PreservedAnalyses
 AMDGPULowerKernelArgumentsPass::run(Function &F, FunctionAnalysisManager &AM) {
+  // DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
   bool Changed = lowerKernelArguments(F, TM);
   if (Changed) {
     // TODO: Preserves a lot more.
diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernargs.ll b/llvm/test/CodeGen/AMDGPU/lower-kernargs.ll
index efece9d02950d..c1549e7911662 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-kernargs.ll
@@ -1879,12 +1879,12 @@ attributes #2 = { nounwind "target-cpu"="tahiti" }
 ; HSA: [[META3]] = !{i64 128}
 ; HSA: [[META4]] = !{i64 1024}
 ; HSA: [[META5]] = !{[[META6:![0-9]+]]}
-; HSA: [[META6]] = distinct !{[[META6]], [[META7:![0-9]+]], !"kern_noalias_global_ptr: %ptr"}
+; HSA: [[META6]] = distinct !{[[META6]], [[META7:![0-9]+]], !"kern_noalias_global_ptr: argument 0"}
 ; HSA: [[META7]] = distinct !{[[META7]], !"kern_noalias_global_ptr"}
 ; HSA: [[META8]] = !{[[META9:![0-9]+]], [[META11:![0-9]+]]}
-; HSA: [[META9]] = distinct !{[[META9]], [[META10:![0-9]+]], !"kern_noalias_global_ptr_x2: %ptr0"}
+; HSA: [[META9]] = distinct !{[[META9]], [[META10:![0-9]+]], !"kern_noalias_global_ptr_x2: argument 0"}
 ; HSA: [[META10]] = distinct !{[[META10]], !"kern_noalias_global_ptr_x2"}
-; HSA: [[META11]] = distinct !{[[META11]], [[META10]], !"kern_noalias_global_ptr_x2: %ptr1"}
+; HSA: [[META11]] = distinct !{[[META11]], [[META10]], !"kern_noalias_global_ptr_x2: argument 1"}
 ;.
; MESA: [[META0]] = !{} ; MESA: [[RNG1]] = !{i32 0, i32 8} @@ -1892,10 +1892,10 @@ attributes #2 = { nounwind "target-cpu"="tahiti" } ; MESA: [[META3]] = !{i64 128} ; MESA: [[META4]] = !{i64 1024} ; MESA: [[META5]] = !{[[META6:![0-9]+]]} -; MESA: [[META6]] = distinct !{[[META6]], [[META7:![0-9]+]], !"kern_noalias_global_ptr: %ptr"} +; MESA: [[META6]] = distinct !{[[META6]], [[META7:![0-9]+]], !"kern_noalias_global_ptr: argument 0"} ; MESA: [[META7]] = distinct !{[[META7]], !"kern_noalias_global_ptr"} ; MESA: [[META8]] = !{[[META9:![0-9]+]], [[META11:![0-9]+]]} -; MESA: [[META9]] = distinct !{[[META9]], [[META10:![0-9]+]], !"kern_noalias_global_ptr_x2: %ptr0"} +; MESA: [[META9]] = distinct !{[[META9]], [[META10:![0-9]+]], !"kern_noalias_global_ptr_x2: argument 0"} ; MESA: [[META10]] = distinct !{[[META10]], !"kern_noalias_global_ptr_x2"} -; MESA: [[META11]] = distinct !{[[META11]], [[META10]], !"kern_noalias_global_ptr_x2: %ptr1"} +; MESA: [[META11]] = distinct !{[[META11]], [[META10]], !"kern_noalias_global_ptr_x2: argument 1"} ;. diff --git a/llvm/test/CodeGen/AMDGPU/lower-noalias-kernargs.ll b/llvm/test/CodeGen/AMDGPU/lower-noalias-kernargs.ll index 11bf238a1b13f..95ea7306247dd 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-noalias-kernargs.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-noalias-kernargs.ll @@ -373,21 +373,21 @@ define amdgpu_kernel void @aliasinfo_10v16f16_NA(ptr addrspace(3) noalias %in, p ; CHECK-NEXT: store <16 x half> [[MAI_3]], ptr addrspace(3) [[STORE_3_ADDR]], align 32, !alias.scope [[META45]], !noalias [[META42]] ; CHECK-NEXT: [[STORE_4_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 256 ; CHECK-NEXT: store <16 x half> [[MAI_4]], ptr addrspace(3) [[STORE_4_ADDR]], align 32, !alias.scope [[META45]], !noalias [[META42]] -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0), !noalias [[META47:![0-9]+]] -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0), !noalias [[META47]] -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0), !noalias [[META47]] -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0), !noalias [[META47]] -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0), !noalias [[META47]] -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0), !noalias [[META47]] -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0), !noalias [[META47]] -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0), !noalias [[META47]] -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0), !noalias [[META47]] -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0), !noalias [[META47]] -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0), !noalias [[META47]] -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0), !noalias [[META47]] -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0), !noalias [[META47]] -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0), !noalias [[META47]] -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0), !noalias [[META47]] +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 
0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) ; CHECK-NEXT: ret void ; entry: @@ -535,45 +535,45 @@ define amdgpu_kernel void @aliasinfo_10v16f16_NA_AS(ptr addrspace(3) noalias %in ; CHECK-NEXT: [[ALIASINFO_10V16F16_NA_AS_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() ; CHECK-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: [[LOAD_0_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[IN]], i32 [[IDX]] -; CHECK-NEXT: [[LOAD_0:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_0_ADDR]], align 32, !alias.scope [[META48:![0-9]+]], !noalias [[META51:![0-9]+]] +; CHECK-NEXT: [[LOAD_0:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_0_ADDR]], align 32, !alias.scope [[META47:![0-9]+]], !noalias [[META50:![0-9]+]] ; CHECK-NEXT: [[LOAD_1_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_0_ADDR]], i32 64 -; CHECK-NEXT: [[LOAD_1:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_1_ADDR]], align 32, !alias.scope [[META48]], !noalias [[META51]] +; CHECK-NEXT: [[LOAD_1:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_1_ADDR]], align 32, !alias.scope [[META47]], !noalias [[META50]] ; CHECK-NEXT: [[LOAD_2_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_1_ADDR]], i32 128 -; CHECK-NEXT: [[LOAD_2:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_2_ADDR]], align 32, !alias.scope [[META48]], !noalias [[META51]] +; CHECK-NEXT: [[LOAD_2:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_2_ADDR]], align 32, !alias.scope [[META47]], !noalias [[META50]] ; CHECK-NEXT: [[LOAD_3_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_2_ADDR]], i32 192 -; CHECK-NEXT: [[LOAD_3:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_3_ADDR]], align 32, !alias.scope [[META48]], !noalias [[META51]] +; CHECK-NEXT: [[LOAD_3:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_3_ADDR]], align 32, !alias.scope [[META47]], !noalias [[META50]] ; CHECK-NEXT: [[LOAD_4_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_3_ADDR]], i32 256 -; CHECK-NEXT: [[LOAD_4:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_4_ADDR]], align 32, !alias.scope [[META48]], !noalias [[META51]] +; CHECK-NEXT: [[LOAD_4:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_4_ADDR]], align 32, !alias.scope [[META47]], !noalias [[META50]] ; CHECK-NEXT: [[MAI_0:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_0]], <16 x half> [[LOAD_0]], <16 x half> [[LOAD_0]], i1 false) ; CHECK-NEXT: [[MAI_1:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> 
[[LOAD_1]], <16 x half> [[LOAD_1]], <16 x half> [[LOAD_1]], i1 false) ; CHECK-NEXT: [[MAI_2:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_2]], <16 x half> [[LOAD_2]], <16 x half> [[LOAD_2]], i1 false) ; CHECK-NEXT: [[MAI_3:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_3]], <16 x half> [[LOAD_3]], <16 x half> [[LOAD_3]], i1 false) ; CHECK-NEXT: [[MAI_4:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_4]], <16 x half> [[LOAD_4]], <16 x half> [[LOAD_4]], i1 false) ; CHECK-NEXT: [[STORE_0_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 [[IDX]] -; CHECK-NEXT: store <16 x half> [[MAI_0]], ptr addrspace(3) [[STORE_0_ADDR]], align 32, !alias.scope [[META51]], !noalias [[META48]] +; CHECK-NEXT: store <16 x half> [[MAI_0]], ptr addrspace(3) [[STORE_0_ADDR]], align 32, !alias.scope [[META50]], !noalias [[META47]] ; CHECK-NEXT: [[STORE_1_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 64 -; CHECK-NEXT: store <16 x half> [[MAI_1]], ptr addrspace(3) [[STORE_1_ADDR]], align 32, !alias.scope [[META51]], !noalias [[META48]] +; CHECK-NEXT: store <16 x half> [[MAI_1]], ptr addrspace(3) [[STORE_1_ADDR]], align 32, !alias.scope [[META50]], !noalias [[META47]] ; CHECK-NEXT: [[STORE_2_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 128 -; CHECK-NEXT: store <16 x half> [[MAI_2]], ptr addrspace(3) [[STORE_2_ADDR]], align 32, !alias.scope [[META51]], !noalias [[META48]] +; CHECK-NEXT: store <16 x half> [[MAI_2]], ptr addrspace(3) [[STORE_2_ADDR]], align 32, !alias.scope [[META50]], !noalias [[META47]] ; CHECK-NEXT: [[STORE_3_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 192 -; CHECK-NEXT: store <16 x half> [[MAI_3]], ptr addrspace(3) [[STORE_3_ADDR]], align 32, !alias.scope [[META51]], !noalias [[META48]] +; CHECK-NEXT: store <16 x half> [[MAI_3]], ptr addrspace(3) [[STORE_3_ADDR]], align 32, !alias.scope [[META50]], !noalias [[META47]] ; CHECK-NEXT: [[STORE_4_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 256 -; CHECK-NEXT: store <16 x half> [[MAI_4]], ptr addrspace(3) [[STORE_4_ADDR]], align 32, !alias.scope [[META51]], !noalias [[META48]] -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0), !noalias [[META53:![0-9]+]] -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0), !noalias [[META53]] -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0), !noalias [[META53]] -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0), !noalias [[META53]] -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0), !noalias [[META53]] -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0), !noalias [[META53]] -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0), !noalias [[META53]] -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0), !noalias [[META53]] -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0), !noalias [[META53]] -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0), !noalias [[META53]] -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0), !noalias [[META53]] -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0), !noalias [[META53]] -; CHECK-NEXT: call void 
@llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0), !noalias [[META53]] -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0), !noalias [[META53]] -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0), !noalias [[META53]] +; CHECK-NEXT: store <16 x half> [[MAI_4]], ptr addrspace(3) [[STORE_4_ADDR]], align 32, !alias.scope [[META50]], !noalias [[META47]] +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) ; CHECK-NEXT: ret void ; entry: @@ -637,27 +637,27 @@ attributes #2 = { nounwind readnone speculatable } ;. ; CHECK: [[META0]] = !{} ; CHECK: [[META1]] = !{[[META2:![0-9]+]]} -; CHECK: [[META2]] = distinct !{[[META2]], [[META3:![0-9]+]], !"aliasinfo_2i32_NA: %in"} +; CHECK: [[META2]] = distinct !{[[META2]], [[META3:![0-9]+]], !"aliasinfo_2i32_NA: argument 1"} ; CHECK: [[META3]] = distinct !{[[META3]], !"aliasinfo_2i32_NA"} ; CHECK: [[META4]] = !{[[META5:![0-9]+]]} -; CHECK: [[META5]] = distinct !{[[META5]], [[META3]], !"aliasinfo_2i32_NA: %out"} +; CHECK: [[META5]] = distinct !{[[META5]], [[META3]], !"aliasinfo_2i32_NA: argument 0"} ; CHECK: [[META6]] = !{[[META7:![0-9]+]]} ; CHECK: [[META7]] = distinct !{[[META7]], [[META8:![0-9]+]], !"alias_scope_3"} ; CHECK: [[META8]] = distinct !{[[META8]], !"alias_scope_0"} ; CHECK: [[META9]] = !{[[META10:![0-9]+]]} ; CHECK: [[META10]] = distinct !{[[META10]], [[META8]], !"alias_scope_1"} ; CHECK: [[META11]] = !{[[META7]], [[META12:![0-9]+]]} -; CHECK: [[META12]] = distinct !{[[META12]], [[META13:![0-9]+]], !"aliasinfo_2i32_NA_AS: %in"} +; CHECK: [[META12]] = distinct !{[[META12]], [[META13:![0-9]+]], !"aliasinfo_2i32_NA_AS: argument 1"} ; CHECK: [[META13]] = distinct !{[[META13]], !"aliasinfo_2i32_NA_AS"} ; CHECK: [[META14]] = !{[[META10]], [[META15:![0-9]+]]} -; CHECK: [[META15]] = distinct !{[[META15]], [[META13]], !"aliasinfo_2i32_NA_AS: %out"} +; CHECK: [[META15]] = distinct !{[[META15]], [[META13]], !"aliasinfo_2i32_NA_AS: argument 0"} ; CHECK: [[META16]] = !{[[META17:![0-9]+]]} -; CHECK: [[META17]] = distinct !{[[META17]], [[META18:![0-9]+]], !"aliasinfo_v4f32_3v4i8_NA: %in"} +; CHECK: [[META17]] = distinct !{[[META17]], [[META18:![0-9]+]], !"aliasinfo_v4f32_3v4i8_NA: argument 2"} ; CHECK: [[META18]] = distinct !{[[META18]], !"aliasinfo_v4f32_3v4i8_NA"} ; CHECK: [[META19]] = !{[[META20:![0-9]+]], [[META21:![0-9]+]], [[META22:![0-9]+]]} -; CHECK: [[META20]] = 
distinct !{[[META20]], [[META18]], !"aliasinfo_v4f32_3v4i8_NA: %out"} -; CHECK: [[META21]] = distinct !{[[META21]], [[META18]], !"aliasinfo_v4f32_3v4i8_NA: %out1"} -; CHECK: [[META22]] = distinct !{[[META22]], [[META18]], !"aliasinfo_v4f32_3v4i8_NA: %in1"} +; CHECK: [[META20]] = distinct !{[[META20]], [[META18]], !"aliasinfo_v4f32_3v4i8_NA: argument 0"} +; CHECK: [[META21]] = distinct !{[[META21]], [[META18]], !"aliasinfo_v4f32_3v4i8_NA: argument 1"} +; CHECK: [[META22]] = distinct !{[[META22]], [[META18]], !"aliasinfo_v4f32_3v4i8_NA: argument 3"} ; CHECK: [[META23]] = !{[[META22]]} ; CHECK: [[META24]] = !{[[META20]], [[META21]], [[META17]]} ; CHECK: [[META25]] = !{[[META20]]} @@ -665,12 +665,12 @@ attributes #2 = { nounwind readnone speculatable } ; CHECK: [[META27]] = !{[[META21]]} ; CHECK: [[META28]] = !{[[META20]], [[META17]], [[META22]]} ; CHECK: [[META29]] = !{[[META7]], [[META30:![0-9]+]]} -; CHECK: [[META30]] = distinct !{[[META30]], [[META31:![0-9]+]], !"aliasinfo_v4f32_3v4i8_NA_AS: %in"} +; CHECK: [[META30]] = distinct !{[[META30]], [[META31:![0-9]+]], !"aliasinfo_v4f32_3v4i8_NA_AS: argument 2"} ; CHECK: [[META31]] = distinct !{[[META31]], !"aliasinfo_v4f32_3v4i8_NA_AS"} ; CHECK: [[META32]] = !{[[META10]], [[META33:![0-9]+]], [[META34:![0-9]+]], [[META35:![0-9]+]]} -; CHECK: [[META33]] = distinct !{[[META33]], [[META31]], !"aliasinfo_v4f32_3v4i8_NA_AS: %out"} -; CHECK: [[META34]] = distinct !{[[META34]], [[META31]], !"aliasinfo_v4f32_3v4i8_NA_AS: %out1"} -; CHECK: [[META35]] = distinct !{[[META35]], [[META31]], !"aliasinfo_v4f32_3v4i8_NA_AS: %in1"} +; CHECK: [[META33]] = distinct !{[[META33]], [[META31]], !"aliasinfo_v4f32_3v4i8_NA_AS: argument 0"} +; CHECK: [[META34]] = distinct !{[[META34]], [[META31]], !"aliasinfo_v4f32_3v4i8_NA_AS: argument 1"} +; CHECK: [[META35]] = distinct !{[[META35]], [[META31]], !"aliasinfo_v4f32_3v4i8_NA_AS: argument 3"} ; CHECK: [[META36]] = !{[[META7]], [[META35]]} ; CHECK: [[META37]] = !{[[META10]], [[META33]], [[META34]], [[META30]]} ; CHECK: [[META38]] = !{[[META10]], [[META33]]} @@ -678,15 +678,13 @@ attributes #2 = { nounwind readnone speculatable } ; CHECK: [[META40]] = !{[[META10]], [[META34]]} ; CHECK: [[META41]] = !{[[META7]], [[META33]], [[META30]], [[META35]]} ; CHECK: [[META42]] = !{[[META43:![0-9]+]]} -; CHECK: [[META43]] = distinct !{[[META43]], [[META44:![0-9]+]], !"aliasinfo_10v16f16_NA: %in"} +; CHECK: [[META43]] = distinct !{[[META43]], [[META44:![0-9]+]], !"aliasinfo_10v16f16_NA: argument 0"} ; CHECK: [[META44]] = distinct !{[[META44]], !"aliasinfo_10v16f16_NA"} ; CHECK: [[META45]] = !{[[META46:![0-9]+]]} -; CHECK: [[META46]] = distinct !{[[META46]], [[META44]], !"aliasinfo_10v16f16_NA: %out"} -; CHECK: [[META47]] = !{[[META43]], [[META46]]} -; CHECK: [[META48]] = !{[[META7]], [[META49:![0-9]+]]} -; CHECK: [[META49]] = distinct !{[[META49]], [[META50:![0-9]+]], !"aliasinfo_10v16f16_NA_AS: %in"} -; CHECK: [[META50]] = distinct !{[[META50]], !"aliasinfo_10v16f16_NA_AS"} -; CHECK: [[META51]] = !{[[META10]], [[META52:![0-9]+]]} -; CHECK: [[META52]] = distinct !{[[META52]], [[META50]], !"aliasinfo_10v16f16_NA_AS: %out"} -; CHECK: [[META53]] = !{[[META49]], [[META52]]} +; CHECK: [[META46]] = distinct !{[[META46]], [[META44]], !"aliasinfo_10v16f16_NA: argument 1"} +; CHECK: [[META47]] = !{[[META7]], [[META48:![0-9]+]]} +; CHECK: [[META48]] = distinct !{[[META48]], [[META49:![0-9]+]], !"aliasinfo_10v16f16_NA_AS: argument 0"} +; CHECK: [[META49]] = distinct !{[[META49]], !"aliasinfo_10v16f16_NA_AS"} +; CHECK: [[META50]] = !{[[META10]], 
[[META51:![0-9]+]]}
+; CHECK: [[META51]] = distinct !{[[META51]], [[META49]], !"aliasinfo_10v16f16_NA_AS: argument 1"}
 ;.

From 8738ded5309e9a4860e8fffe4bc919b8cdbd3ff0 Mon Sep 17 00:00:00 2001
From: Leon Clark
Date: Thu, 27 Nov 2025 01:54:12 +0000
Subject: [PATCH 06/11] Address review comments.

---
 .../AMDGPU/AMDGPULowerKernelArguments.cpp     | 20 +++++++++----------
 llvm/test/CodeGen/AMDGPU/llc-pipeline.ll      |  9 +++++----
 2 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
index 2b7535297af29..2556abc33c14d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -47,6 +47,7 @@ class AMDGPULowerKernelArguments : public FunctionPass {
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<TargetPassConfig>();
+    AU.addRequired<DominatorTreeWrapperPass>();
     AU.setPreservesAll();
   }
 };
@@ -68,7 +69,8 @@ static BasicBlock::iterator getInsertPt(BasicBlock &BB) {
   return InsPt;
 }
 
-static void addAliasScopeMetadata(Function &F, DataLayout const &DL) {
+static void addAliasScopeMetadata(Function &F, DataLayout const &DL,
+                                  DominatorTree &DT) {
   // Collect noalias arguments.
   SmallVector<Argument const *> NoAliasArgs;
@@ -93,9 +95,6 @@ static void addAliasScopeMetadata(Function &F, DataLayout const &DL,
   }
 
   // Iterate over all instructions.
-  DominatorTree DT;
-  DT.recalculate(F);
-
   for (inst_iterator Inst = inst_begin(F), InstEnd = inst_end(F);
        Inst != InstEnd; ++Inst) {
     // If instruction accesses memory, collect its pointer arguments.
@@ -176,7 +175,8 @@ static void addAliasScopeMetadata(Function &F, DataLayout const &DL,
   }
 }
 
-static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
+static bool lowerKernelArguments(Function &F, const TargetMachine &TM,
+                                 DominatorTree &DT) {
   CallingConv::ID CC = F.getCallingConv();
   if (CC != CallingConv::AMDGPU_KERNEL || F.arg_empty())
     return false;
@@ -205,7 +205,7 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
 
   uint64_t ExplicitArgOffset = 0;
 
-  addAliasScopeMetadata(F, F.getParent()->getDataLayout());
+  addAliasScopeMetadata(F, F.getParent()->getDataLayout(), DT);
 
   for (Argument &Arg : F.args()) {
     const bool IsByRef = Arg.hasByRefAttr();
@@ -359,8 +359,8 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
 bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
   auto &TPC = getAnalysis<TargetPassConfig>();
   const TargetMachine &TM = TPC.getTM<TargetMachine>();
-  // DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-  return lowerKernelArguments(F, TM);
+  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  return lowerKernelArguments(F, TM, DT);
 }
 
 INITIALIZE_PASS_BEGIN(AMDGPULowerKernelArguments, DEBUG_TYPE,
@@ -376,8 +376,8 @@ FunctionPass *llvm::createAMDGPULowerKernelArgumentsPass() {
 
 PreservedAnalyses
 AMDGPULowerKernelArgumentsPass::run(Function &F, FunctionAnalysisManager &AM) {
-  // DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
-  bool Changed = lowerKernelArguments(F, TM);
+  DominatorTree &DT = *AM.getCachedResult<DominatorTreeAnalysis>(F);
+  bool Changed = lowerKernelArguments(F, TM, DT);
   if (Changed) {
     // TODO: Preserves a lot more.
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index fe75b2b5bfcf5..05e20f82211c5 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -53,6 +53,7 @@
 ; GCN-O0-NEXT: Instrument function entry/exit with calls to e.g. mcount() (post inlining)
 ; GCN-O0-NEXT: Scalarize Masked Memory Intrinsics
 ; GCN-O0-NEXT: Expand reduction intrinsics
+; GCN-O0-NEXT: Dominator Tree Construction
 ; GCN-O0-NEXT: AMDGPU Lower Kernel Arguments
 ; GCN-O0-NEXT: Lower buffer fat pointer operations to buffer resources
 ; GCN-O0-NEXT: AMDGPU lower intrinsics
@@ -243,8 +244,8 @@
 ; GCN-O1-NEXT: Expand reduction intrinsics
 ; GCN-O1-NEXT: AMDGPU Preload Kernel Arguments
 ; GCN-O1-NEXT: FunctionPass Manager
-; GCN-O1-NEXT: AMDGPU Lower Kernel Arguments
 ; GCN-O1-NEXT: Dominator Tree Construction
+; GCN-O1-NEXT: AMDGPU Lower Kernel Arguments
 ; GCN-O1-NEXT: Natural Loop Information
 ; GCN-O1-NEXT: CodeGen Prepare
 ; GCN-O1-NEXT: Lower buffer fat pointer operations to buffer resources
@@ -550,8 +551,8 @@
 ; GCN-O1-OPTS-NEXT: Early CSE
 ; GCN-O1-OPTS-NEXT: AMDGPU Preload Kernel Arguments
 ; GCN-O1-OPTS-NEXT: FunctionPass Manager
-; GCN-O1-OPTS-NEXT: AMDGPU Lower Kernel Arguments
 ; GCN-O1-OPTS-NEXT: Dominator Tree Construction
+; GCN-O1-OPTS-NEXT: AMDGPU Lower Kernel Arguments
 ; GCN-O1-OPTS-NEXT: Natural Loop Information
 ; GCN-O1-OPTS-NEXT: CodeGen Prepare
 ; GCN-O1-OPTS-NEXT: Dominator Tree Construction
@@ -874,8 +875,8 @@
 ; GCN-O2-NEXT: Early CSE
 ; GCN-O2-NEXT: AMDGPU Preload Kernel Arguments
 ; GCN-O2-NEXT: FunctionPass Manager
-; GCN-O2-NEXT: AMDGPU Lower Kernel Arguments
 ; GCN-O2-NEXT: Dominator Tree Construction
+; GCN-O2-NEXT: AMDGPU Lower Kernel Arguments
 ; GCN-O2-NEXT: Natural Loop Information
 ; GCN-O2-NEXT: CodeGen Prepare
 ; GCN-O2-NEXT: Dominator Tree Construction
@@ -1213,8 +1214,8 @@
 ; GCN-O3-NEXT: Global Value Numbering
 ; GCN-O3-NEXT: AMDGPU Preload Kernel Arguments
 ; GCN-O3-NEXT: FunctionPass Manager
-; GCN-O3-NEXT: AMDGPU Lower Kernel Arguments
 ; GCN-O3-NEXT: Dominator Tree Construction
+; GCN-O3-NEXT: AMDGPU Lower Kernel Arguments
 ; GCN-O3-NEXT: Natural Loop Information
 ; GCN-O3-NEXT: CodeGen Prepare
 ; GCN-O3-NEXT: Dominator Tree Construction

From 3f9b094113a36b934e0fb69fbb4a766eb47e74f7 Mon Sep 17 00:00:00 2001
From: Leon Clark
Date: Tue, 9 Dec 2025 10:11:55 +0000
Subject: [PATCH 07/11] Remove east const.

---
 .../Target/AMDGPU/AMDGPULowerKernelArguments.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
index 2556abc33c14d..9e0113d095174 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -69,10 +69,10 @@ static BasicBlock::iterator getInsertPt(BasicBlock &BB) {
   return InsPt;
 }
 
-static void addAliasScopeMetadata(Function &F, DataLayout const &DL,
+static void addAliasScopeMetadata(Function &F, const DataLayout &DL,
                                   DominatorTree &DT) {
   // Collect noalias arguments.
-  SmallVector<Argument const *> NoAliasArgs;
+  SmallVector<const Argument *> NoAliasArgs;
 
   for (Argument &Arg : F.args())
     if (Arg.hasNoAliasAttr() && !Arg.use_empty())
@@ -83,11 +83,11 @@ static void addAliasScopeMetadata(Function &F, DataLayout const &DL,
 
   // Add alias scopes for each noalias argument.
   MDBuilder MDB(F.getContext());
-  DenseMap<Argument const *, MDNode *> NewScopes;
+  DenseMap<const Argument *, MDNode *> NewScopes;
   MDNode *NewDomain = MDB.createAnonymousAliasScopeDomain(F.getName());
 
   for (unsigned I = 0u; I < NoAliasArgs.size(); ++I) {
-    Argument const *Arg = NoAliasArgs[I];
+    const Argument *Arg = NoAliasArgs[I];
     std::string Name(F.getName());
     Name += std::string(": argument ") + std::to_string(I);
     MDNode *NewScope = MDB.createAnonymousAliasScope(NewDomain, Name);
@@ -107,11 +107,11 @@ static void addAliasScopeMetadata(Function &F, const DataLayout &DL,
 
     // Collect underlying objects of pointer arguments.
     SmallVector<Metadata *> Scopes;
-    SmallPtrSet<Value const *, 4> ObjSet;
+    SmallPtrSet<const Value *, 4> ObjSet;
     SmallVector<Metadata *> NoAliases;
 
     for (InterestingMemoryOperand &MO : MemOps) {
-      SmallVector<Value const *> Objects;
+      SmallVector<const Value *> Objects;
       getUnderlyingObjects(MO.getPtr(), Objects);
       ObjSet.insert_range(Objects);
     }
@@ -140,7 +140,7 @@ static void addAliasScopeMetadata(Function &F, const DataLayout &DL,
       continue;
 
     // Collect noalias scopes for instruction.
-    for (Argument const *Arg : NoAliasArgs) {
+    for (const Argument *Arg : NoAliasArgs) {
       if (ObjSet.contains(Arg))
         continue;
@@ -160,7 +160,7 @@ static void addAliasScopeMetadata(Function &F, const DataLayout &DL,
 
     // Collect scopes for alias.scope metadata.
     if (!UsesAliasingPtr)
-      for (Argument const *Arg : NoAliasArgs) {
+      for (const Argument *Arg : NoAliasArgs) {
        if (ObjSet.count(Arg))
          Scopes.push_back(NewScopes[Arg]);
       }

From 5189b3b9acdae50903ab6784bbf1910d809452eb Mon Sep 17 00:00:00 2001
From: Leon Clark
Date: Tue, 9 Dec 2025 10:19:06 +0000
Subject: [PATCH 08/11] Address comments.

---
 llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
index 9e0113d095174..96a5faad917a2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -88,10 +88,8 @@ static void addAliasScopeMetadata(Function &F, const DataLayout &DL,
 
   for (unsigned I = 0u; I < NoAliasArgs.size(); ++I) {
     const Argument *Arg = NoAliasArgs[I];
-    std::string Name(F.getName());
-    Name += std::string(": argument ") + std::to_string(I);
-    MDNode *NewScope = MDB.createAnonymousAliasScope(NewDomain, Name);
-    NewScopes.insert(std::make_pair(Arg, NewScope));
+    MDNode *NewScope = MDB.createAnonymousAliasScope(NewDomain, Arg->getName());
+    NewScopes.insert({Arg, NewScope});
   }
 
   // Iterate over all instructions.
@@ -100,7 +98,7 @@ static void addAliasScopeMetadata(Function &F, const DataLayout &DL,
     // If instruction accesses memory, collect its pointer arguments.
     Instruction *I = &(*Inst);
     SmallVector<InterestingMemoryOperand> MemOps;
-    llvm::AMDGPU::getInterestingMemoryOperands(*F.getParent(), I, MemOps);
+    AMDGPU::getInterestingMemoryOperands(*F.getParent(), I, MemOps);
 
     if (MemOps.empty())
       continue;
@@ -121,7 +119,7 @@ static void addAliasScopeMetadata(Function &F, const DataLayout &DL,
     bool UsesAliasingPtr = false;
 
     for (const Value *Val : ObjSet) {
-      if (isa(Val) || isa(Val))
+      if (isa(Val))
         continue;
 
       if (const Argument *Arg = dyn_cast<Argument>(Val)) {

From 54f99bba33c2221afe548d65e4e00e2c9198ad66 Mon Sep 17 00:00:00 2001
From: Leon Clark
Date: Tue, 9 Dec 2025 10:52:55 +0000
Subject: [PATCH 09/11] Update tests.
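
Regenerate the checks after the previous commit: the alias scope string is
now just the IR argument name (empty for unnamed arguments, since
Arg->getName() returns an empty string there) instead of the
"function: argument N" form, so the distinct scope nodes shrink
accordingly. A minimal before/after sketch of the shape, taken from the
aliasinfo_2i32_NA update below:

    ; before
    !2 = distinct !{!2, !3, !"aliasinfo_2i32_NA: argument 1"}
    ; after
    !2 = distinct !{!2, !3, !"in"}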
--- llvm/test/CodeGen/AMDGPU/lower-kernargs.ll | 12 +++---- .../CodeGen/AMDGPU/lower-noalias-kernargs.ll | 32 +++++++++---------- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernargs.ll b/llvm/test/CodeGen/AMDGPU/lower-kernargs.ll index c1549e7911662..e4a7d9dcc837c 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-kernargs.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-kernargs.ll @@ -1879,12 +1879,12 @@ attributes #2 = { nounwind "target-cpu"="tahiti" } ; HSA: [[META3]] = !{i64 128} ; HSA: [[META4]] = !{i64 1024} ; HSA: [[META5]] = !{[[META6:![0-9]+]]} -; HSA: [[META6]] = distinct !{[[META6]], [[META7:![0-9]+]], !"kern_noalias_global_ptr: argument 0"} +; HSA: [[META6]] = distinct !{[[META6]], [[META7:![0-9]+]], !"ptr"} ; HSA: [[META7]] = distinct !{[[META7]], !"kern_noalias_global_ptr"} ; HSA: [[META8]] = !{[[META9:![0-9]+]], [[META11:![0-9]+]]} -; HSA: [[META9]] = distinct !{[[META9]], [[META10:![0-9]+]], !"kern_noalias_global_ptr_x2: argument 0"} +; HSA: [[META9]] = distinct !{[[META9]], [[META10:![0-9]+]], !"ptr0"} ; HSA: [[META10]] = distinct !{[[META10]], !"kern_noalias_global_ptr_x2"} -; HSA: [[META11]] = distinct !{[[META11]], [[META10]], !"kern_noalias_global_ptr_x2: argument 1"} +; HSA: [[META11]] = distinct !{[[META11]], [[META10]], !"ptr1"} ;. ; MESA: [[META0]] = !{} ; MESA: [[RNG1]] = !{i32 0, i32 8} @@ -1892,10 +1892,10 @@ attributes #2 = { nounwind "target-cpu"="tahiti" } ; MESA: [[META3]] = !{i64 128} ; MESA: [[META4]] = !{i64 1024} ; MESA: [[META5]] = !{[[META6:![0-9]+]]} -; MESA: [[META6]] = distinct !{[[META6]], [[META7:![0-9]+]], !"kern_noalias_global_ptr: argument 0"} +; MESA: [[META6]] = distinct !{[[META6]], [[META7:![0-9]+]], !"ptr"} ; MESA: [[META7]] = distinct !{[[META7]], !"kern_noalias_global_ptr"} ; MESA: [[META8]] = !{[[META9:![0-9]+]], [[META11:![0-9]+]]} -; MESA: [[META9]] = distinct !{[[META9]], [[META10:![0-9]+]], !"kern_noalias_global_ptr_x2: argument 0"} +; MESA: [[META9]] = distinct !{[[META9]], [[META10:![0-9]+]], !"ptr0"} ; MESA: [[META10]] = distinct !{[[META10]], !"kern_noalias_global_ptr_x2"} -; MESA: [[META11]] = distinct !{[[META11]], [[META10]], !"kern_noalias_global_ptr_x2: argument 1"} +; MESA: [[META11]] = distinct !{[[META11]], [[META10]], !"ptr1"} ;. diff --git a/llvm/test/CodeGen/AMDGPU/lower-noalias-kernargs.ll b/llvm/test/CodeGen/AMDGPU/lower-noalias-kernargs.ll index 95ea7306247dd..8eea6cadcaa7c 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-noalias-kernargs.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-noalias-kernargs.ll @@ -637,27 +637,27 @@ attributes #2 = { nounwind readnone speculatable } ;. 
; CHECK: [[META0]] = !{} ; CHECK: [[META1]] = !{[[META2:![0-9]+]]} -; CHECK: [[META2]] = distinct !{[[META2]], [[META3:![0-9]+]], !"aliasinfo_2i32_NA: argument 1"} +; CHECK: [[META2]] = distinct !{[[META2]], [[META3:![0-9]+]], !"in"} ; CHECK: [[META3]] = distinct !{[[META3]], !"aliasinfo_2i32_NA"} ; CHECK: [[META4]] = !{[[META5:![0-9]+]]} -; CHECK: [[META5]] = distinct !{[[META5]], [[META3]], !"aliasinfo_2i32_NA: argument 0"} +; CHECK: [[META5]] = distinct !{[[META5]], [[META3]], !"out"} ; CHECK: [[META6]] = !{[[META7:![0-9]+]]} ; CHECK: [[META7]] = distinct !{[[META7]], [[META8:![0-9]+]], !"alias_scope_3"} ; CHECK: [[META8]] = distinct !{[[META8]], !"alias_scope_0"} ; CHECK: [[META9]] = !{[[META10:![0-9]+]]} ; CHECK: [[META10]] = distinct !{[[META10]], [[META8]], !"alias_scope_1"} ; CHECK: [[META11]] = !{[[META7]], [[META12:![0-9]+]]} -; CHECK: [[META12]] = distinct !{[[META12]], [[META13:![0-9]+]], !"aliasinfo_2i32_NA_AS: argument 1"} +; CHECK: [[META12]] = distinct !{[[META12]], [[META13:![0-9]+]], !"in"} ; CHECK: [[META13]] = distinct !{[[META13]], !"aliasinfo_2i32_NA_AS"} ; CHECK: [[META14]] = !{[[META10]], [[META15:![0-9]+]]} -; CHECK: [[META15]] = distinct !{[[META15]], [[META13]], !"aliasinfo_2i32_NA_AS: argument 0"} +; CHECK: [[META15]] = distinct !{[[META15]], [[META13]], !"out"} ; CHECK: [[META16]] = !{[[META17:![0-9]+]]} -; CHECK: [[META17]] = distinct !{[[META17]], [[META18:![0-9]+]], !"aliasinfo_v4f32_3v4i8_NA: argument 2"} +; CHECK: [[META17]] = distinct !{[[META17]], [[META18:![0-9]+]], !"in"} ; CHECK: [[META18]] = distinct !{[[META18]], !"aliasinfo_v4f32_3v4i8_NA"} ; CHECK: [[META19]] = !{[[META20:![0-9]+]], [[META21:![0-9]+]], [[META22:![0-9]+]]} -; CHECK: [[META20]] = distinct !{[[META20]], [[META18]], !"aliasinfo_v4f32_3v4i8_NA: argument 0"} -; CHECK: [[META21]] = distinct !{[[META21]], [[META18]], !"aliasinfo_v4f32_3v4i8_NA: argument 1"} -; CHECK: [[META22]] = distinct !{[[META22]], [[META18]], !"aliasinfo_v4f32_3v4i8_NA: argument 3"} +; CHECK: [[META20]] = distinct !{[[META20]], [[META18]], !"out"} +; CHECK: [[META21]] = distinct !{[[META21]], [[META18]], !"out1"} +; CHECK: [[META22]] = distinct !{[[META22]], [[META18]], !"in1"} ; CHECK: [[META23]] = !{[[META22]]} ; CHECK: [[META24]] = !{[[META20]], [[META21]], [[META17]]} ; CHECK: [[META25]] = !{[[META20]]} @@ -665,12 +665,12 @@ attributes #2 = { nounwind readnone speculatable } ; CHECK: [[META27]] = !{[[META21]]} ; CHECK: [[META28]] = !{[[META20]], [[META17]], [[META22]]} ; CHECK: [[META29]] = !{[[META7]], [[META30:![0-9]+]]} -; CHECK: [[META30]] = distinct !{[[META30]], [[META31:![0-9]+]], !"aliasinfo_v4f32_3v4i8_NA_AS: argument 2"} +; CHECK: [[META30]] = distinct !{[[META30]], [[META31:![0-9]+]], !"in"} ; CHECK: [[META31]] = distinct !{[[META31]], !"aliasinfo_v4f32_3v4i8_NA_AS"} ; CHECK: [[META32]] = !{[[META10]], [[META33:![0-9]+]], [[META34:![0-9]+]], [[META35:![0-9]+]]} -; CHECK: [[META33]] = distinct !{[[META33]], [[META31]], !"aliasinfo_v4f32_3v4i8_NA_AS: argument 0"} -; CHECK: [[META34]] = distinct !{[[META34]], [[META31]], !"aliasinfo_v4f32_3v4i8_NA_AS: argument 1"} -; CHECK: [[META35]] = distinct !{[[META35]], [[META31]], !"aliasinfo_v4f32_3v4i8_NA_AS: argument 3"} +; CHECK: [[META33]] = distinct !{[[META33]], [[META31]], !"out"} +; CHECK: [[META34]] = distinct !{[[META34]], [[META31]], !"out1"} +; CHECK: [[META35]] = distinct !{[[META35]], [[META31]], !"in1"} ; CHECK: [[META36]] = !{[[META7]], [[META35]]} ; CHECK: [[META37]] = !{[[META10]], [[META33]], [[META34]], [[META30]]} ; CHECK: [[META38]] = 
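
Collect pointer operands directly again: MemoryLocation::getOrNone covers
ordinary memory instructions (loads, stores, va_arg and atomics), and
pointer operands of calls are gathered by hand; calls such as
llvm.amdgcn.sched.group.barrier contribute no pointer operands, so they
still receive no scope metadata. A rough hand-written sketch of what now
qualifies (names hypothetical, comments indicate the collected operand):

    %v   = load i32, ptr addrspace(1) %p                     ; MemoryLocation -> %p
    %old = atomicrmw add ptr addrspace(1) %p, i32 1 seq_cst  ; MemoryLocation -> %p
    call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) ; no pointer args, skipped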
!{[[META10]], [[META33]]} @@ -678,13 +678,13 @@ attributes #2 = { nounwind readnone speculatable } ; CHECK: [[META40]] = !{[[META10]], [[META34]]} ; CHECK: [[META41]] = !{[[META7]], [[META33]], [[META30]], [[META35]]} ; CHECK: [[META42]] = !{[[META43:![0-9]+]]} -; CHECK: [[META43]] = distinct !{[[META43]], [[META44:![0-9]+]], !"aliasinfo_10v16f16_NA: argument 0"} +; CHECK: [[META43]] = distinct !{[[META43]], [[META44:![0-9]+]], !"in"} ; CHECK: [[META44]] = distinct !{[[META44]], !"aliasinfo_10v16f16_NA"} ; CHECK: [[META45]] = !{[[META46:![0-9]+]]} -; CHECK: [[META46]] = distinct !{[[META46]], [[META44]], !"aliasinfo_10v16f16_NA: argument 1"} +; CHECK: [[META46]] = distinct !{[[META46]], [[META44]], !"out"} ; CHECK: [[META47]] = !{[[META7]], [[META48:![0-9]+]]} -; CHECK: [[META48]] = distinct !{[[META48]], [[META49:![0-9]+]], !"aliasinfo_10v16f16_NA_AS: argument 0"} +; CHECK: [[META48]] = distinct !{[[META48]], [[META49:![0-9]+]], !"in"} ; CHECK: [[META49]] = distinct !{[[META49]], !"aliasinfo_10v16f16_NA_AS"} ; CHECK: [[META50]] = !{[[META10]], [[META51:![0-9]+]]} -; CHECK: [[META51]] = distinct !{[[META51]], [[META49]], !"aliasinfo_10v16f16_NA_AS: argument 1"} +; CHECK: [[META51]] = distinct !{[[META51]], [[META49]], !"out"} ;. From 93ee4cce2c857c6227d797d81cfbedf67b7a94f4 Mon Sep 17 00:00:00 2001 From: Leon Clark Date: Wed, 10 Dec 2025 10:52:39 +0000 Subject: [PATCH 10/11] Address review comments. --- .../AMDGPU/AMDGPULowerKernelArguments.cpp | 25 +- .../CodeGen/AMDGPU/lower-noalias-kernargs.ll | 392 +----------------- 2 files changed, 24 insertions(+), 393 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp index 96a5faad917a2..c78db0034c49a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp @@ -25,10 +25,12 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/MDBuilder.h" #include "llvm/Target/TargetMachine.h" +#include #include #define DEBUG_TYPE "amdgpu-lower-kernel-arguments" @@ -97,10 +99,23 @@ static void addAliasScopeMetadata(Function &F, const DataLayout &DL, Inst != InstEnd; ++Inst) { // If instruction accesses memory, collect its pointer arguments. Instruction *I = &(*Inst); - SmallVector MemOps; - AMDGPU::getInterestingMemoryOperands(*F.getParent(), I, MemOps); + SmallVector PtrArgs; - if (MemOps.empty()) + if (std::optional MO = MemoryLocation::getOrNone(I)) + PtrArgs.push_back(MO->Ptr); + else if (const CallBase *Call = dyn_cast(I)) { + if (Call->doesNotAccessMemory()) + continue; + + for (Value *Arg : Call->args()) { + if (!Arg->getType()->isPointerTy()) + continue; + + PtrArgs.push_back(Arg); + } + } + + if (PtrArgs.empty()) continue; // Collect underlying objects of pointer arguments. 
@@ -108,9 +123,9 @@ static void addAliasScopeMetadata(Function &F, const DataLayout &DL,
     SmallPtrSet<const Value *, 4> ObjSet;
     SmallVector<Metadata *> NoAliases;
 
-    for (InterestingMemoryOperand &MO : MemOps) {
+    for (const Value *Val : PtrArgs) {
       SmallVector<const Value *> Objects;
-      getUnderlyingObjects(MO.getPtr(), Objects);
+      getUnderlyingObjects(Val, Objects);
       ObjSet.insert_range(Objects);
     }
diff --git a/llvm/test/CodeGen/AMDGPU/lower-noalias-kernargs.ll b/llvm/test/CodeGen/AMDGPU/lower-noalias-kernargs.ll
index 8eea6cadcaa7c..9f83c93524cd2 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-noalias-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-noalias-kernargs.ll
@@ -13,7 +13,7 @@ define amdgpu_kernel void @aliasinfo_2i32(ptr addrspace(1) %out, ptr addrspace(1
 ; CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
 ; CHECK-NEXT: [[IN_GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[IN_LOAD]], i32 [[TID]]
 ; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(1) [[IN_GEP]], align 4
-; CHECK-NEXT: [[CTLZ:%.*]] = call i32 @llvm.ctlz.i32(i32 [[VAL]], i1 false) #[[ATTR5:[0-9]+]]
+; CHECK-NEXT: [[CTLZ:%.*]] = call i32 @llvm.ctlz.i32(i32 [[VAL]], i1 false) #[[ATTR2:[0-9]+]]
 ; CHECK-NEXT: store i32 [[CTLZ]], ptr addrspace(1) [[OUT_LOAD]], align 4
 ; CHECK-NEXT: ret void
 ;
@@ -38,7 +38,7 @@ define amdgpu_kernel void @aliasinfo_2i32_NA(ptr addrspace(1) noalias %out, ptr
 ; CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
 ; CHECK-NEXT: [[IN_GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[IN_LOAD]], i32 [[TID]]
 ; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(1) [[IN_GEP]], align 4, !alias.scope [[META1:![0-9]+]], !noalias [[META4:![0-9]+]]
-; CHECK-NEXT: [[CTLZ:%.*]] = call i32 @llvm.ctlz.i32(i32 [[VAL]], i1 false) #[[ATTR5]]
+; CHECK-NEXT: [[CTLZ:%.*]] = call i32 @llvm.ctlz.i32(i32 [[VAL]], i1 false) #[[ATTR2]]
 ; CHECK-NEXT: store i32 [[CTLZ]], ptr addrspace(1) [[OUT_LOAD]], align 4, !alias.scope [[META4]], !noalias [[META1]]
 ; CHECK-NEXT: ret void
 ;
@@ -63,7 +63,7 @@ define amdgpu_kernel void @aliasinfo_2i32_AS(ptr addrspace(1) %out, ptr addrspac
 ; CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
 ; CHECK-NEXT: [[IN_GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[IN_LOAD]], i32 [[TID]]
 ; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(1) [[IN_GEP]], align 4, !alias.scope [[META6:![0-9]+]], !noalias [[META9:![0-9]+]]
-; CHECK-NEXT: [[CTLZ:%.*]] = call i32 @llvm.ctlz.i32(i32 [[VAL]], i1 false) #[[ATTR5]]
+; CHECK-NEXT: [[CTLZ:%.*]] = call i32 @llvm.ctlz.i32(i32 [[VAL]], i1 false) #[[ATTR2]]
 ; CHECK-NEXT: store i32 [[CTLZ]], ptr addrspace(1) [[OUT_LOAD]], align 4, !alias.scope [[META9]], !noalias [[META6]]
 ; CHECK-NEXT: ret void
 ;
@@ -88,7 +88,7 @@ define amdgpu_kernel void @aliasinfo_2i32_NA_AS(ptr addrspace(1) noalias %out, p
 ; CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
 ; CHECK-NEXT: [[IN_GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[IN_LOAD]], i32 [[TID]]
 ; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(1) [[IN_GEP]], align 4, !alias.scope [[META11:![0-9]+]], !noalias [[META14:![0-9]+]]
-; CHECK-NEXT: [[CTLZ:%.*]] = call i32 @llvm.ctlz.i32(i32 [[VAL]], i1 false) #[[ATTR5]]
+; CHECK-NEXT: [[CTLZ:%.*]] = call i32 @llvm.ctlz.i32(i32 [[VAL]], i1 false) #[[ATTR2]]
 ; CHECK-NEXT: store i32 [[CTLZ]], ptr addrspace(1) [[OUT_LOAD]], align 4, !alias.scope [[META14]], !noalias [[META11]]
 ; CHECK-NEXT: ret void
 ;
@@ -249,381 +249,7 @@ entry:
   ret void
 }
 
-define amdgpu_kernel void @aliasinfo_10v16f16(ptr addrspace(3) %in, ptr addrspace(3) %out) #0 {
-; CHECK-LABEL: define amdgpu_kernel
void @aliasinfo_10v16f16( -; CHECK-SAME: ptr addrspace(3) [[IN:%.*]], ptr addrspace(3) [[OUT:%.*]]) #[[ATTR1:[0-9]+]] { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[ALIASINFO_10V16F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; CHECK-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() -; CHECK-NEXT: [[LOAD_0_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[IN]], i32 [[IDX]] -; CHECK-NEXT: [[LOAD_0:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_0_ADDR]], align 32 -; CHECK-NEXT: [[LOAD_1_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_0_ADDR]], i32 64 -; CHECK-NEXT: [[LOAD_1:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_1_ADDR]], align 32 -; CHECK-NEXT: [[LOAD_2_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_1_ADDR]], i32 128 -; CHECK-NEXT: [[LOAD_2:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_2_ADDR]], align 32 -; CHECK-NEXT: [[LOAD_3_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_2_ADDR]], i32 192 -; CHECK-NEXT: [[LOAD_3:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_3_ADDR]], align 32 -; CHECK-NEXT: [[LOAD_4_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_3_ADDR]], i32 256 -; CHECK-NEXT: [[LOAD_4:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_4_ADDR]], align 32 -; CHECK-NEXT: [[MAI_0:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_0]], <16 x half> [[LOAD_0]], <16 x half> [[LOAD_0]], i1 false) -; CHECK-NEXT: [[MAI_1:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_1]], <16 x half> [[LOAD_1]], <16 x half> [[LOAD_1]], i1 false) -; CHECK-NEXT: [[MAI_2:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_2]], <16 x half> [[LOAD_2]], <16 x half> [[LOAD_2]], i1 false) -; CHECK-NEXT: [[MAI_3:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_3]], <16 x half> [[LOAD_3]], <16 x half> [[LOAD_3]], i1 false) -; CHECK-NEXT: [[MAI_4:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_4]], <16 x half> [[LOAD_4]], <16 x half> [[LOAD_4]], i1 false) -; CHECK-NEXT: [[STORE_0_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 [[IDX]] -; CHECK-NEXT: store <16 x half> [[MAI_0]], ptr addrspace(3) [[STORE_0_ADDR]], align 32 -; CHECK-NEXT: [[STORE_1_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 64 -; CHECK-NEXT: store <16 x half> [[MAI_1]], ptr addrspace(3) [[STORE_1_ADDR]], align 32 -; CHECK-NEXT: [[STORE_2_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 128 -; CHECK-NEXT: store <16 x half> [[MAI_2]], ptr addrspace(3) [[STORE_2_ADDR]], align 32 -; CHECK-NEXT: [[STORE_3_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 192 -; CHECK-NEXT: store <16 x half> [[MAI_3]], ptr addrspace(3) [[STORE_3_ADDR]], align 32 -; CHECK-NEXT: [[STORE_4_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 256 -; CHECK-NEXT: store <16 x half> [[MAI_4]], ptr addrspace(3) [[STORE_4_ADDR]], align 32 -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) -; CHECK-NEXT: call void 
@llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) -; CHECK-NEXT: ret void -; -entry: - %idx = call i32 @llvm.amdgcn.workitem.id.x() - %load.0.addr = getelementptr <16 x half>, ptr addrspace(3) %in, i32 %idx - %load.0 = load <16 x half>, ptr addrspace(3) %load.0.addr - %load.1.addr = getelementptr <16 x half>, ptr addrspace(3) %load.0.addr, i32 64 - %load.1 = load <16 x half>, ptr addrspace(3) %load.1.addr - %load.2.addr = getelementptr <16 x half>, ptr addrspace(3) %load.1.addr, i32 128 - %load.2 = load <16 x half>, ptr addrspace(3) %load.2.addr - %load.3.addr = getelementptr <16 x half>, ptr addrspace(3) %load.2.addr, i32 192 - %load.3 = load <16 x half>, ptr addrspace(3) %load.3.addr - %load.4.addr = getelementptr <16 x half>, ptr addrspace(3) %load.3.addr, i32 256 - %load.4 = load <16 x half>, ptr addrspace(3) %load.4.addr - %mai.0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.0, <16 x half> %load.0, <16 x half> %load.0, i1 0) - %mai.1 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.1, <16 x half> %load.1, <16 x half> %load.1, i1 0) - %mai.2 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.2, <16 x half> %load.2, <16 x half> %load.2, i1 0) - %mai.3 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.3, <16 x half> %load.3, <16 x half> %load.3, i1 0) - %mai.4 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.4, <16 x half> %load.4, <16 x half> %load.4, i1 0) - %store.0.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 %idx - store <16 x half> %mai.0, ptr addrspace(3) %store.0.addr - %store.1.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 64 - store <16 x half> %mai.1, ptr addrspace(3) %store.1.addr - %store.2.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 128 - store <16 x half> %mai.2, ptr addrspace(3) %store.2.addr - %store.3.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 192 - store <16 x half> %mai.3, ptr addrspace(3) %store.3.addr - %store.4.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 256 - store <16 x half> %mai.4, ptr addrspace(3) %store.4.addr - call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) - call void 
@llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) - ret void -} - -define amdgpu_kernel void @aliasinfo_10v16f16_NA(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { -; CHECK-LABEL: define amdgpu_kernel void @aliasinfo_10v16f16_NA( -; CHECK-SAME: ptr addrspace(3) noalias [[IN:%.*]], ptr addrspace(3) noalias [[OUT:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[ALIASINFO_10V16F16_NA_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; CHECK-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() -; CHECK-NEXT: [[LOAD_0_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[IN]], i32 [[IDX]] -; CHECK-NEXT: [[LOAD_0:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_0_ADDR]], align 32, !alias.scope [[META42:![0-9]+]], !noalias [[META45:![0-9]+]] -; CHECK-NEXT: [[LOAD_1_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_0_ADDR]], i32 64 -; CHECK-NEXT: [[LOAD_1:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_1_ADDR]], align 32, !alias.scope [[META42]], !noalias [[META45]] -; CHECK-NEXT: [[LOAD_2_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_1_ADDR]], i32 128 -; CHECK-NEXT: [[LOAD_2:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_2_ADDR]], align 32, !alias.scope [[META42]], !noalias [[META45]] -; CHECK-NEXT: [[LOAD_3_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_2_ADDR]], i32 192 -; CHECK-NEXT: [[LOAD_3:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_3_ADDR]], align 32, !alias.scope [[META42]], !noalias [[META45]] -; CHECK-NEXT: [[LOAD_4_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_3_ADDR]], i32 256 -; CHECK-NEXT: [[LOAD_4:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_4_ADDR]], align 32, !alias.scope [[META42]], !noalias [[META45]] -; CHECK-NEXT: [[MAI_0:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_0]], <16 x half> [[LOAD_0]], <16 x half> [[LOAD_0]], i1 false) -; CHECK-NEXT: [[MAI_1:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_1]], <16 x half> [[LOAD_1]], <16 x half> [[LOAD_1]], i1 false) -; CHECK-NEXT: [[MAI_2:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_2]], <16 x half> [[LOAD_2]], <16 x half> [[LOAD_2]], i1 false) -; CHECK-NEXT: [[MAI_3:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_3]], <16 x half> [[LOAD_3]], <16 x half> [[LOAD_3]], i1 false) -; CHECK-NEXT: [[MAI_4:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_4]], <16 x half> [[LOAD_4]], <16 x half> [[LOAD_4]], i1 false) -; CHECK-NEXT: [[STORE_0_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 [[IDX]] -; CHECK-NEXT: store <16 x half> [[MAI_0]], ptr addrspace(3) [[STORE_0_ADDR]], align 32, !alias.scope [[META45]], !noalias [[META42]] -; CHECK-NEXT: [[STORE_1_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 64 -; CHECK-NEXT: store <16 x half> [[MAI_1]], ptr addrspace(3) 
[[STORE_1_ADDR]], align 32, !alias.scope [[META45]], !noalias [[META42]] -; CHECK-NEXT: [[STORE_2_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 128 -; CHECK-NEXT: store <16 x half> [[MAI_2]], ptr addrspace(3) [[STORE_2_ADDR]], align 32, !alias.scope [[META45]], !noalias [[META42]] -; CHECK-NEXT: [[STORE_3_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 192 -; CHECK-NEXT: store <16 x half> [[MAI_3]], ptr addrspace(3) [[STORE_3_ADDR]], align 32, !alias.scope [[META45]], !noalias [[META42]] -; CHECK-NEXT: [[STORE_4_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 256 -; CHECK-NEXT: store <16 x half> [[MAI_4]], ptr addrspace(3) [[STORE_4_ADDR]], align 32, !alias.scope [[META45]], !noalias [[META42]] -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) -; CHECK-NEXT: ret void -; -entry: - %idx = call i32 @llvm.amdgcn.workitem.id.x() - %load.0.addr = getelementptr <16 x half>, ptr addrspace(3) %in, i32 %idx - %load.0 = load <16 x half>, ptr addrspace(3) %load.0.addr - %load.1.addr = getelementptr <16 x half>, ptr addrspace(3) %load.0.addr, i32 64 - %load.1 = load <16 x half>, ptr addrspace(3) %load.1.addr - %load.2.addr = getelementptr <16 x half>, ptr addrspace(3) %load.1.addr, i32 128 - %load.2 = load <16 x half>, ptr addrspace(3) %load.2.addr - %load.3.addr = getelementptr <16 x half>, ptr addrspace(3) %load.2.addr, i32 192 - %load.3 = load <16 x half>, ptr addrspace(3) %load.3.addr - %load.4.addr = getelementptr <16 x half>, ptr addrspace(3) %load.3.addr, i32 256 - %load.4 = load <16 x half>, ptr addrspace(3) %load.4.addr - %mai.0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.0, <16 x half> %load.0, <16 x half> %load.0, i1 0) - %mai.1 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.1, <16 x half> %load.1, <16 x half> %load.1, i1 0) - %mai.2 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.2, <16 x half> %load.2, <16 x half> %load.2, i1 0) - %mai.3 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.3, <16 x half> %load.3, <16 x half> %load.3, i1 0) - %mai.4 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.4, <16 x half> %load.4, <16 x half> %load.4, i1 0) - %store.0.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 %idx - store <16 x half> %mai.0, ptr addrspace(3) 
%store.0.addr - %store.1.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 64 - store <16 x half> %mai.1, ptr addrspace(3) %store.1.addr - %store.2.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 128 - store <16 x half> %mai.2, ptr addrspace(3) %store.2.addr - %store.3.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 192 - store <16 x half> %mai.3, ptr addrspace(3) %store.3.addr - %store.4.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 256 - store <16 x half> %mai.4, ptr addrspace(3) %store.4.addr - call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) - ret void -} - -define amdgpu_kernel void @aliasinfo_10v16f16_AS(ptr addrspace(3) %in, ptr addrspace(3) %out) #0 { -; CHECK-LABEL: define amdgpu_kernel void @aliasinfo_10v16f16_AS( -; CHECK-SAME: ptr addrspace(3) [[IN:%.*]], ptr addrspace(3) [[OUT:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[ALIASINFO_10V16F16_AS_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; CHECK-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() -; CHECK-NEXT: [[LOAD_0_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[IN]], i32 [[IDX]] -; CHECK-NEXT: [[LOAD_0:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_0_ADDR]], align 32, !alias.scope [[META6]], !noalias [[META9]] -; CHECK-NEXT: [[LOAD_1_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_0_ADDR]], i32 64 -; CHECK-NEXT: [[LOAD_1:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_1_ADDR]], align 32, !alias.scope [[META6]], !noalias [[META9]] -; CHECK-NEXT: [[LOAD_2_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_1_ADDR]], i32 128 -; CHECK-NEXT: [[LOAD_2:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_2_ADDR]], align 32, !alias.scope [[META6]], !noalias [[META9]] -; CHECK-NEXT: [[LOAD_3_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_2_ADDR]], i32 192 -; CHECK-NEXT: [[LOAD_3:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_3_ADDR]], align 32, !alias.scope [[META6]], !noalias [[META9]] -; CHECK-NEXT: [[LOAD_4_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_3_ADDR]], i32 256 -; CHECK-NEXT: [[LOAD_4:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_4_ADDR]], align 32, !alias.scope [[META6]], !noalias [[META9]] -; CHECK-NEXT: [[MAI_0:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_0]], <16 x half> [[LOAD_0]], <16 x half> [[LOAD_0]], i1 false) -; CHECK-NEXT: [[MAI_1:%.*]] = call <16 x half> 
@llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_1]], <16 x half> [[LOAD_1]], <16 x half> [[LOAD_1]], i1 false) -; CHECK-NEXT: [[MAI_2:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_2]], <16 x half> [[LOAD_2]], <16 x half> [[LOAD_2]], i1 false) -; CHECK-NEXT: [[MAI_3:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_3]], <16 x half> [[LOAD_3]], <16 x half> [[LOAD_3]], i1 false) -; CHECK-NEXT: [[MAI_4:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_4]], <16 x half> [[LOAD_4]], <16 x half> [[LOAD_4]], i1 false) -; CHECK-NEXT: [[STORE_0_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 [[IDX]] -; CHECK-NEXT: store <16 x half> [[MAI_0]], ptr addrspace(3) [[STORE_0_ADDR]], align 32, !alias.scope [[META9]], !noalias [[META6]] -; CHECK-NEXT: [[STORE_1_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 64 -; CHECK-NEXT: store <16 x half> [[MAI_1]], ptr addrspace(3) [[STORE_1_ADDR]], align 32, !alias.scope [[META9]], !noalias [[META6]] -; CHECK-NEXT: [[STORE_2_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 128 -; CHECK-NEXT: store <16 x half> [[MAI_2]], ptr addrspace(3) [[STORE_2_ADDR]], align 32, !alias.scope [[META9]], !noalias [[META6]] -; CHECK-NEXT: [[STORE_3_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 192 -; CHECK-NEXT: store <16 x half> [[MAI_3]], ptr addrspace(3) [[STORE_3_ADDR]], align 32, !alias.scope [[META9]], !noalias [[META6]] -; CHECK-NEXT: [[STORE_4_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 256 -; CHECK-NEXT: store <16 x half> [[MAI_4]], ptr addrspace(3) [[STORE_4_ADDR]], align 32, !alias.scope [[META9]], !noalias [[META6]] -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) -; CHECK-NEXT: ret void -; -entry: - %idx = call i32 @llvm.amdgcn.workitem.id.x() - %load.0.addr = getelementptr <16 x half>, ptr addrspace(3) %in, i32 %idx - %load.0 = load <16 x half>, ptr addrspace(3) %load.0.addr, !alias.scope !4, !noalias !2 - %load.1.addr = getelementptr <16 x half>, ptr addrspace(3) %load.0.addr, i32 64 - %load.1 = load <16 x half>, ptr addrspace(3) %load.1.addr, !alias.scope !4, !noalias !2 - %load.2.addr = getelementptr <16 x half>, ptr addrspace(3) %load.1.addr, i32 128 - %load.2 = load <16 
x half>, ptr addrspace(3) %load.2.addr, !alias.scope !4, !noalias !2 - %load.3.addr = getelementptr <16 x half>, ptr addrspace(3) %load.2.addr, i32 192 - %load.3 = load <16 x half>, ptr addrspace(3) %load.3.addr, !alias.scope !4, !noalias !2 - %load.4.addr = getelementptr <16 x half>, ptr addrspace(3) %load.3.addr, i32 256 - %load.4 = load <16 x half>, ptr addrspace(3) %load.4.addr, !alias.scope !4, !noalias !2 - %mai.0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.0, <16 x half> %load.0, <16 x half> %load.0, i1 0) - %mai.1 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.1, <16 x half> %load.1, <16 x half> %load.1, i1 0) - %mai.2 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.2, <16 x half> %load.2, <16 x half> %load.2, i1 0) - %mai.3 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.3, <16 x half> %load.3, <16 x half> %load.3, i1 0) - %mai.4 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.4, <16 x half> %load.4, <16 x half> %load.4, i1 0) - %store.0.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 %idx - store <16 x half> %mai.0, ptr addrspace(3) %store.0.addr, !alias.scope !2, !noalias !4 - %store.1.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 64 - store <16 x half> %mai.1, ptr addrspace(3) %store.1.addr, !alias.scope !2, !noalias !4 - %store.2.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 128 - store <16 x half> %mai.2, ptr addrspace(3) %store.2.addr, !alias.scope !2, !noalias !4 - %store.3.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 192 - store <16 x half> %mai.3, ptr addrspace(3) %store.3.addr, !alias.scope !2, !noalias !4 - %store.4.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 256 - store <16 x half> %mai.4, ptr addrspace(3) %store.4.addr, !alias.scope !2, !noalias !4 - call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) - ret void -} - -define amdgpu_kernel void @aliasinfo_10v16f16_NA_AS(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { -; CHECK-LABEL: define amdgpu_kernel void @aliasinfo_10v16f16_NA_AS( -; CHECK-SAME: ptr addrspace(3) noalias [[IN:%.*]], ptr addrspace(3) noalias [[OUT:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[ALIASINFO_10V16F16_NA_AS_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; CHECK-NEXT: [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() -; CHECK-NEXT: [[LOAD_0_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) 
[[IN]], i32 [[IDX]] -; CHECK-NEXT: [[LOAD_0:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_0_ADDR]], align 32, !alias.scope [[META47:![0-9]+]], !noalias [[META50:![0-9]+]] -; CHECK-NEXT: [[LOAD_1_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_0_ADDR]], i32 64 -; CHECK-NEXT: [[LOAD_1:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_1_ADDR]], align 32, !alias.scope [[META47]], !noalias [[META50]] -; CHECK-NEXT: [[LOAD_2_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_1_ADDR]], i32 128 -; CHECK-NEXT: [[LOAD_2:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_2_ADDR]], align 32, !alias.scope [[META47]], !noalias [[META50]] -; CHECK-NEXT: [[LOAD_3_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_2_ADDR]], i32 192 -; CHECK-NEXT: [[LOAD_3:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_3_ADDR]], align 32, !alias.scope [[META47]], !noalias [[META50]] -; CHECK-NEXT: [[LOAD_4_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[LOAD_3_ADDR]], i32 256 -; CHECK-NEXT: [[LOAD_4:%.*]] = load <16 x half>, ptr addrspace(3) [[LOAD_4_ADDR]], align 32, !alias.scope [[META47]], !noalias [[META50]] -; CHECK-NEXT: [[MAI_0:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_0]], <16 x half> [[LOAD_0]], <16 x half> [[LOAD_0]], i1 false) -; CHECK-NEXT: [[MAI_1:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_1]], <16 x half> [[LOAD_1]], <16 x half> [[LOAD_1]], i1 false) -; CHECK-NEXT: [[MAI_2:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_2]], <16 x half> [[LOAD_2]], <16 x half> [[LOAD_2]], i1 false) -; CHECK-NEXT: [[MAI_3:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_3]], <16 x half> [[LOAD_3]], <16 x half> [[LOAD_3]], i1 false) -; CHECK-NEXT: [[MAI_4:%.*]] = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[LOAD_4]], <16 x half> [[LOAD_4]], <16 x half> [[LOAD_4]], i1 false) -; CHECK-NEXT: [[STORE_0_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 [[IDX]] -; CHECK-NEXT: store <16 x half> [[MAI_0]], ptr addrspace(3) [[STORE_0_ADDR]], align 32, !alias.scope [[META50]], !noalias [[META47]] -; CHECK-NEXT: [[STORE_1_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 64 -; CHECK-NEXT: store <16 x half> [[MAI_1]], ptr addrspace(3) [[STORE_1_ADDR]], align 32, !alias.scope [[META50]], !noalias [[META47]] -; CHECK-NEXT: [[STORE_2_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 128 -; CHECK-NEXT: store <16 x half> [[MAI_2]], ptr addrspace(3) [[STORE_2_ADDR]], align 32, !alias.scope [[META50]], !noalias [[META47]] -; CHECK-NEXT: [[STORE_3_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 192 -; CHECK-NEXT: store <16 x half> [[MAI_3]], ptr addrspace(3) [[STORE_3_ADDR]], align 32, !alias.scope [[META50]], !noalias [[META47]] -; CHECK-NEXT: [[STORE_4_ADDR:%.*]] = getelementptr <16 x half>, ptr addrspace(3) [[OUT]], i32 256 -; CHECK-NEXT: store <16 x half> [[MAI_4]], ptr addrspace(3) [[STORE_4_ADDR]], align 32, !alias.scope [[META50]], !noalias [[META47]] -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) -; CHECK-NEXT: 
call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) -; CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) -; CHECK-NEXT: ret void -; -entry: - %idx = call i32 @llvm.amdgcn.workitem.id.x() - %load.0.addr = getelementptr <16 x half>, ptr addrspace(3) %in, i32 %idx - %load.0 = load <16 x half>, ptr addrspace(3) %load.0.addr, !alias.scope !4, !noalias !2 - %load.1.addr = getelementptr <16 x half>, ptr addrspace(3) %load.0.addr, i32 64 - %load.1 = load <16 x half>, ptr addrspace(3) %load.1.addr, !alias.scope !4, !noalias !2 - %load.2.addr = getelementptr <16 x half>, ptr addrspace(3) %load.1.addr, i32 128 - %load.2 = load <16 x half>, ptr addrspace(3) %load.2.addr, !alias.scope !4, !noalias !2 - %load.3.addr = getelementptr <16 x half>, ptr addrspace(3) %load.2.addr, i32 192 - %load.3 = load <16 x half>, ptr addrspace(3) %load.3.addr, !alias.scope !4, !noalias !2 - %load.4.addr = getelementptr <16 x half>, ptr addrspace(3) %load.3.addr, i32 256 - %load.4 = load <16 x half>, ptr addrspace(3) %load.4.addr, !alias.scope !4, !noalias !2 - %mai.0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.0, <16 x half> %load.0, <16 x half> %load.0, i1 0) - %mai.1 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.1, <16 x half> %load.1, <16 x half> %load.1, i1 0) - %mai.2 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.2, <16 x half> %load.2, <16 x half> %load.2, i1 0) - %mai.3 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.3, <16 x half> %load.3, <16 x half> %load.3, i1 0) - %mai.4 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %load.4, <16 x half> %load.4, <16 x half> %load.4, i1 0) - %store.0.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 %idx - store <16 x half> %mai.0, ptr addrspace(3) %store.0.addr, !alias.scope !2, !noalias !4 - %store.1.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 64 - store <16 x half> %mai.1, ptr addrspace(3) %store.1.addr, !alias.scope !2, !noalias !4 - %store.2.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 128 - store <16 x half> %mai.2, ptr addrspace(3) %store.2.addr, !alias.scope !2, !noalias !4 - %store.3.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 192 - store <16 x half> %mai.3, ptr addrspace(3) %store.3.addr, !alias.scope !2, !noalias !4 - %store.4.addr = getelementptr <16 x half>, ptr addrspace(3) %out, i32 256 - store <16 x half> %mai.4, ptr addrspace(3) %store.4.addr, !alias.scope !2, !noalias !4 - call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, 
i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) - call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 2, i32 0) - ret void -} - declare i32 @llvm.amdgcn.workitem.id.x() #2 -declare void @llvm.amdgcn.sched.group.barrier(i32, i32, i32) #1 -declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg) #1 attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1,32" } attributes #1 = { nounwind } @@ -677,14 +303,4 @@ attributes #2 = { nounwind readnone speculatable } ; CHECK: [[META39]] = !{[[META7]], [[META34]], [[META30]], [[META35]]} ; CHECK: [[META40]] = !{[[META10]], [[META34]]} ; CHECK: [[META41]] = !{[[META7]], [[META33]], [[META30]], [[META35]]} -; CHECK: [[META42]] = !{[[META43:![0-9]+]]} -; CHECK: [[META43]] = distinct !{[[META43]], [[META44:![0-9]+]], !"in"} -; CHECK: [[META44]] = distinct !{[[META44]], !"aliasinfo_10v16f16_NA"} -; CHECK: [[META45]] = !{[[META46:![0-9]+]]} -; CHECK: [[META46]] = distinct !{[[META46]], [[META44]], !"out"} -; CHECK: [[META47]] = !{[[META7]], [[META48:![0-9]+]]} -; CHECK: [[META48]] = distinct !{[[META48]], [[META49:![0-9]+]], !"in"} -; CHECK: [[META49]] = distinct !{[[META49]], !"aliasinfo_10v16f16_NA_AS"} -; CHECK: [[META50]] = !{[[META10]], [[META51:![0-9]+]]} -; CHECK: [[META51]] = distinct !{[[META51]], [[META49]], !"out"} ;. From 06725a8da60ee9e11aa27df81dda7a5da255a931 Mon Sep 17 00:00:00 2001 From: Leon Clark Date: Thu, 18 Dec 2025 15:38:15 +0000 Subject: [PATCH 11/11] Add tests. 
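
Exercise the propagation logic on calls as well as plain loads and stores.
The new aliasinfo_mixed_intrinsics tests mix a read-only pointer (%in), a
read-write pointer (%inout) and a write-only pointer (%out), accessing them
through llvm.amdgcn.global.load.tr.b256 and llvm.memcpy, and cover the same
plain, noalias (_NA), pre-annotated (_AS) and combined (_NA_AS) variants as
the existing tests.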
--- .../CodeGen/AMDGPU/lower-noalias-kernargs.ll | 136 +++++++++++++++++- 1 file changed, 132 insertions(+), 4 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/lower-noalias-kernargs.ll b/llvm/test/CodeGen/AMDGPU/lower-noalias-kernargs.ll index 9f83c93524cd2..36fad7beb6b9b 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-noalias-kernargs.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-noalias-kernargs.ll @@ -13,7 +13,7 @@ define amdgpu_kernel void @aliasinfo_2i32(ptr addrspace(1) %out, ptr addrspace(1 ; CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: [[IN_GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[IN_LOAD]], i32 [[TID]] ; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(1) [[IN_GEP]], align 4 -; CHECK-NEXT: [[CTLZ:%.*]] = call i32 @llvm.ctlz.i32(i32 [[VAL]], i1 false) #[[ATTR2:[0-9]+]] +; CHECK-NEXT: [[CTLZ:%.*]] = call i32 @llvm.ctlz.i32(i32 [[VAL]], i1 false) #[[ATTR3:[0-9]+]] ; CHECK-NEXT: store i32 [[CTLZ]], ptr addrspace(1) [[OUT_LOAD]], align 4 ; CHECK-NEXT: ret void ; @@ -38,7 +38,7 @@ define amdgpu_kernel void @aliasinfo_2i32_NA(ptr addrspace(1) noalias %out, ptr ; CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: [[IN_GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[IN_LOAD]], i32 [[TID]] ; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(1) [[IN_GEP]], align 4, !alias.scope [[META1:![0-9]+]], !noalias [[META4:![0-9]+]] -; CHECK-NEXT: [[CTLZ:%.*]] = call i32 @llvm.ctlz.i32(i32 [[VAL]], i1 false) #[[ATTR2]] +; CHECK-NEXT: [[CTLZ:%.*]] = call i32 @llvm.ctlz.i32(i32 [[VAL]], i1 false) #[[ATTR3]] ; CHECK-NEXT: store i32 [[CTLZ]], ptr addrspace(1) [[OUT_LOAD]], align 4, !alias.scope [[META4]], !noalias [[META1]] ; CHECK-NEXT: ret void ; @@ -63,7 +63,7 @@ define amdgpu_kernel void @aliasinfo_2i32_AS(ptr addrspace(1) %out, ptr addrspac ; CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: [[IN_GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[IN_LOAD]], i32 [[TID]] ; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(1) [[IN_GEP]], align 4, !alias.scope [[META6:![0-9]+]], !noalias [[META9:![0-9]+]] -; CHECK-NEXT: [[CTLZ:%.*]] = call i32 @llvm.ctlz.i32(i32 [[VAL]], i1 false) #[[ATTR2]] +; CHECK-NEXT: [[CTLZ:%.*]] = call i32 @llvm.ctlz.i32(i32 [[VAL]], i1 false) #[[ATTR3]] ; CHECK-NEXT: store i32 [[CTLZ]], ptr addrspace(1) [[OUT_LOAD]], align 4, !alias.scope [[META9]], !noalias [[META6]] ; CHECK-NEXT: ret void ; @@ -88,7 +88,7 @@ define amdgpu_kernel void @aliasinfo_2i32_NA_AS(ptr addrspace(1) noalias %out, p ; CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: [[IN_GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[IN_LOAD]], i32 [[TID]] ; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(1) [[IN_GEP]], align 4, !alias.scope [[META11:![0-9]+]], !noalias [[META14:![0-9]+]] -; CHECK-NEXT: [[CTLZ:%.*]] = call i32 @llvm.ctlz.i32(i32 [[VAL]], i1 false) #[[ATTR2]] +; CHECK-NEXT: [[CTLZ:%.*]] = call i32 @llvm.ctlz.i32(i32 [[VAL]], i1 false) #[[ATTR3]] ; CHECK-NEXT: store i32 [[CTLZ]], ptr addrspace(1) [[OUT_LOAD]], align 4, !alias.scope [[META14]], !noalias [[META11]] ; CHECK-NEXT: ret void ; @@ -249,7 +249,117 @@ entry: ret void } +define amdgpu_kernel void @aliasinfo_mixed_intrinsics(ptr addrspace(1) %in, ptr addrspace(1) %inout, ptr addrspace(1) %out) { +; CHECK-LABEL: define amdgpu_kernel void @aliasinfo_mixed_intrinsics( +; CHECK-SAME: ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[INOUT:%.*]], ptr addrspace(1) [[OUT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: 
[[ALIASINFO_MIXED_INTRINSICS_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(280) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; CHECK-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_MIXED_INTRINSICS_KERNARG_SEGMENT]], i64 36 +; CHECK-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 4, !invariant.load [[META0]] +; CHECK-NEXT: [[INOUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_MIXED_INTRINSICS_KERNARG_SEGMENT]], i64 44 +; CHECK-NEXT: [[INOUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[INOUT_KERNARG_OFFSET]], align 4, !invariant.load [[META0]] +; CHECK-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_MIXED_INTRINSICS_KERNARG_SEGMENT]], i64 52 +; CHECK-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 4, !invariant.load [[META0]] +; CHECK-NEXT: [[VAL1:%.*]] = call <4 x float> @llvm.amdgcn.global.load.tr.b256.v4f32.p1(ptr addrspace(1) [[IN_LOAD]]) +; CHECK-NEXT: [[VAL2:%.*]] = call <4 x float> @llvm.amdgcn.global.load.tr.b256.v4f32.p1(ptr addrspace(1) [[INOUT_LOAD]]) +; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[OUT_LOAD]], ptr addrspace(1) [[INOUT_LOAD]], i64 16, i1 false) +; CHECK-NEXT: [[VAL3:%.*]] = fmul <4 x float> [[VAL1]], [[VAL2]] +; CHECK-NEXT: store <4 x float> [[VAL3]], ptr addrspace(1) [[INOUT_LOAD]], align 16 +; CHECK-NEXT: ret void +; +entry: + %val1 = call <4 x float> @llvm.amdgcn.global.load.tr.b256.v4f32.p1(ptr addrspace(1) %in) + %val2 = call <4 x float> @llvm.amdgcn.global.load.tr.b256.v4f32.p1(ptr addrspace(1) %inout) + call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %out, ptr addrspace(1) %inout, i64 16, i1 false) + %val3 = fmul <4 x float> %val1, %val2 + store <4 x float> %val3, ptr addrspace(1) %inout + ret void +} + +define amdgpu_kernel void @aliasinfo_mixed_intrinsics_NA(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %inout, ptr addrspace(1) noalias %out) { +; CHECK-LABEL: define amdgpu_kernel void @aliasinfo_mixed_intrinsics_NA( +; CHECK-SAME: ptr addrspace(1) noalias [[IN:%.*]], ptr addrspace(1) noalias [[INOUT:%.*]], ptr addrspace(1) noalias [[OUT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[ALIASINFO_MIXED_INTRINSICS_NA_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(280) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; CHECK-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_MIXED_INTRINSICS_NA_KERNARG_SEGMENT]], i64 36 +; CHECK-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 4, !invariant.load [[META0]] +; CHECK-NEXT: [[INOUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_MIXED_INTRINSICS_NA_KERNARG_SEGMENT]], i64 44 +; CHECK-NEXT: [[INOUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[INOUT_KERNARG_OFFSET]], align 4, !invariant.load [[META0]] +; CHECK-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_MIXED_INTRINSICS_NA_KERNARG_SEGMENT]], i64 52 +; CHECK-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 4, !invariant.load [[META0]] +; CHECK-NEXT: [[VAL1:%.*]] = call <4 x float> @llvm.amdgcn.global.load.tr.b256.v4f32.p1(ptr addrspace(1) [[IN_LOAD]]), !alias.scope [[META42:![0-9]+]], !noalias [[META45:![0-9]+]] +; CHECK-NEXT: [[VAL2:%.*]] = call <4 x float> 
@llvm.amdgcn.global.load.tr.b256.v4f32.p1(ptr addrspace(1) [[INOUT_LOAD]]), !alias.scope [[META48:![0-9]+]], !noalias [[META49:![0-9]+]] +; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[OUT_LOAD]], ptr addrspace(1) [[INOUT_LOAD]], i64 16, i1 false), !alias.scope [[META45]], !noalias [[META42]] +; CHECK-NEXT: [[VAL3:%.*]] = fmul <4 x float> [[VAL1]], [[VAL2]] +; CHECK-NEXT: store <4 x float> [[VAL3]], ptr addrspace(1) [[INOUT_LOAD]], align 16, !alias.scope [[META48]], !noalias [[META49]] +; CHECK-NEXT: ret void +; +entry: + %val1 = call <4 x float> @llvm.amdgcn.global.load.tr.b256.v4f32.p1(ptr addrspace(1) %in) + %val2 = call <4 x float> @llvm.amdgcn.global.load.tr.b256.v4f32.p1(ptr addrspace(1) %inout) + call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %out, ptr addrspace(1) %inout, i64 16, i1 false) + %val3 = fmul <4 x float> %val1, %val2 + store <4 x float> %val3, ptr addrspace(1) %inout + ret void +} + +define amdgpu_kernel void @aliasinfo_mixed_intrinsics_AS(ptr addrspace(1) %in, ptr addrspace(1) %inout, ptr addrspace(1) %out) { +; CHECK-LABEL: define amdgpu_kernel void @aliasinfo_mixed_intrinsics_AS( +; CHECK-SAME: ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[INOUT:%.*]], ptr addrspace(1) [[OUT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[ALIASINFO_MIXED_INTRINSICS_AS_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(280) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; CHECK-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_MIXED_INTRINSICS_AS_KERNARG_SEGMENT]], i64 36 +; CHECK-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 4, !invariant.load [[META0]] +; CHECK-NEXT: [[INOUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_MIXED_INTRINSICS_AS_KERNARG_SEGMENT]], i64 44 +; CHECK-NEXT: [[INOUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[INOUT_KERNARG_OFFSET]], align 4, !invariant.load [[META0]] +; CHECK-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_MIXED_INTRINSICS_AS_KERNARG_SEGMENT]], i64 52 +; CHECK-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 4, !invariant.load [[META0]] +; CHECK-NEXT: [[VAL1:%.*]] = call <4 x float> @llvm.amdgcn.global.load.tr.b256.v4f32.p1(ptr addrspace(1) [[IN_LOAD]]), !alias.scope [[META6]], !noalias [[META9]] +; CHECK-NEXT: [[VAL2:%.*]] = call <4 x float> @llvm.amdgcn.global.load.tr.b256.v4f32.p1(ptr addrspace(1) [[INOUT_LOAD]]), !alias.scope [[META6]], !noalias [[META9]] +; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[OUT_LOAD]], ptr addrspace(1) [[INOUT_LOAD]], i64 16, i1 false), !alias.scope [[META9]], !noalias [[META6]] +; CHECK-NEXT: [[VAL3:%.*]] = fmul <4 x float> [[VAL1]], [[VAL2]] +; CHECK-NEXT: store <4 x float> [[VAL3]], ptr addrspace(1) [[INOUT_LOAD]], align 16, !alias.scope [[META9]], !noalias [[META6]] +; CHECK-NEXT: ret void +; +entry: + %val1 = call <4 x float> @llvm.amdgcn.global.load.tr.b256.v4f32.p1(ptr addrspace(1) %in), !alias.scope !4, !noalias !2 + %val2 = call <4 x float> @llvm.amdgcn.global.load.tr.b256.v4f32.p1(ptr addrspace(1) %inout), !alias.scope !4, !noalias !2 + call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %out, ptr addrspace(1) %inout, i64 16, i1 false), !alias.scope !2, !noalias !4 + %val3 = fmul <4 x float> %val1, %val2 + store <4 x float> %val3, ptr addrspace(1) %inout, !alias.scope !2, !noalias !4 + ret void +} + +define amdgpu_kernel 
void @aliasinfo_mixed_intrinsics_NA_AS(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %inout, ptr addrspace(1) noalias %out) { +; CHECK-LABEL: define amdgpu_kernel void @aliasinfo_mixed_intrinsics_NA_AS( +; CHECK-SAME: ptr addrspace(1) noalias [[IN:%.*]], ptr addrspace(1) noalias [[INOUT:%.*]], ptr addrspace(1) noalias [[OUT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[ALIASINFO_MIXED_INTRINSICS_NA_AS_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(280) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; CHECK-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_MIXED_INTRINSICS_NA_AS_KERNARG_SEGMENT]], i64 36 +; CHECK-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 4, !invariant.load [[META0]] +; CHECK-NEXT: [[INOUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_MIXED_INTRINSICS_NA_AS_KERNARG_SEGMENT]], i64 44 +; CHECK-NEXT: [[INOUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[INOUT_KERNARG_OFFSET]], align 4, !invariant.load [[META0]] +; CHECK-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[ALIASINFO_MIXED_INTRINSICS_NA_AS_KERNARG_SEGMENT]], i64 52 +; CHECK-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 4, !invariant.load [[META0]] +; CHECK-NEXT: [[VAL1:%.*]] = call <4 x float> @llvm.amdgcn.global.load.tr.b256.v4f32.p1(ptr addrspace(1) [[IN_LOAD]]), !alias.scope [[META50:![0-9]+]], !noalias [[META53:![0-9]+]] +; CHECK-NEXT: [[VAL2:%.*]] = call <4 x float> @llvm.amdgcn.global.load.tr.b256.v4f32.p1(ptr addrspace(1) [[INOUT_LOAD]]), !alias.scope [[META56:![0-9]+]], !noalias [[META57:![0-9]+]] +; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[OUT_LOAD]], ptr addrspace(1) [[INOUT_LOAD]], i64 16, i1 false), !alias.scope [[META53]], !noalias [[META50]] +; CHECK-NEXT: [[VAL3:%.*]] = fmul <4 x float> [[VAL1]], [[VAL2]] +; CHECK-NEXT: store <4 x float> [[VAL3]], ptr addrspace(1) [[INOUT_LOAD]], align 16, !alias.scope [[META58:![0-9]+]], !noalias [[META59:![0-9]+]] +; CHECK-NEXT: ret void +; +entry: + %val1 = call <4 x float> @llvm.amdgcn.global.load.tr.b256.v4f32.p1(ptr addrspace(1) %in), !alias.scope !4, !noalias !2 + %val2 = call <4 x float> @llvm.amdgcn.global.load.tr.b256.v4f32.p1(ptr addrspace(1) %inout), !alias.scope !4, !noalias !2 + call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %out, ptr addrspace(1) %inout, i64 16, i1 false), !alias.scope !2, !noalias !4 + %val3 = fmul <4 x float> %val1, %val2 + store <4 x float> %val3, ptr addrspace(1) %inout, !alias.scope !2, !noalias !4 + ret void +} + declare i32 @llvm.amdgcn.workitem.id.x() #2 +declare <4 x float> @llvm.amdgcn.global.load.tr.b256.v4f32.p1(ptr addrspace(1)) +declare void @llvm.memcpy.p1.p1.i64(ptr addrspace(1), ptr addrspace(1), i64, i1) attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1,32" } attributes #1 = { nounwind } @@ -303,4 +413,22 @@ attributes #2 = { nounwind readnone speculatable } ; CHECK: [[META39]] = !{[[META7]], [[META34]], [[META30]], [[META35]]} ; CHECK: [[META40]] = !{[[META10]], [[META34]]} ; CHECK: [[META41]] = !{[[META7]], [[META33]], [[META30]], [[META35]]} +; CHECK: [[META42]] = !{[[META43:![0-9]+]]} +; CHECK: [[META43]] = distinct !{[[META43]], [[META44:![0-9]+]], !"in"} +; CHECK: [[META44]] = distinct !{[[META44]], !"aliasinfo_mixed_intrinsics_NA"} +; CHECK: [[META45]] = !{[[META46:![0-9]+]], [[META47:![0-9]+]]} +; CHECK: [[META46]] = distinct 
!{[[META46]], [[META44]], !"inout"} +; CHECK: [[META47]] = distinct !{[[META47]], [[META44]], !"out"} +; CHECK: [[META48]] = !{[[META46]]} +; CHECK: [[META49]] = !{[[META43]], [[META47]]} +; CHECK: [[META50]] = !{[[META7]], [[META51:![0-9]+]]} +; CHECK: [[META51]] = distinct !{[[META51]], [[META52:![0-9]+]], !"in"} +; CHECK: [[META52]] = distinct !{[[META52]], !"aliasinfo_mixed_intrinsics_NA_AS"} +; CHECK: [[META53]] = !{[[META10]], [[META54:![0-9]+]], [[META55:![0-9]+]]} +; CHECK: [[META54]] = distinct !{[[META54]], [[META52]], !"inout"} +; CHECK: [[META55]] = distinct !{[[META55]], [[META52]], !"out"} +; CHECK: [[META56]] = !{[[META7]], [[META54]]} +; CHECK: [[META57]] = !{[[META10]], [[META51]], [[META55]]} +; CHECK: [[META58]] = !{[[META10]], [[META54]]} +; CHECK: [[META59]] = !{[[META7]], [[META51]], [[META55]]} ;.
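
As a reading aid for the series, below is a minimal standalone sketch of the
scope-construction pattern the addAliasScopeMetadata hunk above relies on. It
assumes LLVM's public ValueTracking and MDBuilder APIs plus the
SmallPtrSet::insert_range call already used in that hunk; the helper names
collectUnderlyingObjects and buildNoAliasScopes are hypothetical illustrations,
not functions added by these patches, and the element counts in the
SmallVector/SmallPtrSet types are conventional guesses.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"

using namespace llvm;

// Resolve each pointer argument to its underlying objects, as in the PtrArgs
// loop in the hunk above; the resulting set is what lets the pass decide which
// accesses can be attributed to which argument. (Hypothetical helper.)
static void collectUnderlyingObjects(ArrayRef<const Value *> PtrArgs,
                                     SmallPtrSetImpl<const Value *> &ObjSet) {
  for (const Value *Val : PtrArgs) {
    SmallVector<const Value *, 4> Objects;
    getUnderlyingObjects(Val, Objects);
    ObjSet.insert_range(Objects);
  }
}

// Give every noalias pointer argument its own anonymous scope inside a single
// domain named after the kernel; this is the shape of the !"in"/!"out" scopes
// named after the arguments in the CHECK metadata above. (Hypothetical helper.)
static void buildNoAliasScopes(Function &F,
                               DenseMap<const Argument *, MDNode *> &Scopes) {
  MDBuilder MDB(F.getContext());
  MDNode *Domain = MDB.createAnonymousAliasScopeDomain(F.getName());
  for (Argument &Arg : F.args())
    if (Arg.getType()->isPointerTy() && Arg.hasNoAliasAttr())
      Scopes[&Arg] = MDB.createAnonymousAliasScope(Domain, Arg.getName());
}

In the pass itself, each memory access is then tagged with !alias.scope for
the scopes of the objects it may access and !noalias for the scopes of the
remaining noalias arguments, which is the inverse in/out pairing the CHECK
lines in the tests verify.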