diff --git a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-IR-lowering.ll b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-IR-lowering.ll index aeb7faade4715..3f9e354d322f6 100644 --- a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-IR-lowering.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-IR-lowering.ll @@ -1,20 +1,80 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes='amdgpu-attributor,function(amdgpu-lower-kernel-arguments)' -S < %s | FileCheck -check-prefix=NO-PRELOAD %s -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes='amdgpu-attributor,function(amdgpu-lower-kernel-arguments)' -amdgpu-kernarg-preload-count=16 -S < %s | FileCheck -check-prefix=PRELOAD %s +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -S < %s | FileCheck -check-prefix=NO-PRELOAD %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -amdgpu-kernarg-preload-count=100 -S < %s | FileCheck -check-prefix=PRELOAD %s + +define amdgpu_kernel void @incompatible_attribute_block_count_x(ptr addrspace(1) %out, ptr addrspace(1) byref(i32) %arg) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@incompatible_attribute_block_count_x +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) byref(i32) [[ARG:%.*]]) #[[ATTR0:[0-9]+]] { +; NO-PRELOAD-NEXT: [[INCOMPATIBLE_ATTRIBUTE_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[INCOMPATIBLE_ATTRIBUTE_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) 
[[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0:![0-9]+]] +; NO-PRELOAD-NEXT: [[ARG_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[INCOMPATIBLE_ATTRIBUTE_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 8 +; NO-PRELOAD-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[ARG_BYVAL_KERNARG_OFFSET]] to ptr addrspace(1) +; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; NO-PRELOAD-NEXT: [[LOAD0:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4 +; NO-PRELOAD-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[TMP1]], align 4 +; NO-PRELOAD-NEXT: [[ADD:%.*]] = add i32 [[LOAD0]], [[LOAD1]] +; NO-PRELOAD-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-LABEL: define {{[^@]+}}@incompatible_attribute_block_count_x +; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) byref(i32) [[ARG:%.*]]) #[[ATTR0:[0-9]+]] { +; PRELOAD-NEXT: [[INCOMPATIBLE_ATTRIBUTE_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-NEXT: [[ARG_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[INCOMPATIBLE_ATTRIBUTE_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 8 +; PRELOAD-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[ARG_BYVAL_KERNARG_OFFSET]] to ptr addrspace(1) +; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; PRELOAD-NEXT: [[LOAD0:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4 +; PRELOAD-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[TMP1]], align 4 +; PRELOAD-NEXT: [[ADD:%.*]] = add i32 [[LOAD0]], [[LOAD1]] +; PRELOAD-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-NEXT: ret void +; + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %load0 = load i32, ptr addrspace(4) %imp_arg_ptr + %load1 = load i32, ptr addrspace(1) %arg + 
%add = add i32 %load0, %load1 + store i32 %add, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @preload_aggregate_arg_block_count_x(ptr addrspace(1) %out, { i32, i32 } inreg) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_aggregate_arg_block_count_x +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], { i32, i32 } inreg [[TMP0:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[PRELOAD_AGGREGATE_ARG_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_AGGREGATE_ARG_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-LABEL: define {{[^@]+}}@preload_aggregate_arg_block_count_x +; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], { i32, i32 } inreg [[TMP0:%.*]]) #[[ATTR0]] { +; PRELOAD-NEXT: [[PRELOAD_AGGREGATE_ARG_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4 +; PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-NEXT: ret void +; + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %load = load i32, ptr addrspace(4) %imp_arg_ptr + store i32 %load, ptr addrspace(1) %out + ret void +} define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) %out) 
{ -; NO-PRELOAD-LABEL: define amdgpu_kernel void @preload_block_count_x( -; NO-PRELOAD-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { +; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_x +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { ; NO-PRELOAD-NEXT: [[PRELOAD_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() ; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0 -; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0:![0-9]+]] +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] ; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() ; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4 ; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 ; NO-PRELOAD-NEXT: ret void ; -; PRELOAD-LABEL: define amdgpu_kernel void @preload_block_count_x( -; PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]]) #[[ATTR0:[0-9]+]] { +; PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_x +; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]]) #[[ATTR0]] { ; PRELOAD-NEXT: [[PRELOAD_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() ; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() ; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4 @@ -27,10 +87,35 @@ define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) %out) { ret void } -define 
amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) %out, i512) { -; NO-PRELOAD-LABEL: define amdgpu_kernel void @no_free_sgprs_block_count_x( -; NO-PRELOAD-SAME: ptr addrspace(1) [[OUT:%.*]], i512 [[TMP0:%.*]]) #[[ATTR0]] { -; NO-PRELOAD-NEXT: [[NO_FREE_SGPRS_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(328) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +define amdgpu_kernel void @preload_unused_arg_block_count_x(ptr addrspace(1) %out, i32 inreg) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_unused_arg_block_count_x +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 inreg [[TMP0:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[PRELOAD_UNUSED_ARG_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_UNUSED_ARG_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-LABEL: define {{[^@]+}}@preload_unused_arg_block_count_x +; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg [[TMP0:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]]) #[[ATTR0]] { +; PRELOAD-NEXT: [[PRELOAD_UNUSED_ARG_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4 +; 
PRELOAD-NEXT: store i32 [[_HIDDEN_BLOCK_COUNT_X]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-NEXT: ret void +; + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %load = load i32, ptr addrspace(4) %imp_arg_ptr + store i32 %load, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) %out, <16 x i32> inreg) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@no_free_sgprs_block_count_x +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], <16 x i32> inreg [[TMP0:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[NO_FREE_SGPRS_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(384) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() ; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[NO_FREE_SGPRS_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0 ; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] ; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() @@ -38,9 +123,9 @@ define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) %out, i5 ; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 ; NO-PRELOAD-NEXT: ret void ; -; PRELOAD-LABEL: define amdgpu_kernel void @no_free_sgprs_block_count_x( -; PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]], i512 inreg [[TMP0:%.*]]) #[[ATTR0]] { -; PRELOAD-NEXT: [[NO_FREE_SGPRS_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(328) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-LABEL: define {{[^@]+}}@no_free_sgprs_block_count_x +; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <16 x i32> inreg [[TMP0:%.*]]) #[[ATTR0]] { +; PRELOAD-NEXT: [[NO_FREE_SGPRS_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(384) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() ; PRELOAD-NEXT: 
[[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() ; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4 ; PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 @@ -52,31 +137,335 @@ define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) %out, i5 ret void } -define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) %out) { -; NO-PRELOAD-LABEL: define amdgpu_kernel void @preloadremainder_z( -; NO-PRELOAD-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { -; NO-PRELOAD-NEXT: [[PRELOADREMAINDER_Z_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOADREMAINDER_Z_KERNARG_SEGMENT]], i64 0 +define amdgpu_kernel void @mixed_inreg_block_count_x(ptr addrspace(1) %out, i32 inreg, i32) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@mixed_inreg_block_count_x +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 inreg [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[MIXED_INREG_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[MIXED_INREG_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0 ; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] ; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() -; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 22 +; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-LABEL: define 
{{[^@]+}}@mixed_inreg_block_count_x +; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg [[TMP0:%.*]], i32 inreg [[TMP1:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]]) #[[ATTR0]] { +; PRELOAD-NEXT: [[MIXED_INREG_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4 +; PRELOAD-NEXT: store i32 [[_HIDDEN_BLOCK_COUNT_X]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-NEXT: ret void +; + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %load = load i32, ptr addrspace(4) %imp_arg_ptr + store i32 %load, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @incorrect_type_i64_block_count_x(ptr addrspace(1) %out) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@incorrect_type_i64_block_count_x +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[INCORRECT_TYPE_I64_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[INCORRECT_TYPE_I64_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i64, ptr addrspace(4) [[IMP_ARG_PTR]], align 8 +; NO-PRELOAD-NEXT: store i64 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 8 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-LABEL: define {{[^@]+}}@incorrect_type_i64_block_count_x +; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] { +; PRELOAD-NEXT: 
[[INCORRECT_TYPE_I64_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; PRELOAD-NEXT: [[LOAD:%.*]] = load i64, ptr addrspace(4) [[IMP_ARG_PTR]], align 8 +; PRELOAD-NEXT: store i64 [[LOAD]], ptr addrspace(1) [[OUT]], align 8 +; PRELOAD-NEXT: ret void +; + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %load = load i64, ptr addrspace(4) %imp_arg_ptr + store i64 %load, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @incorrect_type_i16_block_count_x(ptr addrspace(1) %out) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@incorrect_type_i16_block_count_x +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[INCORRECT_TYPE_I16_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[INCORRECT_TYPE_I16_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[IMP_ARG_PTR]], align 2 +; NO-PRELOAD-NEXT: store i16 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 2 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-LABEL: define {{[^@]+}}@incorrect_type_i16_block_count_x +; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] { +; PRELOAD-NEXT: [[INCORRECT_TYPE_I16_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; 
PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[IMP_ARG_PTR]], align 2 +; PRELOAD-NEXT: store i16 [[LOAD]], ptr addrspace(1) [[OUT]], align 2 +; PRELOAD-NEXT: ret void +; + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %load = load i16, ptr addrspace(4) %imp_arg_ptr + store i16 %load, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @preload_block_count_y(ptr addrspace(1) %out) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_y +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[PRELOAD_BLOCK_COUNT_Y_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_BLOCK_COUNT_Y_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 4 +; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[GEP]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_y +; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]]) #[[ATTR0]] { +; PRELOAD-NEXT: [[PRELOAD_BLOCK_COUNT_Y_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 4 +; PRELOAD-NEXT: 
[[LOAD:%.*]] = load i32, ptr addrspace(4) [[GEP]], align 4 +; PRELOAD-NEXT: store i32 [[_HIDDEN_BLOCK_COUNT_Y]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-NEXT: ret void +; + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 4 + %load = load i32, ptr addrspace(4) %gep + store i32 %load, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) %out) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@random_incorrect_offset +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[RANDOM_INCORRECT_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[RANDOM_INCORRECT_OFFSET_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 2 +; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[GEP]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-LABEL: define {{[^@]+}}@random_incorrect_offset +; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] { +; PRELOAD-NEXT: [[RANDOM_INCORRECT_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 2 +; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[GEP]], align 4 +; PRELOAD-NEXT: 
store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-NEXT: ret void +; + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 2 + %load = load i32, ptr addrspace(4) %gep + store i32 %load, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @preload_block_count_z(ptr addrspace(1) %out) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_z +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[PRELOAD_BLOCK_COUNT_Z_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_BLOCK_COUNT_Z_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 8 +; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[GEP]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_z +; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]]) #[[ATTR0]] { +; PRELOAD-NEXT: [[PRELOAD_BLOCK_COUNT_Z_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 8 
+; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[GEP]], align 4 +; PRELOAD-NEXT: store i32 [[_HIDDEN_BLOCK_COUNT_Z]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-NEXT: ret void +; + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8 + %load = load i32, ptr addrspace(4) %gep + store i32 %load, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @preload_block_count_x_imparg_align_ptr_i8(ptr addrspace(1) %out, i8 %val) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_x_imparg_align_ptr_i8 +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i8 [[VAL:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[PRELOAD_BLOCK_COUNT_X_IMPARG_ALIGN_PTR_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_BLOCK_COUNT_X_IMPARG_ALIGN_PTR_I8_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[VAL_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_BLOCK_COUNT_X_IMPARG_ALIGN_PTR_I8_KERNARG_SEGMENT]], i64 8 +; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[VAL_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8 +; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4 +; NO-PRELOAD-NEXT: [[EXT:%.*]] = zext i8 [[TMP2]] to i32 +; NO-PRELOAD-NEXT: [[ADD:%.*]] = add i32 [[LOAD]], [[EXT]] +; NO-PRELOAD-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-LABEL: define 
{{[^@]+}}@preload_block_count_x_imparg_align_ptr_i8 +; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i8 inreg [[VAL:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]]) #[[ATTR0]] { +; PRELOAD-NEXT: [[PRELOAD_BLOCK_COUNT_X_IMPARG_ALIGN_PTR_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4 +; PRELOAD-NEXT: [[EXT:%.*]] = zext i8 [[VAL]] to i32 +; PRELOAD-NEXT: [[ADD:%.*]] = add i32 [[_HIDDEN_BLOCK_COUNT_X]], [[EXT]] +; PRELOAD-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-NEXT: ret void +; + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %load = load i32, ptr addrspace(4) %imp_arg_ptr + %ext = zext i8 %val to i32 + %add = add i32 %load, %ext + store i32 %add, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @preload_block_count_xyz(ptr addrspace(1) %out) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_xyz +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[PRELOAD_BLOCK_COUNT_XYZ_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_BLOCK_COUNT_XYZ_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; NO-PRELOAD-NEXT: [[GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 0 +; NO-PRELOAD-NEXT: [[LOAD_X:%.*]] = load i32, ptr addrspace(4) [[GEP_X]], align 4 +; NO-PRELOAD-NEXT: [[GEP_Y:%.*]] = getelementptr 
i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 4 +; NO-PRELOAD-NEXT: [[LOAD_Y:%.*]] = load i32, ptr addrspace(4) [[GEP_Y]], align 4 +; NO-PRELOAD-NEXT: [[GEP_Z:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 8 +; NO-PRELOAD-NEXT: [[LOAD_Z:%.*]] = load i32, ptr addrspace(4) [[GEP_Z]], align 4 +; NO-PRELOAD-NEXT: [[INS_0:%.*]] = insertelement <3 x i32> poison, i32 [[LOAD_X]], i32 0 +; NO-PRELOAD-NEXT: [[INS_1:%.*]] = insertelement <3 x i32> [[INS_0]], i32 [[LOAD_Y]], i32 1 +; NO-PRELOAD-NEXT: [[INS_2:%.*]] = insertelement <3 x i32> [[INS_1]], i32 [[LOAD_Z]], i32 2 +; NO-PRELOAD-NEXT: store <3 x i32> [[INS_2]], ptr addrspace(1) [[OUT_LOAD]], align 16 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_xyz +; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]]) #[[ATTR0]] { +; PRELOAD-NEXT: [[PRELOAD_BLOCK_COUNT_XYZ_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; PRELOAD-NEXT: [[GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 0 +; PRELOAD-NEXT: [[LOAD_X:%.*]] = load i32, ptr addrspace(4) [[GEP_X]], align 4 +; PRELOAD-NEXT: [[GEP_Y:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 4 +; PRELOAD-NEXT: [[LOAD_Y:%.*]] = load i32, ptr addrspace(4) [[GEP_Y]], align 4 +; PRELOAD-NEXT: [[GEP_Z:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 8 +; PRELOAD-NEXT: [[LOAD_Z:%.*]] = load i32, ptr addrspace(4) [[GEP_Z]], align 4 +; PRELOAD-NEXT: [[INS_0:%.*]] = insertelement <3 x i32> poison, i32 [[_HIDDEN_BLOCK_COUNT_X]], i32 0 +; PRELOAD-NEXT: [[INS_1:%.*]] = insertelement <3 x i32> [[INS_0]], i32 [[_HIDDEN_BLOCK_COUNT_Y]], i32 1 +; 
PRELOAD-NEXT: [[INS_2:%.*]] = insertelement <3 x i32> [[INS_1]], i32 [[_HIDDEN_BLOCK_COUNT_Z]], i32 2 +; PRELOAD-NEXT: store <3 x i32> [[INS_2]], ptr addrspace(1) [[OUT]], align 16 +; PRELOAD-NEXT: ret void +; + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 0 + %load_x = load i32, ptr addrspace(4) %gep_x + %gep_y = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 4 + %load_y = load i32, ptr addrspace(4) %gep_y + %gep_z = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8 + %load_z = load i32, ptr addrspace(4) %gep_z + %ins.0 = insertelement <3 x i32> poison, i32 %load_x, i32 0 + %ins.1 = insertelement <3 x i32> %ins.0, i32 %load_y, i32 1 + %ins.2 = insertelement <3 x i32> %ins.1, i32 %load_z, i32 2 + store <3 x i32> %ins.2, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @preload_workgroup_size_x(ptr addrspace(1) %out) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_workgroup_size_x +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[PRELOAD_WORKGROUP_SIZE_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_WORKGROUP_SIZE_X_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 12 ; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2 ; NO-PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[LOAD]] to i32 ; NO-PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT_LOAD]], align 4 ; NO-PRELOAD-NEXT: ret void ; -; PRELOAD-LABEL: define amdgpu_kernel void 
@preloadremainder_z( -; PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_Z:%.*]]) #[[ATTR0]] { -; PRELOAD-NEXT: [[PRELOADREMAINDER_Z_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-LABEL: define {{[^@]+}}@preload_workgroup_size_x +; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]]) #[[ATTR0]] { +; PRELOAD-NEXT: [[PRELOAD_WORKGROUP_SIZE_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() ; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() -; PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 22 +; PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 12 ; PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2 -; PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[_HIDDEN_REMAINDER_Z]] to i32 +; PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[_HIDDEN_GROUP_SIZE_X]] to i32 ; PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT]], align 4 ; PRELOAD-NEXT: ret void ; %imp_arg_ptr = call ptr 
addrspace(4) @llvm.amdgcn.implicitarg.ptr() - %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22 + %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 12 + %load = load i16, ptr addrspace(4) %gep + %conv = zext i16 %load to i32 + store i32 %conv, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @preload_workgroup_size_y(ptr addrspace(1) %out) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_workgroup_size_y +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[PRELOAD_WORKGROUP_SIZE_Y_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_WORKGROUP_SIZE_Y_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 14 +; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2 +; NO-PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[LOAD]] to i32 +; NO-PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-LABEL: define {{[^@]+}}@preload_workgroup_size_y +; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Y:%.*]]) #[[ATTR0]] { +; PRELOAD-NEXT: [[PRELOAD_WORKGROUP_SIZE_Y_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) 
@llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 14 +; PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2 +; PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[_HIDDEN_GROUP_SIZE_Y]] to i32 +; PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-NEXT: ret void +; + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 14 + %load = load i16, ptr addrspace(4) %gep + %conv = zext i16 %load to i32 + store i32 %conv, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @preload_workgroup_size_z(ptr addrspace(1) %out) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_workgroup_size_z +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[PRELOAD_WORKGROUP_SIZE_Z_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_WORKGROUP_SIZE_Z_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 16 +; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2 +; NO-PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[LOAD]] to i32 +; NO-PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-LABEL: define {{[^@]+}}@preload_workgroup_size_z +; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" 
[[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Z:%.*]]) #[[ATTR0]] { +; PRELOAD-NEXT: [[PRELOAD_WORKGROUP_SIZE_Z_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 16 +; PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2 +; PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[_HIDDEN_GROUP_SIZE_Z]] to i32 +; PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-NEXT: ret void +; + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16 %load = load i16, ptr addrspace(4) %gep %conv = zext i16 %load to i32 store i32 %conv, ptr addrspace(1) %out @@ -84,8 +473,8 @@ define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) %out) { } define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) %out) { -; NO-PRELOAD-LABEL: define amdgpu_kernel void @preload_workgroup_size_xyz( -; NO-PRELOAD-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_workgroup_size_xyz +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { ; NO-PRELOAD-NEXT: [[PRELOAD_WORKGROUP_SIZE_XYZ_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() ; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_WORKGROUP_SIZE_XYZ_KERNARG_SEGMENT]], i64 0 ; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = 
load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] @@ -105,8 +494,8 @@ define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) %out) { ; NO-PRELOAD-NEXT: store <3 x i32> [[INS_2]], ptr addrspace(1) [[OUT_LOAD]], align 16 ; NO-PRELOAD-NEXT: ret void ; -; PRELOAD-LABEL: define amdgpu_kernel void @preload_workgroup_size_xyz( -; PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Z:%.*]]) #[[ATTR0]] { +; PRELOAD-LABEL: define {{[^@]+}}@preload_workgroup_size_xyz +; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Z:%.*]]) #[[ATTR0]] { ; PRELOAD-NEXT: [[PRELOAD_WORKGROUP_SIZE_XYZ_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() ; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() ; PRELOAD-NEXT: [[GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 12 @@ -141,74 +530,206 @@ define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) %out) { ret void } -define amdgpu_kernel void @incorrect_type_i64_block_count_x(ptr addrspace(1) inreg %out) { -; NO-PRELOAD-LABEL: define amdgpu_kernel void 
@incorrect_type_i64_block_count_x( -; NO-PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] { -; NO-PRELOAD-NEXT: [[INCORRECT_TYPE_I64_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +define amdgpu_kernel void @preload_remainder_x(ptr addrspace(1) %out) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_remainder_x +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[PRELOAD_REMAINDER_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_REMAINDER_X_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] ; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() -; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i64, ptr addrspace(4) [[IMP_ARG_PTR]], align 8 -; NO-PRELOAD-NEXT: store i64 [[LOAD]], ptr addrspace(1) [[OUT]], align 8 +; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 18 +; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2 +; NO-PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[LOAD]] to i32 +; NO-PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT_LOAD]], align 4 ; NO-PRELOAD-NEXT: ret void ; -; PRELOAD-LABEL: define amdgpu_kernel void @incorrect_type_i64_block_count_x( -; PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] { -; PRELOAD-NEXT: [[INCORRECT_TYPE_I64_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-LABEL: define {{[^@]+}}@preload_remainder_x +; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], 
i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_X:%.*]]) #[[ATTR0]] { +; PRELOAD-NEXT: [[PRELOAD_REMAINDER_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() ; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() -; PRELOAD-NEXT: [[LOAD:%.*]] = load i64, ptr addrspace(4) [[IMP_ARG_PTR]], align 8 -; PRELOAD-NEXT: store i64 [[LOAD]], ptr addrspace(1) [[OUT]], align 8 +; PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 18 +; PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2 +; PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[_HIDDEN_REMAINDER_X]] to i32 +; PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT]], align 4 ; PRELOAD-NEXT: ret void ; %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() - %load = load i64, ptr addrspace(4) %imp_arg_ptr - store i64 %load, ptr addrspace(1) %out + %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 18 + %load = load i16, ptr addrspace(4) %gep + %conv = zext i16 %load to i32 + store i32 %conv, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out) { -; NO-PRELOAD-LABEL: define amdgpu_kernel void @random_incorrect_offset( -; NO-PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] { -; NO-PRELOAD-NEXT: [[RANDOM_INCORRECT_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +define amdgpu_kernel void @preloadremainder_y(ptr addrspace(1) %out) { +; NO-PRELOAD-LABEL: define 
{{[^@]+}}@preloadremainder_y +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[PRELOADREMAINDER_Y_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOADREMAINDER_Y_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] ; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() -; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 2 -; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[GEP]], align 4 -; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 +; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 20 +; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2 +; NO-PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[LOAD]] to i32 +; NO-PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT_LOAD]], align 4 ; NO-PRELOAD-NEXT: ret void ; -; PRELOAD-LABEL: define amdgpu_kernel void @random_incorrect_offset( -; PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] { -; PRELOAD-NEXT: [[RANDOM_INCORRECT_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-LABEL: define {{[^@]+}}@preloadremainder_y +; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Y:%.*]], i16 inreg "amdgpu-hidden-argument" 
[[_HIDDEN_GROUP_SIZE_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_Y:%.*]]) #[[ATTR0]] { +; PRELOAD-NEXT: [[PRELOADREMAINDER_Y_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() ; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() -; PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 2 -; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[GEP]], align 4 -; PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 20 +; PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2 +; PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[_HIDDEN_REMAINDER_Y]] to i32 +; PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT]], align 4 ; PRELOAD-NEXT: ret void ; %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() - %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 2 - %load = load i32, ptr addrspace(4) %gep - store i32 %load, ptr addrspace(1) %out + %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 20 + %load = load i16, ptr addrspace(4) %gep + %conv = zext i16 %load to i32 + store i32 %conv, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) %out) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@preloadremainder_z +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[PRELOADREMAINDER_Z_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOADREMAINDER_Z_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], 
align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 22 +; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2 +; NO-PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[LOAD]] to i32 +; NO-PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-LABEL: define {{[^@]+}}@preloadremainder_z +; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_Z:%.*]]) #[[ATTR0]] { +; PRELOAD-NEXT: [[PRELOADREMAINDER_Z_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 22 +; PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2 +; PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[_HIDDEN_REMAINDER_Z]] to i32 +; PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-NEXT: ret void +; + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22 + %load = load i16, ptr addrspace(4) %gep + %conv = zext i16 %load to i32 + store i32 %conv, ptr 
addrspace(1) %out + ret void +} + +define amdgpu_kernel void @preloadremainder_xyz(ptr addrspace(1) %out) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@preloadremainder_xyz +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[PRELOADREMAINDER_XYZ_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOADREMAINDER_XYZ_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; NO-PRELOAD-NEXT: [[GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 18 +; NO-PRELOAD-NEXT: [[LOAD_X:%.*]] = load i16, ptr addrspace(4) [[GEP_X]], align 2 +; NO-PRELOAD-NEXT: [[CONV_X:%.*]] = zext i16 [[LOAD_X]] to i32 +; NO-PRELOAD-NEXT: [[GEP_Y:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 20 +; NO-PRELOAD-NEXT: [[LOAD_Y:%.*]] = load i16, ptr addrspace(4) [[GEP_Y]], align 2 +; NO-PRELOAD-NEXT: [[CONV_Y:%.*]] = zext i16 [[LOAD_Y]] to i32 +; NO-PRELOAD-NEXT: [[GEP_Z:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 22 +; NO-PRELOAD-NEXT: [[LOAD_Z:%.*]] = load i16, ptr addrspace(4) [[GEP_Z]], align 2 +; NO-PRELOAD-NEXT: [[CONV_Z:%.*]] = zext i16 [[LOAD_Z]] to i32 +; NO-PRELOAD-NEXT: [[INS_0:%.*]] = insertelement <3 x i32> poison, i32 [[CONV_X]], i32 0 +; NO-PRELOAD-NEXT: [[INS_1:%.*]] = insertelement <3 x i32> [[INS_0]], i32 [[CONV_Y]], i32 1 +; NO-PRELOAD-NEXT: [[INS_2:%.*]] = insertelement <3 x i32> [[INS_1]], i32 [[CONV_Z]], i32 2 +; NO-PRELOAD-NEXT: store <3 x i32> [[INS_2]], ptr addrspace(1) [[OUT_LOAD]], align 16 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-LABEL: define {{[^@]+}}@preloadremainder_xyz +; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], 
i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_Z:%.*]]) #[[ATTR0]] { +; PRELOAD-NEXT: [[PRELOADREMAINDER_XYZ_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; PRELOAD-NEXT: [[GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 18 +; PRELOAD-NEXT: [[LOAD_X:%.*]] = load i16, ptr addrspace(4) [[GEP_X]], align 2 +; PRELOAD-NEXT: [[CONV_X:%.*]] = zext i16 [[_HIDDEN_REMAINDER_X]] to i32 +; PRELOAD-NEXT: [[GEP_Y:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 20 +; PRELOAD-NEXT: [[LOAD_Y:%.*]] = load i16, ptr addrspace(4) [[GEP_Y]], align 2 +; PRELOAD-NEXT: [[CONV_Y:%.*]] = zext i16 [[_HIDDEN_REMAINDER_Y]] to i32 +; PRELOAD-NEXT: [[GEP_Z:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 22 +; PRELOAD-NEXT: [[LOAD_Z:%.*]] = load i16, ptr addrspace(4) [[GEP_Z]], align 2 +; PRELOAD-NEXT: [[CONV_Z:%.*]] = zext i16 [[_HIDDEN_REMAINDER_Z]] to i32 +; PRELOAD-NEXT: [[INS_0:%.*]] = insertelement <3 x i32> poison, i32 [[CONV_X]], i32 0 +; PRELOAD-NEXT: [[INS_1:%.*]] = insertelement <3 x i32> [[INS_0]], i32 [[CONV_Y]], i32 1 +; PRELOAD-NEXT: [[INS_2:%.*]] = insertelement <3 x i32> [[INS_1]], i32 [[CONV_Z]], i32 2 +; PRELOAD-NEXT: store <3 x i32> [[INS_2]], ptr addrspace(1) [[OUT]], align 16 +; PRELOAD-NEXT: ret void +; + %imp_arg_ptr = call ptr 
addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 18 + %load_x = load i16, ptr addrspace(4) %gep_x + %conv_x = zext i16 %load_x to i32 + %gep_y = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 20 + %load_y = load i16, ptr addrspace(4) %gep_y + %conv_y = zext i16 %load_y to i32 + %gep_z = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22 + %load_z = load i16, ptr addrspace(4) %gep_z + %conv_z = zext i16 %load_z to i32 + %ins.0 = insertelement <3 x i32> poison, i32 %conv_x, i32 0 + %ins.1 = insertelement <3 x i32> %ins.0, i32 %conv_y, i32 1 + %ins.2 = insertelement <3 x i32> %ins.1, i32 %conv_z, i32 2 + store <3 x i32> %ins.2, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @incompatible_attribute_block_count_x(ptr addrspace(1) byref(i32) %out) { -; NO-PRELOAD-LABEL: define amdgpu_kernel void @incompatible_attribute_block_count_x( -; NO-PRELOAD-SAME: ptr addrspace(1) byref(i32) [[OUT:%.*]]) #[[ATTR0]] { -; NO-PRELOAD-NEXT: [[INCOMPATIBLE_ATTRIBUTE_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; NO-PRELOAD-NEXT: [[OUT_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[INCOMPATIBLE_ATTRIBUTE_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0 -; NO-PRELOAD-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[OUT_BYVAL_KERNARG_OFFSET]] to ptr addrspace(1) +define amdgpu_kernel void @no_free_sgprs_preloadremainder_z(ptr addrspace(1) %out) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@no_free_sgprs_preloadremainder_z +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[NO_FREE_SGPRS_PRELOADREMAINDER_Z_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[NO_FREE_SGPRS_PRELOADREMAINDER_Z_KERNARG_SEGMENT]], i64 
0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 22 +; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2 +; NO-PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[LOAD]] to i32 +; NO-PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-LABEL: define {{[^@]+}}@no_free_sgprs_preloadremainder_z +; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_Z:%.*]]) #[[ATTR0]] { +; PRELOAD-NEXT: [[NO_FREE_SGPRS_PRELOADREMAINDER_Z_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 22 +; PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2 +; PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[_HIDDEN_REMAINDER_Z]] to i32 +; PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-NEXT: ret void +; + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %gep = getelementptr i8, 
ptr addrspace(4) %imp_arg_ptr, i32 22 + %load = load i16, ptr addrspace(4) %gep + %conv = zext i16 %load to i32 + store i32 %conv, ptr addrspace(1) %out + ret void +} + + +define amdgpu_kernel void @preload_block_max_user_sgprs(ptr addrspace(1) %out, i192 %t0, i32 %t1) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_block_max_user_sgprs +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i192 [[T0:%.*]], i32 [[T1:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[PRELOAD_BLOCK_MAX_USER_SGPRS_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(296) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_BLOCK_MAX_USER_SGPRS_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] ; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() ; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4 -; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[TMP1]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 ; NO-PRELOAD-NEXT: ret void ; -; PRELOAD-LABEL: define amdgpu_kernel void @incompatible_attribute_block_count_x( -; PRELOAD-SAME: ptr addrspace(1) byref(i32) [[OUT:%.*]]) #[[ATTR0]] { -; PRELOAD-NEXT: [[INCOMPATIBLE_ATTRIBUTE_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-NEXT: [[OUT_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[INCOMPATIBLE_ATTRIBUTE_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0 -; PRELOAD-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[OUT_BYVAL_KERNARG_OFFSET]] to ptr addrspace(1) +; PRELOAD-LABEL: define {{[^@]+}}@preload_block_max_user_sgprs +; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i192 inreg [[T0:%.*]], 
i32 inreg [[T1:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]]) #[[ATTR0]] { +; PRELOAD-NEXT: [[PRELOAD_BLOCK_MAX_USER_SGPRS_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(296) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() ; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() ; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4 -; PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[TMP1]], align 4 +; PRELOAD-NEXT: store i32 [[_HIDDEN_BLOCK_COUNT_X]], ptr addrspace(1) [[OUT]], align 4 ; PRELOAD-NEXT: ret void ; %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() @@ -217,6 +738,57 @@ define amdgpu_kernel void @incompatible_attribute_block_count_x(ptr addrspace(1) ret void } -;. -; NO-PRELOAD: [[META0]] = !{} -;. +define amdgpu_kernel void @preload_block_count_z_workgroup_size_z_remainder_z(ptr addrspace(1) %out) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_z_workgroup_size_z_remainder_z +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[PRELOAD_BLOCK_COUNT_Z_WORKGROUP_SIZE_Z_REMAINDER_Z_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_BLOCK_COUNT_Z_WORKGROUP_SIZE_Z_REMAINDER_Z_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; NO-PRELOAD-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 8 +; NO-PRELOAD-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 16 +; NO-PRELOAD-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 22 +; 
NO-PRELOAD-NEXT: [[LOAD0:%.*]] = load i32, ptr addrspace(4) [[GEP0]], align 4 +; NO-PRELOAD-NEXT: [[LOAD1:%.*]] = load i16, ptr addrspace(4) [[GEP1]], align 2 +; NO-PRELOAD-NEXT: [[LOAD2:%.*]] = load i16, ptr addrspace(4) [[GEP2]], align 2 +; NO-PRELOAD-NEXT: [[CONV1:%.*]] = zext i16 [[LOAD1]] to i32 +; NO-PRELOAD-NEXT: [[CONV2:%.*]] = zext i16 [[LOAD2]] to i32 +; NO-PRELOAD-NEXT: [[INS_0:%.*]] = insertelement <3 x i32> poison, i32 [[LOAD0]], i32 0 +; NO-PRELOAD-NEXT: [[INS_1:%.*]] = insertelement <3 x i32> [[INS_0]], i32 [[CONV1]], i32 1 +; NO-PRELOAD-NEXT: [[INS_2:%.*]] = insertelement <3 x i32> [[INS_1]], i32 [[CONV2]], i32 2 +; NO-PRELOAD-NEXT: store <3 x i32> [[INS_2]], ptr addrspace(1) [[OUT_LOAD]], align 16 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_z_workgroup_size_z_remainder_z +; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_Z:%.*]]) #[[ATTR0]] { +; PRELOAD-NEXT: [[PRELOAD_BLOCK_COUNT_Z_WORKGROUP_SIZE_Z_REMAINDER_Z_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; PRELOAD-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 8 +; PRELOAD-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 16 +; PRELOAD-NEXT: [[GEP2:%.*]] = 
getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 22 +; PRELOAD-NEXT: [[LOAD0:%.*]] = load i32, ptr addrspace(4) [[GEP0]], align 4 +; PRELOAD-NEXT: [[LOAD1:%.*]] = load i16, ptr addrspace(4) [[GEP1]], align 2 +; PRELOAD-NEXT: [[LOAD2:%.*]] = load i16, ptr addrspace(4) [[GEP2]], align 2 +; PRELOAD-NEXT: [[CONV1:%.*]] = zext i16 [[_HIDDEN_GROUP_SIZE_Z]] to i32 +; PRELOAD-NEXT: [[CONV2:%.*]] = zext i16 [[_HIDDEN_REMAINDER_Z]] to i32 +; PRELOAD-NEXT: [[INS_0:%.*]] = insertelement <3 x i32> poison, i32 [[_HIDDEN_BLOCK_COUNT_Z]], i32 0 +; PRELOAD-NEXT: [[INS_1:%.*]] = insertelement <3 x i32> [[INS_0]], i32 [[CONV1]], i32 1 +; PRELOAD-NEXT: [[INS_2:%.*]] = insertelement <3 x i32> [[INS_1]], i32 [[CONV2]], i32 2 +; PRELOAD-NEXT: store <3 x i32> [[INS_2]], ptr addrspace(1) [[OUT]], align 16 +; PRELOAD-NEXT: ret void +; + %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %gep0 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8 + %gep1 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16 + %gep2 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22 + %load0 = load i32, ptr addrspace(4) %gep0 + %load1 = load i16, ptr addrspace(4) %gep1 + %load2 = load i16, ptr addrspace(4) %gep2 + %conv1 = zext i16 %load1 to i32 + %conv2 = zext i16 %load2 to i32 + %ins.0 = insertelement <3 x i32> poison, i32 %load0, i32 0 + %ins.1 = insertelement <3 x i32> %ins.0, i32 %conv1, i32 1 + %ins.2 = insertelement <3 x i32> %ins.1, i32 %conv2, i32 2 + store <3 x i32> %ins.2, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll index 31beb7a3cce24..7c667027bf542 100644 --- a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll @@ -1,8 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 < %s | 
FileCheck -check-prefixes=GFX940 %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90a %s -define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) inreg %out) #0 { +define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) inreg %out, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x) #0 { ; GFX940-LABEL: preload_block_count_x: ; GFX940: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -16,27 +15,13 @@ define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) inreg %out) #0 ; GFX940-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm -; -; GFX90a-LABEL: preload_block_count_x: -; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: s_branch .LBB0_0 -; GFX90a-NEXT: .p2align 8 -; GFX90a-NEXT: ; %bb.2: -; GFX90a-NEXT: .LBB0_0: -; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_mov_b32_e32 v1, s8 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] -; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() - %load = load i32, ptr addrspace(4) %imp_arg_ptr - store i32 %load, ptr addrspace(1) %out + %load = load i32, ptr addrspace(4) %imp_arg_ptr, align 4 + store i32 %_hidden_block_count_x, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @preload_unused_arg_block_count_x(ptr addrspace(1) inreg %out, i32 inreg) #0 { +define amdgpu_kernel void @preload_unused_arg_block_count_x(ptr addrspace(1) inreg %out, i32 inreg %0, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x) #0 { ; GFX940-LABEL: preload_unused_arg_block_count_x: ; GFX940: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -51,60 +36,30 @@ define amdgpu_kernel void @preload_unused_arg_block_count_x(ptr addrspace(1) inr ; GFX940-NEXT: v_mov_b32_e32 v1, s6 ; GFX940-NEXT: global_store_dword v0, 
v1, s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm -; -; GFX90a-LABEL: preload_unused_arg_block_count_x: -; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x10 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: s_branch .LBB1_0 -; GFX90a-NEXT: .p2align 8 -; GFX90a-NEXT: ; %bb.2: -; GFX90a-NEXT: .LBB1_0: -; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_mov_b32_e32 v1, s10 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] -; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() - %load = load i32, ptr addrspace(4) %imp_arg_ptr - store i32 %load, ptr addrspace(1) %out + %load = load i32, ptr addrspace(4) %imp_arg_ptr, align 4 + store i32 %_hidden_block_count_x, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) inreg %out, i256 inreg) { +define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) inreg %out, i256 inreg %0, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x) #0 { ; GFX940-LABEL: no_free_sgprs_block_count_x: ; GFX940: ; %bb.1: -; GFX940-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x8 +; GFX940-NEXT: s_load_dword s12, s[0:1], 0x28 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_branch .LBB2_0 ; GFX940-NEXT: .p2align 8 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: .LBB2_0: -; GFX940-NEXT: s_load_dword s0, s[4:5], 0x28 ; GFX940-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-NEXT: global_store_dword v0, v1, s[8:9] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v1, s12 +; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm -; -; GFX90a-LABEL: no_free_sgprs_block_count_x: -; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx4 s[12:15], s[8:9], 0x0 -; GFX90a-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: s_branch .LBB2_0 -; GFX90a-NEXT: .p2align 8 -; GFX90a-NEXT: ; %bb.2: -; GFX90a-NEXT: .LBB2_0: -; GFX90a-NEXT: s_load_dword s0, s[8:9], 0x28 -; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[12:13] -; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() - %load = load i32, ptr addrspace(4) %imp_arg_ptr - store i32 %load, ptr addrspace(1) %out + %load = load i32, ptr addrspace(4) %imp_arg_ptr, align 4 + store i32 %_hidden_block_count_x, ptr addrspace(1) %out, align 4 ret void } @@ -118,26 +73,13 @@ define amdgpu_kernel void @no_inreg_block_count_x(ptr addrspace(1) %out) #0 { ; GFX940-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm -; -; GFX90a-LABEL: no_inreg_block_count_x: -; GFX90a: ; %bb.0: -; GFX90a-NEXT: s_load_dword s2, s[4:5], 0x8 -; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-NEXT: global_store_dword v0, v1, s[0:1] -; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() - %load = load i32, ptr addrspace(4) %imp_arg_ptr - store i32 %load, ptr addrspace(1) %out + %load = load i32, ptr addrspace(4) %imp_arg_ptr, align 4 + store i32 %load, ptr addrspace(1) %out, align 4 ret void } -; Implicit arg preloading is currently restricted to cases where all explicit -; args are inreg (preloaded). 
- -define amdgpu_kernel void @mixed_inreg_block_count_x(ptr addrspace(1) %out, i32 inreg) #0 { +define amdgpu_kernel void @mixed_inreg_block_count_x(ptr addrspace(1) %out, i32 inreg %0) #0 { ; GFX940-LABEL: mixed_inreg_block_count_x: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_load_dword s4, s[0:1], 0x10 @@ -147,19 +89,9 @@ define amdgpu_kernel void @mixed_inreg_block_count_x(ptr addrspace(1) %out, i32 ; GFX940-NEXT: v_mov_b32_e32 v1, s4 ; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm -; -; GFX90a-LABEL: mixed_inreg_block_count_x: -; GFX90a: ; %bb.0: -; GFX90a-NEXT: s_load_dword s2, s[4:5], 0x10 -; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-NEXT: global_store_dword v0, v1, s[0:1] -; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() - %load = load i32, ptr addrspace(4) %imp_arg_ptr - store i32 %load, ptr addrspace(1) %out + %load = load i32, ptr addrspace(4) %imp_arg_ptr, align 4 + store i32 %load, ptr addrspace(1) %out, align 4 ret void } @@ -178,24 +110,9 @@ define amdgpu_kernel void @incorrect_type_i64_block_count_x(ptr addrspace(1) inr ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm -; -; GFX90a-LABEL: incorrect_type_i64_block_count_x: -; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: s_branch .LBB5_0 -; GFX90a-NEXT: .p2align 8 -; GFX90a-NEXT: ; %bb.2: -; GFX90a-NEXT: .LBB5_0: -; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90a-NEXT: v_mov_b32_e32 v2, 0 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] -; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() - %load = 
load i64, ptr addrspace(4) %imp_arg_ptr - store i64 %load, ptr addrspace(1) %out + %load = load i64, ptr addrspace(4) %imp_arg_ptr, align 8 + store i64 %load, ptr addrspace(1) %out, align 8 ret void } @@ -214,28 +131,13 @@ define amdgpu_kernel void @incorrect_type_i16_block_count_x(ptr addrspace(1) inr ; GFX940-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm -; -; GFX90a-LABEL: incorrect_type_i16_block_count_x: -; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: s_branch .LBB6_0 -; GFX90a-NEXT: .p2align 8 -; GFX90a-NEXT: ; %bb.2: -; GFX90a-NEXT: .LBB6_0: -; GFX90a-NEXT: s_load_dword s0, s[4:5], 0x8 -; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_short v0, v1, s[6:7] -; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() - %load = load i16, ptr addrspace(4) %imp_arg_ptr - store i16 %load, ptr addrspace(1) %out + %load = load i16, ptr addrspace(4) %imp_arg_ptr, align 2 + store i16 %load, ptr addrspace(1) %out, align 2 ret void } -define amdgpu_kernel void @preload_block_count_y(ptr addrspace(1) inreg %out) #0 { +define amdgpu_kernel void @preload_block_count_y(ptr addrspace(1) inreg %out, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_y) #0 { ; GFX940-LABEL: preload_block_count_y: ; GFX940: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -249,24 +151,10 @@ define amdgpu_kernel void @preload_block_count_y(ptr addrspace(1) inreg %out) #0 ; GFX940-NEXT: v_mov_b32_e32 v1, s5 ; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm -; -; GFX90a-LABEL: preload_block_count_y: -; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 -; GFX90a-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: s_branch .LBB7_0 -; GFX90a-NEXT: .p2align 8 -; GFX90a-NEXT: ; %bb.2: -; GFX90a-NEXT: .LBB7_0: -; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_mov_b32_e32 v1, s9 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] -; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 4 - %load = load i32, ptr addrspace(4) %gep - store i32 %load, ptr addrspace(1) %out + %load = load i32, ptr addrspace(4) %gep, align 4 + store i32 %_hidden_block_count_y, ptr addrspace(1) %out, align 4 ret void } @@ -286,30 +174,14 @@ define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out) ; GFX940-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm -; -; GFX90a-LABEL: random_incorrect_offset: -; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: s_branch .LBB8_0 -; GFX90a-NEXT: .p2align 8 -; GFX90a-NEXT: ; %bb.2: -; GFX90a-NEXT: .LBB8_0: -; GFX90a-NEXT: s_mov_b32 s0, 8 -; GFX90a-NEXT: s_load_dword s0, s[4:5], s0 offset:0x2 -; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] -; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 2 - %load = load i32, ptr addrspace(4) %gep - store i32 %load, ptr addrspace(1) %out + %load = load i32, ptr addrspace(4) %gep, align 4 + store i32 %load, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @preload_block_count_z(ptr addrspace(1) inreg %out) #0 { +define amdgpu_kernel void @preload_block_count_z(ptr addrspace(1) inreg %out, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_y, i32 inreg 
"amdgpu-hidden-argument" %_hidden_block_count_z) #0 { ; GFX940-LABEL: preload_block_count_z: ; GFX940: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -324,29 +196,14 @@ define amdgpu_kernel void @preload_block_count_z(ptr addrspace(1) inreg %out) #0 ; GFX940-NEXT: v_mov_b32_e32 v1, s6 ; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm -; -; GFX90a-LABEL: preload_block_count_z: -; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x10 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: s_branch .LBB9_0 -; GFX90a-NEXT: .p2align 8 -; GFX90a-NEXT: ; %bb.2: -; GFX90a-NEXT: .LBB9_0: -; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_mov_b32_e32 v1, s10 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] -; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8 - %load = load i32, ptr addrspace(4) %gep - store i32 %load, ptr addrspace(1) %out + %load = load i32, ptr addrspace(4) %gep, align 4 + store i32 %_hidden_block_count_z, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @preload_block_count_x_imparg_align_ptr_i8(ptr addrspace(1) inreg %out, i8 inreg %val) #0 { +define amdgpu_kernel void @preload_block_count_x_imparg_align_ptr_i8(ptr addrspace(1) inreg %out, i8 inreg %val, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x) #0 { ; GFX940-LABEL: preload_block_count_x_imparg_align_ptr_i8: ; GFX940: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -363,32 +220,15 @@ define amdgpu_kernel void @preload_block_count_x_imparg_align_ptr_i8(ptr addrspa ; GFX940-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm -; -; GFX90a-LABEL: preload_block_count_x_imparg_align_ptr_i8: -; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], 
s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x10 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: s_branch .LBB10_0 -; GFX90a-NEXT: .p2align 8 -; GFX90a-NEXT: ; %bb.2: -; GFX90a-NEXT: .LBB10_0: -; GFX90a-NEXT: s_and_b32 s0, s8, 0xff -; GFX90a-NEXT: s_add_i32 s0, s10, s0 -; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] -; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() - %load = load i32, ptr addrspace(4) %imp_arg_ptr + %load = load i32, ptr addrspace(4) %imp_arg_ptr, align 4 %ext = zext i8 %val to i32 - %add = add i32 %load, %ext - store i32 %add, ptr addrspace(1) %out + %add = add i32 %_hidden_block_count_x, %ext + store i32 %add, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @preload_block_count_xyz(ptr addrspace(1) inreg %out) #0 { +define amdgpu_kernel void @preload_block_count_xyz(ptr addrspace(1) inreg %out, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_y, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_z) #0 { ; GFX940-LABEL: preload_block_count_xyz: ; GFX940: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -405,38 +245,21 @@ define amdgpu_kernel void @preload_block_count_xyz(ptr addrspace(1) inreg %out) ; GFX940-NEXT: v_mov_b32_e32 v2, s6 ; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm -; -; GFX90a-LABEL: preload_block_count_xyz: -; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x10 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: s_branch .LBB11_0 -; GFX90a-NEXT: .p2align 8 -; GFX90a-NEXT: ; %bb.2: -; GFX90a-NEXT: .LBB11_0: -; GFX90a-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-NEXT: v_mov_b32_e32 v0, s8 -; GFX90a-NEXT: 
v_mov_b32_e32 v1, s9 -; GFX90a-NEXT: v_mov_b32_e32 v2, s10 -; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] -; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 0 - %load_x = load i32, ptr addrspace(4) %gep_x + %load_x = load i32, ptr addrspace(4) %gep_x, align 4 %gep_y = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 4 - %load_y = load i32, ptr addrspace(4) %gep_y + %load_y = load i32, ptr addrspace(4) %gep_y, align 4 %gep_z = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8 - %load_z = load i32, ptr addrspace(4) %gep_z - %ins.0 = insertelement <3 x i32> poison, i32 %load_x, i32 0 - %ins.1 = insertelement <3 x i32> %ins.0, i32 %load_y, i32 1 - %ins.2 = insertelement <3 x i32> %ins.1, i32 %load_z, i32 2 - store <3 x i32> %ins.2, ptr addrspace(1) %out + %load_z = load i32, ptr addrspace(4) %gep_z, align 4 + %ins.0 = insertelement <3 x i32> poison, i32 %_hidden_block_count_x, i32 0 + %ins.1 = insertelement <3 x i32> %ins.0, i32 %_hidden_block_count_y, i32 1 + %ins.2 = insertelement <3 x i32> %ins.1, i32 %_hidden_block_count_z, i32 2 + store <3 x i32> %ins.2, ptr addrspace(1) %out, align 16 ret void } -define amdgpu_kernel void @preload_workgroup_size_x(ptr addrspace(1) inreg %out) #0 { +define amdgpu_kernel void @preload_workgroup_size_x(ptr addrspace(1) inreg %out, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_y, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_z, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_x) #0 { ; GFX940-LABEL: preload_workgroup_size_x: ; GFX940: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -451,30 +274,15 @@ define amdgpu_kernel void @preload_workgroup_size_x(ptr addrspace(1) inreg %out) ; GFX940-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm -; -; GFX90a-LABEL: 
preload_workgroup_size_x: -; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: s_branch .LBB12_0 -; GFX90a-NEXT: .p2align 8 -; GFX90a-NEXT: ; %bb.2: -; GFX90a-NEXT: .LBB12_0: -; GFX90a-NEXT: s_and_b32 s0, s11, 0xffff -; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] -; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 12 - %load = load i16, ptr addrspace(4) %gep - %conv = zext i16 %load to i32 - store i32 %conv, ptr addrspace(1) %out + %load = load i16, ptr addrspace(4) %gep, align 2 + %conv = zext i16 %_hidden_group_size_x to i32 + store i32 %conv, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @preload_workgroup_size_y(ptr addrspace(1) inreg %out) #0 { +define amdgpu_kernel void @preload_workgroup_size_y(ptr addrspace(1) inreg %out, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_y, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_z, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_x, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_y) #0 { ; GFX940-LABEL: preload_workgroup_size_y: ; GFX940: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -489,30 +297,15 @@ define amdgpu_kernel void @preload_workgroup_size_y(ptr addrspace(1) inreg %out) ; GFX940-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm -; -; GFX90a-LABEL: preload_workgroup_size_y: -; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: s_branch .LBB13_0 -; GFX90a-NEXT: .p2align 8 -; GFX90a-NEXT: ; %bb.2: -; GFX90a-NEXT: 
.LBB13_0: -; GFX90a-NEXT: s_lshr_b32 s0, s11, 16 -; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] -; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 14 - %load = load i16, ptr addrspace(4) %gep - %conv = zext i16 %load to i32 - store i32 %conv, ptr addrspace(1) %out + %load = load i16, ptr addrspace(4) %gep, align 2 + %conv = zext i16 %_hidden_group_size_y to i32 + store i32 %conv, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @preload_workgroup_size_z(ptr addrspace(1) inreg %out) #0 { +define amdgpu_kernel void @preload_workgroup_size_z(ptr addrspace(1) inreg %out, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_y, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_z, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_x, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_y, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_z) #0 { ; GFX940-LABEL: preload_workgroup_size_z: ; GFX940: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -528,31 +321,15 @@ define amdgpu_kernel void @preload_workgroup_size_z(ptr addrspace(1) inreg %out) ; GFX940-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm -; -; GFX90a-LABEL: preload_workgroup_size_z: -; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x18 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: s_branch .LBB14_0 -; GFX90a-NEXT: .p2align 8 -; GFX90a-NEXT: ; %bb.2: -; GFX90a-NEXT: .LBB14_0: -; GFX90a-NEXT: s_and_b32 s0, s12, 0xffff -; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] -; 
GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16 - %load = load i16, ptr addrspace(4) %gep - %conv = zext i16 %load to i32 - store i32 %conv, ptr addrspace(1) %out + %load = load i16, ptr addrspace(4) %gep, align 2 + %conv = zext i16 %_hidden_group_size_z to i32 + store i32 %conv, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) inreg %out) #0 { +define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) inreg %out, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_y, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_z, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_x, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_y, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_z) #0 { ; GFX940-LABEL: preload_workgroup_size_xyz: ; GFX940: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -572,44 +349,24 @@ define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) inreg %ou ; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm -; -; GFX90a-LABEL: preload_workgroup_size_xyz: -; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x18 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: s_branch .LBB15_0 -; GFX90a-NEXT: .p2align 8 -; GFX90a-NEXT: ; %bb.2: -; GFX90a-NEXT: .LBB15_0: -; GFX90a-NEXT: s_lshr_b32 s0, s11, 16 -; GFX90a-NEXT: s_and_b32 s1, s11, 0xffff -; GFX90a-NEXT: s_and_b32 s2, s12, 0xffff -; GFX90a-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-NEXT: v_mov_b32_e32 v0, s1 -; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: v_mov_b32_e32 v2, s2 -; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] -; 
GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 12 - %load_x = load i16, ptr addrspace(4) %gep_x - %conv_x = zext i16 %load_x to i32 + %load_x = load i16, ptr addrspace(4) %gep_x, align 2 + %conv_x = zext i16 %_hidden_group_size_x to i32 %gep_y = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 14 - %load_y = load i16, ptr addrspace(4) %gep_y - %conv_y = zext i16 %load_y to i32 + %load_y = load i16, ptr addrspace(4) %gep_y, align 2 + %conv_y = zext i16 %_hidden_group_size_y to i32 %gep_z = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16 - %load_z = load i16, ptr addrspace(4) %gep_z - %conv_z = zext i16 %load_z to i32 - %ins.0 = insertelement <3 x i32> poison, i32 %conv_x, i32 0 - %ins.1 = insertelement <3 x i32> %ins.0, i32 %conv_y, i32 1 - %ins.2 = insertelement <3 x i32> %ins.1, i32 %conv_z, i32 2 - store <3 x i32> %ins.2, ptr addrspace(1) %out + %load_z = load i16, ptr addrspace(4) %gep_z, align 2 + %conv_z = zext i16 %_hidden_group_size_z to i32 + %ins.0 = insertelement <3 x i32> poison, i32 %conv_x, i32 0 + %ins.1 = insertelement <3 x i32> %ins.0, i32 %conv_y, i32 1 + %ins.2 = insertelement <3 x i32> %ins.1, i32 %conv_z, i32 2 + store <3 x i32> %ins.2, ptr addrspace(1) %out, align 16 ret void } -define amdgpu_kernel void @preload_remainder_x(ptr addrspace(1) inreg %out) #0 { +define amdgpu_kernel void @preload_remainder_x(ptr addrspace(1) inreg %out, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_y, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_z, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_x, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_y, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_z, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_x) #0 { ; GFX940-LABEL: preload_remainder_x: ; GFX940: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 
s[2:3], s[0:1], 0x0 @@ -625,31 +382,15 @@ define amdgpu_kernel void @preload_remainder_x(ptr addrspace(1) inreg %out) #0 { ; GFX940-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm -; -; GFX90a-LABEL: preload_remainder_x: -; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x18 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: s_branch .LBB16_0 -; GFX90a-NEXT: .p2align 8 -; GFX90a-NEXT: ; %bb.2: -; GFX90a-NEXT: .LBB16_0: -; GFX90a-NEXT: s_lshr_b32 s0, s12, 16 -; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] -; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 18 - %load = load i16, ptr addrspace(4) %gep - %conv = zext i16 %load to i32 - store i32 %conv, ptr addrspace(1) %out + %load = load i16, ptr addrspace(4) %gep, align 2 + %conv = zext i16 %_hidden_remainder_x to i32 + store i32 %conv, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @preloadremainder_y(ptr addrspace(1) inreg %out) #0 { +define amdgpu_kernel void @preloadremainder_y(ptr addrspace(1) inreg %out, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_y, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_z, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_x, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_y, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_z, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_x, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_y) #0 { ; GFX940-LABEL: preloadremainder_y: ; GFX940: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -665,31 +406,15 @@ define amdgpu_kernel void @preloadremainder_y(ptr 
addrspace(1) inreg %out) #0 { ; GFX940-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm -; -; GFX90a-LABEL: preloadremainder_y: -; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: s_branch .LBB17_0 -; GFX90a-NEXT: .p2align 8 -; GFX90a-NEXT: ; %bb.2: -; GFX90a-NEXT: .LBB17_0: -; GFX90a-NEXT: s_and_b32 s0, s13, 0xffff -; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] -; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 20 - %load = load i16, ptr addrspace(4) %gep - %conv = zext i16 %load to i32 - store i32 %conv, ptr addrspace(1) %out + %load = load i16, ptr addrspace(4) %gep, align 2 + %conv = zext i16 %_hidden_remainder_y to i32 + store i32 %conv, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) inreg %out) #0 { +define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) inreg %out, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_y, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_z, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_x, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_y, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_z, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_x, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_y, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_z) #0 { ; GFX940-LABEL: preloadremainder_z: ; GFX940: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -705,31 +430,15 @@ define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) inreg %out) #0 { 
; GFX940-NEXT: v_mov_b32_e32 v1, s0 ; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm -; -; GFX90a-LABEL: preloadremainder_z: -; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: s_branch .LBB18_0 -; GFX90a-NEXT: .p2align 8 -; GFX90a-NEXT: ; %bb.2: -; GFX90a-NEXT: .LBB18_0: -; GFX90a-NEXT: s_lshr_b32 s0, s13, 16 -; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] -; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22 - %load = load i16, ptr addrspace(4) %gep - %conv = zext i16 %load to i32 - store i32 %conv, ptr addrspace(1) %out + %load = load i16, ptr addrspace(4) %gep, align 2 + %conv = zext i16 %_hidden_remainder_z to i32 + store i32 %conv, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @preloadremainder_xyz(ptr addrspace(1) inreg %out) #0 { +define amdgpu_kernel void @preloadremainder_xyz(ptr addrspace(1) inreg %out, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_y, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_z, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_x, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_y, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_z, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_x, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_y, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_z) #0 { ; GFX940-LABEL: preloadremainder_xyz: ; GFX940: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -749,47 +458,29 @@ define amdgpu_kernel void @preloadremainder_xyz(ptr addrspace(1) inreg %out) #0 ; GFX940-NEXT: 
v_mov_b32_e32 v2, s0 ; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm -; -; GFX90a-LABEL: preloadremainder_xyz: -; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: s_branch .LBB19_0 -; GFX90a-NEXT: .p2align 8 -; GFX90a-NEXT: ; %bb.2: -; GFX90a-NEXT: .LBB19_0: -; GFX90a-NEXT: s_lshr_b32 s0, s13, 16 -; GFX90a-NEXT: s_lshr_b32 s1, s12, 16 -; GFX90a-NEXT: s_and_b32 s2, s13, 0xffff -; GFX90a-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-NEXT: v_mov_b32_e32 v0, s1 -; GFX90a-NEXT: v_mov_b32_e32 v1, s2 -; GFX90a-NEXT: v_mov_b32_e32 v2, s0 -; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] -; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 18 - %load_x = load i16, ptr addrspace(4) %gep_x - %conv_x = zext i16 %load_x to i32 + %load_x = load i16, ptr addrspace(4) %gep_x, align 2 + %conv_x = zext i16 %_hidden_remainder_x to i32 %gep_y = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 20 - %load_y = load i16, ptr addrspace(4) %gep_y - %conv_y = zext i16 %load_y to i32 + %load_y = load i16, ptr addrspace(4) %gep_y, align 2 + %conv_y = zext i16 %_hidden_remainder_y to i32 %gep_z = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22 - %load_z = load i16, ptr addrspace(4) %gep_z - %conv_z = zext i16 %load_z to i32 - %ins.0 = insertelement <3 x i32> poison, i32 %conv_x, i32 0 - %ins.1 = insertelement <3 x i32> %ins.0, i32 %conv_y, i32 1 - %ins.2 = insertelement <3 x i32> %ins.1, i32 %conv_z, i32 2 - store <3 x i32> %ins.2, ptr addrspace(1) %out + %load_z = load i16, ptr addrspace(4) %gep_z, align 2 + %conv_z = zext i16 %_hidden_remainder_z to i32 + %ins.0 = insertelement <3 x i32> poison, i32 %conv_x, i32 0 + %ins.1 = insertelement <3 x i32> %ins.0, i32 %conv_y, 
i32 1 + %ins.2 = insertelement <3 x i32> %ins.1, i32 %conv_z, i32 2 + store <3 x i32> %ins.2, ptr addrspace(1) %out, align 16 ret void } -define amdgpu_kernel void @no_free_sgprs_preloadremainder_z(ptr addrspace(1) inreg %out) { +define amdgpu_kernel void @no_free_sgprs_preloadremainder_z(ptr addrspace(1) inreg %out, i128 inreg, i64 inreg, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_y, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_z, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_x, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_y, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_z, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_x, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_y, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_z) #0 { ; GFX940-LABEL: no_free_sgprs_preloadremainder_z: ; GFX940: ; %bb.1: -; GFX940-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX940-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x8 +; GFX940-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x28 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_branch .LBB20_0 ; GFX940-NEXT: .p2align 8 @@ -798,74 +489,41 @@ define amdgpu_kernel void @no_free_sgprs_preloadremainder_z(ptr addrspace(1) inr ; GFX940-NEXT: s_lshr_b32 s0, s15, 16 ; GFX940-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NEXT: v_mov_b32_e32 v1, s0 -; GFX940-NEXT: global_store_dword v0, v1, s[8:9] sc0 sc1 +; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm -; -; GFX90a-LABEL: no_free_sgprs_preloadremainder_z: -; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: s_branch .LBB20_0 -; GFX90a-NEXT: .p2align 8 -; GFX90a-NEXT: ; %bb.2: -; GFX90a-NEXT: .LBB20_0: -; GFX90a-NEXT: s_load_dword s0, s[8:9], 0x1c -; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX90a-NEXT: s_lshr_b32 s0, s0, 16 -; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[12:13] -; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22 - %load = load i16, ptr addrspace(4) %gep - %conv = zext i16 %load to i32 - store i32 %conv, ptr addrspace(1) %out + %load = load i16, ptr addrspace(4) %gep, align 2 + %conv = zext i16 %_hidden_remainder_z to i32 + store i32 %conv, ptr addrspace(1) %out, align 4 ret void } -; Check for consistency between isel and earlier passes preload SGPR accounting with max preload SGPRs. +; This should use s15 for the hidden argument. -define amdgpu_kernel void @preload_block_max_user_sgprs(ptr addrspace(1) inreg %out, i192 inreg %t0, i32 inreg %t1) #0 { -; GFX940-LABEL: preload_block_max_user_sgprs: +define amdgpu_kernel void @preload_block_y_max_user_sgprs(ptr addrspace(1) inreg %out, i256 inreg, i64 inreg, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_y) #0 { +; GFX940-LABEL: preload_block_y_max_user_sgprs: ; GFX940: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX940-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x8 -; GFX940-NEXT: s_load_dword s12, s[0:1], 0x28 +; GFX940-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x28 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: s_branch .LBB21_0 ; GFX940-NEXT: .p2align 8 ; GFX940-NEXT: ; %bb.2: ; GFX940-NEXT: .LBB21_0: ; GFX940-NEXT: v_mov_b32_e32 v0, 0 -; GFX940-NEXT: v_mov_b32_e32 v1, s12 +; GFX940-NEXT: v_mov_b32_e32 v1, s15 ; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm -; -; GFX90a-LABEL: preload_block_max_user_sgprs: -; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18 -; GFX90a-NEXT: s_load_dword s14, s[4:5], 0x20 -; 
GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: s_branch .LBB21_0 -; GFX90a-NEXT: .p2align 8 -; GFX90a-NEXT: ; %bb.2: -; GFX90a-NEXT: .LBB21_0: -; GFX90a-NEXT: s_load_dword s0, s[4:5], 0x28 -; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: v_mov_b32_e32 v1, s0 -; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] -; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() - %load = load i32, ptr addrspace(4) %imp_arg_ptr - store i32 %load, ptr addrspace(1) %out + %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 4 + %load = load i32, ptr addrspace(4) %gep, align 4 + store i32 %_hidden_block_count_y, ptr addrspace(1) %out, align 4 ret void } -define amdgpu_kernel void @preload_block_count_z_workgroup_size_z_remainder_z(ptr addrspace(1) inreg %out) #0 { +define amdgpu_kernel void @preload_block_count_z_workgroup_size_z_remainder_z(ptr addrspace(1) inreg %out, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_y, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_z, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_x, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_y, i16 inreg "amdgpu-hidden-argument" %_hidden_group_size_z, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_x, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_y, i16 inreg "amdgpu-hidden-argument" %_hidden_remainder_z) #0 { ; GFX940-LABEL: preload_block_count_z_workgroup_size_z_remainder_z: ; GFX940: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 @@ -884,39 +542,20 @@ define amdgpu_kernel void @preload_block_count_z_workgroup_size_z_remainder_z(pt ; GFX940-NEXT: v_mov_b32_e32 v2, s0 ; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 ; GFX940-NEXT: s_endpgm -; -; GFX90a-LABEL: preload_block_count_z_workgroup_size_z_remainder_z: -; GFX90a: ; %bb.1: -; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 -; GFX90a-NEXT: 
s_load_dwordx4 s[8:11], s[4:5], 0x8 -; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18 -; GFX90a-NEXT: s_waitcnt lgkmcnt(0) -; GFX90a-NEXT: s_branch .LBB22_0 -; GFX90a-NEXT: .p2align 8 -; GFX90a-NEXT: ; %bb.2: -; GFX90a-NEXT: .LBB22_0: -; GFX90a-NEXT: s_lshr_b32 s0, s13, 16 -; GFX90a-NEXT: s_and_b32 s1, s12, 0xffff -; GFX90a-NEXT: v_mov_b32_e32 v3, 0 -; GFX90a-NEXT: v_mov_b32_e32 v0, s10 -; GFX90a-NEXT: v_mov_b32_e32 v1, s1 -; GFX90a-NEXT: v_mov_b32_e32 v2, s0 -; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] -; GFX90a-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep0 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8 %gep1 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16 %gep2 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22 - %load0 = load i32, ptr addrspace(4) %gep0 - %load1 = load i16, ptr addrspace(4) %gep1 - %load2 = load i16, ptr addrspace(4) %gep2 - %conv1 = zext i16 %load1 to i32 - %conv2 = zext i16 %load2 to i32 - %ins.0 = insertelement <3 x i32> poison, i32 %load0, i32 0 - %ins.1 = insertelement <3 x i32> %ins.0, i32 %conv1, i32 1 - %ins.2 = insertelement <3 x i32> %ins.1, i32 %conv2, i32 2 - store <3 x i32> %ins.2, ptr addrspace(1) %out + %load0 = load i32, ptr addrspace(4) %gep0, align 4 + %load1 = load i16, ptr addrspace(4) %gep1, align 2 + %load2 = load i16, ptr addrspace(4) %gep2, align 2 + %conv1 = zext i16 %_hidden_group_size_z to i32 + %conv2 = zext i16 %_hidden_remainder_z to i32 + %ins.0 = insertelement <3 x i32> poison, i32 %_hidden_block_count_z, i32 0 + %ins.1 = insertelement <3 x i32> %ins.0, i32 %conv1, i32 1 + %ins.2 = insertelement <3 x i32> %ins.1, i32 %conv2, i32 2 + store <3 x i32> %ins.2, ptr addrspace(1) %out, align 16 ret void } -attributes #0 = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" 
"amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +attributes #0 = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx940" "uniform-work-group-size"="false" } diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs-IR-lowering.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs-IR-lowering.ll index ab0fb7584d50c..658ef33f74935 100644 --- a/llvm/test/CodeGen/AMDGPU/preload-kernargs-IR-lowering.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs-IR-lowering.ll @@ -1,60 +1,50 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature ; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -S < %s | FileCheck -check-prefix=NO-PRELOAD %s -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -amdgpu-kernarg-preload-count=1 -S < %s | FileCheck -check-prefix=PRELOAD-1 %s -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -amdgpu-kernarg-preload-count=3 -S < %s | FileCheck -check-prefix=PRELOAD-3 %s -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -amdgpu-kernarg-preload-count=8 -S < %s | FileCheck -check-prefix=PRELOAD-8 %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments 
-amdgpu-kernarg-preload-count=2 -S < %s | FileCheck -check-prefix=PRELOAD-2 %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -amdgpu-kernarg-preload-count=100 -S < %s | FileCheck -check-prefix=PRELOAD-ALL %s -define amdgpu_kernel void @test_preload_IR_lowering_kernel_2(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 { -; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_2 +define amdgpu_kernel void @ptr1_ptr1_kernel(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_ptr1_kernel ; NO-PRELOAD-SAME: (ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { -; NO-PRELOAD-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_2_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[PTR1_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 0 ; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0:![0-9]+]] -; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_2_KERNARG_SEGMENT]], i64 8 +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 8 ; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] ; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 ; NO-PRELOAD-NEXT: store i32 
[[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 ; NO-PRELOAD-NEXT: ret void ; -; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_2 -; PRELOAD-1-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { -; PRELOAD-1-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-1-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_2_KERNARG_SEGMENT]], i64 8 -; PRELOAD-1-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load [[META0:![0-9]+]] -; PRELOAD-1-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 -; PRELOAD-1-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 -; PRELOAD-1-NEXT: ret void -; -; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_2 -; PRELOAD-3-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { -; PRELOAD-3-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-3-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 -; PRELOAD-3-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 -; PRELOAD-3-NEXT: ret void -; -; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_2 -; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { -; PRELOAD-8-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 -; PRELOAD-8-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 -; PRELOAD-8-NEXT: ret void +; 
PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_ptr1_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { +; PRELOAD-2-NEXT: [[PTR1_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-2-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 +; PRELOAD-2-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_ptr1_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { +; PRELOAD-ALL-NEXT: [[PTR1_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-ALL-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 +; PRELOAD-ALL-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-ALL-NEXT: ret void ; %load = load i32, ptr addrspace(1) %in store i32 %load, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @test_preload_IR_lowering_kernel_4(ptr addrspace(1) %in, ptr addrspace(1) %in1, ptr addrspace(1) %out, ptr addrspace(1) %out1) #0 { -; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4 +define amdgpu_kernel void @ptr1_ptr1_ptr1_ptr1_kernel(ptr addrspace(1) %in, ptr addrspace(1) %in1, ptr addrspace(1) %out, ptr addrspace(1) %out1) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_ptr1_ptr1_ptr1_kernel ; NO-PRELOAD-SAME: (ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] { -; NO-PRELOAD-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) 
[[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 0 ; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] -; NO-PRELOAD-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT]], i64 8 +; NO-PRELOAD-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 8 ; NO-PRELOAD-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT]], i64 16 +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 16 ; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] -; NO-PRELOAD-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT]], i64 24 +; NO-PRELOAD-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 24 ; NO-PRELOAD-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] ; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 ; NO-PRELOAD-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 
@@ -62,40 +52,27 @@ define amdgpu_kernel void @test_preload_IR_lowering_kernel_4(ptr addrspace(1) %i ; NO-PRELOAD-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 ; NO-PRELOAD-NEXT: ret void ; -; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4 -; PRELOAD-1-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] { -; PRELOAD-1-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-1-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT]], i64 8 -; PRELOAD-1-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; PRELOAD-1-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT]], i64 16 -; PRELOAD-1-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] -; PRELOAD-1-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT]], i64 24 -; PRELOAD-1-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; PRELOAD-1-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 -; PRELOAD-1-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 -; PRELOAD-1-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 -; PRELOAD-1-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 -; PRELOAD-1-NEXT: ret void -; -; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4 -; PRELOAD-3-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg 
[[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] { -; PRELOAD-3-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-3-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT]], i64 24 -; PRELOAD-3-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0:![0-9]+]] -; PRELOAD-3-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 -; PRELOAD-3-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 -; PRELOAD-3-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 -; PRELOAD-3-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 -; PRELOAD-3-NEXT: ret void -; -; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4 -; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] { -; PRELOAD-8-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 -; PRELOAD-8-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 -; PRELOAD-8-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 -; PRELOAD-8-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4 -; PRELOAD-8-NEXT: ret void +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_ptr1_ptr1_ptr1_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: [[PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 
dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-2-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 16 +; PRELOAD-2-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0:![0-9]+]] +; PRELOAD-2-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 24 +; PRELOAD-2-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] +; PRELOAD-2-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 +; PRELOAD-2-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 +; PRELOAD-2-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; PRELOAD-2-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_ptr1_ptr1_ptr1_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: [[PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-ALL-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 +; PRELOAD-ALL-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 +; PRELOAD-ALL-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-ALL-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4 +; PRELOAD-ALL-NEXT: ret void ; %load = load i32, ptr addrspace(1) %in %load1 = load i32, ptr addrspace(1) %in1 @@ -104,25 +81,25 @@ define amdgpu_kernel void @test_preload_IR_lowering_kernel_4(ptr addrspace(1) %i ret void } -define amdgpu_kernel void 
@test_preload_IR_lowering_kernel_8(ptr addrspace(1) %in, ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %in3, ptr addrspace(1) %out, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) #0 { -; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_8 +define amdgpu_kernel void @ptr1_ptr1_ptr1_ptr1_ptr1_ptr1_ptr1_ptr1_kernel(ptr addrspace(1) %in, ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %in3, ptr addrspace(1) %out, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_ptr1_ptr1_ptr1_ptr1_ptr1_ptr1_ptr1_kernel ; NO-PRELOAD-SAME: (ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[IN2:%.*]], ptr addrspace(1) [[IN3:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]], ptr addrspace(1) [[OUT2:%.*]], ptr addrspace(1) [[OUT3:%.*]]) #[[ATTR0]] { -; NO-PRELOAD-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 0 ; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] -; NO-PRELOAD-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 8 +; NO-PRELOAD-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) 
[[PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 8 ; NO-PRELOAD-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; NO-PRELOAD-NEXT: [[IN2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 16 +; NO-PRELOAD-NEXT: [[IN2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 16 ; NO-PRELOAD-NEXT: [[IN2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] -; NO-PRELOAD-NEXT: [[IN3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 24 +; NO-PRELOAD-NEXT: [[IN3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 24 ; NO-PRELOAD-NEXT: [[IN3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN3_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 32 +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 32 ; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] -; NO-PRELOAD-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 40 +; NO-PRELOAD-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 40 ; NO-PRELOAD-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) 
[[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; NO-PRELOAD-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 48 +; NO-PRELOAD-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 48 ; NO-PRELOAD-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] -; NO-PRELOAD-NEXT: [[OUT3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 56 +; NO-PRELOAD-NEXT: [[OUT3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 56 ; NO-PRELOAD-NEXT: [[OUT3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT3_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] ; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 ; NO-PRELOAD-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 @@ -134,70 +111,45 @@ define amdgpu_kernel void @test_preload_IR_lowering_kernel_8(ptr addrspace(1) %i ; NO-PRELOAD-NEXT: store i32 [[LOAD3]], ptr addrspace(1) [[OUT3_LOAD]], align 4 ; NO-PRELOAD-NEXT: ret void ; -; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_8 -; PRELOAD-1-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[IN2:%.*]], ptr addrspace(1) [[IN3:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]], ptr addrspace(1) [[OUT2:%.*]], ptr addrspace(1) [[OUT3:%.*]]) #[[ATTR0]] { -; PRELOAD-1-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-1-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) 
[[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 8 -; PRELOAD-1-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; PRELOAD-1-NEXT: [[IN2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 16 -; PRELOAD-1-NEXT: [[IN2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] -; PRELOAD-1-NEXT: [[IN3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 24 -; PRELOAD-1-NEXT: [[IN3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN3_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; PRELOAD-1-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 32 -; PRELOAD-1-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] -; PRELOAD-1-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 40 -; PRELOAD-1-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; PRELOAD-1-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 48 -; PRELOAD-1-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] -; PRELOAD-1-NEXT: [[OUT3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 56 -; PRELOAD-1-NEXT: [[OUT3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT3_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; PRELOAD-1-NEXT: [[LOAD:%.*]] = load i32, 
ptr addrspace(1) [[IN]], align 4 -; PRELOAD-1-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 -; PRELOAD-1-NEXT: [[LOAD2:%.*]] = load i32, ptr addrspace(1) [[IN2_LOAD]], align 4 -; PRELOAD-1-NEXT: [[LOAD3:%.*]] = load i32, ptr addrspace(1) [[IN3_LOAD]], align 4 -; PRELOAD-1-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 -; PRELOAD-1-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 -; PRELOAD-1-NEXT: store i32 [[LOAD2]], ptr addrspace(1) [[OUT2_LOAD]], align 4 -; PRELOAD-1-NEXT: store i32 [[LOAD3]], ptr addrspace(1) [[OUT3_LOAD]], align 4 -; PRELOAD-1-NEXT: ret void -; -; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_8 -; PRELOAD-3-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[IN2:%.*]], ptr addrspace(1) [[IN3:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]], ptr addrspace(1) [[OUT2:%.*]], ptr addrspace(1) [[OUT3:%.*]]) #[[ATTR0]] { -; PRELOAD-3-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-3-NEXT: [[IN3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 24 -; PRELOAD-3-NEXT: [[IN3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN3_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; PRELOAD-3-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 32 -; PRELOAD-3-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] -; PRELOAD-3-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 40 -; PRELOAD-3-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) 
[[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; PRELOAD-3-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 48 -; PRELOAD-3-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] -; PRELOAD-3-NEXT: [[OUT3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 56 -; PRELOAD-3-NEXT: [[OUT3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT3_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; PRELOAD-3-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 -; PRELOAD-3-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 -; PRELOAD-3-NEXT: [[LOAD2:%.*]] = load i32, ptr addrspace(1) [[IN2]], align 4 -; PRELOAD-3-NEXT: [[LOAD3:%.*]] = load i32, ptr addrspace(1) [[IN3_LOAD]], align 4 -; PRELOAD-3-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 -; PRELOAD-3-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 -; PRELOAD-3-NEXT: store i32 [[LOAD2]], ptr addrspace(1) [[OUT2_LOAD]], align 4 -; PRELOAD-3-NEXT: store i32 [[LOAD3]], ptr addrspace(1) [[OUT3_LOAD]], align 4 -; PRELOAD-3-NEXT: ret void -; -; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_8 -; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[IN2:%.*]], ptr addrspace(1) inreg [[IN3:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]], ptr addrspace(1) inreg [[OUT2:%.*]], ptr addrspace(1) inreg [[OUT3:%.*]]) #[[ATTR0]] { -; PRELOAD-8-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-8-NEXT: [[OUT3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) 
[[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 56 -; PRELOAD-8-NEXT: [[OUT3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT3_KERNARG_OFFSET]], align 8, !invariant.load [[META0:![0-9]+]] -; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 -; PRELOAD-8-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 -; PRELOAD-8-NEXT: [[LOAD2:%.*]] = load i32, ptr addrspace(1) [[IN2]], align 4 -; PRELOAD-8-NEXT: [[LOAD3:%.*]] = load i32, ptr addrspace(1) [[IN3]], align 4 -; PRELOAD-8-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 -; PRELOAD-8-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4 -; PRELOAD-8-NEXT: store i32 [[LOAD2]], ptr addrspace(1) [[OUT2]], align 4 -; PRELOAD-8-NEXT: store i32 [[LOAD3]], ptr addrspace(1) [[OUT3_LOAD]], align 4 -; PRELOAD-8-NEXT: ret void +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_ptr1_ptr1_ptr1_ptr1_ptr1_ptr1_ptr1_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) [[IN2:%.*]], ptr addrspace(1) [[IN3:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]], ptr addrspace(1) [[OUT2:%.*]], ptr addrspace(1) [[OUT3:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: [[PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-2-NEXT: [[IN2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 16 +; PRELOAD-2-NEXT: [[IN2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; PRELOAD-2-NEXT: [[IN3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 24 +; PRELOAD-2-NEXT: [[IN3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN3_KERNARG_OFFSET]], align 8, 
!invariant.load [[META0]] +; PRELOAD-2-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 32 +; PRELOAD-2-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; PRELOAD-2-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 40 +; PRELOAD-2-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] +; PRELOAD-2-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 48 +; PRELOAD-2-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; PRELOAD-2-NEXT: [[OUT3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 56 +; PRELOAD-2-NEXT: [[OUT3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT3_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] +; PRELOAD-2-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 +; PRELOAD-2-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 +; PRELOAD-2-NEXT: [[LOAD2:%.*]] = load i32, ptr addrspace(1) [[IN2_LOAD]], align 4 +; PRELOAD-2-NEXT: [[LOAD3:%.*]] = load i32, ptr addrspace(1) [[IN3_LOAD]], align 4 +; PRELOAD-2-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; PRELOAD-2-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 +; PRELOAD-2-NEXT: store i32 [[LOAD2]], ptr addrspace(1) [[OUT2_LOAD]], align 4 +; PRELOAD-2-NEXT: store i32 [[LOAD3]], ptr addrspace(1) [[OUT3_LOAD]], align 4 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define 
{{[^@]+}}@ptr1_ptr1_ptr1_ptr1_ptr1_ptr1_ptr1_ptr1_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[IN2:%.*]], ptr addrspace(1) inreg [[IN3:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]], ptr addrspace(1) inreg [[OUT2:%.*]], ptr addrspace(1) inreg [[OUT3:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: [[PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-ALL-NEXT: [[OUT3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_PTR1_KERNEL_KERNARG_SEGMENT]], i64 56 +; PRELOAD-ALL-NEXT: [[OUT3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT3_KERNARG_OFFSET]], align 8, !invariant.load [[META0:![0-9]+]] +; PRELOAD-ALL-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 +; PRELOAD-ALL-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 +; PRELOAD-ALL-NEXT: [[LOAD2:%.*]] = load i32, ptr addrspace(1) [[IN2]], align 4 +; PRELOAD-ALL-NEXT: [[LOAD3:%.*]] = load i32, ptr addrspace(1) [[IN3]], align 4 +; PRELOAD-ALL-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-ALL-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4 +; PRELOAD-ALL-NEXT: store i32 [[LOAD2]], ptr addrspace(1) [[OUT2]], align 4 +; PRELOAD-ALL-NEXT: store i32 [[LOAD3]], ptr addrspace(1) [[OUT3_LOAD]], align 4 +; PRELOAD-ALL-NEXT: ret void ; %load = load i32, ptr addrspace(1) %in %load1 = load i32, ptr addrspace(1) %in1 @@ -210,19 +162,17 @@ define amdgpu_kernel void @test_preload_IR_lowering_kernel_8(ptr addrspace(1) %i ret void } -; Preload args with inreg in the NO-PRELOAD case. 
- -define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_inreg_offset(ptr addrspace(1) %in, ptr addrspace(1) %in1, ptr addrspace(1) inreg %out, ptr addrspace(1) inreg %out1) #0 { -; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset -; NO-PRELOAD-SAME: (ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] { -; NO-PRELOAD-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 0 +define amdgpu_kernel void @ptr1_ptr1_ptr1_ptr1_inreg_offset_kernel(ptr addrspace(1) %in, ptr addrspace(1) %in1, ptr addrspace(1) %out, ptr addrspace(1) %out1) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_ptr1_ptr1_ptr1_inreg_offset_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[PTR1_PTR1_PTR1_PTR1_INREG_OFFSET_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_INREG_OFFSET_KERNEL_KERNARG_SEGMENT]], i64 0 ; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] -; NO-PRELOAD-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 8 +; NO-PRELOAD-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_INREG_OFFSET_KERNEL_KERNARG_SEGMENT]], i64 8 ; 
NO-PRELOAD-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 16 +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_INREG_OFFSET_KERNEL_KERNARG_SEGMENT]], i64 16 ; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] -; NO-PRELOAD-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 24 +; NO-PRELOAD-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_INREG_OFFSET_KERNEL_KERNARG_SEGMENT]], i64 24 ; NO-PRELOAD-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] ; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 ; NO-PRELOAD-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 @@ -230,38 +180,27 @@ define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_inreg_offset(ptr ad ; NO-PRELOAD-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 ; NO-PRELOAD-NEXT: ret void ; -; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset -; PRELOAD-1-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] { -; PRELOAD-1-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-1-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) 
[[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 8 -; PRELOAD-1-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; PRELOAD-1-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 16 -; PRELOAD-1-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] -; PRELOAD-1-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 24 -; PRELOAD-1-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; PRELOAD-1-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 -; PRELOAD-1-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 -; PRELOAD-1-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 -; PRELOAD-1-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 -; PRELOAD-1-NEXT: ret void -; -; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset -; PRELOAD-3-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] { -; PRELOAD-3-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-3-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 -; PRELOAD-3-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 -; PRELOAD-3-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 -; PRELOAD-3-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4 -; PRELOAD-3-NEXT: ret void -; -; PRELOAD-8-LABEL: define 
{{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset -; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] { -; PRELOAD-8-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 -; PRELOAD-8-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 -; PRELOAD-8-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 -; PRELOAD-8-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4 -; PRELOAD-8-NEXT: ret void +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_ptr1_ptr1_ptr1_inreg_offset_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: [[PTR1_PTR1_PTR1_PTR1_INREG_OFFSET_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-2-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_INREG_OFFSET_KERNEL_KERNARG_SEGMENT]], i64 16 +; PRELOAD-2-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; PRELOAD-2-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_INREG_OFFSET_KERNEL_KERNARG_SEGMENT]], i64 24 +; PRELOAD-2-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] +; PRELOAD-2-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 +; PRELOAD-2-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 +; PRELOAD-2-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], 
align 4 +; PRELOAD-2-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_ptr1_ptr1_ptr1_inreg_offset_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: [[PTR1_PTR1_PTR1_PTR1_INREG_OFFSET_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-ALL-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 +; PRELOAD-ALL-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 +; PRELOAD-ALL-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-ALL-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4 +; PRELOAD-ALL-NEXT: ret void ; %load = load i32, ptr addrspace(1) %in %load1 = load i32, ptr addrspace(1) %in1 @@ -270,56 +209,45 @@ define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_inreg_offset(ptr ad ret void } -; Only preload the first sequence of arguments with the inreg attribute. In the NO-PRELOAD case this is just the first argument. 
- -define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_inreg_offset_two_sequence(ptr addrspace(1) inreg %in, ptr addrspace(1) %in1, ptr addrspace(1) inreg %out, ptr addrspace(1) inreg %out1) #0 { -; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset_two_sequence -; NO-PRELOAD-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] { -; NO-PRELOAD-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; NO-PRELOAD-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 8 +define amdgpu_kernel void @ptr1_ptr1_ptr1_ptr1_inreg_offset_two_sequence_kernel(ptr addrspace(1) %in, ptr addrspace(1) %in1, ptr addrspace(1) %out, ptr addrspace(1) %out1) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_ptr1_ptr1_ptr1_inreg_offset_two_sequence_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[PTR1_PTR1_PTR1_PTR1_INREG_OFFSET_TWO_SEQUENCE_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_INREG_OFFSET_TWO_SEQUENCE_KERNEL_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_INREG_OFFSET_TWO_SEQUENCE_KERNEL_KERNARG_SEGMENT]], i64 8 ; NO-PRELOAD-NEXT: [[IN1_LOAD:%.*]] = load ptr 
addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 16 +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_INREG_OFFSET_TWO_SEQUENCE_KERNEL_KERNARG_SEGMENT]], i64 16 ; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] -; NO-PRELOAD-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 24 +; NO-PRELOAD-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_INREG_OFFSET_TWO_SEQUENCE_KERNEL_KERNARG_SEGMENT]], i64 24 ; NO-PRELOAD-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 +; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 ; NO-PRELOAD-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 ; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 ; NO-PRELOAD-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 ; NO-PRELOAD-NEXT: ret void ; -; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset_two_sequence -; PRELOAD-1-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] { -; PRELOAD-1-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-1-NEXT: 
[[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 8 -; PRELOAD-1-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; PRELOAD-1-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 16 -; PRELOAD-1-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] -; PRELOAD-1-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 24 -; PRELOAD-1-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; PRELOAD-1-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 -; PRELOAD-1-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 -; PRELOAD-1-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 -; PRELOAD-1-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 -; PRELOAD-1-NEXT: ret void -; -; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset_two_sequence -; PRELOAD-3-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] { -; PRELOAD-3-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-3-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 -; PRELOAD-3-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 -; PRELOAD-3-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 -; 
PRELOAD-3-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4 -; PRELOAD-3-NEXT: ret void -; -; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset_two_sequence -; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] { -; PRELOAD-8-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 -; PRELOAD-8-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 -; PRELOAD-8-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 -; PRELOAD-8-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4 -; PRELOAD-8-NEXT: ret void +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_ptr1_ptr1_ptr1_inreg_offset_two_sequence_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: [[PTR1_PTR1_PTR1_PTR1_INREG_OFFSET_TWO_SEQUENCE_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-2-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_INREG_OFFSET_TWO_SEQUENCE_KERNEL_KERNARG_SEGMENT]], i64 16 +; PRELOAD-2-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; PRELOAD-2-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_PTR1_PTR1_PTR1_INREG_OFFSET_TWO_SEQUENCE_KERNEL_KERNARG_SEGMENT]], i64 24 +; PRELOAD-2-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] +; PRELOAD-2-NEXT: 
[[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 +; PRELOAD-2-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 +; PRELOAD-2-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; PRELOAD-2-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_ptr1_ptr1_ptr1_inreg_offset_two_sequence_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: [[PTR1_PTR1_PTR1_PTR1_INREG_OFFSET_TWO_SEQUENCE_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-ALL-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 +; PRELOAD-ALL-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 +; PRELOAD-ALL-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-ALL-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4 +; PRELOAD-ALL-NEXT: ret void ; %load = load i32, ptr addrspace(1) %in %load1 = load i32, ptr addrspace(1) %in1 @@ -328,20 +256,20 @@ define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_inreg_offset_two_se ret void } -define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_misaligned(i16 %arg0, ptr addrspace(1) %in, ptr addrspace(1) %in1, ptr addrspace(1) %out, ptr addrspace(1) %out1) #0 { -; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_misaligned +define amdgpu_kernel void @i16_ptr1_ptr1_ptr1_ptr1_misaligned_kernel(i16 %arg0, ptr addrspace(1) %in, ptr addrspace(1) %in1, ptr addrspace(1) %out, ptr addrspace(1) %out1) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@i16_ptr1_ptr1_ptr1_ptr1_misaligned_kernel ; NO-PRELOAD-SAME: (i16 [[ARG0:%.*]], ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) 
[[OUT1:%.*]]) #[[ATTR0]] { -; NO-PRELOAD-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; NO-PRELOAD-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[I16_PTR1_PTR1_PTR1_PTR1_MISALIGNED_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I16_PTR1_PTR1_PTR1_PTR1_MISALIGNED_KERNEL_KERNARG_SEGMENT]], i64 0 ; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META0]] ; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 -; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 8 +; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I16_PTR1_PTR1_PTR1_PTR1_MISALIGNED_KERNEL_KERNARG_SEGMENT]], i64 8 ; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; NO-PRELOAD-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 16 +; NO-PRELOAD-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I16_PTR1_PTR1_PTR1_PTR1_MISALIGNED_KERNEL_KERNARG_SEGMENT]], i64 16 ; NO-PRELOAD-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] -; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) 
[[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 24 +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I16_PTR1_PTR1_PTR1_PTR1_MISALIGNED_KERNEL_KERNARG_SEGMENT]], i64 24 ; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; NO-PRELOAD-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 32 +; NO-PRELOAD-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I16_PTR1_PTR1_PTR1_PTR1_MISALIGNED_KERNEL_KERNARG_SEGMENT]], i64 32 ; NO-PRELOAD-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] ; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 ; NO-PRELOAD-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 @@ -351,50 +279,33 @@ define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_misaligned(i16 %arg ; NO-PRELOAD-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 ; NO-PRELOAD-NEXT: ret void ; -; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_misaligned -; PRELOAD-1-SAME: (i16 inreg [[ARG0:%.*]], ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] { -; PRELOAD-1-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-1-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 8 -; PRELOAD-1-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; PRELOAD-1-NEXT: 
[[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 16 -; PRELOAD-1-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] -; PRELOAD-1-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 24 -; PRELOAD-1-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; PRELOAD-1-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 32 -; PRELOAD-1-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] -; PRELOAD-1-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 -; PRELOAD-1-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 -; PRELOAD-1-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32 -; PRELOAD-1-NEXT: [[ADD:%.*]] = add i32 [[LOAD]], [[EXT]] -; PRELOAD-1-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4 -; PRELOAD-1-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 -; PRELOAD-1-NEXT: ret void -; -; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_misaligned -; PRELOAD-3-SAME: (i16 inreg [[ARG0:%.*]], ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] { -; PRELOAD-3-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-3-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 24 -; 
PRELOAD-3-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; PRELOAD-3-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 32 -; PRELOAD-3-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] -; PRELOAD-3-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 -; PRELOAD-3-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 -; PRELOAD-3-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32 -; PRELOAD-3-NEXT: [[ADD:%.*]] = add i32 [[LOAD]], [[EXT]] -; PRELOAD-3-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4 -; PRELOAD-3-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 -; PRELOAD-3-NEXT: ret void -; -; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_misaligned -; PRELOAD-8-SAME: (i16 inreg [[ARG0:%.*]], ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] { -; PRELOAD-8-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 -; PRELOAD-8-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 -; PRELOAD-8-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32 -; PRELOAD-8-NEXT: [[ADD:%.*]] = add i32 [[LOAD]], [[EXT]] -; PRELOAD-8-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4 -; PRELOAD-8-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4 -; PRELOAD-8-NEXT: ret void +; PRELOAD-2-LABEL: define {{[^@]+}}@i16_ptr1_ptr1_ptr1_ptr1_misaligned_kernel +; PRELOAD-2-SAME: (i16 inreg [[ARG0:%.*]], ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) 
[[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: [[I16_PTR1_PTR1_PTR1_PTR1_MISALIGNED_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-2-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I16_PTR1_PTR1_PTR1_PTR1_MISALIGNED_KERNEL_KERNARG_SEGMENT]], i64 16 +; PRELOAD-2-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; PRELOAD-2-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I16_PTR1_PTR1_PTR1_PTR1_MISALIGNED_KERNEL_KERNARG_SEGMENT]], i64 24 +; PRELOAD-2-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] +; PRELOAD-2-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I16_PTR1_PTR1_PTR1_PTR1_MISALIGNED_KERNEL_KERNARG_SEGMENT]], i64 32 +; PRELOAD-2-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; PRELOAD-2-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 +; PRELOAD-2-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 +; PRELOAD-2-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32 +; PRELOAD-2-NEXT: [[ADD:%.*]] = add i32 [[LOAD]], [[EXT]] +; PRELOAD-2-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; PRELOAD-2-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@i16_ptr1_ptr1_ptr1_ptr1_misaligned_kernel +; PRELOAD-ALL-SAME: (i16 inreg [[ARG0:%.*]], ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: 
[[I16_PTR1_PTR1_PTR1_PTR1_MISALIGNED_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-ALL-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4 +; PRELOAD-ALL-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4 +; PRELOAD-ALL-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32 +; PRELOAD-ALL-NEXT: [[ADD:%.*]] = add i32 [[LOAD]], [[EXT]] +; PRELOAD-ALL-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-ALL-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4 +; PRELOAD-ALL-NEXT: ret void ; %load = load i32, ptr addrspace(1) %in %load1 = load i32, ptr addrspace(1) %in1 @@ -405,20 +316,18 @@ define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_misaligned(i16 %arg ret void } -; In this case both i16 args with be preloaded into the first SGPR. - -define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_i16_i16(i16 %arg0, i16 %arg1, ptr addrspace(1) %out) #0 { -; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_i16_i16 +define amdgpu_kernel void @i16_i16_ptr1_kernel(i16 %arg0, i16 %arg1, ptr addrspace(1) %out) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@i16_i16_ptr1_kernel ; NO-PRELOAD-SAME: (i16 [[ARG0:%.*]], i16 [[ARG1:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { -; NO-PRELOAD-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; NO-PRELOAD-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[I16_I16_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) 
[[I16_I16_PTR1_KERNEL_KERNARG_SEGMENT]], i64 0 ; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META0]] ; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 -; NO-PRELOAD-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I16_I16_PTR1_KERNEL_KERNARG_SEGMENT]], i64 0 ; NO-PRELOAD-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META0]] ; NO-PRELOAD-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 16 ; NO-PRELOAD-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 -; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT]], i64 8 +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I16_I16_PTR1_KERNEL_KERNARG_SEGMENT]], i64 8 ; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] ; NO-PRELOAD-NEXT: [[EXT:%.*]] = zext i16 [[TMP2]] to i32 ; NO-PRELOAD-NEXT: [[EXT1:%.*]] = zext i16 [[TMP5]] to i32 @@ -426,38 +335,25 @@ define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_i16_i16(i16 %arg0, ; NO-PRELOAD-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4 ; NO-PRELOAD-NEXT: ret void ; -; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_i16_i16 -; PRELOAD-1-SAME: (i16 inreg [[ARG0:%.*]], i16 [[ARG1:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { -; PRELOAD-1-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-1-NEXT: 
[[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT]], i64 0 -; PRELOAD-1-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META0]] -; PRELOAD-1-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 16 -; PRELOAD-1-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i16 -; PRELOAD-1-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT]], i64 8 -; PRELOAD-1-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] -; PRELOAD-1-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32 -; PRELOAD-1-NEXT: [[EXT1:%.*]] = zext i16 [[TMP3]] to i32 -; PRELOAD-1-NEXT: [[ADD:%.*]] = add i32 [[EXT]], [[EXT1]] -; PRELOAD-1-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4 -; PRELOAD-1-NEXT: ret void -; -; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_i16_i16 -; PRELOAD-3-SAME: (i16 inreg [[ARG0:%.*]], i16 inreg [[ARG1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] { -; PRELOAD-3-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-3-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32 -; PRELOAD-3-NEXT: [[EXT1:%.*]] = zext i16 [[ARG1]] to i32 -; PRELOAD-3-NEXT: [[ADD:%.*]] = add i32 [[EXT]], [[EXT1]] -; PRELOAD-3-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4 -; PRELOAD-3-NEXT: ret void -; -; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_i16_i16 -; PRELOAD-8-SAME: (i16 inreg [[ARG0:%.*]], i16 inreg [[ARG1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] { -; PRELOAD-8-NEXT: [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) 
@llvm.amdgcn.kernarg.segment.ptr() -; PRELOAD-8-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32 -; PRELOAD-8-NEXT: [[EXT1:%.*]] = zext i16 [[ARG1]] to i32 -; PRELOAD-8-NEXT: [[ADD:%.*]] = add i32 [[EXT]], [[EXT1]] -; PRELOAD-8-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4 -; PRELOAD-8-NEXT: ret void +; PRELOAD-2-LABEL: define {{[^@]+}}@i16_i16_ptr1_kernel +; PRELOAD-2-SAME: (i16 inreg [[ARG0:%.*]], i16 inreg [[ARG1:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: [[I16_I16_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-2-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I16_I16_PTR1_KERNEL_KERNARG_SEGMENT]], i64 8 +; PRELOAD-2-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] +; PRELOAD-2-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32 +; PRELOAD-2-NEXT: [[EXT1:%.*]] = zext i16 [[ARG1]] to i32 +; PRELOAD-2-NEXT: [[ADD:%.*]] = add i32 [[EXT]], [[EXT1]] +; PRELOAD-2-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@i16_i16_ptr1_kernel +; PRELOAD-ALL-SAME: (i16 inreg [[ARG0:%.*]], i16 inreg [[ARG1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: [[I16_I16_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-ALL-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32 +; PRELOAD-ALL-NEXT: [[EXT1:%.*]] = zext i16 [[ARG1]] to i32 +; PRELOAD-ALL-NEXT: [[ADD:%.*]] = add i32 [[EXT]], [[EXT1]] +; PRELOAD-ALL-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-ALL-NEXT: ret void ; %ext = zext i16 %arg0 to i32 %ext1 = zext i16 %arg1 to i32 @@ -466,4 +362,1104 @@ define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_i16_i16(i16 %arg0, ret void } 
-attributes #0 = { nounwind } +define amdgpu_kernel void @ptr1_i8_kernel(ptr addrspace(1) %out, i8 %arg0) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_i8_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i8 [[ARG0:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[PTR1_I8_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I8_KERNEL_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I8_KERNEL_KERNARG_SEGMENT]], i64 8 +; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8 +; NO-PRELOAD-NEXT: [[EXT:%.*]] = zext i8 [[TMP2]] to i32 +; NO-PRELOAD-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_i8_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i8 inreg [[ARG0:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: [[PTR1_I8_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-2-NEXT: [[EXT:%.*]] = zext i8 [[ARG0]] to i32 +; PRELOAD-2-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_i8_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i8 inreg [[ARG0:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: [[PTR1_I8_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-ALL-NEXT: [[EXT:%.*]] = zext i8 [[ARG0]] to i32 
+; PRELOAD-ALL-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-ALL-NEXT: ret void +; + %ext = zext i8 %arg0 to i32 + store i32 %ext, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @ptr1_i8_zeroext_kernel(ptr addrspace(1) %out, i8 zeroext %arg0) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_i8_zeroext_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i8 zeroext [[ARG0:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[PTR1_I8_ZEROEXT_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I8_ZEROEXT_KERNEL_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I8_ZEROEXT_KERNEL_KERNARG_SEGMENT]], i64 8 +; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8 +; NO-PRELOAD-NEXT: [[EXT:%.*]] = zext i8 [[TMP2]] to i32 +; NO-PRELOAD-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_i8_zeroext_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i8 inreg zeroext [[ARG0:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: [[PTR1_I8_ZEROEXT_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-2-NEXT: [[EXT:%.*]] = zext i8 [[ARG0]] to i32 +; PRELOAD-2-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_i8_zeroext_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], 
i8 inreg zeroext [[ARG0:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: [[PTR1_I8_ZEROEXT_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-ALL-NEXT: [[EXT:%.*]] = zext i8 [[ARG0]] to i32 +; PRELOAD-ALL-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-ALL-NEXT: ret void +; + %ext = zext i8 %arg0 to i32 + store i32 %ext, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @ptr1_i16_kernel(ptr addrspace(1) %out, i16 %arg0) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_i16_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i16 [[ARG0:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[PTR1_I16_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_KERNEL_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_KERNEL_KERNARG_SEGMENT]], i64 8 +; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +; NO-PRELOAD-NEXT: [[EXT:%.*]] = zext i16 [[TMP2]] to i32 +; NO-PRELOAD-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_i16_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i16 inreg [[ARG0:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: [[PTR1_I16_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-2-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32 +; 
PRELOAD-2-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_i16_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i16 inreg [[ARG0:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: [[PTR1_I16_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-ALL-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32 +; PRELOAD-ALL-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-ALL-NEXT: ret void +; + %ext = zext i16 %arg0 to i32 + store i32 %ext, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @ptr1_i32_kernel(ptr addrspace(1) %out, i32 %arg0) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_i32_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[ARG0:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[PTR1_I32_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I32_KERNEL_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I32_KERNEL_KERNARG_SEGMENT]], i64 8 +; NO-PRELOAD-NEXT: [[ARG0_LOAD:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: store i32 [[ARG0_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_i32_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg [[ARG0:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: [[PTR1_I32_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() 
+; PRELOAD-2-NEXT: store i32 [[ARG0]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_i32_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg [[ARG0:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: [[PTR1_I32_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-ALL-NEXT: store i32 [[ARG0]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-ALL-NEXT: ret void +; + store i32 %arg0, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @i32_ptr1_i32_kernel(i32 %arg0, ptr addrspace(1) %out, i32 %arg1) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@i32_ptr1_i32_kernel +; NO-PRELOAD-SAME: (i32 [[ARG0:%.*]], ptr addrspace(1) [[OUT:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[I32_PTR1_I32_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(20) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I32_PTR1_I32_KERNEL_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[ARG0_LOAD:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I32_PTR1_I32_KERNEL_KERNARG_SEGMENT]], i64 8 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I32_PTR1_I32_KERNEL_KERNARG_SEGMENT]], i64 16 +; NO-PRELOAD-NEXT: [[ARG1_LOAD:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[ADD:%.*]] = add i32 [[ARG0_LOAD]], [[ARG1_LOAD]] +; NO-PRELOAD-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; 
PRELOAD-2-LABEL: define {{[^@]+}}@i32_ptr1_i32_kernel +; PRELOAD-2-SAME: (i32 inreg [[ARG0:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: [[I32_PTR1_I32_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(20) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-2-NEXT: [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I32_PTR1_I32_KERNEL_KERNARG_SEGMENT]], i64 16 +; PRELOAD-2-NEXT: [[ARG1_LOAD:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; PRELOAD-2-NEXT: [[ADD:%.*]] = add i32 [[ARG0]], [[ARG1_LOAD]] +; PRELOAD-2-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@i32_ptr1_i32_kernel +; PRELOAD-ALL-SAME: (i32 inreg [[ARG0:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg [[ARG1:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: [[I32_PTR1_I32_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(20) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-ALL-NEXT: [[ADD:%.*]] = add i32 [[ARG0]], [[ARG1]] +; PRELOAD-ALL-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-ALL-NEXT: ret void +; + %add = add i32 %arg0, %arg1 + store i32 %add, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @ptr1_i16_i16_kernel(ptr addrspace(1) %out, i16 %arg0, i16 %arg1) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_i16_i16_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i16 [[ARG0:%.*]], i16 [[ARG1:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[PTR1_I16_I16_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_I16_KERNEL_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) 
[[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_I16_KERNEL_KERNARG_SEGMENT]], i64 8 +; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +; NO-PRELOAD-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_I16_KERNEL_KERNARG_SEGMENT]], i64 8 +; NO-PRELOAD-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 16 +; NO-PRELOAD-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 +; NO-PRELOAD-NEXT: [[EXT:%.*]] = zext i16 [[TMP2]] to i32 +; NO-PRELOAD-NEXT: [[EXT1:%.*]] = zext i16 [[TMP5]] to i32 +; NO-PRELOAD-NEXT: [[ADD:%.*]] = add i32 [[EXT]], [[EXT1]] +; NO-PRELOAD-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_i16_i16_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i16 inreg [[ARG0:%.*]], i16 [[ARG1:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: [[PTR1_I16_I16_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-2-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_I16_KERNEL_KERNARG_SEGMENT]], i64 8 +; PRELOAD-2-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]] +; PRELOAD-2-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 16 +; PRELOAD-2-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i16 +; PRELOAD-2-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32 +; PRELOAD-2-NEXT: [[EXT1:%.*]] = zext i16 [[TMP3]] to i32 +; PRELOAD-2-NEXT: [[ADD:%.*]] = add i32 [[EXT]], [[EXT1]] 
+; PRELOAD-2-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_i16_i16_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i16 inreg [[ARG0:%.*]], i16 inreg [[ARG1:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: [[PTR1_I16_I16_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-ALL-NEXT: [[EXT:%.*]] = zext i16 [[ARG0]] to i32 +; PRELOAD-ALL-NEXT: [[EXT1:%.*]] = zext i16 [[ARG1]] to i32 +; PRELOAD-ALL-NEXT: [[ADD:%.*]] = add i32 [[EXT]], [[EXT1]] +; PRELOAD-ALL-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-ALL-NEXT: ret void +; + %ext = zext i16 %arg0 to i32 + %ext1 = zext i16 %arg1 to i32 + %add = add i32 %ext, %ext1 + store i32 %add, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @ptr1_v2i8_kernel(ptr addrspace(1) %out, <2 x i8> %in) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v2i8_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x i8> [[IN:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[PTR1_V2I8_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V2I8_KERNEL_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V2I8_KERNEL_KERNARG_SEGMENT]], i64 8 +; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[IN_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = bitcast i16 [[TMP2]] to <2 x i8> +; NO-PRELOAD-NEXT: store <2 x i8> 
[[IN_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 2 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v2i8_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <2 x i8> inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: [[PTR1_V2I8_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-2-NEXT: store <2 x i8> [[IN]], ptr addrspace(1) [[OUT]], align 2 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v2i8_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <2 x i8> inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: [[PTR1_V2I8_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-ALL-NEXT: store <2 x i8> [[IN]], ptr addrspace(1) [[OUT]], align 2 +; PRELOAD-ALL-NEXT: ret void +; + store <2 x i8> %in, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @ptr1_byref_i32_i32_kernel(ptr addrspace(1) %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_byref_i32_i32_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(4) byref(i32) align 256 [[IN_BYREF:%.*]], i32 [[AFTER_OFFSET:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[PTR1_BYREF_I32_I32_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 256 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_BYREF_I32_I32_KERNEL_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_BYREF_I32_I32_KERNEL_KERNARG_SEGMENT]], i64 256 +; NO-PRELOAD-NEXT: 
[[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_BYREF_I32_I32_KERNEL_KERNARG_SEGMENT]], i64 260 +; NO-PRELOAD-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, ptr addrspace(4) [[AFTER_OFFSET_KERNARG_OFFSET]], align 4, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[IN:%.*]] = load i32, ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 4 +; NO-PRELOAD-NEXT: store volatile i32 [[IN]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; NO-PRELOAD-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_byref_i32_i32_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(4) byref(i32) align 256 [[IN_BYREF:%.*]], i32 [[AFTER_OFFSET:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: [[PTR1_BYREF_I32_I32_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 256 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-2-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_BYREF_I32_I32_KERNEL_KERNARG_SEGMENT]], i64 256 +; PRELOAD-2-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_BYREF_I32_I32_KERNEL_KERNARG_SEGMENT]], i64 260 +; PRELOAD-2-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, ptr addrspace(4) [[AFTER_OFFSET_KERNARG_OFFSET]], align 4, !invariant.load [[META0]] +; PRELOAD-2-NEXT: [[IN:%.*]] = load i32, ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 4 +; PRELOAD-2-NEXT: store volatile i32 [[IN]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-2-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_byref_i32_i32_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(4) byref(i32) align 256 [[IN_BYREF:%.*]], i32 [[AFTER_OFFSET:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: 
[[PTR1_BYREF_I32_I32_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 256 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-ALL-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_BYREF_I32_I32_KERNEL_KERNARG_SEGMENT]], i64 256 +; PRELOAD-ALL-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_BYREF_I32_I32_KERNEL_KERNARG_SEGMENT]], i64 260 +; PRELOAD-ALL-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, ptr addrspace(4) [[AFTER_OFFSET_KERNARG_OFFSET]], align 4, !invariant.load [[META0]] +; PRELOAD-ALL-NEXT: [[IN:%.*]] = load i32, ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 4 +; PRELOAD-ALL-NEXT: store volatile i32 [[IN]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-ALL-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-ALL-NEXT: ret void +; + %in = load i32, ptr addrspace(4) %in.byref + store volatile i32 %in, ptr addrspace(1) %out, align 4 + store volatile i32 %after.offset, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @ptr1_byref_i32_i32_staggered_kernel(ptr addrspace(1) %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_byref_i32_i32_staggered_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], ptr addrspace(4) byref(i32) align 256 [[IN_BYREF:%.*]], i32 [[AFTER_OFFSET:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[PTR1_BYREF_I32_I32_STAGGERED_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 256 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_BYREF_I32_I32_STAGGERED_KERNEL_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: 
[[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_BYREF_I32_I32_STAGGERED_KERNEL_KERNARG_SEGMENT]], i64 256 +; NO-PRELOAD-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_BYREF_I32_I32_STAGGERED_KERNEL_KERNARG_SEGMENT]], i64 260 +; NO-PRELOAD-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, ptr addrspace(4) [[AFTER_OFFSET_KERNARG_OFFSET]], align 4, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[IN:%.*]] = load i32, ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 4 +; NO-PRELOAD-NEXT: store volatile i32 [[IN]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; NO-PRELOAD-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_byref_i32_i32_staggered_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(4) byref(i32) align 256 [[IN_BYREF:%.*]], i32 [[AFTER_OFFSET:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: [[PTR1_BYREF_I32_I32_STAGGERED_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 256 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-2-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_BYREF_I32_I32_STAGGERED_KERNEL_KERNARG_SEGMENT]], i64 256 +; PRELOAD-2-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_BYREF_I32_I32_STAGGERED_KERNEL_KERNARG_SEGMENT]], i64 260 +; PRELOAD-2-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, ptr addrspace(4) [[AFTER_OFFSET_KERNARG_OFFSET]], align 4, !invariant.load [[META0]] +; PRELOAD-2-NEXT: [[IN:%.*]] = load i32, ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 4 +; PRELOAD-2-NEXT: store volatile i32 [[IN]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-2-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define 
{{[^@]+}}@ptr1_byref_i32_i32_staggered_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(4) byref(i32) align 256 [[IN_BYREF:%.*]], i32 [[AFTER_OFFSET:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: [[PTR1_BYREF_I32_I32_STAGGERED_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 256 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-ALL-NEXT: [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_BYREF_I32_I32_STAGGERED_KERNEL_KERNARG_SEGMENT]], i64 256 +; PRELOAD-ALL-NEXT: [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_BYREF_I32_I32_STAGGERED_KERNEL_KERNARG_SEGMENT]], i64 260 +; PRELOAD-ALL-NEXT: [[AFTER_OFFSET_LOAD:%.*]] = load i32, ptr addrspace(4) [[AFTER_OFFSET_KERNARG_OFFSET]], align 4, !invariant.load [[META0]] +; PRELOAD-ALL-NEXT: [[IN:%.*]] = load i32, ptr addrspace(4) [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 4 +; PRELOAD-ALL-NEXT: store volatile i32 [[IN]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-ALL-NEXT: store volatile i32 [[AFTER_OFFSET_LOAD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-ALL-NEXT: ret void +; + %in = load i32, ptr addrspace(4) %in.byref + store volatile i32 %in, ptr addrspace(1) %out, align 4 + store volatile i32 %after.offset, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @ptr1_v8i32_kernel(ptr addrspace(1) nocapture %out, <8 x i32> %in) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v8i32_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) nocapture [[OUT:%.*]], <8 x i32> [[IN:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[PTR1_V8I32_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 32 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V8I32_KERNEL_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) 
[[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V8I32_KERNEL_KERNARG_SEGMENT]], i64 32 +; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load <8 x i32>, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: store <8 x i32> [[IN_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v8i32_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg nocapture [[OUT:%.*]], <8 x i32> inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: [[PTR1_V8I32_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 32 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-2-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V8I32_KERNEL_KERNARG_SEGMENT]], i64 32 +; PRELOAD-2-NEXT: [[IN_LOAD:%.*]] = load <8 x i32>, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; PRELOAD-2-NEXT: store <8 x i32> [[IN_LOAD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v8i32_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg nocapture [[OUT:%.*]], <8 x i32> inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: [[PTR1_V8I32_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 32 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-ALL-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V8I32_KERNEL_KERNARG_SEGMENT]], i64 32 +; PRELOAD-ALL-NEXT: [[IN_LOAD:%.*]] = load <8 x i32>, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; PRELOAD-ALL-NEXT: store <8 x i32> [[IN_LOAD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-ALL-NEXT: ret void +; + store <8 x i32> %in, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @ptr1_v3i16_kernel(ptr addrspace(1) 
nocapture %out, <3 x i16> %in) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v3i16_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) nocapture [[OUT:%.*]], <3 x i16> [[IN:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[PTR1_V3I16_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V3I16_KERNEL_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V3I16_KERNEL_KERNARG_SEGMENT]], i64 8 +; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <3 x i32> <i32 0, i32 1, i32 2> +; NO-PRELOAD-NEXT: store <3 x i16> [[IN_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v3i16_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg nocapture [[OUT:%.*]], <3 x i16> inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: [[PTR1_V3I16_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-2-NEXT: store <3 x i16> [[IN]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v3i16_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg nocapture [[OUT:%.*]], <3 x i16> inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: [[PTR1_V3I16_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-ALL-NEXT: store <3 x i16> [[IN]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-ALL-NEXT: ret void +; + store <3 x i16> %in, ptr
addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @ptr1_v3i32_kernel(ptr addrspace(1) nocapture %out, <3 x i32> %in) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v3i32_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) nocapture [[OUT:%.*]], <3 x i32> [[IN:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[PTR1_V3I32_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V3I32_KERNEL_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V3I32_KERNEL_KERNARG_SEGMENT]], i64 16 +; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2> +; NO-PRELOAD-NEXT: store <3 x i32> [[IN_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v3i32_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg nocapture [[OUT:%.*]], <3 x i32> inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: [[PTR1_V3I32_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-2-NEXT: store <3 x i32> [[IN]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v3i32_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg nocapture [[OUT:%.*]], <3 x i32> inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: [[PTR1_V3I32_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-ALL-NEXT: store <3 x i32>
[[IN]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-ALL-NEXT: ret void +; + store <3 x i32> %in, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @ptr1_v3f32_kernel(ptr addrspace(1) nocapture %out, <3 x float> %in) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v3f32_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) nocapture [[OUT:%.*]], <3 x float> [[IN:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[PTR1_V3F32_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V3F32_KERNEL_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V3F32_KERNEL_KERNARG_SEGMENT]], i64 16 +; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2> +; NO-PRELOAD-NEXT: store <3 x float> [[IN_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v3f32_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg nocapture [[OUT:%.*]], <3 x float> inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: [[PTR1_V3F32_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-2-NEXT: store <3 x float> [[IN]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v3f32_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg nocapture [[OUT:%.*]], <3 x float> inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: [[PTR1_V3F32_KERNEL_KERNARG_SEGMENT:%.*]] = call
nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-ALL-NEXT: store <3 x float> [[IN]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-ALL-NEXT: ret void +; + store <3 x float> %in, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @ptr1_v5i8_kernel(ptr addrspace(1) nocapture %out, <5 x i8> %in) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v5i8_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) nocapture [[OUT:%.*]], <5 x i8> [[IN:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[PTR1_V5I8_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V5I8_KERNEL_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V5I8_KERNEL_KERNARG_SEGMENT]], i64 8 +; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load <5 x i8>, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: store <5 x i8> [[IN_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v5i8_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg nocapture [[OUT:%.*]], <5 x i8> inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: [[PTR1_V5I8_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-2-NEXT: store <5 x i8> [[IN]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v5i8_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg nocapture [[OUT:%.*]], <5 x i8> inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: [[PTR1_V5I8_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull 
align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-ALL-NEXT: store <5 x i8> [[IN]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-ALL-NEXT: ret void +; + store <5 x i8> %in, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @ptr1_v5f64_kernel(ptr addrspace(1) nocapture %out, <5 x double> %in) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v5f64_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) nocapture [[OUT:%.*]], <5 x double> [[IN:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[PTR1_V5F64_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(128) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V5F64_KERNEL_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V5F64_KERNEL_KERNARG_SEGMENT]], i64 64 +; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load <5 x double>, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: store <5 x double> [[IN_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 8 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v5f64_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg nocapture [[OUT:%.*]], <5 x double> inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: [[PTR1_V5F64_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(128) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-2-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V5F64_KERNEL_KERNARG_SEGMENT]], i64 64 +; PRELOAD-2-NEXT: [[IN_LOAD:%.*]] = load <5 x double>, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; PRELOAD-2-NEXT: store <5 x double> [[IN_LOAD]], ptr 
addrspace(1) [[OUT]], align 8 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v5f64_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg nocapture [[OUT:%.*]], <5 x double> inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: [[PTR1_V5F64_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(128) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-ALL-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V5F64_KERNEL_KERNARG_SEGMENT]], i64 64 +; PRELOAD-ALL-NEXT: [[IN_LOAD:%.*]] = load <5 x double>, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; PRELOAD-ALL-NEXT: store <5 x double> [[IN_LOAD]], ptr addrspace(1) [[OUT]], align 8 +; PRELOAD-ALL-NEXT: ret void +; + store <5 x double> %in, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_kernel void @ptr1_v8i8_kernel(ptr addrspace(1) %out, <8 x i8> %in) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v8i8_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], <8 x i8> [[IN:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[PTR1_V8I8_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V8I8_KERNEL_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V8I8_KERNEL_KERNARG_SEGMENT]], i64 8 +; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load <8 x i8>, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: store <8 x i8> [[IN_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 8 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v8i8_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg 
[[OUT:%.*]], <8 x i8> inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: [[PTR1_V8I8_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-2-NEXT: store <8 x i8> [[IN]], ptr addrspace(1) [[OUT]], align 8 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v8i8_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <8 x i8> inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: [[PTR1_V8I8_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-ALL-NEXT: store <8 x i8> [[IN]], ptr addrspace(1) [[OUT]], align 8 +; PRELOAD-ALL-NEXT: ret void +; + store <8 x i8> %in, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @ptr1_i64_kernel(ptr addrspace(1) %out, i64 %a) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_i64_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i64 [[A:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[PTR1_I64_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I64_KERNEL_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[A_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I64_KERNEL_KERNARG_SEGMENT]], i64 8 +; NO-PRELOAD-NEXT: [[A_LOAD:%.*]] = load i64, ptr addrspace(4) [[A_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: store i64 [[A_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 8 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_i64_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i64 inreg [[A:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: 
[[PTR1_I64_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-2-NEXT: store i64 [[A]], ptr addrspace(1) [[OUT]], align 8 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_i64_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i64 inreg [[A:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: [[PTR1_I64_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-ALL-NEXT: store i64 [[A]], ptr addrspace(1) [[OUT]], align 8 +; PRELOAD-ALL-NEXT: ret void +; + store i64 %a, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_kernel void @ptr1_f64_kernel(ptr addrspace(1) %out, double %in) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_f64_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], double [[IN:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[PTR1_F64_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_F64_KERNEL_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_F64_KERNEL_KERNARG_SEGMENT]], i64 8 +; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load double, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: store double [[IN_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 8 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_f64_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], double inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: [[PTR1_F64_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr 
addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-2-NEXT: store double [[IN]], ptr addrspace(1) [[OUT]], align 8 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_f64_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], double inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: [[PTR1_F64_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-ALL-NEXT: store double [[IN]], ptr addrspace(1) [[OUT]], align 8 +; PRELOAD-ALL-NEXT: ret void +; + store double %in, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @ptr1_half_kernel(ptr addrspace(1) %out, half %in) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_half_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], half [[IN:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[PTR1_HALF_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_HALF_KERNEL_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_HALF_KERNEL_KERNARG_SEGMENT]], i64 8 +; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[IN_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = bitcast i16 [[TMP2]] to half +; NO-PRELOAD-NEXT: store half [[IN_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 2 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_half_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], half inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: 
[[PTR1_HALF_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-2-NEXT: store half [[IN]], ptr addrspace(1) [[OUT]], align 2 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_half_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], half inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: [[PTR1_HALF_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-ALL-NEXT: store half [[IN]], ptr addrspace(1) [[OUT]], align 2 +; PRELOAD-ALL-NEXT: ret void +; + store half %in, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @ptr1_bfloat_kernel(ptr addrspace(1) %out, bfloat %in) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_bfloat_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], bfloat [[IN:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[PTR1_BFLOAT_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_BFLOAT_KERNEL_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_BFLOAT_KERNEL_KERNARG_SEGMENT]], i64 8 +; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[IN_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = bitcast i16 [[TMP2]] to bfloat +; NO-PRELOAD-NEXT: store bfloat [[IN_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 2 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_bfloat_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg 
[[OUT:%.*]], bfloat inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: [[PTR1_BFLOAT_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-2-NEXT: store bfloat [[IN]], ptr addrspace(1) [[OUT]], align 2 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_bfloat_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], bfloat inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: [[PTR1_BFLOAT_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-ALL-NEXT: store bfloat [[IN]], ptr addrspace(1) [[OUT]], align 2 +; PRELOAD-ALL-NEXT: ret void +; + store bfloat %in, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @ptr1_v2bfloat_kernel(ptr addrspace(1) %out, <2 x bfloat> %in) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v2bfloat_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x bfloat> [[IN:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[PTR1_V2BFLOAT_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V2BFLOAT_KERNEL_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V2BFLOAT_KERNEL_KERNARG_SEGMENT]], i64 8 +; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load <2 x bfloat>, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: store <2 x bfloat> [[IN_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v2bfloat_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <2 x bfloat> 
inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: [[PTR1_V2BFLOAT_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-2-NEXT: store <2 x bfloat> [[IN]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v2bfloat_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <2 x bfloat> inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: [[PTR1_V2BFLOAT_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-ALL-NEXT: store <2 x bfloat> [[IN]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-ALL-NEXT: ret void +; + store <2 x bfloat> %in, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @ptr1_v3bfloat_kernel(ptr addrspace(1) %out, <3 x bfloat> %in) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v3bfloat_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], <3 x bfloat> [[IN:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[PTR1_V3BFLOAT_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V3BFLOAT_KERNEL_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V3BFLOAT_KERNEL_KERNARG_SEGMENT]], i64 8 +; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load <4 x bfloat>, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = shufflevector <4 x bfloat> [[TMP1]], <4 x bfloat> poison, <3 x i32> <i32 0, i32 1, i32 2> +; NO-PRELOAD-NEXT: store <3 x bfloat> [[IN_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 8 +; NO-PRELOAD-NEXT: ret void +; +;
PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v3bfloat_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <3 x bfloat> inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: [[PTR1_V3BFLOAT_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-2-NEXT: store <3 x bfloat> [[IN]], ptr addrspace(1) [[OUT]], align 8 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v3bfloat_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <3 x bfloat> inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: [[PTR1_V3BFLOAT_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-ALL-NEXT: store <3 x bfloat> [[IN]], ptr addrspace(1) [[OUT]], align 8 +; PRELOAD-ALL-NEXT: ret void +; + store <3 x bfloat> %in, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @ptr1_v6bfloat_kernel(ptr addrspace(1) %out, <6 x bfloat> %in) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v6bfloat_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], <6 x bfloat> [[IN:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[PTR1_V6BFLOAT_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V6BFLOAT_KERNEL_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V6BFLOAT_KERNEL_KERNARG_SEGMENT]], i64 16 +; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load <6 x bfloat>, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: store <6 x bfloat> [[IN_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 16 +; NO-PRELOAD-NEXT: ret 
void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v6bfloat_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <6 x bfloat> inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: [[PTR1_V6BFLOAT_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-2-NEXT: store <6 x bfloat> [[IN]], ptr addrspace(1) [[OUT]], align 16 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v6bfloat_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <6 x bfloat> inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: [[PTR1_V6BFLOAT_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-ALL-NEXT: store <6 x bfloat> [[IN]], ptr addrspace(1) [[OUT]], align 16 +; PRELOAD-ALL-NEXT: ret void +; + store <6 x bfloat> %in, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @ptr1_half_v7bfloat_kernel(ptr addrspace(1) %out, half %in, <7 x bfloat> %in2, ptr addrspace(1) %out2) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_half_v7bfloat_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], half [[IN:%.*]], <7 x bfloat> [[IN2:%.*]], ptr addrspace(1) [[OUT2:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[PTR1_HALF_V7BFLOAT_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_HALF_V7BFLOAT_KERNEL_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_HALF_V7BFLOAT_KERNEL_KERNARG_SEGMENT]], i64 8 +; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[IN_KERNARG_OFFSET_ALIGN_DOWN]], align 8, 
!invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = bitcast i16 [[TMP2]] to half +; NO-PRELOAD-NEXT: [[IN2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_HALF_V7BFLOAT_KERNEL_KERNARG_SEGMENT]], i64 16 +; NO-PRELOAD-NEXT: [[IN2_LOAD:%.*]] = load <7 x bfloat>, ptr addrspace(4) [[IN2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_HALF_V7BFLOAT_KERNEL_KERNARG_SEGMENT]], i64 32 +; NO-PRELOAD-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: store half [[IN_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 2 +; NO-PRELOAD-NEXT: store <7 x bfloat> [[IN2_LOAD]], ptr addrspace(1) [[OUT2_LOAD]], align 16 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_half_v7bfloat_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], half inreg [[IN:%.*]], <7 x bfloat> [[IN2:%.*]], ptr addrspace(1) [[OUT2:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: [[PTR1_HALF_V7BFLOAT_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-2-NEXT: [[IN2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_HALF_V7BFLOAT_KERNEL_KERNARG_SEGMENT]], i64 16 +; PRELOAD-2-NEXT: [[IN2_LOAD:%.*]] = load <7 x bfloat>, ptr addrspace(4) [[IN2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; PRELOAD-2-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_HALF_V7BFLOAT_KERNEL_KERNARG_SEGMENT]], i64 32 +; PRELOAD-2-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; PRELOAD-2-NEXT: store half [[IN]], ptr addrspace(1) [[OUT]], align 2 +; PRELOAD-2-NEXT: store <7 x bfloat> 
[[IN2_LOAD]], ptr addrspace(1) [[OUT2_LOAD]], align 16 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_half_v7bfloat_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], half inreg [[IN:%.*]], <7 x bfloat> inreg [[IN2:%.*]], ptr addrspace(1) inreg [[OUT2:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: [[PTR1_HALF_V7BFLOAT_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-ALL-NEXT: store half [[IN]], ptr addrspace(1) [[OUT]], align 2 +; PRELOAD-ALL-NEXT: store <7 x bfloat> [[IN2]], ptr addrspace(1) [[OUT2]], align 16 +; PRELOAD-ALL-NEXT: ret void +; + store half %in, ptr addrspace(1) %out + store <7 x bfloat> %in2, ptr addrspace(1) %out2 + ret void +} + +define amdgpu_kernel void @ptr1_i1_kernel(ptr addrspace(1) %out, i1 %in) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_i1_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i1 [[IN:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[PTR1_I1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I1_KERNEL_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I1_KERNEL_KERNARG_SEGMENT]], i64 8 +; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[IN_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1 +; NO-PRELOAD-NEXT: store i1 [[TMP2]], ptr addrspace(1) [[OUT_LOAD]], align 1 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_i1_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i1 inreg [[IN:%.*]]) 
#[[ATTR0]] { +; PRELOAD-2-NEXT: [[PTR1_I1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-2-NEXT: store i1 [[IN]], ptr addrspace(1) [[OUT]], align 1 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_i1_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i1 inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: [[PTR1_I1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-ALL-NEXT: store i1 [[IN]], ptr addrspace(1) [[OUT]], align 1 +; PRELOAD-ALL-NEXT: ret void +; + store i1 %in, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @ptr1_fp128_kernel(ptr addrspace(1) %out, fp128 %in) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_fp128_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], fp128 [[IN:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[PTR1_FP128_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_FP128_KERNEL_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_FP128_KERNEL_KERNARG_SEGMENT]], i64 16 +; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load fp128, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: store fp128 [[IN_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 16 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_fp128_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], fp128 inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: [[PTR1_FP128_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 
dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-2-NEXT: store fp128 [[IN]], ptr addrspace(1) [[OUT]], align 16 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_fp128_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], fp128 inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: [[PTR1_FP128_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-ALL-NEXT: store fp128 [[IN]], ptr addrspace(1) [[OUT]], align 16 +; PRELOAD-ALL-NEXT: ret void +; + store fp128 %in, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @ptr1_v7i8_kernel(ptr addrspace(1) %out, <7 x i8> %in) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v7i8_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], <7 x i8> [[IN:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[PTR1_V7I8_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V7I8_KERNEL_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V7I8_KERNEL_KERNARG_SEGMENT]], i64 8 +; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load <7 x i8>, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: store <7 x i8> [[IN_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 8 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v7i8_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <7 x i8> inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: [[PTR1_V7I8_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() 
+; PRELOAD-2-NEXT: store <7 x i8> [[IN]], ptr addrspace(1) [[OUT]], align 8 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v7i8_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <7 x i8> inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: [[PTR1_V7I8_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-ALL-NEXT: store <7 x i8> [[IN]], ptr addrspace(1) [[OUT]], align 8 +; PRELOAD-ALL-NEXT: ret void +; + store <7 x i8> %in, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @ptr1_v7half_kernel(ptr addrspace(1) %out, <7 x half> %in) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_v7half_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], <7 x half> [[IN:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[PTR1_V7HALF_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V7HALF_KERNEL_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_V7HALF_KERNEL_KERNARG_SEGMENT]], i64 16 +; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load <7 x half>, ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: store <7 x half> [[IN_LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 16 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_v7half_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <7 x half> inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: [[PTR1_V7HALF_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-2-NEXT: store <7 x half> 
[[IN]], ptr addrspace(1) [[OUT]], align 16 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_v7half_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], <7 x half> inreg [[IN:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: [[PTR1_V7HALF_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-ALL-NEXT: store <7 x half> [[IN]], ptr addrspace(1) [[OUT]], align 16 +; PRELOAD-ALL-NEXT: ret void +; + store <7 x half> %in, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @ptr1_i16_i32_ptr1_kernel(ptr addrspace(1) %out, i16 %in, i32 %in2, ptr addrspace(1) %out2) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_i16_i32_ptr1_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i16 [[IN:%.*]], i32 [[IN2:%.*]], ptr addrspace(1) [[OUT2:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[PTR1_I16_I32_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(24) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_I32_PTR1_KERNEL_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_I32_PTR1_KERNEL_KERNARG_SEGMENT]], i64 8 +; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[IN_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +; NO-PRELOAD-NEXT: [[IN2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_I32_PTR1_KERNEL_KERNARG_SEGMENT]], i64 12 +; NO-PRELOAD-NEXT: [[IN2_LOAD:%.*]] = load i32, ptr addrspace(4) [[IN2_KERNARG_OFFSET]], align 4, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = 
getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_I32_PTR1_KERNEL_KERNARG_SEGMENT]], i64 16 +; NO-PRELOAD-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: store i16 [[TMP2]], ptr addrspace(1) [[OUT_LOAD]], align 2 +; NO-PRELOAD-NEXT: store i32 [[IN2_LOAD]], ptr addrspace(1) [[OUT2_LOAD]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_i16_i32_ptr1_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i16 inreg [[IN:%.*]], i32 [[IN2:%.*]], ptr addrspace(1) [[OUT2:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: [[PTR1_I16_I32_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(24) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-2-NEXT: [[IN2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_I32_PTR1_KERNEL_KERNARG_SEGMENT]], i64 12 +; PRELOAD-2-NEXT: [[IN2_LOAD:%.*]] = load i32, ptr addrspace(4) [[IN2_KERNARG_OFFSET]], align 4, !invariant.load [[META0]] +; PRELOAD-2-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_I32_PTR1_KERNEL_KERNARG_SEGMENT]], i64 16 +; PRELOAD-2-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; PRELOAD-2-NEXT: store i16 [[IN]], ptr addrspace(1) [[OUT]], align 2 +; PRELOAD-2-NEXT: store i32 [[IN2_LOAD]], ptr addrspace(1) [[OUT2_LOAD]], align 4 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_i16_i32_ptr1_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i16 inreg [[IN:%.*]], i32 inreg [[IN2:%.*]], ptr addrspace(1) inreg [[OUT2:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: [[PTR1_I16_I32_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(24) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-ALL-NEXT: store i16 [[IN]], ptr addrspace(1) [[OUT]], align 2 +; 
PRELOAD-ALL-NEXT: store i32 [[IN2]], ptr addrspace(1) [[OUT2]], align 4 +; PRELOAD-ALL-NEXT: ret void +; + store i16 %in, ptr addrspace(1) %out + store i32 %in2, ptr addrspace(1) %out2 + ret void +} + +define amdgpu_kernel void @ptr1_i16_v3i32_ptr1_kernel(ptr addrspace(1) %out, i16 %in, <3 x i32> %in2, ptr addrspace(1) %out2) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_i16_v3i32_ptr1_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i16 [[IN:%.*]], <3 x i32> [[IN2:%.*]], ptr addrspace(1) [[OUT2:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[PTR1_I16_V3I32_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_V3I32_PTR1_KERNEL_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_V3I32_PTR1_KERNEL_KERNARG_SEGMENT]], i64 8 +; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[IN_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +; NO-PRELOAD-NEXT: [[IN2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_V3I32_PTR1_KERNEL_KERNARG_SEGMENT]], i64 16 +; NO-PRELOAD-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr addrspace(4) [[IN2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[IN2_LOAD:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2> +; NO-PRELOAD-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_V3I32_PTR1_KERNEL_KERNARG_SEGMENT]], i64 32 +; NO-PRELOAD-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: store i16 [[TMP2]], ptr addrspace(1) [[OUT_LOAD]], align 2 +; NO-PRELOAD-NEXT: store <3 x i32> [[IN2_LOAD]], ptr addrspace(1) [[OUT2_LOAD]], align 16 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_i16_v3i32_ptr1_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i16 inreg [[IN:%.*]], <3 x i32> [[IN2:%.*]], ptr addrspace(1) [[OUT2:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: [[PTR1_I16_V3I32_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-2-NEXT: [[IN2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_V3I32_PTR1_KERNEL_KERNARG_SEGMENT]], i64 16 +; PRELOAD-2-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr addrspace(4) [[IN2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; PRELOAD-2-NEXT: [[IN2_LOAD:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2> +; PRELOAD-2-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_V3I32_PTR1_KERNEL_KERNARG_SEGMENT]], i64 32 +; PRELOAD-2-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; PRELOAD-2-NEXT: store i16 [[IN]], ptr addrspace(1) [[OUT]], align 2 +; PRELOAD-2-NEXT: store <3 x i32> [[IN2_LOAD]], ptr addrspace(1) [[OUT2_LOAD]], align 16 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_i16_v3i32_ptr1_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i16 inreg [[IN:%.*]], <3 x i32> inreg [[IN2:%.*]], ptr addrspace(1) inreg [[OUT2:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: [[PTR1_I16_V3I32_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-ALL-NEXT: store i16 [[IN]], ptr addrspace(1) [[OUT]], align 2 +; PRELOAD-ALL-NEXT: store <3 x i32> [[IN2]], ptr addrspace(1) [[OUT2]], align 16 +; PRELOAD-ALL-NEXT:
ret void +; + store i16 %in, ptr addrspace(1) %out + store <3 x i32> %in2, ptr addrspace(1) %out2 + ret void +} + +define amdgpu_kernel void @ptr1_i16_i16_ptr1_kernel(ptr addrspace(1) %out, i16 %in, i16 %in2, ptr addrspace(1) %out2) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_i16_i16_ptr1_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i16 [[IN:%.*]], i16 [[IN2:%.*]], ptr addrspace(1) [[OUT2:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[PTR1_I16_I16_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(24) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_I16_PTR1_KERNEL_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_I16_PTR1_KERNEL_KERNARG_SEGMENT]], i64 8 +; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[IN_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +; NO-PRELOAD-NEXT: [[IN2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_I16_PTR1_KERNEL_KERNARG_SEGMENT]], i64 8 +; NO-PRELOAD-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[IN2_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 16 +; NO-PRELOAD-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 +; NO-PRELOAD-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_I16_PTR1_KERNEL_KERNARG_SEGMENT]], i64 16 +; NO-PRELOAD-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: store i16 [[TMP2]], ptr addrspace(1) [[OUT_LOAD]], align 2 +; 
NO-PRELOAD-NEXT: store i16 [[TMP5]], ptr addrspace(1) [[OUT2_LOAD]], align 2 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_i16_i16_ptr1_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i16 inreg [[IN:%.*]], i16 [[IN2:%.*]], ptr addrspace(1) [[OUT2:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: [[PTR1_I16_I16_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(24) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-2-NEXT: [[IN2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_I16_PTR1_KERNEL_KERNARG_SEGMENT]], i64 8 +; PRELOAD-2-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[IN2_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]] +; PRELOAD-2-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 16 +; PRELOAD-2-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i16 +; PRELOAD-2-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_I16_PTR1_KERNEL_KERNARG_SEGMENT]], i64 16 +; PRELOAD-2-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; PRELOAD-2-NEXT: store i16 [[IN]], ptr addrspace(1) [[OUT]], align 2 +; PRELOAD-2-NEXT: store i16 [[TMP3]], ptr addrspace(1) [[OUT2_LOAD]], align 2 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_i16_i16_ptr1_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i16 inreg [[IN:%.*]], i16 inreg [[IN2:%.*]], ptr addrspace(1) inreg [[OUT2:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: [[PTR1_I16_I16_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(24) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-ALL-NEXT: store i16 [[IN]], ptr addrspace(1) [[OUT]], align 2 +; PRELOAD-ALL-NEXT: store i16 [[IN2]], ptr addrspace(1) [[OUT2]], align 2 +; PRELOAD-ALL-NEXT: ret void +; + store i16 %in, ptr addrspace(1) %out + store i16 %in2, ptr addrspace(1) %out2 + ret void 
+} + +define amdgpu_kernel void @ptr1_i16_v2i8_ptr1_kernel(ptr addrspace(1) %out, i16 %in, <2 x i8> %in2, ptr addrspace(1) %out2) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_i16_v2i8_ptr1_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i16 [[IN:%.*]], <2 x i8> [[IN2:%.*]], ptr addrspace(1) [[OUT2:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[PTR1_I16_V2I8_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(24) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_V2I8_PTR1_KERNEL_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_V2I8_PTR1_KERNEL_KERNARG_SEGMENT]], i64 8 +; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[IN_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +; NO-PRELOAD-NEXT: [[IN2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_V2I8_PTR1_KERNEL_KERNARG_SEGMENT]], i64 8 +; NO-PRELOAD-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[IN2_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 16 +; NO-PRELOAD-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 +; NO-PRELOAD-NEXT: [[IN2_LOAD:%.*]] = bitcast i16 [[TMP5]] to <2 x i8> +; NO-PRELOAD-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_V2I8_PTR1_KERNEL_KERNARG_SEGMENT]], i64 16 +; NO-PRELOAD-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: store i16 [[TMP2]], ptr addrspace(1) [[OUT_LOAD]], align 2 +; NO-PRELOAD-NEXT: 
store <2 x i8> [[IN2_LOAD]], ptr addrspace(1) [[OUT2_LOAD]], align 2 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_i16_v2i8_ptr1_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i16 inreg [[IN:%.*]], <2 x i8> [[IN2:%.*]], ptr addrspace(1) [[OUT2:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: [[PTR1_I16_V2I8_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(24) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-2-NEXT: [[IN2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_V2I8_PTR1_KERNEL_KERNARG_SEGMENT]], i64 8 +; PRELOAD-2-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[IN2_KERNARG_OFFSET_ALIGN_DOWN]], align 8, !invariant.load [[META0]] +; PRELOAD-2-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 16 +; PRELOAD-2-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i16 +; PRELOAD-2-NEXT: [[IN2_LOAD:%.*]] = bitcast i16 [[TMP3]] to <2 x i8> +; PRELOAD-2-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I16_V2I8_PTR1_KERNEL_KERNARG_SEGMENT]], i64 16 +; PRELOAD-2-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; PRELOAD-2-NEXT: store i16 [[IN]], ptr addrspace(1) [[OUT]], align 2 +; PRELOAD-2-NEXT: store <2 x i8> [[IN2_LOAD]], ptr addrspace(1) [[OUT2_LOAD]], align 2 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_i16_v2i8_ptr1_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i16 inreg [[IN:%.*]], <2 x i8> inreg [[IN2:%.*]], ptr addrspace(1) inreg [[OUT2:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: [[PTR1_I16_V2I8_PTR1_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(24) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-ALL-NEXT: store i16 [[IN]], ptr addrspace(1) [[OUT]], align 2 +; PRELOAD-ALL-NEXT: store <2 x i8> [[IN2]], ptr addrspace(1) [[OUT2]], align 2 +; PRELOAD-ALL-NEXT: ret void +; 
+ store i16 %in, ptr addrspace(1) %out + store <2 x i8> %in2, ptr addrspace(1) %out2 + ret void +} + +define amdgpu_kernel void @i32_ptr1_i32_staggered_kernel(i32 %arg0, ptr addrspace(1) %out, i32 %arg1) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@i32_ptr1_i32_staggered_kernel +; NO-PRELOAD-SAME: (i32 [[ARG0:%.*]], ptr addrspace(1) [[OUT:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[I32_PTR1_I32_STAGGERED_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(20) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I32_PTR1_I32_STAGGERED_KERNEL_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[ARG0_LOAD:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I32_PTR1_I32_STAGGERED_KERNEL_KERNARG_SEGMENT]], i64 8 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[I32_PTR1_I32_STAGGERED_KERNEL_KERNARG_SEGMENT]], i64 16 +; NO-PRELOAD-NEXT: [[ARG1_LOAD:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[ADD:%.*]] = add i32 [[ARG0_LOAD]], [[ARG1_LOAD]] +; NO-PRELOAD-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@i32_ptr1_i32_staggered_kernel +; PRELOAD-2-SAME: (i32 inreg [[ARG0:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: [[I32_PTR1_I32_STAGGERED_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(20) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-2-NEXT: [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr 
addrspace(4) [[I32_PTR1_I32_STAGGERED_KERNEL_KERNARG_SEGMENT]], i64 16 +; PRELOAD-2-NEXT: [[ARG1_LOAD:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; PRELOAD-2-NEXT: [[ADD:%.*]] = add i32 [[ARG0]], [[ARG1_LOAD]] +; PRELOAD-2-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@i32_ptr1_i32_staggered_kernel +; PRELOAD-ALL-SAME: (i32 inreg [[ARG0:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg [[ARG1:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: [[I32_PTR1_I32_STAGGERED_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(20) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-ALL-NEXT: [[ADD:%.*]] = add i32 [[ARG0]], [[ARG1]] +; PRELOAD-ALL-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-ALL-NEXT: ret void +; + %add = add i32 %arg0, %arg1 + store i32 %add, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @ptr1_i8_i32_trailing_unused_kernel(ptr addrspace(1) %out, i8 %arg0, i32 %unused) { +; NO-PRELOAD-LABEL: define {{[^@]+}}@ptr1_i8_i32_trailing_unused_kernel +; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i8 [[ARG0:%.*]], i32 [[UNUSED:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[PTR1_I8_I32_TRAILING_UNUSED_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I8_I32_TRAILING_UNUSED_KERNEL_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PTR1_I8_I32_TRAILING_UNUSED_KERNEL_KERNARG_SEGMENT]], i64 8 +; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], 
align 8, !invariant.load [[META0]] +; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8 +; NO-PRELOAD-NEXT: [[EXT:%.*]] = zext i8 [[TMP2]] to i32 +; NO-PRELOAD-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-2-LABEL: define {{[^@]+}}@ptr1_i8_i32_trailing_unused_kernel +; PRELOAD-2-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i8 inreg [[ARG0:%.*]], i32 [[UNUSED:%.*]]) #[[ATTR0]] { +; PRELOAD-2-NEXT: [[PTR1_I8_I32_TRAILING_UNUSED_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-2-NEXT: [[EXT:%.*]] = zext i8 [[ARG0]] to i32 +; PRELOAD-2-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-2-NEXT: ret void +; +; PRELOAD-ALL-LABEL: define {{[^@]+}}@ptr1_i8_i32_trailing_unused_kernel +; PRELOAD-ALL-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i8 inreg [[ARG0:%.*]], i32 inreg [[UNUSED:%.*]]) #[[ATTR0]] { +; PRELOAD-ALL-NEXT: [[PTR1_I8_I32_TRAILING_UNUSED_KERNEL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-ALL-NEXT: [[EXT:%.*]] = zext i8 [[ARG0]] to i32 +; PRELOAD-ALL-NEXT: store i32 [[EXT]], ptr addrspace(1) [[OUT]], align 4 +; PRELOAD-ALL-NEXT: ret void +; + %ext = zext i8 %arg0 to i32 + store i32 %ext, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs-inreg-hints.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs-inreg-hints.ll deleted file mode 100644 index 20edbd6c0d0fa..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/preload-kernargs-inreg-hints.ll +++ /dev/null @@ -1,263 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=amdgpu-attributor -S < %s | FileCheck -check-prefix=NO-PRELOAD %s -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a 
-amdgpu-kernarg-preload-count=1 -passes=amdgpu-attributor -S < %s | FileCheck -check-prefix=PRELOAD-1 %s -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=3 -passes=amdgpu-attributor -S < %s | FileCheck -check-prefix=PRELOAD-3 %s -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=16 -passes=amdgpu-attributor -S < %s | FileCheck -check-prefix=PRELOAD-16 %s -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=20 -passes=amdgpu-attributor -S < %s | FileCheck -check-prefix=PRELOAD-20 %s - -define amdgpu_kernel void @test_preload_hint_kernel_1(ptr %0) #0 { -; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1 -; NO-PRELOAD-SAME: (ptr [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] { -; NO-PRELOAD-NEXT: ret void -; -; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1 -; PRELOAD-1-SAME: (ptr inreg [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] { -; PRELOAD-1-NEXT: ret void -; -; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1 -; PRELOAD-3-SAME: (ptr inreg [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] { -; PRELOAD-3-NEXT: ret void -; -; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1 -; PRELOAD-16-SAME: (ptr inreg [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] { -; PRELOAD-16-NEXT: ret void -; -; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1 -; PRELOAD-20-SAME: (ptr inreg [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] { -; PRELOAD-20-NEXT: ret void -; - ret void -} - -define amdgpu_kernel void @test_preload_hint_kernel_2(i32 %0, i64 %1) #0 { -; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2 -; NO-PRELOAD-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR0]] { -; NO-PRELOAD-NEXT: ret void -; -; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2 -; PRELOAD-1-SAME: (i32 inreg [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR0]] { -; PRELOAD-1-NEXT: ret void -; -; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2 -; PRELOAD-3-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg 
[[TMP1:%.*]]) #[[ATTR0]] { -; PRELOAD-3-NEXT: ret void -; -; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2 -; PRELOAD-16-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]]) #[[ATTR0]] { -; PRELOAD-16-NEXT: ret void -; -; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2 -; PRELOAD-20-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]]) #[[ATTR0]] { -; PRELOAD-20-NEXT: ret void -; - ret void -} - -define amdgpu_kernel void @test_preload_hint_kernel_4(i32 %0, i64 %1, <2 x float> %2, ptr %3) #0 { -; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_4 -; NO-PRELOAD-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]], <2 x float> [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0]] { -; NO-PRELOAD-NEXT: ret void -; -; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_4 -; PRELOAD-1-SAME: (i32 inreg [[TMP0:%.*]], i64 [[TMP1:%.*]], <2 x float> [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0]] { -; PRELOAD-1-NEXT: ret void -; -; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_4 -; PRELOAD-3-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]], <2 x float> inreg [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0]] { -; PRELOAD-3-NEXT: ret void -; -; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_4 -; PRELOAD-16-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]], <2 x float> inreg [[TMP2:%.*]], ptr inreg [[TMP3:%.*]]) #[[ATTR0]] { -; PRELOAD-16-NEXT: ret void -; -; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_4 -; PRELOAD-20-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]], <2 x float> inreg [[TMP2:%.*]], ptr inreg [[TMP3:%.*]]) #[[ATTR0]] { -; PRELOAD-20-NEXT: ret void -; - ret void -} - -define amdgpu_kernel void @test_preload_hint_kernel_18(i32 %0, i64 %1, <2 x float> %2, ptr %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13, i32 %14, i32 %15, i32 %16, i32 %17) #0 { -; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_18 -; NO-PRELOAD-SAME: (i32 [[TMP0:%.*]], i64 
[[TMP1:%.*]], <2 x float> [[TMP2:%.*]], ptr [[TMP3:%.*]], i32 [[TMP4:%.*]], i32 [[TMP5:%.*]], i32 [[TMP6:%.*]], i32 [[TMP7:%.*]], i32 [[TMP8:%.*]], i32 [[TMP9:%.*]], i32 [[TMP10:%.*]], i32 [[TMP11:%.*]], i32 [[TMP12:%.*]], i32 [[TMP13:%.*]], i32 [[TMP14:%.*]], i32 [[TMP15:%.*]], i32 [[TMP16:%.*]], i32 [[TMP17:%.*]]) #[[ATTR0]] { -; NO-PRELOAD-NEXT: ret void -; -; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_18 -; PRELOAD-1-SAME: (i32 inreg [[TMP0:%.*]], i64 [[TMP1:%.*]], <2 x float> [[TMP2:%.*]], ptr [[TMP3:%.*]], i32 [[TMP4:%.*]], i32 [[TMP5:%.*]], i32 [[TMP6:%.*]], i32 [[TMP7:%.*]], i32 [[TMP8:%.*]], i32 [[TMP9:%.*]], i32 [[TMP10:%.*]], i32 [[TMP11:%.*]], i32 [[TMP12:%.*]], i32 [[TMP13:%.*]], i32 [[TMP14:%.*]], i32 [[TMP15:%.*]], i32 [[TMP16:%.*]], i32 [[TMP17:%.*]]) #[[ATTR0]] { -; PRELOAD-1-NEXT: ret void -; -; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_18 -; PRELOAD-3-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]], <2 x float> inreg [[TMP2:%.*]], ptr [[TMP3:%.*]], i32 [[TMP4:%.*]], i32 [[TMP5:%.*]], i32 [[TMP6:%.*]], i32 [[TMP7:%.*]], i32 [[TMP8:%.*]], i32 [[TMP9:%.*]], i32 [[TMP10:%.*]], i32 [[TMP11:%.*]], i32 [[TMP12:%.*]], i32 [[TMP13:%.*]], i32 [[TMP14:%.*]], i32 [[TMP15:%.*]], i32 [[TMP16:%.*]], i32 [[TMP17:%.*]]) #[[ATTR0]] { -; PRELOAD-3-NEXT: ret void -; -; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_18 -; PRELOAD-16-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]], <2 x float> inreg [[TMP2:%.*]], ptr inreg [[TMP3:%.*]], i32 inreg [[TMP4:%.*]], i32 inreg [[TMP5:%.*]], i32 inreg [[TMP6:%.*]], i32 inreg [[TMP7:%.*]], i32 inreg [[TMP8:%.*]], i32 inreg [[TMP9:%.*]], i32 inreg [[TMP10:%.*]], i32 inreg [[TMP11:%.*]], i32 inreg [[TMP12:%.*]], i32 inreg [[TMP13:%.*]], i32 inreg [[TMP14:%.*]], i32 inreg [[TMP15:%.*]], i32 [[TMP16:%.*]], i32 [[TMP17:%.*]]) #[[ATTR0]] { -; PRELOAD-16-NEXT: ret void -; -; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_18 -; PRELOAD-20-SAME: (i32 inreg 
[[TMP0:%.*]], i64 inreg [[TMP1:%.*]], <2 x float> inreg [[TMP2:%.*]], ptr inreg [[TMP3:%.*]], i32 inreg [[TMP4:%.*]], i32 inreg [[TMP5:%.*]], i32 inreg [[TMP6:%.*]], i32 inreg [[TMP7:%.*]], i32 inreg [[TMP8:%.*]], i32 inreg [[TMP9:%.*]], i32 inreg [[TMP10:%.*]], i32 inreg [[TMP11:%.*]], i32 inreg [[TMP12:%.*]], i32 inreg [[TMP13:%.*]], i32 inreg [[TMP14:%.*]], i32 inreg [[TMP15:%.*]], i32 [[TMP16:%.*]], i32 [[TMP17:%.*]]) #[[ATTR0]] { -; PRELOAD-20-NEXT: ret void -; - ret void -} - -define void @test_preload_hint_non_kernel_2(i32 %0, i64 %1) #0 { -; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_non_kernel_2 -; NO-PRELOAD-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR1:[0-9]+]] { -; NO-PRELOAD-NEXT: ret void -; -; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_non_kernel_2 -; PRELOAD-1-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR1:[0-9]+]] { -; PRELOAD-1-NEXT: ret void -; -; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_non_kernel_2 -; PRELOAD-3-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR1:[0-9]+]] { -; PRELOAD-3-NEXT: ret void -; -; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_non_kernel_2 -; PRELOAD-16-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR1:[0-9]+]] { -; PRELOAD-16-NEXT: ret void -; -; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_non_kernel_2 -; PRELOAD-20-SAME: (i32 [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR1:[0-9]+]] { -; PRELOAD-20-NEXT: ret void -; - ret void -} - -define amdgpu_kernel void @test_preload_hint_kernel_1_call_func(ptr %0) #0 { -; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_func -; NO-PRELOAD-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] { -; NO-PRELOAD-NEXT: call void @func(ptr [[TMP0]]) -; NO-PRELOAD-NEXT: ret void -; -; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_func -; PRELOAD-1-SAME: (ptr inreg [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] { -; PRELOAD-1-NEXT: call void @func(ptr [[TMP0]]) -; PRELOAD-1-NEXT: ret void -; -; PRELOAD-3-LABEL: define 
{{[^@]+}}@test_preload_hint_kernel_1_call_func -; PRELOAD-3-SAME: (ptr inreg [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] { -; PRELOAD-3-NEXT: call void @func(ptr [[TMP0]]) -; PRELOAD-3-NEXT: ret void -; -; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_func -; PRELOAD-16-SAME: (ptr inreg [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] { -; PRELOAD-16-NEXT: call void @func(ptr [[TMP0]]) -; PRELOAD-16-NEXT: ret void -; -; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_func -; PRELOAD-20-SAME: (ptr inreg [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] { -; PRELOAD-20-NEXT: call void @func(ptr [[TMP0]]) -; PRELOAD-20-NEXT: ret void -; - call void @func(ptr %0) - ret void -} - -define amdgpu_kernel void @test_preload_hint_kernel_1_call_intrinsic(i16 %0) #0 { -; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_intrinsic -; NO-PRELOAD-SAME: (i16 [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] { -; NO-PRELOAD-NEXT: call void @llvm.amdgcn.set.prio(i16 [[TMP0]]) -; NO-PRELOAD-NEXT: ret void -; -; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_intrinsic -; PRELOAD-1-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] { -; PRELOAD-1-NEXT: call void @llvm.amdgcn.set.prio(i16 [[TMP0]]) -; PRELOAD-1-NEXT: ret void -; -; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_intrinsic -; PRELOAD-3-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] { -; PRELOAD-3-NEXT: call void @llvm.amdgcn.set.prio(i16 [[TMP0]]) -; PRELOAD-3-NEXT: ret void -; -; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_intrinsic -; PRELOAD-16-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] { -; PRELOAD-16-NEXT: call void @llvm.amdgcn.set.prio(i16 [[TMP0]]) -; PRELOAD-16-NEXT: ret void -; -; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_intrinsic -; PRELOAD-20-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] { -; PRELOAD-20-NEXT: call void @llvm.amdgcn.set.prio(i16 [[TMP0]]) -; PRELOAD-20-NEXT: ret void -; - call void @llvm.amdgcn.set.prio(i16 
%0) - ret void -} - -define spir_kernel void @test_preload_hint_kernel_1_spir_cc(ptr %0) #0 { -; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_spir_cc -; NO-PRELOAD-SAME: (ptr [[TMP0:%.*]]) #[[ATTR0]] { -; NO-PRELOAD-NEXT: ret void -; -; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_spir_cc -; PRELOAD-1-SAME: (ptr [[TMP0:%.*]]) #[[ATTR0]] { -; PRELOAD-1-NEXT: ret void -; -; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_spir_cc -; PRELOAD-3-SAME: (ptr [[TMP0:%.*]]) #[[ATTR0]] { -; PRELOAD-3-NEXT: ret void -; -; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_spir_cc -; PRELOAD-16-SAME: (ptr [[TMP0:%.*]]) #[[ATTR0]] { -; PRELOAD-16-NEXT: ret void -; -; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_spir_cc -; PRELOAD-20-SAME: (ptr [[TMP0:%.*]]) #[[ATTR0]] { -; PRELOAD-20-NEXT: ret void -; - ret void -} - -define amdgpu_kernel void @test_preload_hint_kernel_2_preexisting(i32 inreg %0, i64 %1) #0 { -; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2_preexisting -; NO-PRELOAD-SAME: (i32 inreg [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR0]] { -; NO-PRELOAD-NEXT: ret void -; -; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2_preexisting -; PRELOAD-1-SAME: (i32 inreg [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR0]] { -; PRELOAD-1-NEXT: ret void -; -; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2_preexisting -; PRELOAD-3-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]]) #[[ATTR0]] { -; PRELOAD-3-NEXT: ret void -; -; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2_preexisting -; PRELOAD-16-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]]) #[[ATTR0]] { -; PRELOAD-16-NEXT: ret void -; -; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_2_preexisting -; PRELOAD-20-SAME: (i32 inreg [[TMP0:%.*]], i64 inreg [[TMP1:%.*]]) #[[ATTR0]] { -; PRELOAD-20-NEXT: ret void -; - ret void -} - -define amdgpu_kernel void 
@test_preload_hint_kernel_incompatible_attributes(ptr addrspace(4) byref(i32) %0, ptr nest %1) { -; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_incompatible_attributes -; NO-PRELOAD-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] { -; NO-PRELOAD-NEXT: ret void -; -; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_incompatible_attributes -; PRELOAD-1-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] { -; PRELOAD-1-NEXT: ret void -; -; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_incompatible_attributes -; PRELOAD-3-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] { -; PRELOAD-3-NEXT: ret void -; -; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_incompatible_attributes -; PRELOAD-16-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] { -; PRELOAD-16-NEXT: ret void -; -; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_incompatible_attributes -; PRELOAD-20-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] { -; PRELOAD-20-NEXT: ret void -; - ret void -} - -declare void @func(ptr) #0 -declare void @llvm.amdgcn.set.prio(i16) - -attributes #0 = { nounwind }