[AMDGPU] Pre-commit tests for preload kernarg move to attributor #123546
Conversation
@llvm/pr-subscribers-backend-amdgpu

Author: Austin Kerbow (kerbowa)

Changes

NFC. Since the handling will be moved to the middle end, the tests need to be reorganized. The IR tests are expanded since end-to-end testing is no longer baked in.

Patch is 279.86 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/123546.diff

4 Files Affected:
diff --git a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-IR-lowering.ll b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-IR-lowering.ll
index aeb7faade47150..180f01257f1f6d 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-IR-lowering.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-IR-lowering.ll
@@ -1,20 +1,72 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes='amdgpu-attributor,function(amdgpu-lower-kernel-arguments)' -S < %s | FileCheck -check-prefix=NO-PRELOAD %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes='amdgpu-attributor,function(amdgpu-lower-kernel-arguments)' -amdgpu-kernarg-preload-count=16 -S < %s | FileCheck -check-prefix=PRELOAD %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -S < %s | FileCheck -check-prefix=NO-PRELOAD %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -amdgpu-kernarg-preload-count=100 -S < %s | FileCheck -check-prefix=PRELOAD %s
+
+define amdgpu_kernel void @incompatible_attribute_block_count_x(ptr addrspace(1) byref(i32) %out) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@incompatible_attribute_block_count_x
+; NO-PRELOAD-SAME: (ptr addrspace(1) byref(i32) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; NO-PRELOAD-NEXT: [[INCOMPATIBLE_ATTRIBUTE_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[INCOMPATIBLE_ATTRIBUTE_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[OUT_BYVAL_KERNARG_OFFSET]] to ptr addrspace(1)
+; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
+; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[TMP1]], align 4
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-LABEL: define {{[^@]+}}@incompatible_attribute_block_count_x
+; PRELOAD-SAME: (ptr addrspace(1) byref(i32) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; PRELOAD-NEXT: [[INCOMPATIBLE_ATTRIBUTE_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-NEXT: [[OUT_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[INCOMPATIBLE_ATTRIBUTE_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0
+; PRELOAD-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[OUT_BYVAL_KERNARG_OFFSET]] to ptr addrspace(1)
+; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
+; PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[TMP1]], align 4
+; PRELOAD-NEXT: ret void
+;
+ %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %load = load i32, ptr addrspace(4) %imp_arg_ptr
+ store i32 %load, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @preload_aggregate_arg_block_count_x(ptr addrspace(1) %out, { i32, i32 } inreg) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_aggregate_arg_block_count_x
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], { i32, i32 } inreg [[TMP0:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PRELOAD_AGGREGATE_ARG_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_AGGREGATE_ARG_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0:![0-9]+]]
+; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
+; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-LABEL: define {{[^@]+}}@preload_aggregate_arg_block_count_x
+; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], { i32, i32 } inreg [[TMP0:%.*]]) #[[ATTR0]] {
+; PRELOAD-NEXT: [[PRELOAD_AGGREGATE_ARG_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
+; PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-NEXT: ret void
+;
+ %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %load = load i32, ptr addrspace(4) %imp_arg_ptr
+ store i32 %load, ptr addrspace(1) %out
+ ret void
+}
define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) %out) {
-; NO-PRELOAD-LABEL: define amdgpu_kernel void @preload_block_count_x(
-; NO-PRELOAD-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_x
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; NO-PRELOAD-NEXT: [[PRELOAD_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0
-; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0:![0-9]+]]
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
; NO-PRELOAD-NEXT: ret void
;
-; PRELOAD-LABEL: define amdgpu_kernel void @preload_block_count_x(
-; PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]]) #[[ATTR0:[0-9]+]] {
+; PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_x
+; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]]) #[[ATTR0]] {
; PRELOAD-NEXT: [[PRELOAD_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
@@ -27,9 +79,34 @@ define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) %out) {
ret void
}
-define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) %out, i512) {
-; NO-PRELOAD-LABEL: define amdgpu_kernel void @no_free_sgprs_block_count_x(
-; NO-PRELOAD-SAME: ptr addrspace(1) [[OUT:%.*]], i512 [[TMP0:%.*]]) #[[ATTR0]] {
+define amdgpu_kernel void @preload_unused_arg_block_count_x(ptr addrspace(1) %out, i32 inreg) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_unused_arg_block_count_x
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 inreg [[TMP0:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PRELOAD_UNUSED_ARG_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_UNUSED_ARG_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
+; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-LABEL: define {{[^@]+}}@preload_unused_arg_block_count_x
+; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg [[TMP0:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]]) #[[ATTR0]] {
+; PRELOAD-NEXT: [[PRELOAD_UNUSED_ARG_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
+; PRELOAD-NEXT: store i32 [[_HIDDEN_BLOCK_COUNT_X]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-NEXT: ret void
+;
+ %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %load = load i32, ptr addrspace(4) %imp_arg_ptr
+ store i32 %load, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) %out, i512 inreg) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@no_free_sgprs_block_count_x
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i512 inreg [[TMP0:%.*]]) #[[ATTR0]] {
; NO-PRELOAD-NEXT: [[NO_FREE_SGPRS_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(328) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[NO_FREE_SGPRS_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0
; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
@@ -38,8 +115,8 @@ define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) %out, i5
; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
; NO-PRELOAD-NEXT: ret void
;
-; PRELOAD-LABEL: define amdgpu_kernel void @no_free_sgprs_block_count_x(
-; PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]], i512 inreg [[TMP0:%.*]]) #[[ATTR0]] {
+; PRELOAD-LABEL: define {{[^@]+}}@no_free_sgprs_block_count_x
+; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i512 inreg [[TMP0:%.*]]) #[[ATTR0]] {
; PRELOAD-NEXT: [[NO_FREE_SGPRS_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(328) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
@@ -52,31 +129,335 @@ define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) %out, i5
ret void
}
-define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) %out) {
-; NO-PRELOAD-LABEL: define amdgpu_kernel void @preloadremainder_z(
-; NO-PRELOAD-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; NO-PRELOAD-NEXT: [[PRELOADREMAINDER_Z_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOADREMAINDER_Z_KERNARG_SEGMENT]], i64 0
+define amdgpu_kernel void @mixed_inreg_block_count_x(ptr addrspace(1) %out, i32 inreg, i32) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@mixed_inreg_block_count_x
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 inreg [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[MIXED_INREG_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[MIXED_INREG_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0
; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
-; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 22
+; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
+; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-LABEL: define {{[^@]+}}@mixed_inreg_block_count_x
+; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg [[TMP0:%.*]], i32 inreg [[TMP1:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]]) #[[ATTR0]] {
+; PRELOAD-NEXT: [[MIXED_INREG_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(272) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4
+; PRELOAD-NEXT: store i32 [[_HIDDEN_BLOCK_COUNT_X]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-NEXT: ret void
+;
+ %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %load = load i32, ptr addrspace(4) %imp_arg_ptr
+ store i32 %load, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @incorrect_type_i64_block_count_x(ptr addrspace(1) %out) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@incorrect_type_i64_block_count_x
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[INCORRECT_TYPE_I64_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[INCORRECT_TYPE_I64_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i64, ptr addrspace(4) [[IMP_ARG_PTR]], align 8
+; NO-PRELOAD-NEXT: store i64 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 8
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-LABEL: define {{[^@]+}}@incorrect_type_i64_block_count_x
+; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] {
+; PRELOAD-NEXT: [[INCORRECT_TYPE_I64_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; PRELOAD-NEXT: [[LOAD:%.*]] = load i64, ptr addrspace(4) [[IMP_ARG_PTR]], align 8
+; PRELOAD-NEXT: store i64 [[LOAD]], ptr addrspace(1) [[OUT]], align 8
+; PRELOAD-NEXT: ret void
+;
+ %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %load = load i64, ptr addrspace(4) %imp_arg_ptr
+ store i64 %load, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @incorrect_type_i16_block_count_x(ptr addrspace(1) %out) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@incorrect_type_i16_block_count_x
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[INCORRECT_TYPE_I16_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[INCORRECT_TYPE_I16_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[IMP_ARG_PTR]], align 2
+; NO-PRELOAD-NEXT: store i16 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 2
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-LABEL: define {{[^@]+}}@incorrect_type_i16_block_count_x
+; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] {
+; PRELOAD-NEXT: [[INCORRECT_TYPE_I16_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[IMP_ARG_PTR]], align 2
+; PRELOAD-NEXT: store i16 [[LOAD]], ptr addrspace(1) [[OUT]], align 2
+; PRELOAD-NEXT: ret void
+;
+ %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %load = load i16, ptr addrspace(4) %imp_arg_ptr
+ store i16 %load, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @preload_block_count_y(ptr addrspace(1) %out) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_y
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[PRELOAD_BLOCK_COUNT_Y_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_BLOCK_COUNT_Y_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 4
+; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[GEP]], align 4
+; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-LABEL: define {{[^@]+}}@preload_block_count_y
+; PRELOAD-SAME: (ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]]) #[[ATTR0]] {
+; PRELOAD-NEXT: [[PRELOAD_BLOCK_COUNT_Y_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 4
+; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[GEP]], align 4
+; PRELOAD-NEXT: store i32 [[_HIDDEN_BLOCK_COUNT_Y]], ptr addrspace(1) [[OUT]], align 4
+; PRELOAD-NEXT: ret void
+;
+ %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 4
+ %load = load i32, ptr addrspace(4) %gep
+ store i32 %load, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) %out) {
+; NO-PRELOAD-LABEL: define {{[^@]+}}@random_incorrect_offset
+; NO-PRELOAD-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; NO-PRELOAD-NEXT: [[RANDOM_INCORRECT_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[RANDOM_INCORRECT_OFFSET_KERNARG_SEGMENT]], i64 0
+; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
+; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 2
+; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[GEP]], align 4
+; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
+; NO-PRELOAD-NEXT: ret void
+;
+; PRELOAD-LABE...
[truncated]
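For orientation, the transformation these tests pin down: with -amdgpu-kernarg-preload-count set, the lowering appends hidden kernel arguments marked inreg "amdgpu-hidden-argument" and rewrites loads through @llvm.amdgcn.implicitarg.ptr() into direct uses of them. A minimal before/after sketch distilled from the preload_block_count_x checks above (not the exact autogenerated output; dead instructions that the lowering leaves behind are elided here):

define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) %out) {
  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
  %load = load i32, ptr addrspace(4) %imp_arg_ptr
  store i32 %load, ptr addrspace(1) %out
  ret void
}

becomes, per the PRELOAD check lines:

define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) inreg %out, i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x) {
  store i32 %_hidden_block_count_x, ptr addrspace(1) %out
  ret void
}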
;
  %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
  %load = load i32, ptr addrspace(4) %imp_arg_ptr
  store i32 %load, ptr addrspace(1) %out
Why is this storing to a byref kernel argument?
  ret void
}
define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) %out, i512 inreg) {
Probably should use something more normal than i512 for this, maybe <16 x i32>?
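For illustration, the suggested signature might read as follows (a hypothetical rewrite of the filler argument with the same 512-bit width, not part of this patch):

define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) %out, <16 x i32> inreg) {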
Force-pushed from da3258e to 90a7741.

NFC. Since the handling will be moved to the middle end, the tests need to be reorganized. The IR tests are expanded since end-to-end testing is no longer baked in.