AMDGPU: Autogenerate checks in a test #168815
Conversation
commit-id:eb3ee40e
@llvm/pr-subscribers-backend-amdgpu

Author: Nicolai Hähnle (nhaehnle)

Changes

Stack:
Patch is 28.50 KiB, truncated to 20.00 KiB below. Full version: https://github.com/llvm/llvm-project/pull/168815.diff (1 file affected):
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f16.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f16.ll
index f71fdbdee527b..c9a013bd58322 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f16.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f16.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --prefix-filecheck-ir-name I --version 6
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -passes=slp-vectorizer < %s | FileCheck -check-prefixes=GCN,GFX8 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -passes=slp-vectorizer < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=slp-vectorizer < %s | FileCheck -check-prefixes=GCN,GFX9 %s
@@ -5,9 +6,18 @@
; FIXME: Should not vectorize on gfx8
-; GCN-LABEL: @fadd_combine_v2f16
-; GCN: fadd <2 x half>
define void @fadd_combine_v2f16(ptr addrspace(1) %arg) {
+; GCN-LABEL: define void @fadd_combine_v2f16(
+; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
+; GCN-NEXT: [[BB:.*:]]
+; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: [[TMP1:%.*]] = fadd <2 x half> [[TMP0]], splat (half 0xH3C00)
+; GCN-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: ret void
+;
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = zext i32 %tmp to i64
@@ -24,9 +34,18 @@ bb:
}
; FIXME: Should not vectorize on gfx8
-; GCN-LABEL: @fsub_combine_v2f16
-; GCN: fsub <2 x half>
define void @fsub_combine_v2f16(ptr addrspace(1) %arg) {
+; GCN-LABEL: define void @fsub_combine_v2f16(
+; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[BB:.*:]]
+; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: [[TMP1:%.*]] = fsub <2 x half> [[TMP0]], splat (half 0xH3C00)
+; GCN-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: ret void
+;
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = zext i32 %tmp to i64
@@ -43,9 +62,18 @@ bb:
}
; FIXME: Should not vectorize on gfx8
-; GCN-LABEL: @fmul_combine_v2f16
-; GCN: fmul <2 x half>
define void @fmul_combine_v2f16(ptr addrspace(1) %arg) {
+; GCN-LABEL: define void @fmul_combine_v2f16(
+; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[BB:.*:]]
+; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: [[TMP1:%.*]] = fmul <2 x half> [[TMP0]], splat (half 0xH3C00)
+; GCN-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: ret void
+;
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = zext i32 %tmp to i64
@@ -61,9 +89,18 @@ bb:
ret void
}
-; GCN-LABEL: @fdiv_combine_v2f16
-; GCN: fdiv <2 x half>
define void @fdiv_combine_v2f16(ptr addrspace(1) %arg) {
+; GCN-LABEL: define void @fdiv_combine_v2f16(
+; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[BB:.*:]]
+; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: [[TMP1:%.*]] = fdiv <2 x half> [[TMP0]], splat (half 0xH3C00)
+; GCN-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: ret void
+;
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = zext i32 %tmp to i64
@@ -79,9 +116,18 @@ bb:
ret void
}
-; GCN-LABEL: @frem_combine_v2f16
-; GCN: frem <2 x half>
define void @frem_combine_v2f16(ptr addrspace(1) %arg) {
+; GCN-LABEL: define void @frem_combine_v2f16(
+; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[BB:.*:]]
+; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: [[TMP1:%.*]] = frem <2 x half> [[TMP0]], splat (half 0xH3C00)
+; GCN-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: ret void
+;
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = zext i32 %tmp to i64
@@ -98,9 +144,18 @@ bb:
}
; FIXME: Should not vectorize on gfx8
-; GCN-LABEL: @fma_combine_v2f16
-; GCN: call <2 x half> @llvm.fma.v2f16
define amdgpu_kernel void @fma_combine_v2f16(ptr addrspace(1) %arg) {
+; GCN-LABEL: define amdgpu_kernel void @fma_combine_v2f16(
+; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[BB:.*:]]
+; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: [[TMP1:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[TMP0]], <2 x half> splat (half 0xH3C00), <2 x half> splat (half 0xH3C00))
+; GCN-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: ret void
+;
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = zext i32 %tmp to i64
@@ -117,9 +172,18 @@ bb:
}
; FIXME: Should not vectorize on gfx8
-; GCN-LABEL: @fmuladd_combine_v2f16
-; GCN: call <2 x half> @llvm.fmuladd.v2f16
define amdgpu_kernel void @fmuladd_combine_v2f16(ptr addrspace(1) %arg) {
+; GCN-LABEL: define amdgpu_kernel void @fmuladd_combine_v2f16(
+; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[BB:.*:]]
+; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: [[TMP1:%.*]] = call <2 x half> @llvm.fmuladd.v2f16(<2 x half> [[TMP0]], <2 x half> splat (half 0xH3C00), <2 x half> splat (half 0xH3C00))
+; GCN-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: ret void
+;
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = zext i32 %tmp to i64
@@ -135,12 +199,35 @@ bb:
ret void
}
-; GCN-LABEL: @minnum_combine_v2f16
-; GFX8: call half @llvm.minnum.f16(
-; GFX8: call half @llvm.minnum.f16(
-; GFX9: call <2 x half> @llvm.minnum.v2f16
define void @minnum_combine_v2f16(ptr addrspace(1) %arg) {
+; GFX8-LABEL: define void @minnum_combine_v2f16(
+; GFX8-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GFX8-NEXT: [[BB:.*:]]
+; GFX8-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GFX8-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GFX8-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GFX8-NEXT: [[ITMP3:%.*]] = load half, ptr addrspace(1) [[ITMP2]], align 2
+; GFX8-NEXT: [[ITMP4:%.*]] = call half @llvm.minnum.f16(half [[ITMP3]], half 0xH3C00)
+; GFX8-NEXT: store half [[ITMP4]], ptr addrspace(1) [[ITMP2]], align 2
+; GFX8-NEXT: [[ITMP5:%.*]] = add nuw nsw i64 [[ITMP1]], 1
+; GFX8-NEXT: [[ITMP6:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP5]]
+; GFX8-NEXT: [[ITMP7:%.*]] = load half, ptr addrspace(1) [[ITMP6]], align 2
+; GFX8-NEXT: [[ITMP8:%.*]] = call half @llvm.minnum.f16(half [[ITMP7]], half 0xH3C00)
+; GFX8-NEXT: store half [[ITMP8]], ptr addrspace(1) [[ITMP6]], align 2
+; GFX8-NEXT: ret void
+;
+; GFX9-LABEL: define void @minnum_combine_v2f16(
+; GFX9-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GFX9-NEXT: [[BB:.*:]]
+; GFX9-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GFX9-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GFX9-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GFX9-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GFX9-NEXT: [[TMP1:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[TMP0]], <2 x half> splat (half 0xH3C00))
+; GFX9-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GFX9-NEXT: ret void
+;
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = zext i32 %tmp to i64
@@ -156,12 +243,35 @@ bb:
ret void
}
-; GCN-LABEL: @maxnum_combine_v2f16
-; GFX8: call half @llvm.maxnum.f16(
-; GFX8: call half @llvm.maxnum.f16(
-; GFX9: call <2 x half> @llvm.maxnum.v2f16
define void @maxnum_combine_v2f16(ptr addrspace(1) %arg) {
+; GFX8-LABEL: define void @maxnum_combine_v2f16(
+; GFX8-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GFX8-NEXT: [[BB:.*:]]
+; GFX8-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GFX8-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GFX8-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GFX8-NEXT: [[ITMP3:%.*]] = load half, ptr addrspace(1) [[ITMP2]], align 2
+; GFX8-NEXT: [[ITMP4:%.*]] = call half @llvm.maxnum.f16(half [[ITMP3]], half 0xH3C00)
+; GFX8-NEXT: store half [[ITMP4]], ptr addrspace(1) [[ITMP2]], align 2
+; GFX8-NEXT: [[ITMP5:%.*]] = add nuw nsw i64 [[ITMP1]], 1
+; GFX8-NEXT: [[ITMP6:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP5]]
+; GFX8-NEXT: [[ITMP7:%.*]] = load half, ptr addrspace(1) [[ITMP6]], align 2
+; GFX8-NEXT: [[ITMP8:%.*]] = call half @llvm.maxnum.f16(half [[ITMP7]], half 0xH3C00)
+; GFX8-NEXT: store half [[ITMP8]], ptr addrspace(1) [[ITMP6]], align 2
+; GFX8-NEXT: ret void
+;
+; GFX9-LABEL: define void @maxnum_combine_v2f16(
+; GFX9-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GFX9-NEXT: [[BB:.*:]]
+; GFX9-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GFX9-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GFX9-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GFX9-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GFX9-NEXT: [[TMP1:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[TMP0]], <2 x half> splat (half 0xH3C00))
+; GFX9-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GFX9-NEXT: ret void
+;
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = zext i32 %tmp to i64
@@ -178,10 +288,23 @@ bb:
}
; FIXME: Should vectorize
-; GCN-LABEL: @minimum_combine_v2f16
-; GCN: call half @llvm.minimum.f16(
-; GCN: call half @llvm.minimum.f16(
define void @minimum_combine_v2f16(ptr addrspace(1) %arg) {
+; GCN-LABEL: define void @minimum_combine_v2f16(
+; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[BB:.*:]]
+; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GCN-NEXT: [[ITMP3:%.*]] = load half, ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: [[ITMP4:%.*]] = call half @llvm.minimum.f16(half [[ITMP3]], half 0xH3C00)
+; GCN-NEXT: store half [[ITMP4]], ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: [[ITMP5:%.*]] = add nuw nsw i64 [[ITMP1]], 1
+; GCN-NEXT: [[ITMP6:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP5]]
+; GCN-NEXT: [[ITMP7:%.*]] = load half, ptr addrspace(1) [[ITMP6]], align 2
+; GCN-NEXT: [[ITMP8:%.*]] = call half @llvm.minimum.f16(half [[ITMP7]], half 0xH3C00)
+; GCN-NEXT: store half [[ITMP8]], ptr addrspace(1) [[ITMP6]], align 2
+; GCN-NEXT: ret void
+;
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = zext i32 %tmp to i64
@@ -197,10 +320,23 @@ bb:
ret void
}
-; GCN-LABEL: @maximum_combine_v2f16
-; GCN: call half @llvm.maximum.f16(
-; GCN: call half @llvm.maximum.f16(
define void @maximum_combine_v2f16(ptr addrspace(1) %arg) {
+; GCN-LABEL: define void @maximum_combine_v2f16(
+; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[BB:.*:]]
+; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GCN-NEXT: [[ITMP3:%.*]] = load half, ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: [[ITMP4:%.*]] = call half @llvm.maximum.f16(half [[ITMP3]], half 0xH3C00)
+; GCN-NEXT: store half [[ITMP4]], ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: [[ITMP5:%.*]] = add nuw nsw i64 [[ITMP1]], 1
+; GCN-NEXT: [[ITMP6:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP5]]
+; GCN-NEXT: [[ITMP7:%.*]] = load half, ptr addrspace(1) [[ITMP6]], align 2
+; GCN-NEXT: [[ITMP8:%.*]] = call half @llvm.maximum.f16(half [[ITMP7]], half 0xH3C00)
+; GCN-NEXT: store half [[ITMP8]], ptr addrspace(1) [[ITMP6]], align 2
+; GCN-NEXT: ret void
+;
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = zext i32 %tmp to i64
@@ -216,9 +352,18 @@ bb:
ret void
}
-; GCN-LABEL: @canonicalize_combine_v2f16
-; GCN: call <2 x half> @llvm.canonicalize.v2f16(
define void @canonicalize_combine_v2f16(ptr addrspace(1) %arg) {
+; GCN-LABEL: define void @canonicalize_combine_v2f16(
+; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[BB:.*:]]
+; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: [[TMP1:%.*]] = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> [[TMP0]])
+; GCN-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: ret void
+;
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = zext i32 %tmp to i64
@@ -234,9 +379,18 @@ bb:
ret void
}
-; GCN-LABEL: @fabs_combine_v2f16
-; GCN: call <2 x half> @llvm.fabs.v2f16(
define void @fabs_combine_v2f16(ptr addrspace(1) %arg) {
+; GCN-LABEL: define void @fabs_combine_v2f16(
+; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[BB:.*:]]
+; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: [[TMP1:%.*]] = call <2 x half> @llvm.fabs.v2f16(<2 x half> [[TMP0]])
+; GCN-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: ret void
+;
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = zext i32 %tmp to i64
@@ -252,9 +406,18 @@ bb:
ret void
}
-; GCN-LABEL: @fneg_combine_v2f16
-; GCN: fneg <2 x half>
define void @fneg_combine_v2f16(ptr addrspace(1) %arg) {
+; GCN-LABEL: define void @fneg_combine_v2f16(
+; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GCN-NEXT: [[BB:.*:]]
+; GCN-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GCN-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GCN-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: [[TMP1:%.*]] = fneg <2 x half> [[TMP0]]
+; GCN-NEXT: store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT: ret void
+;
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = zext i32 %tmp to i64
@@ -270,11 +433,36 @@ bb:
ret void
}
-; GCN-LABEL: @copysign_combine_v2f16
-; GFX8: call half @llvm.copysign.f16(
-; GFX8: call half @llvm.copysign.f16(
-; GFX9: call <2 x half> @llvm.copysign.v2f16(
define void @copysign_combine_v2f16(ptr addrspace(1) %arg, half %sign) {
+; GFX8-LABEL: define void @copysign_combine_v2f16(
+; GFX8-SAME: ptr addrspace(1) [[ARG:%.*]], half [[SIGN:%.*]]) #[[ATTR0]] {
+; GFX8-NEXT: [[BB:.*:]]
+; GFX8-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GFX8-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GFX8-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GFX8-NEXT: [[ITMP3:%.*]] = load half, ptr addrspace(1) [[ITMP2]], align 2
+; GFX8-NEXT: [[ITMP4:%.*]] = call half @llvm.copysign.f16(half [[ITMP3]], half [[SIGN]])
+; GFX8-NEXT: store half [[ITMP4]], ptr addrspace(1) [[ITMP2]], align 2
+; GFX8-NEXT: [[ITMP5:%.*]] = add nuw nsw i64 [[ITMP1]], 1
+; GFX8-NEXT: [[ITMP6:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP5]]
+; GFX8-NEXT: [[ITMP7:%.*]] = load half, ptr addrspace(1) [[ITMP6]], align 2
+; GFX8-NEXT: [[ITMP8:%.*]] = call half @llvm.copysign.f16(half [[ITMP7]], half [[SIGN]])
+; GFX8-NEXT: store half [[ITMP8]], ptr addrspace(1) [[ITMP6]], align 2
+; GFX8-NEXT: ret void
+;
+; GFX9-LABEL: define void @copysign_combine_v2f16(
+; GFX9-SAME: ptr addrspace(1) [[ARG:%.*]], half [[SIGN:%.*]]) #[[ATTR0]] {
+; GFX9-NEXT: [[BB:.*:]]
+; GFX9-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GFX9-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GFX9-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GFX9-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GFX9-NEXT: [[TMP1:%.*]] = insertelement <2 x half> poison, half [[SIGN]], i32 0
+; GFX9-NEXT: [[TMP2:%.*]] = shufflevector <2 x half> [[TMP1]], <2 x half> poison, <2 x i32> zeroinitializer
+; GFX9-NEXT: [[TMP3:%.*]] = call <2 x half> @llvm.copysign.v2f16(<2 x half> [[TMP0]], <2 x half> [[TMP2]])
+; GFX9-NEXT: store <2 x half> [[TMP3]], ptr addrspace(1) [[ITMP2]], align 2
+; GFX9-NEXT: ret void
+;
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = zext i32 %tmp to i64
@@ -291,12 +479,59 @@ bb:
}
; FIXME: Should always vectorize
-; GCN-LABEL: @copysign_combine_v4f16
-; GFX8: call half @llvm.copysign.f16(
-; GFX8: call half @llvm.copysign.f16(
-; GFX9: call <2 x half> @llvm.copysign.v2f16(
define void @copysign_combine_v4f16(ptr addrspace(1) %arg, half %sign) {
+; GFX8-LABEL: define void @copysign_combine_v4f16(
+; GFX8-SAME: ptr addrspace(1) [[ARG:%.*]], half [[SIGN:%.*]]) #[[ATTR0]] {
+; GFX8-NEXT: [[BB:.*:]]
+; GFX8-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GFX8-NEXT: [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GFX8-NEXT: [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GFX8-NEXT: [[ITMP3:%.*]] = load half, ptr addrspace(1) [[ITMP2]], align 2
+; GFX8-NEXT: [[ITMP4:%.*]] = call half @llvm.copysign.f16(half [[ITMP3]], half [[SIGN]])
+; GFX8-NEXT: store half [[ITMP4]], ptr addrspace(1) [[ITMP2]], align 2
+; GFX8-NEXT: [[ITMP5:%.*]] = add nuw nsw i64 [[ITMP1]], 1
+; GFX8-NEXT: [[ITMP6:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP5]]
+; GFX8-NEXT: [[ITMP7:%.*]] = load half, ptr addrspace(1) [[ITMP6]], align 2
+; GFX8-NEXT: [[ITMP8:%.*]] = call half @llvm.copysign.f16(half [[ITMP7]], half [[SIGN]])
+; GFX8-NEXT: store...
[truncated]
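For context: the assertions above were generated by LLVM's utils/update_test_checks.py, as recorded in the NOTE line and its UTC_ARGS. A plausible way to regenerate them, assuming an in-tree checkout with a build directory at build/ (both paths are assumptions, not taken from the PR):

  # Reruns the test's own RUN lines through opt and rewrites the CHECK lines in place.
  # --prefix-filecheck-ir-name I prepends "I" to FileCheck variables derived from IR
  # value names so they cannot collide with the script's own TMP0/TMP1 placeholders
  # (hence the ITMP1, ITMP2, ... captures above).
  llvm/utils/update_test_checks.py \
    --opt-binary=build/bin/opt \
    --prefix-filecheck-ir-name I --version 6 \
    llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f16.ll

Two reading aids for the generated checks: the RUN lines pass FileCheck comma-separated -check-prefixes, so GCN lines are verified for every target while GFX8/GFX9 lines apply only to their respective runs; and 0xH3C00 is the IEEE-754 half-precision encoding of 1.0, so splat (half 0xH3C00) is simply the vector <1.0, 1.0>.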
@llvm/pr-subscribers-llvm-transforms

Author: Nicolai Hähnle (nhaehnle)

(Identical patch summary and truncated diff to the backend-amdgpu subscriber comment above.)
(no review expected)