Conversation

nhaehnle (Collaborator) commented Nov 20, 2025

llvmbot (Member) commented Nov 20, 2025

@llvm/pr-subscribers-backend-amdgpu

Author: Nicolai Hähnle (nhaehnle)

Changes

Stack:

  • [6/6] #168820
  • [5/6] #168819
  • [4/6] #168818
  • [3/6] #168817
  • [2/6] #168816
  • [1/6] #168815 ⬅

⚠️ Part of a stack created by spr. Merging this PR using the GitHub UI may have unexpected results.
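As an aside, a stack like this is normally landed through the spr CLI rather than the GitHub merge button, which is why the warning above exists. A minimal sketch, assuming the getcord/spr tool that the LLVM developer documentation describes (both subcommands below are assumptions about that CLI, not something recorded in this PR):

    # Create/update the stacked PRs from the local commit stack, then land
    # the bottom-most PR (assumed spr subcommands; verify against spr --help).
    spr diff
    spr land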


Patch is 28.50 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/168815.diff

1 file affected:

  • (modified) llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f16.ll (+345-52)
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f16.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f16.ll
index f71fdbdee527b..c9a013bd58322 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f16.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f16.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --prefix-filecheck-ir-name I --version 6
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -passes=slp-vectorizer < %s | FileCheck -check-prefixes=GCN,GFX8 %s
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -passes=slp-vectorizer < %s | FileCheck -check-prefixes=GCN,GFX9 %s
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=slp-vectorizer < %s | FileCheck -check-prefixes=GCN,GFX9 %s
@@ -5,9 +6,18 @@
 
 ; FIXME: Should not vectorize on gfx8
 
-; GCN-LABEL: @fadd_combine_v2f16
-; GCN: fadd <2 x half>
 define void @fadd_combine_v2f16(ptr addrspace(1) %arg) {
+; GCN-LABEL: define void @fadd_combine_v2f16(
+; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
+; GCN-NEXT:  [[BB:.*:]]
+; GCN-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT:    [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GCN-NEXT:    [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GCN-NEXT:    [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT:    [[TMP1:%.*]] = fadd <2 x half> [[TMP0]], splat (half 0xH3C00)
+; GCN-NEXT:    store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT:    ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
@@ -24,9 +34,18 @@ bb:
 }
 
 ; FIXME: Should not vectorize on gfx8
-; GCN-LABEL: @fsub_combine_v2f16
-; GCN: fsub <2 x half>
 define void @fsub_combine_v2f16(ptr addrspace(1) %arg) {
+; GCN-LABEL: define void @fsub_combine_v2f16(
+; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GCN-NEXT:  [[BB:.*:]]
+; GCN-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT:    [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GCN-NEXT:    [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GCN-NEXT:    [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT:    [[TMP1:%.*]] = fsub <2 x half> [[TMP0]], splat (half 0xH3C00)
+; GCN-NEXT:    store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT:    ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
@@ -43,9 +62,18 @@ bb:
 }
 
 ; FIXME: Should not vectorize on gfx8
-; GCN-LABEL: @fmul_combine_v2f16
-; GCN: fmul <2 x half>
 define void @fmul_combine_v2f16(ptr addrspace(1) %arg) {
+; GCN-LABEL: define void @fmul_combine_v2f16(
+; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GCN-NEXT:  [[BB:.*:]]
+; GCN-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT:    [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GCN-NEXT:    [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GCN-NEXT:    [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT:    [[TMP1:%.*]] = fmul <2 x half> [[TMP0]], splat (half 0xH3C00)
+; GCN-NEXT:    store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT:    ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
@@ -61,9 +89,18 @@ bb:
   ret void
 }
 
-; GCN-LABEL: @fdiv_combine_v2f16
-; GCN: fdiv <2 x half>
 define void @fdiv_combine_v2f16(ptr addrspace(1) %arg) {
+; GCN-LABEL: define void @fdiv_combine_v2f16(
+; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GCN-NEXT:  [[BB:.*:]]
+; GCN-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT:    [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GCN-NEXT:    [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GCN-NEXT:    [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT:    [[TMP1:%.*]] = fdiv <2 x half> [[TMP0]], splat (half 0xH3C00)
+; GCN-NEXT:    store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT:    ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
@@ -79,9 +116,18 @@ bb:
   ret void
 }
 
-; GCN-LABEL: @frem_combine_v2f16
-; GCN: frem <2 x half>
 define void @frem_combine_v2f16(ptr addrspace(1) %arg) {
+; GCN-LABEL: define void @frem_combine_v2f16(
+; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GCN-NEXT:  [[BB:.*:]]
+; GCN-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT:    [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GCN-NEXT:    [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GCN-NEXT:    [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT:    [[TMP1:%.*]] = frem <2 x half> [[TMP0]], splat (half 0xH3C00)
+; GCN-NEXT:    store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT:    ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
@@ -98,9 +144,18 @@ bb:
 }
 
 ; FIXME: Should not vectorize on gfx8
-; GCN-LABEL: @fma_combine_v2f16
-; GCN: call <2 x half> @llvm.fma.v2f16
 define amdgpu_kernel void @fma_combine_v2f16(ptr addrspace(1) %arg) {
+; GCN-LABEL: define amdgpu_kernel void @fma_combine_v2f16(
+; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GCN-NEXT:  [[BB:.*:]]
+; GCN-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT:    [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GCN-NEXT:    [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GCN-NEXT:    [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT:    [[TMP1:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[TMP0]], <2 x half> splat (half 0xH3C00), <2 x half> splat (half 0xH3C00))
+; GCN-NEXT:    store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT:    ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
@@ -117,9 +172,18 @@ bb:
 }
 
 ; FIXME: Should not vectorize on gfx8
-; GCN-LABEL: @fmuladd_combine_v2f16
-; GCN: call <2 x half> @llvm.fmuladd.v2f16
 define amdgpu_kernel void @fmuladd_combine_v2f16(ptr addrspace(1) %arg) {
+; GCN-LABEL: define amdgpu_kernel void @fmuladd_combine_v2f16(
+; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GCN-NEXT:  [[BB:.*:]]
+; GCN-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT:    [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GCN-NEXT:    [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GCN-NEXT:    [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT:    [[TMP1:%.*]] = call <2 x half> @llvm.fmuladd.v2f16(<2 x half> [[TMP0]], <2 x half> splat (half 0xH3C00), <2 x half> splat (half 0xH3C00))
+; GCN-NEXT:    store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT:    ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
@@ -135,12 +199,35 @@ bb:
   ret void
 }
 
-; GCN-LABEL: @minnum_combine_v2f16
-; GFX8: call half @llvm.minnum.f16(
-; GFX8: call half @llvm.minnum.f16(
 
-; GFX9: call <2 x half> @llvm.minnum.v2f16
 define void @minnum_combine_v2f16(ptr addrspace(1) %arg) {
+; GFX8-LABEL: define void @minnum_combine_v2f16(
+; GFX8-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GFX8-NEXT:  [[BB:.*:]]
+; GFX8-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GFX8-NEXT:    [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GFX8-NEXT:    [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GFX8-NEXT:    [[ITMP3:%.*]] = load half, ptr addrspace(1) [[ITMP2]], align 2
+; GFX8-NEXT:    [[ITMP4:%.*]] = call half @llvm.minnum.f16(half [[ITMP3]], half 0xH3C00)
+; GFX8-NEXT:    store half [[ITMP4]], ptr addrspace(1) [[ITMP2]], align 2
+; GFX8-NEXT:    [[ITMP5:%.*]] = add nuw nsw i64 [[ITMP1]], 1
+; GFX8-NEXT:    [[ITMP6:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP5]]
+; GFX8-NEXT:    [[ITMP7:%.*]] = load half, ptr addrspace(1) [[ITMP6]], align 2
+; GFX8-NEXT:    [[ITMP8:%.*]] = call half @llvm.minnum.f16(half [[ITMP7]], half 0xH3C00)
+; GFX8-NEXT:    store half [[ITMP8]], ptr addrspace(1) [[ITMP6]], align 2
+; GFX8-NEXT:    ret void
+;
+; GFX9-LABEL: define void @minnum_combine_v2f16(
+; GFX9-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GFX9-NEXT:  [[BB:.*:]]
+; GFX9-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GFX9-NEXT:    [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GFX9-NEXT:    [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GFX9-NEXT:    [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GFX9-NEXT:    [[TMP1:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[TMP0]], <2 x half> splat (half 0xH3C00))
+; GFX9-NEXT:    store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GFX9-NEXT:    ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
@@ -156,12 +243,35 @@ bb:
   ret void
 }
 
-; GCN-LABEL: @maxnum_combine_v2f16
-; GFX8: call half @llvm.maxnum.f16(
-; GFX8: call half @llvm.maxnum.f16(
 
-; GFX9: call <2 x half> @llvm.maxnum.v2f16
 define void @maxnum_combine_v2f16(ptr addrspace(1) %arg) {
+; GFX8-LABEL: define void @maxnum_combine_v2f16(
+; GFX8-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GFX8-NEXT:  [[BB:.*:]]
+; GFX8-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GFX8-NEXT:    [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GFX8-NEXT:    [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GFX8-NEXT:    [[ITMP3:%.*]] = load half, ptr addrspace(1) [[ITMP2]], align 2
+; GFX8-NEXT:    [[ITMP4:%.*]] = call half @llvm.maxnum.f16(half [[ITMP3]], half 0xH3C00)
+; GFX8-NEXT:    store half [[ITMP4]], ptr addrspace(1) [[ITMP2]], align 2
+; GFX8-NEXT:    [[ITMP5:%.*]] = add nuw nsw i64 [[ITMP1]], 1
+; GFX8-NEXT:    [[ITMP6:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP5]]
+; GFX8-NEXT:    [[ITMP7:%.*]] = load half, ptr addrspace(1) [[ITMP6]], align 2
+; GFX8-NEXT:    [[ITMP8:%.*]] = call half @llvm.maxnum.f16(half [[ITMP7]], half 0xH3C00)
+; GFX8-NEXT:    store half [[ITMP8]], ptr addrspace(1) [[ITMP6]], align 2
+; GFX8-NEXT:    ret void
+;
+; GFX9-LABEL: define void @maxnum_combine_v2f16(
+; GFX9-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GFX9-NEXT:  [[BB:.*:]]
+; GFX9-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GFX9-NEXT:    [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GFX9-NEXT:    [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GFX9-NEXT:    [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GFX9-NEXT:    [[TMP1:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[TMP0]], <2 x half> splat (half 0xH3C00))
+; GFX9-NEXT:    store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GFX9-NEXT:    ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
@@ -178,10 +288,23 @@ bb:
 }
 
 ; FIXME: Should vectorize
-; GCN-LABEL: @minimum_combine_v2f16
-; GCN: call half @llvm.minimum.f16(
-; GCN: call half @llvm.minimum.f16(
 define void @minimum_combine_v2f16(ptr addrspace(1) %arg) {
+; GCN-LABEL: define void @minimum_combine_v2f16(
+; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GCN-NEXT:  [[BB:.*:]]
+; GCN-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT:    [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GCN-NEXT:    [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GCN-NEXT:    [[ITMP3:%.*]] = load half, ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT:    [[ITMP4:%.*]] = call half @llvm.minimum.f16(half [[ITMP3]], half 0xH3C00)
+; GCN-NEXT:    store half [[ITMP4]], ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT:    [[ITMP5:%.*]] = add nuw nsw i64 [[ITMP1]], 1
+; GCN-NEXT:    [[ITMP6:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP5]]
+; GCN-NEXT:    [[ITMP7:%.*]] = load half, ptr addrspace(1) [[ITMP6]], align 2
+; GCN-NEXT:    [[ITMP8:%.*]] = call half @llvm.minimum.f16(half [[ITMP7]], half 0xH3C00)
+; GCN-NEXT:    store half [[ITMP8]], ptr addrspace(1) [[ITMP6]], align 2
+; GCN-NEXT:    ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
@@ -197,10 +320,23 @@ bb:
   ret void
 }
 
-; GCN-LABEL: @maximum_combine_v2f16
-; GCN: call half @llvm.maximum.f16(
-; GCN: call half @llvm.maximum.f16(
 define void @maximum_combine_v2f16(ptr addrspace(1) %arg) {
+; GCN-LABEL: define void @maximum_combine_v2f16(
+; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GCN-NEXT:  [[BB:.*:]]
+; GCN-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT:    [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GCN-NEXT:    [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GCN-NEXT:    [[ITMP3:%.*]] = load half, ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT:    [[ITMP4:%.*]] = call half @llvm.maximum.f16(half [[ITMP3]], half 0xH3C00)
+; GCN-NEXT:    store half [[ITMP4]], ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT:    [[ITMP5:%.*]] = add nuw nsw i64 [[ITMP1]], 1
+; GCN-NEXT:    [[ITMP6:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP5]]
+; GCN-NEXT:    [[ITMP7:%.*]] = load half, ptr addrspace(1) [[ITMP6]], align 2
+; GCN-NEXT:    [[ITMP8:%.*]] = call half @llvm.maximum.f16(half [[ITMP7]], half 0xH3C00)
+; GCN-NEXT:    store half [[ITMP8]], ptr addrspace(1) [[ITMP6]], align 2
+; GCN-NEXT:    ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
@@ -216,9 +352,18 @@ bb:
   ret void
 }
 
-; GCN-LABEL: @canonicalize_combine_v2f16
-; GCN: call <2 x half> @llvm.canonicalize.v2f16(
 define void @canonicalize_combine_v2f16(ptr addrspace(1) %arg) {
+; GCN-LABEL: define void @canonicalize_combine_v2f16(
+; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GCN-NEXT:  [[BB:.*:]]
+; GCN-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT:    [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GCN-NEXT:    [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GCN-NEXT:    [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT:    [[TMP1:%.*]] = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> [[TMP0]])
+; GCN-NEXT:    store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT:    ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
@@ -234,9 +379,18 @@ bb:
   ret void
 }
 
-; GCN-LABEL: @fabs_combine_v2f16
-; GCN: call <2 x half> @llvm.fabs.v2f16(
 define void @fabs_combine_v2f16(ptr addrspace(1) %arg) {
+; GCN-LABEL: define void @fabs_combine_v2f16(
+; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GCN-NEXT:  [[BB:.*:]]
+; GCN-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT:    [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GCN-NEXT:    [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GCN-NEXT:    [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT:    [[TMP1:%.*]] = call <2 x half> @llvm.fabs.v2f16(<2 x half> [[TMP0]])
+; GCN-NEXT:    store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT:    ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
@@ -252,9 +406,18 @@ bb:
   ret void
 }
 
-; GCN-LABEL: @fneg_combine_v2f16
-; GCN: fneg <2 x half>
 define void @fneg_combine_v2f16(ptr addrspace(1) %arg) {
+; GCN-LABEL: define void @fneg_combine_v2f16(
+; GCN-SAME: ptr addrspace(1) [[ARG:%.*]]) #[[ATTR0]] {
+; GCN-NEXT:  [[BB:.*:]]
+; GCN-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT:    [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GCN-NEXT:    [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GCN-NEXT:    [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT:    [[TMP1:%.*]] = fneg <2 x half> [[TMP0]]
+; GCN-NEXT:    store <2 x half> [[TMP1]], ptr addrspace(1) [[ITMP2]], align 2
+; GCN-NEXT:    ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
@@ -270,11 +433,36 @@ bb:
   ret void
 }
 
-; GCN-LABEL: @copysign_combine_v2f16
-; GFX8: call half @llvm.copysign.f16(
-; GFX8: call half @llvm.copysign.f16(
-; GFX9: call <2 x half> @llvm.copysign.v2f16(
 define void @copysign_combine_v2f16(ptr addrspace(1) %arg, half %sign) {
+; GFX8-LABEL: define void @copysign_combine_v2f16(
+; GFX8-SAME: ptr addrspace(1) [[ARG:%.*]], half [[SIGN:%.*]]) #[[ATTR0]] {
+; GFX8-NEXT:  [[BB:.*:]]
+; GFX8-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GFX8-NEXT:    [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GFX8-NEXT:    [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GFX8-NEXT:    [[ITMP3:%.*]] = load half, ptr addrspace(1) [[ITMP2]], align 2
+; GFX8-NEXT:    [[ITMP4:%.*]] = call half @llvm.copysign.f16(half [[ITMP3]], half [[SIGN]])
+; GFX8-NEXT:    store half [[ITMP4]], ptr addrspace(1) [[ITMP2]], align 2
+; GFX8-NEXT:    [[ITMP5:%.*]] = add nuw nsw i64 [[ITMP1]], 1
+; GFX8-NEXT:    [[ITMP6:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP5]]
+; GFX8-NEXT:    [[ITMP7:%.*]] = load half, ptr addrspace(1) [[ITMP6]], align 2
+; GFX8-NEXT:    [[ITMP8:%.*]] = call half @llvm.copysign.f16(half [[ITMP7]], half [[SIGN]])
+; GFX8-NEXT:    store half [[ITMP8]], ptr addrspace(1) [[ITMP6]], align 2
+; GFX8-NEXT:    ret void
+;
+; GFX9-LABEL: define void @copysign_combine_v2f16(
+; GFX9-SAME: ptr addrspace(1) [[ARG:%.*]], half [[SIGN:%.*]]) #[[ATTR0]] {
+; GFX9-NEXT:  [[BB:.*:]]
+; GFX9-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GFX9-NEXT:    [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GFX9-NEXT:    [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GFX9-NEXT:    [[TMP0:%.*]] = load <2 x half>, ptr addrspace(1) [[ITMP2]], align 2
+; GFX9-NEXT:    [[TMP1:%.*]] = insertelement <2 x half> poison, half [[SIGN]], i32 0
+; GFX9-NEXT:    [[TMP2:%.*]] = shufflevector <2 x half> [[TMP1]], <2 x half> poison, <2 x i32> zeroinitializer
+; GFX9-NEXT:    [[TMP3:%.*]] = call <2 x half> @llvm.copysign.v2f16(<2 x half> [[TMP0]], <2 x half> [[TMP2]])
+; GFX9-NEXT:    store <2 x half> [[TMP3]], ptr addrspace(1) [[ITMP2]], align 2
+; GFX9-NEXT:    ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = zext i32 %tmp to i64
@@ -291,12 +479,59 @@ bb:
 }
 
 ; FIXME: Should always vectorize
-; GCN-LABEL: @copysign_combine_v4f16
-; GFX8: call half @llvm.copysign.f16(
-; GFX8: call half @llvm.copysign.f16(
 
-; GFX9: call <2 x half> @llvm.copysign.v2f16(
 define void @copysign_combine_v4f16(ptr addrspace(1) %arg, half %sign) {
+; GFX8-LABEL: define void @copysign_combine_v4f16(
+; GFX8-SAME: ptr addrspace(1) [[ARG:%.*]], half [[SIGN:%.*]]) #[[ATTR0]] {
+; GFX8-NEXT:  [[BB:.*:]]
+; GFX8-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GFX8-NEXT:    [[ITMP1:%.*]] = zext i32 [[TMP]] to i64
+; GFX8-NEXT:    [[ITMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP1]]
+; GFX8-NEXT:    [[ITMP3:%.*]] = load half, ptr addrspace(1) [[ITMP2]], align 2
+; GFX8-NEXT:    [[ITMP4:%.*]] = call half @llvm.copysign.f16(half [[ITMP3]], half [[SIGN]])
+; GFX8-NEXT:    store half [[ITMP4]], ptr addrspace(1) [[ITMP2]], align 2
+; GFX8-NEXT:    [[ITMP5:%.*]] = add nuw nsw i64 [[ITMP1]], 1
+; GFX8-NEXT:    [[ITMP6:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[ARG]], i64 [[ITMP5]]
+; GFX8-NEXT:    [[ITMP7:%.*]] = load half, ptr addrspace(1) [[ITMP6]], align 2
+; GFX8-NEXT:    [[ITMP8:%.*]] = call half @llvm.copysign.f16(half [[ITMP7]], half [[SIGN]])
+; GFX8-NEXT:    store...
[truncated]
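Per the NOTE line at the top of the diff, the new check lines were autogenerated by utils/update_test_checks.py. A minimal sketch of reproducing them, assuming an LLVM checkout with opt built under build/bin (the build path and the --opt-binary usage are assumptions; the remaining flags are the UTC_ARGS recorded in the test itself):

    # Regenerate the FileCheck assertions with the arguments recorded in the
    # test's NOTE line; --opt-binary points at a locally built opt.
    llvm/utils/update_test_checks.py \
        --opt-binary build/bin/opt \
        --prefix-filecheck-ir-name I --version 6 \
        llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f16.ll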

llvmbot (Member) commented Nov 20, 2025
@llvm/pr-subscribers-llvm-transforms


nhaehnle (Collaborator, Author):

(no review expected)

nhaehnle enabled auto-merge (squash) November 20, 2025 03:20
nhaehnle merged commit 13ed14f into main Nov 20, 2025
10 of 12 checks passed
nhaehnle deleted the users/nhaehnle/spr/main/eb3ee40e branch November 20, 2025 03:51
aadeshps-mcw pushed a commit to aadeshps-mcw/llvm-project that referenced this pull request Nov 26, 2025
Priyanshu3820 pushed a commit to Priyanshu3820/llvm-project that referenced this pull request Nov 26, 2025
nhaehnle (Collaborator, Author) commented Dec 1, 2025

Test
