From c9e159fe98bcfd4348a43a16966595f6fc6aacb7 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke@igalia.com>
Date: Tue, 5 Nov 2024 12:41:54 +0800
Subject: [PATCH] [RISCV] Handle zvfhmin/zvfbfmin in
 lowerVECTOR_SHUFFLEAsVSlide1

Most of lowerVECTOR_SHUFFLE lowers to nodes that work on f16 and bf16 vectors, with the exception of the vslide1 lowering which tries to emit vfslide1s. Handle this case as an integer vslide1 via fmv.x.h.

Fixes #114893
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   | 18 +++++
 .../rvv/fixed-vectors-shuffle-vslide1down.ll  | 60 ++++++++++++++---
 .../rvv/fixed-vectors-shuffle-vslide1up.ll    | 66 +++++++++++++++----
 3 files changed, 123 insertions(+), 21 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index d2d03d4572dac..96490cdec6c69 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -4817,6 +4817,24 @@ static SDValue lowerVECTOR_SHUFFLEAsVSlide1(const SDLoc &DL, MVT VT,
 
   MVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);
   auto [TrueMask, VL] = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget);
+
+  // zvfhmin and zvfbfmin don't have vfslide1{down,up}.vf so use fmv.x.h +
+  // vslide1{down,up}.vx instead.
+  if (VT.getVectorElementType() == MVT::bf16 ||
+      (VT.getVectorElementType() == MVT::f16 &&
+       !Subtarget.hasVInstructionsF16())) {
+    MVT IntVT = ContainerVT.changeVectorElementTypeToInteger();
+    Splat =
+        DAG.getNode(RISCVISD::FMV_X_ANYEXTH, DL, Subtarget.getXLenVT(), Splat);
+    V2 = DAG.getBitcast(
+        IntVT, convertToScalableVector(ContainerVT, V2, DAG, Subtarget));
+    SDValue Vec = DAG.getNode(
+        IsVSlidedown ? RISCVISD::VSLIDE1DOWN_VL : RISCVISD::VSLIDE1UP_VL, DL,
+        IntVT, DAG.getUNDEF(IntVT), V2, Splat, TrueMask, VL);
+    Vec = DAG.getBitcast(ContainerVT, Vec);
+    return convertFromScalableVector(VT, Vec, DAG, Subtarget);
+  }
+
   auto OpCode = IsVSlidedown ?
     (VT.isFloatingPoint() ? RISCVISD::VFSLIDE1DOWN_VL : RISCVISD::VSLIDE1DOWN_VL) :
     (VT.isFloatingPoint() ? RISCVISD::VFSLIDE1UP_VL : RISCVISD::VSLIDE1UP_VL);
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-vslide1down.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-vslide1down.ll
index f531ff3a835e4..563b90dfa47ef 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-vslide1down.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-vslide1down.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
+; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zvfh,+zfbfmin,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zvfh,+zfbfmin,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,RV64
+; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,RV64
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
@@ -131,23 +133,61 @@ define <4 x i64> @vslide1down_4xi64(<4 x i64> %v, i64 %b) {
   ret <4 x i64> %v1
 }
 
-define <2 x half> @vslide1down_2xf16(<2 x half> %v, half %b) {
-; CHECK-LABEL: vslide1down_2xf16:
+define <2 x bfloat> @vslide1down_2xbf16(<2 x bfloat> %v, bfloat %b) {
+; CHECK-LABEL: vslide1down_2xbf16:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    fmv.x.h a0, fa0
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT:    vfslide1down.vf v8, v8, fa0
+; CHECK-NEXT:    vslide1down.vx v8, v8, a0
+; CHECK-NEXT:    ret
+  %vb = insertelement <2 x bfloat> poison, bfloat %b, i64 0
+  %v1 = shufflevector <2 x bfloat> %v, <2 x bfloat> %vb, <2 x i32> <i32 1, i32 2>
+  ret <2 x bfloat> %v1
+}
+
+define <4 x bfloat> @vslide1down_4xbf16(<4 x bfloat> %v, bfloat %b) {
+; CHECK-LABEL: vslide1down_4xbf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fmv.x.h a0, fa0
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vslide1down.vx v8, v8, a0
 ; CHECK-NEXT:    ret
+  %vb = insertelement <4 x bfloat> poison, bfloat %b, i64 0
+  %v1 = shufflevector <4 x bfloat> %v, <4 x bfloat> %vb, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+  ret <4 x bfloat> %v1
+}
+
+define <2 x half> @vslide1down_2xf16(<2 x half> %v, half %b) {
+; ZVFH-LABEL: vslide1down_2xf16:
+; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; ZVFH-NEXT:    vfslide1down.vf v8, v8, fa0
+; ZVFH-NEXT:    ret
+;
+; ZVFHMIN-LABEL: vslide1down_2xf16:
+; ZVFHMIN:       # %bb.0:
+; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
+; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vslide1down.vx v8, v8, a0
+; ZVFHMIN-NEXT:    ret
   %vb = insertelement <2 x half> poison, half %b, i64 0
   %v1 = shufflevector <2 x half> %v, <2 x half> %vb, <2 x i32> <i32 1, i32 2>
   ret <2 x half> %v1
 }
 
 define <4 x half> @vslide1down_4xf16(<4 x half> %v, half %b) {
-; CHECK-LABEL: vslide1down_4xf16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT:    vfslide1down.vf v8, v8, fa0
-; CHECK-NEXT:    ret
+; ZVFH-LABEL: vslide1down_4xf16:
+; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; ZVFH-NEXT:    vfslide1down.vf v8, v8, fa0
+; ZVFH-NEXT:    ret
+;
+; ZVFHMIN-LABEL: vslide1down_4xf16:
+; ZVFHMIN:       # %bb.0:
+; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
+; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vslide1down.vx v8, v8, a0
+; ZVFHMIN-NEXT:    ret
   %vb = insertelement <4 x half> poison, half %b, i64 0
   %v1 = shufflevector <4 x half> %v, <4 x half> %vb, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
   ret <4 x half> %v1
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-vslide1up.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-vslide1up.ll
index b3390b6eeeccd..0f6d68dc1a6c7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-vslide1up.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-vslide1up.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
+; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zvfh,+zfbfmin,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zvfh,+zfbfmin,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,RV64
+; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,RV64
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
@@ -140,25 +142,67 @@ define <4 x i64> @vslide1up_4xi64(<4 x i64> %v, i64 %b) {
   ret <4 x i64> %v1
 }
 
-define <2 x half> @vslide1up_2xf16(<2 x half> %v, half %b) {
-; CHECK-LABEL: vslide1up_2xf16:
+define <2 x bfloat> @vslide1up_2xbf16(<2 x bfloat> %v, bfloat %b) {
+; CHECK-LABEL: vslide1up_2xbf16:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    fmv.x.h a0, fa0
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT:    vfslide1up.vf v9, v8, fa0
+; CHECK-NEXT:    vslide1up.vx v9, v8, a0
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
-  %vb = insertelement <2 x half> poison, half %b, i64 0
-  %v1 = shufflevector <2 x half> %v, <2 x half> %vb, <2 x i32> <i32 2, i32 0>
-  ret <2 x half> %v1
+  %vb = insertelement <2 x bfloat> poison, bfloat %b, i64 0
+  %v1 = shufflevector <2 x bfloat> %v, <2 x bfloat> %vb, <2 x i32> <i32 2, i32 0>
+  ret <2 x bfloat> %v1
 }
 
-define <4 x half> @vslide1up_4xf16(<4 x half> %v, half %b) {
-; CHECK-LABEL: vslide1up_4xf16:
+define <4 x bfloat> @vslide1up_4xbf16(<4 x bfloat> %v, bfloat %b) {
+; CHECK-LABEL: vslide1up_4xbf16:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    fmv.x.h a0, fa0
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT:    vfslide1up.vf v9, v8, fa0
+; CHECK-NEXT:    vslide1up.vx v9, v8, a0
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
+  %vb = insertelement <4 x bfloat> poison, bfloat %b, i64 0
+  %v1 = shufflevector <4 x bfloat> %v, <4 x bfloat> %vb, <4 x i32> <i32 4, i32 0, i32 1, i32 2>
+  ret <4 x bfloat> %v1
+}
+
+define <2 x half> @vslide1up_2xf16(<2 x half> %v, half %b) {
+; ZVFH-LABEL: vslide1up_2xf16:
+; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; ZVFH-NEXT:    vfslide1up.vf v9, v8, fa0
+; ZVFH-NEXT:    vmv1r.v v8, v9
+; ZVFH-NEXT:    ret
+;
+; ZVFHMIN-LABEL: vslide1up_2xf16:
+; ZVFHMIN:       # %bb.0:
+; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
+; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vslide1up.vx v9, v8, a0
+; ZVFHMIN-NEXT:    vmv1r.v v8, v9
+; ZVFHMIN-NEXT:    ret
+  %vb = insertelement <2 x half> poison, half %b, i64 0
+  %v1 = shufflevector <2 x half> %v, <2 x half> %vb, <2 x i32> <i32 2, i32 0>
+  ret <2 x half> %v1
+}
+
+define <4 x half> @vslide1up_4xf16(<4 x half> %v, half %b) {
+; ZVFH-LABEL: vslide1up_4xf16:
+; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; ZVFH-NEXT:    vfslide1up.vf v9, v8, fa0
+; ZVFH-NEXT:    vmv1r.v v8, v9
+; ZVFH-NEXT:    ret
+;
+; ZVFHMIN-LABEL: vslide1up_4xf16:
+; ZVFHMIN:       # %bb.0:
+; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
+; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vslide1up.vx v9, v8, a0
+; ZVFHMIN-NEXT:    vmv1r.v v8, v9
+; ZVFHMIN-NEXT:    ret
   %vb = insertelement <4 x half> poison, half %b, i64 0
   %v1 = shufflevector <4 x half> %v, <4 x half> %vb, <4 x i32> <i32 4, i32 0, i32 1, i32 2>
   ret <4 x half> %v1