From 227745cdfee432fe0f73af5d9ba2f5b6c75a2d03 Mon Sep 17 00:00:00 2001 From: Petr Penzin Date: Mon, 27 Jan 2025 18:47:59 -0600 Subject: [PATCH 01/10] [RISCV] Tune flag for fast vrgather.vv Add tune knob for N*Log2(N) vrgather.vv cost. --- llvm/lib/Target/RISCV/RISCVFeatures.td | 4 ++++ llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 5 +++++ llvm/lib/Target/RISCV/RISCVProcessors.td | 1 + llvm/test/CodeGen/RISCV/features-info.ll | 1 + 4 files changed, 11 insertions(+) diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 24828cde28079..316ec15c20933 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1445,6 +1445,10 @@ def FeatureUnalignedVectorMem "true", "Has reasonably performant unaligned vector " "loads and stores">; +def TuneFastVRGather + : SubtargetFeature<"fast-vrgather", "HasFastVRGather", + "true", "Has vrgather.vv with LMUL*log2(LMUL) latency">; + def TunePostRAScheduler : SubtargetFeature<"use-postra-scheduler", "UsePostRAScheduler", "true", "Schedule again after register allocation">; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 6a259e4b0334c..867204d5daa4f 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -2872,6 +2872,11 @@ InstructionCost RISCVTargetLowering::getLMULCost(MVT VT) const { /// is generally quadratic in the number of vreg implied by LMUL. Note that /// operand (index and possibly mask) are handled separately. InstructionCost RISCVTargetLowering::getVRGatherVVCost(MVT VT) const { + auto LMULCost = getLMULCost(VT); + if (true && Subtarget.hasFastVRGather() && LMULCost.isValid()) { + unsigned Log = Log2_64(*LMULCost.getValue()); + return LMULCost * Log; + } return getLMULCost(VT) * getLMULCost(VT); } diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td index c2d98c2180299..63289168a3e62 100644 --- a/llvm/lib/Target/RISCV/RISCVProcessors.td +++ b/llvm/lib/Target/RISCV/RISCVProcessors.td @@ -494,6 +494,7 @@ def TENSTORRENT_ASCALON_D8 : RISCVProcessorModel<"tt-ascalon-d8", FeatureUnalignedScalarMem, FeatureUnalignedVectorMem]), [TuneNoDefaultUnroll, + TuneFastVRGather, TuneOptimizedZeroStrideLoad, TunePostRAScheduler]>; diff --git a/llvm/test/CodeGen/RISCV/features-info.ll b/llvm/test/CodeGen/RISCV/features-info.ll index 70fbda47a14a1..dab9bf92cef17 100644 --- a/llvm/test/CodeGen/RISCV/features-info.ll +++ b/llvm/test/CodeGen/RISCV/features-info.ll @@ -31,6 +31,7 @@ ; CHECK: experimental-zvbc32e - 'Zvbc32e' (Vector Carryless Multiplication with 32-bits elements). ; CHECK: experimental-zvkgs - 'Zvkgs' (Vector-Scalar GCM instructions for Cryptography). ; CHECK: f - 'F' (Single-Precision Floating-Point). +; CHECK: fast-vrgather - Has vrgather.vv with LMUL*log2(LMUL) latency ; CHECK: forced-atomics - Assume that lock-free native-width atomics are available. ; CHECK: h - 'H' (Hypervisor). ; CHECK: i - 'I' (Base Integer Instruction Set). From 472f7a5d8fcff4af9f507537e65bbf48760dbc6d Mon Sep 17 00:00:00 2001 From: Petr Penzin Date: Tue, 28 Jan 2025 09:20:48 -0800 Subject: [PATCH 02/10] Fix typo --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 867204d5daa4f..befdc6e298423 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -2873,7 +2873,7 @@ InstructionCost RISCVTargetLowering::getLMULCost(MVT VT) const { /// operand (index and possibly mask) are handled separately. InstructionCost RISCVTargetLowering::getVRGatherVVCost(MVT VT) const { auto LMULCost = getLMULCost(VT); - if (true && Subtarget.hasFastVRGather() && LMULCost.isValid()) { + if (Subtarget.hasFastVRGather() && LMULCost.isValid()) { unsigned Log = Log2_64(*LMULCost.getValue()); return LMULCost * Log; } From f6a369890d57314ac51943d52900030c199c0978 Mon Sep 17 00:00:00 2001 From: Petr Penzin Date: Tue, 28 Jan 2025 10:03:26 -0800 Subject: [PATCH 03/10] Add run line to RISCV shuffle-permute.ll test --- .../CostModel/RISCV/shuffle-permute.ll | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/llvm/test/Analysis/CostModel/RISCV/shuffle-permute.ll b/llvm/test/Analysis/CostModel/RISCV/shuffle-permute.ll index 105cc8a1c2ba3..c423d454c68be 100644 --- a/llvm/test/Analysis/CostModel/RISCV/shuffle-permute.ll +++ b/llvm/test/Analysis/CostModel/RISCV/shuffle-permute.ll @@ -3,6 +3,8 @@ ; RUN: opt < %s -passes="print" 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfhmin | FileCheck %s ; RUN: opt < %s -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfh | FileCheck %s --check-prefix=SIZE ; RUN: opt < %s -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfhmin | FileCheck %s --check-prefix=SIZE +; RUN: opt < %s -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfh,+fast-vrgather | FileCheck %s --check-prefix=FAST-VRG +; RUN: opt < %s -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfhmin,+fast-vrgather | FileCheck %s --check-prefix=FAST-VRG ; Check that we don't crash querying costs when vectors are not enabled. ; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=riscv32 @@ -44,6 +46,24 @@ define void @general_permute_single_source() { ; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; FAST-VRG-LABEL: 'general_permute_single_source' +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> @@ -133,6 +153,37 @@ define void @general_permute_two_source() { ; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v8double = shufflevector <8 x double> undef, <8 x double> undef, <8 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v16double = shufflevector <16 x double> undef, <16 x double> undef, <16 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; FAST-VRG-LABEL: 'general_permute_two_source' +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v8i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v16i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2half = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4half = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v8half = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v16half = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2float = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4float = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v8float = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v16float = shufflevector <16 x float> undef, <16 x float> undef, <16 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2double = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4double = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v8double = shufflevector <8 x double> undef, <8 x double> undef, <8 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v16double = shufflevector <16 x double> undef, <16 x double> undef, <16 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> From 65c419d946b7860cc1719a98ae3c3e160c1bfb47 Mon Sep 17 00:00:00 2001 From: Petr Penzin Date: Tue, 28 Jan 2025 21:51:59 -0800 Subject: [PATCH 04/10] Remove code size from analysis test --- .../CostModel/RISCV/shuffle-permute.ll | 94 +++++++++---------- 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/llvm/test/Analysis/CostModel/RISCV/shuffle-permute.ll b/llvm/test/Analysis/CostModel/RISCV/shuffle-permute.ll index c423d454c68be..f6ddc9ff92b0e 100644 --- a/llvm/test/Analysis/CostModel/RISCV/shuffle-permute.ll +++ b/llvm/test/Analysis/CostModel/RISCV/shuffle-permute.ll @@ -3,8 +3,8 @@ ; RUN: opt < %s -passes="print" 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfhmin | FileCheck %s ; RUN: opt < %s -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfh | FileCheck %s --check-prefix=SIZE ; RUN: opt < %s -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfhmin | FileCheck %s --check-prefix=SIZE -; RUN: opt < %s -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfh,+fast-vrgather | FileCheck %s --check-prefix=FAST-VRG -; RUN: opt < %s -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfhmin,+fast-vrgather | FileCheck %s --check-prefix=FAST-VRG +; RUN: opt < %s -passes="print" 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfh,+fast-vrgather | FileCheck %s --check-prefix=FAST-VRG +; RUN: opt < %s -passes="print" 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfhmin,+fast-vrgather | FileCheck %s --check-prefix=FAST-VRG ; Check that we don't crash querying costs when vectors are not enabled. ; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=riscv32 @@ -48,22 +48,22 @@ define void @general_permute_single_source() { ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; FAST-VRG-LABEL: 'general_permute_single_source' -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> @@ -155,35 +155,35 @@ define void @general_permute_two_source() { ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; FAST-VRG-LABEL: 'general_permute_two_source' -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v8i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v16i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2half = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4half = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v8half = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v16half = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2float = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4float = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v8float = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v16float = shufflevector <16 x float> undef, <16 x float> undef, <16 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2double = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4double = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v8double = shufflevector <8 x double> undef, <8 x double> undef, <8 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v16double = shufflevector <16 x double> undef, <16 x double> undef, <16 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v8i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %v16i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v2half = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v4half = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v8half = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16half = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v2float = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v4float = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8float = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16float = shufflevector <16 x float> undef, <16 x float> undef, <16 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v2double = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v4double = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v8double = shufflevector <8 x double> undef, <8 x double> undef, <8 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %v16double = shufflevector <16 x double> undef, <16 x double> undef, <16 x i32> +; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> From 4e1738a5f2bc58ec9264793793fb56a34e8db2d4 Mon Sep 17 00:00:00 2001 From: Petr Penzin Date: Wed, 29 Jan 2025 08:16:21 -0800 Subject: [PATCH 05/10] Reuse calculated LMUL cost --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index befdc6e298423..3f4a803b78cf9 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -2877,7 +2877,7 @@ InstructionCost RISCVTargetLowering::getVRGatherVVCost(MVT VT) const { unsigned Log = Log2_64(*LMULCost.getValue()); return LMULCost * Log; } - return getLMULCost(VT) * getLMULCost(VT); + return LMULCost * LMULCost; } /// Return the cost of a vrgather.vi (or vx) instruction for the type VT. From cdb48f4a8d8d15ab45d253fc8fac2403f0e61943 Mon Sep 17 00:00:00 2001 From: Petr Penzin Date: Wed, 5 Feb 2025 17:35:05 -0600 Subject: [PATCH 06/10] Guard against zero and negative log values --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 3f4a803b78cf9..c7b0094948eaf 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -2875,7 +2875,8 @@ InstructionCost RISCVTargetLowering::getVRGatherVVCost(MVT VT) const { auto LMULCost = getLMULCost(VT); if (Subtarget.hasFastVRGather() && LMULCost.isValid()) { unsigned Log = Log2_64(*LMULCost.getValue()); - return LMULCost * Log; + if (Log > 0) + return LMULCost * Log; } return LMULCost * LMULCost; } From 49ca050219467fd72baa3e86ac2b468a667af567 Mon Sep 17 00:00:00 2001 From: Petr Penzin Date: Fri, 14 Feb 2025 17:17:00 -0600 Subject: [PATCH 07/10] Change vrgather cost model to use an enum Rename option to explicitly say "log", change tune flag to set one of the values. --- llvm/lib/Target/RISCV/RISCVFeatures.td | 6 +- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 6 +- llvm/lib/Target/RISCV/RISCVProcessors.td | 2 +- llvm/lib/Target/RISCV/RISCVSubtarget.h | 7 ++ .../CostModel/RISCV/shuffle-permute.ll | 98 +++++++++---------- 5 files changed, 64 insertions(+), 55 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 316ec15c20933..6ebabe9dc4b2d 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1445,9 +1445,9 @@ def FeatureUnalignedVectorMem "true", "Has reasonably performant unaligned vector " "loads and stores">; -def TuneFastVRGather - : SubtargetFeature<"fast-vrgather", "HasFastVRGather", - "true", "Has vrgather.vv with LMUL*log2(LMUL) latency">; +def TuneNLogNVRGather + : SubtargetFeature<"log-vrgather", "RISCVVRGatherCostModel", "NLog2N", + "Has vrgather.vv with LMUL*log2(LMUL) latency">; def TunePostRAScheduler : SubtargetFeature<"use-postra-scheduler", "UsePostRAScheduler", "true", "Schedule again after register allocation">; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index c7b0094948eaf..a7018d1834d51 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -2873,7 +2873,9 @@ InstructionCost RISCVTargetLowering::getLMULCost(MVT VT) const { /// operand (index and possibly mask) are handled separately. InstructionCost RISCVTargetLowering::getVRGatherVVCost(MVT VT) const { auto LMULCost = getLMULCost(VT); - if (Subtarget.hasFastVRGather() && LMULCost.isValid()) { + bool Log2CostModel = + Subtarget.getVRGatherCostModel() == llvm::RISCVSubtarget::NLog2N; + if (Log2CostModel && LMULCost.isValid()) { unsigned Log = Log2_64(*LMULCost.getValue()); if (Log > 0) return LMULCost * Log; @@ -4258,7 +4260,7 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, // Cap the cost at a value linear to the number of elements in the vector. // The default lowering is to use the stack. The vector store + scalar loads // is linear in VL. However, at high lmuls vslide1down and vslidedown end up - // being (at least) linear in LMUL. As a result, using the vslidedown + // being (at least) linear in LMUL. As a resultdedown // lowering for every element ends up being VL*LMUL.. // TODO: Should we be directly costing the stack alternative? Doing so might // give us a more accurate upper bound. diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td index 63289168a3e62..9d48adeec5e86 100644 --- a/llvm/lib/Target/RISCV/RISCVProcessors.td +++ b/llvm/lib/Target/RISCV/RISCVProcessors.td @@ -494,7 +494,7 @@ def TENSTORRENT_ASCALON_D8 : RISCVProcessorModel<"tt-ascalon-d8", FeatureUnalignedScalarMem, FeatureUnalignedVectorMem]), [TuneNoDefaultUnroll, - TuneFastVRGather, + TuneNLogNVRGather, TuneOptimizedZeroStrideLoad, TunePostRAScheduler]>; diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h index a059ed9202e33..cc9aef2d52556 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.h +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h @@ -84,11 +84,16 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo { VentanaVeyron, MIPSP8700, }; + enum RISCVVRGatherCostModelEnum : uint8_t { + Quadratic, + NLog2N, + }; // clang-format on private: virtual void anchor(); RISCVProcFamilyEnum RISCVProcFamily = Others; + RISCVVRGatherCostModelEnum RISCVVRGatherCostModel = Quadratic; #define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \ bool ATTRIBUTE = DEFAULT; @@ -155,6 +160,8 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo { /// initializeProperties(). RISCVProcFamilyEnum getProcFamily() const { return RISCVProcFamily; } + RISCVVRGatherCostModelEnum getVRGatherCostModel() const { return RISCVVRGatherCostModel; } + #define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \ bool GETTER() const { return ATTRIBUTE; } #include "RISCVGenSubtargetInfo.inc" diff --git a/llvm/test/Analysis/CostModel/RISCV/shuffle-permute.ll b/llvm/test/Analysis/CostModel/RISCV/shuffle-permute.ll index f6ddc9ff92b0e..c4f6f3f1d874b 100644 --- a/llvm/test/Analysis/CostModel/RISCV/shuffle-permute.ll +++ b/llvm/test/Analysis/CostModel/RISCV/shuffle-permute.ll @@ -3,8 +3,8 @@ ; RUN: opt < %s -passes="print" 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfhmin | FileCheck %s ; RUN: opt < %s -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfh | FileCheck %s --check-prefix=SIZE ; RUN: opt < %s -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfhmin | FileCheck %s --check-prefix=SIZE -; RUN: opt < %s -passes="print" 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfh,+fast-vrgather | FileCheck %s --check-prefix=FAST-VRG -; RUN: opt < %s -passes="print" 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfhmin,+fast-vrgather | FileCheck %s --check-prefix=FAST-VRG +; RUN: opt < %s -passes="print" 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfh,+log-vrgather | FileCheck %s --check-prefix=LOG-VRG +; RUN: opt < %s -passes="print" 2>&1 -disable-output -mtriple=riscv32 -mattr=+v,+f,+d,+zfh,+zvfhmin,+log-vrgather | FileCheck %s --check-prefix=LOG-VRG ; Check that we don't crash querying costs when vectors are not enabled. ; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=riscv32 @@ -47,23 +47,23 @@ define void @general_permute_single_source() { ; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; -; FAST-VRG-LABEL: 'general_permute_single_source' -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; LOG-VRG-LABEL: 'general_permute_single_source' +; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> +; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> +; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> +; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> +; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> +; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> +; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> +; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> +; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> +; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> +; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> +; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> +; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> +; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> +; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> +; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> @@ -154,36 +154,36 @@ define void @general_permute_two_source() { ; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v16double = shufflevector <16 x double> undef, <16 x double> undef, <16 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; -; FAST-VRG-LABEL: 'general_permute_two_source' -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v8i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %v16i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v2half = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v4half = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v8half = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16half = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v2float = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v4float = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8float = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16float = shufflevector <16 x float> undef, <16 x float> undef, <16 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v2double = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v4double = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v8double = shufflevector <8 x double> undef, <8 x double> undef, <8 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %v16double = shufflevector <16 x double> undef, <16 x double> undef, <16 x i32> -; FAST-VRG-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; LOG-VRG-LABEL: 'general_permute_two_source' +; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> +; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> +; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> +; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> +; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> +; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> +; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> +; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> +; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> +; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> +; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> +; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> +; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> +; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> +; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v8i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> +; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %v16i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> +; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2half = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> +; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4half = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> +; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v8half = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> +; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16half = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> +; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2float = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> +; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4float = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> +; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8float = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> +; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %v16float = shufflevector <16 x float> undef, <16 x float> undef, <16 x i32> +; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2double = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> +; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %v4double = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> +; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v8double = shufflevector <8 x double> undef, <8 x double> undef, <8 x i32> +; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %v16double = shufflevector <16 x double> undef, <16 x double> undef, <16 x i32> +; LOG-VRG-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> From f62b83a60f757de1351280918bebdbf9a4e5abde Mon Sep 17 00:00:00 2001 From: Petr Penzin Date: Fri, 21 Feb 2025 02:55:27 -0600 Subject: [PATCH 08/10] Update comment --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index a7018d1834d51..733ae2bd795c0 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -2869,7 +2869,8 @@ InstructionCost RISCVTargetLowering::getLMULCost(MVT VT) const { /// Return the cost of a vrgather.vv instruction for the type VT. vrgather.vv -/// is generally quadratic in the number of vreg implied by LMUL. Note that +/// may be quadratic in the number of vreg implied by LMUL, and is assumed to +/// be by default. VRGartherCostModel reflects available options. Note that /// operand (index and possibly mask) are handled separately. InstructionCost RISCVTargetLowering::getVRGatherVVCost(MVT VT) const { auto LMULCost = getLMULCost(VT); From 52547d550bcc99911e97e67ab234729d7a77e8c9 Mon Sep 17 00:00:00 2001 From: Petr Penzin Date: Mon, 24 Feb 2025 22:48:19 -0800 Subject: [PATCH 09/10] Address typos --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 733ae2bd795c0..6d3241e79adfe 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -2870,7 +2870,7 @@ InstructionCost RISCVTargetLowering::getLMULCost(MVT VT) const { /// Return the cost of a vrgather.vv instruction for the type VT. vrgather.vv /// may be quadratic in the number of vreg implied by LMUL, and is assumed to -/// be by default. VRGartherCostModel reflects available options. Note that +/// be by default. VRGatherCostModel reflects available options. Note that /// operand (index and possibly mask) are handled separately. InstructionCost RISCVTargetLowering::getVRGatherVVCost(MVT VT) const { auto LMULCost = getLMULCost(VT); @@ -4261,7 +4261,7 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, // Cap the cost at a value linear to the number of elements in the vector. // The default lowering is to use the stack. The vector store + scalar loads // is linear in VL. However, at high lmuls vslide1down and vslidedown end up - // being (at least) linear in LMUL. As a resultdedown + // being (at least) linear in LMUL. As a result, using the vslidedown // lowering for every element ends up being VL*LMUL.. // TODO: Should we be directly costing the stack alternative? Doing so might // give us a more accurate upper bound. From 8a48a6c6edf2c317d1a18e1a7db72a6cae84d154 Mon Sep 17 00:00:00 2001 From: Petr Penzin Date: Mon, 3 Mar 2025 10:30:41 -0800 Subject: [PATCH 10/10] Fix test flag spelling, add to release notes --- llvm/docs/ReleaseNotes.md | 1 + llvm/test/CodeGen/RISCV/features-info.ll | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md index f1f64f77ee71a..2fb2be1e77793 100644 --- a/llvm/docs/ReleaseNotes.md +++ b/llvm/docs/ReleaseNotes.md @@ -111,6 +111,7 @@ Changes to the RISC-V Backend extension. * Adds experimental assembler support for the Qualcomm 'Xqccmp' extension, which is a frame-pointer convention compatible version of Zcmp. +* Added non-quadratic ``log-vrgather`` cost model for ``vrgather.vv`` instruction Changes to the WebAssembly Backend ---------------------------------- diff --git a/llvm/test/CodeGen/RISCV/features-info.ll b/llvm/test/CodeGen/RISCV/features-info.ll index dab9bf92cef17..ff29777a3ec37 100644 --- a/llvm/test/CodeGen/RISCV/features-info.ll +++ b/llvm/test/CodeGen/RISCV/features-info.ll @@ -31,11 +31,11 @@ ; CHECK: experimental-zvbc32e - 'Zvbc32e' (Vector Carryless Multiplication with 32-bits elements). ; CHECK: experimental-zvkgs - 'Zvkgs' (Vector-Scalar GCM instructions for Cryptography). ; CHECK: f - 'F' (Single-Precision Floating-Point). -; CHECK: fast-vrgather - Has vrgather.vv with LMUL*log2(LMUL) latency ; CHECK: forced-atomics - Assume that lock-free native-width atomics are available. ; CHECK: h - 'H' (Hypervisor). ; CHECK: i - 'I' (Base Integer Instruction Set). ; CHECK: ld-add-fusion - Enable LD+ADD macrofusion. +; CHECK: log-vrgather - Has vrgather.vv with LMUL*log2(LMUL) latency ; CHECK: lui-addi-fusion - Enable LUI+ADDI macro fusion. ; CHECK: m - 'M' (Integer Multiplication and Division). ; CHECK: mips-p8700 - MIPS p8700 processor.