Skip to content

Conversation

@wangpc-pp
Copy link
Contributor

This suboptimal case was found when trying to optimize ABD/ABDS
operation.

Adding ISel patterns is the simplest way to optimize. We can add
DAGCombine cases for ISD::SIGN_EXTEND/ISD::ZERO_EXTEND instead,
but that may need a lot of manual handling.

This suboptimal case was found when trying to optimize ABD/ABDS
operation.

Adding ISel patterns is the simplest way to optimize. We can add
DAGCombine cases for `ISD::SIGN_EXTEND/ISD::ZERO_EXTEND` instead,
but that may need a lot of manual handling.
@llvmbot
Copy link
Member

llvmbot commented Apr 27, 2025

@llvm/pr-subscribers-backend-risc-v

Author: Pengcheng Wang (wangpc-pp)

Changes

This suboptimal case was found when trying to optimize ABD/ABDS
operation.

Adding ISel patterns is the simplest way to optimize. We can add
DAGCombine cases for ISD::SIGN_EXTEND/ISD::ZERO_EXTEND instead,
but that may need a lot of manual handling.


Full diff: https://github.com/llvm/llvm-project/pull/137508.diff

2 Files Affected:

  • (modified) llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td (+11)
  • (modified) llvm/test/CodeGen/RISCV/rvv/abd.ll (+15-29)
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
index aea125c5348dd..55a5109b7ecfa 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
@@ -467,6 +467,17 @@ multiclass VPatWidenBinarySDNode_VV_VX<SDNode op, PatFrags extop1, PatFrags exto
                 (!cast<Instruction>(instruction_name#"_VX_"#vti.LMul.MX)
                    (wti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs2,
                    GPR:$rs1, vti.AVL, vti.Log2SEW, TA_MA)>;
+      if !eq(extop1, extop2) then
+        def : Pat<(wti.Vector (extop1 (op (vti.Vector vti.RegClass:$rs2),
+                                          (vti.Vector vti.RegClass:$rs1)))),
+                  (!cast<Instruction>(instruction_name#"_VV_"#vti.LMul.MX)
+                     (wti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs2,
+                     vti.RegClass:$rs1, vti.AVL, vti.Log2SEW, TA_MA)>;
+        def : Pat<(wti.Vector (extop1 (op (vti.Vector vti.RegClass:$rs2),
+                                          (vti.Vector (SplatPat (XLenVT GPR:$rs1)))))),
+                  (!cast<Instruction>(instruction_name#"_VX_"#vti.LMul.MX)
+                     (wti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs2,
+                     GPR:$rs1, vti.AVL, vti.Log2SEW, TA_MA)>;
     }
   }
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/abd.ll b/llvm/test/CodeGen/RISCV/rvv/abd.ll
index 583d872238df7..b8d95bd95df8a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/abd.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/abd.ll
@@ -58,10 +58,8 @@ define <vscale x 8 x i16> @sabd_h_promoted_ops(<vscale x 8 x i8> %a, <vscale x 8
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vmin.vv v10, v8, v9
-; CHECK-NEXT:    vmax.vv v8, v8, v9
-; CHECK-NEXT:    vsub.vv v10, v8, v10
-; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vzext.vf2 v8, v10
+; CHECK-NEXT:    vmax.vv v11, v8, v9
+; CHECK-NEXT:    vwsubu.vv v8, v11, v10
 ; CHECK-NEXT:    ret
   %a.sext = sext <vscale x 8 x i8> %a to <vscale x 8 x i16>
   %b.sext = sext <vscale x 8 x i8> %b to <vscale x 8 x i16>
@@ -91,10 +89,8 @@ define <vscale x 4 x i32> @sabd_s_promoted_ops(<vscale x 4 x i16> %a, <vscale x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vmin.vv v10, v8, v9
-; CHECK-NEXT:    vmax.vv v8, v8, v9
-; CHECK-NEXT:    vsub.vv v10, v8, v10
-; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vzext.vf2 v8, v10
+; CHECK-NEXT:    vmax.vv v11, v8, v9
+; CHECK-NEXT:    vwsubu.vv v8, v11, v10
 ; CHECK-NEXT:    ret
   %a.sext = sext <vscale x 4 x i16> %a to <vscale x 4 x i32>
   %b.sext = sext <vscale x 4 x i16> %b to <vscale x 4 x i32>
@@ -124,10 +120,8 @@ define <vscale x 2 x i64> @sabd_d_promoted_ops(<vscale x 2 x i32> %a, <vscale x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vmin.vv v10, v8, v9
-; CHECK-NEXT:    vmax.vv v8, v8, v9
-; CHECK-NEXT:    vsub.vv v10, v8, v10
-; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-NEXT:    vzext.vf2 v8, v10
+; CHECK-NEXT:    vmax.vv v11, v8, v9
+; CHECK-NEXT:    vwsubu.vv v8, v11, v10
 ; CHECK-NEXT:    ret
   %a.sext = sext <vscale x 2 x i32> %a to <vscale x 2 x i64>
   %b.sext = sext <vscale x 2 x i32> %b to <vscale x 2 x i64>
@@ -192,10 +186,8 @@ define <vscale x 8 x i16> @uabd_h_promoted_ops(<vscale x 8 x i8> %a, <vscale x 8
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vminu.vv v10, v8, v9
-; CHECK-NEXT:    vmaxu.vv v8, v8, v9
-; CHECK-NEXT:    vsub.vv v10, v8, v10
-; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vzext.vf2 v8, v10
+; CHECK-NEXT:    vmaxu.vv v11, v8, v9
+; CHECK-NEXT:    vwsubu.vv v8, v11, v10
 ; CHECK-NEXT:    ret
   %a.zext = zext <vscale x 8 x i8> %a to <vscale x 8 x i16>
   %b.zext = zext <vscale x 8 x i8> %b to <vscale x 8 x i16>
@@ -225,10 +217,8 @@ define <vscale x 4 x i32> @uabd_s_promoted_ops(<vscale x 4 x i16> %a, <vscale x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vminu.vv v10, v8, v9
-; CHECK-NEXT:    vmaxu.vv v8, v8, v9
-; CHECK-NEXT:    vsub.vv v10, v8, v10
-; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vzext.vf2 v8, v10
+; CHECK-NEXT:    vmaxu.vv v11, v8, v9
+; CHECK-NEXT:    vwsubu.vv v8, v11, v10
 ; CHECK-NEXT:    ret
   %a.zext = zext <vscale x 4 x i16> %a to <vscale x 4 x i32>
   %b.zext = zext <vscale x 4 x i16> %b to <vscale x 4 x i32>
@@ -258,10 +248,8 @@ define <vscale x 2 x i64> @uabd_d_promoted_ops(<vscale x 2 x i32> %a, <vscale x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vminu.vv v10, v8, v9
-; CHECK-NEXT:    vmaxu.vv v8, v8, v9
-; CHECK-NEXT:    vsub.vv v10, v8, v10
-; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-NEXT:    vzext.vf2 v8, v10
+; CHECK-NEXT:    vmaxu.vv v11, v8, v9
+; CHECK-NEXT:    vwsubu.vv v8, v11, v10
 ; CHECK-NEXT:    ret
   %a.zext = zext <vscale x 2 x i32> %a to <vscale x 2 x i64>
   %b.zext = zext <vscale x 2 x i32> %b to <vscale x 2 x i64>
@@ -296,11 +284,9 @@ define <vscale x 4 x i32> @uabd_non_matching_promoted_ops(<vscale x 4 x i8> %a,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vminu.vv v8, v10, v9
-; CHECK-NEXT:    vmaxu.vv v9, v10, v9
-; CHECK-NEXT:    vsub.vv v10, v9, v8
-; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vzext.vf2 v8, v10
+; CHECK-NEXT:    vminu.vv v11, v10, v9
+; CHECK-NEXT:    vmaxu.vv v10, v10, v9
+; CHECK-NEXT:    vwsubu.vv v8, v10, v11
 ; CHECK-NEXT:    ret
   %a.zext = zext <vscale x 4 x i8> %a to <vscale x 4 x i32>
   %b.zext = zext <vscale x 4 x i16> %b to <vscale x 4 x i32>

@wangpc-pp wangpc-pp requested review from NexMing and lukel97 April 27, 2025 08:34
@NexMing
Copy link
Contributor

NexMing commented Apr 27, 2025

If there is a risk of overflow, is it correct to match (ext (op a, b)) to (wop a, b)?

@wangpc-pp
Copy link
Contributor Author

If there is a risk of overflow, is it correct to match (ext (op a, b)) to (wop a, b)?

Yes, you are right. This transformation is not correct.

@wangpc-pp wangpc-pp closed this Apr 27, 2025
@wangpc-pp wangpc-pp deleted the main-riscv-widen-abd branch April 27, 2025 11:15
@wangpc-pp
Copy link
Contributor Author

wangpc-pp commented Apr 27, 2025

If there is a risk of overflow, is it correct to match (ext (op a, b)) to (wop a, b)?

Yes, you are right. This transformation is not correct.

Is there a generic mechanism to analyze if the result won't overflow? Apparently for ABD cases this holds, but how to detect such cases?

@NexMing
Copy link
Contributor

NexMing commented Apr 27, 2025

If there is a risk of overflow, is it correct to match (ext (op a, b)) to (wop a, b)?

Yes, you are right. This transformation is not correct.

Is there a generic mechanism to analyze if the result won't overflow? Apparently for ABD cases this holds, but how to detect such cases?

If you just want to optimize ABD, you can combine (vzext (abd (x), (y))) into your desired form in DAGCombine.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants