[RISCV] Implement isHighLatencyDef() #127476
Conversation
Returns true for div/rem/sqrt/... operations. This serves as an alternative when the subtarget doesn't provide a generic scheduling model.
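For context, `isHighLatencyDef()` is an opcode-only `TargetInstrInfo` hook: it takes no subtarget or scheduling-model information, which is what makes it usable on cores that ship no `MCSchedModel`. The sketch below shows the shape of a consumer of this hook; it is illustrative only (not LLVM's actual scheduler code), and the helper name and priority value are invented for the example.

```cpp
// Illustrative only: a heuristic of this shape can bias a list scheduler
// toward issuing long-latency producers early, without a scheduling model.
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/TargetInstrInfo.h"

// Hypothetical helper: give div/rem/sqrt-style instructions a priority bump
// so their multi-cycle results start executing as early as possible.
static unsigned latencyPriorityBump(const llvm::TargetInstrInfo &TII,
                                    const llvm::MachineInstr &MI) {
  // isHighLatencyDef() is keyed purely on the opcode, so it works even when
  // the subtarget provides no MCSchedModel.
  return TII.isHighLatencyDef(MI.getOpcode()) ? 16 : 0; // 16 is arbitrary.
}
```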
@llvm/pr-subscribers-backend-risc-v
Author: Pengcheng Wang (wangpc-pp)
Changes: Returns true for div/rem/sqrt/... operations. This serves as an alternative when the subtarget doesn't provide a generic scheduling model.
Full diff: https://github.com/llvm/llvm-project/pull/127476.diff
5 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 1ec299e3c8cc0..74235488ca53f 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -3679,6 +3679,52 @@ MachineInstr *RISCVInstrInfo::commuteInstructionImpl(MachineInstr &MI,
return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
}
+bool RISCVInstrInfo::isHighLatencyDef(int Opc) const {
+ switch (Opc) {
+ default:
+ return false;
+ // Integer div/rem.
+ case RISCV::DIV:
+ case RISCV::DIVW:
+ case RISCV::DIVU:
+ case RISCV::DIVUW:
+ case RISCV::REM:
+ case RISCV::REMW:
+ case RISCV::REMU:
+ case RISCV::REMUW:
+ // Floating-point div/sqrt.
+ case RISCV::FDIV_H:
+ case RISCV::FDIV_S:
+ case RISCV::FDIV_D:
+ case RISCV::FDIV_H_INX:
+ case RISCV::FDIV_S_INX:
+ case RISCV::FDIV_D_INX:
+ case RISCV::FDIV_D_IN32X:
+ case RISCV::FSQRT_H:
+ case RISCV::FSQRT_S:
+ case RISCV::FSQRT_D:
+ case RISCV::FSQRT_H_INX:
+ case RISCV::FSQRT_S_INX:
+ case RISCV::FSQRT_D_INX:
+ case RISCV::FSQRT_D_IN32X:
+ // Vector integer div/rem.
+ case CASE_VFMA_OPCODE_VV(DIV):
+ case CASE_VFMA_OPCODE_VV(DIVU):
+ case CASE_VFMA_OPCODE_VV(REM):
+ case CASE_VFMA_OPCODE_VV(REMU):
+ // case CASE_VFMA_OPCODE_VX(DIV):
+ // case CASE_VFMA_OPCODE_VX(DIVU):
+ // case CASE_VFMA_OPCODE_VX(REM):
+ // case CASE_VFMA_OPCODE_VX(REMU):
+ // Vector floating-point div/sqrt.
+ case CASE_VFMA_OPCODE_VV(FDIV):
+ // case CASE_VFMA_OPCODE_VF(FRDIV):
+ // case CASE_VFMA_OPCODE_VV(FSQRT):
+ // case CASE_VFMA_OPCODE_VV(FRSQRT7):
+ return true;
+ }
+}
+
#undef CASE_RVV_OPCODE_UNMASK_LMUL
#undef CASE_RVV_OPCODE_MASK_LMUL
#undef CASE_RVV_OPCODE_LMUL
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
index ec628620d2982..afbc8df50b452 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
@@ -300,6 +300,8 @@ class RISCVInstrInfo : public RISCVGenInstrInfo {
std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const override;
+ bool isHighLatencyDef(int Opc) const override;
+
protected:
const RISCVSubtarget &STI;
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll
index eb7be14abe431..0d1d75c1b2a75 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll
@@ -894,18 +894,18 @@ define <2 x i16> @vwmul_v2i16_multiuse(ptr %x, ptr %y, ptr %z, ptr %w) {
; CHECK-LABEL: vwmul_v2i16_multiuse:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: vle8.v v9, (a1)
-; CHECK-NEXT: vle8.v v10, (a2)
-; CHECK-NEXT: vle8.v v11, (a3)
-; CHECK-NEXT: vsext.vf2 v12, v8
+; CHECK-NEXT: vle8.v v8, (a1)
+; CHECK-NEXT: vle8.v v9, (a2)
+; CHECK-NEXT: vsext.vf2 v10, v8
; CHECK-NEXT: vsext.vf2 v8, v9
-; CHECK-NEXT: vsext.vf2 v9, v10
-; CHECK-NEXT: vsext.vf2 v10, v11
-; CHECK-NEXT: vmul.vv v11, v12, v10
-; CHECK-NEXT: vmul.vv v10, v8, v10
-; CHECK-NEXT: vdivu.vv v8, v8, v9
-; CHECK-NEXT: vor.vv v9, v11, v10
+; CHECK-NEXT: vdivu.vv v8, v10, v8
+; CHECK-NEXT: vle8.v v9, (a0)
+; CHECK-NEXT: vle8.v v11, (a3)
+; CHECK-NEXT: vsext.vf2 v12, v9
+; CHECK-NEXT: vsext.vf2 v9, v11
+; CHECK-NEXT: vmul.vv v11, v12, v9
+; CHECK-NEXT: vmul.vv v9, v10, v9
+; CHECK-NEXT: vor.vv v9, v11, v9
; CHECK-NEXT: vor.vv v8, v9, v8
; CHECK-NEXT: ret
%a = load <2 x i8>, ptr %x
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfdiv-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfdiv-constrained-sdnode.ll
index 07750623dd44b..217a02d08dead 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfdiv-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfdiv-constrained-sdnode.ll
@@ -221,16 +221,16 @@ define <vscale x 32 x bfloat> @vfdiv_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
-; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfdiv.vv v0, v0, v8
+; CHECK-NEXT: vfdiv.vv v16, v0, v16
; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v12
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfdiv.vv v16, v16, v24
+; CHECK-NEXT: vfdiv.vv v24, v0, v24
; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add sp, sp, a0
@@ -249,32 +249,42 @@ define <vscale x 32 x bfloat> @vfdiv_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bf
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: sub sp, sp, a0
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; CHECK-NEXT: fmv.x.h a0, fa0
; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma
-; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vmv.v.x v16, a0
; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v16
+; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfdiv.vv v0, v8, v0
+; CHECK-NEXT: vfdiv.vv v24, v16, v0
; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v12
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfdiv.vv v16, v24, v16
+; CHECK-NEXT: vfdiv.vv v16, v0, v8
; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24
; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: .cfi_def_cfa sp, 16
; CHECK-NEXT: addi sp, sp, 16
@@ -573,16 +583,16 @@ define <vscale x 32 x half> @vfdiv_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfdiv.vv v0, v0, v8
+; ZVFHMIN-NEXT: vfdiv.vv v16, v0, v16
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v12
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfdiv.vv v16, v16, v24
+; ZVFHMIN-NEXT: vfdiv.vv v24, v0, v24
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24
; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a0, a0, 3
; ZVFHMIN-NEXT: add sp, sp, a0
@@ -607,32 +617,42 @@ define <vscale x 32 x half> @vfdiv_vf_nxv32f16(<vscale x 32 x half> %va, half %b
; ZVFHMIN-NEXT: addi sp, sp, -16
; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: slli a0, a0, 4
; ZVFHMIN-NEXT: sub sp, sp, a0
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; ZVFHMIN-NEXT: fmv.x.h a0, fa0
; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
; ZVFHMIN-NEXT: addi a1, sp, 16
; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12
; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT: vmv.v.x v8, a0
+; ZVFHMIN-NEXT: vmv.v.x v16, a0
; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfdiv.vv v0, v8, v0
+; ZVFHMIN-NEXT: vfdiv.vv v24, v16, v0
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v12
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfdiv.vv v16, v24, v16
+; ZVFHMIN-NEXT: vfdiv.vv v16, v0, v8
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24
; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16
; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: slli a0, a0, 4
; ZVFHMIN-NEXT: add sp, sp, a0
; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
; ZVFHMIN-NEXT: addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfdiv-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfdiv-sdnode.ll
index e671ba850415b..9aba6455f0fac 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfdiv-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfdiv-sdnode.ll
@@ -200,16 +200,16 @@ define <vscale x 32 x bfloat> @vfdiv_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
-; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfdiv.vv v0, v0, v8
+; CHECK-NEXT: vfdiv.vv v16, v0, v16
; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v12
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfdiv.vv v16, v16, v24
+; CHECK-NEXT: vfdiv.vv v24, v0, v24
; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add sp, sp, a0
@@ -224,39 +224,23 @@ define <vscale x 32 x bfloat> @vfdiv_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
define <vscale x 32 x bfloat> @vfdiv_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bfloat %b) {
; CHECK-LABEL: vfdiv_vf_nxv32bf16:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: sub sp, sp, a0
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; CHECK-NEXT: fmv.x.h a0, fa0
; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT: addi a1, sp, 16
-; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12
; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma
; CHECK-NEXT: vmv.v.x v8, a0
; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma
; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8
-; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfdiv.vv v0, v8, v0
+; CHECK-NEXT: vfdiv.vv v16, v16, v0
; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v8, v0
+; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v12
; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT: vfdiv.vv v16, v24, v16
+; CHECK-NEXT: vfdiv.vv v24, v24, v0
; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24
; CHECK-NEXT: ret
%head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
%splat = shufflevector <vscale x 32 x bfloat> %head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
@@ -528,16 +512,16 @@ define <vscale x 32 x half> @vfdiv_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfdiv.vv v0, v0, v8
+; ZVFHMIN-NEXT: vfdiv.vv v16, v0, v16
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v12
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfdiv.vv v16, v16, v24
+; ZVFHMIN-NEXT: vfdiv.vv v24, v0, v24
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24
; ZVFHMIN-NEXT: csrr a0, vlenb
; ZVFHMIN-NEXT: slli a0, a0, 3
; ZVFHMIN-NEXT: add sp, sp, a0
@@ -558,39 +542,23 @@ define <vscale x 32 x half> @vfdiv_vf_nxv32f16(<vscale x 32 x half> %va, half %b
;
; ZVFHMIN-LABEL: vfdiv_vf_nxv32f16:
; ZVFHMIN: # %bb.0:
-; ZVFHMIN-NEXT: addi sp, sp, -16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: sub sp, sp, a0
-; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; ZVFHMIN-NEXT: fmv.x.h a0, fa0
; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT: addi a1, sp, 16
-; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12
; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma
; ZVFHMIN-NEXT: vmv.v.x v8, a0
; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8
-; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT: addi a0, sp, 16
-; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfdiv.vv v0, v8, v0
+; ZVFHMIN-NEXT: vfdiv.vv v16, v16, v0
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v12
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT: vfdiv.vv v16, v24, v16
+; ZVFHMIN-NEXT: vfdiv.vv v24, v24, v0
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16
-; ZVFHMIN-NEXT: csrr a0, vlenb
-; ZVFHMIN-NEXT: slli a0, a0, 3
-; ZVFHMIN-NEXT: add sp, sp, a0
-; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT: addi sp, sp, 16
-; ZVFHMIN-NEXT: .cfi_def_cfa_offset 0
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24
; ZVFHMIN-NEXT: ret
%head = insertelement <vscale x 32 x half> poison, half %b, i32 0
%splat = shufflevector <vscale x 32 x half> %head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
lukel97
left a comment
LGTM. I think it would still be nice to have a default scheduling model chosen but this is better than nothing
topperc
left a comment
LGTM
I'm seeing a 2.15% speedup on 544.nab_r after this change for both rva22u64 and rva22u64_v. You can see some of the …