@@ -135,9 +135,9 @@ source %{
135135 (opcode == Op_VectorCastL2X && bt == T_FLOAT) ||
136136 (opcode == Op_CountLeadingZerosV && bt == T_LONG) ||
137137 (opcode == Op_CountTrailingZerosV && bt == T_LONG) ||
138- // The vector implementation of Op_AddReductionVD/F is for the Vector API only.
139- // It is not suitable for auto-vectorization because it does not add the elements
140- // in the same order as sequential code, and FP addition is non-associative .
138+ // The implementations of Op_AddReductionVD/F in Neon are for the Vector API only.
139+ // They are not suitable for auto-vectorization because they do not add the elements in
140+ // sequential order, so the result would not conform to the JLS, Section 15.7 "Evaluation Order".
141141 opcode == Op_AddReductionVD || opcode == Op_AddReductionVF ||
142142 opcode == Op_MulReductionVD || opcode == Op_MulReductionVF ||
143143 opcode == Op_MulVL) {
@@ -2858,26 +2858,28 @@ instruct reduce_addL_sve(iRegLNoSp dst, iRegL isrc, vReg vsrc, vRegD tmp) %{
28582858%}
28592859
28602860// reduction addF
2861- // Floating-point addition is not associative, so the rules for AddReductionVF
2862- // on NEON can't be used to auto-vectorize floating-point reduce-add.
2863- // Currently, on NEON, AddReductionVF is only generated by Vector API.
2864- instruct reduce_add2F_neon(vRegF dst, vRegF fsrc, vReg vsrc) %{
2865- predicate(UseSVE == 0 && Matcher::vector_length(n->in(2)) == 2);
2861+
2862+ instruct reduce_non_strict_order_add2F_neon(vRegF dst, vRegF fsrc, vReg vsrc) %{
2863+ // Non-strictly ordered floating-point add reduction for a 64-bit vector (two floats).
2864+ // Matches only AddReductionVF nodes that do not require strict order. This rule is
2864+ // intended for the VectorAPI (which allows for non-strictly ordered add reduction).
2865+ predicate(Matcher::vector_length(n->in(2)) == 2 && !n->as_Reduction()->requires_strict_order() );
28662866 match(Set dst (AddReductionVF fsrc vsrc));
28672867 effect(TEMP_DEF dst);
2868- format %{ "reduce_add2F_neon $dst, $fsrc, $vsrc" %}
2868+ format %{ "reduce_non_strict_order_add2F_neon $dst, $fsrc, $vsrc" %}
28692869 ins_encode %{
28702870 __ faddp($dst$$FloatRegister, $vsrc$$FloatRegister, __ S); // dst = vsrc[0] + vsrc[1] (scalar pairwise add)
28712871 __ fadds($dst$$FloatRegister, $dst$$FloatRegister, $fsrc$$FloatRegister); // dst = dst + fsrc (scalar input added last)
28722872 %}
28732873 ins_pipe(pipe_slow);
28742874%}
28752875
2876- instruct reduce_add4F_neon(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{
2877- predicate(UseSVE == 0 && Matcher::vector_length(n->in(2)) == 4);
2876+ instruct reduce_non_strict_order_add4F_neon(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{
2877+ // Non-strictly ordered floating-point add reduction for a 128-bit vector (four floats).
2878+ // Matches only AddReductionVF nodes that do not require strict order. This rule is
2878+ // intended for the VectorAPI (which allows for non-strictly ordered add reduction).
2879+ predicate(Matcher::vector_length(n->in(2)) == 4 && !n->as_Reduction()->requires_strict_order());
28782880 match(Set dst (AddReductionVF fsrc vsrc));
28792881 effect(TEMP_DEF dst, TEMP tmp);
2880- format %{ "reduce_add4F_neon $dst, $fsrc, $vsrc\t# KILL $tmp" %}
2882+ format %{ "reduce_non_strict_order_add4F_neon $dst, $fsrc, $vsrc\t# KILL $tmp" %}
28812883 ins_encode %{
28822884 __ faddp($tmp$$FloatRegister, __ T4S, $vsrc$$FloatRegister, $vsrc$$FloatRegister); // tmp = pairwise sums of vsrc: lanes 0 and 1 hold vsrc[0]+vsrc[1] and vsrc[2]+vsrc[3]
28832885 __ faddp($dst$$FloatRegister, $tmp$$FloatRegister, __ S); // dst = tmp[0] + tmp[1], i.e. the pairs are summed tree-wise, not sequentially
@@ -2886,11 +2888,21 @@ instruct reduce_add4F_neon(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{
28862888 ins_pipe(pipe_slow);
28872889%}
28882890
2891+ // This rule computes the reduction result in strict order. Two cases
2892+ // reach here:
2893+ // 1. Non-strictly ordered AddReductionVF when the vector size is > 128 bits, for
2894+ // example an AddReductionVF generated by the Vector API. For vector sizes > 128 bits
2895+ // it is more beneficial performance-wise to emit the SVE instruction directly, even
2896+ // though it is strictly ordered.
2897+ // 2. Strictly ordered AddReductionVF, for example an AddReductionVF generated by
2898+ // auto-vectorization on an SVE machine.
28892899instruct reduce_addF_sve(vRegF dst_src1, vReg src2) %{
2890- predicate(UseSVE > 0);
2900+ predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))) ||
2901+ n->as_Reduction()->requires_strict_order());
28912902 match(Set dst_src1 (AddReductionVF dst_src1 src2));
28922903 format %{ "reduce_addF_sve $dst_src1, $dst_src1, $src2" %}
28932904 ins_encode %{
2905+ assert(UseSVE > 0, "must be sve"); // the predicate no longer tests UseSVE itself, so it is checked here
28942906 uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src2);
28952907 assert(length_in_bytes == MaxVectorSize, "invalid vector length"); // only full-width vectors are expected by this rule
28962908 __ sve_fadda($dst_src1$$FloatRegister, __ S, ptrue, $src2$$FloatRegister); // accumulate the float lanes of src2 into dst_src1 in order (ptrue: presumably the all-true predicate)
@@ -2899,26 +2911,36 @@ instruct reduce_addD_sve(vRegD dst_src1, vReg src2) %{
28992911%}
29002912
29012913// reduction addD
2902- // Floating-point addition is not associative, so the rule for AddReductionVD
2903- // on NEON can't be used to auto-vectorize floating-point reduce-add.
2904- // Currently, on NEON, AddReductionVD is only generated by Vector API.
2905- instruct reduce_addD_neon(vRegD dst, vRegD dsrc, vReg vsrc) %{
2906- predicate(UseSVE == 0 );
2914+
2915+ instruct reduce_non_strict_order_add2D_neon(vRegD dst, vRegD dsrc, vReg vsrc) %{
2916+ // Non-strictly ordered floating-point add reduction for a 128-bit vector (two doubles). This rule is
2917+ // intended for the VectorAPI (which allows for non-strictly ordered add reduction). The vector-length check keeps wider vectors away from this rule, which only sums two lanes.
2918+ predicate(Matcher::vector_length(n->in(2)) == 2 && !n->as_Reduction()->requires_strict_order());
29072919 match(Set dst (AddReductionVD dsrc vsrc));
29082920 effect(TEMP_DEF dst);
2909- format %{ "reduce_addD_neon $dst, $dsrc, $vsrc\t# 2D" %}
2921+ format %{ "reduce_non_strict_order_add2D_neon $dst, $dsrc, $vsrc\t# 2D" %}
29102922 ins_encode %{
29112923 __ faddp($dst$$FloatRegister, $vsrc$$FloatRegister, __ D); // dst = vsrc[0] + vsrc[1] (scalar pairwise add)
29122924 __ faddd($dst$$FloatRegister, $dst$$FloatRegister, $dsrc$$FloatRegister); // dst = dst + dsrc (scalar input added last)
29132925 %}
29142926 ins_pipe(pipe_slow);
29152927%}
29162928
2929+ // This rule computes the reduction result in strict order. Two cases
2930+ // reach here:
2931+ // 1. Non-strictly ordered AddReductionVD when the vector size is > 128 bits, for
2932+ // example an AddReductionVD generated by the Vector API. For vector sizes > 128 bits
2933+ // it is more beneficial performance-wise to emit the SVE instruction directly, even
2934+ // though it is strictly ordered.
2935+ // 2. Strictly ordered AddReductionVD, for example an AddReductionVD generated by
2936+ // auto-vectorization on an SVE machine.
29172937instruct reduce_addD_sve(vRegD dst_src1, vReg src2) %{
2918- predicate(UseSVE > 0);
2938+ predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n->in(2))) ||
2939+ n->as_Reduction()->requires_strict_order());
29192940 match(Set dst_src1 (AddReductionVD dst_src1 src2));
29202941 format %{ "reduce_addD_sve $dst_src1, $dst_src1, $src2" %}
29212942 ins_encode %{
2943+ assert(UseSVE > 0, "must be sve");
29222944 uint length_in_bytes = Matcher::vector_length_in_bytes(this, $src2);
29232945 assert(length_in_bytes == MaxVectorSize, "invalid vector length");
29242946 __ sve_fadda($dst_src1$$FloatRegister, __ D, ptrue, $src2$$FloatRegister);
0 commit comments