Skip to content

Commit 9a98db5

Browse files
rth7680 authored and Michael Tokarev committed
target/arm: Fix f16_dotadd vs nan selection
Implement FPProcessNaNs4 within f16_dotadd, rather than simply letting NaNs propagate through the function.

Cc: [email protected]
Fixes: 3916841 ("target/arm: Implement FMOPA, FMOPS (widening)")
Reviewed-by: Peter Maydell <[email protected]>
Signed-off-by: Richard Henderson <[email protected]>
Message-id: [email protected]
Signed-off-by: Peter Maydell <[email protected]>
(cherry picked from commit cfc688c00ade84f6b32c7814b52c217f1d3b5eb1)
Signed-off-by: Michael Tokarev <[email protected]>
1 parent 9af1de0 commit 9a98db5

File tree

1 file changed

+46
-16
lines changed

1 file changed

+46
-16
lines changed

target/arm/tcg/sme_helper.c

Lines changed: 46 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -1005,25 +1005,55 @@ static float32 f16_dotadd(float32 sum, uint32_t e1, uint32_t e2,
10051005
* - we have pre-set-up copy of s_std which is set to round-to-odd,
10061006
* for the multiply (see below)
10071007
*/
1008-
float64 e1r = float16_to_float64(e1 & 0xffff, true, s_f16);
1009-
float64 e1c = float16_to_float64(e1 >> 16, true, s_f16);
1010-
float64 e2r = float16_to_float64(e2 & 0xffff, true, s_f16);
1011-
float64 e2c = float16_to_float64(e2 >> 16, true, s_f16);
1012-
float64 t64;
1008+
float16 h1r = e1 & 0xffff;
1009+
float16 h1c = e1 >> 16;
1010+
float16 h2r = e2 & 0xffff;
1011+
float16 h2c = e2 >> 16;
10131012
float32 t32;
10141013

1015-
/*
1016-
* The ARM pseudocode function FPDot performs both multiplies
1017-
* and the add with a single rounding operation. Emulate this
1018-
* by performing the first multiply in round-to-odd, then doing
1019-
* the second multiply as fused multiply-add, and rounding to
1020-
* float32 all in one step.
1021-
*/
1022-
t64 = float64_mul(e1r, e2r, s_odd);
1023-
t64 = float64r32_muladd(e1c, e2c, t64, 0, s_std);
1014+
/* C.f. FPProcessNaNs4 */
1015+
if (float16_is_any_nan(h1r) || float16_is_any_nan(h1c) ||
1016+
float16_is_any_nan(h2r) || float16_is_any_nan(h2c)) {
1017+
float16 t16;
1018+
1019+
if (float16_is_signaling_nan(h1r, s_f16)) {
1020+
t16 = h1r;
1021+
} else if (float16_is_signaling_nan(h1c, s_f16)) {
1022+
t16 = h1c;
1023+
} else if (float16_is_signaling_nan(h2r, s_f16)) {
1024+
t16 = h2r;
1025+
} else if (float16_is_signaling_nan(h2c, s_f16)) {
1026+
t16 = h2c;
1027+
} else if (float16_is_any_nan(h1r)) {
1028+
t16 = h1r;
1029+
} else if (float16_is_any_nan(h1c)) {
1030+
t16 = h1c;
1031+
} else if (float16_is_any_nan(h2r)) {
1032+
t16 = h2r;
1033+
} else {
1034+
t16 = h2c;
1035+
}
1036+
t32 = float16_to_float32(t16, true, s_f16);
1037+
} else {
1038+
float64 e1r = float16_to_float64(h1r, true, s_f16);
1039+
float64 e1c = float16_to_float64(h1c, true, s_f16);
1040+
float64 e2r = float16_to_float64(h2r, true, s_f16);
1041+
float64 e2c = float16_to_float64(h2c, true, s_f16);
1042+
float64 t64;
10241043

1025-
/* This conversion is exact, because we've already rounded. */
1026-
t32 = float64_to_float32(t64, s_std);
1044+
/*
1045+
* The ARM pseudocode function FPDot performs both multiplies
1046+
* and the add with a single rounding operation. Emulate this
1047+
* by performing the first multiply in round-to-odd, then doing
1048+
* the second multiply as fused multiply-add, and rounding to
1049+
* float32 all in one step.
1050+
*/
1051+
t64 = float64_mul(e1r, e2r, s_odd);
1052+
t64 = float64r32_muladd(e1c, e2c, t64, 0, s_std);
1053+
1054+
/* This conversion is exact, because we've already rounded. */
1055+
t32 = float64_to_float32(t64, s_std);
1056+
}
10271057

10281058
/* The final accumulation step is not fused. */
10291059
return float32_add(sum, t32, s_std);

0 commit comments

Comments
 (0)