@@ -1005,25 +1005,55 @@ static float32 f16_dotadd(float32 sum, uint32_t e1, uint32_t e2,
1005
1005
* - we have pre-set-up copy of s_std which is set to round-to-odd,
1006
1006
* for the multiply (see below)
1007
1007
*/
1008
- float64 e1r = float16_to_float64 (e1 & 0xffff , true, s_f16 );
1009
- float64 e1c = float16_to_float64 (e1 >> 16 , true, s_f16 );
1010
- float64 e2r = float16_to_float64 (e2 & 0xffff , true, s_f16 );
1011
- float64 e2c = float16_to_float64 (e2 >> 16 , true, s_f16 );
1012
- float64 t64 ;
1008
+ float16 h1r = e1 & 0xffff ;
1009
+ float16 h1c = e1 >> 16 ;
1010
+ float16 h2r = e2 & 0xffff ;
1011
+ float16 h2c = e2 >> 16 ;
1013
1012
float32 t32 ;
1014
1013
1015
- /*
1016
- * The ARM pseudocode function FPDot performs both multiplies
1017
- * and the add with a single rounding operation. Emulate this
1018
- * by performing the first multiply in round-to-odd, then doing
1019
- * the second multiply as fused multiply-add, and rounding to
1020
- * float32 all in one step.
1021
- */
1022
- t64 = float64_mul (e1r , e2r , s_odd );
1023
- t64 = float64r32_muladd (e1c , e2c , t64 , 0 , s_std );
1014
+ /* Cf. FPProcessNaNs4 */
1015
+ if (float16_is_any_nan (h1r ) || float16_is_any_nan (h1c ) ||
1016
+ float16_is_any_nan (h2r ) || float16_is_any_nan (h2c )) {
1017
+ float16 t16 ;
1018
+
1019
+ if (float16_is_signaling_nan (h1r , s_f16 )) {
1020
+ t16 = h1r ;
1021
+ } else if (float16_is_signaling_nan (h1c , s_f16 )) {
1022
+ t16 = h1c ;
1023
+ } else if (float16_is_signaling_nan (h2r , s_f16 )) {
1024
+ t16 = h2r ;
1025
+ } else if (float16_is_signaling_nan (h2c , s_f16 )) {
1026
+ t16 = h2c ;
1027
+ } else if (float16_is_any_nan (h1r )) {
1028
+ t16 = h1r ;
1029
+ } else if (float16_is_any_nan (h1c )) {
1030
+ t16 = h1c ;
1031
+ } else if (float16_is_any_nan (h2r )) {
1032
+ t16 = h2r ;
1033
+ } else {
1034
+ t16 = h2c ;
1035
+ }
1036
+ t32 = float16_to_float32 (t16 , true, s_f16 );
1037
+ } else {
1038
+ float64 e1r = float16_to_float64 (h1r , true, s_f16 );
1039
+ float64 e1c = float16_to_float64 (h1c , true, s_f16 );
1040
+ float64 e2r = float16_to_float64 (h2r , true, s_f16 );
1041
+ float64 e2c = float16_to_float64 (h2c , true, s_f16 );
1042
+ float64 t64 ;
1024
1043
1025
- /* This conversion is exact, because we've already rounded. */
1026
- t32 = float64_to_float32 (t64 , s_std );
1044
+ /*
1045
+ * The ARM pseudocode function FPDot performs both multiplies
1046
+ * and the add with a single rounding operation. Emulate this
1047
+ * by performing the first multiply in round-to-odd, then doing
1048
+ * the second multiply as fused multiply-add, and rounding to
1049
+ * float32 all in one step.
1050
+ */
1051
+ t64 = float64_mul (e1r , e2r , s_odd );
1052
+ t64 = float64r32_muladd (e1c , e2c , t64 , 0 , s_std );
1053
+
1054
+ /* This conversion is exact, because we've already rounded. */
1055
+ t32 = float64_to_float32 (t64 , s_std );
1056
+ }
1027
1057
1028
1058
/* The final accumulation step is not fused. */
1029
1059
return float32_add (sum , t32 , s_std );
0 commit comments