@@ -2989,31 +2989,62 @@ float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2, float_status *fpst)
2989
2989
float32 bfdotadd_ebf (float32 sum , uint32_t e1 , uint32_t e2 ,
2990
2990
float_status * fpst , float_status * fpst_odd )
2991
2991
{
2992
- /*
2993
- * Compare f16_dotadd() in sme_helper.c, but here we have
2994
- * bfloat16 inputs. In particular that means that we do not
2995
- * want the FPCR.FZ16 flush semantics, so we use the normal
2996
- * float_status for the input handling here.
2997
- */
2998
- float64 e1r = float32_to_float64 (e1 << 16 , fpst );
2999
- float64 e1c = float32_to_float64 (e1 & 0xffff0000u , fpst );
3000
- float64 e2r = float32_to_float64 (e2 << 16 , fpst );
3001
- float64 e2c = float32_to_float64 (e2 & 0xffff0000u , fpst );
3002
- float64 t64 ;
2992
+ float32 s1r = e1 << 16 ;
2993
+ float32 s1c = e1 & 0xffff0000u ;
2994
+ float32 s2r = e2 << 16 ;
2995
+ float32 s2c = e2 & 0xffff0000u ;
3003
2996
float32 t32 ;
3004
2997
3005
- /*
3006
- * The ARM pseudocode function FPDot performs both multiplies
3007
- * and the add with a single rounding operation. Emulate this
3008
- * by performing the first multiply in round-to-odd, then doing
3009
- * the second multiply as fused multiply-add, and rounding to
3010
- * float32 all in one step.
3011
- */
3012
- t64 = float64_mul (e1r , e2r , fpst_odd );
3013
- t64 = float64r32_muladd (e1c , e2c , t64 , 0 , fpst );
2998
+ /* C.f. FPProcessNaNs4 */
2999
+ if (float32_is_any_nan (s1r ) || float32_is_any_nan (s1c ) ||
3000
+ float32_is_any_nan (s2r ) || float32_is_any_nan (s2c )) {
3001
+ if (float32_is_signaling_nan (s1r , fpst )) {
3002
+ t32 = s1r ;
3003
+ } else if (float32_is_signaling_nan (s1c , fpst )) {
3004
+ t32 = s1c ;
3005
+ } else if (float32_is_signaling_nan (s2r , fpst )) {
3006
+ t32 = s2r ;
3007
+ } else if (float32_is_signaling_nan (s2c , fpst )) {
3008
+ t32 = s2c ;
3009
+ } else if (float32_is_any_nan (s1r )) {
3010
+ t32 = s1r ;
3011
+ } else if (float32_is_any_nan (s1c )) {
3012
+ t32 = s1c ;
3013
+ } else if (float32_is_any_nan (s2r )) {
3014
+ t32 = s2r ;
3015
+ } else {
3016
+ t32 = s2c ;
3017
+ }
3018
+ /*
3019
+ * FPConvertNaN(FPProcessNaN(t32)) will be done as part
3020
+ * of the final addition below.
3021
+ */
3022
+ } else {
3023
+ /*
3024
+ * Compare f16_dotadd() in sme_helper.c, but here we have
3025
+ * bfloat16 inputs. In particular that means that we do not
3026
+ * want the FPCR.FZ16 flush semantics, so we use the normal
3027
+ * float_status for the input handling here.
3028
+ */
3029
+ float64 e1r = float32_to_float64 (s1r , fpst );
3030
+ float64 e1c = float32_to_float64 (s1c , fpst );
3031
+ float64 e2r = float32_to_float64 (s2r , fpst );
3032
+ float64 e2c = float32_to_float64 (s2c , fpst );
3033
+ float64 t64 ;
3014
3034
3015
- /* This conversion is exact, because we've already rounded. */
3016
- t32 = float64_to_float32 (t64 , fpst );
3035
+ /*
3036
+ * The ARM pseudocode function FPDot performs both multiplies
3037
+ * and the add with a single rounding operation. Emulate this
3038
+ * by performing the first multiply in round-to-odd, then doing
3039
+ * the second multiply as fused multiply-add, and rounding to
3040
+ * float32 all in one step.
3041
+ */
3042
+ t64 = float64_mul (e1r , e2r , fpst_odd );
3043
+ t64 = float64r32_muladd (e1c , e2c , t64 , 0 , fpst );
3044
+
3045
+ /* This conversion is exact, because we've already rounded. */
3046
+ t32 = float64_to_float32 (t64 , fpst );
3047
+ }
3017
3048
3018
3049
/* The final accumulation step is not fused. */
3019
3050
return float32_add (sum , t32 , fpst );
0 commit comments