Implement bfloat16 round-to-even

thecoop · thecoop · commit 32137234e99f · 2025-11-28T11:40:17.000Z
diff --git a/server/src/main/java/org/elasticsearch/index/codec/vectors/BFloat16.java b/server/src/main/java/org/elasticsearch/index/codec/vectors/BFloat16.java
@@ -18,18 +18,29 @@ public final class BFloat16 {
     public static final int BYTES = Short.BYTES;
 
     public static short floatToBFloat16(float f) {
-        // this rounds towards 0
+        // this rounds to nearest, with ties going to even
         // zero - zero exp, zero fraction
         // denormal - zero exp, non-zero fraction
         // infinity - all-1 exp, zero fraction
         // NaN - all-1 exp, non-zero fraction
         // the Float.NaN constant is 0x7fc0_0000, so this won't turn the most common NaN values into
         // infinities
-        return (short) (Float.floatToIntBits(f) >>> 16);
+
+        int bits = Float.floatToIntBits(f);
+        int bfloat16 = bits >>> 16;
+
+        // round up when the highest discarded bit (bit 15) is set and either a lower discarded
+        // bit is set (above halfway) or the kept value is odd (exact tie, go to even);
+        // a carry out of the mantissa increments the exponent field (largest finite -> infinity)
+        if ((bits & 0x8000) == 0x8000 && ((bits & 0x7fff) != 0 || (bfloat16 & 1) == 1)) {
+            bfloat16++;
+        }
+
+        return (short) bfloat16;
     }
 
     public static float truncateToBFloat16(float f) {
-        return Float.intBitsToFloat(Float.floatToIntBits(f) & 0xffff0000);
+        return Float.intBitsToFloat(floatToBFloat16(f) << 16); // rounds via floatToBFloat16 (not a plain truncation); << 16 shifts out the short's sign extension
     }
 
     public static float bFloat16ToFloat(short bf) {
diff --git a/server/src/test/java/org/elasticsearch/index/codec/vectors/BFloat16Tests.java b/server/src/test/java/org/elasticsearch/index/codec/vectors/BFloat16Tests.java
@@ -0,0 +1,85 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the "Elastic License
+ * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
+ * Public License v 1"; you may not use this file except in compliance with, at
+ * your election, the "Elastic License 2.0", the "GNU Affero General Public
+ * License v3.0 only", or the "Server Side Public License, v 1".
+ */
+
+package org.elasticsearch.index.codec.vectors;
+
+import org.elasticsearch.test.ESTestCase;
+
+import static org.hamcrest.Matchers.closeTo;
+import static org.hamcrest.Matchers.equalTo;
+import static org.hamcrest.Matchers.greaterThanOrEqualTo;
+
+public class BFloat16Tests extends ESTestCase {
+
+    public void testRoundToEven() {
+        int exp = 0b001111110;  // sign 0, biased exponent 126: values in [0.5, 1.0)
+
+        // exact bfloat16 value
+        float bfloat16 = construct(exp, 0b1111001_00000000_00000000);
+        assertRounding(bfloat16, bfloat16);
+
+        // below halfway: round down
+        assertRounding(construct(exp, 0b0000001_01111111_11111111), construct(exp, 0b0000001_00000000_00000000));
+
+        // above halfway: round up
+        assertRounding(construct(exp, 0b0000001_10000000_00000001), construct(exp, 0b0000010_00000000_00000000));
+
+        // exact tie with an even kept mantissa: stays (rounds down to even)
+        assertRounding(construct(exp, 0b0000010_10000000_00000000), construct(exp, 0b0000010_00000000_00000000));
+
+        // exact tie with an odd kept mantissa: rounds up to even
+        assertRounding(construct(exp, 0b0000001_10000000_00000000), construct(exp, 0b0000010_00000000_00000000));
+
+        // round up, overflowing into exponent
+        assertRounding(construct(0b000111111, 0b1111111_10000000_00000000), construct(0b001000000, 0b0000000_00000000_00000000));
+
+        // round up, overflowing from denormal to normal number
+        assertRounding(construct(0b000000000, 0b1111111_10000000_00000000), construct(0b000000001, 0b0000000_00000000_00000000));
+
+        // round to positive infinity
+        assertThat(BFloat16.truncateToBFloat16(construct(0b011111110, 0b1111111_10000000_00000000)), equalTo(Float.POSITIVE_INFINITY));
+
+        // round to negative infinity
+        assertThat(BFloat16.truncateToBFloat16(construct(0b111111110, 0b1111111_10000000_00000000)), equalTo(Float.NEGATIVE_INFINITY));
+
+        // exact tie between zero and the smallest bfloat16 denormal: rounds to even, i.e. zero
+        assertRounding(construct(0b000000000, 0b0000000_10000000_00000000), 0f);
+
+        // rounding the standard NaN value should be unchanged
+        assertThat(Float.floatToIntBits(BFloat16.truncateToBFloat16(Float.NaN)), equalTo(Float.floatToIntBits(Float.NaN)));
+    }
+
+    private static float construct(int exp, int mantissa) {
+        assert (exp & 0xfffffe00) == 0; // exp holds the sign bit plus the 8 exponent bits (9 bits), placed at bits 23-31
+        assert (mantissa & 0xff800000) == 0; // a float mantissa is exactly 23 bits; anything higher would corrupt the exponent
+        return Float.intBitsToFloat((exp << 23) | mantissa);
+    }
+
+    private static void assertRounding(float value, float expectedRounded) {
+        assert (Float.floatToIntBits(expectedRounded) & 0xffff) == 0; // the expectation must itself be an exact bfloat16 (low 16 bits clear)
+
+        // sanity-check the test data itself: the expected value must lie close to the input,
+        // otherwise the hand-written bit patterns passed by the caller are wrong
+        assertThat((double) expectedRounded, closeTo(value, 0.002));
+
+        float rounded = BFloat16.truncateToBFloat16(value);
+
+        // compare raw bit patterns so the match is exact rather than a floating-point comparison
+        assertEquals(value + " rounded to " + rounded + ", not " + expectedRounded,
+            Float.floatToIntBits(expectedRounded), Float.floatToIntBits(rounded));
+
+        // neither adjacent bfloat16 bit pattern (+/- 0x10000) may be nearer to the input than the result (using FP math)
+        float delta = Math.abs(value - rounded);
+        float higherValue = Float.intBitsToFloat(Float.floatToIntBits(rounded) + 0x10000);
+        assertThat(Math.abs(value - higherValue), greaterThanOrEqualTo(delta));
+        // NOTE(review): when rounded is 0f the pattern below is 0xffff0000, which decodes to NaN - confirm the check is meaningful there
+        float lowerValue = Float.intBitsToFloat(Float.floatToIntBits(rounded) - 0x10000);
+        assertThat(Math.abs(value - lowerValue), greaterThanOrEqualTo(delta));
+    }
+}