
Commit b2bedf7

Merge pull request #29 from graphcore-research/ndarray
Add vectorized versions of round/encode/decode.
2 parents ac301f5 + c672a97 commit b2bedf7

13 files changed (+625, -64 lines)


docs/source/04-benchmark.ipynb

Lines changed: 128 additions & 0 deletions
@@ -0,0 +1,128 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Copyright (c) 2024 Graphcore Ltd. All rights reserved.\n",
    "\n",
    "import numpy as np\n",
    "import jax\n",
    "import jax.numpy as jnp\n",
    "import ml_dtypes\n",
    "import gfloat\n",
    "from gfloat.formats import format_info_ocp_e5m2\n",
    "from timeit import Timer\n",
    "\n",
    "jax.config.update(\"jax_enable_x64\", True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Timing tests\n",
    "\n",
    "The `gfloat` library is designed for readability over performance, and the reference code for computations is the (slow) scalar code e.g. `round_float`.\n",
    "\n",
    "There are vectorized implementations (e.g. `round_ndarray`), and when combined with JAX, these can go reasonably fast.\n",
    "\n",
    "Let's see how long it takes to encode some values to FP8..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "An NVIDIA GPU may be present on this machine, but a CUDA-enabled jaxlib is not installed. Falling back to cpu.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "GFloat scalar : 6062.08 nsec (25 runs at size 10000)\n",
      "GFloat vectorized, numpy arrays: 53.39 nsec (25 runs at size 1000000)\n",
      "GFloat vectorized, JAX JIT : 3.48 nsec (500 runs at size 1000000)\n",
      "ML_dtypes : 3.27 nsec (500 runs at size 1000000)\n"
     ]
    }
   ],
   "source": [
    "N = 1_000_000\n",
    "a = np.random.rand(N)\n",
    "\n",
    "jax_round_jit = jax.jit(lambda x: gfloat.round_ndarray(format_info_ocp_e5m2, x, np=jnp))\n",
    "ja = jnp.array(a)\n",
    "jax_round_jit(ja)  # Cache compilation\n",
    "\n",
    "\n",
    "def slow_round_ndarray(fi, a):\n",
    "    return np.array([gfloat.round_float(fi, x) for x in a])\n",
    "\n",
    "\n",
    "# About how many seconds to run for (autorange will take at least .2 sec)\n",
    "ACCURACY = 1.0\n",
    "\n",
    "\n",
    "def time(f, problem_size=1.0):\n",
    "    units = 1e9  # nsec\n",
    "    t = Timer(f)\n",
    "    f()  # pre-run\n",
    "    n = int(t.autorange()[0] * ACCURACY / 0.2)\n",
    "    ts = t.repeat(repeat=3, number=n)  # best of 3\n",
    "    ts = [((t / n) / problem_size) * units for t in ts]  # per run\n",
    "    return f\"{min(ts):8.2f} nsec ({n} runs at size {problem_size})\"\n",
    "\n",
    "\n",
    "# fmt: off\n",
    "print(\"GFloat scalar :\", time(lambda: slow_round_ndarray(format_info_ocp_e5m2, a[: N // 100]), N // 100))\n",
    "print(\"GFloat vectorized, numpy arrays:\", time(lambda: gfloat.round_ndarray(format_info_ocp_e5m2, a), N))\n",
    "print(\"GFloat vectorized, JAX JIT :\", time(lambda: jax_round_jit(ja), N))\n",
    "print(\"ML_dtypes :\", time(lambda: a.astype(ml_dtypes.float8_e5m2), N))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "On one CPU platform the timings were:\n",
    "```\n",
    "GFloat scalar : 6996.75 nsec (50 runs at size 10000)\n",
    "GFloat vectorized, numpy arrays: 75.04 nsec (50 runs at size 1000000)\n",
    "GFloat vectorized, JAX JIT : 3.18 nsec (1000 runs at size 1000000)\n",
    "ML_dtypes : 3.13 nsec (1000 runs at size 1000000)\n",
    "```\n",
    "So the JAX JIT code is ~1000x faster than the scalar code, and comparable to `ml_dtypes`'s C++ CPU implementation."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
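Outside the notebook, the benchmark distils to a short recipe. Below is a minimal sketch using only the calls that appear in the notebook above (`gfloat.round_ndarray`, `format_info_ocp_e5m2`, and `jax.jit` with `np=jnp`); the array contents are illustrative.

```python
import numpy as np
import jax
import jax.numpy as jnp
import gfloat
from gfloat.formats import format_info_ocp_e5m2

jax.config.update("jax_enable_x64", True)  # as in the notebook

a = np.random.rand(1000)

# Pure NumPy: round each element to the nearest OCP E5M2 representable value.
rounded_np = gfloat.round_ndarray(format_info_ocp_e5m2, a)

# The same call staged through JAX's JIT by passing np=jnp.
round_jit = jax.jit(lambda x: gfloat.round_ndarray(format_info_ocp_e5m2, x, np=jnp))
rounded_jax = round_jit(jnp.array(a))
```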

docs/source/api.rst

Lines changed: 8 additions & 1 deletion
@@ -8,9 +8,16 @@ API
 Scalar Functions
 ----------------
 
-.. autofunction:: decode_float
 .. autofunction:: round_float
 .. autofunction:: encode_float
+.. autofunction:: decode_float
+
+Array Functions
+---------------
+
+.. autofunction:: round_ndarray
+.. autofunction:: encode_ndarray
+.. autofunction:: decode_ndarray
 
 Block format functions
 ----------------------
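Each scalar function listed here now has an array counterpart taking the same leading `FormatInfo` argument. A minimal sketch of the two call styles (input values are illustrative):

```python
import numpy as np
import gfloat
from gfloat.formats import format_info_ocp_e5m2

# Scalar reference implementation: one value at a time.
y = gfloat.round_float(format_info_ocp_e5m2, 0.1234)

# Vectorized counterpart added in this PR: a whole array at once.
ys = gfloat.round_ndarray(format_info_ocp_e5m2, np.array([0.1234, 1.5, 3.0]))
```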

docs/source/conf.py

Lines changed: 3 additions & 0 deletions
@@ -52,3 +52,6 @@
 
 # -- Options for EPUB output
 epub_show_urls = "footnote"
+
+# -- Options for myst_nb
+nb_execution_mode = "off"

docs/source/index.rst

Lines changed: 17 additions & 6 deletions
@@ -17,21 +17,32 @@ of:
 * Precision (p)
 * Maximum exponent (emax)
 
-with additional fields defining the encoding of infinities, Not-a-number (NaN) values,
-and negative zero, among others (see :class:`gfloat.FormatInfo`.)
+with additional fields defining the presence/encoding of:
+
+* Infinities
+* Not-a-number (NaN) values
+* Negative zero
+* Subnormal numbers
+* Signed/unsigned
+* Two's complement encoding (of the significand)
 
 This allows an implementation of generic floating point encode/decode logic,
 handling various current and proposed floating point types:
 
 - `IEEE 754 <https://en.wikipedia.org/wiki/IEEE_754>`_: Binary16, Binary32
-- `OCP Float8 <https://www.opencompute.org/documents/ocp-8-bit-floating-point-specification-ofp8-revision-1-0-2023-06-20-pdf>`_: E5M2, E4M3, and MX formats
+- `Brain floating point <https://en.wikipedia.org/wiki/Bfloat16_floating-point_format>`_: BFloat16
+- `OCP Float8 <https://www.opencompute.org/documents/ocp-8-bit-floating-point-specification-ofp8-revision-1-0-2023-06-20-pdf>`_: E5M2, E4M3
 - `IEEE WG P3109 <https://github.com/awf/P3109-Public/blob/main/Shared%20Reports/P3109%20WG%20Interim%20report.pdf>`_: P{p} for p in 1..7
+- Types from the `OCP MX <https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf>`_ spec: E8M0, INT8, and FP4, FP6 types
+
 
-The library favours readability and extensibility over speed - for fast
-implementations of these datatypes see, for example,
+GFloat, being a pure Python library, favours readability and extensibility over speed
+(although the `*_ndarray` functions are reasonably fast for large arrays).
+For fast implementations of these datatypes see, for example,
 `ml_dtypes <https://github.com/jax-ml/ml_dtypes>`_,
 `bitstring <https://github.com/scott-griffiths/bitstring>`_,
-`MX PyTorch Emulation Library <https://github.com/microsoft/microxcaling>`_.
+`MX PyTorch Emulation Library <https://github.com/microsoft/microxcaling>`_,
+`APyTypes <https://apytypes.github.io/apytypes>`_.
 
 To get started with the library, we recommend perusing the notebooks,
 otherwise you may wish to jump straight into the API.
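The new bullet list corresponds to fields on :class:`gfloat.FormatInfo`. A minimal sketch inspecting them on one format; the attribute names are taken from `decode_ndarray.py` in this commit, and the grouping comments are only descriptive:

```python
from gfloat.formats import format_info_ocp_e5m2 as fi

print(fi.k, fi.precision)                   # total bit width and precision p
print(fi.is_signed, fi.is_twos_complement)  # sign bit / two's complement significand
print(fi.has_infs, fi.num_nans, fi.has_nz)  # infinities, NaN count, negative zero
print(fi.has_subnormals, fi.expBias)        # subnormal support, exponent bias
```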

docs/source/notebooks.rst

Lines changed: 1 addition & 0 deletions
@@ -11,3 +11,4 @@ Some notebooks to illustrate uses of the library
 01-decode.ipynb
 02-value-stats.ipynb
 03-value-tables.ipynb
+04-benchmark.ipynb

requirements-dev.txt

Lines changed: 2 additions & 0 deletions
@@ -19,3 +19,5 @@ myst_nb
 # Requirements for notebooks
 airium
 pandas
+jaxlib
+jax

src/gfloat/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -10,6 +10,8 @@
 from .decode import decode_float
 from .printing import float_pow2str, float_tilde_unless_roundtrip_str
 from .round import encode_float, round_float
+from .round_ndarray import encode_ndarray, round_ndarray
+from .decode_ndarray import decode_ndarray
 from .types import FloatClass, FloatValue, FormatInfo, RoundMode
 
 # Don't automatically import from .formats.

src/gfloat/decode_ndarray.py

Lines changed: 78 additions & 0 deletions
@@ -0,0 +1,78 @@
# Copyright (c) 2024 Graphcore Ltd. All rights reserved.

from types import ModuleType
import numpy as np
from .types import FormatInfo


def decode_ndarray(
    fi: FormatInfo, codes: np.ndarray, np: ModuleType = np
) -> np.ndarray:
    r"""
    Vectorized version of :meth:`decode_float`

    Args:
      fi (FormatInfo): Floating point format descriptor.
      codes (array of int): Integer code points, in the range :math:`0 \le i < 2^{k}`,
        where :math:`k` = ``fi.k``

    Returns:
      Decoded float values

    Raises:
      ValueError:
        If any :paramref:`codes` is outside the range of valid code points in :paramref:`fi`.
    """
    assert np.issubdtype(codes.dtype, np.integer)

    k = fi.k
    p = fi.precision
    t = p - 1  # Trailing significand field width
    num_signbits = 1 if fi.is_signed else 0
    w = k - t - num_signbits  # Exponent field width

    if np.any(codes < 0) or np.any(codes >= 2**k):
        raise ValueError(f"Code point not in range [0, 2**{k})")

    if fi.is_signed:
        signmask = 1 << (k - 1)
        sign = np.where(codes & signmask, -1.0, 1.0)
    else:
        signmask = None
        sign = 1.0

    exp = ((codes >> t) & ((1 << w) - 1)).astype(np.int64)
    significand = codes & ((1 << t) - 1)
    if fi.is_twos_complement:
        significand = np.where(sign < 0, (1 << t) - significand, significand)

    expBias = fi.expBias

    iszero = (exp == 0) & (significand == 0) & fi.has_zero
    issubnormal = (exp == 0) & (significand != 0) & fi.has_subnormals
    isnormal = ~iszero & ~issubnormal
    expval = np.where(~isnormal, 1 - expBias, exp - expBias)
    fsignificand = np.where(~isnormal, significand * 2**-t, 1.0 + significand * 2**-t)

    # Normal/Subnormal/Zero case, other values will be overwritten
    fval = np.where(iszero, 0.0, sign * fsignificand * 2.0**expval)

    if fi.has_infs:
        fval = np.where(codes == fi.code_of_posinf, np.inf, fval)
        fval = np.where(codes == fi.code_of_neginf, -np.inf, fval)

    if fi.num_nans > 0:
        code_is_nan = codes == fi.code_of_nan
        if w > 0:
            # All-bits-special exponent (ABSE)
            abse = exp == 2**w - 1
            min_code_with_nan = 2 ** (p - 1) - fi.num_high_nans
            code_is_nan |= abse & (significand >= min_code_with_nan)

        fval = np.where(code_is_nan, np.nan, fval)

    # Negative zero
    if fi.has_nz:
        fval = np.where(iszero & (sign < 0), -0.0, fval)

    return fval
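A short usage sketch of the new function, decoding every 8-bit code point of the OCP E5M2 format (the same format object used in the benchmark notebook); the printed quantities are purely illustrative:

```python
import numpy as np
from gfloat import decode_ndarray
from gfloat.formats import format_info_ocp_e5m2

# All 256 code points of an 8-bit format; the dtype must be an integer type.
codes = np.arange(256, dtype=np.uint8)
values = decode_ndarray(format_info_ocp_e5m2, codes)

print(values[:3])                # 0.0 followed by the two smallest positive subnormals
print(np.sum(np.isnan(values)))  # how many code points decode to NaN
```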
