|
2 | 2 | "cells": [
3 | 3 | {
4 | 4 | "cell_type": "code",
5 | | - "execution_count": 1,
| 5 | + "execution_count": 12,
6 | 6 | "metadata": {},
7 | | - "outputs": [],
| 7 | + "outputs": [
| 8 | + {
| 9 | + "name": "stdout",
| 10 | + "output_type": "stream",
| 11 | + "text": [
| 12 | + "The autoreload extension is already loaded. To reload it, use:\n",
| 13 | + " %reload_ext autoreload\n"
| 14 | + ]
| 15 | + }
| 16 | + ],
8 | 17 | "source": [
9 | 18 | "# Copyright (c) 2024 Graphcore Ltd. All rights reserved.\n",
10 | 19 | "\n",

14 | 23 | "import ml_dtypes\n",
15 | 24 | "import gfloat\n",
16 | 25 | "from gfloat.formats import format_info_ocp_e5m2\n",
| 26 | + "from timeit import Timer\n",
| 27 | + "\n",
| 28 | + "jax.config.update(\"jax_enable_x64\", True)\n",
17 | 29 | "\n",
18 | 30 | "%load_ext autoreload\n",
19 | 31 | "%autoreload 2"
|
|
32 | 44 | },
33 | 45 | {
34 | 46 | "cell_type": "code",
35 | | - "execution_count": 2,
| 47 | + "execution_count": 24,
36 | 48 | "metadata": {},
37 | 49 | "outputs": [
38 | | - {
39 | | - "name": "stderr",
40 | | - "output_type": "stream",
41 | | - "text": [
42 | | - "An NVIDIA GPU may be present on this machine, but a CUDA-enabled jaxlib is not installed. Falling back to cpu.\n",
43 | | - "/home/awf/.micromamba/envs/gfloat-clean/lib/python3.10/site-packages/jax/_src/numpy/array_methods.py:68: UserWarning: Explicitly requested dtype <class 'jax.numpy.int64'> requested in astype is not available, and will be truncated to dtype int32. To enable more dtypes, set the jax_enable_x64 configuration option or the JAX_ENABLE_X64 shell environment variable. See https://github.com/google/jax#current-gotchas for more.\n",
44 | | - " return lax_numpy.astype(arr, dtype, copy=copy, device=device)\n"
45 | | - ]
46 | | - },
47 | 50 | {
48 | 51 | "name": "stdout",
49 | 52 | "output_type": "stream",
50 | 53 | "text": [
51 | | - "GFloat scalar :616 ms ± 23.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
52 | | - "GFloat vectorized, numpy arrays:4.49 ms ± 255 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n",
53 | | - "GFloat vectorized, JAX JIT :596 µs ± 13.2 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n",
54 | | - "ML_dtypes :266 µs ± 16.6 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
| 54 | + "GFloat scalar : 6996.75 nsec (50 runs at size 10000)\n",
| 55 | + "GFloat vectorized, numpy arrays: 75.04 nsec (50 runs at size 1000000)\n",
| 56 | + "GFloat vectorized, JAX JIT : 3.18 nsec (1000 runs at size 1000000)\n",
| 57 | + "ML_dtypes : 3.13 nsec (1000 runs at size 1000000)\n"
55 | 58 | ]
56 | 59 | }
57 | 60 | ],
58 | 61 | "source": [
59 | | - "N = 100_000\n",
| 62 | + "N = 1_000_000\n",
60 | 63 | "a = np.random.rand(N)\n",
61 | 64 | "\n",
62 | 65 | "jax_round_jit = jax.jit(lambda x: gfloat.round_ndarray(format_info_ocp_e5m2, x, np=jnp))\n",
|
68 | 71 | " return np.array([gfloat.round_float(fi, x) for x in a])\n",
69 | 72 | "\n",
70 | 73 | "\n",
71 | | - "print(\"GFloat scalar :\", end=\"\")\n",
72 | | - "%timeit slow_round_ndarray(format_info_ocp_e5m2, a)\n",
73 | | - "\n",
74 | | - "print(\"GFloat vectorized, numpy arrays:\", end=\"\")\n",
75 | | - "%timeit gfloat.round_ndarray(format_info_ocp_e5m2, a)\n",
| 74 | + "def time(f, problem_size=1.0):\n",
| 75 | + " units = 1e9 # nsec\n",
| 76 | + " t = Timer(f)\n",
| 77 | + " n = t.autorange()[0] * 10 # About 2 sec per run\n",
| 78 | + " ts = t.repeat(repeat=3, number=n) # best of 3\n",
| 79 | + " ts = [((t / n) / problem_size) * units for t in ts] # per run\n",
| 80 | + " return f\"{min(ts):8.2f} nsec ({n} runs at size {problem_size})\"\n",
76 | 81 | "\n",
77 | | - "print(\"GFloat vectorized, JAX JIT :\", end=\"\")\n",
78 | | - "%timeit jax_round_jit(ja)\n",
79 | 82 | "\n",
80 | | - "print(\"ML_dtypes :\", end=\"\")\n",
81 | | - "%timeit a.astype(ml_dtypes.float8_e5m2)"
| 83 | + "# fmt: off\n",
| 84 | + "print(\"GFloat scalar :\", time(lambda: slow_round_ndarray(format_info_ocp_e5m2, a[: N // 100]), N // 100))\n",
| 85 | + "print(\"GFloat vectorized, numpy arrays:\", time(lambda: gfloat.round_ndarray(format_info_ocp_e5m2, a), N))\n",
| 86 | + "print(\"GFloat vectorized, JAX JIT :\", time(lambda: jax_round_jit(ja), N))\n",
| 87 | + "print(\"ML_dtypes :\", time(lambda: a.astype(ml_dtypes.float8_e5m2), N))"
82 | 88 | ]
83 | 89 | },
84 | 90 | {
|
87 | 93 | "source": [
88 | 94 | "On one CPU platform the timings were:\n",
89 | 95 | "```\n",
90 | | - "GFloat scalar :629 ms ± 22.3 ms \n",
91 | | - "GFloat vectorized, numpy arrays: 4.420 ms ± 153 µs \n",
92 | | - "GFloat vectorized, JAX JIT : 585 µs ± 13.7 µs \n",
93 | | - "ML_dtypes : 253 µs ± 12 µs \n",
| 96 | + "GFloat scalar : 6996.75 nsec (50 runs at size 10000)\n",
| 97 | + "GFloat vectorized, numpy arrays: 75.04 nsec (50 runs at size 1000000)\n",
| 98 | + "GFloat vectorized, JAX JIT : 3.18 nsec (1000 runs at size 1000000)\n",
| 99 | + "ML_dtypes : 3.13 nsec (1000 runs at size 1000000)\n",
94 | 100 | "```\n",
95 | | - "So the JAX JIT code is 1000x faster than the scalar code, although `ml_dtypes`'s C++ is 2-3x faster still."
| 101 | + "So the JAX JIT code is ~2000x faster than the scalar code, and comparable to `ml_dtypes`'s C++ CPU implementation."
96 | 102 | ]
97 | 103 | }
98 | 104 | ],
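For readers who want to reproduce the new timings outside the notebook, here is a minimal standalone sketch of the per-element timing approach this commit introduces: timeit.Timer.autorange() picks a loop count, Timer.repeat() takes the best of three runs, and the elapsed time is normalised to nanoseconds per array element. The helper name time_per_element and the choice of benchmarking only the ml_dtypes cast are illustrative assumptions, not part of the commit.

# Sketch of the per-element timing helper from the diff above (names are illustrative).
import numpy as np
import ml_dtypes
from timeit import Timer

def time_per_element(f, problem_size=1.0):
    # Report the best-of-3 per-element time of f() in nanoseconds.
    units = 1e9                        # seconds -> nanoseconds
    t = Timer(f)
    n = t.autorange()[0] * 10          # scale up the auto-chosen loop count
    ts = t.repeat(repeat=3, number=n)  # three independent timing runs
    ts = [(elapsed / n) / problem_size * units for elapsed in ts]
    return f"{min(ts):8.2f} nsec ({n} runs at size {problem_size})"

N = 1_000_000
a = np.random.rand(N)
print("ML_dtypes:", time_per_element(lambda: a.astype(ml_dtypes.float8_e5m2), N))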