Merge pull request numpy#27087 from ngoldbaum/fix-dragon4

rgommers · web-flow · commit bb391b5d9fa0 · 2024-08-01T22:36:17.000+02:00
ENH: mark the dragon4 scratch space as thread-local
diff --git a/numpy/_core/src/multiarray/dragon4.c b/numpy/_core/src/multiarray/dragon4.c
@@ -163,28 +163,7 @@ typedef struct {
     char repr[16384];
 } Dragon4_Scratch;
 
-static int _bigint_static_in_use = 0;
-static Dragon4_Scratch _bigint_static;
-
-static Dragon4_Scratch*
-get_dragon4_bigint_scratch(void) {
-    /* this test+set is not threadsafe, but no matter because we have GIL */
-    if (_bigint_static_in_use) {
-        PyErr_SetString(PyExc_RuntimeError,
-            "numpy float printing code is not re-entrant. "
-            "Ping the devs to fix it.");
-        return NULL;
-    }
-    _bigint_static_in_use = 1;
-
-    /* in this dummy implementation we only return the static allocation */
-    return &_bigint_static;
-}
-
-static void
-free_dragon4_bigint_scratch(Dragon4_Scratch *mem){
-    _bigint_static_in_use = 0;
-}
+static NPY_TLS Dragon4_Scratch _bigint_static;
 
 /* Copy integer */
 static void
@@ -2210,11 +2189,11 @@ Format_floatbits(char *buffer, npy_uint32 bufferSize, BigInt *mantissa,
  */
 static npy_uint32
 Dragon4_PrintFloat_IEEE_binary16(
-        Dragon4_Scratch *scratch, npy_half *value, Dragon4_Options *opt)
+        npy_half *value, Dragon4_Options *opt)
 {
-    char *buffer = scratch->repr;
-    const npy_uint32 bufferSize = sizeof(scratch->repr);
-    BigInt *bigints = scratch->bigints;
+    char *buffer = _bigint_static.repr;
+    const npy_uint32 bufferSize = sizeof(_bigint_static.repr);
+    BigInt *bigints = _bigint_static.bigints;
 
     npy_uint16 val = *value;
     npy_uint32 floatExponent, floatMantissa, floatSign;
@@ -2297,12 +2276,12 @@ Dragon4_PrintFloat_IEEE_binary16(
  */
 static npy_uint32
 Dragon4_PrintFloat_IEEE_binary32(
-        Dragon4_Scratch *scratch, npy_float32 *value,
+        npy_float32 *value,
         Dragon4_Options *opt)
 {
-    char *buffer = scratch->repr;
-    const npy_uint32 bufferSize = sizeof(scratch->repr);
-    BigInt *bigints = scratch->bigints;
+    char *buffer = _bigint_static.repr;
+    const npy_uint32 bufferSize = sizeof(_bigint_static.repr);
+    BigInt *bigints = _bigint_static.bigints;
 
     union
     {
@@ -2390,11 +2369,11 @@ Dragon4_PrintFloat_IEEE_binary32(
  */
 static npy_uint32
 Dragon4_PrintFloat_IEEE_binary64(
-        Dragon4_Scratch *scratch, npy_float64 *value, Dragon4_Options *opt)
+        npy_float64 *value, Dragon4_Options *opt)
 {
-    char *buffer = scratch->repr;
-    const npy_uint32 bufferSize = sizeof(scratch->repr);
-    BigInt *bigints = scratch->bigints;
+    char *buffer = _bigint_static.repr;
+    const npy_uint32 bufferSize = sizeof(_bigint_static.repr);
+    BigInt *bigints = _bigint_static.bigints;
 
     union
     {
@@ -2505,11 +2484,11 @@ typedef struct FloatVal128 {
  */
 static npy_uint32
 Dragon4_PrintFloat_Intel_extended(
-    Dragon4_Scratch *scratch, FloatVal128 value, Dragon4_Options *opt)
+    FloatVal128 value, Dragon4_Options *opt)
 {
-    char *buffer = scratch->repr;
-    const npy_uint32 bufferSize = sizeof(scratch->repr);
-    BigInt *bigints = scratch->bigints;
+    char *buffer = _bigint_static.repr;
+    const npy_uint32 bufferSize = sizeof(_bigint_static.repr);
+    BigInt *bigints = _bigint_static.bigints;
 
     npy_uint32 floatExponent, floatSign;
     npy_uint64 floatMantissa;
@@ -2603,7 +2582,7 @@ Dragon4_PrintFloat_Intel_extended(
  */
 static npy_uint32
 Dragon4_PrintFloat_Intel_extended80(
-    Dragon4_Scratch *scratch, npy_float80 *value, Dragon4_Options *opt)
+    npy_float80 *value, Dragon4_Options *opt)
 {
     FloatVal128 val128;
     union {
@@ -2619,15 +2598,15 @@ Dragon4_PrintFloat_Intel_extended80(
     val128.lo = buf80.integer.a;
     val128.hi = buf80.integer.b;
 
-    return Dragon4_PrintFloat_Intel_extended(scratch, val128, opt);
+    return Dragon4_PrintFloat_Intel_extended(val128, opt);
 }
 #endif /* HAVE_LDOUBLE_INTEL_EXTENDED_10_BYTES_LE */
 
 #ifdef HAVE_LDOUBLE_INTEL_EXTENDED_12_BYTES_LE
 /* Intel's 80-bit IEEE extended precision format, 96-bit storage */
 static npy_uint32
 Dragon4_PrintFloat_Intel_extended96(
-    Dragon4_Scratch *scratch, npy_float96 *value, Dragon4_Options *opt)
+    npy_float96 *value, Dragon4_Options *opt)
 {
     FloatVal128 val128;
     union {
@@ -2643,15 +2622,15 @@ Dragon4_PrintFloat_Intel_extended96(
     val128.lo = buf96.integer.a;
     val128.hi = buf96.integer.b;
 
-    return Dragon4_PrintFloat_Intel_extended(scratch, val128, opt);
+    return Dragon4_PrintFloat_Intel_extended(val128, opt);
 }
 #endif /* HAVE_LDOUBLE_INTEL_EXTENDED_12_BYTES_LE */
 
 #ifdef HAVE_LDOUBLE_MOTOROLA_EXTENDED_12_BYTES_BE
 /* Motorola Big-endian equivalent of the Intel-extended 96 fp format */
 static npy_uint32
 Dragon4_PrintFloat_Motorola_extended96(
-    Dragon4_Scratch *scratch, npy_float96 *value, Dragon4_Options *opt)
+    npy_float96 *value, Dragon4_Options *opt)
 {
     FloatVal128 val128;
     union {
@@ -2668,7 +2647,7 @@ Dragon4_PrintFloat_Motorola_extended96(
     val128.hi = buf96.integer.a >> 16;
     /* once again we assume the int has same endianness as the float */
 
-    return Dragon4_PrintFloat_Intel_extended(scratch, val128, opt);
+    return Dragon4_PrintFloat_Intel_extended(val128, opt);
 }
 #endif /* HAVE_LDOUBLE_MOTOROLA_EXTENDED_12_BYTES_BE */
 
@@ -2688,7 +2667,7 @@ typedef union FloatUnion128
 /* Intel's 80-bit IEEE extended precision format, 128-bit storage */
 static npy_uint32
 Dragon4_PrintFloat_Intel_extended128(
-    Dragon4_Scratch *scratch, npy_float128 *value, Dragon4_Options *opt)
+    npy_float128 *value, Dragon4_Options *opt)
 {
     FloatVal128 val128;
     FloatUnion128 buf128;
@@ -2698,7 +2677,7 @@ Dragon4_PrintFloat_Intel_extended128(
     val128.lo = buf128.integer.a;
     val128.hi = buf128.integer.b;
 
-    return Dragon4_PrintFloat_Intel_extended(scratch, val128, opt);
+    return Dragon4_PrintFloat_Intel_extended(val128, opt);
 }
 #endif /* HAVE_LDOUBLE_INTEL_EXTENDED_16_BYTES_LE */
 
@@ -2717,11 +2696,11 @@ Dragon4_PrintFloat_Intel_extended128(
  */
 static npy_uint32
 Dragon4_PrintFloat_IEEE_binary128(
-    Dragon4_Scratch *scratch, FloatVal128 val128, Dragon4_Options *opt)
+    FloatVal128 val128, Dragon4_Options *opt)
 {
-    char *buffer = scratch->repr;
-    const npy_uint32 bufferSize = sizeof(scratch->repr);
-    BigInt *bigints = scratch->bigints;
+    char *buffer = _bigint_static.repr;
+    const npy_uint32 bufferSize = sizeof(_bigint_static.repr);
+    BigInt *bigints = _bigint_static.bigints;
 
     npy_uint32 floatExponent, floatSign;
 
@@ -2802,7 +2781,7 @@ Dragon4_PrintFloat_IEEE_binary128(
 #if defined(HAVE_LDOUBLE_IEEE_QUAD_LE)
 static npy_uint32
 Dragon4_PrintFloat_IEEE_binary128_le(
-    Dragon4_Scratch *scratch, npy_float128 *value, Dragon4_Options *opt)
+    npy_float128 *value, Dragon4_Options *opt)
 {
     FloatVal128 val128;
     FloatUnion128 buf128;
@@ -2811,7 +2790,7 @@ Dragon4_PrintFloat_IEEE_binary128_le(
     val128.lo = buf128.integer.a;
     val128.hi = buf128.integer.b;
 
-    return Dragon4_PrintFloat_IEEE_binary128(scratch, val128, opt);
+    return Dragon4_PrintFloat_IEEE_binary128(val128, opt);
 }
 #endif /* HAVE_LDOUBLE_IEEE_QUAD_LE */
 
@@ -2822,7 +2801,7 @@ Dragon4_PrintFloat_IEEE_binary128_le(
  */
 static npy_uint32
 Dragon4_PrintFloat_IEEE_binary128_be(
-    Dragon4_Scratch *scratch, npy_float128 *value, Dragon4_Options *opt)
+    npy_float128 *value, Dragon4_Options *opt)
 {
     FloatVal128 val128;
     FloatUnion128 buf128;
@@ -2831,7 +2810,7 @@ Dragon4_PrintFloat_IEEE_binary128_be(
     val128.lo = buf128.integer.b;
     val128.hi = buf128.integer.a;
 
-    return Dragon4_PrintFloat_IEEE_binary128(scratch, val128, opt);
+    return Dragon4_PrintFloat_IEEE_binary128(val128, opt);
 }
 #endif /* HAVE_LDOUBLE_IEEE_QUAD_BE */
 
@@ -2877,11 +2856,11 @@ Dragon4_PrintFloat_IEEE_binary128_be(
  */
 static npy_uint32
 Dragon4_PrintFloat_IBM_double_double(
-    Dragon4_Scratch *scratch, npy_float128 *value, Dragon4_Options *opt)
+    npy_float128 *value, Dragon4_Options *opt)
 {
-    char *buffer = scratch->repr;
-    const npy_uint32 bufferSize = sizeof(scratch->repr);
-    BigInt *bigints = scratch->bigints;
+    char *buffer = _bigint_static.repr;
+    const npy_uint32 bufferSize = sizeof(_bigint_static.repr);
+    BigInt *bigints = _bigint_static.bigints;
 
     FloatVal128 val128;
     FloatUnion128 buf128;
@@ -3068,16 +3047,10 @@ PyObject *\
 Dragon4_Positional_##Type##_opt(npy_type *val, Dragon4_Options *opt)\
 {\
     PyObject *ret;\
-    Dragon4_Scratch *scratch = get_dragon4_bigint_scratch();\
-    if (scratch == NULL) {\
-        return NULL;\
-    }\
-    if (Dragon4_PrintFloat_##format(scratch, val, opt) < 0) {\
-        free_dragon4_bigint_scratch(scratch);\
+    if (Dragon4_PrintFloat_##format(val, opt) < 0) {\
         return NULL;\
     }\
-    ret = PyUnicode_FromString(scratch->repr);\
-    free_dragon4_bigint_scratch(scratch);\
+    ret = PyUnicode_FromString(_bigint_static.repr);\
     return ret;\
 }\
 \
@@ -3106,16 +3079,10 @@ PyObject *\
 Dragon4_Scientific_##Type##_opt(npy_type *val, Dragon4_Options *opt)\
 {\
     PyObject *ret;\
-    Dragon4_Scratch *scratch = get_dragon4_bigint_scratch();\
-    if (scratch == NULL) {\
-        return NULL;\
-    }\
-    if (Dragon4_PrintFloat_##format(scratch, val, opt) < 0) {\
-        free_dragon4_bigint_scratch(scratch);\
+    if (Dragon4_PrintFloat_##format(val, opt) < 0) {    \
         return NULL;\
     }\
-    ret = PyUnicode_FromString(scratch->repr);\
-    free_dragon4_bigint_scratch(scratch);\
+    ret = PyUnicode_FromString(_bigint_static.repr);\
     return ret;\
 }\
 PyObject *\
diff --git a/numpy/_core/tests/test_arrayprint.py b/numpy/_core/tests/test_arrayprint.py
@@ -9,6 +9,7 @@
     assert_, assert_equal, assert_raises, assert_warns, HAS_REFCOUNT,
     assert_raises_regex, IS_WASM
     )
+from numpy.testing._private.utils import run_threaded
 from numpy._core.arrayprint import _typelessdata
 import textwrap
 
@@ -1249,3 +1250,10 @@ async def main():
     loop = asyncio.new_event_loop()
     asyncio.run(main())
     loop.close()
+
+@pytest.mark.skipif(IS_WASM, reason="wasm doesn't support threads")
+def test_multithreaded_array_printing():
+    # the dragon4 implementation uses a static scratch space for performance
+    # reasons; this test makes sure it is set up in a thread-safe manner
+
+    run_threaded(TestPrintOptions().test_floatmode, 500)
diff --git a/numpy/_core/tests/test_multithreading.py b/numpy/_core/tests/test_multithreading.py
@@ -1,25 +1,15 @@
-import concurrent.futures
 import threading
 
 import numpy as np
 import pytest
 
 from numpy.testing import IS_WASM
+from numpy.testing._private.utils import run_threaded
 
 if IS_WASM:
     pytest.skip(allow_module_level=True, reason="no threading support in wasm")
 
 
-def run_threaded(func, iters, pass_count=False):
-    with concurrent.futures.ThreadPoolExecutor(max_workers=8) as tpe:
-        if pass_count:
-            futures = [tpe.submit(func, i) for i in range(iters)]
-        else:
-            futures = [tpe.submit(func) for _ in range(iters)]
-        for f in futures:
-            f.result()
-
-
 def test_parallel_randomstate_creation():
     # if the coercion cache is enabled and not thread-safe, creating
     # RandomState instances simultaneously leads to a data race
diff --git a/numpy/testing/_private/utils.py b/numpy/testing/_private/utils.py
@@ -17,6 +17,7 @@
 from warnings import WarningMessage
 import pprint
 import sysconfig
+import concurrent.futures
 
 import numpy as np
 from numpy._core import (
@@ -40,7 +41,7 @@
         'HAS_REFCOUNT', "IS_WASM", 'suppress_warnings', 'assert_array_compare',
         'assert_no_gc_cycles', 'break_cycles', 'HAS_LAPACK64', 'IS_PYSTON',
         '_OLD_PROMOTION', 'IS_MUSL', '_SUPPORTS_SVE', 'NOGIL_BUILD',
-        'IS_EDITABLE'
+        'IS_EDITABLE', 'run_threaded',
         ]
 
 
@@ -2697,3 +2698,14 @@ def _get_glibc_version():
 
 _glibcver = _get_glibc_version()
 _glibc_older_than = lambda x: (_glibcver != '0.0' and _glibcver < x)
+
+
+def run_threaded(func, iters, pass_count=False):
+    """Runs a function many times in parallel"""
+    with concurrent.futures.ThreadPoolExecutor(max_workers=8) as tpe:
+        if pass_count:
+            futures = [tpe.submit(func, i) for i in range(iters)]
+        else:
+            futures = [tpe.submit(func) for _ in range(iters)]
+        for f in futures:
+            f.result()