Skip to content

Commit c9576df

Browse files
creavintbennun
and authored
Add dace::float32sr type to DaCe (#2148)
This new type enables DaCe users to perform calculations with stochastic rounding in single precision. This change is validated with unit tests. --------- Co-authored-by: Tal Ben-Nun <[email protected]>
1 parent 846d791 commit c9576df

File tree

11 files changed

+938
-11
lines changed

11 files changed

+938
-11
lines changed

dace/dtypes.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -817,6 +817,43 @@ def to_python(self, obj_id: int):
817817
return ctypes.cast(obj_id, ctypes.py_object).value
818818

819819

820+
class Float32sr(typeclass):
821+
822+
def __init__(self):
823+
self.type = numpy.float32
824+
self.bytes = 4
825+
self.dtype = self
826+
self.typename = "float"
827+
self.stochastically_rounded = True
828+
829+
def to_json(self):
830+
return 'float32sr'
831+
832+
@staticmethod
833+
def from_json(json_obj, context=None):
834+
from dace.symbolic import pystr_to_symbolic # must be included!
835+
return float32sr()
836+
837+
@property
838+
def ctype(self):
839+
return "dace::float32sr"
840+
841+
@property
842+
def ctype_unaligned(self):
843+
return self.ctype
844+
845+
def as_ctypes(self):
846+
""" Returns the ctypes version of the typeclass. """
847+
return _FFI_CTYPES[self.type]
848+
849+
def as_numpy_dtype(self):
850+
return numpy.dtype(self.type)
851+
852+
@property
853+
def base_type(self):
854+
return self
855+
856+
820857
class compiletime:
821858
"""
822859
Data descriptor type hint signalling that argument evaluation is
@@ -1175,6 +1212,7 @@ def isconstant(var):
11751212
complex128 = typeclass(numpy.complex128)
11761213
string = stringtype()
11771214
MPI_Request = opaque('MPI_Request')
1215+
float32sr = Float32sr()
11781216

11791217

11801218
@undefined_safe_enum

dace/frontend/python/replacements/operators.py

Lines changed: 41 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,36 @@ def _is_op_boolean(op: str):
113113
return False
114114

115115

116+
def _handle_casting_for_stochastically_rounded_types(input_datatypes, restype, cast_types):
117+
float_to_sr = {
118+
dace.float32: dace.float32sr,
119+
}
120+
121+
for i, dtype in enumerate(input_datatypes):
122+
if hasattr(dtype, "stochastically_rounded"):
123+
if cast_types[i] and dtype.type == eval(cast_types[i]).type:
124+
cast_types[i] = None
125+
126+
# check if stoc rounded inputs
127+
stochastically_rounded = True
128+
for i, dtype in enumerate(input_datatypes):
129+
if not hasattr(dtype, "stochastically_rounded"):
130+
stochastically_rounded = False
131+
break
132+
133+
if stochastically_rounded:
134+
# make the result SR
135+
if restype in float_to_sr:
136+
restype = float_to_sr[restype]
137+
138+
# cast the intermediate types
139+
for i, dtype in enumerate(cast_types):
140+
if dtype in float_to_sr:
141+
cast_types[i] = float_to_sr[dtype]
142+
143+
return restype
144+
145+
116146
def result_type(arguments: Sequence[Union[str, Number, symbolic.symbol, sp.Basic]],
117147
operator: str = None) -> Tuple[Union[List[dtypes.typeclass], dtypes.typeclass, str], ...]:
118148

@@ -144,12 +174,16 @@ def result_type(arguments: Sequence[Union[str, Number, symbolic.symbol, sp.Basic
144174
raise TypeError("Type {t} of argument {a} is not supported".format(t=type(arg), a=arg))
145175

146176
complex_types = {dtypes.complex64, dtypes.complex128, np.complex64, np.complex128}
147-
float_types = {dtypes.float16, dtypes.float32, dtypes.float64, np.float16, np.float32, np.float64}
177+
float_types = {dace.float16, dace.float32, dace.float32sr, dace.float64, np.float16, np.float32, np.float64}
148178
signed_types = {dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64, np.int8, np.int16, np.int32, np.int64}
149179
# unsigned_types = {np.uint8, np.uint16, np.uint32, np.uint64}
150180

151181
coarse_types = []
152-
for dtype in datatypes:
182+
for dt in datatypes:
183+
dtype = dt
184+
if hasattr(dt, "srtype"): # unwrap stochastically rounded vars
185+
dtype = dt.srtype
186+
153187
if dtype in complex_types:
154188
coarse_types.append(3) # complex
155189
elif dtype in float_types:
@@ -336,18 +370,20 @@ def result_type(arguments: Sequence[Union[str, Number, symbolic.symbol, sp.Basic
336370
else: # Operators with 3 or more arguments
337371
restype = np_result_type(dtypes_for_result)
338372
coarse_result_type = None
339-
if result_type in complex_types:
373+
if restype in complex_types:
340374
coarse_result_type = 3 # complex
341-
elif result_type in float_types:
375+
elif restype in float_types:
342376
coarse_result_type = 2 # float
343-
elif result_type in signed_types:
377+
elif restype in signed_types:
344378
coarse_result_type = 1 # signed integer, bool
345379
else:
346380
coarse_result_type = 0 # unsigned integer
347381
for i, t in enumerate(coarse_types):
348382
if t != coarse_result_type:
349383
casting[i] = cast_str(restype)
350384

385+
restype = _handle_casting_for_stochastically_rounded_types(datatypes, restype, casting)
386+
351387
return restype, casting
352388

353389

dace/libraries/blas/nodes/dot.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ def expansion(node, parent_state, parent_sdfg, n=None, **kwargs):
7070
(desc_x, stride_x), (desc_y, stride_y), desc_res, sz = node.validate(parent_sdfg, parent_state)
7171
dtype = desc_x.dtype.base_type
7272
veclen = desc_x.dtype.veclen
73+
cast = "(float *)" if dtype == dace.float32sr else ""
7374

7475
try:
7576
func, _, _ = blas_helpers.cublas_type_metadata(dtype)
@@ -82,7 +83,8 @@ def expansion(node, parent_state, parent_sdfg, n=None, **kwargs):
8283
n = n or node.n or sz
8384
if veclen != 1:
8485
n /= veclen
85-
code = f"_result = cblas_{func}({n}, _x, {stride_x}, _y, {stride_y});"
86+
87+
code = f"_result = cblas_{func}({n}, {cast} _x, {stride_x}, {cast} _y, {stride_y});"
8688
# The return type is scalar in cblas_?dot signature
8789
tasklet = dace.sdfg.nodes.Tasklet(node.name,
8890
node.in_connectors, {'_result': dtype},
@@ -204,7 +206,16 @@ def validate(self, sdfg, state):
204206
if desc_x.dtype != desc_y.dtype:
205207
raise TypeError(f"Data types of input operands must be equal: {desc_x.dtype}, {desc_y.dtype}")
206208
if desc_x.dtype.base_type != desc_res.dtype.base_type:
207-
raise TypeError(f"Data types of input and output must be equal: {desc_x.dtype}, {desc_res.dtype}")
209+
arg_types = (desc_x.dtype.base_type, desc_res.dtype.base_type)
210+
if dace.float32 in arg_types and dace.float32sr in arg_types:
211+
"""
212+
When using stochastic rounding, a legitimate (i.e. not a bug) mismatch between the input and output
213+
arguments may arise where one argument is a float32sr and the other is a float32 (round-to-nearest).
214+
The underlying data type is the same so this should not cause the validation to fail.
215+
"""
216+
pass
217+
else:
218+
raise TypeError(f"Data types of input and output must be equal: {desc_x.dtype}, {desc_res.dtype}")
208219

209220
# Squeeze input memlets
210221
squeezed1 = copy.deepcopy(in_memlets[0].subset)

dace/libraries/blas/nodes/gemm.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,7 @@ def expansion(node, state, sdfg):
163163
node.validate(sdfg, state)
164164
(_, adesc, _, _, _, _), (_, bdesc, _, _, _, _), _ = _get_matmul_operands(node, state, sdfg)
165165
dtype = adesc.dtype.base_type
166+
166167
func = to_blastype(dtype.type).lower() + 'gemm'
167168
alpha = f'{dtype.ctype}({node.alpha})'
168169
beta = f'{dtype.ctype}({node.beta})'
@@ -178,6 +179,7 @@ def expansion(node, state, sdfg):
178179
check_access(dtypes.ScheduleType.CPU_Multicore, adesc, bdesc, cdesc)
179180

180181
opt = _get_codegen_gemm_opts(node, state, sdfg, adesc, bdesc, cdesc, alpha, beta, dtype.ctype, func)
182+
opt['cast'] = "(float *)" if dtype == dace.float32sr else ""
181183

182184
# Adaptations for BLAS API
183185
opt['ta'] = 'CblasNoTrans' if opt['ta'] == 'N' else 'CblasTrans'
@@ -193,7 +195,7 @@ def expansion(node, state, sdfg):
193195
opt['beta'] = '&__beta'
194196

195197
code += ("cblas_{func}(CblasColMajor, {ta}, {tb}, "
196-
"{M}, {N}, {K}, {alpha}, {x}, {lda}, {y}, {ldb}, {beta}, "
198+
"{M}, {N}, {K}, {alpha},{cast} {x}, {lda}, {cast} {y}, {ldb}, {beta}, "
197199
"_c, {ldc});").format_map(opt)
198200

199201
tasklet = dace.sdfg.nodes.Tasklet(

dace/libraries/blas/nodes/gemv.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -232,6 +232,8 @@ def expansion(node: 'Gemv', state, sdfg, m=None, n=None, **kwargs):
232232
name_out="_y")
233233
dtype_a = outer_array_a.dtype.type
234234
dtype = outer_array_x.dtype.base_type
235+
cast = "(float *)" if dtype == dace.float32sr else ""
236+
235237
veclen = outer_array_x.dtype.veclen
236238
alpha = f'{dtype.ctype}({node.alpha})'
237239
beta = f'{dtype.ctype}({node.beta})'
@@ -280,7 +282,7 @@ def expansion(node: 'Gemv', state, sdfg, m=None, n=None, **kwargs):
280282
alpha = '&__alpha'
281283
beta = '&__beta'
282284

283-
code += f"""cblas_{func}({layout}, {trans}, {m}, {n}, {alpha}, _A, {lda},
285+
code += f"""cblas_{func}({layout}, {trans}, {m}, {n}, {alpha}, {cast} _A, {lda},
284286
_x, {strides_x[0]}, {beta}, _y, {strides_y[0]});"""
285287

286288
tasklet = dace.sdfg.nodes.Tasklet(node.name,

dace/libraries/lapack/nodes/potrf.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,13 +32,14 @@ class ExpandPotrfOpenBLAS(ExpandTransformation):
3232
def expansion(node, parent_state, parent_sdfg, n=None, **kwargs):
3333
(desc_x, stride_x, rows_x, cols_x), desc_result = node.validate(parent_sdfg, parent_state)
3434
dtype = desc_x.dtype.base_type
35+
cast = "(float *)" if dtype == dace.float32sr else ""
3536
lapack_dtype = blas_helpers.to_blastype(dtype.type).lower()
3637
if desc_x.dtype.veclen > 1:
3738
raise (NotImplementedError)
3839

3940
n = n or node.n
4041
uplo = "'L'" if node.lower else "'U'"
41-
code = f"_res = LAPACKE_{lapack_dtype}potrf(LAPACK_ROW_MAJOR, {uplo}, {rows_x}, _xin, {stride_x});"
42+
code = f"_res = LAPACKE_{lapack_dtype}potrf(LAPACK_ROW_MAJOR, {uplo}, {rows_x}, {cast} _xin, {stride_x});"
4243
tasklet = dace.sdfg.nodes.Tasklet(node.name,
4344
node.in_connectors,
4445
node.out_connectors,

dace/libraries/linalg/nodes/cholesky.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ def _make_sdfg(node, parent_state, parent_sdfg, implementation):
1616

1717
inp_desc, inp_shape, out_desc, out_shape = node.validate(parent_sdfg, parent_state)
1818
dtype = inp_desc.dtype
19+
cast = "(float *)" if dtype == dace.float32sr else ""
1920
storage = inp_desc.storage
2021

2122
sdfg = dace.SDFG("{l}_sdfg".format(l=node.label))
@@ -36,7 +37,7 @@ def _make_sdfg(node, parent_state, parent_sdfg, implementation):
3637
_, me, mx = state.add_mapped_tasklet('_uzero_',
3738
dict(__i="0:%s" % out_shape[0], __j="0:%s" % out_shape[1]),
3839
dict(_inp=Memlet.simple('_b', '__i, __j')),
39-
'_out = (__i < __j) ? 0 : _inp;',
40+
f'_out = (__i < __j) ? {cast}(0) : _inp;',
4041
dict(_out=Memlet.simple('_b', '__i, __j')),
4142
language=dace.dtypes.Language.CPP,
4243
external_edges=True)

dace/runtime/include/dace/dace.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#include "perf/reporting.h"
2525
#include "comm.h"
2626
#include "serialization.h"
27+
#include "stocastic_rounding.h"
2728

2829
#if defined(__CUDACC__) || defined(__HIPCC__)
2930
#include "cuda/cudacommon.cuh"

dace/runtime/include/dace/reduction.h

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
#define __DACE_REDUCTION_H
44

55
#include <cstdint>
6+
#include <dace/stocastic_rounding.h>
67

78
#include "types.h"
89
#include "vector.h"
@@ -121,6 +122,40 @@ namespace dace {
121122
}
122123
};
123124

125+
template <>
126+
struct wcr_custom<dace::float32sr> {
127+
template <typename WCR>
128+
static DACE_HDFI dace::float32sr reduce_atomic(WCR wcr, dace::float32sr *ptr, const dace::float32sr& value) {
129+
#ifdef DACE_USE_GPU_ATOMICS
130+
// Stochastic rounding version of atomic float reduction
131+
int *iptr = reinterpret_cast<int *>(ptr);
132+
int old = *iptr, assumed;
133+
do {
134+
assumed = old;
135+
float old_val = __int_as_float(assumed);
136+
float new_val = static_cast<float>(wcr(static_cast<dace::float32sr>(old_val), value));
137+
old = atomicCAS(iptr, assumed, __float_as_int(new_val));
138+
} while (assumed != old);
139+
return static_cast<dace::float32sr>(__int_as_float(old));
140+
#else
141+
dace::float32sr old;
142+
#pragma omp critical
143+
{
144+
old = *ptr;
145+
*ptr = wcr(old, value);
146+
}
147+
return old;
148+
#endif
149+
}
150+
151+
template <typename WCR>
152+
static DACE_HDFI dace::float32sr reduce(WCR wcr, dace::float32sr *ptr, const dace::float32sr& value) {
153+
dace::float32sr old = *ptr;
154+
*ptr = wcr(old, value);
155+
return old;
156+
}
157+
};
158+
124159
template <>
125160
struct wcr_custom<double> {
126161
template <typename WCR>
@@ -313,6 +348,31 @@ namespace dace {
313348
DACE_HDFI float operator()(const float &a, const float &b) const { return ::max(a, b); }
314349
};
315350

351+
352+
template <>
353+
struct _wcr_fixed<ReductionType::Min, dace::float32sr> {
354+
355+
static DACE_HDFI dace::float32sr reduce_atomic(dace::float32sr *ptr, const dace::float32sr& value) {
356+
return wcr_custom<dace::float32sr>::reduce_atomic(
357+
_wcr_fixed<ReductionType::Min, dace::float32sr>(), ptr, value);
358+
}
359+
360+
361+
DACE_HDFI dace::float32sr operator()(const dace::float32sr &a, const dace::float32sr &b) const { return ::min(a, b); }
362+
};
363+
364+
template <>
365+
struct _wcr_fixed<ReductionType::Max, dace::float32sr> {
366+
367+
static DACE_HDFI dace::float32sr reduce_atomic(dace::float32sr *ptr, const dace::float32sr& value) {
368+
return wcr_custom<dace::float32sr>::reduce_atomic(
369+
_wcr_fixed<ReductionType::Max, dace::float32sr>(), ptr, value);
370+
}
371+
372+
DACE_HDFI dace::float32sr operator()(const dace::float32sr &a, const dace::float32sr &b) const { return ::max(a, b); }
373+
};
374+
375+
316376
template <>
317377
struct _wcr_fixed<ReductionType::Min, double> {
318378

0 commit comments

Comments
 (0)