Skip to content

Commit a93a83a

Browse files
fix: consistency with NNlib (#1328)
* fix: conditionals in softmax
* fix: default to HIGH precision
* fix: revert high precision
* docs: add a dedicated FAQs section in the docs
* feat: add a convolution precision scopedvalue
* Update test/config.jl

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* fix: config
* chore: bump version for release

---------

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
1 parent fbaaadd commit a93a83a

File tree

12 files changed

+168
-154
lines changed

12 files changed

+168
-154
lines changed

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "Reactant"
22
uuid = "3c362404-f566-11ee-1572-e11a4b42c853"
33
authors = ["William Moses <[email protected]>", "Valentin Churavy <[email protected]>", "Sergio Sánchez Ramírez <[email protected]>", "Paul Berg <[email protected]>", "Avik Pal <[email protected]>", "Mosè Giordano <[email protected]>"]
4-
version = "0.2.116"
4+
version = "0.2.117"
55

66
[deps]
77
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"

docs/src/.vitepress/config.mts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ export default defineConfig({
7676
items: [
7777
{ text: "Introduction", link: "/introduction" },
7878
{ text: "Configuration", link: "/introduction/configuration" },
79+
{ text: "FAQs", link: "/introduction/FAQs" },
7980
],
8081
},
8182
{ text: "Benchmarks", link: "https://enzymead.github.io/Reactant.jl/benchmarks/" },
@@ -140,6 +141,7 @@ export default defineConfig({
140141
items: [
141142
{ text: "Introduction", link: "/introduction" },
142143
{ text: "Configuration", link: "/introduction/configuration" },
144+
{ text: "FAQs", link: "/introduction/FAQs" },
143145
],
144146
}
145147
],

docs/src/api/config.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ Reactant.with_config
2929

3030
```@docs
3131
Reactant.DotGeneralAlgorithmPreset
32-
Reactant.DotGeneralPrecision
32+
Reactant.PrecisionConfig
3333
Reactant.DotGeneralAlgorithm
3434
```
3535

docs/src/introduction/FAQs.md

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
# FAQs
2+
3+
## XLA auto-tuner: Results do not match the reference. This is likely a bug/unexpected loss of precision
4+
5+
If you see this error with the CUDA backend, use a scoped value to increase the precision
6+
of the dot-general algorithm.
7+
8+
```julia
9+
Reactant.with_config(; dot_general_precision=PrecisionConfig.HIGH) do
10+
@compile ...
11+
end
12+
```
13+
14+
For more information, see [this XLA issue](https://github.com/openxla/xla/issues/23934).
15+
16+
## Emptying the cache to avoid OOM issues
17+
18+
When you encounter OOM (Out of Memory) errors, you can try to clear the cache by using
19+
Julia's built-in `GC.gc()` between memory-intensive operations.
20+
21+
!!! note
22+
This will only free memory which is not currently live. If the result of compiled
23+
of a compiled function was stored in a vector, it will still be alive and `GC.gc()` won't free it.
24+
25+
```julia
26+
using Reactant
27+
n = 500_000_000
28+
input1 = Reactant.ConcreteRArray(ones(n))
29+
input2 = Reactant.ConcreteRArray(ones(n))
30+
31+
function sin_add(x, y)
32+
return sin.(x) .+ y
33+
end
34+
35+
f = @compile sin_add(input1,input2)
36+
37+
for i = 1:10
38+
GC.gc()
39+
@info "gc... $i"
40+
f(input1, input2) # May cause OOM here for a 24GB GPU if GC is not used
41+
end
42+
```
43+
44+
If you **don't** use `GC.gc()` here, this may cause an OOM:
45+
46+
```bash
47+
[ Info: gc... 1
48+
[ Info: gc... 2
49+
[ Info: gc... 3
50+
...
51+
E0105 09:48:28.755177 110350 pjrt_stream_executor_client.cc:3088] Execution of replica 0 failed: RESOURCE_EXHAUSTED: Out of memory while trying to allocate 4000000000 bytes.
52+
ERROR: RESOURCE_EXHAUSTED: Out of memory while trying to allocate 4000000000 bytes.
53+
54+
Stacktrace:
55+
[1] reactant_err(msg::Cstring)
56+
@ Reactant.XLA ~/.julia/packages/Reactant/7m11i/src/XLA.jl:104
57+
[2] macro expansion
58+
@ ~/.julia/packages/Reactant/7m11i/src/XLA.jl:357 [inlined]
59+
[3] ExecutableCall
60+
@ ~/.julia/packages/Reactant/7m11i/src/XLA.jl:334 [inlined]
61+
[4] macro expansion
62+
@ ~/.julia/packages/Reactant/7m11i/src/Compiler.jl:798 [inlined]
63+
[5] (::Reactant.Compiler.Thunk{…})(::ConcreteRArray{…}, ::ConcreteRArray{…})
64+
@ Reactant.Compiler ~/.julia/packages/Reactant/7m11i/src/Compiler.jl:909
65+
[6] top-level scope
66+
@ ./REPL[7]:4
67+
Some type information was truncated. Use `show(err)` to see complete types.
68+
```
69+
70+
After using Julia's built-in `GC.gc()`:
71+
72+
```bash
73+
[ Info: gc... 1
74+
[ Info: gc... 2
75+
[ Info: gc... 3
76+
[ Info: gc... 4
77+
[ Info: gc... 5
78+
[ Info: gc... 6
79+
[ Info: gc... 7
80+
[ Info: gc... 8
81+
[ Info: gc... 9
82+
[ Info: gc... 10
83+
```

docs/src/introduction/index.md

Lines changed: 0 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -53,83 +53,3 @@ f = @compile sinsum_add(input1,input2)
5353
# one can now run the program
5454
f(input1, input2)
5555
```
56-
57-
58-
## Tips
59-
60-
### Empty Cache
61-
62-
When you encounter OOM (Out of Memory) errors, you can try to clear the cache by using Julia's builtin `GC.gc()` between memory-intensive operations.
63-
64-
!!! note
65-
This will only free memory which is not currently live. If the result of compiled function was stored in a vector, it will still be alive and `GC.gc()` won't free it.
66-
67-
```julia
68-
using Reactant
69-
n = 500_000_000
70-
input1 = Reactant.ConcreteRArray(ones(n))
71-
input2 = Reactant.ConcreteRArray(ones(n))
72-
73-
function sin_add(x, y)
74-
return sin.(x) .+ y
75-
end
76-
77-
f = @compile sin_add(input1,input2)
78-
79-
for i = 1:10
80-
GC.gc()
81-
@info "gc... $i"
82-
f(input1, input2) # May cause OOM here for a 24GB GPU if GC is not used
83-
end
84-
```
85-
86-
If you **don't** use `GC.gc()` here, this may cause an OOM:
87-
88-
89-
90-
```bash
91-
[ Info: gc... 1
92-
[ Info: gc... 2
93-
[ Info: gc... 3
94-
...
95-
E0105 09:48:28.755177 110350 pjrt_stream_executor_client.cc:3088] Execution of replica 0 failed: RESOURCE_EXHAUSTED: Out of memory while trying to allocate 4000000000 bytes.
96-
ERROR: RESOURCE_EXHAUSTED: Out of memory while trying to allocate 4000000000 bytes.
97-
98-
Stacktrace:
99-
[1] reactant_err(msg::Cstring)
100-
@ Reactant.XLA ~/.julia/packages/Reactant/7m11i/src/XLA.jl:104
101-
[2] macro expansion
102-
@ ~/.julia/packages/Reactant/7m11i/src/XLA.jl:357 [inlined]
103-
[3] ExecutableCall
104-
@ ~/.julia/packages/Reactant/7m11i/src/XLA.jl:334 [inlined]
105-
[4] macro expansion
106-
@ ~/.julia/packages/Reactant/7m11i/src/Compiler.jl:798 [inlined]
107-
[5] (::Reactant.Compiler.Thunk{…})(::ConcreteRArray{…}, ::ConcreteRArray{…})
108-
@ Reactant.Compiler ~/.julia/packages/Reactant/7m11i/src/Compiler.jl:909
109-
[6] top-level scope
110-
@ ./REPL[7]:4
111-
Some type information was truncated. Use `show(err)` to see complete types.
112-
```
113-
114-
115-
After using Julia's built-in `GC.gc()`:
116-
117-
118-
119-
```bash
120-
[ Info: gc... 1
121-
[ Info: gc... 2
122-
[ Info: gc... 3
123-
[ Info: gc... 4
124-
[ Info: gc... 5
125-
[ Info: gc... 6
126-
[ Info: gc... 7
127-
[ Info: gc... 8
128-
[ Info: gc... 9
129-
[ Info: gc... 10
130-
```
131-
132-
133-
134-
135-

ext/ReactantNNlibExt/Implementations.jl

Lines changed: 28 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -7,36 +7,26 @@ for (jlop, hloop) in (
77
end
88

99
function NNlib.softmax!(out::AnyTracedRArray{T,N}, x::AbstractArray; dims=1) where {T,N}
10-
max_ = NNlib.fast_maximum(x; dims)
11-
# XXX: Once reverse mode of if is properly supported, we can make it @trace
12-
# zero_num = TracedUtils.promote_to(TracedRNumber{T}, 0)
13-
# one_num = TracedUtils.promote_to(TracedRNumber{T}, 1)
14-
# @trace if all(isfinite, max_)
15-
@. out = exp(x - max_)
16-
# else
17-
# cond = max_ .== Inf
18-
# true_pred = ifelse.(x .== Inf, one_num, zero_num)
19-
# @. out = ifelse(cond, true_pred, exp(x - max_))
20-
# end
21-
tmp = dims isa Colon ? sum(out) : sum!(max_, out)
22-
out ./= tmp
10+
max_ = maximum(x; dims)
11+
diff = exp.(x .- max_)
12+
@trace if all(isfinite, max_)
13+
@. out = diff
14+
else
15+
@. out = ifelse(isinf(max_), ifelse(isinf(x), T(1), T(0)), diff)
16+
end
17+
out ./= sum(out; dims)
2318
return out
2419
end
2520

2621
function NNlib.logsoftmax!(out::AnyTracedRArray{T}, x::AbstractArray; dims=1) where {T}
27-
max_ = NNlib.fast_maximum(x; dims)
28-
# XXX: Once reverse mode of if is properly supported, we can make it @trace
29-
# inf_num = TracedUtils.promote_to(TracedRNumber{T}, Inf)
30-
# zero_num = TracedUtils.promote_to(TracedRNumber{T}, 0)
31-
# @trace if all(isfinite, max_)
32-
@. out = x - max_
33-
# else
34-
# cond = max_ .== Inf
35-
# true_pred = ifelse.(x .== Inf, zero_num, -inf_num)
36-
# @. out = ifelse(cond, true_pred, x - max_)
37-
# end
38-
@fastmath log_ = log.(sum(exp, out; dims))
39-
out .-= log_
22+
max_ = maximum(x; dims)
23+
diff = x .- max_
24+
@trace if all(isfinite, max_)
25+
@. out = diff
26+
else
27+
@. out = ifelse(isinf(max_), ifelse(isinf(x), T(0), -T(Inf)), diff)
28+
end
29+
out .-= log.(sum(exp, out; dims))
4030
return out
4131
end
4232

@@ -111,6 +101,10 @@ function overloaded_conv!(
111101
rhs_dilation=collect(dilation),
112102
feature_group_count,
113103
batch_group_count=1,
104+
precision_config=MLIR.IR.Attribute([
105+
MLIR.IR.Attribute(Reactant.CONVOLUTION_PRECISION[]),
106+
MLIR.IR.Attribute(Reactant.CONVOLUTION_PRECISION[]),
107+
]),
114108
)
115109
set_mlir_data!(y, Reactant.MLIR.IR.result(conv))
116110
return y
@@ -206,6 +200,10 @@ function overloaded_∇conv_filter!(
206200
rhs_dilation=collect(stride),
207201
feature_group_count,
208202
batch_group_count,
203+
precision_config=MLIR.IR.Attribute([
204+
MLIR.IR.Attribute(Reactant.CONVOLUTION_PRECISION[]),
205+
MLIR.IR.Attribute(Reactant.CONVOLUTION_PRECISION[]),
206+
]),
209207
)
210208
set_mlir_data!(dw, MLIR.IR.result(conv))
211209

@@ -326,6 +324,10 @@ function overloaded_∇conv_data!(
326324
dimension_numbers,
327325
feature_group_count,
328326
batch_group_count=1,
327+
precision_config=MLIR.IR.Attribute([
328+
MLIR.IR.Attribute(Reactant.CONVOLUTION_PRECISION[]),
329+
MLIR.IR.Attribute(Reactant.CONVOLUTION_PRECISION[]),
330+
]),
329331
)
330332
set_mlir_data!(dx, MLIR.IR.result(conv))
331333

src/Configuration.jl

Lines changed: 24 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
using ScopedValues: ScopedValues, ScopedValue
22

33
export with_config
4-
export DotGeneralAlgorithmPreset, DotGeneralPrecision, DotGeneralAlgorithm
4+
export DotGeneralAlgorithmPreset, PrecisionConfig, DotGeneralAlgorithm
55

66
"""
77
with_config(f; kwargs...)
@@ -27,12 +27,15 @@ scope will use the provided values.
2727
[`DotGeneralAlgorithm`](@ref) or [`DotGeneralAlgorithmPreset`](@ref). Defaults to
2828
`DotGeneralAlgorithmPreset.DEFAULT`.
2929
- `dot_general_precision`: Precision for `stablehlo.dot_general`. Can be `nothing`,
30-
or [`DotGeneralPrecision`](@ref). Defaults to `DotGeneralPrecision.DEFAULT`.
30+
or [`PrecisionConfig`](@ref). Defaults to `PrecisionConfig.DEFAULT`.
31+
- `convolution_precision`: Precision for `stablehlo.convolution`. Can be `nothing`,
32+
or [`PrecisionConfig`](@ref). Defaults to `PrecisionConfig.DEFAULT`.
3133
"""
3234
function with_config(
3335
f;
3436
dot_general_algorithm=missing,
3537
dot_general_precision=missing,
38+
convolution_precision=missing,
3639
lower_partialsort_to_approx_top_k=missing,
3740
fallback_approx_top_k_lowering=missing,
3841
)
@@ -41,6 +44,8 @@ function with_config(
4144
(config_vars = (config_vars..., DOT_GENERAL_ALGORITHM => dot_general_algorithm))
4245
dot_general_precision !== missing &&
4346
(config_vars = (config_vars..., DOT_GENERAL_PRECISION => dot_general_precision))
47+
convolution_precision !== missing &&
48+
(config_vars = (config_vars..., CONVOLUTION_PRECISION => convolution_precision))
4449
lower_partialsort_to_approx_top_k !== missing && (
4550
config_vars = (
4651
config_vars...,
@@ -63,7 +68,7 @@ const FALLBACK_APPROX_TOP_K_LOWERING = ScopedValue(true)
6368

6469
# DotGeneral Attributes Configuration
6570
"""
66-
DotGeneralPrecision
71+
PrecisionConfig
6772
6873
Controls the `precision_config` for `stablehlo.dot_general`. Valid values are:
6974
@@ -73,26 +78,34 @@ Controls the `precision_config` for `stablehlo.dot_general`. Valid values are:
7378
7479
The following functions are available:
7580
76-
`MLIR.IR.Attribute(precision::DotGeneralPrecision.T)`
81+
`MLIR.IR.Attribute(precision::PrecisionConfig.T)`
7782
"""
78-
@enumx DotGeneralPrecision begin
83+
@enumx PrecisionConfig begin
7984
DEFAULT
8085
HIGH
8186
HIGHEST
8287
end
8388

89+
Base.@deprecate_binding DotGeneralPrecision PrecisionConfig
90+
8491
const DOT_GENERAL_PRECISION = ScopedValue{
85-
Union{DotGeneralPrecision.T,Nothing,Tuple{DotGeneralPrecision.T,DotGeneralPrecision.T}}
92+
Union{PrecisionConfig.T,Nothing,Tuple{PrecisionConfig.T,PrecisionConfig.T}}
93+
}(
94+
PrecisionConfig.DEFAULT
95+
)
96+
97+
const CONVOLUTION_PRECISION = ScopedValue{
98+
Union{PrecisionConfig.T,Nothing,Tuple{PrecisionConfig.T,PrecisionConfig.T}}
8699
}(
87-
DotGeneralPrecision.DEFAULT
100+
PrecisionConfig.DEFAULT
88101
)
89102

90-
function MLIR.IR.Attribute(precision::DotGeneralPrecision.T)
91-
precision_str = if precision == DotGeneralPrecision.DEFAULT
103+
function MLIR.IR.Attribute(precision::PrecisionConfig.T)
104+
precision_str = if precision == PrecisionConfig.DEFAULT
92105
"DEFAULT"
93-
elseif precision == DotGeneralPrecision.HIGH
106+
elseif precision == PrecisionConfig.HIGH
94107
"HIGH"
95-
elseif precision == DotGeneralPrecision.HIGHEST
108+
elseif precision == PrecisionConfig.HIGHEST
96109
"HIGHEST"
97110
end
98111
return MLIR.IR.Attribute(

src/Overlay.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -165,15 +165,15 @@ end
165165

166166
@reactant_overlay @noinline function Base._all(f, x::AbstractArray{T}, dims) where {T}
167167
if T <: TracedRNumber && T !== Union{}
168-
return TracedRArrayOverrides.overloaded_all(f, x, dims)
168+
return TracedRArrayOverrides.overloaded_mapreduce(f, &, x; dims)
169169
else
170170
return Base.inferencebarrier(Base._all)(f, x, dims)
171171
end
172172
end
173173

174174
@reactant_overlay @noinline function Base.any(f, x::AbstractArray{T}, dims) where {T}
175175
if T <: TracedRNumber && T !== Union{}
176-
return TracedRArrayOverrides.overloaded_any(f, x, dims)
176+
return TracedRArrayOverrides.overloaded_mapreduce(f, |, x; dims)
177177
else
178178
return Base.inferencebarrier(Base.any)(f, x, dims)
179179
end

0 commit comments

Comments
 (0)