Include Bumper precompilation

lkdvos · lkdvos · commit 716456012523 · 2025-03-13T09:13:53.000-04:00
diff --git a/ext/TensorOperationsBumperExt.jl b/ext/TensorOperationsBumperExt.jl
@@ -1,6 +1,9 @@
 module TensorOperationsBumperExt
 
 using TensorOperations
+using TensorOperations: tensoralloc_add, tensoralloc_contract
+using VectorInterface: One, Zero
+using PrecompileTools
 using Bumper
 
 # Hack to normalize StridedView type to avoid too many specializations
@@ -50,4 +53,97 @@ function TensorOperations._butensor(src, ex...)
     return return Base.remove_linenums!(newex)
 end
 
+if PrecompileTools.workload_enabled(@__MODULE__)  # precompile Bumper-typed signatures
+    buf = typeof(Bumper.default_buffer())  # type of Bumper's default buffer
+    backend = TensorOperations.DefaultBackend  # backend argument type for tensorcontract!
+
+    # tensoradd!
+    # ----------
+    for T in TensorOperations.PRECOMPILE_ELTYPES  # T is the array element type
+        for N in 0:(TensorOperations.PRECOMPILE_ADD_NDIMS)  # array rank, starting at 0
+            TA = Array{T,N}
+            pA = Index2Tuple{N,0}  # all N indices in the first group
+            TA_buf = Core.Compiler.return_type(tensoralloc_add,  # inferred, nothing is run
+                                               Tuple{T,TA,pA,Bool,Val{true},buf})
+            for (C, A) in Iterators.product((TA, TA_buf), (TA, TA_buf))
+                C == A == TA && continue  # skip the pair that has no buffer type
+                precompile(tensoradd!, (C, A, pA, Bool, One, Zero))  # scalars: One, Zero
+                precompile(tensoradd!, (C, A, pA, Bool, T, Zero))  # scalars: T, Zero
+                precompile(tensoradd!, (C, A, pA, Bool, T, T))  # scalars: T, T
+            end
+
+            precompile(tensoralloc_add, (T, TA_buf, pA, Bool, Val{true}, buf))  # Val{true}
+            precompile(tensoralloc_add, (T, TA_buf, pA, Bool, Val{false}, buf))  # Val{false}
+        end
+    end
+
+    # tensortrace!
+    # ------------
+    for T in TensorOperations.PRECOMPILE_ELTYPES
+        for N1 in 0:TensorOperations.PRECOMPILE_TRACE_NDIMS[1],  # N1: rank of the output C
+            N2 in 0:TensorOperations.PRECOMPILE_TRACE_NDIMS[2]  # N2: size of each q group
+
+            TC = Array{T,N1}
+            TA = Array{T,N1 + 2N2}  # input rank is N1 + 2N2
+            p = Index2Tuple{N1,0}
+            q = Index2Tuple{N2,N2}
+            r = Index2Tuple{N1 + 2N2,0}  # covers every index of A; only used to infer TA_buf
+
+            TA_buf = Core.Compiler.return_type(tensoralloc_add,  # inferred, nothing is run
+                                               Tuple{T,TA,r,Bool,Val{true},buf})
+            TC_buf = Core.Compiler.return_type(tensoralloc_add,  # same, with index type p
+                                               Tuple{T,TA,p,Bool,Val{true},buf})
+
+            for (C, A) in Iterators.product((TC, TC_buf), (TA, TA_buf))
+                C == TC && A == TA && continue  # skip the pair that has no buffer type
+                precompile(tensortrace!, (C, A, p, q, Bool, One, Zero))  # scalars: One, Zero
+                precompile(tensortrace!, (C, A, p, q, Bool, T, Zero))  # scalars: T, Zero
+                precompile(tensortrace!, (C, A, p, q, Bool, T, T))  # scalars: T, T
+            end
+
+            # allocation re-uses tensoralloc_add, so no allocation signatures are listed here
+        end
+    end
+
+    # tensorcontract!
+    # ---------------
+    for T in TensorOperations.PRECOMPILE_ELTYPES
+        for N1 in 0:TensorOperations.PRECOMPILE_CONTRACT_NDIMS[1],  # N1: first group of pA
+            N2 in 0:TensorOperations.PRECOMPILE_CONTRACT_NDIMS[2],  # N2: shared by pA and pB
+            N3 in 0:TensorOperations.PRECOMPILE_CONTRACT_NDIMS[1]  # N3: same bound as N1
+
+            NA = N1 + N2  # rank of A
+            NB = N2 + N3  # rank of B
+            NC = N1 + N3  # rank of C
+            TC, TA, TB = Array{T,NC}, Array{T,NA}, Array{T,NB}
+            pA = Index2Tuple{N1,N2}
+            pB = Index2Tuple{N2,N3}
+            pAB = Index2Tuple{NC,0}
+
+            TC_buf = Core.Compiler.return_type(tensoralloc_contract,  # inferred, nothing is run
+                                               Tuple{T,TA,pA,Bool,TB,pB,Bool,pAB,Val{true},
+                                                     buf})
+            TA_buf = Core.Compiler.return_type(tensoralloc_add,  # inferred type for A
+                                               Tuple{T,TA,pA,Bool,Val{true},buf})
+            TB_buf = Core.Compiler.return_type(tensoralloc_add,  # inferred type for B
+                                               Tuple{T,TB,pB,Bool,Val{true},buf})
+            # unlike the loops above, nothing is skipped: every signature ends in (backend, buf)
+            for (C, A, B) in Iterators.product((TC, TC_buf), (TA, TA_buf), (TB, TB_buf))
+                precompile(tensorcontract!,  # scalars: One, Zero
+                           (C, A, pA, Bool, B, pB, Bool, pAB, One, Zero, backend, buf))
+                precompile(tensorcontract!,  # scalars: T, Zero
+                           (C, A, pA, Bool, B, pB, Bool, pAB, T, Zero, backend, buf))
+                precompile(tensorcontract!,  # scalars: T, T
+                           (C, A, pA, Bool, B, pB, Bool, pAB, T, T, backend, buf))
+            end
+
+            for (A, B) in Iterators.product((TA, TA_buf), (TB, TB_buf))  # all four input pairs
+                precompile(tensoralloc_contract,  # Val{true} variant
+                           (T, A, pA, Bool, B, pB, Bool, pAB, Val{true}, buf))
+                precompile(tensoralloc_contract,  # Val{false} variant
+                           (T, A, pA, Bool, B, pB, Bool, pAB, Val{false}, buf))
+            end
+        end
+    end
+end
+
 end