
[CUBLAS] Update wrappers to use the ILP64 API #2845


Open
wants to merge 2 commits into master
Conversation

@amontoison (Member) commented Aug 12, 2025

I checked the symbols with nm -D .../libcusolver.so, and they appear to be present in the library.
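
For reference, a rough equivalent of that check can be done from Julia with Libdl; the library name and symbol below are illustrative (not taken from this PR), and the exact path depends on the local CUDA installation:

```julia
using Libdl

# Sketch: check whether an ILP64 entry point is exported by a library.
# "libcublas" and the symbol name here are examples only.
lib = Libdl.dlopen("libcublas")
sym = Libdl.dlsym(lib, :cublasDgemm_v2_64; throw_error = false)
println(sym === nothing ? "ILP64 symbol missing" : "ILP64 symbol found")
```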

@amontoison changed the title from "Update wrappers to use the ILP64 API" to "[CUBLAS] Update wrappers to use the ILP64 API" on Aug 12, 2025

Your PR requires formatting changes to meet the project's style guidelines.
Please consider running Runic (git runic master) to apply these changes.

Suggested changes:
diff --git a/lib/cublas/wrappers.jl b/lib/cublas/wrappers.jl
index 07c584860..510d0fbde 100644
--- a/lib/cublas/wrappers.jl
+++ b/lib/cublas/wrappers.jl
@@ -400,8 +400,10 @@ for (fname, fname_64, elty) in ((:cublasSrotm_v2, :cublasSrotm_v2_64, :Float32),
 end
 
 ## rotmg
-for (fname, fname_64, elty) in ((:cublasSrotmg_v2, :cublasSrotmg_v2_64, :Float32),
-                                (:cublasDrotmg_v2, :cublasSrotmg_v2_64, :Float64))
+for (fname, fname_64, elty) in (
+        (:cublasSrotmg_v2, :cublasSrotmg_v2_64, :Float32),
+        (:cublasDrotmg_v2, :cublasDrotmg_v2_64, :Float64),
+    )
     @eval begin
         function rotmg!(d1::$elty,
                         d2::$elty,
@@ -1120,7 +1122,7 @@ end
 ## (GE) general matrix-matrix multiplication
 for (fname, fname_64, elty) in ((:cublasDgemm_v2, :cublasDgemm_v2_64, :Float64),
                                 (:cublasSgemm_v2, :cublasSgemm_v2_64, :Float32),
-                                (:cublasHgemm, :cublasHgemm_64, :Float16),
+        (:cublasHgemm, :cublasHgemm_64, :Float16),
                                 (:cublasZgemm_v2, :cublasZgemm_v2_64, :ComplexF64),
                                 (:cublasCgemm_v2, :cublasCgemm_v2_64, :ComplexF32))
     @eval begin
@@ -1531,7 +1533,7 @@ end
 ## (GE) general matrix-matrix multiplication batched
 for (fname, fname_64, elty) in ((:cublasDgemmBatched, :cublasDgemmBatched_64, :Float64),
                                 (:cublasSgemmBatched, :cublasSgemmBatched_64, :Float32),
-                                (:cublasHgemmBatched, :cublasHgemmBatched_64, :Float16),
+        (:cublasHgemmBatched, :cublasHgemmBatched_64, :Float16),
                                 (:cublasZgemmBatched, :cublasZgemmBatched_64, :ComplexF64),
                                 (:cublasCgemmBatched, :cublasCgemmBatched_64, :ComplexF32))
     @eval begin
@@ -1598,7 +1600,7 @@ end
 ## (GE) general matrix-matrix multiplication strided batched
 for (fname, fname_64, elty) in ((:cublasDgemmStridedBatched, :cublasDgemmStridedBatched_64, :Float64),
                                 (:cublasSgemmStridedBatched, :cublasSgemmStridedBatched_64, :Float32),
-                                (:cublasHgemmStridedBatched, :cublasHgemmStridedBatched_64, :Float16),
+        (:cublasHgemmStridedBatched, :cublasHgemmStridedBatched_64, :Float16),
                                 (:cublasZgemmStridedBatched, :cublasZgemmStridedBatched_64, :ComplexF64),
                                 (:cublasCgemmStridedBatched, :cublasCgemmStridedBatched_64, :ComplexF32))
     @eval begin
@@ -1950,10 +1952,12 @@ end
 
 ## (TR) Triangular matrix and vector multiplication and solution
 for (mmname, mmname_64, elty) in
-        ((:cublasDtrmm_v2, :cublasDtrmm_v2_64, :Float64),
-         (:cublasStrmm_v2, :cublasStrmm_v2_64, :Float32),
-         (:cublasZtrmm_v2, :cublasZtrmm_v2_64, :ComplexF64),
-         (:cublasCtrmm_v2, :cublasCtrmm_v2_64, :ComplexF32))
+    (
+        (:cublasDtrmm_v2, :cublasDtrmm_v2_64, :Float64),
+        (:cublasStrmm_v2, :cublasStrmm_v2_64, :Float32),
+        (:cublasZtrmm_v2, :cublasZtrmm_v2_64, :ComplexF64),
+        (:cublasCtrmm_v2, :cublasCtrmm_v2_64, :ComplexF32),
+    )
     @eval begin
         # Note: CUBLAS differs from BLAS API for trmm
         #   BLAS: inplace modification of B
@@ -1987,10 +1991,12 @@ for (mmname, mmname_64, elty) in
 end
 
 for (smname, smname_64, elty) in
-        ((:cublasDtrsm_v2, :cublasDtrsm_v2_64, :Float64),
-         (:cublasStrsm_v2, :cublasStrsm_v2_64, :Float32),
-         (:cublasZtrsm_v2, :cublasZtrsm_v2_64, :ComplexF64),
-         (:cublasCtrsm_v2, :cublasCtrsm_v2_64, :ComplexF32))
+    (
+        (:cublasDtrsm_v2, :cublasDtrsm_v2_64, :Float64),
+        (:cublasStrsm_v2, :cublasStrsm_v2_64, :Float32),
+        (:cublasZtrsm_v2, :cublasZtrsm_v2_64, :ComplexF64),
+        (:cublasCtrsm_v2, :cublasCtrsm_v2_64, :ComplexF32),
+    )
     @eval begin
         function trsm!(side::Char,
                        uplo::Char,

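To make the pattern in the diff concrete: each loop emits one method per (LP64 symbol, ILP64 symbol, element type) triple via @eval, with the `_64` name bound to the ILP64 entry point. Below is a minimal self-contained sketch of that wrapper-generation idiom; the function names are stand-ins and the size-based selection is illustrative rather than the actual CUDA.jl dispatch rule (the real wrappers ccall into libcublas and may key off the CUBLAS version instead):

```julia
# Sketch only: symbols and the selection criterion are stand-ins.
for (fname, fname_64, elty) in ((:cublasSscal_v2, :cublasSscal_v2_64, :Float32),
                                (:cublasDscal_v2, :cublasDscal_v2_64, :Float64))
    @eval begin
        function scal_demo!(n::Integer, a::$elty, x::Vector{$elty})
            if n > typemax(Int32)
                # ILP64 entry points take 64-bit integer sizes/strides.
                println("would call ", $(string(fname_64)), " with Int64 arguments")
            else
                # The classic API takes 32-bit integers.
                println("would call ", $(string(fname)), " with Int32 arguments")
            end
            x .*= a
            return x
        end
    end
end

scal_demo!(4, 2.0f0, ones(Float32, 4))  # prints "would call cublasSscal_v2 ..."
```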

@github-actions (bot) left a comment


CUDA.jl Benchmarks

Benchmark suite Current: ace28c3 Previous: c05359d Ratio
latency/precompile 42966135947 ns 42922336650.5 ns 1.00
latency/ttfp 7012136452 ns 7015168424 ns 1.00
latency/import 3573568156 ns 3571269514 ns 1.00
integration/volumerhs 9625284.5 ns 9608723 ns 1.00
integration/byval/slices=1 147213 ns 146920.5 ns 1.00
integration/byval/slices=3 426506 ns 425845 ns 1.00
integration/byval/reference 145291 ns 145020 ns 1.00
integration/byval/slices=2 286867 ns 286380 ns 1.00
integration/cudadevrt 103634 ns 103554 ns 1.00
kernel/indexing 14158.5 ns 14235 ns 0.99
kernel/indexing_checked 15011 ns 14711 ns 1.02
kernel/occupancy 670.632911392405 ns 672.5506329113924 ns 1.00
kernel/launch 2238.1111111111113 ns 2270.3333333333335 ns 0.99
kernel/rand 15214 ns 14669 ns 1.04
array/reverse/1d 19943 ns 19682 ns 1.01
array/reverse/2d 24868.5 ns 23613.5 ns 1.05
array/reverse/1d_inplace 10397 ns 10461 ns 0.99
array/reverse/2d_inplace 12028 ns 13212 ns 0.91
array/copy 20985 ns 20972 ns 1.00
array/iteration/findall/int 157586.5 ns 157808 ns 1.00
array/iteration/findall/bool 140117 ns 139837 ns 1.00
array/iteration/findfirst/int 165035.5 ns 164937 ns 1.00
array/iteration/findfirst/bool 158235 ns 165868 ns 0.95
array/iteration/scalar 72865 ns 73041 ns 1.00
array/iteration/logical 215963.5 ns 214850 ns 1.01
array/iteration/findmin/1d 46181 ns 46704 ns 0.99
array/iteration/findmin/2d 96327.5 ns 96962.5 ns 0.99
array/reductions/reduce/Int64/1d 43700.5 ns 46033 ns 0.95
array/reductions/reduce/Int64/dims=1 49010 ns 55193 ns 0.89
array/reductions/reduce/Int64/dims=2 62902.5 ns 62917 ns 1.00
array/reductions/reduce/Int64/dims=1L 89016 ns 88869 ns 1.00
array/reductions/reduce/Int64/dims=2L 88505 ns 87079 ns 1.02
array/reductions/reduce/Float32/1d 34730 ns 34606 ns 1.00
array/reductions/reduce/Float32/dims=1 41845 ns 43875 ns 0.95
array/reductions/reduce/Float32/dims=2 59974 ns 59705 ns 1.00
array/reductions/reduce/Float32/dims=1L 52527 ns 52260 ns 1.01
array/reductions/reduce/Float32/dims=2L 70389 ns 70051.5 ns 1.00
array/reductions/mapreduce/Int64/1d 43580 ns 42671.5 ns 1.02
array/reductions/mapreduce/Int64/dims=1 48210.5 ns 45980 ns 1.05
array/reductions/mapreduce/Int64/dims=2 62641 ns 62143.5 ns 1.01
array/reductions/mapreduce/Int64/dims=1L 89035 ns 88812 ns 1.00
array/reductions/mapreduce/Int64/dims=2L 87202 ns 86818 ns 1.00
array/reductions/mapreduce/Float32/1d 34464 ns 34742 ns 0.99
array/reductions/mapreduce/Float32/dims=1 42028 ns 43090.5 ns 0.98
array/reductions/mapreduce/Float32/dims=2 60389 ns 60061 ns 1.01
array/reductions/mapreduce/Float32/dims=1L 52805 ns 52528 ns 1.01
array/reductions/mapreduce/Float32/dims=2L 70688 ns 70191 ns 1.01
array/broadcast 20331 ns 20155 ns 1.01
array/copyto!/gpu_to_gpu 12820 ns 11294 ns 1.14
array/copyto!/cpu_to_gpu 215251 ns 216503 ns 0.99
array/copyto!/gpu_to_cpu 283637 ns 284237 ns 1.00
array/accumulate/Int64/1d 124894.5 ns 125529 ns 0.99
array/accumulate/Int64/dims=1 83631 ns 84037 ns 1.00
array/accumulate/Int64/dims=2 158233 ns 159166 ns 0.99
array/accumulate/Int64/dims=1L 1719529 ns 1720376 ns 1.00
array/accumulate/Int64/dims=2L 967882 ns 968348 ns 1.00
array/accumulate/Float32/1d 109433 ns 109984 ns 0.99
array/accumulate/Float32/dims=1 80815 ns 81082 ns 1.00
array/accumulate/Float32/dims=2 147970 ns 148760 ns 0.99
array/accumulate/Float32/dims=1L 1618276 ns 1629307.5 ns 0.99
array/accumulate/Float32/dims=2L 698936 ns 701479 ns 1.00
array/construct 1305.6 ns 1287.2 ns 1.01
array/random/randn/Float32 44977 ns 44176 ns 1.02
array/random/randn!/Float32 25089 ns 24930 ns 1.01
array/random/rand!/Int64 27518 ns 27547 ns 1.00
array/random/rand!/Float32 8792.666666666666 ns 8724.666666666666 ns 1.01
array/random/rand/Int64 30018 ns 30114 ns 1.00
array/random/rand/Float32 12998 ns 13059 ns 1.00
array/permutedims/4d 60364.5 ns 60761 ns 0.99
array/permutedims/2d 54202 ns 54037 ns 1.00
array/permutedims/3d 55093 ns 54954 ns 1.00
array/sorting/1d 2755872.5 ns 2756544 ns 1.00
array/sorting/by 3354726 ns 3343249 ns 1.00
array/sorting/2d 1084530 ns 1080799 ns 1.00
cuda/synchronization/stream/auto 1026.3 ns 1040.3 ns 0.99
cuda/synchronization/stream/nonblocking 8291.400000000001 ns 7220 ns 1.15
cuda/synchronization/stream/blocking 805.3258426966293 ns 802.3333333333334 ns 1.00
cuda/synchronization/context/auto 1197.8 ns 1203.5 ns 1.00
cuda/synchronization/context/nonblocking 7976.1 ns 7276.700000000001 ns 1.10
cuda/synchronization/context/blocking 932.25 ns 900.4347826086956 ns 1.04

This comment was automatically generated by a workflow using github-action-benchmark.
