-
Notifications
You must be signed in to change notification settings - Fork 244
[CUBLAS] Update wrappers to use the ILP64 API #2845
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
amontoison
wants to merge
2
commits into
JuliaGPU:master
Choose a base branch
from
amontoison:cublas_64
base: master
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
+37
−17
Conversation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Your PR requires formatting changes to meet the project's style guidelines. Click here to view the suggested changes.
diff --git a/lib/cublas/wrappers.jl b/lib/cublas/wrappers.jl
index 07c584860..510d0fbde 100644
--- a/lib/cublas/wrappers.jl
+++ b/lib/cublas/wrappers.jl
@@ -400,8 +400,10 @@ for (fname, fname_64, elty) in ((:cublasSrotm_v2, :cublasSrotm_v2_64, :Float32),
end
## rotmg
-for (fname, fname_64, elty) in ((:cublasSrotmg_v2, :cublasSrotmg_v2_64, :Float32),
- (:cublasDrotmg_v2, :cublasSrotmg_v2_64, :Float64))
+for (fname, fname_64, elty) in (
+ (:cublasSrotmg_v2, :cublasSrotmg_v2_64, :Float32),
+ (:cublasDrotmg_v2, :cublasDrotmg_v2_64, :Float64),
+ )
@eval begin
function rotmg!(d1::$elty,
d2::$elty,
@@ -1120,7 +1122,7 @@ end
## (GE) general matrix-matrix multiplication
for (fname, fname_64, elty) in ((:cublasDgemm_v2, :cublasDgemm_v2_64, :Float64),
(:cublasSgemm_v2, :cublasSgemm_v2_64, :Float32),
- (:cublasHgemm, :cublasHgemm_64, :Float16),
+ (:cublasHgemm, :cublasHgemm_64, :Float16),
(:cublasZgemm_v2, :cublasZgemm_v2_64, :ComplexF64),
(:cublasCgemm_v2, :cublasCgemm_v2_64, :ComplexF32))
@eval begin
@@ -1531,7 +1533,7 @@ end
## (GE) general matrix-matrix multiplication batched
for (fname, fname_64, elty) in ((:cublasDgemmBatched, :cublasDgemmBatched_64, :Float64),
(:cublasSgemmBatched, :cublasSgemmBatched_64, :Float32),
- (:cublasHgemmBatched, :cublasHgemmBatched_64, :Float16),
+ (:cublasHgemmBatched, :cublasHgemmBatched_64, :Float16),
(:cublasZgemmBatched, :cublasZgemmBatched_64, :ComplexF64),
(:cublasCgemmBatched, :cublasCgemmBatched_64, :ComplexF32))
@eval begin
@@ -1598,7 +1600,7 @@ end
## (GE) general matrix-matrix multiplication strided batched
for (fname, fname_64, elty) in ((:cublasDgemmStridedBatched, :cublasDgemmStridedBatched_64, :Float64),
(:cublasSgemmStridedBatched, :cublasSgemmStridedBatched_64, :Float32),
- (:cublasHgemmStridedBatched, :cublasHgemmStridedBatched_64, :Float16),
+ (:cublasHgemmStridedBatched, :cublasHgemmStridedBatched_64, :Float16),
(:cublasZgemmStridedBatched, :cublasZgemmStridedBatched_64, :ComplexF64),
(:cublasCgemmStridedBatched, :cublasCgemmStridedBatched_64, :ComplexF32))
@eval begin
@@ -1950,10 +1952,12 @@ end
## (TR) Triangular matrix and vector multiplication and solution
for (mmname, mmname_64, elty) in
- ((:cublasDtrmm_v2, :cublasDtrmm_v2_64, :Float64),
- (:cublasStrmm_v2, :cublasStrmm_v2_64, :Float32),
- (:cublasZtrmm_v2, :cublasZtrmm_v2_64, :ComplexF64),
- (:cublasCtrmm_v2, :cublasCtrmm_v2_64, :ComplexF32))
+ (
+ (:cublasDtrmm_v2, :cublasDtrmm_v2_64, :Float64),
+ (:cublasStrmm_v2, :cublasStrmm_v2_64, :Float32),
+ (:cublasZtrmm_v2, :cublasZtrmm_v2_64, :ComplexF64),
+ (:cublasCtrmm_v2, :cublasCtrmm_v2_64, :ComplexF32),
+ )
@eval begin
# Note: CUBLAS differs from BLAS API for trmm
# BLAS: inplace modification of B
@@ -1987,10 +1991,12 @@ for (mmname, mmname_64, elty) in
end
for (smname, smname_64, elty) in
- ((:cublasDtrsm_v2, :cublasDtrsm_v2_64, :Float64),
- (:cublasStrsm_v2, :cublasStrsm_v2_64, :Float32),
- (:cublasZtrsm_v2, :cublasZtrsm_v2_64, :ComplexF64),
- (:cublasCtrsm_v2, :cublasCtrsm_v2_64, :ComplexF32))
+ (
+ (:cublasDtrsm_v2, :cublasDtrsm_v2_64, :Float64),
+ (:cublasStrsm_v2, :cublasStrsm_v2_64, :Float32),
+ (:cublasZtrsm_v2, :cublasZtrsm_v2_64, :ComplexF64),
+ (:cublasCtrsm_v2, :cublasCtrsm_v2_64, :ComplexF32),
+ )
@eval begin
function trsm!(side::Char,
uplo::Char, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
CUDA.jl Benchmarks
Benchmark suite | Current: ace28c3 | Previous: c05359d | Ratio |
---|---|---|---|
latency/precompile |
42966135947 ns |
42922336650.5 ns |
1.00 |
latency/ttfp |
7012136452 ns |
7015168424 ns |
1.00 |
latency/import |
3573568156 ns |
3571269514 ns |
1.00 |
integration/volumerhs |
9625284.5 ns |
9608723 ns |
1.00 |
integration/byval/slices=1 |
147213 ns |
146920.5 ns |
1.00 |
integration/byval/slices=3 |
426506 ns |
425845 ns |
1.00 |
integration/byval/reference |
145291 ns |
145020 ns |
1.00 |
integration/byval/slices=2 |
286867 ns |
286380 ns |
1.00 |
integration/cudadevrt |
103634 ns |
103554 ns |
1.00 |
kernel/indexing |
14158.5 ns |
14235 ns |
0.99 |
kernel/indexing_checked |
15011 ns |
14711 ns |
1.02 |
kernel/occupancy |
670.632911392405 ns |
672.5506329113924 ns |
1.00 |
kernel/launch |
2238.1111111111113 ns |
2270.3333333333335 ns |
0.99 |
kernel/rand |
15214 ns |
14669 ns |
1.04 |
array/reverse/1d |
19943 ns |
19682 ns |
1.01 |
array/reverse/2d |
24868.5 ns |
23613.5 ns |
1.05 |
array/reverse/1d_inplace |
10397 ns |
10461 ns |
0.99 |
array/reverse/2d_inplace |
12028 ns |
13212 ns |
0.91 |
array/copy |
20985 ns |
20972 ns |
1.00 |
array/iteration/findall/int |
157586.5 ns |
157808 ns |
1.00 |
array/iteration/findall/bool |
140117 ns |
139837 ns |
1.00 |
array/iteration/findfirst/int |
165035.5 ns |
164937 ns |
1.00 |
array/iteration/findfirst/bool |
158235 ns |
165868 ns |
0.95 |
array/iteration/scalar |
72865 ns |
73041 ns |
1.00 |
array/iteration/logical |
215963.5 ns |
214850 ns |
1.01 |
array/iteration/findmin/1d |
46181 ns |
46704 ns |
0.99 |
array/iteration/findmin/2d |
96327.5 ns |
96962.5 ns |
0.99 |
array/reductions/reduce/Int64/1d |
43700.5 ns |
46033 ns |
0.95 |
array/reductions/reduce/Int64/dims=1 |
49010 ns |
55193 ns |
0.89 |
array/reductions/reduce/Int64/dims=2 |
62902.5 ns |
62917 ns |
1.00 |
array/reductions/reduce/Int64/dims=1L |
89016 ns |
88869 ns |
1.00 |
array/reductions/reduce/Int64/dims=2L |
88505 ns |
87079 ns |
1.02 |
array/reductions/reduce/Float32/1d |
34730 ns |
34606 ns |
1.00 |
array/reductions/reduce/Float32/dims=1 |
41845 ns |
43875 ns |
0.95 |
array/reductions/reduce/Float32/dims=2 |
59974 ns |
59705 ns |
1.00 |
array/reductions/reduce/Float32/dims=1L |
52527 ns |
52260 ns |
1.01 |
array/reductions/reduce/Float32/dims=2L |
70389 ns |
70051.5 ns |
1.00 |
array/reductions/mapreduce/Int64/1d |
43580 ns |
42671.5 ns |
1.02 |
array/reductions/mapreduce/Int64/dims=1 |
48210.5 ns |
45980 ns |
1.05 |
array/reductions/mapreduce/Int64/dims=2 |
62641 ns |
62143.5 ns |
1.01 |
array/reductions/mapreduce/Int64/dims=1L |
89035 ns |
88812 ns |
1.00 |
array/reductions/mapreduce/Int64/dims=2L |
87202 ns |
86818 ns |
1.00 |
array/reductions/mapreduce/Float32/1d |
34464 ns |
34742 ns |
0.99 |
array/reductions/mapreduce/Float32/dims=1 |
42028 ns |
43090.5 ns |
0.98 |
array/reductions/mapreduce/Float32/dims=2 |
60389 ns |
60061 ns |
1.01 |
array/reductions/mapreduce/Float32/dims=1L |
52805 ns |
52528 ns |
1.01 |
array/reductions/mapreduce/Float32/dims=2L |
70688 ns |
70191 ns |
1.01 |
array/broadcast |
20331 ns |
20155 ns |
1.01 |
array/copyto!/gpu_to_gpu |
12820 ns |
11294 ns |
1.14 |
array/copyto!/cpu_to_gpu |
215251 ns |
216503 ns |
0.99 |
array/copyto!/gpu_to_cpu |
283637 ns |
284237 ns |
1.00 |
array/accumulate/Int64/1d |
124894.5 ns |
125529 ns |
0.99 |
array/accumulate/Int64/dims=1 |
83631 ns |
84037 ns |
1.00 |
array/accumulate/Int64/dims=2 |
158233 ns |
159166 ns |
0.99 |
array/accumulate/Int64/dims=1L |
1719529 ns |
1720376 ns |
1.00 |
array/accumulate/Int64/dims=2L |
967882 ns |
968348 ns |
1.00 |
array/accumulate/Float32/1d |
109433 ns |
109984 ns |
0.99 |
array/accumulate/Float32/dims=1 |
80815 ns |
81082 ns |
1.00 |
array/accumulate/Float32/dims=2 |
147970 ns |
148760 ns |
0.99 |
array/accumulate/Float32/dims=1L |
1618276 ns |
1629307.5 ns |
0.99 |
array/accumulate/Float32/dims=2L |
698936 ns |
701479 ns |
1.00 |
array/construct |
1305.6 ns |
1287.2 ns |
1.01 |
array/random/randn/Float32 |
44977 ns |
44176 ns |
1.02 |
array/random/randn!/Float32 |
25089 ns |
24930 ns |
1.01 |
array/random/rand!/Int64 |
27518 ns |
27547 ns |
1.00 |
array/random/rand!/Float32 |
8792.666666666666 ns |
8724.666666666666 ns |
1.01 |
array/random/rand/Int64 |
30018 ns |
30114 ns |
1.00 |
array/random/rand/Float32 |
12998 ns |
13059 ns |
1.00 |
array/permutedims/4d |
60364.5 ns |
60761 ns |
0.99 |
array/permutedims/2d |
54202 ns |
54037 ns |
1.00 |
array/permutedims/3d |
55093 ns |
54954 ns |
1.00 |
array/sorting/1d |
2755872.5 ns |
2756544 ns |
1.00 |
array/sorting/by |
3354726 ns |
3343249 ns |
1.00 |
array/sorting/2d |
1084530 ns |
1080799 ns |
1.00 |
cuda/synchronization/stream/auto |
1026.3 ns |
1040.3 ns |
0.99 |
cuda/synchronization/stream/nonblocking |
8291.400000000001 ns |
7220 ns |
1.15 |
cuda/synchronization/stream/blocking |
805.3258426966293 ns |
802.3333333333334 ns |
1.00 |
cuda/synchronization/context/auto |
1197.8 ns |
1203.5 ns |
1.00 |
cuda/synchronization/context/nonblocking |
7976.1 ns |
7276.700000000001 ns |
1.10 |
cuda/synchronization/context/blocking |
932.25 ns |
900.4347826086956 ns |
1.04 |
This comment was automatically generated by workflow using github-action-benchmark.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
I checked the symbols with
nm -D .../libcusolver.so
and it seems that they are in the library.