Make `resize!` run faster #2828
Conversation
Your PR requires formatting changes to meet the project's style guidelines. Suggested changes:

diff --git a/src/array.jl b/src/array.jl
index 64a568504..d00c1a75b 100644
--- a/src/array.jl
+++ b/src/array.jl
@@ -888,41 +888,41 @@ with undefined values.
function Base.resize!(A::CuVector{T}, n::Integer) where T
n == length(A) && return A
- cap = A.maxsize ÷ aligned_sizeof(T)
-
- # do nothing when new length is smaller than maxsize
- if n > cap # n > length(A)
-
- # if maxsize is larger than 10 MiB
- if A.maxsize > RESIZE_THRESHOLD
- len = max(cap + RESIZE_INCREMENT ÷ aligned_sizeof(T), n) # add at least 1 MiB
- else
- len = max(n, 2 * length(A))
- end
+ cap = A.maxsize ÷ aligned_sizeof(T)
- maxsize = len * aligned_sizeof(T)
- bufsize = if isbitstype(T)
- maxsize
- else
- # type tag array past the data
- maxsize + len
- end
+ # do nothing when new length is smaller than maxsize
+ if n > cap # n > length(A)
- new_data = context!(context(A)) do
- mem = pool_alloc(memory_type(A), bufsize)
- ptr = convert(CuPtr{T}, mem)
- m = min(length(A), n)
- if m > 0
- GC.@preserve A unsafe_copyto!(ptr, pointer(A), m)
- end
- DataRef(pool_free, mem)
+ # if maxsize is larger than 10 MiB
+ if A.maxsize > RESIZE_THRESHOLD
+ len = max(cap + RESIZE_INCREMENT ÷ aligned_sizeof(T), n) # add at least 1 MiB
+ else
+ len = max(n, 2 * length(A))
+ end
+
+ maxsize = len * aligned_sizeof(T)
+ bufsize = if isbitstype(T)
+ maxsize
+ else
+ # type tag array past the data
+ maxsize + len
+ end
+
+ new_data = context!(context(A)) do
+ mem = pool_alloc(memory_type(A), bufsize)
+ ptr = convert(CuPtr{T}, mem)
+ m = min(length(A), n)
+ if m > 0
+ GC.@preserve A unsafe_copyto!(ptr, pointer(A), m)
+ end
+ DataRef(pool_free, mem)
+ end
+ unsafe_free!(A)
+ A.data = new_data
+ A.maxsize = maxsize
+ A.offset = 0
end
- unsafe_free!(A)
- A.data = new_data
- A.maxsize = maxsize
- A.offset = 0
- end
- A.dims = (n,)
+ A.dims = (n,)
A
end
diff --git a/test/base/array.jl b/test/base/array.jl
index a0573c73f..b89c88cb0 100644
--- a/test/base/array.jl
+++ b/test/base/array.jl
@@ -550,68 +550,68 @@ end
end
@testset "resizing" begin
- # 1) small arrays (<=10 MiB): should still use doubling policy
- a = CuArray([1, 2, 3])
-
- # reallocation (add less than half)
- CUDA.resize!(a, 4)
- @test length(a) == 4
- @test Array(a)[1:3] == [1, 2, 3]
- @test a.maxsize == max(4, 2*3) * sizeof(eltype(a))
-
- # no reallocation
- CUDA.resize!(a, 5)
- @test length(a) == 5
- @test Array(a)[1:3] == [1, 2, 3]
- @test a.maxsize == 6 * sizeof(eltype(a))
-
- # reallocation (add more than half)
- CUDA.resize!(a, 12)
- @test length(a) == 12
- @test Array(a)[1:3] == [1, 2, 3]
- @test a.maxsize == max(12, 2*5) * sizeof(eltype(a))
-
- # 2) large arrays (>10 MiB): should use 1 MiB increments
- b = CUDA.fill(1, 2*1024^2)
- maxsize = b.maxsize
-
- # should bump by exactly 1 MiB
- CUDA.resize!(b, 2*1024^2 + 1)
- @test length(b) == 2*1024^2 + 1
- @test b.maxsize == maxsize + CUDA.RESIZE_INCREMENT
- @test all(Array(b)[1:2*1024^2] .== 1)
-
- b = CUDA.fill(1, 2*1024^2)
- maxsize = b.maxsize
-
- # should bump greater than 1 MiB
- new = CUDA.RESIZE_INCREMENT ÷ sizeof(eltype(b))
- CUDA.resize!(b, 2*1024^2 + new + 1)
- @test length(b) == 2*1024^2 + new + 1
- @test b.maxsize > maxsize + CUDA.RESIZE_INCREMENT
- @test all(Array(b)[1:2*1024^2] .== 1)
-
- b = CUDA.fill(1, 2*1024^2)
- maxsize = b.maxsize
-
- # no reallocation
- CUDA.resize!(b, 2*1024^2 - 1)
- @test length(b) == 2*1024^2 - 1
- @test b.maxsize == maxsize
- @test all(Array(b)[1:2*1024^2 - 1] .== 1)
-
- # 3) corner cases
- c = CuArray{Int}(undef, 0)
- @test length(c) == 0
- CUDA.resize!(c, 1)
- @test length(c) == 1
- @test c.maxsize == 1 * sizeof(eltype(c))
-
- c = CuArray{Int}(undef, 1)
- @test length(c) == 1
- CUDA.resize!(c, 0)
- @test length(c) == 0
- @test c.maxsize == 1 * sizeof(eltype(c))
+ # 1) small arrays (<=10 MiB): should still use doubling policy
+ a = CuArray([1, 2, 3])
+
+ # reallocation (add less than half)
+ CUDA.resize!(a, 4)
+ @test length(a) == 4
+ @test Array(a)[1:3] == [1, 2, 3]
+ @test a.maxsize == max(4, 2 * 3) * sizeof(eltype(a))
+
+ # no reallocation
+ CUDA.resize!(a, 5)
+ @test length(a) == 5
+ @test Array(a)[1:3] == [1, 2, 3]
+ @test a.maxsize == 6 * sizeof(eltype(a))
+
+ # reallocation (add more than half)
+ CUDA.resize!(a, 12)
+ @test length(a) == 12
+ @test Array(a)[1:3] == [1, 2, 3]
+ @test a.maxsize == max(12, 2 * 5) * sizeof(eltype(a))
+
+ # 2) large arrays (>10 MiB): should use 1 MiB increments
+ b = CUDA.fill(1, 2 * 1024^2)
+ maxsize = b.maxsize
+
+ # should bump by exactly 1 MiB
+ CUDA.resize!(b, 2 * 1024^2 + 1)
+ @test length(b) == 2 * 1024^2 + 1
+ @test b.maxsize == maxsize + CUDA.RESIZE_INCREMENT
+ @test all(Array(b)[1:(2 * 1024^2)] .== 1)
+
+ b = CUDA.fill(1, 2 * 1024^2)
+ maxsize = b.maxsize
+
+ # should bump greater than 1 MiB
+ new = CUDA.RESIZE_INCREMENT ÷ sizeof(eltype(b))
+ CUDA.resize!(b, 2 * 1024^2 + new + 1)
+ @test length(b) == 2 * 1024^2 + new + 1
+ @test b.maxsize > maxsize + CUDA.RESIZE_INCREMENT
+ @test all(Array(b)[1:(2 * 1024^2)] .== 1)
+
+ b = CUDA.fill(1, 2 * 1024^2)
+ maxsize = b.maxsize
+
+ # no reallocation
+ CUDA.resize!(b, 2 * 1024^2 - 1)
+ @test length(b) == 2 * 1024^2 - 1
+ @test b.maxsize == maxsize
+ @test all(Array(b)[1:(2 * 1024^2 - 1)] .== 1)
+
+ # 3) corner cases
+ c = CuArray{Int}(undef, 0)
+ @test length(c) == 0
+ CUDA.resize!(c, 1)
+ @test length(c) == 1
+ @test c.maxsize == 1 * sizeof(eltype(c))
+
+ c = CuArray{Int}(undef, 1)
+ @test length(c) == 1
+ CUDA.resize!(c, 0)
+ @test length(c) == 0
+ @test c.maxsize == 1 * sizeof(eltype(c))
end
@testset "aliasing" begin
@maleadt Please review. Thanks!
Can you modify `resize!` instead of adding a `new_resize!`?

The change should also make use of the `maxsize` property of `CuArray`, which is there to allow additional data to be allocated without the dimensions of the array having to match. `resize!` should be made aware of that, not doing anything when the needed size is smaller than `maxsize`.

In addition, IIUC you're using a growth factor of 2 now (ignoring resizes when shrinking by up to a half, resizing by 2 when requesting a larger array), which I think may be too aggressive for GPU arrays. For small arrays it's probably fine, but at some point (> 10 MB?) we should probably use a fixed (1 MB?) increment instead.
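For reference, a minimal standalone sketch of this growth policy, which is what the diff above ends up implementing. The constants mirror `RESIZE_THRESHOLD` / `RESIZE_INCREMENT` from the PR, but the helper name and signature here are just illustrative assumptions:

```julia
# Sketch of the proposed growth policy (mirrors the diff above; names and
# constants are assumptions based on this discussion, not a definitive API).
const THRESHOLD = 10 * 1024^2  # bytes: switch from doubling to fixed increments
const INCREMENT = 1 * 1024^2   # bytes: growth step for large arrays

# `len`/`cap`: current length and capacity in elements, `n`: requested length,
# `elsize`: element size in bytes. Returns the new capacity in elements.
function new_capacity(len::Integer, cap::Integer, n::Integer, elsize::Integer)
    n <= cap && return cap                       # fits in the existing buffer
    if cap * elsize > THRESHOLD
        return max(cap + INCREMENT ÷ elsize, n)  # grow by at least ~1 MiB
    else
        return max(n, 2 * len)                   # double for small arrays
    end
end
```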
> In addition, …

👍

👍 but why do we choose 10 MB and 1 MB?

Do you have any other comments?
    cap = A.maxsize ÷ aligned_sizeof(T)

    # do nothing when new length is smaller than maxsize
    if n > cap # n > length(A)
> resize! should be made aware of that, not doing anything when the needed size is smaller than maxsize

Is that really OK if we never shrink the GPU arrays?
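If shrinking never releases memory, a caller who cares could still reclaim it explicitly. A hedged sketch, not part of this PR: `shrink_to_fit` is a hypothetical helper built from the existing `copy` and `CUDA.unsafe_free!`.

```julia
using CUDA

# Hypothetical helper: copy into a tightly sized buffer and release the old,
# over-allocated one back to the pool early instead of waiting for GC.
function shrink_to_fit(a::CuVector)
    b = copy(a)           # fresh allocation sized to the current length
    CUDA.unsafe_free!(a)  # return the old buffer to the pool
    return b
end
```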
Codecov Report

❌ Patch coverage is …

Additional details and impacted files:

@@            Coverage Diff             @@
##           master    #2828      +/-   ##
==========================================
- Coverage   89.78%   89.71%   -0.08%
==========================================
  Files         150      150
  Lines       13229    13234       +5
==========================================
- Hits        11878    11873       -5
- Misses       1351     1361      +10

☔ View full report in Codecov by Sentry.
@maleadt, can you point me to the existing CI benchmark results? I could not find them. Thanks!
        maxsize
    else
        # type tag array past the data
        maxsize + len
Do we have to add a test covering this line, given the Codecov report (https://app.codecov.io/gh/JuliaGPU/CUDA.jl/pull/2828?src=pr&el=tree&utm_medium=referral&utm_source=github&utm_content=comment&utm_campaign=pr+comments&utm_term=JuliaGPU)?
But I think the original tests did not cover this line either.
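If coverage for that branch is wanted, here is a sketch of a test that might exercise it (my assumption, not from this PR): an isbits-Union element type is not `isbitstype`, so resizing such a vector should take the `maxsize + len` path that reserves the type-tag bytes past the data.

```julia
# Hypothetical test sketch for the non-isbitstype branch (union storage keeps
# a type-tag byte per element past the data).
d = CuVector{Union{Int,Nothing}}(undef, 3)
CUDA.resize!(d, 5)
@test length(d) == 5
```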
@maleadt Could you please review this when you get time? It would be good to merge it as soon as possible.
I need the `resize!` function to run faster when it is called frequently. I'm not sure whether 2 is a good growth factor, but I think it is a reasonable number. I ran the benchmarks under the assumption that the resize lengths are uniformly distributed within a range.
Benchmark script (collapsed)
Benchmark result (collapsed)
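The actual script and results are in the collapsed sections above; a rough sketch of what such a benchmark could look like, assuming the setup described here (`resize_workload!` is an illustrative helper, not the author's script):

```julia
using CUDA, BenchmarkTools, Random

# Repeatedly resize a vector to lengths drawn uniformly from a range,
# matching the "uniformly distributed resize length" assumption above.
function resize_workload!(a::CuVector{Int}, lengths)
    for n in lengths
        CUDA.resize!(a, n)
    end
    return a
end

Random.seed!(0)
lengths = rand(1:10_000_000, 1_000)  # uniformly distributed target lengths
a = CUDA.zeros(Int, 1)
@btime CUDA.@sync resize_workload!($a, $lengths)
```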
But it is still not as fast as the CPU `resize!` function. Do you have any other suggestions to make it run faster (for example, a better growth factor)? And do we need to benchmark it under other assumptions (for example, resize lengths that grow or shrink linearly over time)? I expect the performance will not be good in some corner cases, such as repeatedly expanding the GPU array to less than double its length.
I have kept the new `resize!` and the old `resize!` separate temporarily for easier comparison.