-
Couldn't load subscription status.
- Fork 46
MTLSharedEvent for synchronization take 2
#690
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
|
Your PR requires formatting changes to meet the project's style guidelines. Click here to view the suggested changes.diff --git a/lib/mtl/capture.jl b/lib/mtl/capture.jl
index c855ea34..6c702036 100644
--- a/lib/mtl/capture.jl
+++ b/lib/mtl/capture.jl
@@ -12,13 +12,13 @@ Use [`beginScope()`](@ref) and [`endScope()`](@ref) to set the boundaries for a
"""
MTLCaptureScope
-function MTLCaptureScope(queue::MTLDevice, manager=MTLCaptureManager())
+function MTLCaptureScope(queue::MTLDevice, manager = MTLCaptureManager())
handle = @objc [manager::id{MTLCaptureManager} newCaptureScopeWithDevice:queue::id{MTLDevice}]::id{MTLCaptureScope}
- MTLCaptureScope(handle)
+ return MTLCaptureScope(handle)
end
-function MTLCaptureScope(queue::MTLCommandQueue, manager=MTLCaptureManager())
+function MTLCaptureScope(queue::MTLCommandQueue, manager = MTLCaptureManager())
handle = @objc [manager::id{MTLCaptureManager} newCaptureScopeWithCommandQueue:queue::id{MTLCommandQueue}]::id{MTLCaptureScope}
- MTLCaptureScope(handle)
+ return MTLCaptureScope(handle)
end
# @objcwrapper MTLCaptureScope <: NSObject
@@ -68,7 +68,8 @@ function MTLCaptureDescriptor()
end
# TODO: Add capture state
-function MTLCaptureDescriptor(obj::Union{MTLDevice, MTLCommandQueue, MTLCaptureScope},
+function MTLCaptureDescriptor(
+ obj::Union{MTLDevice, MTLCommandQueue, MTLCaptureScope},
destination::MTLCaptureDestination;
folder::String=nothing)
desc = MTLCaptureDescriptor()
@@ -119,7 +120,8 @@ end
Start GPU frame capture using the default capture object and specifying capture descriptor parameters directly.
"""
-function startCapture(obj::Union{MTLDevice, MTLCommandQueue, MTLCaptureScope},
+function startCapture(
+ obj::Union{MTLDevice, MTLCommandQueue, MTLCaptureScope},
destination::MTLCaptureDestination=MTLCaptureDestinationGPUTraceDocument;
folder::String=nothing)
if destination == MTLCaptureDestinationGPUTraceDocument && folder === nothing
diff --git a/lib/mtl/events.jl b/lib/mtl/events.jl
index 374f4285..8d2345ed 100644
--- a/lib/mtl/events.jl
+++ b/lib/mtl/events.jl
@@ -29,9 +29,11 @@ function MTLSharedEvent(dev::MTLDevice)
return obj
end
-function waitUntilSignaledValue(ev::MTLSharedEvent, value, timeoutMS=typemax(UInt64))
- @objc [ev::id{MTLSharedEvent} waitUntilSignaledValue:value::UInt64
- timeoutMS:timeoutMS::UInt64]::Bool
+function waitUntilSignaledValue(ev::MTLSharedEvent, value, timeoutMS = typemax(UInt64))
+ return @objc [
+ ev::id{MTLSharedEvent} waitUntilSignaledValue:value::UInt64
+ timeoutMS:timeoutMS::UInt64
+ ]::Bool
end
## shared event handle
diff --git a/src/state.jl b/src/state.jl
index ffa5f53d..918731c4 100644
--- a/src/state.jl
+++ b/src/state.jl
@@ -61,7 +61,7 @@ end
Return the `MTLSharedEvent` used to synchronize a queue
"""
function queue_event(queue::MTLCommandQueue)
- get!(task_local_storage(), (:MTLSharedEvent, queue)) do
+ return get!(task_local_storage(), (:MTLSharedEvent, queue)) do
MTLSharedEvent(queue.device)
end::MTLSharedEvent
end
diff --git a/test/capturing.jl b/test/capturing.jl
index 19b8b989..1810d501 100644
--- a/test/capturing.jl
+++ b/test/capturing.jl
@@ -37,9 +37,9 @@ desc.captureObject = dev
desc.destination = MTL.MTLCaptureDestinationGPUTraceDocument
@test desc.destination == MTL.MTLCaptureDestinationGPUTraceDocument
-# Capture Manager supports destination
-@test !supports_destination(manager, MTL.MTLCaptureDestinationDeveloperTools)
-@test supports_destination(manager, MTL.MTLCaptureDestinationGPUTraceDocument)
+ # Capture Manager supports destination
+ @test !supports_destination(manager, MTL.MTLCaptureDestinationDeveloperTools)
+ @test supports_destination(manager, MTL.MTLCaptureDestinationGPUTraceDocument)
# Output URL
@test desc.outputURL === nothing
@@ -70,17 +70,17 @@ bufferA = MtlArray{Float32,1,SharedStorage}(undef, tuple(4))
@test manager.isCapturing == false
startCapture(manager, desc)
@test manager.isCapturing
-@test_throws "Capture manager is already capturing." startCapture(manager, desc)
+ @test_throws "Capture manager is already capturing." startCapture(manager, desc)
Metal.@sync @metal threads=4 tester(bufferA)
stopCapture(manager)
-synchronize()
-if manager.isCapturing
- start = Dates.now()
- while manager.isCapturing
- end
- endtime = Dates.now()
- @info endtime - start
-end
+ synchronize()
+ if manager.isCapturing
+ start = Dates.now()
+ while manager.isCapturing
+ end
+ endtime = Dates.now()
+ @info endtime - start
+ end
@test manager.isCapturing == false
@test isdir(path)
release(new_scope) |
f695de3 to
7804676
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Metal Benchmarks
| Benchmark suite | Current: 14d3d0e | Previous: 822abde | Ratio |
|---|---|---|---|
latency/precompile |
25011252458 ns |
24961495792 ns |
1.00 |
latency/ttfp |
2125622750 ns |
2126570334 ns |
1.00 |
latency/import |
1219892458 ns |
1221863250 ns |
1.00 |
integration/metaldevrt |
835667 ns |
920646 ns |
0.91 |
integration/byval/slices=1 |
1549167 ns |
1636875 ns |
0.95 |
integration/byval/slices=3 |
9198916 ns |
8533229 ns |
1.08 |
integration/byval/reference |
1536583 ns |
1625125 ns |
0.95 |
integration/byval/slices=2 |
2564729 ns |
2708917 ns |
0.95 |
kernel/indexing |
574458.5 ns |
674520.5 ns |
0.85 |
kernel/indexing_checked |
594125 ns |
692979.5 ns |
0.86 |
kernel/launch |
12042 ns |
12292 ns |
0.98 |
kernel/rand |
560792 ns |
623458 ns |
0.90 |
array/construct |
6125 ns |
6250 ns |
0.98 |
array/broadcast |
419833 ns |
667000 ns |
0.63 |
array/random/randn/Float32 |
788000 ns |
822375 ns |
0.96 |
array/random/randn!/Float32 |
614583 ns |
627417 ns |
0.98 |
array/random/rand!/Int64 |
558208 ns |
555042 ns |
1.01 |
array/random/rand!/Float32 |
586333 ns |
588291 ns |
1.00 |
array/random/rand/Int64 |
784625 ns |
806208 ns |
0.97 |
array/random/rand/Float32 |
597083.5 ns |
649083 ns |
0.92 |
array/accumulate/Int64/1d |
1255209 ns |
1351708 ns |
0.93 |
array/accumulate/Int64/dims=1 |
1811333 ns |
1895167 ns |
0.96 |
array/accumulate/Int64/dims=2 |
2119500 ns |
2224583 ns |
0.95 |
array/accumulate/Int64/dims=1L |
11697666 ns |
11807458 ns |
0.99 |
array/accumulate/Int64/dims=2L |
9718000 ns |
9845792 ns |
0.99 |
array/accumulate/Float32/1d |
1121250 ns |
1211020.5 ns |
0.93 |
array/accumulate/Float32/dims=1 |
1529625 ns |
1623125 ns |
0.94 |
array/accumulate/Float32/dims=2 |
1843875 ns |
1913375 ns |
0.96 |
array/accumulate/Float32/dims=1L |
9757021 ns |
10039875 ns |
0.97 |
array/accumulate/Float32/dims=2L |
7335458 ns |
7298229.5 ns |
1.01 |
array/reductions/reduce/Int64/1d |
1437792 ns |
1341854 ns |
1.07 |
array/reductions/reduce/Int64/dims=1 |
1077667 ns |
1141520.5 ns |
0.94 |
array/reductions/reduce/Int64/dims=2 |
1168000 ns |
1277145.5 ns |
0.91 |
array/reductions/reduce/Int64/dims=1L |
2006520.5 ns |
2034687.5 ns |
0.99 |
array/reductions/reduce/Int64/dims=2L |
3417042 ns |
3549500 ns |
0.96 |
array/reductions/reduce/Float32/1d |
830500 ns |
1067125 ns |
0.78 |
array/reductions/reduce/Float32/dims=1 |
815292 ns |
883229.5 ns |
0.92 |
array/reductions/reduce/Float32/dims=2 |
739562.5 ns |
810979.5 ns |
0.91 |
array/reductions/reduce/Float32/dims=1L |
1329896 ns |
1373208.5 ns |
0.97 |
array/reductions/reduce/Float32/dims=2L |
1757854.5 ns |
1888250 ns |
0.93 |
array/reductions/mapreduce/Int64/1d |
1428667 ns |
1375438 ns |
1.04 |
array/reductions/mapreduce/Int64/dims=1 |
1078729 ns |
1176500 ns |
0.92 |
array/reductions/mapreduce/Int64/dims=2 |
1179084 ns |
1285062.5 ns |
0.92 |
array/reductions/mapreduce/Int64/dims=1L |
2002624.5 ns |
2087041 ns |
0.96 |
array/reductions/mapreduce/Int64/dims=2L |
3302958 ns |
3445167 ns |
0.96 |
array/reductions/mapreduce/Float32/1d |
1020125 ns |
1065917 ns |
0.96 |
array/reductions/mapreduce/Float32/dims=1 |
821708 ns |
892125 ns |
0.92 |
array/reductions/mapreduce/Float32/dims=2 |
752459 ns |
809583 ns |
0.93 |
array/reductions/mapreduce/Float32/dims=1L |
1319750 ns |
1359000 ns |
0.97 |
array/reductions/mapreduce/Float32/dims=2L |
1762083 ns |
1898271 ns |
0.93 |
array/private/copyto!/gpu_to_gpu |
638958 ns |
654166 ns |
0.98 |
array/private/copyto!/cpu_to_gpu |
783459 ns |
826625 ns |
0.95 |
array/private/copyto!/gpu_to_cpu |
787458.5 ns |
813104.5 ns |
0.97 |
array/private/iteration/findall/int |
1557979 ns |
1668417 ns |
0.93 |
array/private/iteration/findall/bool |
1433354 ns |
1504500 ns |
0.95 |
array/private/iteration/findfirst/int |
1681959 ns |
1934521 ns |
0.87 |
array/private/iteration/findfirst/bool |
1662229 ns |
1855687 ns |
0.90 |
array/private/iteration/scalar |
5472833 ns |
4774709 ns |
1.15 |
array/private/iteration/logical |
2512333 ns |
2574750 ns |
0.98 |
array/private/iteration/findmin/1d |
1863958 ns |
2064500 ns |
0.90 |
array/private/iteration/findmin/2d |
1516166 ns |
1624500 ns |
0.93 |
array/private/copy |
557396 ns |
550458 ns |
1.01 |
array/shared/copyto!/gpu_to_gpu |
85334 ns |
83333 ns |
1.02 |
array/shared/copyto!/cpu_to_gpu |
81833 ns |
94500 ns |
0.87 |
array/shared/copyto!/gpu_to_cpu |
83041 ns |
82958 ns |
1.00 |
array/shared/iteration/findall/int |
1586333 ns |
1643125 ns |
0.97 |
array/shared/iteration/findall/bool |
1428917 ns |
1516125 ns |
0.94 |
array/shared/iteration/findfirst/int |
1307166.5 ns |
1457083 ns |
0.90 |
array/shared/iteration/findfirst/bool |
1296229.5 ns |
1419167 ns |
0.91 |
array/shared/iteration/scalar |
206083 ns |
159229.5 ns |
1.29 |
array/shared/iteration/logical |
2444666 ns |
2361416 ns |
1.04 |
array/shared/iteration/findmin/1d |
1384417 ns |
1620625 ns |
0.85 |
array/shared/iteration/findmin/2d |
1511292 ns |
1622208 ns |
0.93 |
array/shared/copy |
249667 ns |
245750 ns |
1.02 |
array/permutedims/4d |
2360833 ns |
2439396 ns |
0.97 |
array/permutedims/2d |
1138104.5 ns |
1226959 ns |
0.93 |
array/permutedims/3d |
1649667 ns |
1746270.5 ns |
0.94 |
metal/synchronization/stream |
19292 ns |
14833 ns |
1.30 |
metal/synchronization/context |
20375 ns |
15334 ns |
1.33 |
This comment was automatically generated by workflow using github-action-benchmark.
Codecov Report❌ Patch coverage is
Additional details and impacted files@@ Coverage Diff @@
## main #690 +/- ##
==========================================
+ Coverage 80.89% 81.19% +0.30%
==========================================
Files 62 62
Lines 2790 2808 +18
==========================================
+ Hits 2257 2280 +23
+ Misses 533 528 -5 ☔ View full report in Codecov by Sentry. 🚀 New features to boost your workflow:
|
5c77cfd to
63af0ed
Compare
63af0ed to
2a7c978
Compare
|
I suspect The current spinlock is a potential option. Another is maybe ensuring we synchronize after @maleadt any thoughts? |
Seems like the cleaner solution, no? |
|
Unfortunately synchronizing doesn't work. With 14d3d0e locally, when it fails, it usually takes ~90 ms for capture to stop. Should I just use a spinlock? |
|
Yeah, that seems reasonable. Do add some code comments about the reason though. |
Not ready to merge as I haven't changed anything since #645 but leaving it up.
I'm considering maybe making it an experimental local preference that's off by default and then adding a special CI to try and catch failures.