diff --git a/lib/mtl/events.jl b/lib/mtl/events.jl
index b3b9cb7bf..374f42857 100644
--- a/lib/mtl/events.jl
+++ b/lib/mtl/events.jl
@@ -29,6 +29,19 @@ function MTLSharedEvent(dev::MTLDevice)
     return obj
 end
 
+"""
+    waitUntilSignaledValue(ev::MTLSharedEvent, value, timeoutMS=typemax(UInt64))
+
+Invoke `waitUntilSignaledValue:timeoutMS:` on `ev` and return its `Bool` result. Both
+`value` and `timeoutMS` are passed as `UInt64`; the default timeout is `typemax(UInt64)`.
+
+NOTE(review): presumably blocks until `ev` reaches `value` and yields `false` on timeout;
+confirm against Apple's `MTLSharedEvent` documentation.
+"""
+function waitUntilSignaledValue(ev::MTLSharedEvent, value, timeoutMS=typemax(UInt64))
+    return @objc [ev::id{MTLSharedEvent} waitUntilSignaledValue:value::UInt64
+                        timeoutMS:timeoutMS::UInt64]::Bool
+end
 
 ## shared event handle
 
diff --git a/src/state.jl b/src/state.jl
index 3a0512e52..0782db68f 100644
--- a/src/state.jl
+++ b/src/state.jl
@@ -55,6 +55,17 @@ function global_queue(dev::MTLDevice)
     end::MTLCommandQueue
 end
 
+"""
+    queue_event(queue::MTLCommandQueue)::MTLSharedEvent
+
+Return the task-local `MTLSharedEvent` used to synchronize `queue`, creating it on first use.
+"""
+function queue_event(queue::MTLCommandQueue)
+    key = (:MTLSharedEvent, queue)  # one cached event per task and per queue
+    make_event() = MTLSharedEvent(queue.device)
+    return get!(make_event, task_local_storage(), key)::MTLSharedEvent
+end
+
 # TODO: Increase performance (currently ~15us)
 """
     synchronize(queue)
@@ -66,9 +77,13 @@ and simply wait for it to be completed. Since command buffers *should* execute i
 First-In-First-Out manner, this synchronizes the GPU.
 """
 @autoreleasepool function synchronize(queue::MTLCommandQueue=global_queue(device()))
+    event = queue_event(queue)
+    target = event.signaledValue + 1  # one past the event's current signaled value
     cmdbuf = MTLCommandBuffer(queue)
+    MTL.encode_signal!(cmdbuf, event, target)  # the buffer carries only this signal
     commit!(cmdbuf)
-    wait_completed(cmdbuf)
+    MTL.waitUntilSignaledValue(event, target)  # default timeout; Bool result is discarded
+    return nothing
 end
 
 """