Ensure metal tensors are send/sync via thread isolated command buffer map

ivarflakstad · ivarflakstad · commit 0bbf9c7c6a63 · 2025-09-08T17:21:29.000+02:00
diff --git a/candle-core/src/metal_backend/device.rs b/candle-core/src/metal_backend/device.rs
@@ -123,7 +123,7 @@ impl MetalDevice {
         if flushed {
             self.drop_unused_buffers()?
         }
-        Ok(command_buffer)
+        Ok(command_buffer.clone())
     }
 
     pub fn wait_until_completed(&self) -> Result<()> {
diff --git a/candle-core/tests/tensor_tests.rs b/candle-core/tests/tensor_tests.rs
@@ -1694,6 +1694,34 @@ test_device!(asort, asort_cpu, asort_gpu, asort_metal);
 test_device!(var, var_cpu, var_gpu, var_metal);
 test_device!(zero_dim, zero_dim_cpu, zero_dim_gpu, zero_dim_metal);
 
+/// Checks that a tensor can be moved to, and used from, other threads.
+///
+/// Each worker adds its own clone of the tensor to itself and checks the result. The
+/// workers run inside `std::thread::scope`, which joins them all before returning and
+/// panics if any of them panicked, so a failure on a worker thread fails the test.
+fn tensor_send_sync(device: &Device) -> Result<()> {
+    let tensor = Tensor::new(vec![1.0f32, 2.0, 3.0], device)?;
+
+    std::thread::scope(|scope| {
+        for _ in 0..10 {
+            let tensor = tensor.clone();
+            scope.spawn(move || {
+                let new = tensor.add(&tensor).unwrap();
+                let result: Vec<f32> = new.to_vec1().unwrap();
+                assert_eq!(result, vec![2.0f32, 4.0, 6.0]);
+            });
+        }
+    });
+
+    Ok(())
+}
+test_device!(
+    tensor_send_sync,
+    tensor_send_sync_cpu,
+    tensor_send_sync_gpu,
+    tensor_send_sync_metal
+);
+
 // There was originally a bug on the CPU implementation for randn
 // https://github.com/huggingface/candle/issues/381
 #[test]
diff --git a/candle-metal-kernels/src/lib.rs b/candle-metal-kernels/src/lib.rs
@@ -168,7 +168,7 @@ pub mod binary {
 
 #[derive(thiserror::Error, Debug)]
 pub enum MetalKernelError {
-    #[error("Could not lock kernel map: {0}")]
+    #[error("Could not lock: {0}")]
     LockError(String),
     #[error("Error while loading library: {0}")]
     LoadLibraryError(String),
diff --git a/candle-metal-kernels/src/metal_utils.rs b/candle-metal-kernels/src/metal_utils.rs
@@ -7,7 +7,13 @@ use objc2_metal::{
     MTLCreateSystemDefaultDevice, MTLDataType, MTLDevice, MTLFunction, MTLFunctionConstantValues,
     MTLLibrary, MTLResource, MTLResourceUsage, MTLSize,
 };
-use std::{collections::HashMap, ffi::c_void, ptr, sync::Arc};
+use std::{
+    collections::HashMap,
+    ffi::c_void,
+    ptr,
+    sync::{Arc, Mutex},
+    thread,
+};
 
 // Use Retained when appropriate. Gives us a more elegant way of handling memory (peaks) than autoreleasepool.
 // https://docs.rs/objc2/latest/objc2/rc/struct.Retained.html
@@ -382,6 +388,7 @@ impl BlitCommandEncoder {
 }
 
 pub type BufferMap = HashMap<(usize, MTLResourceOptions), Vec<Arc<Buffer>>>;
+type CommandBufferMap = HashMap<thread::ThreadId, CommandBuffer>;
 pub struct Commands {
     /// Single command queue for the entire device.
     command_queue: CommandQueue,
@@ -394,7 +401,7 @@ pub struct Commands {
     /// Despite what the documentation says, command buffers are NOT ordered. They are ordered
     /// for their START time, but there's no guarantee that command buffer1 will finish before
     /// command buffer2 starts (or there are metal bugs there)
-    command_buffer: CommandBuffer,
+    command_buffers: Arc<Mutex<CommandBufferMap>>,
     /// Keeps track of the current amount of compute command encoders on the current
     /// command buffer
     /// Arc, RwLock because of the interior mutability.
@@ -422,44 +429,73 @@ impl Commands {
     pub fn new(command_queue: CommandQueue) -> Result<Self, MetalKernelError> {
         let command_buffer = create_command_buffer(&command_queue)?;
         command_buffer.enqueue();
+        // Seed the per-thread map with a buffer for the thread constructing `Commands`.
+        let command_buffers = HashMap::from([(thread::current().id(), command_buffer)]);
+        let command_buffers = Arc::new(Mutex::new(command_buffers));
+
         let compute_per_buffer = match std::env::var("CANDLE_METAL_COMPUTE_PER_BUFFER") {
             Ok(val) => val.parse().unwrap_or(50),
             _ => 50,
         };
         Ok(Self {
             command_queue,
-            command_buffer,
+            command_buffers,
             command_buffer_index: 0,
             compute_per_buffer,
         })
     }
 
+    /// Returns the calling thread's current command buffer, plus a flag telling whether the
+    /// previous one was committed and replaced because `command_buffer_index` exceeded
+    /// `compute_per_buffer`.
+    ///
+    /// A thread that has no command buffer yet gets a fresh one on first use.
     pub fn command_buffer(&mut self) -> Result<(bool, CommandBuffer), MetalKernelError> {
-        let mut command_buffer = self.command_buffer.to_owned();
+        let mut command_buffers = self.command_buffers.lock()?;
+        // Only the thread that built `Commands` is pre-registered in the map; every other
+        // thread must be given its own buffer lazily instead of failing the lookup.
+        let command_buffer = match command_buffers.entry(thread::current().id()) {
+            std::collections::hash_map::Entry::Occupied(slot) => slot.into_mut(),
+            std::collections::hash_map::Entry::Vacant(slot) => {
+                slot.insert(create_command_buffer(&self.command_queue)?)
+            }
+        };
+
         let mut flushed = false;
         if self.command_buffer_index > self.compute_per_buffer {
-            self.command_buffer.commit();
-            command_buffer = create_command_buffer(&self.command_queue)?;
-            self.command_buffer = command_buffer.clone();
+            command_buffer.commit();
+            *command_buffer = create_command_buffer(&self.command_queue)?;
             self.command_buffer_index = 0;
             flushed = true;
         }
         self.command_buffer_index += 1;
-        Ok((flushed, command_buffer))
+        Ok((flushed, command_buffer.clone()))
     }
 
+    /// Commits the calling thread's command buffer, blocks until it has completed, and
+    /// installs a fresh buffer for that thread.
+    ///
+    /// Returns immediately when the calling thread has no command buffer.
+    ///
+    /// # Panics
+    /// Panics if the thread's current buffer is already committed, scheduled or completed.
     pub fn wait_until_completed(&mut self) -> Result<(), MetalKernelError> {
-        match self.command_buffer.status() {
+        let mut command_buffers = self.command_buffers.lock()?;
+        let command_buffer = match command_buffers.get_mut(&thread::current().id()) {
+            Some(command_buffer) => command_buffer,
+            // This thread never requested a command buffer, so it has nothing pending.
+            None => return Ok(()),
+        };
+        match command_buffer.status() {
             MTLCommandBufferStatus::Committed
             | MTLCommandBufferStatus::Scheduled
             | MTLCommandBufferStatus::Completed => {
                 panic!("Already committed");
             }
             _ => {}
         }
-        self.command_buffer.commit();
-        self.command_buffer.wait_until_completed();
-        self.command_buffer = create_command_buffer(&self.command_queue)?;
+        command_buffer.commit();
+        command_buffer.wait_until_completed();
+        *command_buffer = create_command_buffer(&self.command_queue)?;
 
         Ok(())
     }

Original file line number	Diff line number	Diff line change
`@@ -123,7 +123,7 @@ impl MetalDevice {`
`123`	`123`	`if flushed {`
`124`	`124`	`self.drop_unused_buffers()?`
`125`	`125`	`}`
`126`		`- Ok(command_buffer)`
	`126`	`+ Ok(command_buffer.clone())`
`127`	`127`	`}`
`128`	`128`
`129`	`129`	`pub fn wait_until_completed(&self) -> Result<()> {`