
Commit 65055f6

Merge pull request #3079 from huggingface/metal-tensor-fix-send-sync
[Metal] Ensure tensors are send/sync
2 parents: 0950959 + a7fbc63 · commit 65055f6

4 files changed, +58 -478 lines (diffs for three of the changed files are shown below)

candle-core/src/metal_backend/device.rs

Lines changed: 1 addition & 1 deletion
@@ -123,7 +123,7 @@ impl MetalDevice {
         if flushed {
             self.drop_unused_buffers()?
         }
-        Ok(command_buffer)
+        Ok(command_buffer.clone())
     }
 
     pub fn wait_until_completed(&self) -> Result<()> {
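
For context: judging by the CommandQueue type alias in candle-metal-kernels/src/metal/commands.rs below, CommandBuffer is presumably a Retained handle to an Objective-C MTLCommandBuffer, so cloning it is just a reference-count bump and both handles refer to the same underlying Metal object. The sketch below illustrates that pattern with Arc standing in for Retained; the Holder type and names are hypothetical, not candle code:

use std::sync::Arc;

// Arc stands in for objc2's Retained here: both give cheap,
// reference-counted clones that point at the same underlying object.
struct Holder {
    handle: Arc<String>,
}

impl Holder {
    // Hand out a clone instead of trying to move the stored handle out of &self.
    fn command_buffer(&self) -> Arc<String> {
        self.handle.clone()
    }
}

fn main() {
    let holder = Holder {
        handle: Arc::new("command buffer".to_string()),
    };
    let a = holder.command_buffer();
    let b = holder.command_buffer();
    // Both clones refer to the same allocation.
    assert!(Arc::ptr_eq(&a, &b));
}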

candle-core/tests/tensor_tests.rs

Lines changed: 21 additions & 0 deletions
@@ -1694,6 +1694,27 @@ test_device!(asort, asort_cpu, asort_gpu, asort_metal);
 test_device!(var, var_cpu, var_gpu, var_metal);
 test_device!(zero_dim, zero_dim_cpu, zero_dim_gpu, zero_dim_metal);
 
+fn tensor_send_sync(device: &Device) -> Result<()> {
+    let tensor = Tensor::new(vec![1.0f32, 2.0, 3.0], device)?;
+
+    for _ in 0..10 {
+        let tensor = tensor.clone();
+        std::thread::spawn(move || {
+            let new = tensor.add(&tensor).unwrap();
+            let result: Vec<f32> = new.to_vec1().unwrap();
+            assert_eq!(result, vec![2.0f32, 4.0, 6.0]);
+        });
+    }
+
+    Ok(())
+}
+test_device!(
+    tensor_send_sync,
+    tensor_send_sync_cpu,
+    tensor_send_sync_gpu,
+    tensor_send_sync_metal
+);
+
 // There was originally a bug on the CPU implementation for randn
 // https://github.com/huggingface/candle/issues/381
 #[test]
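
The new test exercises Tensor's Send + Sync bounds by cloning a tensor into spawned threads. Note that the handles are never joined, so a panic inside a worker thread would not necessarily fail the test. A standalone variant that does join the threads could look like the sketch below; it is a hedged example assuming the public candle_core API (Device::Cpu, Tensor::new, Tensor::add, Tensor::to_vec1), not part of this commit:

use candle_core::{Device, Result, Tensor};

fn main() -> Result<()> {
    let device = Device::Cpu; // on macOS, Device::new_metal(0)? exercises the Metal path
    let tensor = Tensor::new(vec![1.0f32, 2.0, 3.0], &device)?;

    // Spawn worker threads, keep the JoinHandles, and join them so that a
    // panic inside any thread propagates and fails the program.
    let handles: Vec<_> = (0..10)
        .map(|_| {
            let tensor = tensor.clone();
            std::thread::spawn(move || {
                let new = tensor.add(&tensor).unwrap();
                let result: Vec<f32> = new.to_vec1().unwrap();
                assert_eq!(result, vec![2.0f32, 4.0, 6.0]);
            })
        })
        .collect();

    for handle in handles {
        handle.join().expect("worker thread panicked");
    }
    Ok(())
}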

candle-metal-kernels/src/metal/commands.rs

Lines changed: 36 additions & 11 deletions
@@ -2,12 +2,19 @@ use crate::metal::CommandBuffer;
 use crate::MetalKernelError;
 use objc2::{rc::Retained, runtime::ProtocolObject};
 use objc2_metal::{MTLCommandBufferStatus, MTLCommandQueue, MTLCounterSet};
+use std::{
+    collections::HashMap,
+    sync::{Arc, Mutex},
+    thread,
+};
 
 // Use Retained when appropriate. Gives us a more elegant way of handling memory (peaks) than autoreleasepool.
 // https://docs.rs/objc2/latest/objc2/rc/struct.Retained.html
 pub type CommandQueue = Retained<ProtocolObject<dyn MTLCommandQueue>>;
 pub type CounterSet = Retained<ProtocolObject<dyn MTLCounterSet>>;
 
+type CommandBufferMap = HashMap<thread::ThreadId, CommandBuffer>;
+
 pub struct Commands {
     /// Single command queue for the entire device.
     command_queue: CommandQueue,
@@ -20,13 +27,15 @@ pub struct Commands {
     /// Despite what the documentation says, command buffers are NOT ordered. They are ordered
     /// for their START time, but there's no guarantee that command buffer1 will finish before
     /// command buffer2 starts (or there are metal bugs there)
-    command_buffer: CommandBuffer,
+    command_buffers: Arc<Mutex<CommandBufferMap>>,
     /// Keeps track of the current amount of compute command encoders on the current
     /// command buffer
     /// Arc, RwLock because of the interior mutability.
     command_buffer_index: usize,
     /// The maximum amount of [compute command encoder](https://developer.apple.com/documentation/metal/mtlcomputecommandencoder?language=objc) per [command buffer](https://developer.apple.com/documentation/metal/mtlcommandbuffer?language=objc)
     compute_per_buffer: usize,
+    //capture: Option<Retained<MTLCaptureManager>>,
+    //timestamp_counter_set: Option<CounterSet>,
 }
 unsafe impl Send for Commands {}
 unsafe impl Sync for Commands {}
@@ -43,44 +52,60 @@ impl Commands {
     pub fn new(command_queue: CommandQueue) -> Result<Self, MetalKernelError> {
         let command_buffer = create_command_buffer(&command_queue)?;
         command_buffer.enqueue();
+        let command_buffers = HashMap::from([(thread::current().id(), command_buffer)]);
+        let command_buffers = Arc::new(Mutex::new(command_buffers));
+
         let compute_per_buffer = match std::env::var("CANDLE_METAL_COMPUTE_PER_BUFFER") {
             Ok(val) => val.parse().unwrap_or(50),
             _ => 50,
         };
         Ok(Self {
             command_queue,
-            command_buffer,
+            command_buffers,
             command_buffer_index: 0,
             compute_per_buffer,
         })
     }
 
     pub fn command_buffer(&mut self) -> Result<(bool, CommandBuffer), MetalKernelError> {
-        let mut command_buffer = self.command_buffer.to_owned();
+        let mut command_buffers = self.command_buffers.lock()?;
+        let command_buffer =
+            command_buffers
+                .get_mut(&thread::current().id())
+                .ok_or(MetalKernelError::LockError(
+                    "Command buffer map".to_string(),
+                ))?;
+
         let mut flushed = false;
         if self.command_buffer_index > self.compute_per_buffer {
-            self.command_buffer.commit();
-            command_buffer = create_command_buffer(&self.command_queue)?;
-            self.command_buffer = command_buffer.clone();
+            command_buffer.commit();
+            *command_buffer = create_command_buffer(&self.command_queue)?;
             self.command_buffer_index = 0;
            flushed = true;
         }
         self.command_buffer_index += 1;
-        Ok((flushed, command_buffer))
+        Ok((flushed, command_buffer.clone()))
     }
 
     pub fn wait_until_completed(&mut self) -> Result<(), MetalKernelError> {
-        match self.command_buffer.status() {
+        let mut command_buffers = self.command_buffers.lock()?;
+        let command_buffer =
+            command_buffers
+                .get_mut(&thread::current().id())
+                .ok_or(MetalKernelError::LockError(
+                    "Command buffer map".to_string(),
+                ))?;
+        match command_buffer.status() {
             MTLCommandBufferStatus::Committed
             | MTLCommandBufferStatus::Scheduled
             | MTLCommandBufferStatus::Completed => {
                 panic!("Already committed");
             }
             _ => {}
         }
-        self.command_buffer.commit();
-        self.command_buffer.wait_until_completed();
-        self.command_buffer = create_command_buffer(&self.command_queue)?;
+        command_buffer.commit();
+        command_buffer.wait_until_completed();
+        *command_buffer = create_command_buffer(&self.command_queue)?;
 
         Ok(())
     }
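
The core of the change is the switch from a single shared command buffer to a thread-keyed map (Arc<Mutex<HashMap<ThreadId, CommandBuffer>>>), so each thread encodes work into its own buffer and no buffer is committed or swapped out from under another thread. Candle errors with MetalKernelError::LockError when the current thread has no entry, and presumably inserts entries for new threads elsewhere (not visible in this hunk). Below is a self-contained sketch of the same thread-keyed-map idea with a placeholder buffer type; FakeBuffer, BufferPool, and buffer_for_current_thread are hypothetical names for illustration only, not candle's actual API:

use std::collections::HashMap;
use std::sync::{Arc, Mutex};
use std::thread::{self, ThreadId};

// Placeholder for a Metal command buffer.
#[derive(Clone, Debug)]
struct FakeBuffer {
    id: usize,
}

#[derive(Clone, Default)]
struct BufferPool {
    buffers: Arc<Mutex<HashMap<ThreadId, FakeBuffer>>>,
    next_id: Arc<Mutex<usize>>,
}

impl BufferPool {
    // Get (or lazily create) the calling thread's buffer. Candle instead
    // returns an error when the entry is missing and fills the map elsewhere.
    fn buffer_for_current_thread(&self) -> FakeBuffer {
        let mut buffers = self.buffers.lock().unwrap();
        buffers
            .entry(thread::current().id())
            .or_insert_with(|| {
                let mut next = self.next_id.lock().unwrap();
                *next += 1;
                FakeBuffer { id: *next }
            })
            .clone()
    }
}

fn main() {
    let pool = BufferPool::default();
    let main_buf = pool.buffer_for_current_thread();

    let pool2 = pool.clone();
    let other_buf = thread::spawn(move || pool2.buffer_for_current_thread())
        .join()
        .unwrap();

    // Each thread sees its own buffer.
    assert_ne!(main_buf.id, other_buf.id);
    println!("main thread buffer {:?}, worker buffer {:?}", main_buf, other_buf);
}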
