@@ -31,7 +31,12 @@ using namespace executorch::backends::aoti;
 
 // Global storage for tensors and their metadata
 std::unordered_set<std::shared_ptr<Tensor>> tensors;
-std::unordered_map<Tensor*, bool> is_tensor_own_memory;
+
+// Reference counting for memory addresses
+// Maps memory address to number of tensors using it
+// Special value: NOT_OWN (-1) means tensor never owns the memory
+constexpr int32_t NOT_OWN = -1;
+std::unordered_map<void*, int32_t> memory_to_n_tensor;
 
 extern "C" {
 
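The new map replaces the per-tensor is_tensor_own_memory flag with a per-buffer reference count: owned allocations enter the map at 1, every additional view of the same buffer bumps the count, and caller-owned blobs are pinned at the NOT_OWN sentinel so they are never freed. The snippet below is a minimal standalone sketch of that contract; the release_memory helper and its direct use of free() are illustrative assumptions, not code from this patch, which spreads the same logic across the shims in the hunks that follow.

#include <cstdint>
#include <cstdlib>
#include <unordered_map>

constexpr int32_t NOT_OWN = -1;  // sentinel: the buffer is caller-owned
std::unordered_map<void*, int32_t> memory_to_n_tensor;

// Hypothetical helper (assumption, for illustration only): drop one tensor's
// claim on `ptr`. Owned buffers are freed when the last claim disappears;
// NOT_OWN buffers are never freed here.
void release_memory(void* ptr) {
  auto it = memory_to_n_tensor.find(ptr);
  if (it == memory_to_n_tensor.end() || it->second == NOT_OWN) {
    return;  // untracked or externally owned memory: nothing to free
  }
  if (--it->second == 0) {
    free(it->first);
    memory_to_n_tensor.erase(it);
  }
}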
@@ -110,7 +115,18 @@ AOTITorchError aoti_torch_create_tensor_from_blob_v2(
   // Store the tensor so it doesn't get destroyed
   tensors.insert(tensor);
   *ret_new_tensor = tensor.get();
-  is_tensor_own_memory[tensor.get()] = false;
+
+  // Check if this memory address is already being tracked
+  auto memory_it = memory_to_n_tensor.find(adjusted_data);
+  ET_CHECK_OR_RETURN_ERROR(
+      memory_it == memory_to_n_tensor.end(),
+      InvalidArgument,
+      "Memory address %p is already being tracked by another tensor",
+      adjusted_data);
+
+  // Mark this memory as NOT_OWN since tensor created from blob never owns
+  // memory
+  memory_to_n_tensor[adjusted_data] = NOT_OWN;
 
   ET_LOG(Debug, "aoti_torch_create_tensor_from_blob_v2: successfull");
   return Error::Ok;
@@ -192,59 +208,98 @@ AOTITorchError aoti_torch_empty_strided(
   // Store the tensor so it doesn't get destroyed
   tensors.insert(tensor);
   *ret_new_tensor = tensor.get();
-  is_tensor_own_memory[tensor.get()] = true;
+
+  // This tensor owns the memory it allocated, set reference count to 1
+  memory_to_n_tensor[ptr] = 1;
 
   ET_LOG(Debug, "aoti_torch_empty_strided: successfull");
   return Error::Ok;
 }
 
 AOTITorchError aoti_torch_delete_tensor_object(AOTITensorHandle tensor) {
   ET_LOG(Debug, "aoti_torch_delete_tensor_object: entered");
-  // Find tensor in the set
+
+  // Handle null tensor pointer
+  if (tensor == nullptr) {
+    ET_LOG(Debug, "aoti_torch_delete_tensor_object: null tensor");
+    return Error::Ok;
+  }
+
+  // Check if tensor exists in our tracking
+  bool found_in_tensors = false;
   for (auto it = tensors.begin(); it != tensors.end(); ++it) {
     if (it->get() == tensor) {
-      auto tensor_ptr = *it;
+      found_in_tensors = true;
+      break;
+    }
+  }
 
-      // Check ownership before cleaning up
-      auto ownership_it = is_tensor_own_memory.find(tensor);
-      bool owns_memory = (ownership_it != is_tensor_own_memory.end())
-          ? ownership_it->second
-          : false;
+  // If tensor not found in our tracking, it's invalid
+  ET_CHECK_OR_RETURN_ERROR(
+      found_in_tensors, InvalidArgument, "Didn't find tensor %p", tensor);
 
-      // Clean up ownership metadata
-      is_tensor_own_memory.erase(tensor);
+  // Find and delete the tensor
+  for (auto it = tensors.begin(); it != tensors.end(); ++it) {
+    if (it->get() == tensor) {
+      // Get the tensor before erasing
+      auto tensor_ptr = *it;
+      void* data_ptr = tensor_ptr->mutable_data_ptr();
 
-      if (owns_memory) {
-        // et tensor owns the memory; need to free it manually
-        void* data_ptr = tensor_ptr->mutable_data_ptr();
+      // Find the reference count for this memory address
+      auto memory_it = memory_to_n_tensor.find(data_ptr);
+      if (memory_it != memory_to_n_tensor.end()) {
+        int32_t ref_count = memory_it->second;
 
-        // Check if it's Metal GPU memory
-        if (metal_is_device_pointer(data_ptr)) {
-          // This is Metal GPU memory - the Metal helper will handle cleanup
-          // Metal buffers are automatically managed by ARC when the buffer is
-          // released
+        if (ref_count == NOT_OWN) {
+          // Tensor never owned the memory, skip freeing
+          // Just remove tensor from tracking
           tensors.erase(it);
           ET_LOG(
               Debug,
-              "aoti_torch_delete_tensor_object: successfull (Metal GPU memory)");
+              "aoti_torch_delete_tensor_object: tensor doesn't own memory, skipping free");
           return Error::Ok;
+        } else if (ref_count == 1) {
+          // Only current tensor using this memory, free it
+          // Check if it's Metal GPU memory
+          if (metal_is_device_pointer(data_ptr)) {
+            metal_deallocate_buffer(data_ptr);
+          } else {
+            // This is CPU memory - free immediately
+            free(data_ptr);
+            data_ptr = nullptr;
+            ET_LOG(
+                Debug, "aoti_torch_delete_tensor_object: freeing CPU memory");
+          }
+
+          // Remove from memory tracking
+          memory_to_n_tensor.erase(memory_it);
+        } else if (ref_count > 1) {
+          // Other tensors still using this memory, just decrement count
+          memory_to_n_tensor[data_ptr] = ref_count - 1;
+          ET_LOG(
+              Debug,
+              "aoti_torch_delete_tensor_object: decremented ref count from %d to %d",
+              ref_count,
+              ref_count - 1);
         }
-
-        // This is CPU memory - free immediately
-        free(data_ptr);
+      } else {
+        ET_CHECK_OR_RETURN_ERROR(
+            false,
+            Internal,
+            "Internal error: memory not found during deletion");
       }
-      // else: Don't free memory since the tensor doesn't own it
 
-      // Remove from set (this will call the destructor if it's the last
+      // Remove tensor from set (this will call the destructor if it's the last
       // reference)
       tensors.erase(it);
-      ET_LOG(
-          Debug, "aoti_torch_delete_tensor_object: successfull (CPU memory)");
+      ET_LOG(Debug, "aoti_torch_delete_tensor_object: successfull");
       return Error::Ok;
     }
   }
-  ET_LOG(Error, "Didn't find tensor %p", tensor);
-  return Error::InvalidArgument;
+
+  // This should never be reached since we found it above
+  ET_CHECK_OR_RETURN_ERROR(
+      false, Internal, "Internal error: tensor not found after validation");
 }
 
 AOTITorchError aoti_torch_copy_(
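Read together with the hunk below (which removes an aoti_torch_create_tensor_from_blob_v2 call whose argument order this sketch reuses), the deletion logic above means a blob-backed tensor is only ever untracked, never freed. A hedged usage sketch: dtype_float32 and device_type_cpu are placeholders for whatever dtype/device codes the caller already has (not names from this file), and error returns are ignored for brevity.

// Caller-owned buffer wrapped as a tensor: tracked as NOT_OWN.
std::vector<float> host_buffer(8);  // lifetime managed by the caller
int64_t sizes[2] = {2, 4};
int64_t strides[2] = {4, 1};

Tensor* t = nullptr;
aoti_torch_create_tensor_from_blob_v2(
    host_buffer.data(), /*ndim=*/2, sizes, strides, /*storage_offset=*/0,
    dtype_float32, device_type_cpu, /*device_index=*/0, &t,
    /*layout=*/0, /*opaque_metadata=*/nullptr, /*opaque_metadata_size=*/0);
// memory_to_n_tensor[host_buffer.data()] == NOT_OWN

aoti_torch_delete_tensor_object(t);
// The tensor is removed from `tensors`, but host_buffer is left untouched;
// freeing it remains the caller's responsibility.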
@@ -375,75 +430,105 @@ AOTITorchError aoti_torch__reinterpret_tensor(
       InvalidArgument,
       "aoti_torch__reinterpret_tensor failed: ret_new_tensor is null");
 
+  // Check if storage_offset is not 0 - return error if not
+  ET_CHECK_OK_OR_RETURN_ERROR(validate_storage_offset(storage_offset));
+
+  // Get the device info from the source tensor to perform device_index
+  // validation
+  int32_t device_type = 0;
+  int32_t device_index = 0;
+  ET_CHECK_OK_OR_RETURN_ERROR(aoti_torch_get_device_type(self, &device_type));
+
+  ET_CHECK_OK_OR_RETURN_ERROR(aoti_torch_get_device_index(self, &device_index));
+
+  // Ensure device_index is always 0
+  ET_CHECK_OR_RETURN_ERROR(
+      device_index == 0,
+      InvalidArgument,
+      "device_index must be 0, got: %d",
+      device_index);
+
   // Get the dtype from the source tensor
   int32_t dtype = 0;
   ET_CHECK_OK_OR_RETURN_ERROR(aoti_torch_get_dtype(self, &dtype));
 
   // Validate dtype using SupportedDTypes
   ET_CHECK_OK_OR_RETURN_ERROR(validate_dtype(dtype));
 
-  int32_t device_type = 0;
-  ET_CHECK_OK_OR_RETURN_ERROR(aoti_torch_get_device_type(self, &device_type));
+  // Get the original data pointer from the source tensor
+  void* data_ptr = self->mutable_data_ptr();
+  ET_CHECK_OR_RETURN_ERROR(
+      data_ptr != nullptr,
+      InvalidArgument,
+      "Source tensor has null data pointer");
 
-  int32_t device_index = 0;
-  ET_CHECK_OK_OR_RETURN_ERROR(aoti_torch_get_device_index(self, &device_index));
+  // Check if the given memory is in the map, if not return error
+  auto memory_it = memory_to_n_tensor.find(data_ptr);
+  ET_CHECK_OR_RETURN_ERROR(
+      memory_it != memory_to_n_tensor.end(),
+      InvalidArgument,
+      "Memory address %p is not being tracked by reference counting system",
+      data_ptr);
+
+  // Convert sizes using utility function from utils.h
+  std::vector<aten::SizesType> sizes = convert_sizes_to_vector(ndim, sizes_ptr);
+
+  // Convert strides using utility function from utils.h
+  std::vector<aten::StridesType> strides =
+      convert_strides_to_vector(ndim, sizes_ptr, strides_ptr);
+
+  // Create new tensor view that reinterprets the same memory with different
+  // shape/strides This creates a view, not a copy - the data pointer is shared
+  std::shared_ptr<Tensor> tensor = executorch::extension::from_blob(
+      data_ptr, // Reuse the same memory from source tensor
+      sizes, // New sizes with explicit SizesType
+      strides, // New strides with explicit StridesType
+      dtype_to_scalar_type(dtype) // Convert dtype with explicit type casting
+  );
 
-  // Get the base data pointer from the source tensor
-  void* base_data_ptr = self->mutable_data_ptr();
   ET_CHECK_OR_RETURN_ERROR(
-      base_data_ptr != nullptr,
+      tensor != nullptr,
       InvalidArgument,
-      "Source tensor has null data pointer");
+      "Failed to create reinterpreted tensor view");
 
-  // Calculate new tensor size in elements for logging
-  int64_t new_numel = 1;
-  for (int64_t i = 0; i < ndim; i++) {
-    new_numel *= sizes_ptr[i];
-  }
+  // Store the tensor so it doesn't get destroyed
+  tensors.insert(tensor);
 
-  ET_LOG(
-      Debug,
-      "aoti_torch__reinterpret_tensor: base_data_ptr=%p, new_numel=%lld, storage_offset=%lld",
-      base_data_ptr,
-      new_numel,
-      storage_offset);
-
-  // Create a new tensor view that shares the same underlying storage
-  // This is the correct way to implement reinterpret_tensor - as a view, not a
-  // copy
-  AOTITorchError create_err = aoti_torch_create_tensor_from_blob_v2(
-      base_data_ptr, // Same underlying data pointer
-      ndim, // New dimensions
-      sizes_ptr, // New sizes
-      strides_ptr, // New strides
-      storage_offset, // Storage offset (will be handled properly now)
-      dtype,
-      device_type,
-      device_index,
-      ret_new_tensor,
-      0, // layout (default)
-      nullptr, // opaque_metadata
-      0 // opaque_metadata_size
-  );
+  *ret_new_tensor = tensor.get();
 
-  if (create_err != Error::Ok) {
-    ET_LOG(Error, "failed to create reinterpreted tensor view");
-    return create_err;
-  }
+  // Increment the reference count for this memory address only if it is owned
+  // by tensor
+  memory_to_n_tensor[data_ptr] = memory_to_n_tensor[data_ptr] == NOT_OWN
+      ? NOT_OWN
+      : memory_to_n_tensor[data_ptr] + 1;
 
   ET_LOG(Debug, "aoti_torch__reinterpret_tensor: successfull");
   return Error::Ok;
 }
 
 // Cleanup function for clearing global state
 void cleanup_memory() {
-  is_tensor_own_memory.clear();
-  if (!tensors.empty()) {
-    ET_LOG(Error, "Warning: tensors not empty during cleanup");
+  // Use aoti_torch_delete_tensor_object to properly delete each tensor
+  // Note: We need to collect tensor pointers first since deletion modifies the
+  // set
+  std::vector<Tensor*> tensor_ptrs;
+  tensor_ptrs.reserve(tensors.size());
+  for (const auto& tensor_shared : tensors) {
+    tensor_ptrs.push_back(tensor_shared.get());
   }
 
+  // Now delete each tensor - this will modify the global tensors set
+  for (Tensor* tensor_ptr : tensor_ptrs) {
+    aoti_torch_delete_tensor_object(tensor_ptr);
+  }
+
+  // tensors set should now be empty, but ensure it's cleared
+  tensors.clear();
+
   // Clean up Metal resources
   metal_cleanup_resources();
+
+  ET_LOG(Info, "Cleared all tensors and Metal resources");
 }
 
 } // extern "C"
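Since cleanup_memory now routes every live tensor through aoti_torch_delete_tensor_object, shared buffers are released exactly once no matter how many views still exist. A hedged sketch of how a host might drive teardown; shutdown_backend and live_handles are assumptions for illustration, not part of this patch.

// Delete the handles the host still tracks, then let cleanup_memory() sweep
// anything that remains and release Metal resources.
void shutdown_backend(std::vector<Tensor*>& live_handles) {
  for (Tensor* handle : live_handles) {
    aoti_torch_delete_tensor_object(handle);  // ref counts drop; each buffer freed at most once
  }
  live_handles.clear();
  cleanup_memory();  // sweeps leftover tensors and calls metal_cleanup_resources()
}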