
Commit b53f098

Implement ggml_backend_metal_split_buffer_type so the Metal backend supports split-mode row
1 parent ae355f6 commit b53f098

File tree (2 files changed, +345 -0):

  ggml/include/ggml-metal.h
  ggml/src/ggml-metal/ggml-metal.m

ggml/include/ggml-metal.h

Lines changed: 2 additions & 0 deletions
@@ -51,6 +51,8 @@ GGML_BACKEND_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backe
 
 GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
 
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_metal_split_buffer_type(int main_device, const float * tensor_split);
+
 // helper to check if the device supports a specific family
 // ideally, the user code should be doing these checks
 // ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
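
The new declaration mirrors the CUDA backend's split buffer type entry point. A minimal usage sketch (not part of this commit; with a single Metal device the tensor_split fractions are ignored, so passing NULL is acceptable):

    ggml_backend_buffer_type_t buft = ggml_backend_metal_split_buffer_type(0, /*tensor_split=*/NULL);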

ggml/src/ggml-metal/ggml-metal.m

Lines changed: 343 additions & 0 deletions
@@ -8,6 +8,14 @@
 
 #import <Metal/Metal.h>
 
+#ifdef __cplusplus
+#include <array>
+#include <map>
+#include <mutex>
+#include <string> // std::string is used by the split buffer type context below
+#include <vector>
+#endif
+
 #undef MIN
 #undef MAX
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
@@ -1698,6 +1705,12 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) {
     id rset;
 };
 
+// Helper function to calculate tensor size for split buffers
+static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
+    // Calculate the size based on the number of rows in the split
+    return nrows_split * ggml_row_size(tensor->type, tensor->ne[0]);
+}
+
 // rset init
 static bool ggml_backend_metal_buffer_rset_init(
         struct ggml_backend_metal_buffer_context * ctx,
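
For intuition: ggml_row_size returns the byte size of one row of ne[0] elements for the given type, so this helper scales linearly in the rows owned by the split. A small worked example (values from the public ggml API; the tensor t is illustrative):

    // one F32 row of 4096 elements is 4096*4 = 16384 bytes,
    // so a 100-row split occupies 100*16384 = 1638400 bytes
    size_t bytes = ggml_nbytes_split(t, 100); // == 100 * ggml_row_size(GGML_TYPE_F32, 4096)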
@@ -6579,6 +6592,9 @@ static ggml_backend_dev_t ggml_backend_metal_reg_device_get(ggml_backend_reg_t r
 }
 
 static void * ggml_backend_metal_get_proc_address(ggml_backend_reg_t reg, const char * name) {
+    if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
+        return (void *)ggml_backend_metal_split_buffer_type;
+    }
     if (strcmp(name, "ggml_backend_get_features") == 0) {
         return (void *)ggml_backend_metal_get_features;
     }
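
Exposing the symbol under the generic "ggml_backend_split_buffer_type" name lets backend-agnostic code discover it at runtime. A minimal sketch using the public registry API (the function pointer typedef here is illustrative, not from this commit):

    typedef ggml_backend_buffer_type_t (*split_buft_fn_t)(int main_device, const float * tensor_split);

    ggml_backend_reg_t reg = ggml_backend_metal_reg();
    split_buft_fn_t fn = (split_buft_fn_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type");
    if (fn != NULL) {
        ggml_backend_buffer_type_t buft = fn(0, NULL); // split-mode row path
    }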
@@ -6599,6 +6615,333 @@ static void ggml_metal_cleanup(void) {
     ggml_backend_metal_device_rel(&g_ggml_ctx_dev_main);
 }
 
+//
+// Metal split buffer implementation
+//
+
+#ifdef __cplusplus
+
+#define MATRIX_ROW_PADDING 512 // As defined in CUDA implementation
+
+// Metal equivalent of ggml_tensor_extra_gpu
+struct ggml_tensor_extra_metal {
+    // Metal buffers for each device; Metal only supports one device in the current
+    // implementation, but we keep the array structure for consistency with CUDA
+    id<MTLBuffer> data_device[1];
+};
+
+// Buffer type context
+struct ggml_backend_metal_split_buffer_type_context {
+    int main_device;
+    std::array<float, 1> tensor_split; // single device, but keeping the array for API consistency
+    std::string name;
+};
+
+// Buffer context
+struct ggml_backend_metal_split_buffer_context {
+    ~ggml_backend_metal_split_buffer_context() {
+        for (ggml_tensor_extra_metal * extra : tensor_extras) {
+            // Clean up Metal buffers
+            if (extra->data_device[0] != nullptr) {
+                [extra->data_device[0] release];
+            }
+            delete extra;
+        }
+    }
+
+    std::vector<ggml_tensor_extra_metal *> tensor_extras;
+};
+
+// note: ggml_nbytes_split is defined earlier in this file and is reused here
+
+// Tensor split calculation
+static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array<float, 1> & tensor_split, int id) {
+    // For Metal, we only have one device, so all rows go to device 0
+    if (id == 0) {
+        *row_low  = 0;
+        *row_high = tensor->ne[1];
+    } else {
+        *row_low  = 0;
+        *row_high = 0;
+    }
+
+    GGML_UNUSED(tensor_split);
+}
+
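+// In a multi-device backend (the CUDA code this mirrors), the slice for device id
+// would instead come from the tensor_split fractions, roughly:
+//
+//     *row_low  = id == 0     ? 0             : (int64_t) (tensor->ne[1]*tensor_split[id - 1]);
+//     *row_high = id == n - 1 ? tensor->ne[1] : (int64_t) (tensor->ne[1]*tensor_split[id]);
+//
+// with the boundaries rounded for alignment; the single-device version above is the
+// degenerate case in which device 0 owns the whole [0, ne[1]) range.
+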
+// Buffer free function
+static void ggml_backend_metal_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    ggml_backend_metal_split_buffer_context * ctx = (ggml_backend_metal_split_buffer_context *)buffer->context;
+    delete ctx;
+}
+
+// Buffer get base function
+static void * ggml_backend_metal_split_buffer_get_base(ggml_backend_buffer_t buffer) {
+    // The pointers are stored in the tensor extras, this is just a dummy address
+    return (void *)0x1000;
+
+    GGML_UNUSED(buffer);
+}
+
+// Buffer init tensor function
+static enum ggml_status ggml_backend_metal_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+    GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
+    GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
+
+    ggml_backend_metal_split_buffer_context * ctx = (ggml_backend_metal_split_buffer_context *)buffer->context;
+    ggml_backend_metal_split_buffer_type_context * buft_ctx = (ggml_backend_metal_split_buffer_type_context *)buffer->buft->context;
+
+    const int64_t ne0 = tensor->ne[0];
+
+    ggml_tensor_extra_metal * extra = new ggml_tensor_extra_metal{};
+    ctx->tensor_extras.push_back(extra);
+
+    // For Metal, we only have one device
+    const int id = 0;
+    int64_t row_low, row_high;
+    get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
+
+    const int64_t nrows_split = row_high - row_low;
+    if (nrows_split == 0) {
+        tensor->extra = extra;
+        return GGML_STATUS_SUCCESS;
+    }
+
+    size_t size = ggml_nbytes_split(tensor, nrows_split);
+
+    // Pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
+    if (ne0 % MATRIX_ROW_PADDING != 0) {
+        size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+    }
+
+    // Get Metal device context
+    struct ggml_backend_metal_device_context * ctx_dev = (struct ggml_backend_metal_device_context *)buffer->buft->device->context;
+    id<MTLDevice> device = ctx_dev->mtl_device;
+
+    // Allocate the Metal buffer with shared storage: [buffer contents] returns NULL
+    // for private buffers, and init/set/get below access the memory through contents
+    extra->data_device[id] = [device newBufferWithLength:size options:MTLResourceStorageModeShared];
+    if (extra->data_device[id] == nil) {
+        GGML_LOG_ERROR("%s: failed to allocate Metal buffer of size %zu\n", __func__, size);
+        return GGML_STATUS_ALLOC_FAILED;
+    }
+
+    // Initialize buffer with zeros (this also clears the row padding)
+    memset([extra->data_device[id] contents], 0, size);
+
+    tensor->extra = extra;
+    return GGML_STATUS_SUCCESS;
+}
+
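+// Worked example of the padding above: for ne0 = 1000, 1000 % 512 = 488, so space
+// for another 512 - 488 = 24 elements is reserved and the row is laid out as if it
+// were 1024 elements wide; kernels can then read past the logical end of the last
+// row without bounds checks, matching the CUDA backend's MATRIX_ROW_PADDING.
+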
+// Buffer set tensor function
+static void ggml_backend_metal_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    // Split tensors must always be set in their entirety at once
+    GGML_ASSERT(offset == 0);
+    GGML_ASSERT(size == ggml_nbytes(tensor));
+    GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
+
+    ggml_backend_metal_split_buffer_type_context * buft_ctx = (ggml_backend_metal_split_buffer_type_context *)buffer->buft->context;
+
+    const size_t nb1 = tensor->nb[1];
+    ggml_tensor_extra_metal * extra = (ggml_tensor_extra_metal *)tensor->extra;
+
+    // For Metal, we only have one device
+    const int id = 0;
+    int64_t row_low, row_high;
+    get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
+
+    const int64_t nrows_split = row_high - row_low;
+    if (nrows_split == 0) {
+        return;
+    }
+
+    const size_t offset_split = row_low * nb1;
+    const size_t original_size = ggml_nbytes_split(tensor, nrows_split);
+
+    // note: the row padding beyond original_size was already zeroed in init_tensor
+    const char * buf_host = (const char *)data + offset_split;
+
+    // Copy data to Metal buffer
+    memcpy([extra->data_device[id] contents], buf_host, original_size);
+}
+
+// Buffer get tensor function
+static void ggml_backend_metal_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    // Split tensors must always be retrieved in their entirety at once
+    GGML_ASSERT(offset == 0);
+    GGML_ASSERT(size == ggml_nbytes(tensor));
+    GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
+
+    ggml_backend_metal_split_buffer_type_context * buft_ctx = (ggml_backend_metal_split_buffer_type_context *)buffer->buft->context;
+
+    const size_t nb1 = tensor->nb[1];
+    ggml_tensor_extra_metal * extra = (ggml_tensor_extra_metal *)tensor->extra;
+
+    // For Metal, we only have one device
+    const int id = 0;
+    int64_t row_low, row_high;
+    get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
+
+    const int64_t nrows_split = row_high - row_low;
+    if (nrows_split == 0) {
+        return;
+    }
+
+    const size_t offset_split = row_low * nb1;
+    const size_t original_size = ggml_nbytes_split(tensor, nrows_split);
+
+    char * buf_host = (char *)data + offset_split;
+
+    // Copy data from Metal buffer
+    memcpy(buf_host, [extra->data_device[id] contents], original_size);
+}
+
+// Buffer clear function
+static void ggml_backend_metal_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    // Not implemented for split buffers; tensor memory is zero-initialized in init_tensor
+    GGML_UNUSED(buffer);
+    GGML_UNUSED(value);
+}
+
+// Buffer interface
+static const ggml_backend_buffer_i ggml_backend_metal_split_buffer_interface = {
+    /* .free_buffer   = */ ggml_backend_metal_split_buffer_free_buffer,
+    /* .get_base      = */ ggml_backend_metal_split_buffer_get_base,
+    /* .init_tensor   = */ ggml_backend_metal_split_buffer_init_tensor,
+    /* .memset_tensor = */ NULL,
+    /* .set_tensor    = */ ggml_backend_metal_split_buffer_set_tensor,
+    /* .get_tensor    = */ ggml_backend_metal_split_buffer_get_tensor,
+    /* .cpy_tensor    = */ NULL,
+    /* .clear         = */ ggml_backend_metal_split_buffer_clear,
+    /* .reset         = */ NULL,
+};
+
+// Buffer type interface functions
+static const char * ggml_backend_metal_split_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+    ggml_backend_metal_split_buffer_type_context * ctx = (ggml_backend_metal_split_buffer_type_context *)buft->context;
+    return ctx->name.c_str();
+}
+
+static bool ggml_backend_buft_is_metal_split(ggml_backend_buffer_type_t buft) {
+    return buft->iface.get_name == ggml_backend_metal_split_buffer_type_get_name;
+}
+
+static ggml_backend_buffer_t ggml_backend_metal_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    // Since we don't know the exact split after rounding, we cannot allocate the device buffers at this point.
+    // Instead, we allocate them for each tensor separately in init_tensor.
+    // However, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
+    // as returned by get_alloc_size. This limit is enforced during tensor allocation by ggml-alloc, so it must be correct.
+    ggml_backend_metal_split_buffer_context * ctx = new ggml_backend_metal_split_buffer_context();
+
+    return ggml_backend_buffer_init(buft, ggml_backend_metal_split_buffer_interface, ctx, size);
+}
+
+static size_t ggml_backend_metal_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+    return 128;
+
+    GGML_UNUSED(buft);
+}
+
+static size_t ggml_backend_metal_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
+    ggml_backend_metal_split_buffer_type_context * ctx = (ggml_backend_metal_split_buffer_type_context *)buft->context;
+    GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
+
+    size_t total_size = 0;
+
+    const int64_t ne0 = tensor->ne[0];
+
+    // For Metal, we only have one device
+    const int id = 0;
+    int64_t row_low, row_high;
+    get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, id);
+
+    const int64_t nrows_split = row_high - row_low;
+    if (nrows_split == 0) {
+        return total_size;
+    }
+
+    total_size += ggml_nbytes_split(tensor, nrows_split);
+
+    // Pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
+    if (ne0 % MATRIX_ROW_PADDING != 0) {
+        total_size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+    }
+
+    return total_size;
+}
+
+static bool ggml_backend_metal_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+    return false;
+
+    GGML_UNUSED(buft);
+}
+
+// Buffer type interface
+static const ggml_backend_buffer_type_i ggml_backend_metal_split_buffer_type_interface = {
+    /* .get_name       = */ ggml_backend_metal_split_buffer_type_get_name,
+    /* .alloc_buffer   = */ ggml_backend_metal_split_buffer_type_alloc_buffer,
+    /* .get_alignment  = */ ggml_backend_metal_split_buffer_type_get_alignment,
+    /* .get_max_size   = */ NULL, // defaults to SIZE_MAX
+    /* .get_alloc_size = */ ggml_backend_metal_split_buffer_type_get_alloc_size,
+    /* .is_host        = */ ggml_backend_metal_split_buffer_type_is_host,
+};
+
+#endif // __cplusplus
+
+// Main function to create the Metal split buffer type
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_metal_split_buffer_type(int main_device, const float * tensor_split) {
+    GGML_LOG_INFO("%s: creating Metal split buffer type, main_device=%d\n", __func__, main_device);
+    GGML_UNUSED(tensor_split); // with a single Metal device the split fractions are ignored
+#ifdef __cplusplus
+    static std::mutex mutex;
+    std::lock_guard<std::mutex> lock(mutex);
+
+    static std::map<std::pair<int, std::array<float, 1>>, struct ggml_backend_buffer_type> buft_map;
+
+    std::array<float, 1> tensor_split_arr = {};
+
+    // For Metal, we only support one device, so we simplify the tensor split logic
+    tensor_split_arr[0] = 1.0f; // All tensors go to the single Metal device
+
+    auto it = buft_map.find({main_device, tensor_split_arr});
+    if (it != buft_map.end()) {
+        GGML_LOG_INFO("%s: returning existing buffer type\n", __func__);
+        return &it->second;
+    }
+
+    auto * ctx = new ggml_backend_metal_split_buffer_type_context{
+        main_device,
+        tensor_split_arr,
+        std::string("Metal_Split"),
+    };
+
+    struct ggml_backend_buffer_type buft {
+        /* .iface   = */ ggml_backend_metal_split_buffer_type_interface,
+        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_metal_reg(), main_device),
+        /* .context = */ ctx,
+    };
+
+    auto result = buft_map.emplace(std::make_pair(main_device, tensor_split_arr), buft);
+    GGML_LOG_INFO("%s: created new Metal split buffer type\n", __func__);
+    return &result.first->second;
+#else
+    // For C builds, return the regular Metal buffer type
+    GGML_LOG_INFO("%s: C build, returning regular Metal buffer type\n", __func__);
+    return ggml_backend_metal_buffer_type();
+#endif
+}
+
 // TODO: make thread-safe
 ggml_backend_reg_t ggml_backend_metal_reg(void) {
     ggml_backend_metal_device_acq(&g_ggml_ctx_dev_main);
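
Taken together, a caller could place a contiguous 2D weight into the new buffer type and upload it whole, roughly as follows (a hedged sketch against the public ggml API; the function name and tensor shapes are illustrative):

    #include <stdlib.h>
    #include "ggml.h"
    #include "ggml-alloc.h"
    #include "ggml-backend.h"
    #include "ggml-metal.h"

    static void example_split_alloc(void) {
        struct ggml_init_params ip = {
            /* .mem_size   = */ 8*ggml_tensor_overhead(),
            /* .mem_buffer = */ NULL,
            /* .no_alloc   = */ true, // tensor data lives in the backend buffer
        };
        struct ggml_context * ctx = ggml_init(ip);

        struct ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4096, 4096);

        ggml_backend_buffer_type_t buft = ggml_backend_metal_split_buffer_type(0, NULL);
        ggml_backend_buffer_t      buf  = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);

        // split tensors must be written in their entirety (offset 0, full size),
        // which routes through ggml_backend_metal_split_buffer_set_tensor above
        float * data = calloc(4096*4096, sizeof(float)); // placeholder weights
        ggml_backend_tensor_set(w, data, 0, ggml_nbytes(w));

        free(data);
        ggml_backend_buffer_free(buf);
        ggml_free(ctx);
    }

In llama.cpp, this path would be exercised by --split-mode row, which discovers the backend's "ggml_backend_split_buffer_type" through the registry lookup shown earlier.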
