diff --git a/ggml/include/ggml-metal.h b/ggml/include/ggml-metal.h
index a610694423483..c36bffaaec7ee 100644
--- a/ggml/include/ggml-metal.h
+++ b/ggml/include/ggml-metal.h
@@ -51,6 +51,8 @@ GGML_BACKEND_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backe
 
 GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
 
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_metal_split_buffer_type(int main_device, const float * tensor_split);
+
 // helper to check if the device supports a specific family
 // ideally, the user code should be doing these checks
 // ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 7646f3f1346a4..ad5c45f5c6410 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -533,7 +533,8 @@ bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_t
 
 bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft) {
     GGML_ASSERT(device);
-    return device->iface.supports_buft(device, buft);
+    bool res = device->iface.supports_buft(device, buft);
+    return res;
 }
 
 bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
@@ -762,7 +763,7 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, co
     return -1;
 }
 
-#if 0
+#if 1
 #define GGML_SCHED_MAX_SPLITS_DEBUG 4096
 static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS_DEBUG*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
 #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index e76fb712631e1..44e6dfc2dcff9 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -14,7 +14,7 @@
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
 // max memory buffers that can be mapped to the device
-#define GGML_METAL_MAX_BUFFERS 64
+#define GGML_METAL_MAX_BUFFERS 32
 
 // max number of MTLCommandBuffer used to submit a graph for processing
 #define GGML_METAL_MAX_COMMAND_BUFFERS 8
@@ -36,6 +36,9 @@
 // overload of MTLGPUFamilyMetal3 (not available in some environments)
 static const NSInteger MTLGPUFamilyMetal3_GGML = 5001;
 
+// Matrix row padding to avoid out-of-bounds memory accesses
+#define MATRIX_ROW_PADDING 512
+
 // initialized in ggml_backend_metal_reg
 static struct ggml_backend_reg    g_ggml_backend_metal_reg;
 static struct ggml_backend_device g_ggml_backend_metal_device;
@@ -1698,6 +1701,12 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) {
     id<MTLResidencySet> rset;
 };
 
+// Helper function to calculate tensor size for split buffers
+static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
+    // Calculate the size based on the number of rows in the split
+    return nrows_split * ggml_row_size(tensor->type, tensor->ne[0]);
+}
+
 // rset init
 static bool ggml_backend_metal_buffer_rset_init(
         struct ggml_backend_metal_buffer_context * ctx,
@@ -1841,6 +1850,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
                     return false;
             }
         case GGML_OP_NONE:
+        //case GGML_OP_NOPE:
        case GGML_OP_RESHAPE:
        case GGML_OP_VIEW:
        case GGML_OP_TRANSPOSE:
@@ -5865,6 +5875,403 @@ static enum ggml_status ggml_metal_graph_compute(
 
 // backend interface
 
+// Metal equivalent of ggml_tensor_extra_gpu
+struct ggml_tensor_extra_metal {
+    // Metal buffers for each device (Metal only supports one device in current implementation)
+    // But we'll keep the array structure for consistency with CUDA
+    id<MTLBuffer> data_device[1]; // Metal only supports one device currently
+};
+
+// Buffer type context
+struct ggml_backend_metal_split_buffer_type_context {
+    int   main_device;
+    float tensor_split[1]; // Metal only supports one device, but keeping array for API consistency
+    char * name;
+};
+
+// Buffer context
+struct ggml_backend_metal_split_buffer_context {
+    struct ggml_tensor_extra_metal ** tensor_extras;
+    size_t tensor_extras_size;
+    size_t tensor_extras_capacity;
+};
+
+// Cleanup function for buffer context
+static void ggml_backend_metal_split_buffer_context_free(struct ggml_backend_metal_split_buffer_context * ctx) {
+    if (ctx == NULL) return;
+
+    for (size_t i = 0; i < ctx->tensor_extras_size; i++) {
+        struct ggml_tensor_extra_metal * extra = ctx->tensor_extras[i];
+        if (extra != NULL) {
+            // Clean up Metal buffers
+            if (extra->data_device[0] != NULL) {
+                [extra->data_device[0] release];
+            }
+            free(extra);
+        }
+    }
+
+    free(ctx->tensor_extras);
+    free(ctx);
+}
+
+// Tensor split calculation
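+// Note: with a single Metal device, get_row_split() always assigns the full
+// row range [0, nrows) to id == 0 and an empty range to any other id,
+// regardless of the tensor_split value. For example, a tensor with
+// ne[1] = 32 rows and tensor_split[0] = 1.0f gets rows [0, 32) on device 0.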
+static void get_row_split(int64_t * row_low, int64_t * row_high, const struct ggml_tensor * tensor, const float tensor_split[1], int id) {
+    GGML_LOG_DEBUG("%s: tensor '%s', id=%d, ne[1]=%lld\n", __func__, tensor->name, id, tensor->ne[1]);
+
+    const int64_t nrows = ggml_nrows(tensor);
+
+    // For Metal, we only have one device, so all rows go to device 0
+    if (id == 0) {
+        // Use the tensor_split value to determine how much of the tensor goes to this device
+        *row_low  = id == 0 ? 0     : (int64_t)(nrows * tensor_split[id]);
+        *row_high = id == 0 ? nrows : (int64_t)(nrows * tensor_split[id]);
+        GGML_LOG_DEBUG("%s: assigning rows [%lld, %lld] to device %d\n", __func__, *row_low, *row_high, id);
+    } else {
+        *row_low  = 0;
+        *row_high = 0;
+        GGML_LOG_DEBUG("%s: device %d gets no rows\n", __func__, id);
+    }
+
+    GGML_LOG_DEBUG("%s: tensor_split[0] = %f\n", __func__, (double)tensor_split[0]);
+}
+
+// Buffer free function
+static void ggml_backend_metal_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    struct ggml_backend_metal_split_buffer_context * ctx = (struct ggml_backend_metal_split_buffer_context *)buffer->context;
+    ggml_backend_metal_split_buffer_context_free(ctx);
+}
+
+// Buffer get base function
+static void * ggml_backend_metal_split_buffer_get_base(ggml_backend_buffer_t buffer) {
+    // The pointers are stored in the tensor extras, this is just a dummy address
+    return (void *)0x1000;
+
+    GGML_UNUSED(buffer);
+}
+
+// Buffer init tensor function
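+// init_tensor allocates one shared-storage MTLBuffer per tensor, padded so that
+// a partial last row rounds up to MATRIX_ROW_PADDING elements, and zero-fills
+// the buffer when its contents pointer is accessible. The extra is tracked in
+// the split buffer context so it can be released in free_buffer.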
+static enum ggml_status ggml_backend_metal_split_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+    GGML_ASSERT(tensor->view_src == NULL); // views of split tensors are not supported
+    GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
+
+    GGML_LOG_DEBUG("%s: initializing tensor '%s' with %d dimensions [%lld, %lld, %lld, %lld]\n",
+                   __func__, tensor->name, ggml_n_dims(tensor),
+                   tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
+
+    struct ggml_backend_metal_split_buffer_context      * ctx      = (struct ggml_backend_metal_split_buffer_context *)buffer->context;
+    struct ggml_backend_metal_split_buffer_type_context * buft_ctx = (struct ggml_backend_metal_split_buffer_type_context *)buffer->buft->context;
+
+    const int64_t ne0 = tensor->ne[0];
+
+    struct ggml_tensor_extra_metal * extra = calloc(1, sizeof(struct ggml_tensor_extra_metal));
+    if (extra == NULL) {
+        GGML_LOG_ERROR("%s: failed to allocate tensor extra for '%s'\n", __func__, tensor->name);
+        return GGML_STATUS_ALLOC_FAILED;
+    }
+
+    // For a dynamic array, we need to manually manage the array with proper capacity tracking
+    if (ctx->tensor_extras_size >= ctx->tensor_extras_capacity) {
+        size_t new_capacity = ctx->tensor_extras_capacity == 0 ? 16 : ctx->tensor_extras_capacity * 2;
+        struct ggml_tensor_extra_metal ** new_tensor_extras = realloc(ctx->tensor_extras, new_capacity * sizeof(struct ggml_tensor_extra_metal *));
+        if (new_tensor_extras == NULL) {
+            GGML_LOG_ERROR("%s: failed to reallocate tensor_extras array\n", __func__);
+            free(extra);
+            return GGML_STATUS_ALLOC_FAILED;
+        }
+        ctx->tensor_extras          = new_tensor_extras;
+        ctx->tensor_extras_capacity = new_capacity;
+    }
+    ctx->tensor_extras[ctx->tensor_extras_size] = extra;
+    ctx->tensor_extras_size++;
+
+    // For Metal, we only have one device
+    int id = 0;
+    int64_t row_low, row_high;
+    get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
+
+    GGML_LOG_DEBUG("%s: tensor '%s' row split: low=%lld, high=%lld\n", __func__, tensor->name, row_low, row_high);
+
+    int64_t nrows_split = row_high - row_low;
+    if (nrows_split == 0) {
+        GGML_LOG_DEBUG("%s: tensor '%s' has 0 rows, skipping allocation\n", __func__, tensor->name);
+        tensor->extra = extra;
+        return GGML_STATUS_SUCCESS;
+    }
+
+    size_t size = ggml_nbytes_split(tensor, nrows_split);
+    GGML_LOG_DEBUG("%s: tensor '%s' size=%zu bytes\n", __func__, tensor->name, size);
+
+    // const size_t original_size = size; // Not used in this implementation
+
+    // Pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
+    if (ne0 % MATRIX_ROW_PADDING != 0) {
+        size_t padding = ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+        GGML_LOG_DEBUG("%s: tensor '%s' adding padding=%zu bytes\n", __func__, tensor->name, padding);
+        size += padding;
+    }
+
+    // Get Metal device context
+    struct ggml_backend_metal_device_context * ctx_dev = (struct ggml_backend_metal_device_context *)buffer->buft->device->context;
+    GGML_LOG_DEBUG("%s: tensor '%s' using Metal device: %s\n", __func__, tensor->name, ctx_dev->name);
+
+    // Allocate Metal buffer directly using ctx_dev->mtl_device
+    GGML_LOG_DEBUG("%s: tensor '%s' allocating Metal buffer with size=%zu\n", __func__, tensor->name, size);
+    extra->data_device[id] = [ctx_dev->mtl_device newBufferWithLength:size
+                                                              options:MTLResourceStorageModeShared];
+
+    if (extra->data_device[id] == nil) {
+        GGML_LOG_ERROR("%s: failed to allocate Metal buffer for tensor '%s' with size=%zu\n", __func__, tensor->name, size);
+        free(extra);
+        return GGML_STATUS_ALLOC_FAILED;
+    }
+
+    GGML_LOG_DEBUG("%s: tensor '%s' Metal buffer allocated at %p\n", __func__, tensor->name, (void *)extra->data_device[id]);
+
+    // Initialize buffer with zeros
+    GGML_LOG_DEBUG("%s: tensor '%s' initializing buffer with zeros\n", __func__, tensor->name);
+    void * bufferContents = [extra->data_device[id] contents];
+    if (bufferContents == NULL) {
+        // If we can't access the buffer contents directly, we'll skip initialization
+        // Buffers with StorageModePrivate are typically zero-initialized by Metal
+        // or will be initialized when first used
+        GGML_LOG_DEBUG("%s: Metal buffer contents is NULL for tensor '%s', skipping zero initialization\n", __func__, tensor->name);
+    } else {
+        // We can access the buffer contents directly, so initialize with zeros
+        memset(bufferContents, 0, size);
+    }
+
+    tensor->extra = extra;
+    GGML_LOG_DEBUG("%s: tensor '%s' initialization completed successfully\n", __func__, tensor->name);
+    return GGML_STATUS_SUCCESS;
+}
+
+// Buffer set tensor function
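+// set_tensor and get_tensor copy through the MTLBuffer's contents pointer and
+// both assert a whole-tensor transfer (offset == 0, size == ggml_nbytes(tensor)),
+// since only complete row slices are stored on the device.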
+static void ggml_backend_metal_split_buffer_set_tensor(
+        ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
+        const void * data, size_t offset, size_t size)
+{
+    // Must set entire tensor at once
+    GGML_ASSERT(offset == 0);
+    GGML_ASSERT(size == ggml_nbytes(tensor));
+    GGML_ASSERT(ggml_is_contiguous(tensor));
+
+    struct ggml_backend_metal_split_buffer_context      * ctx      = (struct ggml_backend_metal_split_buffer_context *) buffer->context;
+    struct ggml_backend_metal_split_buffer_type_context * buft_ctx = (struct ggml_backend_metal_split_buffer_type_context *) buffer->buft->context;
+    const int64_t ne0 = tensor->ne[0];
+    const size_t  nb1 = tensor->nb[1];
+    struct ggml_tensor_extra_metal * extra = (struct ggml_tensor_extra_metal *) tensor->extra;
+
+    // For Metal we treat id=0 as the (only) device; loop structure left in place
+    const int device_count = 1;
+    for (int id = 0; id < device_count; ++id) {
+        const float id_ = 1.0f;
+        int64_t row_low = 0, row_high = 0;
+        get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
+        int64_t nrows = row_high - row_low;
+        if (nrows <= 0) {
+            continue;
+        }
+        // Compute offset and sizes for this slice
+        const size_t offset_split = (size_t)row_low * nb1;
+        size_t original_size = ggml_nbytes_split(tensor, nrows);
+        size_t copy_size     = original_size;
+        // Pad for alignment (if needed) but we only copy original_size bytes
+        if (ne0 % MATRIX_ROW_PADDING != 0) {
+            copy_size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - (ne0 % MATRIX_ROW_PADDING));
+        }
+        const char * buf_host = (const char *)data + offset_split;
+
+        // Copy the slice into the Metal buffer (contents pointer to GPU memory)
+        memcpy([extra->data_device[id] contents], buf_host, original_size);
+        // On macOS, inform Metal that the buffer range was modified so the GPU sees the new data
+        [extra->data_device[id] didModifyRange:NSMakeRange(0, original_size)];
+    }
+}
+
+// Buffer get tensor function
+static void ggml_backend_metal_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    // Split tensors must always be retrieved in their entirety at once
+    GGML_ASSERT(offset == 0);
+    GGML_ASSERT(size == ggml_nbytes(tensor));
+    GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
+
+    struct ggml_backend_metal_split_buffer_context      * ctx      = (struct ggml_backend_metal_split_buffer_context *)buffer->context;
+    struct ggml_backend_metal_split_buffer_type_context * buft_ctx = (struct ggml_backend_metal_split_buffer_type_context *)buffer->buft->context;
+
+    const int64_t ne0 = tensor->ne[0];
+    const size_t  nb1 = tensor->nb[1];
+    struct ggml_tensor_extra_metal * extra = (struct ggml_tensor_extra_metal *)tensor->extra;
+
+    // For Metal, we only have one device
+    int id = 0;
+    int64_t row_low, row_high;
+    get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
+
+    int64_t nrows_split = row_high - row_low;
+    if (nrows_split == 0) {
+        return;
+    }
+
+    const size_t offset_split = row_low * nb1;
+    size_t alloc_size = ggml_nbytes_split(tensor, nrows_split);
+    const size_t original_size = alloc_size;
+
+    // Pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
+    if (ne0 % MATRIX_ROW_PADDING != 0) {
+        alloc_size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+    }
+
+    char * buf_host = (char *)data + offset_split;
+
+    // Copy data from Metal buffer
+    memcpy(buf_host, [extra->data_device[id] contents], original_size);
+}
+
+// Buffer clear function
+static void ggml_backend_metal_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    GGML_UNUSED(buffer);
+    GGML_UNUSED(value);
+    // Not implemented for split buffers
+}
+
+// Buffer interface
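+// The interface table below mirrors the CUDA split-buffer implementation;
+// hooks that are not needed for the single-device Metal case (memset_tensor,
+// cpy_tensor, reset) are left as NULL.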
+static const struct ggml_backend_buffer_i ggml_backend_metal_split_buffer_interface = {
+    /* .free_buffer     = */ ggml_backend_metal_split_buffer_free_buffer,
+    /* .get_base        = */ ggml_backend_metal_split_buffer_get_base,
+    /* .init_tensor     = */ ggml_backend_metal_split_buffer_init_tensor,
+    /* .memset_tensor   = */ NULL,
+    /* .set_tensor      = */ ggml_backend_metal_split_buffer_set_tensor,
+    /* .get_tensor      = */ ggml_backend_metal_split_buffer_get_tensor,
+    /* .cpy_tensor      = */ NULL,
+    /* .clear           = */ ggml_backend_metal_split_buffer_clear,
+    /* .reset           = */ NULL,
+};
+
+// Buffer type interface functions
+static const char * ggml_backend_split_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+    struct ggml_backend_metal_split_buffer_type_context * ctx = (struct ggml_backend_metal_split_buffer_type_context *)buft->context;
+    return ctx->name;
+}
+
+static bool ggml_backend_buft_is_metal_split(ggml_backend_buffer_type_t buft) {
+    return buft->iface.get_name == ggml_backend_split_buffer_type_get_name;
+}
+
+static ggml_backend_buffer_t ggml_backend_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    // Since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
+    // Instead, we allocate them for each tensor separately in init_tensor
+    // However, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
+    // as returned by get_alloc_size. This limit is enforced during tensor allocation by ggml-alloc, so it must be correct.
+    struct ggml_backend_metal_split_buffer_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_split_buffer_context));
+    if (ctx == NULL) {
+        GGML_LOG_ERROR("%s: failed to allocate split buffer context\n", __func__);
+        return NULL;
+    }
+
+    return ggml_backend_buffer_init(buft, ggml_backend_metal_split_buffer_interface, ctx, size);
+}
+
+static size_t ggml_backend_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+    return 128;
+
+    GGML_UNUSED(buft);
+}
+
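+// get_alloc_size reports the padded per-tensor size that init_tensor will later
+// request. For example, an F16 tensor with ne0 = 1000 and 8 rows needs
+// 8 * ggml_row_size(F16, 1000) = 16000 bytes plus ggml_row_size(F16, 24) = 48
+// bytes of padding (1000 % 512 == 488, 512 - 488 == 24), i.e. 16048 bytes.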
+static size_t ggml_backend_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) {
+    struct ggml_backend_metal_split_buffer_type_context * ctx = (struct ggml_backend_metal_split_buffer_type_context *)buft->context;
+    GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
+
+    size_t total_size = 0;
+
+    const int64_t ne0 = tensor->ne[0];
+
+    // For Metal, we only have one device
+    int id = 0;
+    int64_t row_low, row_high;
+    get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, id);
+
+    int64_t nrows_split = row_high - row_low;
+    if (nrows_split == 0) {
+        return total_size;
+    }
+
+    total_size += ggml_nbytes_split(tensor, nrows_split);
+
+    // Pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
+    if (ne0 % MATRIX_ROW_PADDING != 0) {
+        total_size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+    }
+
+    return total_size;
+}
+
+static bool ggml_backend_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+    return true;
+
+    GGML_UNUSED(buft);
+}
+
+// Buffer type interface
+static const struct ggml_backend_buffer_type_i ggml_backend_split_buffer_type_interface = {
+    /* .get_name       = */ ggml_backend_split_buffer_type_get_name,
+    /* .alloc_buffer   = */ ggml_backend_split_buffer_type_alloc_buffer,
+    /* .get_alignment  = */ ggml_backend_split_buffer_type_get_alignment,
+    /* .get_max_size   = */ NULL, // defaults to SIZE_MAX
+    /* .get_alloc_size = */ ggml_backend_split_buffer_type_get_alloc_size,
+    /* .is_host        = */ ggml_backend_split_buffer_type_is_host,
+};
+
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_split_buffer_type(int main_device, const float * tensor_split) {
+    GGML_LOG_INFO("%s: creating Metal split buffer type, main_device=%d\n", __func__, main_device);
+
+    // For Metal, we only support one device, so we simplify the implementation
+    // We'll just create a new buffer type context each time since Metal only has one device
+
+    struct ggml_backend_metal_split_buffer_type_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_split_buffer_type_context));
+    if (ctx == NULL) {
+        GGML_LOG_ERROR("%s: failed to allocate buffer type context\n", __func__);
+        return NULL;
+    }
+
+    ctx->main_device = main_device;
+
+    // Properly handle tensor split values
+    if (tensor_split != NULL) {
+        ctx->tensor_split[0] = tensor_split[0];
+        GGML_LOG_DEBUG("%s: tensor_split[0] = %f (from input)\n", __func__, (double)ctx->tensor_split[0]);
+    } else {
+        ctx->tensor_split[0] = 1.0f; // All tensors go to the single Metal device
+        GGML_LOG_DEBUG("%s: tensor_split[0] = %f (default)\n", __func__, (double)ctx->tensor_split[0]);
+    }
+
+    ctx->name = "Metal_Split";
+
+    // Allocate a new buffer type structure each time
+    struct ggml_backend_buffer_type * buft = calloc(1, sizeof(struct ggml_backend_buffer_type));
+    if (buft == NULL) {
+        GGML_LOG_ERROR("%s: failed to allocate buffer type\n", __func__);
+        free(ctx);
+        return NULL;
+    }
+
+    buft->iface  = ggml_backend_split_buffer_type_interface;
+    buft->device = ggml_backend_reg_dev_get(ggml_backend_metal_reg(), main_device);
+    if (buft->device == NULL) {
+        GGML_LOG_ERROR("%s: failed to get device for main_device=%d\n", __func__, main_device);
+        free(ctx);
+        free(buft);
+        return NULL;
+    }
+    buft->context = ctx;
+
+    GGML_LOG_DEBUG("%s: buffer type created successfully\n", __func__);
+
+    return buft;
+}
+
+
 static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
@@ -6507,9 +6914,14 @@ static bool ggml_backend_metal_device_supports_op(ggml_backend_dev_t dev, const
 }
 
 static bool ggml_backend_metal_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
-    return
-        buft->iface.get_name == ggml_backend_metal_buffer_type_get_name ||
-        buft->iface.get_name == ggml_backend_metal_buffer_from_ptr_type_get_name;
+    bool res =
+        buft->iface.get_name == ggml_backend_metal_buffer_type_get_name
+        ||
+        buft->iface.get_name == ggml_backend_metal_buffer_from_ptr_type_get_name
+        ||
+        buft->iface.get_name == ggml_backend_split_buffer_type_get_name;
+
+    return res;
 
     GGML_UNUSED(dev);
 }
@@ -6579,6 +6991,9 @@ static ggml_backend_dev_t ggml_backend_metal_reg_device_get(ggml_backend_reg_t r
 }
 
 static void * ggml_backend_metal_get_proc_address(ggml_backend_reg_t reg, const char * name) {
+    if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
+        return (void *)ggml_backend_split_buffer_type;
+    }
     if (strcmp(name, "ggml_backend_get_features") == 0) {
         return (void *)ggml_backend_metal_get_features;
     }
diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
index d4833068d0016..ce5c3283b5f0d 100644
--- a/ggml/src/ggml-rpc/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
@@ -33,6 +33,9 @@ namespace fs = std::filesystem;
 
+// Forward declaration for device map access
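+// The endpoint -> device map used to live as a static local inside
+// ggml_backend_rpc_add_device(); it is now owned by get_rpc_dev_map() so that
+// ggml_backend_rpc_reg_get_device_count() and ggml_backend_rpc_reg_get_device()
+// can enumerate the devices that have already been added.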
+static std::unordered_map<std::string, ggml_backend_dev_t> & get_rpc_dev_map();
+
 static constexpr size_t MAX_CHUNK_SIZE = 1024ull * 1024ull * 1024ull; // 1 GiB
 
 #ifdef _WIN32
@@ -1760,16 +1763,47 @@ static const char * ggml_backend_rpc_reg_get_name(ggml_backend_reg_t reg) {
 }
 
 static size_t ggml_backend_rpc_reg_get_device_count(ggml_backend_reg_t reg) {
-    return 0;
+    const auto & dev_map = get_rpc_dev_map();
+    return dev_map.size();
 
     GGML_UNUSED(reg);
 }
 
 static ggml_backend_dev_t ggml_backend_rpc_reg_get_device(ggml_backend_reg_t reg, size_t index) {
-    GGML_ABORT("The RPC backend does not have enumerated devices - use ggml_backend_add_device instead");
+    const auto & dev_map = get_rpc_dev_map();
+
+    if (index >= dev_map.size()) {
+        return nullptr;
+    }
+
+    // Convert unordered_map to vector to access by index
+    std::vector<ggml_backend_dev_t> devices;
+    devices.reserve(dev_map.size());
+    for (const auto & pair : dev_map) {
+        devices.push_back(pair.second);
+    }
+
+    if (index < devices.size()) {
+        return devices[index];
+    }
+
+    return nullptr;
 
     GGML_UNUSED(reg);
-    GGML_UNUSED(index);
+}
+
+static ggml_backend_buffer_type_t ggml_backend_rpc_split_buffer_type(int main_device, const float * tensor_split) {
+    // For RPC backend, we don't implement actual tensor splitting
+    // Just return the default buffer type for the main device
+    ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_rpc_reg(), main_device);
+    if (!dev) {
+        return nullptr;
+    }
+
+    // Suppress unused parameter warning
+    GGML_UNUSED(tensor_split);
+
+    return ggml_backend_dev_buffer_type(dev);
 }
 
 static void * ggml_backend_rpc_get_proc_address(ggml_backend_reg_t reg, const char * name) {
@@ -1779,6 +1813,9 @@ static void * ggml_backend_rpc_get_proc_address(ggml_backend_reg_t reg, const ch
     if (std::strcmp(name, "ggml_backend_rpc_start_server") == 0) {
         return (void *)ggml_backend_rpc_start_server;
     }
+    if (std::strcmp(name, "ggml_backend_split_buffer_type") == 0) {
+        return (void *)ggml_backend_rpc_split_buffer_type;
+    }
     return NULL;
 
     GGML_UNUSED(reg);
@@ -1801,8 +1838,14 @@ ggml_backend_reg_t ggml_backend_rpc_reg(void) {
     return &ggml_backend_rpc_reg;
 }
 
-ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint) {
+// Expose the device map for enumeration
+static std::unordered_map<std::string, ggml_backend_dev_t> & get_rpc_dev_map() {
     static std::unordered_map<std::string, ggml_backend_dev_t> dev_map;
+    return dev_map;
+}
+
+ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint) {
+    auto & dev_map = get_rpc_dev_map();
     static std::mutex mutex;
     std::lock_guard<std::mutex> lock(mutex);
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index b9e4634a7061c..2efc3ba6fad20 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -377,8 +377,13 @@ static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, llama_split_mode s
     if (ggml_backend_split_buffer_type_fn) {
         size_t dev_index = [&]() {
             auto * reg = ggml_backend_dev_backend_reg(dev);
-            for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); ++i) {
-                if (ggml_backend_reg_dev_get(reg, i) == dev) {
+            size_t reg_dev_count = ggml_backend_reg_dev_count(reg);
+            LLAMA_LOG_DEBUG("%s: device %s, reg %s, device count %zu\n", __func__, ggml_backend_dev_name(dev), ggml_backend_reg_name(reg), reg_dev_count);
+            for (size_t i = 0; i < reg_dev_count; ++i) {
+                ggml_backend_dev_t reg_dev = ggml_backend_reg_dev_get(reg, i);
+                LLAMA_LOG_DEBUG("%s: comparing device %s with reg device %s at index %zu\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_name(reg_dev), i);
+                if (reg_dev == dev) {
+                    LLAMA_LOG_DEBUG("%s: found device %s at index %zu\n", __func__, ggml_backend_dev_name(dev), i);
                     return i;
                 }
             }
diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp
index 9b9803dedabef..af152e4092278 100644
--- a/tools/llama-bench/llama-bench.cpp
+++ b/tools/llama-bench/llama-bench.cpp
@@ -20,6 +20,7 @@
 
 #include "common.h"
 #include "ggml.h"
+#include "ggml-rpc.h"
 #include "llama.h"
 
 #ifdef _WIN32
@@ -1827,6 +1828,39 @@ int main(int argc, char ** argv) {
     cmd_params params = parse_cmd_params(argc, argv);
 
+    // Register RPC devices if specified
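+    // Each endpoint from --rpc is resolved through the RPC backend's
+    // "ggml_backend_rpc_add_device" proc address and registered with
+    // ggml_backend_device_register() so the devices can be used like any
+    // other backend device.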
+    for (const auto & rpc_servers_str : params.rpc_servers) {
+        if (!rpc_servers_str.empty()) {
+            auto rpc_servers = string_split(rpc_servers_str, ',');
+            if (!rpc_servers.empty()) {
+                ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
+                if (!rpc_reg) {
+                    fprintf(stderr, "%s: failed to find RPC backend\n", __func__);
+                    return 1;
+                }
+
+                typedef ggml_backend_dev_t (*ggml_backend_rpc_add_device_t)(const char * endpoint);
+                ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn =
+                    (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
+                if (!ggml_backend_rpc_add_device_fn) {
+                    fprintf(stderr, "%s: failed to find RPC device add function\n", __func__);
+                    return 1;
+                }
+
+                // Register each RPC device
+                for (const std::string & server : rpc_servers) {
+                    ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
+                    if (dev) {
+                        ggml_backend_device_register(dev);
+                    } else {
+                        fprintf(stderr, "%s: failed to add RPC device for server '%s'\n", __func__, server.c_str());
+                        return 1;
+                    }
+                }
+            }
+        }
+    }
+
     auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
     if (!cpu_dev) {
         fprintf(stderr, "%s: error: CPU backend is not loaded\n", __func__);