diff --git a/ggml/include/ggml-metal.h b/ggml/include/ggml-metal.h
index a610694423483..c36bffaaec7ee 100644
--- a/ggml/include/ggml-metal.h
+++ b/ggml/include/ggml-metal.h
@@ -51,6 +51,8 @@ GGML_BACKEND_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backe
 
 GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
 
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_metal_split_buffer_type(int main_device, const float * tensor_split);
+
 // helper to check if the device supports a specific family
 // ideally, the user code should be doing these checks
 // ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 7646f3f1346a4..ad5c45f5c6410 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -533,7 +533,8 @@ bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_t
 
 bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft) {
     GGML_ASSERT(device);
-    return device->iface.supports_buft(device, buft);
+    bool res = device->iface.supports_buft(device, buft);
+    return res;
 }
 
 bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
@@ -762,7 +763,7 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, co
     return -1;
 }
 
-#if 0
+#if 1
 #define GGML_SCHED_MAX_SPLITS_DEBUG 4096
 static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS_DEBUG*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
 #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index e76fb712631e1..44e6dfc2dcff9 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -14,7 +14,7 @@
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
 // max memory buffers that can be mapped to the device
-#define GGML_METAL_MAX_BUFFERS 64
+#define GGML_METAL_MAX_BUFFERS 32
 
 // max number of MTLCommandBuffer used to submit a graph for processing
 #define GGML_METAL_MAX_COMMAND_BUFFERS 8
@@ -36,6 +36,9 @@
 // overload of MTLGPUFamilyMetal3 (not available in some environments)
 static const NSInteger MTLGPUFamilyMetal3_GGML = 5001;
 
+// Matrix row padding to avoid out-of-bounds memory accesses
+#define MATRIX_ROW_PADDING 512
+
 // initialized in ggml_backend_metal_reg
 static struct ggml_backend_reg    g_ggml_backend_metal_reg;
 static struct ggml_backend_device g_ggml_backend_metal_device;
@@ -1698,6 +1701,12 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) {
     id<MTLResidencySet> rset;
 };
 
+// Helper function to calculate tensor size for split buffers
+static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
+    // Calculate the size based on the number of rows in the split
+    return nrows_split * ggml_row_size(tensor->type, tensor->ne[0]);
+}
+
 // rset init
 static bool ggml_backend_metal_buffer_rset_init(
         struct ggml_backend_metal_buffer_context * ctx,
@@ -1841,6 +1850,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
                     return false;
             }
         case GGML_OP_NONE:
+        //case GGML_OP_NOPE:
        case GGML_OP_RESHAPE:
        case GGML_OP_VIEW:
        case GGML_OP_TRANSPOSE:
@@ -5865,6 +5875,403 @@ static enum ggml_status ggml_metal_graph_compute(
 
 // backend interface
 
+// Metal equivalent of ggml_tensor_extra_gpu
+struct ggml_tensor_extra_metal {
+    // Metal buffers for each device (Metal only supports one device in current implementation)
+    // But we'll keep the array structure for consistency with CUDA
+    id<MTLBuffer> data_device[1]; // Metal only supports one device currently
+};
+
+// Buffer type context
+struct ggml_backend_metal_split_buffer_type_context {
+    int   main_device;
+    float tensor_split[1]; // Metal only supports one device, but keeping array for API consistency
+    char * name;
+};
+
+// Buffer context
+struct ggml_backend_metal_split_buffer_context {
+    struct ggml_tensor_extra_metal ** tensor_extras;
+    size_t tensor_extras_size;
+    size_t tensor_extras_capacity;
+};
+
+// Cleanup function for buffer context
+static void ggml_backend_metal_split_buffer_context_free(struct ggml_backend_metal_split_buffer_context * ctx) {
+    if (ctx == NULL) return;
+
+    for (size_t i = 0; i < ctx->tensor_extras_size; i++) {
+        struct ggml_tensor_extra_metal * extra = ctx->tensor_extras[i];
+        if (extra != NULL) {
+            // Clean up Metal buffers
+            if (extra->data_device[0] != NULL) {
+                [extra->data_device[0] release];
+            }
+            free(extra);
+        }
+    }
+
+    free(ctx->tensor_extras);
+    free(ctx);
+}
+
+// Tensor split calculation
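+// Note: with a single Metal device, get_row_split() always assigns the full
+// row range [0, nrows) to id == 0 and an empty range to any other id,
+// regardless of the tensor_split value. For example, a tensor with
+// ne[1] = 32 rows and tensor_split[0] = 1.0f gets rows [0, 32) on device 0.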
+static void get_row_split(int64_t * row_low, int64_t * row_high, const struct ggml_tensor * tensor, const float tensor_split[1], int id) {
+    GGML_LOG_DEBUG("%s: tensor '%s', id=%d, ne[1]=%lld\n", __func__, tensor->name, id, tensor->ne[1]);
+
+    const int64_t nrows = ggml_nrows(tensor);
+
+    // For Metal, we only have one device, so all rows go to device 0
+    if (id == 0) {
+        // Use the tensor_split value to determine how much of the tensor goes to this device
+        *row_low  = id == 0 ? 0     : (int64_t)(nrows * tensor_split[id]);
+        *row_high = id == 0 ? nrows : (int64_t)(nrows * tensor_split[id]);
+        GGML_LOG_DEBUG("%s: assigning rows [%lld, %lld] to device %d\n", __func__, *row_low, *row_high, id);
+    } else {
+        *row_low  = 0;
+        *row_high = 0;
+        GGML_LOG_DEBUG("%s: device %d gets no rows\n", __func__, id);
+    }
+
+    GGML_LOG_DEBUG("%s: tensor_split[0] = %f\n", __func__, (double)tensor_split[0]);
+}
+
+// Buffer free function
+static void ggml_backend_metal_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    struct ggml_backend_metal_split_buffer_context * ctx = (struct ggml_backend_metal_split_buffer_context *)buffer->context;
+    ggml_backend_metal_split_buffer_context_free(ctx);
+}
+
+// Buffer get base function
+static void * ggml_backend_metal_split_buffer_get_base(ggml_backend_buffer_t buffer) {
+    // The pointers are stored in the tensor extras, this is just a dummy address
+    return (void *)0x1000;
+
+    GGML_UNUSED(buffer);
+}
+
+// Buffer init tensor function
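+// init_tensor allocates one shared-storage MTLBuffer per tensor, padded so that
+// a partial last row rounds up to MATRIX_ROW_PADDING elements, and zero-fills
+// the buffer when its contents pointer is accessible. The extra is tracked in
+// the split buffer context so it can be released in free_buffer.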
+static enum ggml_status ggml_backend_metal_split_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+    GGML_ASSERT(tensor->view_src == NULL); // views of split tensors are not supported
+    GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
+
+    GGML_LOG_DEBUG("%s: initializing tensor '%s' with %d dimensions [%lld, %lld, %lld, %lld]\n",
+                   __func__, tensor->name, ggml_n_dims(tensor),
+                   tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
+
+    struct ggml_backend_metal_split_buffer_context      * ctx      = (struct ggml_backend_metal_split_buffer_context *)buffer->context;
+    struct ggml_backend_metal_split_buffer_type_context * buft_ctx = (struct ggml_backend_metal_split_buffer_type_context *)buffer->buft->context;
+
+    const int64_t ne0 = tensor->ne[0];
+
+    struct ggml_tensor_extra_metal * extra = calloc(1, sizeof(struct ggml_tensor_extra_metal));
+    if (extra == NULL) {
+        GGML_LOG_ERROR("%s: failed to allocate tensor extra for '%s'\n", __func__, tensor->name);
+        return GGML_STATUS_ALLOC_FAILED;
+    }
+
+    // For a dynamic array, we need to manually manage the array with proper capacity tracking
+    if (ctx->tensor_extras_size >= ctx->tensor_extras_capacity) {
+        size_t new_capacity = ctx->tensor_extras_capacity == 0 ? 16 : ctx->tensor_extras_capacity * 2;
+        struct ggml_tensor_extra_metal ** new_tensor_extras = realloc(ctx->tensor_extras, new_capacity * sizeof(struct ggml_tensor_extra_metal *));
+        if (new_tensor_extras == NULL) {
+            GGML_LOG_ERROR("%s: failed to reallocate tensor_extras array\n", __func__);
+            free(extra);
+            return GGML_STATUS_ALLOC_FAILED;
+        }
+        ctx->tensor_extras          = new_tensor_extras;
+        ctx->tensor_extras_capacity = new_capacity;
+    }
+    ctx->tensor_extras[ctx->tensor_extras_size] = extra;
+    ctx->tensor_extras_size++;
+
+    // For Metal, we only have one device
+    int id = 0;
+    int64_t row_low, row_high;
+    get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
+
+    GGML_LOG_DEBUG("%s: tensor '%s' row split: low=%lld, high=%lld\n", __func__, tensor->name, row_low, row_high);
+
+    int64_t nrows_split = row_high - row_low;
+    if (nrows_split == 0) {
+        GGML_LOG_DEBUG("%s: tensor '%s' has 0 rows, skipping allocation\n", __func__, tensor->name);
+        tensor->extra = extra;
+        return GGML_STATUS_SUCCESS;
+    }
+
+    size_t size = ggml_nbytes_split(tensor, nrows_split);
+    GGML_LOG_DEBUG("%s: tensor '%s' size=%zu bytes\n", __func__, tensor->name, size);
+
+    // const size_t original_size = size; // Not used in this implementation
+
+    // Pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
+    if (ne0 % MATRIX_ROW_PADDING != 0) {
+        size_t padding = ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+        GGML_LOG_DEBUG("%s: tensor '%s' adding padding=%zu bytes\n", __func__, tensor->name, padding);
+        size += padding;
+    }
+
+    // Get Metal device context
+    struct ggml_backend_metal_device_context * ctx_dev = (struct ggml_backend_metal_device_context *)buffer->buft->device->context;
+    GGML_LOG_DEBUG("%s: tensor '%s' using Metal device: %s\n", __func__, tensor->name, ctx_dev->name);
+
+    // Allocate Metal buffer directly using ctx_dev->mtl_device
+    GGML_LOG_DEBUG("%s: tensor '%s' allocating Metal buffer with size=%zu\n", __func__, tensor->name, size);
+    extra->data_device[id] = [ctx_dev->mtl_device newBufferWithLength:size
+                                                              options:MTLResourceStorageModeShared];
+
+    if (extra->data_device[id] == nil) {
+        GGML_LOG_ERROR("%s: failed to allocate Metal buffer for tensor '%s' with size=%zu\n", __func__, tensor->name, size);
+        free(extra);
+        return GGML_STATUS_ALLOC_FAILED;
+    }
+
+    GGML_LOG_DEBUG("%s: tensor '%s' Metal buffer allocated at %p\n", __func__, tensor->name, (void *)extra->data_device[id]);
+
+    // Initialize buffer with zeros
+    GGML_LOG_DEBUG("%s: tensor '%s' initializing buffer with zeros\n", __func__, tensor->name);
+    void * bufferContents = [extra->data_device[id] contents];
+    if (bufferContents == NULL) {
+        // If we can't access the buffer contents directly, we'll skip initialization
+        // Buffers with StorageModePrivate are typically zero-initialized by Metal
+        // or will be initialized when first used
+        GGML_LOG_DEBUG("%s: Metal buffer contents is NULL for tensor '%s', skipping zero initialization\n", __func__, tensor->name);
+    } else {
+        // We can access the buffer contents directly, so initialize with zeros
+        memset(bufferContents, 0, size);
+    }
+
+    tensor->extra = extra;
+    GGML_LOG_DEBUG("%s: tensor '%s' initialization completed successfully\n", __func__, tensor->name);
+    return GGML_STATUS_SUCCESS;
+}
+
+// Buffer set tensor function
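+// set_tensor and get_tensor copy through the MTLBuffer's contents pointer and
+// both assert a whole-tensor transfer (offset == 0, size == ggml_nbytes(tensor)),
+// since only complete row slices are stored on the device.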
+static void ggml_backend_metal_split_buffer_set_tensor(
+        ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
+        const void * data, size_t offset, size_t size)
+{
+    // Must set entire tensor at once
+    GGML_ASSERT(offset == 0);
+    GGML_ASSERT(size == ggml_nbytes(tensor));
+    GGML_ASSERT(ggml_is_contiguous(tensor));
+
+    struct ggml_backend_metal_split_buffer_context      * ctx      = (struct ggml_backend_metal_split_buffer_context *) buffer->context;
+    struct ggml_backend_metal_split_buffer_type_context * buft_ctx = (struct ggml_backend_metal_split_buffer_type_context *) buffer->buft->context;
+    const int64_t ne0 = tensor->ne[0];
+    const size_t  nb1 = tensor->nb[1];
+    struct ggml_tensor_extra_metal * extra = (struct ggml_tensor_extra_metal *) tensor->extra;
+
+    // For Metal we treat id=0 as the (only) device; loop structure left in place
+    const int device_count = 1;
+    for (int id = 0; id < device_count; ++id) {
+        const float id_ = 1.0f;
+        int64_t row_low = 0, row_high = 0;
+        get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
+        int64_t nrows = row_high - row_low;
+        if (nrows <= 0) {
+            continue;
+        }
+        // Compute offset and sizes for this slice
+        const size_t offset_split = (size_t)row_low * nb1;
+        size_t original_size = ggml_nbytes_split(tensor, nrows);
+        size_t copy_size     = original_size;
+        // Pad for alignment (if needed) but we only copy original_size bytes
+        if (ne0 % MATRIX_ROW_PADDING != 0) {
+            copy_size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - (ne0 % MATRIX_ROW_PADDING));
+        }
+        const char * buf_host = (const char *)data + offset_split;
+
+        // Copy the slice into the Metal buffer (contents pointer to GPU memory)
+        memcpy([extra->data_device[id] contents], buf_host, original_size);
+        // On macOS, inform Metal that the buffer range was modified so the GPU sees the new data
+        [extra->data_device[id] didModifyRange:NSMakeRange(0, original_size)];
+    }
+}
+
+// Buffer get tensor function
+static void ggml_backend_metal_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    // Split tensors must always be retrieved in their entirety at once
+    GGML_ASSERT(offset == 0);
+    GGML_ASSERT(size == ggml_nbytes(tensor));
+    GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
+
+    struct ggml_backend_metal_split_buffer_context      * ctx      = (struct ggml_backend_metal_split_buffer_context *)buffer->context;
+    struct ggml_backend_metal_split_buffer_type_context * buft_ctx = (struct ggml_backend_metal_split_buffer_type_context *)buffer->buft->context;
+
+    const int64_t ne0 = tensor->ne[0];
+    const size_t  nb1 = tensor->nb[1];
+    struct ggml_tensor_extra_metal * extra = (struct ggml_tensor_extra_metal *)tensor->extra;
+
+    // For Metal, we only have one device
+    int id = 0;
+    int64_t row_low, row_high;
+    get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
+
+    int64_t nrows_split = row_high - row_low;
+    if (nrows_split == 0) {
+        return;
+    }
+
+    const size_t offset_split = row_low * nb1;
+    size_t alloc_size = ggml_nbytes_split(tensor, nrows_split);
+    const size_t original_size = alloc_size;
+
+    // Pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
+    if (ne0 % MATRIX_ROW_PADDING != 0) {
+        alloc_size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+    }
+
+    char * buf_host = (char *)data + offset_split;
+
+    // Copy data from Metal buffer
+    memcpy(buf_host, [extra->data_device[id] contents], original_size);
+}
+
+// Buffer clear function
+static void ggml_backend_metal_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    GGML_UNUSED(buffer);
+    GGML_UNUSED(value);
+    // Not implemented for split buffers
+}
+
+// Buffer interface
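+// The interface table below mirrors the CUDA split-buffer implementation;
+// hooks that are not needed for the single-device Metal case (memset_tensor,
+// cpy_tensor, reset) are left as NULL.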
+static const struct ggml_backend_buffer_i ggml_backend_metal_split_buffer_interface = {
+    /* .free_buffer     = */ ggml_backend_metal_split_buffer_free_buffer,
+    /* .get_base        = */ ggml_backend_metal_split_buffer_get_base,
+    /* .init_tensor     = */ ggml_backend_metal_split_buffer_init_tensor,
+    /* .memset_tensor   = */ NULL,
+    /* .set_tensor      = */ ggml_backend_metal_split_buffer_set_tensor,
+    /* .get_tensor      = */ ggml_backend_metal_split_buffer_get_tensor,
+    /* .cpy_tensor      = */ NULL,
+    /* .clear           = */ ggml_backend_metal_split_buffer_clear,
+    /* .reset           = */ NULL,
+};
+
+// Buffer type interface functions
+static const char * ggml_backend_split_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+    struct ggml_backend_metal_split_buffer_type_context * ctx = (struct ggml_backend_metal_split_buffer_type_context *)buft->context;
+    return ctx->name;
+}
+
+static bool ggml_backend_buft_is_metal_split(ggml_backend_buffer_type_t buft) {
+    return buft->iface.get_name == ggml_backend_split_buffer_type_get_name;
+}
+
+static ggml_backend_buffer_t ggml_backend_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    // Since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
+    // Instead, we allocate them for each tensor separately in init_tensor
+    // However, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
+    // as returned by get_alloc_size. This limit is enforced during tensor allocation by ggml-alloc, so it must be correct.
+    struct ggml_backend_metal_split_buffer_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_split_buffer_context));
+    if (ctx == NULL) {
+        GGML_LOG_ERROR("%s: failed to allocate split buffer context\n", __func__);
+        return NULL;
+    }
+
+    return ggml_backend_buffer_init(buft, ggml_backend_metal_split_buffer_interface, ctx, size);
+}
+
+static size_t ggml_backend_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+    return 128;
+
+    GGML_UNUSED(buft);
+}
+
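+// get_alloc_size reports the padded per-tensor size that init_tensor will later
+// request. For example, an F16 tensor with ne0 = 1000 and 8 rows needs
+// 8 * ggml_row_size(F16, 1000) = 16000 bytes plus ggml_row_size(F16, 24) = 48
+// bytes of padding (1000 % 512 == 488, 512 - 488 == 24), i.e. 16048 bytes.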
+static size_t ggml_backend_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) {
+    struct ggml_backend_metal_split_buffer_type_context * ctx = (struct ggml_backend_metal_split_buffer_type_context *)buft->context;
+    GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
+
+    size_t total_size = 0;
+
+    const int64_t ne0 = tensor->ne[0];
+
+    // For Metal, we only have one device
+    int id = 0;
+    int64_t row_low, row_high;
+    get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, id);
+
+    int64_t nrows_split = row_high - row_low;
+    if (nrows_split == 0) {
+        return total_size;
+    }
+
+    total_size += ggml_nbytes_split(tensor, nrows_split);
+
+    // Pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
+    if (ne0 % MATRIX_ROW_PADDING != 0) {
+        total_size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+    }
+
+    return total_size;
+}
+
+static bool ggml_backend_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+    return true;
+
+    GGML_UNUSED(buft);
+}
+
+// Buffer type interface
+static const struct ggml_backend_buffer_type_i ggml_backend_split_buffer_type_interface = {
+    /* .get_name       = */ ggml_backend_split_buffer_type_get_name,
+    /* .alloc_buffer   = */ ggml_backend_split_buffer_type_alloc_buffer,
+    /* .get_alignment  = */ ggml_backend_split_buffer_type_get_alignment,
+    /* .get_max_size   = */ NULL, // defaults to SIZE_MAX
+    /* .get_alloc_size = */ ggml_backend_split_buffer_type_get_alloc_size,
+    /* .is_host        = */ ggml_backend_split_buffer_type_is_host,
+};
+
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_split_buffer_type(int main_device, const float * tensor_split) {
+    GGML_LOG_INFO("%s: creating Metal split buffer type, main_device=%d\n", __func__, main_device);
+
+    // For Metal, we only support one device, so we simplify the implementation
+    // We'll just create a new buffer type context each time since Metal only has one device
+
+    struct ggml_backend_metal_split_buffer_type_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_split_buffer_type_context));
+    if (ctx == NULL) {
+        GGML_LOG_ERROR("%s: failed to allocate buffer type context\n", __func__);
+        return NULL;
+    }
+
+    ctx->main_device = main_device;
+
+    // Properly handle tensor split values
+    if (tensor_split != NULL) {
+        ctx->tensor_split[0] = tensor_split[0];
+        GGML_LOG_DEBUG("%s: tensor_split[0] = %f (from input)\n", __func__, (double)ctx->tensor_split[0]);
+    } else {
+        ctx->tensor_split[0] = 1.0f; // All tensors go to the single Metal device
+        GGML_LOG_DEBUG("%s: tensor_split[0] = %f (default)\n", __func__, (double)ctx->tensor_split[0]);
+    }
+
+    ctx->name = "Metal_Split";
+
+    // Allocate a new buffer type structure each time
+    struct ggml_backend_buffer_type * buft = calloc(1, sizeof(struct ggml_backend_buffer_type));
+    if (buft == NULL) {
+        GGML_LOG_ERROR("%s: failed to allocate buffer type\n", __func__);
+        free(ctx);
+        return NULL;
+    }
+
+    buft->iface  = ggml_backend_split_buffer_type_interface;
+    buft->device = ggml_backend_reg_dev_get(ggml_backend_metal_reg(), main_device);
+    if (buft->device == NULL) {
+        GGML_LOG_ERROR("%s: failed to get device for main_device=%d\n", __func__, main_device);
+        free(ctx);
+        free(buft);
+        return NULL;
+    }
+    buft->context = ctx;
+
+    GGML_LOG_DEBUG("%s: buffer type created successfully\n", __func__);
+
+    return buft;
+}
+
+
 static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
@@ -6507,9 +6914,14 @@ static bool ggml_backend_metal_device_supports_op(ggml_backend_dev_t dev, const
 }
 
 static bool ggml_backend_metal_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
-    return
-        buft->iface.get_name == ggml_backend_metal_buffer_type_get_name ||
-        buft->iface.get_name == ggml_backend_metal_buffer_from_ptr_type_get_name;
+    bool res =
+        buft->iface.get_name == ggml_backend_metal_buffer_type_get_name
+        ||
+        buft->iface.get_name == ggml_backend_metal_buffer_from_ptr_type_get_name
+        ||
+        buft->iface.get_name == ggml_backend_split_buffer_type_get_name;
+
+    return res;
 
     GGML_UNUSED(dev);
 }
@@ -6579,6 +6991,9 @@ static ggml_backend_dev_t ggml_backend_metal_reg_device_get(ggml_backend_reg_t r
 }
 
 static void * ggml_backend_metal_get_proc_address(ggml_backend_reg_t reg, const char * name) {
+    if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
+        return (void *)ggml_backend_split_buffer_type;
+    }
     if (strcmp(name, "ggml_backend_get_features") == 0) {
         return (void *)ggml_backend_metal_get_features;
     }
diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
index d4833068d0016..ce5c3283b5f0d 100644
--- a/ggml/src/ggml-rpc/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
@@ -33,6 +33,9 @@ namespace fs = std::filesystem;
 
+// Forward declaration for device map access
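+// The endpoint -> device map used to live as a static local inside
+// ggml_backend_rpc_add_device(); it is now owned by get_rpc_dev_map() so that
+// ggml_backend_rpc_reg_get_device_count() and ggml_backend_rpc_reg_get_device()
+// can enumerate the devices that have already been added.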
+static std::unordered_map<std::string, ggml_backend_dev_t> & get_rpc_dev_map();
+
 static constexpr size_t MAX_CHUNK_SIZE = 1024ull * 1024ull * 1024ull; // 1 GiB
 
 #ifdef _WIN32
@@ -1760,16 +1763,47 @@ static const char * ggml_backend_rpc_reg_get_name(ggml_backend_reg_t reg) {
 }
 
 static size_t ggml_backend_rpc_reg_get_device_count(ggml_backend_reg_t reg) {
-    return 0;
+    const auto & dev_map = get_rpc_dev_map();
+    return dev_map.size();
 
     GGML_UNUSED(reg);
 }
 
 static ggml_backend_dev_t ggml_backend_rpc_reg_get_device(ggml_backend_reg_t reg, size_t index) {
-    GGML_ABORT("The RPC backend does not have enumerated devices - use ggml_backend_add_device instead");
+    const auto & dev_map = get_rpc_dev_map();
+
+    if (index >= dev_map.size()) {
+        return nullptr;
+    }
+
+    // Convert unordered_map to vector to access by index
+    std::vector<ggml_backend_dev_t> devices;
+    devices.reserve(dev_map.size());
+    for (const auto & pair : dev_map) {
+        devices.push_back(pair.second);
+    }
+
+    if (index < devices.size()) {
+        return devices[index];
+    }
+
+    return nullptr;
 
     GGML_UNUSED(reg);
-    GGML_UNUSED(index);
+}
+
+static ggml_backend_buffer_type_t ggml_backend_rpc_split_buffer_type(int main_device, const float * tensor_split) {
+    // For RPC backend, we don't implement actual tensor splitting
+    // Just return the default buffer type for the main device
+    ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_rpc_reg(), main_device);
+    if (!dev) {
+        return nullptr;
+    }
+
+    // Suppress unused parameter warning
+    GGML_UNUSED(tensor_split);
+
+    return ggml_backend_dev_buffer_type(dev);
 }
 
 static void * ggml_backend_rpc_get_proc_address(ggml_backend_reg_t reg, const char * name) {
@@ -1779,6 +1813,9 @@ static void * ggml_backend_rpc_get_proc_address(ggml_backend_reg_t reg, const ch
     if (std::strcmp(name, "ggml_backend_rpc_start_server") == 0) {
         return (void *)ggml_backend_rpc_start_server;
     }
+    if (std::strcmp(name, "ggml_backend_split_buffer_type") == 0) {
+        return (void *)ggml_backend_rpc_split_buffer_type;
+    }
     return NULL;
 
     GGML_UNUSED(reg);
@@ -1801,8 +1838,14 @@ ggml_backend_reg_t ggml_backend_rpc_reg(void) {
     return &ggml_backend_rpc_reg;
 }
 
-ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint) {
+// Expose the device map for enumeration
+static std::unordered_map<std::string, ggml_backend_dev_t> & get_rpc_dev_map() {
     static std::unordered_map<std::string, ggml_backend_dev_t> dev_map;
+    return dev_map;
+}
+
+ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint) {
+    auto & dev_map = get_rpc_dev_map();
     static std::mutex mutex;
     std::lock_guard<std::mutex> lock(mutex);
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index b9e4634a7061c..2efc3ba6fad20 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -377,8 +377,13 @@ static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, llama_split_mode s
     if (ggml_backend_split_buffer_type_fn) {
         size_t dev_index = [&]() {
             auto * reg = ggml_backend_dev_backend_reg(dev);
-            for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); ++i) {
-                if (ggml_backend_reg_dev_get(reg, i) == dev) {
+            size_t reg_dev_count = ggml_backend_reg_dev_count(reg);
+            LLAMA_LOG_DEBUG("%s: device %s, reg %s, device count %zu\n", __func__, ggml_backend_dev_name(dev), ggml_backend_reg_name(reg), reg_dev_count);
+            for (size_t i = 0; i < reg_dev_count; ++i) {
+                ggml_backend_dev_t reg_dev = ggml_backend_reg_dev_get(reg, i);
+                LLAMA_LOG_DEBUG("%s: comparing device %s with reg device %s at index %zu\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_name(reg_dev), i);
+                if (reg_dev == dev) {
+                    LLAMA_LOG_DEBUG("%s: found device %s at index %zu\n", __func__, ggml_backend_dev_name(dev), i);
                     return i;
                 }
             }
diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp
index 9b9803dedabef..af152e4092278 100644
--- a/tools/llama-bench/llama-bench.cpp
+++ b/tools/llama-bench/llama-bench.cpp
@@ -20,6 +20,7 @@
 
 #include "common.h"
 #include "ggml.h"
+#include "ggml-rpc.h"
 #include "llama.h"
 
 #ifdef _WIN32
@@ -1827,6 +1828,39 @@ int main(int argc, char ** argv) {
     cmd_params params = parse_cmd_params(argc, argv);
 
+    // Register RPC devices if specified
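+    // Each endpoint from --rpc is resolved through the RPC backend's
+    // "ggml_backend_rpc_add_device" proc address and registered with
+    // ggml_backend_device_register() so the devices can be used like any
+    // other backend device.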
+    for (const auto & rpc_servers_str : params.rpc_servers) {
+        if (!rpc_servers_str.empty()) {
+            auto rpc_servers = string_split(rpc_servers_str, ',');
+            if (!rpc_servers.empty()) {
+                ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
+                if (!rpc_reg) {
+                    fprintf(stderr, "%s: failed to find RPC backend\n", __func__);
+                    return 1;
+                }
+
+                typedef ggml_backend_dev_t (*ggml_backend_rpc_add_device_t)(const char * endpoint);
+                ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn =
+                    (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
+                if (!ggml_backend_rpc_add_device_fn) {
+                    fprintf(stderr, "%s: failed to find RPC device add function\n", __func__);
+                    return 1;
+                }
+
+                // Register each RPC device
+                for (const std::string & server : rpc_servers) {
+                    ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
+                    if (dev) {
+                        ggml_backend_device_register(dev);
+                    } else {
+                        fprintf(stderr, "%s: failed to add RPC device for server '%s'\n", __func__, server.c_str());
+                        return 1;
+                    }
+                }
+            }
+        }
+    }
+
     auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
     if (!cpu_dev) {
         fprintf(stderr, "%s: error: CPU backend is not loaded\n", __func__);