|
8 | 8 |
|
9 | 9 | #import <Metal/Metal.h> |
10 | 10 |
|
11 | | -#ifdef __cplusplus |
12 | | -#include <array> |
13 | | -#include <map> |
14 | | -#include <mutex> |
15 | | -#include <vector> |
16 | | -#endif |
17 | | - |
18 | 11 | #undef MIN |
19 | 12 | #undef MAX |
20 | 13 | #define MIN(a, b) ((a) < (b) ? (a) : (b)) |
@@ -5878,17 +5871,307 @@ static enum ggml_status ggml_metal_graph_compute( |
5878 | 5871 |
|
5879 | 5872 | // backend interface |
5880 | 5873 |
|
| 5874 | +// Metal equivalent of ggml_tensor_extra_gpu |
| 5875 | +struct ggml_tensor_extra_metal { |
| 5876 | + // Metal buffer per device; the current implementation exposes a single device, |
| 5877 | + // but the array layout mirrors ggml_tensor_extra_gpu in the CUDA backend |
| 5878 | + id<MTLBuffer> data_device[1]; |
| 5879 | +}; |
| 5880 | + |
| 5881 | +// Buffer type context |
| 5882 | +struct ggml_backend_metal_split_buffer_type_context { |
| 5883 | + int main_device; |
| 5884 | + std::array<float, 1> tensor_split; // Metal only supports one device, but keeping array for API consistency |
| 5885 | + std::string name; |
| 5886 | +}; |
| 5887 | + |
| 5888 | +// Buffer context |
| 5889 | +struct ggml_backend_metal_split_buffer_context { |
| 5890 | + ~ggml_backend_metal_split_buffer_context() { |
| 5891 | + for (ggml_tensor_extra_metal * extra : tensor_extras) { |
| 5892 | + // Clean up Metal buffers |
| 5893 | + if (extra->data_device[0] != nullptr) { |
| 5894 | + [extra->data_device[0] release]; |
| 5895 | + } |
| 5896 | + delete extra; |
| 5897 | + } |
| 5898 | + } |
| 5899 | + |
| 5900 | + std::vector<ggml_tensor_extra_metal *> tensor_extras; |
| 5901 | +}; |
| 5902 | + |
| 5903 | +// Tensor split calculation |
| 5904 | +static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array<float, 1> & tensor_split, int id) { |
| 5905 | + // For Metal, we only have one device, so all rows go to device 0 |
| 5906 | + if (id == 0) { |
| 5907 | + *row_low = 0; |
| 5908 | + *row_high = tensor->ne[1]; |
| 5909 | + } else { |
| 5910 | + *row_low = 0; |
| 5911 | + *row_high = 0; |
| 5912 | + } |
| 5913 | + |
| 5914 | + GGML_UNUSED(tensor_split); |
| 5915 | +} |
| 5916 | + |
| 5917 | +// Buffer free function |
| 5918 | +static void ggml_backend_metal_split_buffer_free_buffer(ggml_backend_buffer_t buffer) { |
| 5919 | + ggml_backend_metal_split_buffer_context * ctx = (ggml_backend_metal_split_buffer_context *)buffer->context; |
| 5920 | + delete ctx; |
| 5921 | +} |
| 5922 | + |
| 5923 | +// Buffer get base function |
| 5924 | +static void * ggml_backend_metal_split_buffer_get_base(ggml_backend_buffer_t buffer) { |
| 5925 | + // The pointers are stored in the tensor extras; this is just a dummy base address that is never dereferenced |
| 5926 | + return (void *)0x1000; |
| 5927 | + |
| 5928 | + GGML_UNUSED(buffer); |
| 5929 | +} |
| 5930 | + |
| 5931 | +// Buffer init tensor function |
| 5932 | +static enum ggml_status ggml_backend_metal_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { |
| 5933 | + GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported |
| 5934 | + GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors"); |
| 5935 | + |
| 5936 | + ggml_backend_metal_split_buffer_context * ctx = (ggml_backend_metal_split_buffer_context *)buffer->context; |
| 5937 | + ggml_backend_metal_split_buffer_type_context * buft_ctx = (ggml_backend_metal_split_buffer_type_context *)buffer->buft->context; |
| 5938 | + |
| 5939 | + const int64_t ne0 = tensor->ne[0]; |
| 5940 | + |
| 5941 | + ggml_tensor_extra_metal * extra = new ggml_tensor_extra_metal{}; |
| 5942 | + ctx->tensor_extras.push_back(extra); |
| 5943 | + |
| 5944 | + // For Metal, we only have one device |
| 5945 | + int id = 0; |
| 5946 | + int64_t row_low, row_high; |
| 5947 | + get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id); |
| 5948 | + |
| 5949 | + int64_t nrows_split = row_high - row_low; |
| 5950 | + if (nrows_split == 0) { |
| 5951 | + tensor->extra = extra; |
| 5952 | + return GGML_STATUS_SUCCESS; |
| 5953 | + } |
| 5954 | + |
| 5955 | + size_t size = ggml_nbytes_split(tensor, nrows_split); |
| 5956 | + const size_t original_size = size; |
| 5957 | + |
| 5958 | + // Pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses |
| 5959 | + if (ne0 % MATRIX_ROW_PADDING != 0) { |
| 5960 | + size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); |
| 5961 | + } |
| 5962 | + |
| 5963 | + // Get Metal device context |
| 5964 | + struct ggml_backend_metal_device_context * ctx_dev = (struct ggml_backend_metal_device_context *)buffer->buft->device->context; |
| 5965 | + id<MTLDevice> device = ctx_dev->mtl_device; |
| 5966 | + |
| 5967 | + // Allocate Metal buffer with shared storage so that [buffer contents] below is CPU-accessible (a private buffer returns NULL from contents) |
| 5968 | + extra->data_device[id] = [device newBufferWithLength:size options:MTLResourceStorageModeShared]; |
| 5969 | + |
| 5970 | + // Initialize buffer with zeros |
| 5971 | + memset([extra->data_device[id] contents], 0, size); |
| 5972 | + |
| 5973 | + tensor->extra = extra; |
| 5974 | + return GGML_STATUS_SUCCESS; |
| 5975 | +} |
| 5976 | + |
| 5977 | +// Buffer set tensor function |
| 5978 | +static void ggml_backend_metal_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { |
| 5979 | + // Split tensors must always be set in their entirety at once |
| 5980 | + GGML_ASSERT(offset == 0); |
| 5981 | + GGML_ASSERT(size == ggml_nbytes(tensor)); |
| 5982 | + GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors"); |
| 5983 | + |
| 5984 | + ggml_backend_metal_split_buffer_type_context * buft_ctx = (ggml_backend_metal_split_buffer_type_context *)buffer->buft->context; |
| 5985 | + |
| 5986 | + const int64_t ne0 = tensor->ne[0]; |
| 5987 | + const size_t nb1 = tensor->nb[1]; |
| 5988 | + ggml_tensor_extra_metal * extra = (ggml_tensor_extra_metal *)tensor->extra; |
| 5989 | + |
| 5990 | + // For Metal, we only have one device |
| 5991 | + int id = 0; |
| 5992 | + int64_t row_low, row_high; |
| 5993 | + get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id); |
| 5994 | + |
| 5995 | + int64_t nrows_split = row_high - row_low; |
| 5996 | + if (nrows_split == 0) { |
| 5997 | + return; |
| 5998 | + } |
| 5999 | + |
| 6000 | + const size_t offset_split = row_low * nb1; |
| 6001 | + size_t alloc_size = ggml_nbytes_split(tensor, nrows_split); |
| 6002 | + const size_t original_size = alloc_size; |
| 6003 | + |
| 6004 | + // Pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses |
| 6005 | + if (ne0 % MATRIX_ROW_PADDING != 0) { |
| 6006 | + alloc_size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); |
| 6007 | + } |
| 6008 | + |
| 6009 | + const char * buf_host = (const char *)data + offset_split; |
| 6010 | + |
| 6011 | + // Copy data to Metal buffer |
| 6012 | + memcpy([extra->data_device[id] contents], buf_host, original_size); |
| 6013 | +} |
| 6014 | + |
| 6015 | +// Buffer get tensor function |
| 6016 | +static void ggml_backend_metal_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { |
| 6017 | + // Split tensors must always be retrieved in their entirety at once |
| 6018 | + GGML_ASSERT(offset == 0); |
| 6019 | + GGML_ASSERT(size == ggml_nbytes(tensor)); |
| 6020 | + GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors"); |
| 6021 | + |
| 6022 | + ggml_backend_metal_split_buffer_type_context * buft_ctx = (ggml_backend_metal_split_buffer_type_context *)buffer->buft->context; |
| 6023 | + |
| 6024 | + const int64_t ne0 = tensor->ne[0]; |
| 6025 | + const size_t nb1 = tensor->nb[1]; |
| 6026 | + ggml_tensor_extra_metal * extra = (ggml_tensor_extra_metal *)tensor->extra; |
| 6027 | + |
| 6028 | + // For Metal, we only have one device |
| 6029 | + int id = 0; |
| 6030 | + int64_t row_low, row_high; |
| 6031 | + get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id); |
| 6032 | + |
| 6033 | + int64_t nrows_split = row_high - row_low; |
| 6034 | + if (nrows_split == 0) { |
| 6035 | + return; |
| 6036 | + } |
| 6037 | + |
| 6038 | + const size_t offset_split = row_low * nb1; |
| 6039 | + size_t alloc_size = ggml_nbytes_split(tensor, nrows_split); |
| 6040 | + const size_t original_size = alloc_size; |
| 6041 | + |
| 6042 | + // Pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses |
| 6043 | + if (ne0 % MATRIX_ROW_PADDING != 0) { |
| 6044 | + alloc_size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); |
| 6045 | + } |
| 6046 | + |
| 6047 | + char * buf_host = (char *)data + offset_split; |
| 6048 | + |
| 6049 | + // Copy data from Metal buffer |
| 6050 | + memcpy(buf_host, [extra->data_device[id] contents], original_size); |
| 6051 | +} |
| 6052 | + |
| 6053 | +// Buffer clear function |
| 6054 | +static void ggml_backend_metal_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { |
| 6055 | + GGML_UNUSED(buffer); |
| 6056 | + GGML_UNUSED(value); |
| 6057 | + // Not implemented for split buffers |
| 6058 | +} |
| 6059 | + |
| 6060 | +// Buffer interface |
| 6061 | +static const ggml_backend_buffer_i ggml_backend_metal_split_buffer_interface = { |
| 6062 | + /* .free_buffer = */ ggml_backend_metal_split_buffer_free_buffer, |
| 6063 | + /* .get_base = */ ggml_backend_metal_split_buffer_get_base, |
| 6064 | + /* .init_tensor = */ ggml_backend_metal_split_buffer_init_tensor, |
| 6065 | + /* .memset_tensor = */ NULL, |
| 6066 | + /* .set_tensor = */ ggml_backend_metal_split_buffer_set_tensor, |
| 6067 | + /* .get_tensor = */ ggml_backend_metal_split_buffer_get_tensor, |
| 6068 | + /* .cpy_tensor = */ NULL, |
| 6069 | + /* .clear = */ ggml_backend_metal_split_buffer_clear, |
| 6070 | + /* .reset = */ NULL, |
| 6071 | +}; |
| 6072 | + |
| 6073 | +// Buffer type interface functions |
| 6074 | +static const char * ggml_backend_split_buffer_type_get_name(ggml_backend_buffer_type_t buft) { |
| 6075 | + ggml_backend_metal_split_buffer_type_context * ctx = (ggml_backend_metal_split_buffer_type_context *)buft->context; |
| 6076 | + return ctx->name.c_str(); |
| 6077 | +} |
| 6078 | + |
| 6079 | +static bool ggml_backend_buft_is_metal_split(ggml_backend_buffer_type_t buft) { |
| 6080 | + return buft->iface.get_name == ggml_backend_split_buffer_type_get_name; |
| 6081 | +} |
| 6082 | + |
| 6083 | +static ggml_backend_buffer_t ggml_backend_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { |
| 6084 | + // Since we don't know the exact split after rounding, we cannot allocate the device buffers at this point |
| 6085 | + // Instead, we allocate them for each tensor separately in init_tensor |
| 6086 | + // However, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated, |
| 6087 | + // as returned by get_alloc_size. This limit is enforced during tensor allocation by ggml-alloc, so it must be correct. |
| 6088 | + ggml_backend_metal_split_buffer_context * ctx = new ggml_backend_metal_split_buffer_context(); |
| 6089 | + |
| 6090 | + return ggml_backend_buffer_init(buft, ggml_backend_metal_split_buffer_interface, ctx, size); |
| 6091 | +} |
| 6092 | + |
| 6093 | +static size_t ggml_backend_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { |
| 6094 | + return 128; |
| 6095 | + |
| 6096 | + GGML_UNUSED(buft); |
| 6097 | +} |
| 6098 | + |
| 6099 | +static size_t ggml_backend_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { |
| 6100 | + ggml_backend_metal_split_buffer_type_context * ctx = (ggml_backend_metal_split_buffer_type_context *)buft->context; |
| 6101 | + GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors"); |
| 6102 | + |
| 6103 | + size_t total_size = 0; |
| 6104 | + |
| 6105 | + const int64_t ne0 = tensor->ne[0]; |
| 6106 | + |
| 6107 | + // For Metal, we only have one device |
| 6108 | + int id = 0; |
| 6109 | + int64_t row_low, row_high; |
| 6110 | + get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, id); |
| 6111 | + |
| 6112 | + int64_t nrows_split = row_high - row_low; |
| 6113 | + if (nrows_split == 0) { |
| 6114 | + return total_size; |
| 6115 | + } |
| 6116 | + |
| 6117 | + total_size += ggml_nbytes_split(tensor, nrows_split); |
| 6118 | + |
| 6119 | + // Pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses |
| 6120 | + if (ne0 % MATRIX_ROW_PADDING != 0) { |
| 6121 | + total_size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); |
| 6122 | + } |
| 6123 | + |
| 6124 | + return total_size; |
| 6125 | +} |
| 6126 | + |
| 6127 | +static bool ggml_backend_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) { |
| 6128 | + return false; |
| 6129 | + |
| 6130 | + GGML_UNUSED(buft); |
| 6131 | +} |
| 6132 | + |
| 6133 | +// Buffer type interface |
| 6134 | +static const ggml_backend_buffer_type_i ggml_backend_split_buffer_type_interface = { |
| 6135 | + /* .get_name = */ ggml_backend_split_buffer_type_get_name, |
| 6136 | + /* .alloc_buffer = */ ggml_backend_split_buffer_type_alloc_buffer, |
| 6137 | + /* .get_alignment = */ ggml_backend_split_buffer_type_get_alignment, |
| 6138 | + /* .get_max_size = */ NULL, // defaults to SIZE_MAX |
| 6139 | + /* .get_alloc_size = */ ggml_backend_split_buffer_type_get_alloc_size, |
| 6140 | + /* .is_host = */ ggml_backend_split_buffer_type_is_host, |
| 6141 | +}; |
| 6142 | + |
5881 | 6143 | GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_split_buffer_type(int main_device, const float * tensor_split) { |
5882 | 6144 | GGML_LOG_INFO("%s: creating Metal split buffer type, main_device=%d\n", __func__, main_device); |
5883 | 6145 |
|
5884 | | - // For Metal split buffer type, we return the regular Metal buffer type |
5885 | | - // since Metal currently only supports one device |
5886 | | - ggml_backend_buffer_type_t buft = ggml_backend_metal_buffer_type(); |
5887 | | - GGML_LOG_INFO("%s: returning Metal buffer type\n", __func__); |
5888 | | - return buft; |
| 6146 | + static std::mutex mutex; |
| 6147 | + std::lock_guard<std::mutex> lock(mutex); |
| 6148 | + |
| 6149 | + static std::map<std::pair<int, std::array<float, 1>>, struct ggml_backend_buffer_type> buft_map; |
| 6150 | + |
| 6151 | + std::array<float, 1> tensor_split_arr = {}; |
| 6152 | + |
| 6153 | + // For Metal, we only support one device, so we simplify the tensor split logic |
| 6154 | + tensor_split_arr[0] = 1.0f; // All tensors go to the single Metal device |
| 6155 | + |
| 6156 | + auto it = buft_map.find({main_device, tensor_split_arr}); |
| 6157 | + if (it != buft_map.end()) { |
| 6158 | + return &it->second; |
| 6159 | + } |
5889 | 6160 |
|
5890 | | - GGML_UNUSED(main_device); |
5891 | | - GGML_UNUSED(tensor_split); |
| 6161 | + auto * ctx = new ggml_backend_metal_split_buffer_type_context{ |
| 6162 | + main_device, |
| 6163 | + tensor_split_arr, |
| 6164 | + std::string("Metal_Split"), |
| 6165 | + }; |
| 6166 | + |
| 6167 | + struct ggml_backend_buffer_type buft { |
| 6168 | + /* .iface = */ ggml_backend_split_buffer_type_interface, |
| 6169 | + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_metal_reg(), main_device), |
| 6170 | + /* .context = */ ctx, |
| 6171 | + }; |
| 6172 | + |
| 6173 | + auto result = buft_map.emplace(std::make_pair(main_device, tensor_split_arr), buft); |
| 6174 | + return &result.first->second; |
5892 | 6175 | } |
5893 | 6176 |
|
5894 | 6177 |
|
@@ -6635,6 +6918,11 @@ static void ggml_metal_cleanup(void) { |
6635 | 6918 |
|
6636 | 6919 | #ifdef __cplusplus |
6637 | 6920 |
|
| 6921 | +#include <array> |
| 6922 | +#include <map> |
| 6923 | +#include <mutex> |
| 6924 | +#include <vector> |
| 6925 | + |
6638 | 6926 | #define MATRIX_ROW_PADDING 512 // As defined in CUDA implementation |
6639 | 6927 |
|
6640 | 6928 | // Metal equivalent of ggml_tensor_extra_gpu |
|
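For reference, a minimal usage sketch (not part of the commit) showing how a caller could allocate tensors through the new split buffer type via the standard ggml / ggml-alloc API. The helper name example_metal_split_alloc is hypothetical, and it assumes the ggml_backend_split_buffer_type declaration from this patch is visible through a header; on Metal the tensor_split values are effectively ignored since only one device is exposed.

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

// hypothetical example, not part of the patch
static void example_metal_split_alloc(void) {
    const float tensor_split[1] = { 1.0f };   // the single Metal device receives all rows
    ggml_backend_buffer_type_t buft = ggml_backend_split_buffer_type(/*main_device =*/ 0, tensor_split);

    struct ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead() * 8, // metadata only
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,                       // data lives in the per-tensor MTLBuffers
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4096, 4096);

    // init_tensor above allocates one MTLBuffer per tensor and stores it in tensor->extra
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);

    // set_tensor / get_tensor transfer whole tensors only (offset == 0, size == ggml_nbytes):
    // ggml_backend_tensor_set(w, host_data, 0, ggml_nbytes(w));
    (void) w;

    ggml_backend_buffer_free(buf);
    ggml_free(ctx);
}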