Commit c5cc46b

checkpoint
1 parent e4e068a commit c5cc46b

1 file changed: +302 -14 lines changed

ggml/src/ggml-metal/ggml-metal.m

Lines changed: 302 additions & 14 deletions
@@ -8,13 +8,6 @@
 
 #import <Metal/Metal.h>
 
-#ifdef __cplusplus
-#include <array>
-#include <map>
-#include <mutex>
-#include <vector>
-#endif
-
 #undef MIN
 #undef MAX
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
@@ -5878,17 +5871,307 @@ static enum ggml_status ggml_metal_graph_compute(
 
 // backend interface
 
+// Metal equivalent of ggml_tensor_extra_gpu
+struct ggml_tensor_extra_metal {
+    // Metal buffers for each device (Metal only supports one device in current implementation)
+    // But we'll keep the array structure for consistency with CUDA
+    id<MTLBuffer> data_device[1]; // Metal only supports one device currently
+};
+
+// Buffer type context
+struct ggml_backend_metal_split_buffer_type_context {
+    int main_device;
+    std::array<float, 1> tensor_split; // Metal only supports one device, but keeping array for API consistency
+    std::string name;
+};
+
+// Buffer context
+struct ggml_backend_metal_split_buffer_context {
+    ~ggml_backend_metal_split_buffer_context() {
+        for (ggml_tensor_extra_metal * extra : tensor_extras) {
+            // Clean up Metal buffers
+            if (extra->data_device[0] != nullptr) {
+                [extra->data_device[0] release];
+            }
+            delete extra;
+        }
+    }
+
+    std::vector<ggml_tensor_extra_metal *> tensor_extras;
+};
+
+// Tensor split calculation
+static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array<float, 1> & tensor_split, int id) {
+    // For Metal, we only have one device, so all rows go to device 0
+    if (id == 0) {
+        *row_low  = 0;
+        *row_high = tensor->ne[1];
+    } else {
+        *row_low  = 0;
+        *row_high = 0;
+    }
+
+    GGML_UNUSED(tensor_split);
+}
+
+// Buffer free function
+static void ggml_backend_metal_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    ggml_backend_metal_split_buffer_context * ctx = (ggml_backend_metal_split_buffer_context *)buffer->context;
+    delete ctx;
+}
+
+// Buffer get base function
+static void * ggml_backend_metal_split_buffer_get_base(ggml_backend_buffer_t buffer) {
+    // The pointers are stored in the tensor extras, this is just a dummy address
+    return (void *)0x1000;
+
+    GGML_UNUSED(buffer);
+}
+
+// Buffer init tensor function
+static enum ggml_status ggml_backend_metal_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+    GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
+    GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
+
+    ggml_backend_metal_split_buffer_context      * ctx      = (ggml_backend_metal_split_buffer_context *)buffer->context;
+    ggml_backend_metal_split_buffer_type_context * buft_ctx = (ggml_backend_metal_split_buffer_type_context *)buffer->buft->context;
+
+    const int64_t ne0 = tensor->ne[0];
+
+    ggml_tensor_extra_metal * extra = new ggml_tensor_extra_metal{};
+    ctx->tensor_extras.push_back(extra);
+
+    // For Metal, we only have one device
+    int id = 0;
+    int64_t row_low, row_high;
+    get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
+
+    int64_t nrows_split = row_high - row_low;
+    if (nrows_split == 0) {
+        tensor->extra = extra;
+        return GGML_STATUS_SUCCESS;
+    }
+
+    size_t size = ggml_nbytes_split(tensor, nrows_split);
+    const size_t original_size = size;
+
+    // Pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
+    if (ne0 % MATRIX_ROW_PADDING != 0) {
+        size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+    }
+
+    // Get Metal device context
+    struct ggml_backend_metal_device_context * ctx_dev = (struct ggml_backend_metal_device_context *)buffer->buft->device->context;
+    id<MTLDevice> device = ctx_dev->mtl_device;
+
+    // Allocate Metal buffer
+    extra->data_device[id] = [device newBufferWithLength:size options:MTLResourceStorageModePrivate];
+
+    // Initialize buffer with zeros
+    memset([extra->data_device[id] contents], 0, size);
+
+    tensor->extra = extra;
+    return GGML_STATUS_SUCCESS;
+}
+
+// Buffer set tensor function
+static void ggml_backend_metal_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    // Split tensors must always be set in their entirety at once
+    GGML_ASSERT(offset == 0);
+    GGML_ASSERT(size == ggml_nbytes(tensor));
+    GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
+
+    ggml_backend_metal_split_buffer_type_context * buft_ctx = (ggml_backend_metal_split_buffer_type_context *)buffer->buft->context;
+
+    const int64_t ne0 = tensor->ne[0];
+    const size_t  nb1 = tensor->nb[1];
+    ggml_tensor_extra_metal * extra = (ggml_tensor_extra_metal *)tensor->extra;
+
+    // For Metal, we only have one device
+    int id = 0;
+    int64_t row_low, row_high;
+    get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
+
+    int64_t nrows_split = row_high - row_low;
+    if (nrows_split == 0) {
+        return;
+    }
+
+    const size_t offset_split = row_low * nb1;
+    size_t alloc_size = ggml_nbytes_split(tensor, nrows_split);
+    const size_t original_size = alloc_size;
+
+    // Pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
+    if (ne0 % MATRIX_ROW_PADDING != 0) {
+        alloc_size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+    }
+
+    const char * buf_host = (const char *)data + offset_split;
+
+    // Copy data to Metal buffer
+    memcpy([extra->data_device[id] contents], buf_host, original_size);
+}
+
+// Buffer get tensor function
+static void ggml_backend_metal_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    // Split tensors must always be retrieved in their entirety at once
+    GGML_ASSERT(offset == 0);
+    GGML_ASSERT(size == ggml_nbytes(tensor));
+    GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
+
+    ggml_backend_metal_split_buffer_type_context * buft_ctx = (ggml_backend_metal_split_buffer_type_context *)buffer->buft->context;
+
+    const int64_t ne0 = tensor->ne[0];
+    const size_t  nb1 = tensor->nb[1];
+    ggml_tensor_extra_metal * extra = (ggml_tensor_extra_metal *)tensor->extra;
+
+    // For Metal, we only have one device
+    int id = 0;
+    int64_t row_low, row_high;
+    get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
+
+    int64_t nrows_split = row_high - row_low;
+    if (nrows_split == 0) {
+        return;
+    }
+
+    const size_t offset_split = row_low * nb1;
+    size_t alloc_size = ggml_nbytes_split(tensor, nrows_split);
+    const size_t original_size = alloc_size;
+
+    // Pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
+    if (ne0 % MATRIX_ROW_PADDING != 0) {
+        alloc_size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+    }
+
+    char * buf_host = (char *)data + offset_split;
+
+    // Copy data from Metal buffer
+    memcpy(buf_host, [extra->data_device[id] contents], original_size);
+}
+
+// Buffer clear function
+static void ggml_backend_metal_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    GGML_UNUSED(buffer);
+    GGML_UNUSED(value);
+    // Not implemented for split buffers
+}
+
+// Buffer interface
+static const ggml_backend_buffer_i ggml_backend_metal_split_buffer_interface = {
+    /* .free_buffer   = */ ggml_backend_metal_split_buffer_free_buffer,
+    /* .get_base      = */ ggml_backend_metal_split_buffer_get_base,
+    /* .init_tensor   = */ ggml_backend_metal_split_buffer_init_tensor,
+    /* .memset_tensor = */ NULL,
+    /* .set_tensor    = */ ggml_backend_metal_split_buffer_set_tensor,
+    /* .get_tensor    = */ ggml_backend_metal_split_buffer_get_tensor,
+    /* .cpy_tensor    = */ NULL,
+    /* .clear         = */ ggml_backend_metal_split_buffer_clear,
+    /* .reset         = */ NULL,
+};
+
+// Buffer type interface functions
+static const char * ggml_backend_split_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+    ggml_backend_metal_split_buffer_type_context * ctx = (ggml_backend_metal_split_buffer_type_context *)buft->context;
+    return ctx->name.c_str();
+}
+
+static bool ggml_backend_buft_is_metal_split(ggml_backend_buffer_type_t buft) {
+    return buft->iface.get_name == ggml_backend_split_buffer_type_get_name;
+}
+
+static ggml_backend_buffer_t ggml_backend_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    // Since we don't know the exact split after rounding, we cannot allocate the device buffers at this point.
+    // Instead, we allocate them for each tensor separately in init_tensor.
+    // However, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
+    // as returned by get_alloc_size. This limit is enforced during tensor allocation by ggml-alloc, so it must be correct.
+    ggml_backend_metal_split_buffer_context * ctx = new ggml_backend_metal_split_buffer_context();
+
+    return ggml_backend_buffer_init(buft, ggml_backend_metal_split_buffer_interface, ctx, size);
+}
+
+static size_t ggml_backend_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+    return 128;
+
+    GGML_UNUSED(buft);
+}
+
+static size_t ggml_backend_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
+    ggml_backend_metal_split_buffer_type_context * ctx = (ggml_backend_metal_split_buffer_type_context *)buft->context;
+    GGML_ASSERT(ggml_is_contiguous(tensor) && "split buffers only supported for contiguous tensors");
+
+    size_t total_size = 0;
+
+    const int64_t ne0 = tensor->ne[0];
+
+    // For Metal, we only have one device
+    int id = 0;
+    int64_t row_low, row_high;
+    get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, id);
+
+    int64_t nrows_split = row_high - row_low;
+    if (nrows_split == 0) {
+        return total_size;
+    }
+
+    total_size += ggml_nbytes_split(tensor, nrows_split);
+
+    // Pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
+    if (ne0 % MATRIX_ROW_PADDING != 0) {
+        total_size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+    }
+
+    return total_size;
+}
+
+static bool ggml_backend_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+    return false;
+
+    GGML_UNUSED(buft);
+}
+
+// Buffer type interface
+static const ggml_backend_buffer_type_i ggml_backend_split_buffer_type_interface = {
+    /* .get_name       = */ ggml_backend_split_buffer_type_get_name,
+    /* .alloc_buffer   = */ ggml_backend_split_buffer_type_alloc_buffer,
+    /* .get_alignment  = */ ggml_backend_split_buffer_type_get_alignment,
+    /* .get_max_size   = */ NULL, // defaults to SIZE_MAX
+    /* .get_alloc_size = */ ggml_backend_split_buffer_type_get_alloc_size,
+    /* .is_host        = */ ggml_backend_split_buffer_type_is_host,
+};
+
 GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_split_buffer_type(int main_device, const float * tensor_split) {
     GGML_LOG_INFO("%s: creating Metal split buffer type, main_device=%d\n", __func__, main_device);
 
-    // For Metal split buffer type, we return the regular Metal buffer type
-    // since Metal currently only supports one device
-    ggml_backend_buffer_type_t buft = ggml_backend_metal_buffer_type();
-    GGML_LOG_INFO("%s: returning Metal buffer type\n", __func__);
-    return buft;
+    static std::mutex mutex;
+    std::lock_guard<std::mutex> lock(mutex);
+
+    static std::map<std::pair<int, std::array<float, 1>>, struct ggml_backend_buffer_type> buft_map;
+
+    std::array<float, 1> tensor_split_arr = {};
+
+    // For Metal, we only support one device, so we simplify the tensor split logic
+    tensor_split_arr[0] = 1.0f; // All tensors go to the single Metal device
+
+    auto it = buft_map.find({main_device, tensor_split_arr});
+    if (it != buft_map.end()) {
+        return &it->second;
+    }
 
-    GGML_UNUSED(main_device);
-    GGML_UNUSED(tensor_split);
+    auto * ctx = new ggml_backend_metal_split_buffer_type_context{
+        main_device,
+        tensor_split_arr,
+        std::string("Metal_Split"),
+    };
+
+    struct ggml_backend_buffer_type buft {
+        /* .iface   = */ ggml_backend_split_buffer_type_interface,
+        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_metal_reg(), main_device),
+        /* .context = */ ctx,
+    };
+
+    auto result = buft_map.emplace(std::make_pair(main_device, tensor_split_arr), buft);
+    return &result.first->second;
 }
 
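The init_tensor, set_tensor, get_tensor, and get_alloc_size paths in this hunk all round the trailing row up to MATRIX_ROW_PADDING (512) elements before sizing the Metal allocation. As an illustration only (not part of the commit), the helper below shows that arithmetic in isolation; ggml_nbytes_split() and ggml_row_size() are the existing ggml helpers used in the diff, and the function name is hypothetical. For an F32 tensor with ne0 = 4097, for example, 511 padding elements (2044 bytes) are added on top of the split size.

#include "ggml.h"

#define MATRIX_ROW_PADDING 512 // same constant the diff defines for the Metal backend

// Hypothetical helper mirroring the padded-size computation used by the split buffer code.
static size_t metal_split_padded_size(const struct ggml_tensor * tensor, int nrows_split) {
    const int64_t ne0 = tensor->ne[0];
    size_t size = ggml_nbytes_split(tensor, nrows_split); // bytes for the rows assigned to this device
    if (ne0 % MATRIX_ROW_PADDING != 0) {
        // pad the last row up to a multiple of 512 elements, as the CUDA backend does
        size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
    }
    return size;
}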

@@ -6635,6 +6918,11 @@ static void ggml_metal_cleanup(void) {
 
 #ifdef __cplusplus
 
+#include <array>
+#include <map>
+#include <mutex>
+#include <vector>
+
 #define MATRIX_ROW_PADDING 512 // As defined in CUDA implementation
 
 // Metal equivalent of ggml_tensor_extra_gpu
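For context, a minimal usage sketch (not part of the commit, and assuming ggml_backend_split_buffer_type is declared in the Metal backend headers): the buffer type is cached per (main_device, tensor_split) pair, and buffers are allocated through the generic ggml-backend API, with the per-tensor Metal allocations deferred to init_tensor. The function name below is hypothetical; ggml_backend_buft_alloc_buffer is the existing ggml-backend call.

#include "ggml-backend.h"

// Hypothetical caller: ask for the Metal split buffer type and allocate a buffer from it.
// The tensor_split values are effectively ignored because Metal exposes a single device.
static ggml_backend_buffer_t alloc_metal_split_buffer(size_t size_bytes) {
    const float tensor_split[1] = { 1.0f };
    ggml_backend_buffer_type_t buft = ggml_backend_split_buffer_type(/* main_device = */ 0, tensor_split);
    // Per-tensor Metal buffers are created later, in ggml_backend_metal_split_buffer_init_tensor.
    return ggml_backend_buft_alloc_buffer(buft, size_bytes);
}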

0 commit comments