26 changes: 0 additions & 26 deletions recipe/0001-Change-gpuAddress-for-contents.patch

This file was deleted.

180 changes: 180 additions & 0 deletions recipe/16576.patch
@@ -0,0 +1,180 @@
From a8d57d66096a65019e2354ff9efe23688794f72e Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <[email protected]>
Date: Tue, 14 Oct 2025 14:11:18 +0300
Subject: [PATCH 1/2] metal : avoid using Metal's gpuAddress property

---
ggml/src/ggml-metal/ggml-metal-device.m | 24 ++++++++++++++----------
1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m
index c3fe8f4e91002..553cf8f5f39ac 100644
--- a/ggml/src/ggml-metal/ggml-metal-device.m
+++ b/ggml/src/ggml-metal/ggml-metal-device.m
@@ -7,6 +7,8 @@

#include <Metal/Metal.h>

+#include <stdatomic.h>
+
#ifndef TARGET_OS_VISION
#define TARGET_OS_VISION 0
#endif
@@ -22,6 +24,9 @@
// overload of MTLGPUFamilyMetal3 (not available in some environments)
static const NSInteger MTLGPUFamilyMetal3_GGML = 5001;

+// virtual address for GPU memory allocations
+static atomic_uintptr_t g_addr_device = 0x000000400ULL;
+
#if !GGML_METAL_EMBED_LIBRARY
// Here to assist with NSBundle Path Hack
@interface GGMLMetalClass : NSObject
@@ -827,7 +832,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
};

struct ggml_metal_buffer {
- void * all_data; // TODO: https://github.com/ggml-org/llama.cpp/pull/15985
+ void * all_data;
size_t all_size;

// if false, the Metal buffer data is allocated in private GPU memory and is not shared with the host
@@ -965,14 +970,15 @@ ggml_metal_buffer_t ggml_metal_buffer_init(ggml_metal_device_t dev, size_t size,
if (shared) {
res->all_data = ggml_metal_host_malloc(size_aligned);
res->is_shared = true;
- res->owned = true;
} else {
- // dummy, non-NULL value - we'll populate this after creating the Metal buffer below
- res->all_data = (void *) 0x000000400ULL;
+ // use virtual address from g_addr_device counter
+ res->all_data = (void *) atomic_fetch_add_explicit(&g_addr_device, size_aligned, memory_order_relaxed);
res->is_shared = false;
}
res->all_size = size_aligned;

+ res->owned = true;
+
res->device = ggml_metal_device_get_obj(dev);
res->queue = ggml_metal_device_get_queue(dev);

@@ -983,15 +989,13 @@ ggml_metal_buffer_t ggml_metal_buffer_init(ggml_metal_device_t dev, size_t size,
res->buffers[0].metal = nil;

if (size_aligned > 0) {
- if (props_dev->use_shared_buffers &&shared) {
+ if (props_dev->use_shared_buffers && shared) {
res->buffers[0].metal = [res->device newBufferWithBytesNoCopy:res->all_data
length:size_aligned
options:MTLResourceStorageModeShared
deallocator:nil];
} else {
res->buffers[0].metal = [res->device newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate];
-
- res->all_data = (void *) (res->buffers[0].metal.gpuAddress);
}
}

@@ -1139,7 +1143,7 @@ bool ggml_metal_buffer_is_shared(ggml_metal_buffer_t buf) {

void ggml_metal_buffer_memset_tensor(ggml_metal_buffer_t buf, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
if (buf->is_shared) {
- memset((char *)tensor->data + offset, value, size);
+ memset((char *) tensor->data + offset, value, size);
return;
}

@@ -1168,7 +1172,7 @@ void ggml_metal_buffer_memset_tensor(ggml_metal_buffer_t buf, struct ggml_tensor

void ggml_metal_buffer_set_tensor(ggml_metal_buffer_t buf, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
if (buf->is_shared) {
- memcpy((char *)tensor->data + offset, data, size);
+ memcpy((char *) tensor->data + offset, data, size);
return;
}

@@ -1223,7 +1227,7 @@ void ggml_metal_buffer_set_tensor(ggml_metal_buffer_t buf, struct ggml_tensor *

void ggml_metal_buffer_get_tensor(ggml_metal_buffer_t buf, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
if (buf->is_shared) {
- memcpy(data, (const char *)tensor->data + offset, size);
+ memcpy(data, (const char *) tensor->data + offset, size);
return;
}


From 84e3d8d26961cca81de65b1790506121dda45bf5 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <[email protected]>
Date: Tue, 14 Oct 2025 14:44:01 +0300
Subject: [PATCH 2/2] metal : fix rope kernels buffer check

---
ggml/src/ggml-metal/ggml-metal-impl.h | 1 +
ggml/src/ggml-metal/ggml-metal-ops.cpp | 1 +
ggml/src/ggml-metal/ggml-metal.metal | 8 ++++----
3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-metal/ggml-metal-impl.h b/ggml/src/ggml-metal/ggml-metal-impl.h
index a448c14f66b63..fa2d82cefb40e 100644
--- a/ggml/src/ggml-metal/ggml-metal-impl.h
+++ b/ggml/src/ggml-metal/ggml-metal-impl.h
@@ -251,6 +251,7 @@ typedef struct {
int32_t sect_1;
int32_t sect_2;
int32_t sect_3;
+ bool src2;
} ggml_metal_kargs_rope;

typedef struct {
diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp
index a61ea8fb5a7b3..784b7b77851e6 100644
--- a/ggml/src/ggml-metal/ggml-metal-ops.cpp
+++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp
@@ -2969,6 +2969,7 @@ int ggml_metal_op_rope(ggml_metal_op_t ctx, int idx) {
/* sect_1 =*/ sect_1,
/* sect_2 =*/ sect_2,
/* sect_3 =*/ sect_3,
+ /* src2 =*/ op->src[2] != nullptr,
};

ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_rope(lib, op);
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index 1029cf8f9a3ab..6d39ddcc634ef 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -3748,7 +3748,7 @@ kernel void kernel_rope_norm(

const float theta = theta_base * pow(args.freq_base, inv_ndims*i0);

- const float freq_factor = src2 != src0 ? ((device const float *) src2)[ic] : 1.0f;
+ const float freq_factor = args.src2 ? ((device const float *) src2)[ic] : 1.0f;

rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta);

@@ -3801,7 +3801,7 @@ kernel void kernel_rope_neox(

const float theta = theta_base * pow(args.freq_base, inv_ndims*i0);

- const float freq_factor = src2 != src0 ? ((device const float *) src2)[ic] : 1.0f;
+ const float freq_factor = args.src2 ? ((device const float *) src2)[ic] : 1.0f;

rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta);

@@ -3872,7 +3872,7 @@ kernel void kernel_rope_multi(

const float theta = theta_base * pow(args.freq_base, inv_ndims*i0);

- const float freq_factor = src2 != src0 ? ((device const float *) src2)[ic] : 1.0f;
+ const float freq_factor = args.src2 ? ((device const float *) src2)[ic] : 1.0f;

rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta);

@@ -3939,7 +3939,7 @@ kernel void kernel_rope_vision(
const float theta = theta_base * pow(args.freq_base, 2.0f * inv_ndims * p);
// end of mrope

- const float freq_factor = src2 != src0 ? ((device const float *) src2)[ic] : 1.0f;
+ const float freq_factor = args.src2 ? ((device const float *) src2)[ic] : 1.0f;

rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta);

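For reference, the core idea of the first commit is to stop reading Metal's `gpuAddress` property for private (non-shared) buffers and instead hand out synthetic addresses from a process-global atomic counter, so `all_data` keeps a unique, stable, non-NULL value that offset arithmetic can rely on. Below is a minimal standalone sketch of that counter in plain C; the `alloc_virtual_addr` helper and the sizes in `main` are illustrative only and are not part of the patch.

```c
// Sketch of the virtual-address counter used for private Metal buffers in
// the first commit. These addresses are never dereferenced on the host; they
// only need to be non-NULL, unique, and non-overlapping per allocation.
#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

// counter starts at a small non-NULL value, as in the patch
static atomic_uintptr_t g_addr_device = 0x000000400ULL;

// reserve an address range of size_aligned bytes and return its base
// (hypothetical helper, not present in the patch)
static void * alloc_virtual_addr(size_t size_aligned) {
    return (void *) atomic_fetch_add_explicit(&g_addr_device, size_aligned,
                                              memory_order_relaxed);
}

int main(void) {
    void * a = alloc_virtual_addr(4096); // illustrative sizes
    void * b = alloc_virtual_addr(8192);
    printf("a = %p\nb = %p\n", a, b);    // b == a + 4096, ranges never overlap
    return 0;
}
```

Because the counter only has to produce distinct ranges, a relaxed fetch-add is sufficient; no ordering with other memory operations is needed.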
3 changes: 2 additions & 1 deletion recipe/recipe.yaml
@@ -11,7 +11,8 @@ source:
url: https://github.com/ggml-org/${{ name }}/archive/b${{ version | split(".") | list | last }}.tar.gz
sha256: bfe625422c8fa74cf12d1d6aff8bdbbe61c86647de1615c2e7b6f0cde4804e18
patches:
- 0001-Change-gpuAddress-for-contents.patch
# See: https://github.com/ggml-org/llama.cpp/pull/16576/
- 16576.patch

build:
number: ${{ build }}