Changes from all commits (65 commits, all by JohannesGaessler)
0c90859  WIP      May 17, 2025
838f577  WIP      May 17, 2025
aedc3f7  WIP      May 17, 2025
99bb015  try fix  May 17, 2025
06d7a88  WIP      May 17, 2025
0a69555  WIP      May 17, 2025
7563db8  WIP      May 17, 2025
363e237  WIP      May 19, 2025
47e6d24  fix      May 20, 2025
751e488  WIP      May 20, 2025
3f8f323  WIP      May 20, 2025
316ef4e  WIP      May 20, 2025
47b228f  try fix  May 20, 2025
cf4d0b6  try fix  May 20, 2025
bb48a90  try fix  May 20, 2025
016405b  WIP      May 21, 2025
7c17ff1  WIP      May 21, 2025
deda9c2  WIP      May 22, 2025
3c1291f  WIP      May 22, 2025
16d29fe  WIP      May 22, 2025
7468e9d  WIP      May 22, 2025
119657a  WIP      May 22, 2025
50d2c5e  WIP      May 22, 2025
fe2747e  try fix  May 22, 2025
6ddf206  try fix  May 22, 2025
996d263  WIP      May 22, 2025
3a432ab  WIP      May 22, 2025
9c6550e  WIP      May 23, 2025
2da2cc3  WIP      May 23, 2025
67f02bf  WIP      May 23, 2025
2e282d5  WIP      May 23, 2025
8860122  WIP      May 23, 2025
3d96528  WIP      May 23, 2025
2d2ef89  WIP      May 23, 2025
6b836c8  WIP      May 23, 2025
f5a5155  WIP      May 23, 2025
cc91ca1  WIP      May 23, 2025
6ee4d0e  WIP      May 23, 2025
935d652  WIP      May 23, 2025
4dacb2f  WIP      May 23, 2025
7b7f399  WIP      May 23, 2025
aeda7e0  WIP      May 23, 2025
95f1caf  WIP      May 23, 2025
1f648ba  WIP      May 23, 2025
e18d1ef  WIP      May 23, 2025
66c8eec  WIP      May 23, 2025
206ab58  WIP      May 23, 2025
ae1617c  WIP      May 24, 2025
f617bbb  WIP      May 24, 2025
528dd51  WIP      May 24, 2025
4006293  WIP      May 24, 2025
943456b  WIP      May 24, 2025
25c25ea  WIP      May 24, 2025
739d902  WIP      May 24, 2025
26807a9  WIP      May 24, 2025
1c9dcde  WIP      May 24, 2025
3c21fdd  WIP      May 24, 2025
9719003  WIP      May 24, 2025
1c37a20  WIP      May 24, 2025
f6dd08e  WIP      May 24, 2025
02e4af1  WIP      May 24, 2025
07ca4b8  WIP      May 24, 2025
c0358bd  WIP      May 24, 2025
ea3cab5  WIP      May 24, 2025
027d97e  WIP      May 25, 2025
1 change: 1 addition & 0 deletions ggml/include/ggml-backend.h
@@ -41,6 +41,7 @@ extern "C" {
GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
GGML_API ggml_backend_dev_t ggml_backend_buft_get_device (ggml_backend_buffer_type_t buft);
GGML_API bool ggml_backend_buft_is_split (ggml_backend_buffer_type_t buft);

//
// Backend buffer
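The new predicate lets generic code ask whether a buffer type is split across devices. A minimal usage sketch, assuming only the public API above (tensor_is_split is a hypothetical helper, not part of this PR):

#include "ggml-backend.h"

// Hypothetical helper: report whether a tensor's data lives in a split buffer type.
static bool tensor_is_split(const struct ggml_tensor * t) {
    if (t->buffer == NULL) {
        return false; // unallocated tensors cannot be split
    }
    return ggml_backend_buft_is_split(ggml_backend_buffer_get_type(t->buffer));
}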
1 change: 1 addition & 0 deletions ggml/src/ggml-backend-impl.h
@@ -26,6 +26,7 @@ extern "C" {
size_t (*get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
// (optional) check if tensor data is in host memory and uses standard ggml tensor layout (defaults to false)
bool (*is_host) (ggml_backend_buffer_type_t buft);
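// (optional) check if the buffer type is split across multiple devices (presumably defaults to false, like is_host)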
bool (*is_split) (ggml_backend_buffer_type_t buft);
};

struct ggml_backend_buffer_type {
429 changes: 345 additions & 84 deletions ggml/src/ggml-backend.cpp

Large diffs are not rendered by default.
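Since this diff is not rendered, here is a hedged sketch of what the new public wrapper plausibly looks like, mirroring how the optional is_host hook in ggml-backend-impl.h is dispatched (an assumption, not the rendered code):

bool ggml_backend_buft_is_split(ggml_backend_buffer_type_t buft) {
    // optional hook: buffer types that do not implement it are never split
    if (buft->iface.is_split) {
        return buft->iface.is_split(buft);
    }
    return false;
}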

1 change: 1 addition & 0 deletions ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp
@@ -6422,6 +6422,7 @@ ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void) {
/* .get_max_size = */ nullptr, // defaults to SIZE_MAX
/* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes
/* .is_host = */ nullptr,
/* .is_split = */ nullptr,
},
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
/* .context = */ new ggml::cpu::aarch64::extra_buffer_type(),
11 changes: 8 additions & 3 deletions ggml/src/ggml-cuda/ggml-cuda.cu
@@ -689,13 +689,16 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t
GGML_UNUSED(buft);
}

static bool ggml_backend_buft_is_cuda_split(ggml_backend_buffer_type_t buft);

static const ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
/* .get_name = */ ggml_backend_cuda_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
/* .is_host = */ NULL,
/* .is_split = */ ggml_backend_buft_is_cuda_split,
};

ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
@@ -1013,6 +1016,7 @@ static const ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_inte
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
/* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
/* .is_split = */ ggml_backend_buft_is_cuda_split,
};

ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split) {
@@ -1111,6 +1115,7 @@ ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
/* .is_split = */ NULL,
},
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), 0),
/* .context = */ nullptr,
@@ -1907,7 +1912,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
}

static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
const bool split = false && ggml_backend_buft_is_cuda_split(src0->buffer->buft);

// If src0 is a temporary compute buffer it may have some padding that needs to be cleared for mul_mat_vec_q or mul_mat_q.
// But if src0 is also a view of another tensor then this cannot be done safely because it may overwrite valid tensor data.
@@ -2124,7 +2129,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *

static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) {
// why is this here instead of mul_mat?
if (dst->src[0] != nullptr && ggml_backend_buft_is_cuda_split(dst->src[0]->buffer->buft)) {
if (false && dst->src[0] != nullptr && ggml_backend_buft_is_cuda_split(dst->src[0]->buffer->buft)) {
ggml_cuda_set_peer_access(dst->src[1]->ne[1], ctx.device);
}

@@ -2992,7 +2997,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
struct ggml_tensor * b = op->src[1];
// for small weight matrices the active device can end up without any rows, don't use row split in those cases
// this avoids some edge cases (and the performance would not be good anyways)
if (a->buffer && ggml_backend_buft_is_cuda_split(a->buffer->buft)) {
if (false && a->buffer && ggml_backend_buft_is_cuda_split(a->buffer->buft)) {
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) a->buffer->buft->context;
int64_t row_low;
int64_t row_high;
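The repeated false && guards above act as temporary kill-switches: short-circuit evaluation keeps the right-hand call from running, disabling the CUDA-internal split paths while split handling moves into the generic backend layer. A hedged sketch of how such a switch could be centralized (disable_cuda_split and use_cuda_split are hypothetical, not code from this PR):

// Flip to re-enable the CUDA-internal split paths.
static const bool disable_cuda_split = true;

static bool use_cuda_split(const struct ggml_tensor * src0) {
    return !disable_cuda_split
        && src0->buffer != NULL
        && ggml_backend_buft_is_cuda_split(src0->buffer->buft);
}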
78 changes: 78 additions & 0 deletions ggml/src/ggml-impl.h
@@ -593,9 +593,87 @@ static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {

#ifdef __cplusplus
#include <vector>
#include <map>

// expose GGUF internals for test code
GGML_API size_t gguf_type_size(enum gguf_type type);
GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params);
GGML_API void gguf_write_to_buf(const struct gguf_context * ctx, std::vector<int8_t> & buf, bool only_meta);

static ggml_tensor * map_tensor(std::map<ggml_tensor *, ggml_tensor *> & tensor_map, ggml_context * ctx, ggml_tensor * tensor, bool deep) {
if (!tensor) {
return nullptr;
}

if (tensor_map.find(tensor) != tensor_map.end()) {
return tensor_map[tensor];
}

ggml_tensor * new_tensor = ggml_dup_tensor(ctx, tensor);
tensor_map[tensor] = new_tensor;

new_tensor->op = tensor->op;
for (int i = 0; i < GGML_MAX_DIMS; i++) {
new_tensor->nb[i] = tensor->nb[i];
}
new_tensor->flags = tensor->flags;
memcpy(new_tensor->op_params, tensor->op_params, sizeof(tensor->op_params));
strcpy(new_tensor->name, tensor->name);
new_tensor->data = tensor->data;
new_tensor->buffer = tensor->buffer;
new_tensor->extra = tensor->extra;
new_tensor->view_offs = tensor->view_offs;

if (deep) {
new_tensor->view_src = map_tensor(tensor_map, ctx, tensor->view_src, deep);
for (int i = 0; i < GGML_MAX_SRC; i++) {
new_tensor->src[i] = map_tensor(tensor_map, ctx, tensor->src[i], deep);
}
} else {
new_tensor->view_src = tensor->view_src;
for (int i = 0; i < GGML_MAX_SRC; i++) {
new_tensor->src[i] = tensor->src[i];
}
}

return new_tensor;
}

static void dup_graph(ggml_context * ctx, const ggml_cgraph * src, ggml_cgraph * dst, bool deep) {
std::map<ggml_tensor *, ggml_tensor *> tensor_map;

if (deep) {
for (int i = 0; i < src->n_leafs; i++) {
ggml_build_forward_expand(dst, map_tensor(tensor_map, ctx, src->leafs[i], deep));
}
for (int i = 0; i < src->n_nodes; i++) {
ggml_build_forward_expand(dst, map_tensor(tensor_map, ctx, src->nodes[i], deep));
}
} else {
for (int i = 0; i < src->n_leafs; i++) {
dst->leafs[dst->n_leafs++] = map_tensor(tensor_map, ctx, src->leafs[i], deep);
}
for (int i = 0; i < src->n_nodes; i++) {
dst->nodes[dst->n_nodes++] = map_tensor(tensor_map, ctx, src->nodes[i], deep);
}
}
GGML_ASSERT(dst->n_leafs == src->n_leafs);
GGML_ASSERT(dst->n_nodes == src->n_nodes);

if (src->grads) {
GGML_ASSERT(dst->grads);
for (int i = 0; i < src->n_nodes; ++i) {
const size_t igrad_src = ggml_hash_find(&src->visited_hash_set, src->nodes[i]);
const size_t igrad_dst = ggml_hash_find(&dst->visited_hash_set, dst->nodes[i]);

GGML_ASSERT(igrad_src != GGML_HASHSET_FULL);
GGML_ASSERT(ggml_bitset_get(src->visited_hash_set.used, igrad_src));
GGML_ASSERT(igrad_dst != GGML_HASHSET_FULL);
GGML_ASSERT(ggml_bitset_get(dst->visited_hash_set.used, igrad_dst));

dst->grads[igrad_dst] = src->grads[igrad_src];
dst->grad_accs[igrad_dst] = src->grad_accs[igrad_src];
}
}
}
#endif // __cplusplus
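A short usage sketch for the relocated helpers, following the call pattern introduced in ggml-opt.cpp below (ctx_copy and graph stand for an initialized ggml_context and the source ggml_cgraph): allocate a grads-capable destination graph first, then copy into it.

// Deep-copy the graph structure; tensor data pointers still alias the originals.
ggml_cgraph * copy = ggml_new_graph_custom(ctx_copy, graph->size, /*grads =*/ true);
dup_graph(ctx_copy, graph, copy, /*deep =*/ true);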
64 changes: 2 additions & 62 deletions ggml/src/ggml-opt.cpp
@@ -9,7 +9,6 @@
#include <cmath>
#include <cstdint>
#include <cinttypes>
#include <map>
#include <random>
#include <vector>

@@ -252,66 +251,6 @@ struct ggml_opt_params ggml_opt_default_params(
};
}

static ggml_tensor * map_tensor(std::map<ggml_tensor *, ggml_tensor *> & tensor_map, ggml_context * ctx, ggml_tensor * tensor) {
if (!tensor) {
return nullptr;
}

if (tensor_map.find(tensor) != tensor_map.end()) {
return tensor_map[tensor];
}

ggml_tensor * new_tensor = ggml_dup_tensor(ctx, tensor);
tensor_map[tensor] = new_tensor;

new_tensor->op = tensor->op;
for (int i = 0; i < GGML_MAX_DIMS; i++) {
new_tensor->nb[i] = tensor->nb[i];
}
new_tensor->flags = tensor->flags;
memcpy(new_tensor->op_params, tensor->op_params, sizeof(tensor->op_params));
strcpy(new_tensor->name, tensor->name);
new_tensor->data = tensor->data;
new_tensor->buffer = tensor->buffer;
new_tensor->extra = tensor->extra;
new_tensor->view_offs = tensor->view_offs;
new_tensor->view_src = map_tensor(tensor_map, ctx, tensor->view_src);
for (int i = 0; i < GGML_MAX_SRC; i++) {
new_tensor->src[i] = map_tensor(tensor_map, ctx, tensor->src[i]);
}

return new_tensor;
}

static ggml_cgraph * dup_graph(ggml_context * ctx, ggml_cgraph * src) {
std::map<ggml_tensor *, ggml_tensor *> tensor_map;

ggml_cgraph * dst = ggml_new_graph_custom(ctx, src->size, /*grads =*/ true);

for (int i = 0; i < src->n_leafs; i++) {
ggml_build_forward_expand(dst, map_tensor(tensor_map, ctx, src->leafs[i]));
}
GGML_ASSERT(dst->n_leafs == src->n_leafs);
for (int i = 0; i < src->n_nodes; i++) {
ggml_build_forward_expand(dst, map_tensor(tensor_map, ctx, src->nodes[i]));
}
GGML_ASSERT(dst->n_nodes == src->n_nodes);
for (int i = 0; i < src->n_nodes; ++i) {
const size_t igrad_src = ggml_hash_find(&src->visited_hash_set, src->nodes[i]);
const size_t igrad_dst = ggml_hash_find(&dst->visited_hash_set, dst->nodes[i]);

GGML_ASSERT(igrad_src != GGML_HASHSET_FULL);
GGML_ASSERT(ggml_bitset_get(src->visited_hash_set.used, igrad_src));
GGML_ASSERT(igrad_dst != GGML_HASHSET_FULL);
GGML_ASSERT(ggml_bitset_get(dst->visited_hash_set.used, igrad_dst));

dst->grads[igrad_dst] = src->grads[igrad_src];
dst->grad_accs[igrad_dst] = src->grad_accs[igrad_src];
}

return dst;
}

static void ggml_opt_build(ggml_opt_context_t opt_ctx) {
GGML_ASSERT(opt_ctx->ctx_compute && "no compute context set, either use static graphs or set one with ggml_opt_prepare_alloc");
GGML_ASSERT((!opt_ctx->static_graphs || opt_ctx->inputs->data) && "when using static graphs the inputs must be allocated statically");
@@ -738,7 +677,8 @@ void ggml_opt_alloc(ggml_opt_context_t opt_ctx, bool backward) {
ggml_free(opt_ctx->ctx_copy);
opt_ctx->ctx_copy = ggml_init(params);

opt_ctx->allocated_graph_copy = dup_graph(opt_ctx->ctx_copy, graph);
opt_ctx->allocated_graph_copy = ggml_new_graph_custom(opt_ctx->ctx_copy, graph->size, /*grads =*/ true);
dup_graph(opt_ctx->ctx_copy, graph, opt_ctx->allocated_graph_copy, /*deep =*/ true);
} else {
opt_ctx->allocated_graph_copy = graph;
}
1 change: 1 addition & 0 deletions ggml/src/ggml.c
@@ -1714,6 +1714,7 @@ struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * nam
struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...) {
va_list args;
va_start(args, fmt);
assert(tensor->name != fmt);
vsnprintf(tensor->name, sizeof(tensor->name), fmt, args);
va_end(args);
return tensor;
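The new assert guards against the aliasing case where the format string is the tensor's own name buffer: vsnprintf would then read from the buffer it is overwriting, which is undefined behavior. For illustration (t, il, and suffix are hypothetical):

ggml_format_name(t, "blk.%d.%s", il, suffix); // fine: fmt does not alias t->name
// ggml_format_name(t, t->name);              // would now trip the assert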
2 changes: 1 addition & 1 deletion src/llama-context.cpp
@@ -221,7 +221,7 @@ llama_context::llama_context(
bool pipeline_parallel =
model.n_devices() > 1 &&
model.params.n_gpu_layers > (int) model.hparams.n_layer &&
model.params.split_mode == LLAMA_SPLIT_MODE_LAYER &&
(model.params.split_mode == LLAMA_SPLIT_MODE_LAYER || model.params.split_mode == LLAMA_SPLIT_MODE_ROW) &&
cparams.offload_kqv &&
!model.has_tensor_overrides();

Expand Down
9 changes: 9 additions & 0 deletions src/llama-model.cpp
@@ -4566,6 +4566,14 @@ struct llm_build_llama : public llm_graph_context {
cb(Vcur, "Vcur", il);
}

// FIXME
Qcur = ggml_scale(ctx0, Qcur, 1.0f);
Kcur = ggml_scale(ctx0, Kcur, 1.0f);
Vcur = ggml_scale(ctx0, Vcur, 1.0f);
cb(Qcur, "QcurFIXME", il);
cb(Kcur, "KcurFIXME", il);
cb(Vcur, "VcurFIXME", il);

Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
@@ -4705,6 +4713,7 @@ struct llm_build_llama : public llm_graph_context {
cur = build_lora_mm(model.output, cur);

cb(cur, "result_output", -1);
cur = ggml_scale(ctx0, cur, 1.0f); // FIXME
res->t_logits = cur;

ggml_build_forward_expand(gf, cur);
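A hedged reading of the FIXME hooks above: ggml_scale(ctx0, t, 1.0f) is numerically an identity, but it materializes a separate graph node, so the cb(...) callback can attach a distinct debug name ("QcurFIXME") to a tensor that would otherwise only be reachable through its producer; the identity scale before res->t_logits appears to serve the same purpose for the output. As a standalone sketch:

// Identity probe (numerical no-op) that creates a fresh node for naming/inspection.
struct ggml_tensor * probe = ggml_scale(ctx0, Qcur, 1.0f);
cb(probe, "Qcur_probe", il); // hypothetical name; the PR uses "QcurFIXME"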