
Commit 7ec24b4

control vector api and implementation
1 parent 621e86b commit 7ec24b4

2 files changed: +259 -0 lines changed

llama.cpp

Lines changed: 235 additions & 0 deletions
@@ -2092,6 +2092,10 @@ struct llama_context {
     struct ggml_tensor * inp_s_mask; // F32 [kv_size]
     struct ggml_tensor * inp_s_seq;  // I32 [kv_size, n_batch]

+    struct llama_control_vector * control_vector;
+    int32_t control_vector_layer_start;
+    int32_t control_vector_layer_end;
+
 #ifdef GGML_USE_MPI
     ggml_mpi_context * ctx_mpi = NULL;
 #endif
@@ -5416,6 +5420,8 @@ static struct ggml_tensor * llm_build_kv(
     return cur;
 }

+ggml_tensor * get_control_vector_layer_tensor(struct llama_control_vector * vector, int il);
+
 struct llm_build_context {
     const llama_model & model;
     const llama_context & lctx;
@@ -5770,6 +5776,14 @@ struct llm_build_context {
         }

         cur = ggml_add(ctx0, cur, ffn_inp);
+        cb(cur, "ffn_out", il);
+
+        if (lctx.control_vector != nullptr && il >= lctx.control_vector_layer_start && il <= lctx.control_vector_layer_end) {
+            ggml_tensor * layer_dir = get_control_vector_layer_tensor(lctx.control_vector, il);
+            if (layer_dir != nullptr) {
+                cur = ggml_add(ctx0, cur, layer_dir);
+            }
+        }
         cb(cur, "l_out", il);

         // input for next layer
@@ -13183,6 +13197,227 @@ int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const
     }
 }

+struct llama_control_vector {
+    struct ggml_context * ctx;
+    std::vector<ggml_tensor*> tensors;
+
+    llama_control_vector() : ctx(nullptr) {}
+
+    ~llama_control_vector() {
+        if (this->ctx) {
+            ggml_free(this->ctx);
+        }
+    }
+};
+
+ggml_tensor * get_control_vector_layer_tensor(struct llama_control_vector * vector, int il) {
+    if (!vector->ctx || il > vector->tensors.size()) {
+        return nullptr;
+    }
+    return vector->tensors[il];
+}
+
+struct llama_control_vector * llama_control_vector_load(const char * path) {
+    struct llama_control_vector * vector = new llama_control_vector();
+
+    int n_tensors;
+    size_t n_bytes = 0;
+    uint32_t max_direction_layer = 0;
+
+    // calculate size of ctx needed for tensors, ensure tensors are f32, and find max layer
+    {
+        struct ggml_init_params meta_params = {
+            /* .mem_size   = */ ggml_tensor_overhead() * 128 + ggml_graph_overhead(),
+            /* .mem_buffer = */ nullptr,
+            /* .no_alloc   = */ true,
+        };
+        ggml_context * meta_ctx = ggml_init(meta_params);
+        struct gguf_init_params meta_gguf_params = {
+            /* .no_alloc = */ true,
+            /* .ctx      = */ &meta_ctx,
+        };
+        struct gguf_context * meta_ctx_gguf = gguf_init_from_file(path, meta_gguf_params);
+        if (!meta_ctx_gguf) {
+            LLAMA_LOG_ERROR("%s: failed to load control vector\n", __func__);
+            ggml_free(meta_ctx);
+            return nullptr;
+        }
+
+        n_tensors = gguf_get_n_tensors(meta_ctx_gguf);
+        for (int i = 0; i < n_tensors; i++) {
+            std::string name = gguf_get_tensor_name(meta_ctx_gguf, i);
+
+            // split on '.'
+            size_t dotpos = name.find('.');
+            if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
+                try {
+                    uint32_t layer = std::stoi(name.substr(dotpos + 1));
+                    if (layer == 0) {
+                        LLAMA_LOG_ERROR("%s: direction tensor invalid: %s\n", __func__, name.c_str());
+                        ggml_free(meta_ctx);
+                        gguf_free(meta_ctx_gguf);
+                        return nullptr;
+                    }
+                    if (layer > max_direction_layer) {
+                        max_direction_layer = layer;
+                    }
+                } catch (...) {
+                    LLAMA_LOG_ERROR("%s: direction tensor invalid: %s\n", __func__, name.c_str());
+                    ggml_free(meta_ctx);
+                    gguf_free(meta_ctx_gguf);
+                    return nullptr;
+                }
+            }
+
+            struct ggml_tensor * tensor_meta = ggml_get_tensor(meta_ctx, name.c_str());
+            if (tensor_meta->type != GGML_TYPE_F32 || ggml_n_dims(tensor_meta) != 1) {
+                LLAMA_LOG_ERROR("%s: direction tensor invalid: %s\n", __func__, name.c_str());
+                ggml_free(meta_ctx);
+                gguf_free(meta_ctx_gguf);
+                return nullptr;
+            }
+            n_bytes += ggml_nbytes(tensor_meta);
+        }
+        ggml_free(meta_ctx);
+        gguf_free(meta_ctx_gguf);
+    }
+
+    // load and scale tensors into final control vector context
+    struct ggml_init_params ggml_params = {
+        /* .mem_size   = */ ggml_tensor_overhead() * n_tensors + n_bytes,
+        /* .mem_buffer = */ nullptr,
+        /* .no_alloc   = */ false,
+    };
+    struct ggml_context * ctx = ggml_init(ggml_params);
+
+    struct gguf_init_params params = {
+        /*.no_alloc = */ false,
+        /*.ctx      = */ &ctx,
+    };
+    struct gguf_context * ctx_gguf = gguf_init_from_file(path, params);
+    if (!ctx_gguf) {
+        LLAMA_LOG_ERROR("%s: failed to load control vector\n", __func__);
+        ggml_free(ctx);
+        return nullptr;
+    }
+
+    vector->ctx = ctx;
+    vector->tensors.push_back(nullptr); // there's never a direction vector for 0
+    for (uint32_t i = 1; i < max_direction_layer; i++) {
+        std::string name = format("direction.%d", i);
+        ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
+        if (tensor) {
+            vector->tensors.push_back(tensor);
+            // LLAMA_LOG_INFO("%s: found control vector tensor: t[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, ggml_n_dims(tensor), tensor->name, tensor->data);
+        } else {
+            vector->tensors.push_back(nullptr); // as a filler
+        }
+    }
+
+    return vector;
+}
+
+struct llama_control_vector * llama_control_vector_dup(const struct llama_control_vector * vector) {
+    struct llama_control_vector * new_vector = new llama_control_vector();
+    if (vector->ctx == nullptr) {
+        return new_vector;
+    }
+    struct ggml_init_params ggml_params = {
+        /* .mem_size   = */ ggml_get_mem_size(vector->ctx),
+        /* .mem_buffer = */ nullptr,
+        /* .no_alloc   = */ false,
+    };
+
+    struct ggml_context * ctx = ggml_init(ggml_params);
+
+    for (ggml_tensor * tensor : vector->tensors) {
+        if (tensor == nullptr) {
+            new_vector->tensors.push_back(nullptr);
+        } else {
+            ggml_tensor * new_tensor = ggml_dup_tensor(ctx, tensor);
+            new_vector->tensors.push_back(new_tensor);
+        }
+    }
+
+    new_vector->ctx = ctx;
+    return new_vector;
+}
+
+int32_t llama_control_vector_scale(struct llama_control_vector * vector, float strength) {
+    if (vector->ctx == nullptr) {
+        LLAMA_LOG_ERROR("%s: attempted to scale unloaded control vector\n", __func__);
+        return 1;
+    }
+
+    for (ggml_tensor * tensor : vector->tensors) {
+        if (tensor == nullptr) continue;
+        for (int j = 0; (int64_t)j < ggml_nelements(tensor); j++) {
+            float v = ggml_get_f32_1d(tensor, j);
+            ggml_set_f32_1d(tensor, j, v * strength);
+        }
+    }
+
+    return 0;
+}
+
+int32_t llama_control_vector_add(struct llama_control_vector * vector, const struct llama_control_vector * other) {
+    if (vector->ctx == nullptr || other->ctx == nullptr) {
+        LLAMA_LOG_ERROR("%s: attempted to add with an unloaded control vector\n", __func__);
+        return 1;
+    }
+
+    size_t size = std::max(vector->tensors.size(), other->tensors.size());
+    for (size_t i = 0; i < size; i++) {
+        if (i >= vector->tensors.size()) {
+            vector->tensors.push_back(nullptr);
+        }
+
+        ggml_tensor * other_tensor = i < other->tensors.size() ? other->tensors[i] : nullptr;
+        if (other_tensor != nullptr) {
+            if (vector->tensors[i] == nullptr) {
+                ggml_tensor * new_tensor = ggml_dup_tensor(vector->ctx, other_tensor);
+                vector->tensors[i] = new_tensor;
+            } else {
+                ggml_tensor * this_tensor = vector->tensors[i];
+                size_t this_nelements = ggml_nelements(this_tensor);
+                size_t other_nelements = ggml_nelements(other_tensor);
+
+                if (this_nelements != other_nelements) {
+                    LLAMA_LOG_ERROR("%s: attempted to add control vectors of incompatible dimension: %zu != %zu\n", __func__, this_nelements, other_nelements);
+                    return 1;
+                }
+
+                for (size_t j = 0; j < this_nelements; j++) {
+                    float a = ggml_get_f32_1d(this_tensor, j);
+                    float b = ggml_get_f32_1d(other_tensor, j);
+                    ggml_set_f32_1d(this_tensor, j, a + b);
+                }
+            }
+        }
+    }
+
+    return 0;
+}
+
+void llama_control_vector_free(struct llama_control_vector * vector) {
+    delete vector;
+}
+
+void llama_apply_control_vector(
+        struct llama_context * lctx,
+        struct llama_control_vector * vector,
+        int32_t control_vector_layer_start,
+        int32_t control_vector_layer_end
+) {
+    lctx->control_vector = vector;
+    lctx->control_vector_layer_start = control_vector_layer_start;
+    lctx->control_vector_layer_end = control_vector_layer_end;
+}
+
+void llama_clear_control_vector(struct llama_context * lctx) {
+    lctx->control_vector = nullptr;
+}
+
 struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq) {
     struct llama_kv_cache_view result = {
         /*.n_cells = */ 0,
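Note on the expected file layout: the loader above reads a GGUF file whose tensors are one-dimensional F32 vectors named direction.<layer>, with layer indices starting at 1 (a direction.0 tensor is rejected). As a rough, hypothetical sketch only, not part of this commit, a compatible file could be produced with the ggml/gguf C API along these lines; write_control_vector_gguf, n_layers, n_embd, and the output path are illustrative assumptions:

// Hypothetical sketch (not from this commit): write a GGUF file in the layout
// that llama_control_vector_load() expects. n_layers, n_embd and the output
// path are placeholders; real code would fill each tensor with its direction.
#include "ggml.h"
#include <string>

static void write_control_vector_gguf(const char * path, int n_layers, int64_t n_embd) {
    struct ggml_init_params params = {
        /* .mem_size   = */ (size_t) n_layers * (ggml_tensor_overhead() + (size_t) n_embd * sizeof(float) + 64),
        /* .mem_buffer = */ nullptr,
        /* .no_alloc   = */ false,
    };
    struct ggml_context * ctx  = ggml_init(params);
    struct gguf_context * gguf = gguf_init_empty();

    for (int il = 1; il <= n_layers; il++) { // layer 0 never has a direction tensor
        struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
        ggml_set_name(t, ("direction." + std::to_string(il)).c_str());
        ggml_set_f32(t, 0.0f); // placeholder data; a real file stores the per-layer direction
        gguf_add_tensor(gguf, t);
    }

    gguf_write_to_file(gguf, path, /* only_meta = */ false);

    gguf_free(gguf);
    ggml_free(ctx);
}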

llama.h

Lines changed: 24 additions & 0 deletions
@@ -4,6 +4,7 @@
 #include "ggml.h"
 #include "ggml-backend.h"

+#include <cstdint>
 #include <stddef.h>
 #include <stdint.h>
 #include <stdio.h>
@@ -436,6 +437,29 @@ extern "C" {
                              float scale,
                       const char * path_base_model,
                            int32_t n_threads);
+
+    struct llama_control_vector;
+
+    LLAMA_API struct llama_control_vector * llama_control_vector_load(const char * path);
+    LLAMA_API struct llama_control_vector * llama_control_vector_dup(const struct llama_control_vector * vector);
+    LLAMA_API int32_t llama_control_vector_scale(struct llama_control_vector * vector, float strength);
+    LLAMA_API int32_t llama_control_vector_add(struct llama_control_vector * vector, const struct llama_control_vector * other);
+    LLAMA_API void llama_control_vector_free(struct llama_control_vector * vector);
+    LLAMA_API void llama_apply_control_vector(
+            struct llama_context * lctx,
+            struct llama_control_vector * vector,
+            int32_t control_vector_layer_start,
+            int32_t control_vector_layer_end);
+    LLAMA_API void llama_clear_control_vector(struct llama_context * lctx);
+
+
+    // Apply a control vector to a model context
+    LLAMA_API int32_t llama_load_control_vector_from_file(
+            struct llama_context * lctx,
+            const char * control_vector_path,
+            float strength,
+            int32_t layer_start,
+            int32_t layer_end);

     //
     // KV cache
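Taken together, the new declarations suggest a call pattern roughly like the hedged sketch below; it is not code from this commit, and the file name, strength, and layer range are placeholders. The convenience wrapper llama_load_control_vector_from_file is declared in the header but its definition does not appear in the llama.cpp hunk above.

// Hypothetical usage sketch of the API added by this commit, assuming an
// already-initialized llama_context * ctx. Path, strength, and layer range
// are placeholders.
struct llama_control_vector * cvec = llama_control_vector_load("control_vector.gguf");
if (cvec != nullptr) {
    llama_control_vector_scale(cvec, 0.8f); // scale the direction tensors in place

    // add the (scaled) directions to the residual stream of layers 1..31
    llama_apply_control_vector(ctx, cvec, /* layer_start = */ 1, /* layer_end = */ 31);

    // ... run decoding as usual ...

    llama_clear_control_vector(ctx);  // detach from the context first
    llama_control_vector_free(cvec);  // then release the tensors
}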
