@@ -2092,6 +2092,10 @@ struct llama_context {
    struct ggml_tensor * inp_s_mask; // F32 [kv_size]
    struct ggml_tensor * inp_s_seq;  // I32 [kv_size, n_batch]

+   struct llama_control_vector * control_vector = nullptr;
+   int32_t control_vector_layer_start;
+   int32_t control_vector_layer_end;
+
#ifdef GGML_USE_MPI
    ggml_mpi_context * ctx_mpi = NULL;
#endif
@@ -5416,6 +5420,8 @@ static struct ggml_tensor * llm_build_kv(
    return cur;
}

+ggml_tensor * get_control_vector_layer_tensor(struct llama_control_vector * vector, int il);
+
struct llm_build_context {
    const llama_model & model;
    const llama_context & lctx;
@@ -5770,6 +5776,14 @@ struct llm_build_context {
            }

            cur = ggml_add(ctx0, cur, ffn_inp);
+           cb(cur, "ffn_out", il);
+
+           if (lctx.control_vector != nullptr && il >= lctx.control_vector_layer_start && il <= lctx.control_vector_layer_end) {
+               ggml_tensor * layer_dir = get_control_vector_layer_tensor(lctx.control_vector, il);
+               if (layer_dir != nullptr) {
+                   cur = ggml_add(ctx0, cur, layer_dir);
+               }
+           }
            cb(cur, "l_out", il);

            // input for next layer
@@ -13183,6 +13197,230 @@ int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const
    }
}

+struct llama_control_vector {
+    struct ggml_context * ctx;
+    std::vector<ggml_tensor*> tensors;
+
+    llama_control_vector() : ctx(nullptr) {}
+
+    ~llama_control_vector() {
+        if (this->ctx) {
+            ggml_free(this->ctx);
+        }
+    }
+};
+
+ggml_tensor * get_control_vector_layer_tensor(struct llama_control_vector * vector, int il) {
+    if (!vector->ctx || il < 0 || (size_t) il >= vector->tensors.size()) {
+        return nullptr;
+    }
+    return vector->tensors[il];
+}
+
+struct llama_control_vector * llama_control_vector_load(const char * path) {
+    struct llama_control_vector * vector = new llama_control_vector();
+
+    int n_tensors;
+    size_t n_bytes = 0;
+    uint32_t max_direction_layer = 0;
+
+    // calculate size of ctx needed for tensors, ensure tensors are f32, and find max layer
+    {
+        struct ggml_init_params meta_params = {
+            /* .mem_size   = */ ggml_tensor_overhead() * 128 + ggml_graph_overhead(),
+            /* .mem_buffer = */ nullptr,
+            /* .no_alloc   = */ true,
+        };
+        ggml_context * meta_ctx = ggml_init(meta_params);
+        struct gguf_init_params meta_gguf_params = {
+            /* .no_alloc = */ true,
+            /* .ctx      = */ &meta_ctx,
+        };
+        struct gguf_context * meta_ctx_gguf = gguf_init_from_file(path, meta_gguf_params);
+        if (!meta_ctx_gguf) {
+            LLAMA_LOG_ERROR("%s: failed to load control vector\n", __func__);
+            ggml_free(meta_ctx);
+            return nullptr;
+        }
+
+        n_tensors = gguf_get_n_tensors(meta_ctx_gguf);
+        for (int i = 0; i < n_tensors; i++) {
+            std::string name = gguf_get_tensor_name(meta_ctx_gguf, i);
+
+            // direction tensors are named "direction.<layer>", with 1-based layer indices
+            size_t dotpos = name.find('.');
+            if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
+                try {
+                    uint32_t layer = std::stoi(name.substr(dotpos + 1));
+                    if (layer == 0) {
+                        LLAMA_LOG_ERROR("%s: direction tensor invalid: %s\n", __func__, name.c_str());
+                        ggml_free(meta_ctx);
+                        gguf_free(meta_ctx_gguf);
+                        return nullptr;
+                    }
+                    if (layer > max_direction_layer) {
+                        max_direction_layer = layer;
+                    }
+                } catch (...) {
+                    LLAMA_LOG_ERROR("%s: direction tensor invalid: %s\n", __func__, name.c_str());
+                    ggml_free(meta_ctx);
+                    gguf_free(meta_ctx_gguf);
+                    return nullptr;
+                }
+            }
+
+            struct ggml_tensor * tensor_meta = ggml_get_tensor(meta_ctx, name.c_str());
+            if (tensor_meta->type != GGML_TYPE_F32 || ggml_n_dims(tensor_meta) != 1) {
+                LLAMA_LOG_ERROR("%s: direction tensor invalid: %s\n", __func__, name.c_str());
+                ggml_free(meta_ctx);
+                gguf_free(meta_ctx_gguf);
+                return nullptr;
+            }
+            n_bytes += ggml_nbytes(tensor_meta);
+        }
+        ggml_free(meta_ctx);
+        gguf_free(meta_ctx_gguf);
+    }
+
+    // load the tensors into the final control vector context
+    struct ggml_init_params ggml_params = {
+        /* .mem_size   = */ ggml_tensor_overhead() * n_tensors + n_bytes,
+        /* .mem_buffer = */ nullptr,
+        /* .no_alloc   = */ false,
+    };
+    struct ggml_context * ctx = ggml_init(ggml_params);
+
+    struct gguf_init_params params = {
+        /* .no_alloc = */ false,
+        /* .ctx      = */ &ctx,
+    };
+    struct gguf_context * ctx_gguf = gguf_init_from_file(path, params);
+    if (!ctx_gguf) {
+        LLAMA_LOG_ERROR("%s: failed to load control vector\n", __func__);
+        ggml_free(ctx);
+        return nullptr;
+    }
+
+    vector->ctx = ctx;
+    vector->tensors.push_back(nullptr); // there's never a direction vector for layer 0
+    for (uint32_t i = 1; i <= max_direction_layer; i++) {
+        std::string name = format("direction.%d", i);
+        ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
+        if (tensor) {
+            vector->tensors.push_back(tensor);
+            // LLAMA_LOG_INFO("%s: found control vector tensor: t[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, ggml_n_dims(tensor), tensor->name, tensor->data);
+        } else {
+            vector->tensors.push_back(nullptr); // as a filler for layers without a direction
+        }
+    }
+
+    gguf_free(ctx_gguf);
+    return vector;
+}
+
+struct llama_control_vector * llama_control_vector_dup(const struct llama_control_vector * vector) {
+    struct llama_control_vector * new_vector = new llama_control_vector();
+    if (vector->ctx == nullptr) {
+        return new_vector;
+    }
+    struct ggml_init_params ggml_params = {
+        /* .mem_size   = */ ggml_get_mem_size(vector->ctx),
+        /* .mem_buffer = */ nullptr,
+        /* .no_alloc   = */ false,
+    };
+
+    struct ggml_context * ctx = ggml_init(ggml_params);
+
+    for (ggml_tensor * tensor : vector->tensors) {
+        if (tensor == nullptr) {
+            new_vector->tensors.push_back(nullptr);
+        } else {
+            ggml_tensor * new_tensor = ggml_dup_tensor(ctx, tensor);
+            memcpy(new_tensor->data, tensor->data, ggml_nbytes(tensor)); // ggml_dup_tensor copies the layout only, not the data
+            new_vector->tensors.push_back(new_tensor);
+        }
+    }
+
+    new_vector->ctx = ctx;
+    return new_vector;
+}
+
+int32_t llama_control_vector_scale(struct llama_control_vector * vector, float strength) {
+    if (vector->ctx == nullptr) {
+        LLAMA_LOG_ERROR("%s: attempted to scale unloaded control vector\n", __func__);
+        return 1;
+    }
+
+    for (ggml_tensor * tensor : vector->tensors) {
+        if (tensor == nullptr) continue;
+        for (int j = 0; (int64_t) j < ggml_nelements(tensor); j++) {
+            float v = ggml_get_f32_1d(tensor, j);
+            ggml_set_f32_1d(tensor, j, v * strength);
+        }
+    }
+
+    return 0;
+}
+
+int32_t llama_control_vector_add(struct llama_control_vector * vector, const struct llama_control_vector * other) {
+    if (vector->ctx == nullptr || other->ctx == nullptr) {
+        LLAMA_LOG_ERROR("%s: attempted to add with an unloaded control vector\n", __func__);
+        return 1;
+    }
+
+    size_t size = std::max(vector->tensors.size(), other->tensors.size());
+    for (size_t i = 0; i < size; i++) {
+        if (i >= vector->tensors.size()) {
+            vector->tensors.push_back(nullptr);
+        }
+
+        ggml_tensor * other_tensor = i < other->tensors.size() ? other->tensors[i] : nullptr;
+        if (other_tensor != nullptr) {
+            if (vector->tensors[i] == nullptr) {
+                ggml_tensor * new_tensor = ggml_dup_tensor(vector->ctx, other_tensor);
+                memcpy(new_tensor->data, other_tensor->data, ggml_nbytes(other_tensor)); // ggml_dup_tensor copies the layout only, not the data
+                vector->tensors[i] = new_tensor;
+            } else {
+                ggml_tensor * this_tensor = vector->tensors[i];
+                size_t this_nelements = ggml_nelements(this_tensor);
+                size_t other_nelements = ggml_nelements(other_tensor);
+
+                if (this_nelements != other_nelements) {
+                    LLAMA_LOG_ERROR("%s: attempted to add control vectors of incompatible dimension: %zu != %zu\n", __func__, this_nelements, other_nelements);
+                    return 1;
+                }
+
+                for (size_t j = 0; j < this_nelements; j++) {
+                    float a = ggml_get_f32_1d(this_tensor, j);
+                    float b = ggml_get_f32_1d(other_tensor, j);
+                    ggml_set_f32_1d(this_tensor, j, a + b);
+                }
+            }
+        }
+    }
+
+    return 0;
+}
+
+void llama_control_vector_free(struct llama_control_vector * vector) {
+    delete vector;
+}
+
+void llama_apply_control_vector(
+        struct llama_context * lctx,
+        struct llama_control_vector * vector,
+        int32_t control_vector_layer_start,
+        int32_t control_vector_layer_end
+) {
+    lctx->control_vector = vector;
+    lctx->control_vector_layer_start = control_vector_layer_start;
+    lctx->control_vector_layer_end = control_vector_layer_end;
+}
+
+void llama_clear_control_vector(struct llama_context * lctx) {
+    lctx->control_vector = nullptr;
+}
+
struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq) {
    struct llama_kv_cache_view result = {
        /*.n_cells = */ 0,
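
Usage sketch (not part of the patch): a minimal example of how the functions added above could be driven from application code, assuming a llama_context * ctx has already been created; the GGUF file names and the layer range 10..20 are placeholders, and error handling is kept to the bare minimum.

    // hypothetical control vector files and layer bounds; adjust for the actual model
    struct llama_control_vector * cvec = llama_control_vector_load("direction-a.gguf");
    if (cvec != nullptr) {
        // scale the loaded directions before applying them
        llama_control_vector_scale(cvec, 0.8f);

        // optionally merge a second control vector into the first
        struct llama_control_vector * other = llama_control_vector_load("direction-b.gguf");
        if (other != nullptr) {
            llama_control_vector_add(cvec, other);
            llama_control_vector_free(other);
        }

        // add the per-layer directions to layers 10..20 during graph build
        llama_apply_control_vector(ctx, cvec, 10, 20);
    }

    // ... run inference with ctx as usual ...

    // detach and release the control vector when done
    llama_clear_control_vector(ctx);
    llama_control_vector_free(cvec);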