@@ -568,6 +568,34 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.lora_base = argv[i];
+        } else if (arg == "--control-vector") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.control_vectors.push_back({ 1.0f, argv[i], });
+        } else if (arg == "--control-vector-scaled") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            const char * fname = argv[i];
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.control_vectors.push_back({ std::stof(argv[i]), fname, });
+        } else if (arg == "--control-vector-layer-range") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.control_vector_layer_start = std::stoi(argv[i]);
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.control_vector_layer_end = std::stoi(argv[i]);
         } else if (arg == "--mmproj") {
             if (++i >= argc) {
                 invalid_param = true;
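Not shown in this hunk: the fields the new branches write to. A minimal sketch of the supporting declarations, assuming they sit in common.h next to gpt_params (names inferred from the usage above; the defaults are an assumption):

#include <cstdint>
#include <string>
#include <vector>

// one entry per --control-vector / --control-vector-scaled argument
struct llama_control_vector_load_info {
    float       strength; // scaling factor (1.0f for plain --control-vector)
    std::string fname;    // path to the control vector GGUF file
};

// presumed additions to struct gpt_params:
//     std::vector<llama_control_vector_load_info> control_vectors;
//     int32_t control_vector_layer_start = -1; // <= 0 means "pick a default" (see the init hunk below)
//     int32_t control_vector_layer_end   = -1;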
@@ -1095,6 +1123,12 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
     printf("  --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n");
     printf("  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n");
+    printf("  --control-vector FNAME\n");
+    printf("                        add a control vector\n");
+    printf("  --control-vector-scaled FNAME S\n");
+    printf("                        add a control vector with user defined scaling S\n");
+    printf("  --control-vector-layer-range START END\n");
+    printf("                        layer range to apply the control vector(s) to, start and end inclusive\n");
     printf("  -m FNAME, --model FNAME\n");
     printf("                        model path (default: %s)\n", params.model.c_str());
     printf("  -md FNAME, --model-draft FNAME\n");
@@ -1360,6 +1394,30 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         return std::make_tuple(nullptr, nullptr);
     }
 
+    if (!params.control_vectors.empty()) {
+        if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
+        if (params.control_vector_layer_end   <= 0) params.control_vector_layer_end   = llama_n_layer(model);
+
+        const auto cvec = llama_control_vector_load(params.control_vectors);
+        if (cvec.n_embd == -1) {
+            llama_free(lctx);
+            llama_free_model(model);
+            return std::make_tuple(nullptr, nullptr);
+        }
+
+        int err = llama_control_vector_apply(lctx,
+                                             cvec.data.data(),
+                                             cvec.data.size(),
+                                             cvec.n_embd,
+                                             params.control_vector_layer_start,
+                                             params.control_vector_layer_end);
+        if (err) {
+            llama_free(lctx);
+            llama_free_model(model);
+            return std::make_tuple(nullptr, nullptr);
+        }
+    }
+
     for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
         const std::string& lora_adapter = std::get<0>(params.lora_adapter[i]);
         float lora_scale = std::get<1>(params.lora_adapter[i]);
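The flat buffer handed to llama_control_vector_apply here is produced by llama_control_vector_load (added further down in this commit): judging from that code, row (il - 1) of n_embd floats holds the summed, scaled direction for layer il, and layer 0 is never stored. A small hypothetical helper, only to illustrate the layout:

// hypothetical helper (not part of the patch): direction for 1-based layer `il`
static const float * control_vector_row(const llama_control_vector_data & cvec, int il) {
    const int n_layers = (int) (cvec.data.size() / cvec.n_embd); // rows exist for layers 1..n_layers
    if (cvec.n_embd <= 0 || il < 1 || il > n_layers) {
        return nullptr;
    }
    return cvec.data.data() + (size_t) cvec.n_embd * (il - 1);
}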
@@ -1890,3 +1948,160 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n)
 
     return sum / (sqrt(sum1) * sqrt(sum2));
 }
+
+//
+// Control vector utils
+//
+
+static llama_control_vector_data llama_control_vector_load_one(const llama_control_vector_load_info & load_info) {
+    int32_t n_tensors;
+
+    size_t n_bytes = 0;
+
+    uint32_t max_direction_layer = 0;
+
+    llama_control_vector_data result = { -1, {} };
+
+    // calculate size of ctx needed for tensors, ensure tensors are f32, and find max layer
+    {
+        struct ggml_init_params meta_params = {
+            /* .mem_size   = */ ggml_tensor_overhead() * 128 + ggml_graph_overhead(),
+            /* .mem_buffer = */ nullptr,
+            /* .no_alloc   = */ true,
+        };
+        ggml_context * meta_ctx = ggml_init(meta_params);
+        struct gguf_init_params meta_gguf_params = {
+            /* .no_alloc = */ true,
+            /* .ctx      = */ &meta_ctx,
+        };
+        struct gguf_context * meta_ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
+        if (!meta_ctx_gguf) {
+            fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str());
+            ggml_free(meta_ctx);
+            return result;
+        }
+
+        n_tensors = gguf_get_n_tensors(meta_ctx_gguf);
+        for (int i = 0; i < n_tensors; i++) {
+            std::string name = gguf_get_tensor_name(meta_ctx_gguf, i);
+
+            // split on '.'
+            size_t dotpos = name.find('.');
+            if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
+                try {
+                    uint32_t layer = std::stoi(name.substr(dotpos + 1));
+                    if (layer == 0) {
+                        fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
+                        ggml_free(meta_ctx);
+                        gguf_free(meta_ctx_gguf);
+                        return result;
+                    }
+                    if (layer > max_direction_layer) {
+                        max_direction_layer = layer;
+                    }
+                } catch (...) {
+                    fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
+                    ggml_free(meta_ctx);
+                    gguf_free(meta_ctx_gguf);
+                    return result;
+                }
+            }
+
+            struct ggml_tensor * tensor_meta = ggml_get_tensor(meta_ctx, name.c_str());
+            if (tensor_meta->type != GGML_TYPE_F32 || ggml_n_dims(tensor_meta) != 1) {
+                fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
+                ggml_free(meta_ctx);
+                gguf_free(meta_ctx_gguf);
+                return result;
+            }
+            if (result.n_embd == -1) {
+                result.n_embd = ggml_nelements(tensor_meta);
+            } else if (ggml_nelements(tensor_meta) != result.n_embd) {
+                fprintf(stderr, "%s: direction tensor sizes mismatched in %s\n", __func__, load_info.fname.c_str());
+                ggml_free(meta_ctx);
+                gguf_free(meta_ctx_gguf);
+                return result;
+            }
+            n_bytes += ggml_nbytes(tensor_meta);
+        }
+        ggml_free(meta_ctx);
+        gguf_free(meta_ctx_gguf);
+    }
+
+    if (n_tensors == 0) {
+        fprintf(stderr, "%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str());
+        return result;
+    }
+
+    // load and scale tensors into final control vector context
+    struct ggml_init_params ggml_params = {
+        /* .mem_size   = */ ggml_tensor_overhead() * n_tensors + n_bytes,
+        /* .mem_buffer = */ nullptr,
+        /* .no_alloc   = */ false,
+    };
+    struct ggml_context * ctx = ggml_init(ggml_params);
+
+    struct gguf_init_params params = {
+        /* .no_alloc = */ false,
+        /* .ctx      = */ &ctx,
+    };
+    struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), params);
+    if (!ctx_gguf) {
+        fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str());
+        ggml_free(ctx);
+        return result;
+    }
+
+    // do not store data for layer 0 (it's not used)
+    result.data.resize(result.n_embd * max_direction_layer);
+
+    for (uint32_t il = 1; il <= max_direction_layer; il++) {
+        const std::string name = "direction." + std::to_string(il);
+        const ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
+
+        float * dst = result.data.data() + result.n_embd * (il - 1);
+
+        if (tensor) {
+            const float * src = (const float *) tensor->data;
+            for (int j = 0; j < result.n_embd; j++) {
+                dst[j] = src[j] * load_info.strength;
+            }
+        } else {
+            for (int j = 0; j < result.n_embd; j++) {
+                dst[j] = 0.0f;
+            }
+        }
+    }
+
+    return result;
+}
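For context, the loader above expects a GGUF file containing 1-D F32 tensors named "direction.<layer>", all of the same length (the model's embedding size), with 1-based layer indices; a "direction.0" tensor is rejected. A rough sketch of a writer that produces such a file, assuming the stock ggml/gguf C API (the contents written here are placeholders, not a real control vector):

#include "ggml.h" // in recent ggml releases the gguf_* API lives in a separate gguf.h

#include <string>

// write a toy control vector file in the layout llama_control_vector_load_one expects
static void write_toy_control_vector(const char * fname, int n_embd, int n_layer) {
    struct ggml_init_params params = {
        /* .mem_size   = */ (size_t) n_layer * (ggml_tensor_overhead() + (size_t) n_embd * sizeof(float) + 64),
        /* .mem_buffer = */ nullptr,
        /* .no_alloc   = */ false,
    };
    struct ggml_context * ctx  = ggml_init(params);
    struct gguf_context * gguf = gguf_init_empty();

    for (int il = 1; il <= n_layer; il++) { // layer indices are 1-based
        struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
        ggml_set_name(t, ("direction." + std::to_string(il)).c_str());
        float * data = (float *) t->data;
        for (int j = 0; j < n_embd; j++) {
            data[j] = 0.0f; // a real tool would store an actual direction here
        }
        gguf_add_tensor(gguf, t);
    }

    gguf_write_to_file(gguf, fname, /* only_meta = */ false);

    gguf_free(gguf);
    ggml_free(ctx);
}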
+
+llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos) {
+    llama_control_vector_data result = { -1, {} };
+
+    for (const auto & info : load_infos) {
+        auto cur = llama_control_vector_load_one(info);
+
+        if (cur.n_embd == -1) {
+            return result;
+        }
+        if (result.n_embd != -1 && (result.n_embd != cur.n_embd || result.data.size() != cur.data.size())) {
+            fprintf(stderr, "%s: control vector in %s does not match previous vector dimensions\n", __func__, info.fname.c_str());
+            return result;
+        }
+
+        if (result.n_embd == -1) {
+            result = std::move(cur);
+        } else {
+            for (size_t i = 0; i < cur.data.size(); i++) {
+                result.data[i] += cur.data[i];
+            }
+        }
+    }
+
+    if (result.n_embd == -1) {
+        fprintf(stderr, "%s: no vectors passed\n", __func__);
+    }
+
+    return result;
+}
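Taken together with the llama_init_from_gpt_params hunk above, a caller outside of gpt_params would use these helpers roughly like this (file names and scales are illustrative; it assumes an existing llama_model * model and llama_context * lctx):

std::vector<llama_control_vector_load_info> infos = {
    { 1.0f, "happy.gguf"  }, // as produced by --control-vector happy.gguf
    { 0.5f, "formal.gguf" }, // as produced by --control-vector-scaled formal.gguf 0.5
};

const auto cvec = llama_control_vector_load(infos);
if (cvec.n_embd != -1) {
    // apply to every layer except layer 0, matching the defaults used above
    llama_control_vector_apply(lctx, cvec.data.data(), cvec.data.size(), cvec.n_embd,
                               /* il_start = */ 1, /* il_end = */ llama_n_layer(model));
}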