
Commit 78534d9

Merge branch 'sl/custom-tensor-offload' into testing

2 parents 8352cdc + c44de8a
File tree

10 files changed: +97 −8 lines

common/arg.cpp

Lines changed: 40 additions & 0 deletions
@@ -1,5 +1,6 @@
 #include "arg.h"
 
+#include "common.h"
 #include "log.h"
 #include "sampling.h"
 #include "chat.h"
@@ -322,6 +323,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
             params.kv_overrides.back().key[0] = 0;
         }
 
+        if (!params.tensor_buft_overrides.empty()) {
+            params.tensor_buft_overrides.push_back({nullptr, nullptr});
+        }
+
         if (params.reranking && params.embedding) {
             throw std::invalid_argument("error: either --embedding or --reranking can be specified, but not both");
         }
@@ -1629,6 +1634,41 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             exit(0);
         }
     ));
+    add_opt(common_arg(
+        {"--override-tensor", "-ot"}, "<tensor name pattern>=<buffer type>,...",
+        "override tensor buffer type", [](common_params & params, const std::string & value) {
+            /* static */ std::map<std::string, ggml_backend_buffer_type_t> buft_list;
+            if (buft_list.empty()) {
+                // enumerate all the devices and add their buffer types to the list
+                for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+                    auto * dev = ggml_backend_dev_get(i);
+                    auto * buft = ggml_backend_dev_buffer_type(dev);
+                    if (buft) {
+                        buft_list[ggml_backend_buft_name(buft)] = buft;
+                    }
+                }
+            }
+
+            for (const auto & override : string_split<std::string>(value, ',')) {
+                std::string::size_type pos = override.find('=');
+                if (pos == std::string::npos) {
+                    throw std::invalid_argument("invalid value");
+                }
+                std::string tensor_name = override.substr(0, pos);
+                std::string buffer_type = override.substr(pos + 1);
+
+                if (buft_list.find(buffer_type) == buft_list.end()) {
+                    printf("Available buffer types:\n");
+                    for (const auto & it : buft_list) {
+                        printf("  %s\n", ggml_backend_buft_name(it.second));
+                    }
+                    throw std::invalid_argument("unknown buffer type");
+                }
+                // FIXME: this leaks memory
+                params.tensor_buft_overrides.push_back({strdup(tensor_name.c_str()), buft_list.at(buffer_type)});
+            }
+        }
+    ));
     add_opt(common_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
         "number of layers to store in VRAM",

common/common.cpp

Lines changed: 10 additions & 0 deletions
@@ -1086,22 +1086,32 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     if (!params.devices.empty()) {
         mparams.devices = params.devices.data();
     }
+
     if (params.n_gpu_layers != -1) {
         mparams.n_gpu_layers = params.n_gpu_layers;
     }
+
     mparams.main_gpu      = params.main_gpu;
     mparams.split_mode    = params.split_mode;
     mparams.tensor_split  = params.tensor_split;
     mparams.use_mmap      = params.use_mmap;
     mparams.use_mlock     = params.use_mlock;
     mparams.check_tensors = params.check_tensors;
+
     if (params.kv_overrides.empty()) {
         mparams.kv_overrides = NULL;
     } else {
         GGML_ASSERT(params.kv_overrides.back().key[0] == 0 && "KV overrides not terminated with empty key");
         mparams.kv_overrides = params.kv_overrides.data();
     }
 
+    if (params.tensor_buft_overrides.empty()) {
+        mparams.tensor_buft_overrides = NULL;
+    } else {
+        GGML_ASSERT(params.tensor_buft_overrides.back().pattern == nullptr && "Tensor buffer overrides not terminated with empty pattern");
+        mparams.tensor_buft_overrides = params.tensor_buft_overrides.data();
+    }
+
     return mparams;
 }
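
As with kv_overrides just above, the C API receives only a raw pointer with no length, so the vector must carry its own terminator (appended by the arg parser in common/arg.cpp). A minimal sketch of that convention, with the struct shape taken from include/llama.h and buft reduced to a void pointer so the sketch is self-contained:

#include <cassert>
#include <vector>

struct tensor_buft_override { // shape of llama_model_tensor_buft_override
    const char * pattern;
    void * buft; // stand-in for ggml_backend_buffer_type_t
};

int main() {
    std::vector<tensor_buft_override> overrides;
    overrides.push_back({ "\\.ffn_.*", nullptr }); // hypothetical pattern
    overrides.push_back({ nullptr, nullptr });     // terminator, as appended by the parser

    assert(overrides.back().pattern == nullptr);   // what the GGML_ASSERT above checks

    // a consumer walks the raw pointer until the null pattern, as llama-model.cpp does
    int n = 0;
    for (const auto * p = overrides.data(); p->pattern != nullptr; ++p) {
        ++n;
    }
    assert(n == 1);
}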

common/common.h

Lines changed: 1 addition & 0 deletions
@@ -286,6 +286,7 @@ struct common_params {
     std::vector<std::string> in_files;   // all input files
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;
+    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
 
     bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
     std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale

ggml/src/ggml.c

Lines changed: 6 additions & 0 deletions
@@ -1155,6 +1155,12 @@ int64_t ggml_nrows(const struct ggml_tensor * tensor) {
 }
 
 size_t ggml_nbytes(const struct ggml_tensor * tensor) {
+    for (int i = 0; i < GGML_MAX_DIMS; ++i) {
+        if (tensor->ne[i] <= 0) {
+            return 0;
+        }
+    }
+
     size_t nbytes;
     const size_t blck_size = ggml_blck_size(tensor->type);
     if (blck_size == 1) {
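
This guard likely matters because the size arithmetic below runs in size_t, where a zero or negative dimension wraps around instead of producing 0. A sketch of the failure mode it prevents, simplified to the blck_size == 1 case:

#include <cstdint>
#include <cstdio>

int main() {
    // simplified blck_size == 1 term: nbytes += (ne[i] - 1) * nb[i]
    int64_t ne0 = 0;                           // a zero-size dimension
    size_t  nb0 = 4;                           // stride in bytes (e.g. f32)
    size_t  nbytes = (size_t)(ne0 - 1) * nb0;  // (0 - 1) wraps to SIZE_MAX
    std::printf("%zu\n", nbytes);              // enormous bogus size instead of 0
}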

include/llama.h

Lines changed: 8 additions & 0 deletions
@@ -276,10 +276,18 @@ extern "C" {
         };
     };
 
+    struct llama_model_tensor_buft_override {
+        const char * pattern;
+        ggml_backend_buffer_type_t buft;
+    };
+
     struct llama_model_params {
         // NULL-terminated list of devices to use for offloading (if NULL, all available devices are used)
         ggml_backend_dev_t * devices;
 
+        // NULL-terminated list of buffer types to use for tensors that match a pattern
+        const struct llama_model_tensor_buft_override * tensor_buft_overrides;
+
         int32_t n_gpu_layers; // number of layers to store in VRAM
         enum llama_split_mode split_mode; // how to split the model across multiple GPUs
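
From the API side, a caller fills the new field with a NULL-terminated array. A sketch of that wiring (not from the commit; ggml_backend_cpu_buffer_type() is the standard ggml getter for the CPU buffer type, and the pattern is hypothetical):

#include "ggml-backend.h"
#include "llama.h"

int main() {
    // keep any tensor whose name matches the pattern in host (CPU) memory
    const struct llama_model_tensor_buft_override overrides[] = {
        { "\\.ffn_.*", ggml_backend_cpu_buffer_type() }, // hypothetical pattern
        { nullptr, nullptr },                            // terminator expected by the loader
    };

    llama_model_params mparams = llama_model_default_params();
    mparams.tensor_buft_overrides = overrides;
    // ... pass mparams to the model loading call as usual
}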

src/llama-model-loader.cpp

Lines changed: 4 additions & 1 deletion
@@ -445,7 +445,8 @@ llama_model_loader::llama_model_loader(
         std::vector<std::string> & splits,
         bool use_mmap,
         bool check_tensors,
-        const struct llama_model_kv_override * param_overrides_p) {
+        const llama_model_kv_override * param_overrides_p,
+        const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) {
     int trace = 0;
     if (getenv("LLAMA_TRACE")) {
         trace = atoi(getenv("LLAMA_TRACE"));
@@ -457,6 +458,8 @@ llama_model_loader::llama_model_loader(
         }
     }
 
+    tensor_buft_overrides = param_tensor_buft_overrides_p;
+
     // Load the main GGUF
     struct ggml_context * ctx = NULL;
     struct gguf_init_params params = {

src/llama-model-loader.h

Lines changed: 5 additions & 3 deletions
@@ -77,8 +77,9 @@ struct llama_model_loader {
 
     llama_mmaps mappings;
 
-    std::map<std::string, struct llama_tensor_weight, weight_name_comparer> weights_map;
-    std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;
+    std::map<std::string, llama_tensor_weight, weight_name_comparer> weights_map;
+    std::unordered_map<std::string, llama_model_kv_override> kv_overrides;
+    const llama_model_tensor_buft_override * tensor_buft_overrides;
 
     gguf_context_ptr meta;
     std::vector<ggml_context_ptr> contexts;
@@ -95,7 +96,8 @@ struct llama_model_loader {
         std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
         bool use_mmap,
         bool check_tensors,
-        const struct llama_model_kv_override * param_overrides_p);
+        const llama_model_kv_override * param_overrides_p,
+        const llama_model_tensor_buft_override * param_tensor_buft_overrides_p);
 
     template<typename T>
     typename std::enable_if<std::is_integral<T>::value, bool>::type

src/llama-model.cpp

Lines changed: 21 additions & 2 deletions
@@ -11,6 +11,7 @@
 #include <cstring>
 #include <functional>
 #include <map>
+#include <regex>
 #include <sstream>
 #include <stdexcept>
 
@@ -1468,9 +1469,26 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
             }
 
-            ggml_backend_buffer_type_t buft = select_weight_buft(hparams, t_meta, op, *buft_list);
+            ggml_backend_buffer_type_t buft = nullptr;
+
+            // check overrides
+            if (ml.tensor_buft_overrides) {
+                std::string tensor_name = tn.str();
+                for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
+                    std::regex pattern(overrides->pattern);
+                    if (std::regex_search(tensor_name, pattern)) {
+                        LLAMA_LOG_DEBUG("tensor %s buffer type overridden to %s\n", tensor_name.c_str(), ggml_backend_buft_name(overrides->buft));
+                        buft = overrides->buft;
+                        break;
+                    }
+                }
+            }
+
             if (!buft) {
-                throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
+                buft = select_weight_buft(hparams, t_meta, op, *buft_list);
+                if (!buft) {
+                    throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
+                }
             }
 
             // avoid using a host buffer when using mmap
@@ -3792,6 +3810,7 @@ const struct ggml_tensor * llama_model::get_tensor(const char * name) const {
 struct llama_model_params llama_model_default_params() {
     struct llama_model_params result = {
         /*.devices                =*/ nullptr,
+        /*.tensor_buft_overrides  =*/ nullptr,
         /*.n_gpu_layers           =*/ 0,
         /*.split_mode             =*/ LLAMA_SPLIT_MODE_LAYER,
         /*.main_gpu               =*/ 0,
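
Matching is std::regex_search over the full tensor name (a substring match, not a full-name match), and the first entry in the table wins. A self-contained sketch of that selection behavior, with hypothetical tensor names and buffer-type labels standing in for ggml_backend_buffer_type_t:

#include <cstdio>
#include <regex>
#include <string>

struct override_entry {
    const char * pattern;
    const char * buft_name; // stand-in for ggml_backend_buffer_type_t
};

int main() {
    // hypothetical patterns and buffer-type names
    const override_entry table[] = {
        { "blk\\.1[0-9]\\.ffn", "CPU"   }, // FFN tensors of layers 10-19 -> CPU
        { "ffn",                "CUDA0" }, // any other FFN tensor -> first GPU
        { nullptr,              nullptr }, // terminator
    };

    const char * names[] = { "blk.3.ffn_up.weight", "blk.12.ffn_up.weight", "blk.12.attn_q.weight" };
    for (const char * name : names) {
        const char * chosen = "(default selection)";
        for (const auto * e = table; e->pattern != nullptr; ++e) { // same walk as load_tensors
            if (std::regex_search(std::string(name), std::regex(e->pattern))) {
                chosen = e->buft_name; // first match wins
                break;
            }
        }
        std::printf("%-24s -> %s\n", name, chosen);
    }
}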

src/llama-quant.cpp

Lines changed: 1 addition & 1 deletion
@@ -527,7 +527,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     }
 
     std::vector<std::string> splits = {};
-    llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides);
+    llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides, nullptr);
     ml.init_mappings(false); // no prefetching
 
     llama_model model(llama_model_default_params());

src/llama.cpp

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
     model.t_start_us = tm.t_start_us;
 
     try {
-        llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides);
+        llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides, params.tensor_buft_overrides);
 
         ml.print_info();
 