common/arg.cpp (5 changes: 3 additions & 2 deletions)
@@ -2344,8 +2344,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                     }
                     throw std::invalid_argument("unknown buffer type");
                 }
-                // FIXME: this leaks memory
-                params.tensor_buft_overrides.push_back({strdup(tensor_name.c_str()), buft_list.at(buffer_type)});
+                // store pattern to ensure lifetime for the C-string
+                params.tensor_buft_override_names.push_back(tensor_name);
+                params.tensor_buft_overrides.push_back({params.tensor_buft_override_names.back().c_str(), buft_list.at(buffer_type)});
             }
         }
     ));
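
Annotation (not part of the diff): the previous code strdup()'d every --override-tensor pattern and never freed the copy, hence the FIXME; the new code keeps the std::string itself alive in the params and stores its c_str() pointer instead. A minimal standalone sketch of the before/after, using a hypothetical override_entry stand-in rather than the real llama_model_tensor_buft_override:

    #include <cstring>
    #include <deque>
    #include <string>
    #include <vector>

    struct override_entry { const char * pattern; int buft; }; // stand-in type, for illustration only

    int main() {
        std::deque<std::string>     names;     // owned storage with the same lifetime as the params
        std::vector<override_entry> overrides;

        std::string tensor_name = "blk\\.0\\.ffn_up"; // example pattern, not taken from the PR

        // before: strdup() allocates a copy that nothing ever free()s, so it leaks
        // overrides.push_back({strdup(tensor_name.c_str()), 0});

        // after: keep the string alive and point at it
        names.push_back(tensor_name);
        overrides.push_back({names.back().c_str(), 0});
        return 0;
    }
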
common/common.h (2 changes: 2 additions & 0 deletions)
@@ -8,6 +8,7 @@
 #include <string>
 #include <string_view>
 #include <vector>
+#include <deque>
 #include <sstream>
 
 #ifdef _WIN32
@@ -281,6 +282,7 @@ struct common_params {
     std::vector<std::string> in_files; // all input files
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;
+    std::deque<std::string> tensor_buft_override_names; // storage for tensor override patterns
     std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
 
     bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
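
Annotation (not part of the diff): a std::deque is presumably chosen over std::vector for tensor_buft_override_names because deque::push_back never relocates existing elements, so c_str() pointers already stored in tensor_buft_overrides remain valid, whereas vector::push_back may reallocate and move the strings. A small self-contained illustration of that guarantee:

    #include <cstdio>
    #include <deque>
    #include <string>

    int main() {
        std::deque<std::string> patterns;
        patterns.push_back("first-pattern-kept-alive");
        const char * p = patterns.back().c_str(); // pointer handed out, as in common/arg.cpp above

        for (int i = 0; i < 1000; ++i) {
            patterns.push_back("more");           // existing elements are never moved
        }

        std::printf("%s\n", p);                   // still valid; with std::vector this could dangle
        return 0;
    }
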
ggml/src/ggml-alloc.c (16 changes: 14 additions & 2 deletions)
@@ -66,10 +66,22 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
     }
 }
 
+// return the next offset aligned to the specified power-of-two boundary
+// optimized to avoid expensive modulo operations for common alignments
 static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
     assert(alignment && !(alignment & (alignment - 1))); // power of 2
-    size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment;
-    return offset + align;
+
+    uintptr_t addr = (uintptr_t) buffer + offset;
+
+    switch (alignment) {
+        case 16: return offset + ((-addr) & 15);
+        case 32: return offset + ((-addr) & 31);
+        case 64: return offset + ((-addr) & 63);
+        default: {
+            size_t mask = alignment - 1;
+            return offset + ((-addr) & mask);
+        }
+    }
 }
 
 // tallocr
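
Annotation (not part of the diff): for a power-of-two alignment a, the branch-free padding (-addr) & (a - 1) equals the old modulo-based (a - addr % a) % a. A quick standalone check of that equivalence (pad_mod and pad_mask are illustrative helpers, not ggml functions):

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    static size_t pad_mod(uintptr_t addr, size_t a)  { return (a - (addr % a)) % a; } // old formula
    static size_t pad_mask(uintptr_t addr, size_t a) { return (0 - addr) & (a - 1); } // new formula, needs power-of-two a

    int main() {
        const size_t alignments[] = {16, 32, 64, 128, 256};
        for (size_t a : alignments) {
            for (uintptr_t addr = 0; addr < 4096; ++addr) {
                assert(pad_mod(addr, a) == pad_mask(addr, a));
            }
        }
        std::puts("padding formulas agree for power-of-two alignments");
        return 0;
    }
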
src/llama-context.cpp (10 changes: 7 additions & 3 deletions)
@@ -87,9 +87,13 @@ llama_context::llama_context(
     // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
     // ref: https://github.com/ggerganov/llama.cpp/pull/5021
     // TODO: this padding is not needed for the cache-less context so we should probably move it to llama_context_kv_self
-    if (cparams.n_batch < GGML_KQ_MASK_PAD) {
-        LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
-        cparams.n_batch = GGML_KQ_MASK_PAD;
+    bool gpu_backend = model.params.n_gpu_layers > 0;
+    if (gpu_backend && cparams.causal_attn) {
+        uint32_t padded = GGML_PAD(cparams.n_batch, GGML_KQ_MASK_PAD);
+        if (padded != cparams.n_batch) {
+            LLAMA_LOG_WARN("%s: n_batch padded from %u to %u due to GPU requirements\n", __func__, cparams.n_batch, padded);
+            cparams.n_batch = padded;
+        }
     }
 
     cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
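
Annotation (not part of the diff): the old check only raised n_batch to GGML_KQ_MASK_PAD when it was below that value; the new code rounds n_batch up to a multiple of it, and only when layers are offloaded (n_gpu_layers > 0) and causal attention is enabled. A sketch of the rounding step, assuming GGML_PAD(x, n) rounds x up to the next multiple of n; pad_up and the constant 64 below are illustrative, not quoted from the ggml headers:

    #include <cstdint>
    #include <cstdio>

    static uint32_t pad_up(uint32_t x, uint32_t n) { return (x + n - 1) / n * n; } // assumed GGML_PAD behavior

    int main() {
        const uint32_t kq_mask_pad = 64; // illustrative value only
        const uint32_t batches[]   = {1, 63, 64, 100, 512, 777};
        for (uint32_t n_batch : batches) {
            std::printf("n_batch %4u -> padded %4u\n", n_batch, pad_up(n_batch, kq_mask_pad));
        }
        return 0;
    }
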
src/llama-model.cpp (41 changes: 37 additions & 4 deletions)
@@ -117,6 +117,31 @@ static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_
     { LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" },
 };
 
+// RAII helper for temporary buffer assignment
+struct buffer_guard {
+    explicit buffer_guard(ggml_tensor * t, ggml_backend_buffer_type_t buft) : t(t) {
+        t->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
+    }
+    ~buffer_guard() {
+        if (t->buffer) {
+            ggml_backend_buffer_free(t->buffer);
+            t->buffer = nullptr;
+        }
+    }
+    ggml_tensor * t;
+};
+
+// cache for operation support checks
+struct op_support_key {
+    ggml_backend_dev_t dev;
+    ggml_backend_buffer_type_t buft;
+    ggml_op op;
+    bool operator<(const op_support_key & other) const {
+        return std::tie(dev, buft, op) < std::tie(other.dev, other.buft, other.op);
+    }
+};
+static std::map<op_support_key, bool> g_op_support_cache;
+
 std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type) {
     return LLAMA_ROPE_SCALING_TYPES.at(rope_scaling_type);
 }
@@ -135,6 +160,12 @@ static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::st
 static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
     GGML_ASSERT(w != nullptr);
 
+    op_support_key key { dev, buft, op };
+    auto it = g_op_support_cache.find(key);
+    if (it != g_op_support_cache.end()) {
+        return it->second;
+    }
+
     if (op == GGML_OP_NONE) {
         return true;
     }
@@ -245,10 +276,10 @@
 
     // create a temporary dummy buffer for the weight so that supports_op can check the buffer type
     GGML_ASSERT(w->buffer == nullptr);
-    w->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
+    buffer_guard guard(w, buft);
     bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
-    ggml_backend_buffer_free(w->buffer);
-    w->buffer = nullptr;
+
+    g_op_support_cache[key] = op_supported;
 
     return op_supported;
 }
@@ -262,7 +293,9 @@ static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hpara
     for (const auto & cur : buft_list) {
         ggml_backend_dev_t cur_dev = cur.first;
         ggml_backend_buffer_type_t cur_buft = cur.second;
-        if (weight_buft_supported(hparams, tensor, op, cur_buft, cur_dev)) {
+        bool should_offload = ggml_backend_dev_type(cur_dev) != GGML_BACKEND_DEVICE_TYPE_CPU ?
+            ggml_backend_dev_offload_op(cur_dev, tensor) : true;
+        if (should_offload && weight_buft_supported(hparams, tensor, op, cur_buft, cur_dev)) {
             return cur_buft;
         }
     }
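
Annotation (not part of the diff): buffer_guard replaces the manual alloc/free pair so the dummy buffer is released on every exit path out of weight_buft_supported, and g_op_support_cache memoizes the supports_op probe per (device, buffer type, op) triple. The RAII shape reduced to a standalone sketch with stand-in types (fake_tensor, guard and probe are hypothetical, not the ggml API):

    #include <cstdio>

    struct fake_tensor { void * buffer = nullptr; }; // stand-in for ggml_tensor

    struct guard {                                   // same pattern as buffer_guard above
        explicit guard(fake_tensor * t) : t(t) { t->buffer = new int(0); } // pretend allocation
        ~guard() {
            if (t->buffer) {
                delete static_cast<int *>(t->buffer);
                t->buffer = nullptr;
                std::puts("dummy buffer released");
            }
        }
        fake_tensor * t;
    };

    static bool probe(fake_tensor & w, bool early_exit) {
        guard g(&w);                      // buffer exists only for the duration of the check
        if (early_exit) { return false; } // released here as well, not just on the normal path
        return true;
    }

    int main() {
        fake_tensor w;
        probe(w, true);
        probe(w, false);
        return 0;
    }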