Skip to content

Commit 670d8b7

Browse files
committed
Merge remote-tracking branch 'upstream/master' into dry-sampling-post-refactor
2 parents 1d6ecb1 + 3752217 commit 670d8b7

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

42 files changed

+1830
-978
lines changed

README.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ variety of hardware - locally and in the cloud.
3131
- Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
3232
- AVX, AVX2 and AVX512 support for x86 architectures
3333
- 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
34-
- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP)
34+
- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads MTT GPUs via MUSA)
3535
- Vulkan and SYCL backend support
3636
- CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity
3737

@@ -130,6 +130,8 @@ Typically finetunes of the base models below are supported as well.
130130
- Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)
131131
- PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326)
132132
- Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp)
133+
- Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift)
134+
- Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama)
133135

134136
**UI:**
135137

@@ -413,7 +415,7 @@ Please refer to [Build llama.cpp locally](./docs/build.md)
413415
| [BLAS](./docs/build.md#blas-build) | All |
414416
| [BLIS](./docs/backend/BLIS.md) | All |
415417
| [SYCL](./docs/backend/SYCL.md) | Intel and Nvidia GPU |
416-
| [MUSA](./docs/build.md#musa) | Moore Threads GPU |
418+
| [MUSA](./docs/build.md#musa) | Moore Threads MTT GPU |
417419
| [CUDA](./docs/build.md#cuda) | Nvidia GPU |
418420
| [hipBLAS](./docs/build.md#hipblas) | AMD GPU |
419421
| [Vulkan](./docs/build.md#vulkan) | GPU |

common/arg.cpp

Lines changed: 131 additions & 153 deletions
Large diffs are not rendered by default.

common/common.cpp

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212

1313
#include <algorithm>
1414
#include <cinttypes>
15+
#include <climits>
1516
#include <cmath>
1617
#include <codecvt>
1718
#include <cstdarg>
@@ -23,10 +24,10 @@
2324
#include <regex>
2425
#include <sstream>
2526
#include <string>
27+
#include <thread>
2628
#include <unordered_map>
2729
#include <unordered_set>
2830
#include <vector>
29-
#include <thread>
3031

3132
#if defined(__APPLE__) && defined(__MACH__)
3233
#include <sys/types.h>
@@ -400,6 +401,21 @@ std::string common_params_get_system_info(const common_params & params) {
400401
// String utils
401402
//
402403

404+
std::string string_format(const char * fmt, ...) {
405+
va_list ap;
406+
va_list ap2;
407+
va_start(ap, fmt);
408+
va_copy(ap2, ap);
409+
int size = vsnprintf(NULL, 0, fmt, ap);
410+
GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
411+
std::vector<char> buf(size + 1);
412+
int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
413+
GGML_ASSERT(size2 == size);
414+
va_end(ap2);
415+
va_end(ap);
416+
return std::string(buf.data(), size);
417+
}
418+
403419
std::vector<std::string> string_split(std::string input, char separator) {
404420
std::vector<std::string> parts;
405421
size_t separator_pos = input.find(separator);
@@ -2092,6 +2108,8 @@ void yaml_dump_non_result_info(FILE * stream, const common_params & params, cons
20922108
fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
20932109
fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
20942110
fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
2111+
fprintf(stream, "xtc_probability: %f # default: 0.0\n", sparams.xtc_probability);
2112+
fprintf(stream, "xtc_threshold: %f # default: 0.1\n", sparams.xtc_threshold);
20952113
fprintf(stream, "typ_p: %f # default: 1.0\n", sparams.typ_p);
20962114
fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
20972115
fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");

common/common.h

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,8 @@ enum common_sampler_type {
9090
COMMON_SAMPLER_TYPE_TFS_Z = 4,
9191
COMMON_SAMPLER_TYPE_TYPICAL_P = 5,
9292
COMMON_SAMPLER_TYPE_TEMPERATURE = 6,
93+
COMMON_SAMPLER_TYPE_XTC = 7,
94+
COMMON_SAMPLER_TYPE_INFILL = 8,
9395
};
9496

9597
// dimensionality reduction methods, used by cvector-generator
@@ -108,6 +110,8 @@ struct common_sampler_params {
108110
int32_t top_k = 40; // <= 0 to use vocab size
109111
float top_p = 0.95f; // 1.0 = disabled
110112
float min_p = 0.05f; // 0.0 = disabled
113+
float xtc_probability = 0.00f; // 0.0 = disabled
114+
float xtc_threshold = 0.10f; // > 0.5 disables XTC
111115
float tfs_z = 1.00f; // 1.0 = disabled
112116
float typ_p = 1.00f; // typical_p, 1.0 = disabled
113117
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
@@ -130,13 +134,15 @@ struct common_sampler_params {
130134

131135
std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY
132136

137+
133138
std::vector<enum common_sampler_type> samplers = {
134139
COMMON_SAMPLER_TYPE_TOP_K,
135140
COMMON_SAMPLER_TYPE_TFS_Z,
136141
COMMON_SAMPLER_TYPE_TYPICAL_P,
137142
COMMON_SAMPLER_TYPE_TOP_P,
138143
COMMON_SAMPLER_TYPE_MIN_P,
139-
COMMON_SAMPLER_TYPE_TEMPERATURE
144+
COMMON_SAMPLER_TYPE_XTC,
145+
COMMON_SAMPLER_TYPE_TEMPERATURE,
140146
};
141147

142148
std::string grammar; // optional BNF-like grammar to constrain sampling
@@ -283,12 +289,12 @@ struct common_params {
283289
int32_t port = 8080; // server listens on this network port
284290
int32_t timeout_read = 600; // http read timeout in seconds
285291
int32_t timeout_write = timeout_read; // http write timeout in seconds
286-
int n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
292+
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
293+
int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
287294

288295
std::string hostname = "127.0.0.1";
289296
std::string public_path = ""; // NOLINT
290297
std::string chat_template = ""; // NOLINT
291-
std::string system_prompt = ""; // NOLINT
292298
bool enable_chat_template = true;
293299

294300
std::vector<std::string> api_keys;
@@ -358,15 +364,28 @@ void common_init();
358364

359365
std::string common_params_get_system_info(const common_params & params);
360366

361-
bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]);
362-
bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
363-
void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr);
367+
bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]);
368+
bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
369+
void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr);
364370
bool set_process_priority(enum ggml_sched_priority prio);
365371

366372
//
367373
// String utils
368374
//
369375

376+
#ifdef __GNUC__
377+
#ifdef __MINGW32__
378+
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
379+
#else
380+
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
381+
#endif
382+
#else
383+
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
384+
#endif
385+
386+
LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
387+
std::string string_format(const char * fmt, ...);
388+
370389
std::vector<std::string> string_split(std::string input, char separator);
371390

372391
std::string string_strip(const std::string & str);

common/json-schema-to-grammar.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -611,7 +611,7 @@ class SchemaConverter {
611611
}
612612
return join_seq();
613613
};
614-
return _add_rule(name, "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space");
614+
return _add_rule(name, "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space");
615615
}
616616

617617
/*

common/sampling.cpp

Lines changed: 23 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -129,14 +129,14 @@ std::string common_sampler_params::print() const {
129129
char result[1024];
130130

131131
snprintf(result, sizeof(result),
132-
"\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
133-
"\tdry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d, dry_penalty_last_n = %d\n"
134-
"\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
135-
"\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
136-
penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
137-
dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n,
138-
top_k, tfs_z, top_p, min_p, typ_p, temp,
139-
mirostat, mirostat_eta, mirostat_tau);
132+
"\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
133+
"\tdry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d, dry_penalty_last_n = %d\n"
134+
"\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, temp = %.3f\n"
135+
"\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
136+
penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
137+
dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n,
138+
top_k, tfs_z, top_p, min_p, xtc_probability, xtc_threshold, typ_p, temp,
139+
mirostat, mirostat_eta, mirostat_tau);
140140

141141
return std::string(result);
142142
}
@@ -193,6 +193,9 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
193193
case COMMON_SAMPLER_TYPE_MIN_P:
194194
llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
195195
break;
196+
case COMMON_SAMPLER_TYPE_XTC:
197+
llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
198+
break;
196199
case COMMON_SAMPLER_TYPE_TFS_Z:
197200
llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep));
198201
break;
@@ -202,6 +205,9 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
202205
case COMMON_SAMPLER_TYPE_TEMPERATURE:
203206
llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
204207
break;
208+
case COMMON_SAMPLER_TYPE_INFILL:
209+
llama_sampler_chain_add(result->chain, llama_sampler_init_infill (model));
210+
break;
205211
default:
206212
GGML_ASSERT(false && "unknown sampler type");
207213
}
@@ -381,6 +387,8 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
381387
case COMMON_SAMPLER_TYPE_TOP_P: return 'p';
382388
case COMMON_SAMPLER_TYPE_MIN_P: return 'm';
383389
case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
390+
case COMMON_SAMPLER_TYPE_XTC: return 'x';
391+
case COMMON_SAMPLER_TYPE_INFILL: return 'i';
384392
default : return '?';
385393
}
386394
}
@@ -393,6 +401,8 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
393401
case COMMON_SAMPLER_TYPE_TOP_P: return "top_p";
394402
case COMMON_SAMPLER_TYPE_MIN_P: return "min_p";
395403
case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
404+
case COMMON_SAMPLER_TYPE_XTC: return "xtc";
405+
case COMMON_SAMPLER_TYPE_INFILL: return "infill";
396406
default : return "";
397407
}
398408
}
@@ -405,6 +415,8 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
405415
{ "min_p", COMMON_SAMPLER_TYPE_MIN_P },
406416
{ "tfs_z", COMMON_SAMPLER_TYPE_TFS_Z },
407417
{ "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
418+
{ "xtc", COMMON_SAMPLER_TYPE_XTC },
419+
{ "infill", COMMON_SAMPLER_TYPE_INFILL },
408420
};
409421

410422
// since samplers names are written multiple ways
@@ -450,7 +462,9 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
450462
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P), COMMON_SAMPLER_TYPE_TYPICAL_P },
451463
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P), COMMON_SAMPLER_TYPE_TOP_P },
452464
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P), COMMON_SAMPLER_TYPE_MIN_P },
453-
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE }
465+
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
466+
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC), COMMON_SAMPLER_TYPE_XTC },
467+
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_INFILL), COMMON_SAMPLER_TYPE_INFILL },
454468
};
455469

456470
std::vector<common_sampler_type> samplers;

docs/build.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,8 @@ The following compilation options are also available to tweak performance:
198198
199199
### MUSA
200200
201+
This provides GPU acceleration using the MUSA cores of your Moore Threads MTT GPU. Make sure to have the MUSA SDK installed. You can download it from here: [MUSA SDK](https://developer.mthreads.com/sdk/download/musa).
202+
201203
- Using `make`:
202204
```bash
203205
make GGML_MUSA=1
@@ -209,6 +211,12 @@ The following compilation options are also available to tweak performance:
209211
cmake --build build --config Release
210212
```
211213
214+
The environment variable [`MUSA_VISIBLE_DEVICES`](https://docs.mthreads.com/musa-sdk/musa-sdk-doc-online/programming_guide/Z%E9%99%84%E5%BD%95/) can be used to specify which GPU(s) will be used.
215+
216+
The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted.
217+
218+
Most of the compilation options available for CUDA should also be available for MUSA, though they haven't been thoroughly tested yet.
219+
212220
### hipBLAS
213221

214222
This provides BLAS acceleration on HIP-supported AMD GPUs.

examples/infill/infill.cpp

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -205,11 +205,11 @@ int main(int argc, char ** argv) {
205205
std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
206206
std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);
207207

208-
GGML_ASSERT(llama_token_prefix(model) >= 0);
209-
GGML_ASSERT(llama_token_suffix(model) >= 0);
208+
GGML_ASSERT(llama_token_fim_pre(model) >= 0);
209+
GGML_ASSERT(llama_token_fim_suf(model) >= 0);
210210

211-
inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
212-
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
211+
inp_pfx.insert(inp_pfx.begin(), llama_token_fim_pre(model));
212+
inp_sfx.insert(inp_sfx.begin(), llama_token_fim_suf(model));
213213

214214
embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
215215
embd_end = params.spm_infill ? inp_pfx : inp_sfx;
@@ -218,7 +218,7 @@ int main(int argc, char ** argv) {
218218
}
219219
embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
220220

221-
const llama_token middle_token = llama_token_middle(model);
221+
const llama_token middle_token = llama_token_fim_mid(model);
222222
if (middle_token >= 0) {
223223
embd_inp.push_back(middle_token);
224224
}
@@ -508,8 +508,8 @@ int main(int argc, char ** argv) {
508508
std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
509509
std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);
510510

511-
inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
512-
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
511+
inp_pfx.insert(inp_pfx.begin(), llama_token_fim_pre(model));
512+
inp_sfx.insert(inp_sfx.begin(), llama_token_fim_suf(model));
513513

514514
embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
515515
embd_end = params.spm_infill ? inp_pfx : inp_sfx;

examples/json_schema_to_grammar.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -540,7 +540,7 @@ def join_seq():
540540
return self._add_rule(
541541
name,
542542
to_rule(transform()) if self._raw_pattern \
543-
else "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space")
543+
else "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space")
544544

545545

546546
def _resolve_ref(self, ref):

examples/llava/llava.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -432,7 +432,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
432432
bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos);
433433
if (!image_embed_result) {
434434
clip_image_u8_free(img);
435-
LOG_ERR("%s: coulnd't embed the image\n", __func__);
435+
LOG_ERR("%s: couldn't embed the image\n", __func__);
436436
return NULL;
437437
}
438438

0 commit comments

Comments
 (0)