
Commit 6add36a

Merge branch 'ggerganov:master' into master
2 parents: 79b2c38 + 6f55bcc

7 files changed: +50 −61 lines

README.md

Lines changed: 1 addition & 0 deletions
@@ -131,6 +131,7 @@ Typically finetunes of the base models below are supported as well.
 - PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326)
 - Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp)
 - Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift)
+- Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama)
 
 **UI:**

examples/main/README.md

Lines changed: 0 additions & 4 deletions
@@ -297,10 +297,6 @@ These options help improve the performance and memory usage of the LLaMA models.
 
 These flags attempt optimizations that help on some systems with non-uniform memory access. This currently consists of one of the above strategies, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop_caches' as root.
 
-### Memory Float 32
-
-- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. This doubles the context memory requirement and cached prompt file size but does not appear to increase generation quality in a measurable way. Not recommended.
-
 ### Batch Size
 
 - `-b N, --batch-size N`: Set the batch size for prompt processing (default: `2048`). This large batch size benefits users who have BLAS installed and enabled it during the build. If you don't have BLAS enabled ("BLAS=0"), you can use a smaller number, such as 8, to see the prompt progress as it's evaluated in some situations.
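The NUMA note retained in the context above mentions dropping the Linux page cache before re-running so that mapped pages are faulted in again. As a minimal illustrative sketch (not part of this commit; Linux-only and must run as root), the suggested write of '3' to /proc/sys/vm/drop_caches could also be done programmatically:

#include <fstream>
#include <iostream>
#include <unistd.h>  // sync()

// Flush dirty pages, then drop the page cache, dentries and inodes,
// equivalent to `sync; echo 3 | sudo tee /proc/sys/vm/drop_caches`.
int main() {
    sync();

    std::ofstream f("/proc/sys/vm/drop_caches");
    if (!f.is_open()) {
        std::cerr << "cannot open /proc/sys/vm/drop_caches (run as root on Linux)\n";
        return 1;
    }
    f << "3\n";
    return f.good() ? 0 : 1;
}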

include/llama.h

Lines changed: 0 additions & 6 deletions
@@ -953,12 +953,6 @@ extern "C" {
                            int32_t   lstrip,
                               bool   special);
 
-    // check if token0 is contained as a prefix in token1
-    LLAMA_API bool llama_token_is_prefix(
-              const struct llama_model * model,
-                           llama_token   token0,
-                           llama_token   token1);
-
     /// @details Convert the provided tokens into text (inverse of llama_tokenize()).
     /// @param text The char pointer must be large enough to hold the resulting text.
     /// @return Returns the number of chars/bytes on success, no more than text_len_max.
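This removes `llama_token_is_prefix` from the public C API (the inline replacement lives in `src/llama-sampling.cpp` below). For downstream code that still needs the check, a minimal sketch of an equivalent helper built on the public `llama_token_to_piece` call could look like the following; the `token_is_prefix` name and the 256-byte starting buffers are illustrative, not part of llama.cpp:

#include <cstring>
#include <vector>

#include "llama.h"

// Illustrative stand-in for the removed llama_token_is_prefix():
// true if the text piece of token0 is a byte prefix of the piece of token1.
static bool token_is_prefix(const struct llama_model * model,
                            llama_token token0,
                            llama_token token1) {
    std::vector<char> buf0(256);
    std::vector<char> buf1(256);

    const int32_t len0 = llama_token_to_piece(model, token0, buf0.data(), (int32_t) buf0.size(), 0, false);
    const int32_t len1 = llama_token_to_piece(model, token1, buf1.data(), (int32_t) buf1.size(), 0, false);

    if (len0 <= 0 || len1 <= 0) {
        return false; // conversion failed or the buffer was too small
    }

    return len0 <= len1 && memcmp(buf0.data(), buf1.data(), len0) == 0;
}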

scripts/run-with-preset.py

Lines changed: 3 additions & 3 deletions
@@ -15,7 +15,7 @@
     "export", "file", "frequency-penalty", "grammar", "grammar-file", "hellaswag",
     "hellaswag-tasks", "ignore-eos", "in-prefix", "in-prefix-bos", "in-suffix",
     "interactive", "interactive-first", "keep", "logdir", "logit-bias", "lora", "lora-base",
-    "low-vram", "main-gpu", "memory-f32", "mirostat", "mirostat-ent", "mirostat-lr", "mlock",
+    "low-vram", "main-gpu", "mirostat", "mirostat-ent", "mirostat-lr", "mlock",
     "model", "multiline-input", "n-gpu-layers", "n-predict", "no-mmap", "no-mul-mat-q",
     "np-penalize-nl", "numa", "ppl-output-type", "ppl-stride", "presence-penalty", "prompt",
     "prompt-cache", "prompt-cache-all", "prompt-cache-ro", "repeat-last-n",
@@ -25,12 +25,12 @@
 ]
 
 CLI_ARGS_LLAMA_BENCH = [
-    "batch-size", "memory-f32", "low-vram", "model", "mul-mat-q", "n-gen", "n-gpu-layers",
+    "batch-size", "low-vram", "model", "mul-mat-q", "n-gen", "n-gpu-layers",
     "n-prompt", "output", "repetitions", "tensor-split", "threads", "verbose"
 ]
 
 CLI_ARGS_LLAMA_SERVER = [
-    "alias", "batch-size", "ctx-size", "embedding", "host", "memory-f32", "lora", "lora-base",
+    "alias", "batch-size", "ctx-size", "embedding", "host", "lora", "lora-base",
     "low-vram", "main-gpu", "mlock", "model", "n-gpu-layers", "n-probs", "no-mmap", "no-mul-mat-q",
     "numa", "path", "port", "rope-freq-base", "timeout", "rope-freq-scale", "tensor-split",
     "threads", "verbose"

src/llama-sampling.cpp

Lines changed: 35 additions & 13 deletions
@@ -1745,6 +1745,9 @@ struct llama_sampler * llama_sampler_init_logit_bias(
 
 struct llama_sampler_infill {
     const struct llama_vocab * vocab;
+
+    std::vector<char> buf0;
+    std::vector<char> buf1;
 };
 
 static const char * llama_sampler_infill_name(const struct llama_sampler * /*smpl*/) {
@@ -1810,27 +1813,44 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
     size_t n_combined = 0; GGML_UNUSED(n_combined);
 
     // combine tokens with common prefix
-    for (size_t i = 0; i < cur_p->size; ++i) {
-        for (size_t j = 0; j < cur_p->size; ++j) {
-            if (cur_p->data[i].logit == -INFINITY) {
+    for (size_t i0 = 0; i0 < cur_p->size; ++i0) {
+        for (size_t i1 = 0; i1 < cur_p->size; ++i1) {
+            if (cur_p->data[i0].logit == -INFINITY) {
                 break;
             }
 
-            if (i == j || cur_p->data[j].logit == -INFINITY) {
+            if (i0 == i1 || cur_p->data[i1].logit == -INFINITY) {
                 continue;
             }
 
-            if (llama_token_is_prefix_impl(*ctx->vocab, cur_p->data[i].id, cur_p->data[j].id)) {
-                if (cur_p->data[i].p > cur_p->data[j].p) {
-                    cur_p->data[i].p += cur_p->data[j].p;
-                    cur_p->data[j].logit = -INFINITY;
-                    cur_p->data[j].p = 0.0f;
-                } else {
-                    cur_p->data[j].p += cur_p->data[i].p;
-                    cur_p->data[i].logit = -INFINITY;
-                    cur_p->data[i].p = 0.0f;
+            int len0 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
+            if (len0 < 0) {
+                ctx->buf0.resize(len0);
+                len0 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
+                assert(len0 > 0);
+            }
+
+            int len1 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
+            if (len1 < 0) {
+                ctx->buf1.resize(len1);
+                len1 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
+                assert(len1 > 0);
+            }
+
+            // token i0 is a prefix of token i1
+            if (len0 > 0 && len0 <= len1 && memcmp(ctx->buf0.data(), ctx->buf1.data(), len0) == 0) {
+                int dst = i0;
+                int src = i1;
+
+                // merge into the token with higher probability
+                if (cur_p->data[i1].p > cur_p->data[i0].p) {
+                    std::swap(dst, src);
                 }
 
+                cur_p->data[dst].p += cur_p->data[src].p;
+                cur_p->data[src].logit = -INFINITY;
+                cur_p->data[src].p = 0.0f;
+
                 n_combined++;
             }
         }
@@ -1936,6 +1956,8 @@ struct llama_sampler * llama_sampler_init_infill_impl(
         /* .iface = */ &llama_sampler_infill_i,
         /* .ctx   = */ new llama_sampler_infill {
             /* .vocab = */ &vocab,
+            /* .buf0 = */ std::vector<char>(512),
+            /* .buf1 = */ std::vector<char>(512),
         },
     };
 }
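Read in isolation, the candidate-merging rule that the rewritten infill loop above applies amounts to the sketch below. This is a standalone illustration rather than code from the commit; `piece0`/`piece1` stand for the already-detokenized texts of two candidates, and the `merge_if_prefix` helper is hypothetical:

#include <cmath>    // INFINITY
#include <cstring>  // memcmp
#include <utility>  // std::swap

#include "llama.h"

// If piece0 is a byte prefix of piece1, fold the two candidates together:
// keep the one with the higher probability, add the other's probability to it,
// and mask the loser out with a -INFINITY logit.
static void merge_if_prefix(llama_token_data & t0, llama_token_data & t1,
                            const char * piece0, int len0,
                            const char * piece1, int len1) {
    if (len0 <= 0 || len0 > len1 || memcmp(piece0, piece1, len0) != 0) {
        return; // piece0 is not a prefix of piece1
    }

    llama_token_data * dst = &t0;
    llama_token_data * src = &t1;

    if (t1.p > t0.p) {
        std::swap(dst, src); // merge into the more probable candidate
    }

    dst->p     += src->p;
    src->logit  = -INFINITY;
    src->p      = 0.0f;
}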

src/llama-vocab.cpp

Lines changed: 0 additions & 17 deletions
@@ -1858,23 +1858,6 @@ int32_t llama_token_to_piece_impl(const struct llama_vocab & vocab, llama_token
     return 0;
 }
 
-bool llama_token_is_prefix_impl(
-        const struct llama_vocab & vocab,
-        llama_token token0,
-        llama_token token1) {
-    char text_buf_0[128];
-    char text_buf_1[128];
-
-    const int32_t len0 = llama_token_to_piece_impl(vocab, token0, text_buf_0, sizeof(text_buf_0) - 1, 0, false);
-    const int32_t len1 = llama_token_to_piece_impl(vocab, token1, text_buf_1, sizeof(text_buf_1) - 1, 0, false);
-
-    if (len0 <= 0 || len1 <= 0) {
-        return false;
-    }
-
-    return len0 <= len1 && memcmp(text_buf_0, text_buf_1, len0) == 0;
-}
-
 int32_t llama_detokenize_impl(
         const struct llama_vocab & vocab,
         const llama_token * tokens,

src/llama.cpp

Lines changed: 11 additions & 18 deletions
@@ -6735,9 +6735,9 @@ static void llm_load_vocab(
                 vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
             }
         } else {
-            // token is control, but not marked as EOG -> print a warning
+            // token is control, but not marked as EOG -> print a debug log
             if (vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && vocab.special_eog_ids.count(t.second) == 0) {
-                LLAMA_LOG_WARN("%s: control token: %6d '%s' is not marked as EOG\n",
+                LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
                     __func__, t.second, t.first.c_str());
             }
         }
@@ -17134,10 +17134,10 @@ static void llama_graph_compute(
 //
 static int llama_decode_internal(
          llama_context & lctx,
-           llama_batch   batch_all) { // TODO: rename back to batch
+           llama_batch   batch) {
 
     lctx.is_encoding = false;
-    const uint32_t n_tokens_all = batch_all.n_tokens;
+    const uint32_t n_tokens_all = batch.n_tokens;
 
     if (n_tokens_all == 0) {
         LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
@@ -17148,12 +17148,12 @@ static int llama_decode_internal(
     const auto & hparams = model.hparams;
     const auto & cparams = lctx.cparams;
 
-    GGML_ASSERT((!batch_all.token && batch_all.embd) || (batch_all.token && !batch_all.embd)); // NOLINT
+    GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
 
-    if (batch_all.token) {
+    if (batch.token) {
         for (uint32_t i = 0; i < n_tokens_all; ++i) {
-            if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= model.vocab.n_vocab) {
-                LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch_all.token[i]);
+            if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= model.vocab.n_vocab) {
+                LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]);
                 return -1;
             }
         }
@@ -17184,9 +17184,9 @@ static int llama_decode_internal(
     lctx.embd_seq.clear();
 
     // count outputs
-    if (batch_all.logits && !embd_pooled) {
+    if (batch.logits && !embd_pooled) {
         for (uint32_t i = 0; i < n_tokens_all; ++i) {
-            n_outputs += batch_all.logits[i] != 0;
+            n_outputs += batch.logits[i] != 0;
         }
     } else if (lctx.logits_all || embd_pooled) {
         n_outputs = n_tokens_all;
@@ -17195,7 +17195,7 @@
         n_outputs = 1;
     }
 
-    lctx.sbatch.from_batch(batch_all, n_embd,
+    lctx.sbatch.from_batch(batch, n_embd,
         /* simple_split */ !kv_self.recurrent,
         /* logits_all   */ n_outputs == n_tokens_all);
 
@@ -21466,13 +21466,6 @@ int32_t llama_token_to_piece(
     return llama_token_to_piece_impl(model->vocab, token, buf, length, lstrip, special);
 }
 
-bool llama_token_is_prefix(
-    const struct llama_model * model,
-    llama_token token0,
-    llama_token token1) {
-    return llama_token_is_prefix_impl(model->vocab, token0, token1);
-}
-
 int32_t llama_detokenize(
     const struct llama_model * model,
     const llama_token * tokens,
