
Commit 66dea01

Merge branch 'ggerganov:master' into vulkan

2 parents: d8072e7 + 53ff6b9


66 files changed: +2135 / -1887 lines

.github/ISSUE_TEMPLATE/010-bug-compilation.yml

Lines changed: 11 additions & 1 deletion
@@ -65,12 +65,22 @@ body:
         If possible, please do a git bisect and identify the exact commit that introduced the bug.
     validations:
       required: false
+  - type: textarea
+    id: command
+    attributes:
+      label: Compile command
+      description: >
+        Please provide the exact command you used to compile llama.cpp. For example: `cmake -B ...`.
+        This will be automatically formatted into code, so no need for backticks.
+      render: shell
+    validations:
+      required: true
   - type: textarea
     id: logs
     attributes:
       label: Relevant log output
       description: >
-        Please copy and paste any relevant log output, including the command that you entered and any generated text.
+        Please copy and paste any relevant log output, including any generated text.
         This will be automatically formatted into code, so no need for backticks.
       render: shell
     validations:

.github/ISSUE_TEMPLATE/019-bug-misc.yml

Lines changed: 11 additions & 1 deletion
@@ -52,6 +52,16 @@ body:
         - Other (Please specify in the next section)
     validations:
       required: false
+  - type: textarea
+    id: command
+    attributes:
+      label: Command line
+      description: >
+        Please provide the exact commands you entered, if applicable. For example: `llama-server -m ... -c ...`, `llama-cli -m ...`, etc.
+        This will be automatically formatted into code, so no need for backticks.
+      render: shell
+    validations:
+      required: false
   - type: textarea
     id: info
     attributes:
@@ -74,7 +84,7 @@ body:
     attributes:
       label: Relevant log output
       description: >
-        If applicable, please copy and paste any relevant log output, including the command that you entered and any generated text.
+        If applicable, please copy and paste any relevant log output, including any generated text.
         This will be automatically formatted into code, so no need for backticks.
       render: shell
     validations:

CODEOWNERS

Lines changed: 7 additions & 1 deletion
@@ -1,5 +1,11 @@
 # collaborators can optionally add themselves here to indicate their availability for reviewing related PRs

 /ci/ @ggerganov
-/.devops/ @ngxson
+/.devops/*.Dockerfile @ngxson
 /examples/server/ @ngxson
+/ggml/src/ggml-cuda/fattn* @JohannesGaessler
+/ggml/src/ggml-cuda/mmq.* @JohannesGaessler
+/ggml/src/ggml-cuda/mmv.* @JohannesGaessler
+/ggml/src/ggml-cuda/mmvq.* @JohannesGaessler
+/ggml/src/ggml-opt.cpp @JohannesGaessler
+/ggml/src/gguf.cpp @JohannesGaessler

common/common.cpp

Lines changed: 11 additions & 8 deletions
@@ -2,6 +2,9 @@
 #define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
 #endif

+#include "ggml.h"
+#include "gguf.h"
+
 #include "common.h"
 #include "log.h"
 // Change JSON_ASSERT from assert() to GGML_ASSERT:
@@ -846,7 +849,7 @@ struct common_init_result common_init_from_params(common_params & params) {
     } else if (!params.model_url.empty()) {
         model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
     } else {
-        model = llama_load_model_from_file(params.model.c_str(), mparams);
+        model = llama_model_load_from_file(params.model.c_str(), mparams);
     }

     if (model == NULL) {
@@ -873,7 +876,7 @@ struct common_init_result common_init_from_params(common_params & params) {
     }

     if (!ok) {
-        llama_free_model(model);
+        llama_model_free(model);

         return iparams;
     }
@@ -884,7 +887,7 @@ struct common_init_result common_init_from_params(common_params & params) {
     llama_context * lctx = llama_new_context_with_model(model, cparams);
     if (lctx == NULL) {
         LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
-        llama_free_model(model);
+        llama_model_free(model);
         return iparams;
     }

@@ -900,7 +903,7 @@ struct common_init_result common_init_from_params(common_params & params) {
     const auto cvec = common_control_vector_load(params.control_vectors);
     if (cvec.n_embd == -1) {
         llama_free(lctx);
-        llama_free_model(model);
+        llama_model_free(model);

         return iparams;
     }
@@ -913,7 +916,7 @@ struct common_init_result common_init_from_params(common_params & params) {
                                              params.control_vector_layer_end);
     if (err) {
         llama_free(lctx);
-        llama_free_model(model);
+        llama_model_free(model);

         return iparams;
     }
@@ -926,7 +929,7 @@ struct common_init_result common_init_from_params(common_params & params) {
         if (lora == nullptr) {
             LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
             llama_free(lctx);
-            llama_free_model(model);
+            llama_model_free(model);
             return iparams;
         }

@@ -982,7 +985,7 @@ struct common_init_result common_init_from_params(common_params & params) {
     if (llama_model_has_encoder(model)) {
         llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()));
         llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
-        if (decoder_start_token_id == -1) {
+        if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
            decoder_start_token_id = bos;
         }
         tmp.clear();
@@ -1411,7 +1414,7 @@ struct llama_model * common_load_model_from_url(
         }
     }

-    return llama_load_model_from_file(local_path.c_str(), params);
+    return llama_model_load_from_file(local_path.c_str(), params);
 }

 struct llama_model * common_load_model_from_hf(
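
The recurring change in this file, and in the examples further down, is an API rename: llama_load_model_from_file becomes llama_model_load_from_file and llama_free_model becomes llama_model_free. A minimal sketch of the updated lifecycle, assuming a llama.h that already declares the renamed entry points (the model path is a placeholder):

    #include "llama.h"

    int main() {
        llama_model_params mparams = llama_model_default_params();

        // new name (was: llama_load_model_from_file)
        llama_model * model = llama_model_load_from_file("model.gguf", mparams);
        if (model == NULL) {
            return 1;
        }

        // ... create a context and run inference as before ...

        // new name (was: llama_free_model)
        llama_model_free(model);
        return 0;
    }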

common/ngram-cache.cpp

Lines changed: 12 additions & 12 deletions
@@ -65,13 +65,13 @@ constexpr int draft_min_percent_strict[LLAMA_NGRAM_MAX] = {75, 66, 66, 66};
 static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram ngram_static) {
     common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
     if (part_static_it == nc_static.end()) {
-        return -1;
+        return LLAMA_TOKEN_NULL;
     }
     const common_ngram_cache_part part_static = part_static_it->second;

     int max_count_static = 0;
     int sum_count_static = 0;
-    llama_token max_token = -1;
+    llama_token max_token = LLAMA_TOKEN_NULL;

     for (std::pair<llama_token, int> token_count_static : part_static) {
         const llama_token token = token_count_static.first;
@@ -85,10 +85,10 @@ static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram
     }

     if (sum_count_static < draft_min_sample_size_lax[LLAMA_NGRAM_STATIC-1]) {
-        return -1;
+        return LLAMA_TOKEN_NULL;
     }
     if (100*max_count_static < draft_min_percent_lax[LLAMA_NGRAM_STATIC-1]*sum_count_static) {
-        return -1;
+        return LLAMA_TOKEN_NULL;
     }
     return max_token;
 }
@@ -98,9 +98,9 @@ static llama_token try_draft(
     common_ngram_cache & nc_primary, const std::vector<common_ngram> & ngrams_primary, common_ngram_cache_part & part_static,
     const int * min_sample_size, const int * min_percent) {

-    llama_token drafted_token = -1;
+    llama_token drafted_token = LLAMA_TOKEN_NULL;

-    for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == -1; --i) {
+    for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == LLAMA_TOKEN_NULL; --i) {
         const common_ngram ngram_primary = ngrams_primary[i];

         common_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary);
@@ -112,7 +112,7 @@ static llama_token try_draft(
         int max_count_primary = 0;
         int max_count_static = 0;
         int sum_count_primary = 0;
-        llama_token max_token = -1;
+        llama_token max_token = LLAMA_TOKEN_NULL;

         for (std::pair<llama_token, int> token_count_primary : part_primary) {
             const llama_token token = token_count_primary.first;
@@ -154,7 +154,7 @@ void common_ngram_cache_draft(
     }

     while ((int) draft.size()-1 < n_draft) {
-        llama_token drafted_token = -1;
+        llama_token drafted_token = LLAMA_TOKEN_NULL;

         const int ngram_start_static = inp_size-LLAMA_NGRAM_STATIC + draft.size()-1;
         common_ngram ngram_static;
@@ -177,17 +177,17 @@ void common_ngram_cache_draft(
            }
             ngrams_cd.push_back(ngram_cd);
         }
-        if (drafted_token == -1) {
+        if (drafted_token == LLAMA_TOKEN_NULL) {
             drafted_token = try_draft(nc_context, ngrams_cd, part_static, draft_min_sample_size_lax, draft_min_percent_lax);
         }
-        if (drafted_token == -1) {
+        if (drafted_token == LLAMA_TOKEN_NULL) {
             drafted_token = try_draft(nc_dynamic, ngrams_cd, part_static, draft_min_sample_size_strict, draft_min_percent_strict);
         }
-        if (drafted_token == -1) {
+        if (drafted_token == LLAMA_TOKEN_NULL) {
             drafted_token = try_draft(nc_static, ngram_static);
         }

-        if (drafted_token == -1) {
+        if (drafted_token == LLAMA_TOKEN_NULL) {
             break;
         }


common/ngram-cache.h

Lines changed: 2 additions & 2 deletions
@@ -17,13 +17,13 @@ struct common_ngram {

     common_ngram() {
         for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
-            tokens[i] = -1;
+            tokens[i] = LLAMA_TOKEN_NULL;
         }
     }

     common_ngram(const llama_token * input, const int ngram_size) {
         for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
-            tokens[i] = i < ngram_size ? input[i] : -1;
+            tokens[i] = i < ngram_size ? input[i] : LLAMA_TOKEN_NULL;
         }
     }

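
The other recurring change is the sentinel value: a hard-coded -1 meaning "no token" is replaced by the LLAMA_TOKEN_NULL constant throughout the n-gram cache and the examples. A small illustrative sketch of the idiom after this change (pick_first_valid is a hypothetical helper, not part of the diff; LLAMA_TOKEN_NULL comes from llama.h):

    #include "llama.h"

    // Return the first usable token, or LLAMA_TOKEN_NULL if none is found.
    static llama_token pick_first_valid(const llama_token * tokens, int n) {
        llama_token found = LLAMA_TOKEN_NULL;                      // was: llama_token found = -1;
        for (int i = 0; i < n && found == LLAMA_TOKEN_NULL; ++i) { // was: found == -1
            if (tokens[i] != LLAMA_TOKEN_NULL) {
                found = tokens[i];
            }
        }
        return found;
    }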

examples/batched-bench/batched-bench.cpp

Lines changed: 2 additions & 2 deletions
@@ -38,7 +38,7 @@ int main(int argc, char ** argv) {

     llama_model_params model_params = common_model_params_to_llama(params);

-    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
+    llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);

     if (model == NULL) {
         fprintf(stderr , "%s: error: unable to load model\n" , __func__);
@@ -194,7 +194,7 @@ int main(int argc, char ** argv) {
     llama_batch_free(batch);

     llama_free(ctx);
-    llama_free_model(model);
+    llama_model_free(model);

     llama_backend_free();

examples/batched/batched.cpp

Lines changed: 3 additions & 3 deletions
@@ -41,7 +41,7 @@ int main(int argc, char ** argv) {

     llama_model_params model_params = common_model_params_to_llama(params);

-    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
+    llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);

     if (model == NULL) {
         LOG_ERR("%s: error: unable to load model\n" , __func__);
@@ -120,7 +120,7 @@ int main(int argc, char ** argv) {
     }

     llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
-    if (decoder_start_token_id == -1) {
+    if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
         decoder_start_token_id = llama_token_bos(model);
     }

@@ -236,7 +236,7 @@ int main(int argc, char ** argv) {

     llama_sampler_free(smpl);
     llama_free(ctx);
-    llama_free_model(model);
+    llama_model_free(model);

     llama_backend_free();

examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp

Lines changed: 4 additions & 2 deletions
@@ -1,4 +1,6 @@
 #include "ggml.h"
+#include "gguf.h"
+
 #include "llama.h"
 #include "common.h"
 #include "log.h"
@@ -689,8 +691,8 @@ static void save_as_llama_model(
     gguf_set_val_u32(ctx, KV_TOKENIZER_UNK_ID, UNKNOWN_TOKEN_ID);
     gguf_set_val_u32(ctx, KV_TOKENIZER_BOS_ID, BOS_TOKEN_ID);
     gguf_set_val_u32(ctx, KV_TOKENIZER_EOS_ID, EOS_TOKEN_ID);
-    gguf_set_val_u32(ctx, KV_TOKENIZER_SEP_ID, -1);
-    gguf_set_val_u32(ctx, KV_TOKENIZER_PAD_ID, -1);
+    gguf_set_val_u32(ctx, KV_TOKENIZER_SEP_ID, LLAMA_TOKEN_NULL);
+    gguf_set_val_u32(ctx, KV_TOKENIZER_PAD_ID, LLAMA_TOKEN_NULL);

     gguf_set_val_u32(ctx, KV_CONTEXT_LENGTH, model->hparams.n_ctx);
     gguf_set_val_u32(ctx, KV_EMBEDDING_LENGTH, model->hparams.n_embd);

examples/cvector-generator/cvector-generator.cpp

Lines changed: 3 additions & 1 deletion
@@ -1,7 +1,9 @@
+#include "ggml.h"
+#include "gguf.h"
+
 #include "arg.h"
 #include "common.h"
 #include "llama.h"
-#include "ggml.h"
 #include "pca.hpp"
 #include "mean.hpp"

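
Several files in this commit add an explicit #include "gguf.h" next to #include "ggml.h", reflecting that the GGUF key/value API is now declared in its own header. A minimal sketch of code that relies on that API after the split, assuming the usual GGUF helpers (gguf_init_empty, gguf_set_val_u32, gguf_write_to_file, gguf_free); the file name and keys are illustrative:

    #include "ggml.h"
    #include "gguf.h"   // GGUF metadata API now comes from its own header

    // Illustrative only: write two u32 metadata keys to a standalone GGUF file.
    static void write_example_gguf(const char * fname) {
        struct gguf_context * ctx = gguf_init_empty();

        gguf_set_val_u32(ctx, "example.context_length",   4096);
        gguf_set_val_u32(ctx, "example.embedding_length", 4096);

        gguf_write_to_file(ctx, fname, /*only_meta =*/ true);
        gguf_free(ctx);
    }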
