Skip to content

Commit 5a699f1

Browse files
committed
server : accept extra_context for the infill endpoint
ggml-ci
1 parent c7181bd commit 5a699f1

File tree

2 files changed: +78 additions, −22 deletions

examples/server/server.cpp

Lines changed: 40 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,8 @@ struct slot_params {
139139

140140
json input_prefix;
141141
json input_suffix;
142+
143+
json extra_context;
142144
};
143145

144146
struct server_slot {
@@ -170,6 +172,7 @@ struct server_slot {
170172

171173
// when a task is submitted, we first tokenize the prompt and store it here
172174
std::vector<llama_token> prompt_tokens;
175+
std::vector<llama_token> extra_tokens;
173176

174177
std::string generated_text;
175178
std::vector<llama_token> cache_tokens;
@@ -906,8 +909,18 @@ struct server_context {
906909
}
907910

908911
// infill
909-
slot.params.input_prefix = json_value(data, "input_prefix", default_params.input_prefix);
910-
slot.params.input_suffix = json_value(data, "input_suffix", default_params.input_suffix);
912+
slot.params.input_prefix = json_value(data, "input_prefix", default_params.input_prefix);
913+
slot.params.input_suffix = json_value(data, "input_suffix", default_params.input_suffix);
914+
slot.params.extra_context = json_value(data, "extra_context", default_params.extra_context);
915+
916+
SLT_DBG(slot, "extra_context chunks: %d\n", (int) slot.params.extra_context.size());
917+
for (const auto & chunk : slot.params.extra_context) {
918+
if (chunk.is_string()) {
919+
SLT_DBG(slot, "chunk: \n%s\n", chunk.get<std::string>().c_str());
920+
} else {
921+
SLT_DBG(slot, "%s", "chunk is not a string - skipping\n");
922+
}
923+
}
911924

912925
// get prompt
913926
if (task.cmpl_type != SERVER_TASK_CMPL_TYPE_INFILL) {
@@ -1937,10 +1950,28 @@ struct server_context {
19371950
auto prefix_tokens = tokenize(slot.params.input_prefix, false, false);
19381951
auto suffix_tokens = tokenize(slot.params.input_suffix, false, false);
19391952

1940-
// for now pick context to fit in a single batch (ratio prefix:suffix = 3:1, TODO: configurable?)
1941-
const int n_suffix_take = std::min<int>(suffix_tokens.size(), n_batch/4);
1953+
slot.extra_tokens.clear();
1954+
for (const auto & e : slot.params.extra_context) {
1955+
if (e.is_string()) {
1956+
// chunk separator in binary form to avoid confusing the AI
1957+
static const char k_chunk_prefix_str[] = {0x0a, 0x0a, 0x2d, 0x2d, 0x2d, 0x20, 0x73, 0x6e, 0x69, 0x70, 0x70, 0x65, 0x74, 0x20, 0x2d, 0x2d, 0x2d, 0x0a, 0x0a, 0x00};
1958+
static const auto k_chunk_prefix_tokens = tokenize(k_chunk_prefix_str, false, false);
1959+
slot.extra_tokens.insert(slot.extra_tokens.end(), k_chunk_prefix_tokens.begin(), k_chunk_prefix_tokens.end());
1960+
1961+
const auto part = tokenize(e, false, false);
1962+
slot.extra_tokens.insert(slot.extra_tokens.end(), part.begin(), part.end());
1963+
} else {
1964+
SLT_WRN(slot, "%s", "extra context element is not a string\n");
1965+
}
1966+
}
1967+
1968+
// for now pick FIM context to fit in a batch (ratio prefix:suffix = 3:1, TODO: configurable?)
1969+
const int n_suffix_take = std::min<int>(suffix_tokens.size(), (n_batch)/4);
19421970
const int n_prefix_take = std::min<int>(prefix_tokens.size(), (n_batch - 3) - n_suffix_take);
19431971

1972+
// fill the rest of the context with extra chunks
1973+
const int n_extra_take = std::min<int>(std::max<int>(0, slot.n_ctx - (n_batch) - 2*slot.n_predict), slot.extra_tokens.size());
1974+
19441975
prefix_tokens.erase(prefix_tokens.begin(), prefix_tokens.begin() + prefix_tokens.size() - n_prefix_take);
19451976
suffix_tokens.resize(n_suffix_take);
19461977

@@ -1954,6 +1985,11 @@ struct server_context {
19541985
embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
19551986
}
19561987

1988+
SLT_DBG(slot, "extra: n_ctx = %d, n_extra_take = %d, n_extra = %d\n", slot.n_ctx, n_extra_take, (int) slot.extra_tokens.size());
1989+
1990+
// put the extra context before the FIM prefix
1991+
embd_inp.insert(embd_inp.begin(), slot.extra_tokens.end() - n_extra_take, slot.extra_tokens.end());
1992+
19571993
embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
19581994
embd_inp.push_back(llama_token_fim_mid(model));
19591995

src/llama.cpp

Lines changed: 38 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -6596,8 +6596,8 @@ static void llm_load_vocab(
65966596
) {
65976597
vocab.special_eot_id = t.second;
65986598
if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
6599-
LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
6600-
__func__, t.first.c_str());
6599+
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
6600+
__func__, t.second, t.first.c_str());
66016601
vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
66026602
}
66036603
}
@@ -6610,8 +6610,8 @@ static void llm_load_vocab(
66106610
) {
66116611
vocab.special_eom_id = t.second;
66126612
if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
6613-
LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
6614-
__func__, t.first.c_str());
6613+
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
6614+
__func__, t.second, t.first.c_str());
66156615
vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
66166616
}
66176617
}
@@ -6627,8 +6627,8 @@ static void llm_load_vocab(
66276627
) {
66286628
vocab.special_fim_pre_id = t.second;
66296629
if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
6630-
LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
6631-
__func__, t.first.c_str());
6630+
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
6631+
__func__, t.second, t.first.c_str());
66326632
vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
66336633
}
66346634
}
@@ -6644,8 +6644,8 @@ static void llm_load_vocab(
66446644
) {
66456645
vocab.special_fim_suf_id = t.second;
66466646
if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
6647-
LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
6648-
__func__, t.first.c_str());
6647+
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
6648+
__func__, t.second, t.first.c_str());
66496649
vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
66506650
}
66516651
}
@@ -6661,8 +6661,8 @@ static void llm_load_vocab(
66616661
) {
66626662
vocab.special_fim_mid_id = t.second;
66636663
if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
6664-
LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
6665-
__func__, t.first.c_str());
6664+
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
6665+
__func__, t.second, t.first.c_str());
66666666
vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
66676667
}
66686668
}
@@ -6677,8 +6677,8 @@ static void llm_load_vocab(
66776677
) {
66786678
vocab.special_fim_pad_id = t.second;
66796679
if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
6680-
LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
6681-
__func__, t.first.c_str());
6680+
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
6681+
__func__, t.second, t.first.c_str());
66826682
vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
66836683
}
66846684
}
@@ -6694,8 +6694,8 @@ static void llm_load_vocab(
66946694
) {
66956695
vocab.special_fim_rep_id = t.second;
66966696
if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
6697-
LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
6698-
__func__, t.first.c_str());
6697+
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
6698+
__func__, t.second, t.first.c_str());
66996699
vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
67006700
}
67016701
}
@@ -6708,8 +6708,8 @@ static void llm_load_vocab(
67086708
) {
67096709
vocab.special_fim_sep_id = t.second;
67106710
if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
6711-
LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
6712-
__func__, t.first.c_str());
6711+
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
6712+
__func__, t.second, t.first.c_str());
67136713
vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
67146714
}
67156715
}
@@ -6720,6 +6720,19 @@ static void llm_load_vocab(
67206720
// this is currently determined based on the token text, which is obviously not ideal
67216721
// ref: https://github.com/ggerganov/llama.cpp/issues/9606
67226722
vocab.special_eog_ids.clear();
6723+
6724+
if (vocab.special_fim_pad_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_pad_id) == 0) {
6725+
vocab.special_eog_ids.insert(vocab.special_fim_pad_id);
6726+
}
6727+
6728+
if (vocab.special_fim_rep_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_rep_id) == 0) {
6729+
vocab.special_eog_ids.insert(vocab.special_fim_rep_id);
6730+
}
6731+
6732+
if (vocab.special_fim_sep_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_sep_id) == 0) {
6733+
vocab.special_eog_ids.insert(vocab.special_fim_sep_id);
6734+
}
6735+
67236736
for (const auto & t : vocab.token_to_id) {
67246737
if (false
67256738
|| t.first == "<|eot_id|>"
@@ -6732,13 +6745,20 @@ static void llm_load_vocab(
67326745
) {
67336746
vocab.special_eog_ids.insert(t.second);
67346747
if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
6735-
LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
6736-
__func__, t.first.c_str());
6748+
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
6749+
__func__, t.second, t.first.c_str());
67376750
vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
67386751
}
6752+
} else {
6753+
// token is control, but not marked as EOG -> print a warning
6754+
if (vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && vocab.special_eog_ids.count(t.second) == 0) {
6755+
LLAMA_LOG_WARN("%s: control token: %6d '%s' is not marked as EOG\n",
6756+
__func__, t.second, t.first.c_str());
6757+
}
67396758
}
67406759
}
67416760

6761+
// sanity checks
67426762
if (vocab.special_eos_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eos_id) == 0) {
67436763
vocab.special_eog_ids.insert(vocab.special_eos_id);
67446764
LLAMA_LOG_WARN("%s: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);

Comments (0)