
Commit 19536ca

Merge branch 'concedo_exp_b6031' into crokeso
2 parents: 9e77547 + 9e3cec2

File tree

8 files changed: +8196 -71 lines


common/chat.cpp

Lines changed: 3 additions & 1 deletion
@@ -1944,6 +1944,8 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, co
         }
     }
     auto msg = builder.result();
-    LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat<json>({msg}).at(0).dump().c_str());
+    if (!is_partial) {
+        LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat<json>({msg}).at(0).dump().c_str());
+    }
     return msg;
 }

docs/ops/CANN.csv

Lines changed: 8133 additions & 0 deletions
Large diffs are not rendered by default.

examples/embedding/embedding.cpp

Lines changed: 8 additions & 0 deletions
@@ -81,6 +81,14 @@ int main(int argc, char ** argv) {

     params.embedding = true;

+    // if the number of prompts that would be encoded is known in advance, it's more efficient to specify the
+    // --parallel argument accordingly. for convenience, if not specified, we fallback to unified KV cache
+    // in order to support any number of prompts
+    if (params.n_parallel == 1) {
+        LOG_INF("%s: n_parallel == 1 -> unified KV cache is enabled\n", __func__);
+        params.kv_unified = true;
+    }
+
     // utilize the full context
     if (params.n_batch < params.n_ctx) {
         LOG_WRN("%s: setting batch size to %d\n", __func__, params.n_ctx);
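The added block is self-contained: when the caller has not sized the KV cache for a known number of prompts via --parallel, the example falls back to a unified KV cache. A minimal sketch of that fallback in isolation, assuming a stripped-down params struct with only the two fields the diff touches (the real common_params carries many more):

    #include <cstdio>

    struct embd_params {
        int  n_parallel = 1;    // number of prompts encoded in parallel (--parallel)
        bool kv_unified = false;
    };

    // If no explicit parallelism was requested, enable the unified KV cache so
    // that any number of prompts can still be encoded, at some efficiency cost.
    static void apply_kv_fallback(embd_params & params) {
        if (params.n_parallel == 1) {
            printf("n_parallel == 1 -> unified KV cache is enabled\n");
            params.kv_unified = true;
        }
    }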

klite.embd

Lines changed: 19 additions & 29 deletions
@@ -11363,6 +11363,8 @@ Current version indicated by LITEVER below.
         }
     }

+    autofetch_attempt_dict[desired_oai_ep] = true;
+
     let dropdown = get_custom_ep_model_dropdown();
     fetch((desired_oai_ep + oai_models_endpoint), {
         method: 'GET',
@@ -11437,22 +11439,18 @@ Current version indicated by LITEVER below.
             }
         }

-        let openrouter_fetch_attempted = false;
-        let oai_custom_fetch_attempted = false;
+        let autofetch_attempt_dict = {};
         function try_fetch_oai_models_auto()
         {
+            let targetep = document.getElementById("custom_oai_endpoint").value;
             //only for apis that don't gate the model list
-            if (document.getElementById("custom_oai_endpoint").value!="" &&
-            document.getElementById("custom_oai_endpoint").value.toLowerCase().includes("featherless.ai"))
+            if (targetep!="")
             {
-                if(!oai_custom_fetch_attempted)
+                if(!autofetch_attempt_dict[targetep])
                 {
-                    oai_custom_fetch_attempted = true;
-                    let dropdown = document.getElementById("custom_oai_model");
-                    if(dropdown.options.length < 40)
-                    {
-                        oai_fetch_models(); //autofetch models
-                    }
+                    autofetch_attempt_dict[targetep] = true;
+                    let dropdown = get_custom_ep_model_dropdown();
+                    oai_fetch_models(); //autofetch models
                 }
             }
         }
@@ -11509,6 +11507,14 @@ Current version indicated by LITEVER below.
             } else {
                 document.getElementById("custom_oai_endpoint").value = (localsettings.saved_oai_addr ? localsettings.saved_oai_addr : default_oai_base);
             }
+        }
+        else if(epchoice==3) //openrouter supports autofetch
+        {
+            document.getElementById("openrouterdesc").classList.remove("hidden");
+            document.getElementById("custom_openrouter_model").classList.remove("hidden");
+            document.getElementById("openrouterproviderbox").classList.remove("hidden");
+            document.getElementById("custom_oai_endpoint").value = default_openrouter_base;
+            document.getElementById("custom_oai_key").value =(localsettings.saved_openrouter_key==dummy_api_key?"":localsettings.saved_openrouter_key);
             try_fetch_oai_models_auto();
         }
         else if(epchoice==7)
@@ -11540,23 +11546,7 @@ Current version indicated by LITEVER below.
             document.getElementById("custom_oai_key").value = dummy_api_key;
             document.getElementById("custom_oai_endpoint").value = pollinations_text_endpoint;
             document.getElementById("custom_oai_key").classList.add("hidden");
-        }
-        else //openrouter supports autofetch
-        {
-            document.getElementById("openrouterdesc").classList.remove("hidden");
-            document.getElementById("custom_openrouter_model").classList.remove("hidden");
-            document.getElementById("openrouterproviderbox").classList.remove("hidden");
-            document.getElementById("custom_oai_endpoint").value = default_openrouter_base;
-            document.getElementById("custom_oai_key").value =(localsettings.saved_openrouter_key==dummy_api_key?"":localsettings.saved_openrouter_key);
-            if(!openrouter_fetch_attempted)
-            {
-                openrouter_fetch_attempted = true;
-                let dropdown = document.getElementById("custom_openrouter_model");
-                if(dropdown.options.length < 10)
-                {
-                    oai_fetch_models(); //autofetch openrouter models
-                }
-            }
+            try_fetch_oai_models_auto();
         }
         oai_model_change(ep_should_always_use_chat_completions() || force_autotoggle_chatcompl);
         toggleoaichatcompl();
@@ -26911,7 +26901,7 @@ Current version indicated by LITEVER below.
                 <span class="color_green" style="font-weight: bold;">No Key Required.</span><br><br>
             </span>

-            <input class="form-control" type="text" id="custom_oai_endpoint" placeholder="OpenAI API URL" value="" onblur="try_fetch_oai_models_auto()">
+            <input class="form-control" type="text" id="custom_oai_endpoint" placeholder="OpenAI API URL" value="" onblur="">
             <input class="form-control" type="password" id="custom_oai_key" placeholder="API Key (Required)" value="" onfocus="focus_api_keys()" onblur="blur_api_keys()"><br>
             Model Choice:<br>
             <select title="OpenAI Model Selection" style="padding:4px;display:inline;width:calc(100% - 220px)" class="form-control" id="custom_oai_model" onchange="oai_model_change(true)">
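The two per-provider booleans are replaced by a single dictionary keyed on the endpoint URL, so the automatic model-list fetch is attempted at most once per endpoint rather than once per hard-coded provider. The pattern is language-agnostic; a small sketch in C++ for consistency with the other examples here (hypothetical helper, the original is JavaScript in klite.embd):

    #include <string>
    #include <unordered_set>

    // Remember which endpoints were already tried, so each endpoint triggers
    // at most one automatic model-list fetch.
    static std::unordered_set<std::string> autofetch_attempted;

    static bool should_autofetch(const std::string & endpoint) {
        if (endpoint.empty() || autofetch_attempted.count(endpoint) > 0) {
            return false;
        }
        autofetch_attempted.insert(endpoint);
        return true; // caller performs the fetch exactly once for this endpoint
    }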

koboldcpp.py

Lines changed: 1 addition & 1 deletion
@@ -3869,7 +3869,7 @@ def transform_genparams(genparams, api_format):
                         attachedaudid += 1
                         messages_string += f"\n(Attached Audio {attachedaudid})\n"
             # If last message, add any tools calls after message content and before message end token if any
-            if (message['role'] == "user" or message['role'] == "tool") and message_index == len(messages_array):
+            if message_index == len(messages_array):
                 used_tool_json = determine_tool_json_to_use(genparams, messages_string, assistant_message_start, (message['role'] == "tool"))

                 if used_tool_json:

src/llama-batch.cpp

Lines changed: 1 addition & 1 deletion
@@ -59,7 +59,7 @@ bool llama_batch_allocr::init(
     for (int32_t i = 0; i < batch.n_tokens; ++i) {
         for (int32_t s = 0; s < batch.n_seq_id[i]; ++s) {
             if (batch.seq_id && (batch.seq_id[i][s] < 0 || batch.seq_id[i][s] >= (llama_seq_id) n_seq_max)) {
-                LLAMA_LOG_ERROR("%s: invalid seq_id[%d][%d] = %d > %d\n", __func__, i, s, batch.seq_id[i][s], (llama_seq_id) n_seq_max);
+                LLAMA_LOG_ERROR("%s: invalid seq_id[%d][%d] = %d >= %d\n", __func__, i, s, batch.seq_id[i][s], (llama_seq_id) n_seq_max);
                 return false;
             }
         }
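The message now matches the check it reports on: a sequence id is only valid in the half-open range [0, n_seq_max). A minimal standalone sketch of the same validation (hypothetical helper, not the llama.cpp API):

    #include <cstdint>
    #include <cstdio>

    // A sequence id must lie in [0, n_seq_max); anything else is rejected,
    // and the diagnostic mirrors the ">=" comparison used in the check.
    static bool seq_id_valid(int32_t seq_id, int32_t n_seq_max) {
        if (seq_id < 0 || seq_id >= n_seq_max) {
            fprintf(stderr, "invalid seq_id = %d >= %d\n", seq_id, n_seq_max);
            return false;
        }
        return true;
    }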

src/llama-graph.cpp

Lines changed: 16 additions & 27 deletions
@@ -188,38 +188,23 @@ void llm_graph_input_mean::set_input(const llama_ubatch * ubatch) {

 void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
     const int64_t n_tokens = ubatch->n_tokens;
-    const int64_t n_seq_tokens = ubatch->n_seq_tokens;
     const int64_t n_seqs_unq = ubatch->n_seqs_unq;

     if (cparams.embeddings && (
-            cparams.pooling_type == LLAMA_POOLING_TYPE_CLS ||
-            cparams.pooling_type == LLAMA_POOLING_TYPE_RANK
-            )) {
+                cparams.pooling_type == LLAMA_POOLING_TYPE_CLS ||
+                cparams.pooling_type == LLAMA_POOLING_TYPE_RANK ||
+                cparams.pooling_type == LLAMA_POOLING_TYPE_LAST
+            )) {
         GGML_ASSERT(cls);
         GGML_ASSERT(ggml_backend_buffer_is_host(cls->buffer));

         uint32_t * data = (uint32_t *) cls->data;
         memset(cls->data, 0, n_seqs_unq*ggml_element_size(cls));

-        for (int i = 0; i < n_tokens; i += n_seq_tokens) {
-            for (int s = 0; s < ubatch->n_seq_id[i]; ++s) {
-                const llama_seq_id seq_id = ubatch->seq_id[i][s];
-                const int32_t seq_idx = ubatch->seq_idx[seq_id];
-
-                data[seq_idx] = i;
-            }
-        }
-    }
-
-    if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) {
-        GGML_ASSERT(cls);
-        GGML_ASSERT(ggml_backend_buffer_is_host(cls->buffer));
-
-        uint32_t * data = (uint32_t *) cls->data;
-        memset(cls->data, 0, n_seqs_unq*ggml_element_size(cls));
+        std::vector<int> target_pos(n_seqs_unq, -1);
+        std::vector<int> target_row(n_seqs_unq, -1);

-        std::vector<int> last_pos(n_seqs_unq, -1);
-        std::vector<int> last_row(n_seqs_unq, -1);
+        bool last = cparams.pooling_type == LLAMA_POOLING_TYPE_LAST;

         for (int i = 0; i < n_tokens; ++i) {
             const llama_pos pos = ubatch->pos[i];
@@ -228,16 +213,20 @@ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
                 const llama_seq_id seq_id = ubatch->seq_id[i][s];
                 const int32_t seq_idx = ubatch->seq_idx[seq_id];

-                if (pos >= last_pos[seq_idx]) {
-                    last_pos[seq_idx] = pos;
-                    last_row[seq_idx] = i;
+                if (
+                    (target_pos[seq_idx] == -1) ||
+                    ( last && pos >= target_pos[seq_idx]) ||
+                    (!last && pos <  target_pos[seq_idx])
+                ) {
+                    target_pos[seq_idx] = pos;
+                    target_row[seq_idx] = i;
                 }
             }
         }

         for (int s = 0; s < n_seqs_unq; ++s) {
-            if (last_row[s] >= 0) {
-                data[s] = last_row[s];
+            if (target_row[s] >= 0) {
+                data[s] = target_row[s];
             }
         }
     }
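The refactor collapses the separate CLS/RANK and LAST code paths into one loop: for each sequence it records the row of either the lowest-position token (CLS/RANK) or the highest-position token (LAST). A standalone sketch of that selection, with plain vectors standing in for the llama_ubatch fields:

    #include <cstdint>
    #include <vector>

    // For each sequence, return the row index of its first token (last == false,
    // as used for CLS/RANK pooling) or its last token (last == true).
    static std::vector<int> select_target_rows(const std::vector<int32_t> & pos,
                                               const std::vector<int32_t> & seq_idx,
                                               int n_seqs, bool last) {
        std::vector<int32_t> target_pos(n_seqs, -1);
        std::vector<int>     target_row(n_seqs, -1);

        for (int i = 0; i < (int) pos.size(); ++i) {
            const int s = seq_idx[i];
            if (target_pos[s] == -1 ||
                ( last && pos[i] >= target_pos[s]) ||
                (!last && pos[i] <  target_pos[s])) {
                target_pos[s] = pos[i];
                target_row[s] = i;
            }
        }
        return target_row;
    }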

src/llama-graph.h

Lines changed: 15 additions & 12 deletions
@@ -144,7 +144,7 @@ class llm_graph_input_pos_bucket : public llm_graph_input_i {

     ggml_tensor * pos_bucket = nullptr; // I32 [n_batch, n_batch]

-    const llama_hparams & hparams;
+    const llama_hparams hparams;
 };

 class llm_graph_input_pos_bucket_kv : public llm_graph_input_i {
@@ -158,7 +158,7 @@ class llm_graph_input_pos_bucket_kv : public llm_graph_input_i {

     ggml_tensor * pos_bucket = nullptr; // I32 [n_kv, n_batch]

-    const llama_hparams & hparams;
+    const llama_hparams hparams;

     const llama_kv_cache_unified_context * mctx;
 };
@@ -177,8 +177,8 @@ class llm_graph_input_out_ids : public llm_graph_input_i {

     ggml_tensor * out_ids; // I32 [n_outputs]

-    const llama_hparams & hparams;
-    const llama_cparams & cparams;
+    const llama_hparams hparams;
+    const llama_cparams cparams;

     const uint32_t n_outputs;
 };
@@ -192,7 +192,7 @@ class llm_graph_input_mean : public llm_graph_input_i {

     ggml_tensor * mean; // F32 [n_batch, n_batch]

-    const llama_cparams & cparams;
+    const llama_cparams cparams;
 };

 class llm_graph_input_cls : public llm_graph_input_i {
@@ -204,7 +204,7 @@ class llm_graph_input_cls : public llm_graph_input_i {

     ggml_tensor * cls; // I32 [n_batch]

-    const llama_cparams & cparams;
+    const llama_cparams cparams;
 };

 class llm_graph_input_rs : public llm_graph_input_i {
@@ -247,8 +247,8 @@ class llm_graph_input_attn_no_cache : public llm_graph_input_i {
     ggml_tensor * kq_mask = nullptr;     // F32 [n_tokens, n_batch, 1, 1]
     ggml_tensor * kq_mask_cnv = nullptr; //     [n_tokens, n_batch, 1, 1]

-    const llama_hparams & hparams;
-    const llama_cparams & cparams;
+    const llama_hparams hparams;
+    const llama_cparams cparams;
 };

 class llm_graph_input_attn_kv_unified : public llm_graph_input_i {
@@ -278,8 +278,11 @@ class llm_graph_input_attn_kv_unified : public llm_graph_input_i {
     ggml_tensor * self_kq_mask = nullptr;     // F32 [n_kv, n_batch/n_stream, 1, n_stream]
     ggml_tensor * self_kq_mask_cnv = nullptr; //     [n_kv, n_batch/n_stream, 1, n_stream]

-    const llama_hparams & hparams;
-    const llama_cparams & cparams;
+    // note: these have to be copies because in order to be able to reuse a graph, its inputs
+    // need to carry these parameters with them. otherwise, they can point to freed
+    // llm_graph_params from a previous batch, causing stack-use-after-return
+    const llama_hparams hparams;
+    const llama_cparams cparams;

     const llama_kv_cache_unified_context * mctx;
 };
@@ -318,8 +321,8 @@ class llm_graph_input_attn_kv_unified_iswa : public llm_graph_input_i {
     ggml_tensor * self_kq_mask_swa = nullptr;     // F32 [n_kv, n_batch/n_stream, 1, n_stream]
     ggml_tensor * self_kq_mask_swa_cnv = nullptr; //     [n_kv, n_batch/n_stream, 1, n_stream]

-    const llama_hparams & hparams;
-    const llama_cparams & cparams;
+    const llama_hparams hparams;
+    const llama_cparams cparams;

     const llama_kv_cache_unified_iswa_context * mctx;
 };
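The note added in the second-to-last hunk is the reason for every "&" removed in this file: a graph input that stores references to hparams/cparams owned by a transient llm_graph_params can outlive them when the graph is reused, leaving dangling references. A compact illustration of that hazard, using toy types rather than the llama.cpp classes:

    // Toy types only: storing a reference ties the input's lifetime to the
    // params object it was built from; storing a copy does not.
    struct toy_params { int n_ctx; };

    struct input_by_ref  { const toy_params & p; }; // dangles if params was temporary
    struct input_by_copy { const toy_params   p; }; // safe: carries its own copy

    input_by_copy make_reusable_input() {
        toy_params local{4096};       // destroyed when this function returns
        return input_by_copy{local};  // the copy keeps n_ctx valid for later graph reuse
    }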
