You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
- simple sleep on tgs per token
- sleep and split params on prompt processing (TODO: instead of an even split, consider a maximum token count per processing batch)
- basic implementation of #38; more advanced pacing might be needed in the future
int NContextUsed = llama_get_kv_cache_used_cells(Context);
462
+
//check sizing before running prompt decode
463
+
int NContext = llama_n_ctx(Context);
464
+
int NContextUsed = llama_get_kv_cache_used_cells(Context);
460
465
461
-
if (NContextUsed + NPromptTokens > NContext)
462
-
{
463
-
EmitErrorMessage(FString::Printf(
464
-
TEXT("Failed to insert, tried to insert %d tokens to currently used %d tokens which is more than the max %d context size. Try increasing the context size and re-run prompt."),
465
-
NPromptTokens, NContextUsed, NContext
466
+
if (NContextUsed + NPromptTokens > NContext)
467
+
{
468
+
EmitErrorMessage(FString::Printf(
469
+
TEXT("Failed to insert, tried to insert %d tokens to currently used %d tokens which is more than the max %d context size. Try increasing the context size and re-run prompt."),
470
+
NPromptTokens, NContextUsed, NContext
466
471
), 22, __func__);
467
-
return 0;
468
-
}
472
+
return 0;
473
+
}
469
474
470
-
// run it through the decode (input)
471
-
if (llama_decode(Context, Batch))
475
+
// run it through the decode (input)
476
+
if (llama_decode(Context, Batch))
477
+
{
478
+
EmitErrorMessage(TEXT("Failed to decode, could not find a KV slot for the batch (try reducing the size of the batch or increase the context)."), 23, __func__);
479
+
return NPromptTokens;
480
+
}
481
+
}
482
+
//Split it and sleep between batches for pacing purposes
483
+
else
472
484
{
473
-
EmitErrorMessage(TEXT("Failed to decode, could not find a KV slot for the batch (try reducing the size of the batch or increase the context)."), 23, __func__);
int NContextUsed = llama_get_kv_cache_used_cells(Context);
510
+
511
+
if (NContextUsed + BatchTokens.size() > NContext)
512
+
{
513
+
EmitErrorMessage(FString::Printf(
514
+
TEXT("Failed to insert, tried to insert %d tokens to currently used %d tokens which is more than the max %d context size. Try increasing the context size and re-run prompt."),
515
+
BatchTokens.size(), NContextUsed, NContext
516
+
), 22, __func__);
517
+
return 0;
518
+
}
519
+
520
+
// Decode this batch
521
+
if (llama_decode(Context, Batch))
522
+
{
523
+
EmitErrorMessage(TEXT("Failed to decode, could not find a KV slot for the batch (try reducing the size of the batch or increase the context)."), 23, __func__);
0 commit comments