
Commit a0420bc

implement batch pacing
- simple sleep per generated token to pace TGS (token generation speed)
- sleep and split params on prompt processing (todo: instead of an even split, consider a max-tokens-per-batch cap)
- basic implementation of #38; more advanced pacing may be needed in the future
1 parent 32a1782 commit a0420bc
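
For context, a minimal sketch of how the pacing parameters introduced here might be configured before loading a model. Only the Advanced.* field names and LoadModelFromParams come from the diffs below; the surrounding setup and any omitted FLLMModelParams fields are assumptions, not part of this commit.

    // Hypothetical setup sketch, not part of this commit. Only the Advanced.* field
    // names are taken from the diffs below; the surrounding code and any omitted
    // FLLMModelParams fields (model path, context size, ...) are assumed.
    void ConfigurePacingExample(FLlamaInternal& Llama)
    {
        FLLMModelParams Params;
        Params.Advanced.TokenGenerationPacingSleep = 0.01f;    // sleep ~10 ms after each generated token
        Params.Advanced.PromptProcessingPacingSleep = 0.005f;  // > 0 enables the split-and-sleep prompt path
        Params.Advanced.PromptProcessingPacingSplitN = 4;      // prompt is processed as 4 roughly even batches

        // LoadModelFromParams caches the params as LastLoadedParams (see LlamaInternal.cpp diff).
        Llama.LoadModelFromParams(Params);
    }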

4 files changed: +83 additions, -18 deletions


Llama.uplugin

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 {
     "FileVersion": 3,
     "Version": 1,
-    "VersionName": "0.9.1",
+    "VersionName": "0.9.2",
     "FriendlyName": "Llama",
     "Description": "Llama.cpp plugin for large language model (LLM) inference.",
     "Category": "LLM",

Source/LlamaCore/Private/Internal/LlamaInternal.cpp

Lines changed: 77 additions & 16 deletions
@@ -14,6 +14,8 @@ bool FLlamaInternal::LoadModelFromParams(const FLLMModelParams& InModelParams)
 
     UE_LOG(LogTemp, Log, TEXT("Device Found: %s %s"), *GPU, *RHI);
 
+    LastLoadedParams = InModelParams;
+
     // only print errors
     llama_log_set([](enum ggml_log_level level, const char* text, void* /* user_data */) {
         if (level >= GGML_LOG_LEVEL_ERROR) {
@@ -451,27 +453,80 @@ int32 FLlamaInternal::ProcessPrompt(const std::string& Prompt, EChatTemplateRole
         return NPromptTokens;
     }
 
-    // prepare a batch for the prompt
-    llama_batch Batch = llama_batch_get_one(PromptTokens.data(), PromptTokens.size());
+    // All in one batch
+    if (LastLoadedParams.Advanced.PromptProcessingPacingSleep == 0.f)
+    {
+        // prepare a batch for the prompt
+        llama_batch Batch = llama_batch_get_one(PromptTokens.data(), PromptTokens.size());
 
-    // check sizing before running prompt decode
-    int NContext = llama_n_ctx(Context);
-    int NContextUsed = llama_get_kv_cache_used_cells(Context);
+        // check sizing before running prompt decode
+        int NContext = llama_n_ctx(Context);
+        int NContextUsed = llama_get_kv_cache_used_cells(Context);
 
-    if (NContextUsed + NPromptTokens > NContext)
-    {
-        EmitErrorMessage(FString::Printf(
-            TEXT("Failed to insert, tried to insert %d tokens to currently used %d tokens which is more than the max %d context size. Try increasing the context size and re-run prompt."),
-            NPromptTokens, NContextUsed, NContext
+        if (NContextUsed + NPromptTokens > NContext)
+        {
+            EmitErrorMessage(FString::Printf(
+                TEXT("Failed to insert, tried to insert %d tokens to currently used %d tokens which is more than the max %d context size. Try increasing the context size and re-run prompt."),
+                NPromptTokens, NContextUsed, NContext
         ), 22, __func__);
-        return 0;
-    }
+            return 0;
+        }
 
-    // run it through the decode (input)
-    if (llama_decode(Context, Batch))
+        // run it through the decode (input)
+        if (llama_decode(Context, Batch))
+        {
+            EmitErrorMessage(TEXT("Failed to decode, could not find a KV slot for the batch (try reducing the size of the batch or increase the context)."), 23, __func__);
+            return NPromptTokens;
+        }
+    }
+    // Split it and sleep between batches for pacing purposes
+    else
     {
-        EmitErrorMessage(TEXT("Failed to decode, could not find a KV slot for the batch (try reducing the size of the batch or increase the context)."), 23, __func__);
-        return NPromptTokens;
+        int32 BatchCount = LastLoadedParams.Advanced.PromptProcessingPacingSplitN;
+
+        int32 TotalTokens = PromptTokens.size();
+        int32 TokensPerBatch = TotalTokens / BatchCount;
+        int32 Remainder = TotalTokens % BatchCount;
+
+        int32 StartIndex = 0;
+
+        for (int32 i = 0; i < BatchCount; i++)
+        {
+            // Calculate how many tokens to put in this batch
+            int32 CurrentBatchSize = TokensPerBatch + (i < Remainder ? 1 : 0);
+
+            // Slice the relevant tokens for this batch
+            std::vector<llama_token> BatchTokens(
+                PromptTokens.begin() + StartIndex,
+                PromptTokens.begin() + StartIndex + CurrentBatchSize
+            );
+
+            // Prepare the batch
+            llama_batch Batch = llama_batch_get_one(BatchTokens.data(), BatchTokens.size());
+
+            // Check context before running decode
+            int NContext = llama_n_ctx(Context);
+            int NContextUsed = llama_get_kv_cache_used_cells(Context);
+
+            if (NContextUsed + BatchTokens.size() > NContext)
+            {
+                EmitErrorMessage(FString::Printf(
+                    TEXT("Failed to insert, tried to insert %d tokens to currently used %d tokens which is more than the max %d context size. Try increasing the context size and re-run prompt."),
+                    BatchTokens.size(), NContextUsed, NContext
+                ), 22, __func__);
+                return 0;
+            }
+
+            // Decode this batch
+            if (llama_decode(Context, Batch))
+            {
+                EmitErrorMessage(TEXT("Failed to decode, could not find a KV slot for the batch (try reducing the size of the batch or increase the context)."), 23, __func__);
+                return BatchTokens.size();
+            }
+
+            StartIndex += CurrentBatchSize;
+            FPlatformProcess::Sleep(LastLoadedParams.Advanced.PromptProcessingPacingSleep);
+        }
     }
 
     const auto StopTime = ggml_time_us();
@@ -561,6 +616,12 @@ std::string FLlamaInternal::Generate(const std::string& Prompt, bool bAppendToMe
             //Return partial response
            return Response;
         }
+
+        // sleep pacing
+        if (LastLoadedParams.Advanced.TokenGenerationPacingSleep > 0.f)
+        {
+            FPlatformProcess::Sleep(LastLoadedParams.Advanced.TokenGenerationPacingSleep);
+        }
     }
 
     bGenerationActive = false;

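To make the split arithmetic in ProcessPrompt concrete: with 10 prompt tokens and PromptProcessingPacingSplitN = 4, TokensPerBatch = 10 / 4 = 2 and Remainder = 10 % 4 = 2, so the first two batches carry 3 tokens and the last two carry 2 (3 + 3 + 2 + 2 = 10). A standalone sketch of just that distribution logic, with no Unreal or llama.cpp dependencies:

    #include <cstdio>
    #include <vector>

    // Mirrors the even-split logic in ProcessPrompt: BatchCount batches, with the
    // first (TotalTokens % BatchCount) batches carrying one extra token each.
    std::vector<int> SplitSizes(int TotalTokens, int BatchCount)
    {
        std::vector<int> Sizes;
        int TokensPerBatch = TotalTokens / BatchCount;
        int Remainder = TotalTokens % BatchCount;
        for (int i = 0; i < BatchCount; i++)
        {
            Sizes.push_back(TokensPerBatch + (i < Remainder ? 1 : 0));
        }
        return Sizes;
    }

    int main()
    {
        // 10 prompt tokens split across PromptProcessingPacingSplitN = 4 batches.
        for (int Size : SplitSizes(10, 4))
        {
            std::printf("%d ", Size); // prints: 3 3 2 2
        }
        std::printf("\n");
        return 0;
    }
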
Source/LlamaCore/Public/Internal/LlamaInternal.h

Lines changed: 1 addition & 1 deletion
@@ -35,7 +35,7 @@ class FLlamaInternal
     std::string Template;
     std::string TemplateSource;
 
-    //Pacing
+    //Cached params, should be accessed on BT
     FLLMModelParams LastLoadedParams;
 
     //Model loading

Source/LlamaCore/Public/LlamaDataTypes.h

Lines changed: 4 additions & 0 deletions
@@ -127,6 +127,10 @@ struct FLLMModelAdvancedParams
     UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "LLM Model Params")
     float PromptProcessingPacingSleep = 0.f;
 
+    // Only active if PromptProcessingPacingSleep > 0.f: splits the prompt into n chunks with a sleep between each.
+    UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "LLM Model Params")
+    int32 PromptProcessingPacingSplitN = 4;
+
     //usually . ? !
     UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "LLM Model Params")
     TArray<FString> PartialsSeparators;

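Since the commit message frames TokenGenerationPacingSleep as a per-token sleep to throttle TGS, one way to pick a value is as a rough tokens-per-second ceiling: sleeping 1/N seconds per token caps generation at about N tokens/s, minus whatever the decode itself costs. The helper below is purely illustrative and not part of the plugin; the plugin simply sleeps the fixed configured value after each generated token.

    #include <cstdio>

    // Hypothetical helper, not plugin API: picks a per-token sleep for a rough
    // tokens-per-second ceiling.
    static float PacingSleepForTargetTGS(float TargetTokensPerSecond)
    {
        if (TargetTokensPerSecond <= 0.f)
        {
            return 0.f; // 0 disables pacing, matching the > 0.f check in the diff
        }
        // 1/target seconds per token caps throughput at roughly the target,
        // ignoring decode time itself (so the real rate ends up a bit lower).
        return 1.f / TargetTokensPerSecond;
    }

    int main()
    {
        std::printf("%.3f\n", PacingSleepForTargetTGS(20.f)); // 0.050 -> at most ~20 tokens/s
        return 0;
    }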