Commit 420a2b8

Implement impersonation feature for component
- `ImpersonateTemplatedPrompt` is used for user input (or non-streamed generated assistant input).
- `ImpersonateTemplatedToken` is used for generated stream input; signal EOS true to finish the response.
- Default system prompt insertion moved into the load-model function so user prompts can be chained directly after it without waiting for results. The system prompt is only inserted if the load succeeds.
- Log TGS if `bLogGenerationStats` is true.
- Bind `OnResponseGenerated` instead of a per-call callback method so that both local and remote responses flow through the same API.
1 parent 6de7e3a commit 420a2b8
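A minimal sketch of how the two impersonation entry points might be driven from an external completion source. The `AMyChatActor` class, its `LlamaComponent` member, and the `OnRemote*` handlers are hypothetical stand-ins for whatever delivers the remote data; only `ImpersonateTemplatedPrompt`, `ImpersonateTemplatedToken`, `FLlamaChatPrompt`, and `EChatTemplateRole` come from this commit:

//Hypothetical glue code: mirror a remote chat exchange into the component's
//loop so downstream delegates fire as if the data were generated locally.
void AMyChatActor::OnRemoteUserMessage(const FString& UserText)
{
    FLlamaChatPrompt ChatPrompt;
    ChatPrompt.Prompt = UserText;
    ChatPrompt.Role = EChatTemplateRole::User;

    //Inserted into chat history without triggering local generation.
    LlamaComponent->ImpersonateTemplatedPrompt(ChatPrompt);
}

void AMyChatActor::OnRemoteTokenReceived(const FString& Token, bool bStreamFinished)
{
    //Appends to the last assistant message; passing true for end-of-stream
    //emits the full-response callbacks just like a locally finished reply.
    LlamaComponent->ImpersonateTemplatedToken(Token, EChatTemplateRole::Assistant, bStreamFinished);
}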

File tree

6 files changed (+190, -24 lines)

Llama.uplugin

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 {
 	"FileVersion": 3,
 	"Version": 1,
-	"VersionName": "0.9.3",
+	"VersionName": "0.9.4",
 	"FriendlyName": "Llama",
 	"Description": "Llama.cpp plugin for large language model (LLM) inference.",
 	"Category": "LLM",

Source/LlamaCore/Private/LlamaComponent.cpp

Lines changed: 24 additions & 10 deletions
@@ -19,6 +19,12 @@ ULlamaComponent::ULlamaComponent(const FObjectInitializer &ObjectInitializer)
         OnTokenGenerated.Broadcast(Token);
     };
 
+    LlamaNative->OnResponseGenerated = [this](const FString& Response)
+    {
+        OnResponseGenerated.Broadcast(Response);
+        OnEndOfStream.Broadcast(true, ModelState.LastTokenGenerationSpeed);
+    };
+
     LlamaNative->OnPartialGenerated = [this](const FString& Partial)
     {
         OnPartialGenerated.Broadcast(Partial);
@@ -87,26 +93,26 @@ void ULlamaComponent::InsertTemplatedPrompt(const FString& Text, EChatTemplateRo
 
 void ULlamaComponent::InsertTemplatedPromptStruct(const FLlamaChatPrompt& ChatPrompt)
 {
-    LlamaNative->InsertTemplatedPrompt(ChatPrompt, [this, ChatPrompt](const FString& Response)
-    {
+    LlamaNative->InsertTemplatedPrompt(ChatPrompt);/*, [this, ChatPrompt](const FString& Response));
+    {
         if (ChatPrompt.bGenerateReply)
         {
             OnResponseGenerated.Broadcast(Response);
             OnEndOfStream.Broadcast(true, ModelState.LastTokenGenerationSpeed);
         }
-    });
+    });*/
 }
 
 void ULlamaComponent::InsertRawPrompt(const FString& Text, bool bGenerateReply)
 {
-    LlamaNative->InsertRawPrompt(Text, bGenerateReply, [this, bGenerateReply](const FString& Response)
+    LlamaNative->InsertRawPrompt(Text, bGenerateReply); /*, [this, bGenerateReply](const FString& Response)
     {
         if (bGenerateReply)
         {
             OnResponseGenerated.Broadcast(Response);
             OnEndOfStream.Broadcast(true, ModelState.LastTokenGenerationSpeed);
         }
-    });
+    });*/
 }
 
 void ULlamaComponent::LoadModel(bool bForceReload)
@@ -120,11 +126,6 @@ void ULlamaComponent::LoadModel(bool bForceReload)
             return;
         }
 
-        if (ModelParams.bAutoInsertSystemPromptOnLoad)
-        {
-            InsertTemplatedPrompt(ModelParams.SystemPrompt, EChatTemplateRole::System, false, false);
-        }
-
         OnModelLoaded.Broadcast(ModelPath);
     });
 }
@@ -163,6 +164,19 @@ void ULlamaComponent::RemoveLastUserInput()
     LlamaNative->RemoveLastUserInput();
 }
 
+
+void ULlamaComponent::ImpersonateTemplatedPrompt(const FLlamaChatPrompt& ChatPrompt)
+{
+    LlamaNative->SetModelParams(ModelParams);
+
+    LlamaNative->ImpersonateTemplatedPrompt(ChatPrompt);
+}
+
+void ULlamaComponent::ImpersonateTemplatedToken(const FString& Token, EChatTemplateRole Role, bool bEoS)
+{
+    LlamaNative->ImpersonateTemplatedToken(Token, Role, bEoS);
+}
+
 FString ULlamaComponent::WrapPromptForRole(const FString& Text, EChatTemplateRole Role, const FString& Template)
 {
     return LlamaNative->WrapPromptForRole(Text, Role, Template);
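With the per-call lambdas commented out above, responses now flow exclusively through the bound `OnResponseGenerated`/`OnEndOfStream` delegates, so local and impersonated (remote) generation surface through the same path. A minimal listener sketch, assuming these are the usual BlueprintAssignable dynamic multicast delegates (the actor and handler names are illustrative):

void AMyChatActor::BeginPlay()
{
    Super::BeginPlay();

    //One-time binding; fires for local generation and impersonated responses alike.
    LlamaComponent->OnResponseGenerated.AddDynamic(this, &AMyChatActor::HandleResponse);
}

void AMyChatActor::HandleResponse(const FString& Response)
{
    UE_LOG(LogTemp, Log, TEXT("Response: %s"), *Response);
}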

Source/LlamaCore/Private/LlamaNative.cpp

Lines changed: 145 additions & 4 deletions
@@ -60,7 +60,7 @@ FLlamaNative::FLlamaNative()
     {
         if (ModelParams.Advanced.bLogGenerationStats)
         {
-            UE_LOG(LlamaLog, Log, TEXT("Generated %d tokens in %1.2fs (%1.2ftps)"), TokensGenerated, Duration, SpeedTps);
+            UE_LOG(LlamaLog, Log, TEXT("TGS - Generated %d tokens in %1.2fs (%1.2ftps)"), TokensGenerated, Duration, SpeedTps);
         }
 
         int32 UsedContext = UsedContextLength();
@@ -88,6 +88,11 @@ FLlamaNative::FLlamaNative()
 
     Internal->OnPromptProcessed = [this](int32 TokensProcessed, EChatTemplateRole RoleProcessed, float SpeedTps)
     {
+        if (ModelParams.Advanced.bLogGenerationStats)
+        {
+            UE_LOG(LlamaLog, Log, TEXT("PPS - Processed %d tokens at %1.2ftps"), TokensProcessed, SpeedTps);
+        }
+
         int32 UsedContext = UsedContextLength();
 
         //Sync history data with additional state updates
@@ -266,6 +271,7 @@ void FLlamaNative::LoadModel(bool bForceReload, TFunction<void(const FString&, i
         //already loaded, we're done
         return ModelLoadedCallback(ModelParams.PathToModel, 0);
     }
+    bModelLoadInitiated = true;
 
     //Copy so these dont get modified during enqueue op
     const FLLMModelParams ParamsAtLoad = ModelParams;
@@ -284,6 +290,14 @@ void FLlamaNative::LoadModel(bool bForceReload, TFunction<void(const FString&, i
             const FString TemplateString = FLlamaString::ToUE(Internal->Template);
             const FString TemplateSource = FLlamaString::ToUE(Internal->TemplateSource);
 
+            //Before we release the BG thread, ensure we enqueue the system prompt
+            //If we do it later, other queued calls will frontrun it. This enables startup chaining correctly
+            if (ParamsAtLoad.bAutoInsertSystemPromptOnLoad)
+            {
+                Internal->InsertTemplatedPrompt(FLlamaString::ToStd(ParamsAtLoad.SystemPrompt), EChatTemplateRole::System, false, false);
+            }
+
+            //Callback on game thread for data sync
             EnqueueGTTask([this, TemplateString, TemplateSource, ModelLoadedCallback]
             {
                 FJinjaChatTemplate ChatTemplate;
@@ -293,6 +307,8 @@ void FLlamaNative::LoadModel(bool bForceReload, TFunction<void(const FString&, i
                 ModelState.ChatTemplateInUse = ChatTemplate;
                 ModelState.bModelIsLoaded = true;
 
+                bModelLoadInitiated = false;
+
                 if (OnModelStateChanged)
                 {
                     OnModelStateChanged(ModelState);
@@ -308,15 +324,22 @@ void FLlamaNative::LoadModel(bool bForceReload, TFunction<void(const FString&, i
         {
             EnqueueGTTask([this, ModelLoadedCallback]
             {
+                bModelLoadInitiated = false;
+
                 //On error will be triggered earlier in the chain, but forward our model loading error status here
-                ModelLoadedCallback(ModelParams.PathToModel, 15);
+                if (ModelLoadedCallback)
+                {
+                    ModelLoadedCallback(ModelParams.PathToModel, 15);
+                }
             }, TaskId);
         }
     });
 }
 
 void FLlamaNative::UnloadModel(TFunction<void(int32 StatusCode)> ModelUnloadedCallback)
 {
+    bModelLoadInitiated = false;
+
     EnqueueBGTask([this, ModelUnloadedCallback](int64 TaskId)
     {
         if (IsModelLoaded())
@@ -349,7 +372,7 @@ bool FLlamaNative::IsModelLoaded()
 
 void FLlamaNative::InsertTemplatedPrompt(const FLlamaChatPrompt& Prompt, TFunction<void(const FString& Response)> OnResponseFinished)
 {
-    if (!IsModelLoaded())
+    if (!IsModelLoaded() && !bModelLoadInitiated)
     {
         UE_LOG(LlamaLog, Warning, TEXT("Model isn't loaded, can't run prompt."));
         return;
@@ -386,7 +409,7 @@ void FLlamaNative::InsertTemplatedPrompt(const FLlamaChatPrompt& Prompt, TFuncti
 
 void FLlamaNative::InsertRawPrompt(const FString& Prompt, bool bGenerateReply, TFunction<void(const FString& Response)>OnResponseFinished)
 {
-    if (!IsModelLoaded())
+    if (!IsModelLoaded() && !bModelLoadInitiated)
     {
         UE_LOG(LlamaLog, Warning, TEXT("Model isn't loaded, can't run prompt."));
         return;
@@ -407,6 +430,124 @@ void FLlamaNative::InsertRawPrompt(const FString& Prompt, bool bGenerateReply, T
     });
 }
 
+void FLlamaNative::ImpersonateTemplatedPrompt(const FLlamaChatPrompt& Prompt)
+{
+    //modify model state
+    if (IsModelLoaded())
+    {
+        //insert it but make sure we don't do any token generation
+        FLlamaChatPrompt ModifiedPrompt = Prompt;
+        ModifiedPrompt.bGenerateReply = false;
+
+        InsertTemplatedPrompt(ModifiedPrompt);
+    }
+    else
+    {
+        //no model, so just run this in sync mode
+        FStructuredChatMessage Message;
+        Message.Role = Prompt.Role;
+        Message.Content = Prompt.Prompt;
+
+        //modify our chat history state
+        ModelState.ChatHistory.History.Add(Message);
+
+        if (OnModelStateChanged)
+        {
+            OnModelStateChanged(ModelState);
+        }
+        //was this an assistant message? emit response generated callback
+        if (Message.Role == EChatTemplateRole::Assistant)
+        {
+            if (OnResponseGenerated)
+            {
+                OnResponseGenerated(Prompt.Prompt);
+            }
+        }
+    }
+}
+
+void FLlamaNative::ImpersonateTemplatedToken(const FString& Token, EChatTemplateRole Role, bool bEoS)
+{
+    //Should be called on game thread.
+
+    //NB: we don't support updating model internal state atm
+
+    //Check if we need to add a message before modifying it
+    bool bLastRoleWasMatchingRole = false;
+
+    if (ModelState.ChatHistory.History.Num() != 0)
+    {
+        FStructuredChatMessage& Message = ModelState.ChatHistory.History.Last();
+        bLastRoleWasMatchingRole = Message.Role == Role;
+    }
+
+    FString CurrentReplyText;
+
+    if (!bLastRoleWasMatchingRole)
+    {
+        FStructuredChatMessage Message;
+        Message.Role = Role;
+        Message.Content = Token;
+
+        ModelState.ChatHistory.History.Add(Message);
+
+        CurrentReplyText += Token;
+    }
+    else
+    {
+        FStructuredChatMessage& Message = ModelState.ChatHistory.History.Last();
+        Message.Content += Token;
+
+        CurrentReplyText += Message.Content;
+    }
+
+    FStructuredChatMessage& Message = ModelState.ChatHistory.History.Last();
+
+    FString Partial;
+
+    //Compute Partials
+    if (ModelParams.Advanced.bEmitPartials)
+    {
+        bool bSplitFound = false;
+        //Check new token for separators
+        for (const FString& Separator : ModelParams.Advanced.PartialsSeparators)
+        {
+            if (Token.Contains(Separator))
+            {
+                bSplitFound = true;
+            }
+        }
+        if (bSplitFound)
+        {
+            Partial = FLlamaString::GetLastSentence(CurrentReplyText);
+        }
+    }
+
+    //Emit token to game thread
+    if (OnTokenGenerated)
+    {
+        OnTokenGenerated(Token);
+
+        if (OnPartialGenerated && !Partial.IsEmpty())
+        {
+            OnPartialGenerated(Partial);
+        }
+    }
+
+    //full response reply on finish
+    if (bEoS)
+    {
+        if (OnModelStateChanged)
+        {
+            OnModelStateChanged(ModelState);
+        }
+        if (OnResponseGenerated)
+        {
+            OnResponseGenerated(CurrentReplyText);
+        }
+    }
+}
+
 void FLlamaNative::RemoveLastNMessages(int32 MessageCount)
 {
     EnqueueBGTask([this, MessageCount](int64 TaskId)
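Because `LoadModel` now enqueues the system prompt before releasing the background thread, and the prompt-insert guards accept calls while `bModelLoadInitiated` is set, startup can be chained without waiting for the load callback. A sketch of the pattern the commit message describes (actor name and prompt text are illustrative):

void AMyChatActor::StartChat()
{
    //The system prompt (if bAutoInsertSystemPromptOnLoad) is enqueued inside LoadModel,
    //so this user prompt queues behind it instead of front-running or being rejected.
    LlamaComponent->LoadModel(false);

    FLlamaChatPrompt ChatPrompt;
    ChatPrompt.Prompt = TEXT("Hello!");
    ChatPrompt.Role = EChatTemplateRole::User;
    ChatPrompt.bGenerateReply = true;
    LlamaComponent->InsertTemplatedPromptStruct(ChatPrompt);
}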

Source/LlamaCore/Private/LlamaSubsystem.cpp

Lines changed: 9 additions & 9 deletions
@@ -29,6 +29,11 @@ void ULlamaSubsystem::Initialize(FSubsystemCollectionBase& Collection)
     {
         OnPromptProcessed.Broadcast(TokensProcessed, Role, Speed);
     };
+    LlamaNative->OnResponseGenerated = [this](const FString& Response)
+    {
+        OnResponseGenerated.Broadcast(Response);
+        OnEndOfStream.Broadcast(true, ModelState.LastTokenGenerationSpeed);
+    };
     LlamaNative->OnError = [this](const FString& ErrorMessage, int32 ErrorCode)
     {
         OnError.Broadcast(ErrorMessage, ErrorCode);
@@ -63,26 +68,26 @@ void ULlamaSubsystem::InsertTemplatedPrompt(const FString& Text, EChatTemplateRo
 
 void ULlamaSubsystem::InsertTemplatedPromptStruct(const FLlamaChatPrompt& ChatPrompt)
 {
-    LlamaNative->InsertTemplatedPrompt(ChatPrompt, [this, ChatPrompt](const FString& Response)
+    LlamaNative->InsertTemplatedPrompt(ChatPrompt);/*, [this, ChatPrompt](const FString& Response)
     {
         if (ChatPrompt.bGenerateReply)
         {
             OnResponseGenerated.Broadcast(Response);
             OnEndOfStream.Broadcast(true, ModelState.LastTokenGenerationSpeed);
         }
-    });
+    });*/
 }
 
 void ULlamaSubsystem::InsertRawPrompt(const FString& Text, bool bGenerateReply)
 {
-    LlamaNative->InsertRawPrompt(Text, bGenerateReply, [this, bGenerateReply](const FString& Response)
+    LlamaNative->InsertRawPrompt(Text, bGenerateReply);/*, [this, bGenerateReply](const FString& Response)
     {
         if (bGenerateReply)
        {
             OnResponseGenerated.Broadcast(Response);
             OnEndOfStream.Broadcast(true, ModelState.LastTokenGenerationSpeed);
         }
-    });
+    })*/;
 }
 
 void ULlamaSubsystem::LoadModel(bool bForceReload)
@@ -104,11 +109,6 @@ void ULlamaSubsystem::LoadModel(bool bForceReload)
             return;
         }
 
-        if (ModelParams.bAutoInsertSystemPromptOnLoad)
-        {
-            InsertTemplatedPrompt(ModelParams.SystemPrompt, EChatTemplateRole::System, false, false);
-        }
-
         OnModelLoaded.Broadcast(ModelPath);
     });
 }

Source/LlamaCore/Public/LlamaComponent.h

Lines changed: 8 additions & 0 deletions
@@ -107,6 +107,14 @@ class LLAMACORE_API ULlamaComponent : public UActorComponent
     UFUNCTION(BlueprintCallable, Category = "LLM Model Component")
     void InsertRawPrompt(UPARAM(meta = (MultiLine = true)) const FString& Text, bool bGenerateReply = true);
 
+    //Typically as user, this pretends the input was generated in history and all downstream functions should trigger. KV-cache won't be updated if no models are loaded.
+    UFUNCTION(BlueprintCallable, Category = "LLM Model Component - Impersonation via External API")
+    void ImpersonateTemplatedPrompt(const FLlamaChatPrompt& ChatPrompt);
+
+    //Use this to feed external model inference through our loop (e.g. as assistant tokens are generated), it will pretend the output was generated locally downstream.
+    UFUNCTION(BlueprintCallable, Category = "LLM Model Component - Impersonation via External API")
+    void ImpersonateTemplatedToken(const FString& Token, EChatTemplateRole Role = EChatTemplateRole::Assistant, bool bIsEndOfStream = false);
+
     //if you want to manually wrap prompt, if template is empty string, default model template is applied. NB: this function should be thread safe, but this has not be thoroughly tested.
     UFUNCTION(BlueprintPure, Category = "LLM Model Component")
     FString WrapPromptForRole(const FString& Text, EChatTemplateRole Role, const FString& OverrideTemplate);
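Per the header comments above, `ImpersonateTemplatedPrompt` also covers non-streamed assistant output: impersonating a whole assistant message updates chat history and emits `OnResponseGenerated` downstream even when no model is loaded (only the KV-cache update is skipped). A hedged example, where `RemoteFullReplyText` is an assumed variable holding an external API's complete reply:

//Feed a complete reply from an external API into the component as if generated locally.
FLlamaChatPrompt AssistantReply;
AssistantReply.Prompt = RemoteFullReplyText;
AssistantReply.Role = EChatTemplateRole::Assistant;

//With a model loaded this inserts with bGenerateReply forced to false;
//without one it only updates history and fires the response delegate.
LlamaComponent->ImpersonateTemplatedPrompt(AssistantReply);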

Source/LlamaCore/Public/LlamaNative.h

Lines changed: 3 additions & 0 deletions
@@ -37,6 +37,8 @@ class LLAMACORE_API FLlamaNative
         TFunction<void(const FString& Response)>OnResponseFinished = nullptr);
     void InsertRawPrompt(const FString& Prompt, bool bGenerateReply = true,
         TFunction<void(const FString& Response)>OnResponseFinished = nullptr);
+    void ImpersonateTemplatedPrompt(const FLlamaChatPrompt& Prompt);
+    void ImpersonateTemplatedToken(const FString& Token, EChatTemplateRole Role = EChatTemplateRole::Assistant, bool bEoS = false);
     bool IsGenerating();
     void StopGeneration();
     void ResumeGeneration();
@@ -82,6 +84,7 @@ class LLAMACORE_API FLlamaNative
     //GT State - safely accesible on game thread
     FLLMModelParams ModelParams;
     FLLMModelState ModelState;
+    bool bModelLoadInitiated = false; //tracking model load attempts
 
     //BG State - do not read/write on GT
     FString CombinedPieceText; //accumulates tokens into full string during per-token inference.
