
Commit b06036c

Merge branch 'feat-b4958'
2 parents: 16c17c2 + 420a2b8

13 files changed: +310 additions, −55 deletions

Llama.uplugin

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 {
 	"FileVersion": 3,
 	"Version": 1,
-	"VersionName": "0.9.2",
+	"VersionName": "0.9.4",
 	"FriendlyName": "Llama",
 	"Description": "Llama.cpp plugin for large language model (LLM) inference.",
 	"Category": "LLM",

Source/LlamaCore/Private/Internal/LlamaInternal.cpp

Lines changed: 8 additions & 8 deletions
@@ -273,7 +273,7 @@ int32 FLlamaInternal::UsedContext()
 {
     if (Context)
     {
-        return llama_get_kv_cache_used_cells(Context);
+        return llama_kv_self_used_cells(Context);
     }
     else
     {
@@ -313,16 +313,16 @@ void FLlamaInternal::ResetContextHistory(bool bKeepSystemsPrompt)
     ContextHistory.clear();
     Messages.clear();

-    llama_kv_cache_clear(Context);
+    llama_kv_self_clear(Context);
     FilledContextCharLength = 0;
 }

 void FLlamaInternal::RollbackContextHistoryByTokens(int32 NTokensToErase)
 {
     // clear the last n_regen tokens from the KV cache and update n_past
-    int32 TokensUsed = llama_get_kv_cache_used_cells(Context); //FilledContextCharLength
+    int32 TokensUsed = llama_kv_self_used_cells(Context); //FilledContextCharLength

-    llama_kv_cache_seq_rm(Context, 0, TokensUsed - NTokensToErase, -1);
+    llama_kv_self_seq_rm(Context, 0, TokensUsed - NTokensToErase, -1);

     //FilledContextCharLength -= NTokensToErase;

@@ -442,7 +442,7 @@ int32 FLlamaInternal::ProcessPrompt(const std::string& Prompt, EChatTemplateRole

     //Grab vocab
     const llama_vocab* Vocab = llama_model_get_vocab(LlamaModel);
-    const bool IsFirst = llama_get_kv_cache_used_cells(Context) == 0;
+    const bool IsFirst = llama_kv_self_used_cells(Context) == 0;

     // tokenize the prompt
     const int NPromptTokens = -llama_tokenize(Vocab, Prompt.c_str(), Prompt.size(), NULL, 0, IsFirst, true);
@@ -461,7 +461,7 @@ int32 FLlamaInternal::ProcessPrompt(const std::string& Prompt, EChatTemplateRole

     //check sizing before running prompt decode
     int NContext = llama_n_ctx(Context);
-    int NContextUsed = llama_get_kv_cache_used_cells(Context);
+    int NContextUsed = llama_kv_self_used_cells(Context);

     if (NContextUsed + NPromptTokens > NContext)
     {
@@ -506,7 +506,7 @@ int32 FLlamaInternal::ProcessPrompt(const std::string& Prompt, EChatTemplateRole

     // Check context before running decode
     int NContext = llama_n_ctx(Context);
-    int NContextUsed = llama_get_kv_cache_used_cells(Context);
+    int NContextUsed = llama_kv_self_used_cells(Context);

     if (NContextUsed + BatchTokens.size() > NContext)
     {
@@ -563,7 +563,7 @@ std::string FLlamaInternal::Generate(const std::string& Prompt, bool bAppendToMe

     // check if we have enough space in the context to evaluate this batch - might need to be inside loop
     int NContext = llama_n_ctx(Context);
-    int NContextUsed = llama_get_kv_cache_used_cells(Context);
+    int NContextUsed = llama_kv_self_used_cells(Context);
     bool bEOGExit = false;

     while (bGenerationActive) //processing can be aborted by flipping the boolean
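
Note: this file's changes are a mechanical migration from the deprecated llama_get_kv_cache_used_cells / llama_kv_cache_* entry points to the newer llama_kv_self_* names in llama.cpp (llama_kv_self_clear replaces llama_kv_cache_clear, llama_kv_self_seq_rm replaces llama_kv_cache_seq_rm). A minimal sketch of the renamed calls outside the plugin follows; the helper names HasRoomForPrompt and RollbackLastTokens are illustrative, not plugin API, and Context is assumed to be an already-created llama_context*.

#include "llama.h"

// Sketch: the same context-space bookkeeping the diff performs, expressed with
// the renamed llama_kv_self_* calls. NPromptTokens would come from llama_tokenize.
static bool HasRoomForPrompt(llama_context* Context, int NPromptTokens)
{
    const int NContext = (int)llama_n_ctx(Context);              // total context window
    const int NContextUsed = llama_kv_self_used_cells(Context);  // cells already occupied
    return NContextUsed + NPromptTokens <= NContext;
}

// Sketch: drop the last NTokensToErase tokens of sequence 0, mirroring what
// RollbackContextHistoryByTokens does in the diff above.
static void RollbackLastTokens(llama_context* Context, int NTokensToErase)
{
    const int TokensUsed = llama_kv_self_used_cells(Context);
    llama_kv_self_seq_rm(Context, 0, TokensUsed - NTokensToErase, -1);
}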

Source/LlamaCore/Private/LlamaComponent.cpp

Lines changed: 24 additions & 10 deletions
@@ -19,6 +19,12 @@ ULlamaComponent::ULlamaComponent(const FObjectInitializer &ObjectInitializer)
         OnTokenGenerated.Broadcast(Token);
     };

+    LlamaNative->OnResponseGenerated = [this](const FString& Response)
+    {
+        OnResponseGenerated.Broadcast(Response);
+        OnEndOfStream.Broadcast(true, ModelState.LastTokenGenerationSpeed);
+    };
+
     LlamaNative->OnPartialGenerated = [this](const FString& Partial)
     {
         OnPartialGenerated.Broadcast(Partial);
@@ -87,26 +93,26 @@ void ULlamaComponent::InsertTemplatedPrompt(const FString& Text, EChatTemplateRo

 void ULlamaComponent::InsertTemplatedPromptStruct(const FLlamaChatPrompt& ChatPrompt)
 {
-    LlamaNative->InsertTemplatedPrompt(ChatPrompt, [this, ChatPrompt](const FString& Response)
-    {
+    LlamaNative->InsertTemplatedPrompt(ChatPrompt);/*, [this, ChatPrompt](const FString& Response));
+    {
         if (ChatPrompt.bGenerateReply)
         {
             OnResponseGenerated.Broadcast(Response);
             OnEndOfStream.Broadcast(true, ModelState.LastTokenGenerationSpeed);
         }
-    });
+    });*/
 }

 void ULlamaComponent::InsertRawPrompt(const FString& Text, bool bGenerateReply)
 {
-    LlamaNative->InsertRawPrompt(Text, bGenerateReply, [this, bGenerateReply](const FString& Response)
+    LlamaNative->InsertRawPrompt(Text, bGenerateReply); /*, [this, bGenerateReply](const FString& Response)
     {
         if (bGenerateReply)
         {
             OnResponseGenerated.Broadcast(Response);
             OnEndOfStream.Broadcast(true, ModelState.LastTokenGenerationSpeed);
         }
-    });
+    });*/
 }

 void ULlamaComponent::LoadModel(bool bForceReload)
@@ -120,11 +126,6 @@ void ULlamaComponent::LoadModel(bool bForceReload)
         return;
     }

-    if (ModelParams.bAutoInsertSystemPromptOnLoad)
-    {
-        InsertTemplatedPrompt(ModelParams.SystemPrompt, EChatTemplateRole::System, false, false);
-    }
-
         OnModelLoaded.Broadcast(ModelPath);
     });
 }
@@ -163,6 +164,19 @@ void ULlamaComponent::RemoveLastUserInput()
     LlamaNative->RemoveLastUserInput();
 }

+
+void ULlamaComponent::ImpersonateTemplatedPrompt(const FLlamaChatPrompt& ChatPrompt)
+{
+    LlamaNative->SetModelParams(ModelParams);
+
+    LlamaNative->ImpersonateTemplatedPrompt(ChatPrompt);
+}
+
+void ULlamaComponent::ImpersonateTemplatedToken(const FString& Token, EChatTemplateRole Role, bool bEoS)
+{
+    LlamaNative->ImpersonateTemplatedToken(Token, Role, bEoS);
+}
+
 FString ULlamaComponent::WrapPromptForRole(const FString& Text, EChatTemplateRole Role, const FString& Template)
 {
     return LlamaNative->WrapPromptForRole(Text, Role, Template);
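
Note: with the per-call response lambdas commented out, replies from this component now arrive only through the component-wide OnResponseGenerated and OnEndOfStream events wired up in the constructor above. A hedged consumer-side sketch follows, assuming these are dynamic multicast delegates on ULlamaComponent (as the Broadcast calls suggest); AMyChatActor and its UFUNCTION handlers are illustrative, not part of the plugin.

// Illustrative only: bind once to the component-level events instead of
// passing per-call callbacks.
void AMyChatActor::BeginPlay()
{
    Super::BeginPlay();

    LlamaComponent->OnResponseGenerated.AddDynamic(this, &AMyChatActor::HandleResponse);
    LlamaComponent->OnModelLoaded.AddDynamic(this, &AMyChatActor::HandleModelLoaded);

    LlamaComponent->LoadModel(false);
}

void AMyChatActor::HandleModelLoaded(const FString& ModelPath)
{
    // FLlamaChatPrompt fields as seen in this commit: Prompt, Role, bGenerateReply.
    FLlamaChatPrompt ChatPrompt;
    ChatPrompt.Prompt = TEXT("Hello!");
    ChatPrompt.Role = EChatTemplateRole::User;
    ChatPrompt.bGenerateReply = true;

    LlamaComponent->InsertTemplatedPromptStruct(ChatPrompt);
}

void AMyChatActor::HandleResponse(const FString& Response)
{
    UE_LOG(LogTemp, Log, TEXT("LLM reply: %s"), *Response);
}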

Source/LlamaCore/Private/LlamaNative.cpp

Lines changed: 145 additions & 4 deletions
@@ -60,7 +60,7 @@ FLlamaNative::FLlamaNative()
     {
         if (ModelParams.Advanced.bLogGenerationStats)
         {
-            UE_LOG(LlamaLog, Log, TEXT("Generated %d tokens in %1.2fs (%1.2ftps)"), TokensGenerated, Duration, SpeedTps);
+            UE_LOG(LlamaLog, Log, TEXT("TGS - Generated %d tokens in %1.2fs (%1.2ftps)"), TokensGenerated, Duration, SpeedTps);
         }

         int32 UsedContext = UsedContextLength();
@@ -88,6 +88,11 @@ FLlamaNative::FLlamaNative()

     Internal->OnPromptProcessed = [this](int32 TokensProcessed, EChatTemplateRole RoleProcessed, float SpeedTps)
     {
+        if (ModelParams.Advanced.bLogGenerationStats)
+        {
+            UE_LOG(LlamaLog, Log, TEXT("PPS - Processed %d tokens at %1.2ftps"), TokensProcessed, SpeedTps);
+        }
+
         int32 UsedContext = UsedContextLength();

         //Sync history data with additional state updates
@@ -266,6 +271,7 @@ void FLlamaNative::LoadModel(bool bForceReload, TFunction<void(const FString&, i
         //already loaded, we're done
         return ModelLoadedCallback(ModelParams.PathToModel, 0);
     }
+    bModelLoadInitiated = true;

     //Copy so these dont get modified during enqueue op
     const FLLMModelParams ParamsAtLoad = ModelParams;
@@ -284,6 +290,14 @@ void FLlamaNative::LoadModel(bool bForceReload, TFunction<void(const FString&, i
         const FString TemplateString = FLlamaString::ToUE(Internal->Template);
         const FString TemplateSource = FLlamaString::ToUE(Internal->TemplateSource);

+        //Before we release the BG thread, ensure we enqueue the system prompt
+        //If we do it later, other queued calls will frontrun it. This enables startup chaining correctly
+        if (ParamsAtLoad.bAutoInsertSystemPromptOnLoad)
+        {
+            Internal->InsertTemplatedPrompt(FLlamaString::ToStd(ParamsAtLoad.SystemPrompt), EChatTemplateRole::System, false, false);
+        }
+
+        //Callback on game thread for data sync
         EnqueueGTTask([this, TemplateString, TemplateSource, ModelLoadedCallback]
         {
             FJinjaChatTemplate ChatTemplate;
@@ -293,6 +307,8 @@ void FLlamaNative::LoadModel(bool bForceReload, TFunction<void(const FString&, i
             ModelState.ChatTemplateInUse = ChatTemplate;
             ModelState.bModelIsLoaded = true;

+            bModelLoadInitiated = false;
+
             if (OnModelStateChanged)
             {
                 OnModelStateChanged(ModelState);
@@ -308,15 +324,22 @@ void FLlamaNative::LoadModel(bool bForceReload, TFunction<void(const FString&, i
         {
             EnqueueGTTask([this, ModelLoadedCallback]
             {
+                bModelLoadInitiated = false;
+
                 //On error will be triggered earlier in the chain, but forward our model loading error status here
-                ModelLoadedCallback(ModelParams.PathToModel, 15);
+                if (ModelLoadedCallback)
+                {
+                    ModelLoadedCallback(ModelParams.PathToModel, 15);
+                }
             }, TaskId);
         }
     });
 }

 void FLlamaNative::UnloadModel(TFunction<void(int32 StatusCode)> ModelUnloadedCallback)
 {
+    bModelLoadInitiated = false;
+
     EnqueueBGTask([this, ModelUnloadedCallback](int64 TaskId)
     {
         if (IsModelLoaded())
@@ -349,7 +372,7 @@ bool FLlamaNative::IsModelLoaded()

 void FLlamaNative::InsertTemplatedPrompt(const FLlamaChatPrompt& Prompt, TFunction<void(const FString& Response)> OnResponseFinished)
 {
-    if (!IsModelLoaded())
+    if (!IsModelLoaded() && !bModelLoadInitiated)
    {
         UE_LOG(LlamaLog, Warning, TEXT("Model isn't loaded, can't run prompt."));
         return;
@@ -386,7 +409,7 @@ void FLlamaNative::InsertTemplatedPrompt(const FLlamaChatPrompt& Prompt, TFuncti

 void FLlamaNative::InsertRawPrompt(const FString& Prompt, bool bGenerateReply, TFunction<void(const FString& Response)>OnResponseFinished)
 {
-    if (!IsModelLoaded())
+    if (!IsModelLoaded() && !bModelLoadInitiated)
     {
         UE_LOG(LlamaLog, Warning, TEXT("Model isn't loaded, can't run prompt."));
         return;
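
Note: the new bModelLoadInitiated flag lets InsertTemplatedPrompt / InsertRawPrompt accept prompts while a load is still in flight, and the system prompt is now enqueued on the background thread before the load task completes, so queued calls can no longer frontrun it. A hedged sketch of the call pattern this enables follows; names and signatures are taken from this commit, while the surrounding code is illustrative and Llama is assumed to be a valid FLlamaNative instance owned by the caller.

// Illustrative: chaining a load and the first prompt back-to-back. The prompt is
// accepted because bModelLoadInitiated is set, and it queues behind both the load
// task and the auto-inserted system prompt.
Llama->LoadModel(false, [](const FString& ModelPath, int32 StatusCode)
{
    // StatusCode 0 = success; 15 is the load-error status forwarded above.
});

FLlamaChatPrompt ChatPrompt;
ChatPrompt.Prompt = TEXT("Summarize the latest patch notes.");
ChatPrompt.Role = EChatTemplateRole::User;
ChatPrompt.bGenerateReply = true;

Llama->InsertTemplatedPrompt(ChatPrompt);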
@@ -407,6 +430,124 @@ void FLlamaNative::InsertRawPrompt(const FString& Prompt, bool bGenerateReply, T
     });
 }

+void FLlamaNative::ImpersonateTemplatedPrompt(const FLlamaChatPrompt& Prompt)
+{
+    //modify model state
+    if (IsModelLoaded())
+    {
+        //insert it but make sure we don't do any token generation
+        FLlamaChatPrompt ModifiedPrompt = Prompt;
+        ModifiedPrompt.bGenerateReply = false;
+
+        InsertTemplatedPrompt(ModifiedPrompt);
+    }
+    else
+    {
+        //no model, so just run this in sync mode
+        FStructuredChatMessage Message;
+        Message.Role = Prompt.Role;
+        Message.Content = Prompt.Prompt;
+
+        //modify our chat history state
+        ModelState.ChatHistory.History.Add(Message);
+
+        if (OnModelStateChanged)
+        {
+            OnModelStateChanged(ModelState);
+        }
+        //was this an assistant message? emit response generated callback
+        if (Message.Role == EChatTemplateRole::Assistant)
+        {
+            if (OnResponseGenerated)
+            {
+                OnResponseGenerated(Prompt.Prompt);
+            }
+        }
+    }
+}
+
+void FLlamaNative::ImpersonateTemplatedToken(const FString& Token, EChatTemplateRole Role, bool bEoS)
+{
+    //Should be called on game thread.
+
+    //NB: we don't support updating model internal state atm
+
+    //Check if we need to add a message before modifying it
+    bool bLastRoleWasMatchingRole = false;
+
+    if (ModelState.ChatHistory.History.Num() != 0)
+    {
+        FStructuredChatMessage& Message = ModelState.ChatHistory.History.Last();
+        bLastRoleWasMatchingRole = Message.Role == Role;
+    }
+
+    FString CurrentReplyText;
+
+    if (!bLastRoleWasMatchingRole)
+    {
+        FStructuredChatMessage Message;
+        Message.Role = Role;
+        Message.Content = Token;
+
+        ModelState.ChatHistory.History.Add(Message);
+
+        CurrentReplyText += Token;
+    }
+    else
+    {
+        FStructuredChatMessage& Message = ModelState.ChatHistory.History.Last();
+        Message.Content += Token;
+
+        CurrentReplyText += Message.Content;
+    }
+
+    FStructuredChatMessage& Message = ModelState.ChatHistory.History.Last();
+
+    FString Partial;
+
+    //Compute Partials
+    if (ModelParams.Advanced.bEmitPartials)
+    {
+        bool bSplitFound = false;
+        //Check new token for separators
+        for (const FString& Separator : ModelParams.Advanced.PartialsSeparators)
+        {
+            if (Token.Contains(Separator))
+            {
+                bSplitFound = true;
+            }
+        }
+        if (bSplitFound)
+        {
+            Partial = FLlamaString::GetLastSentence(CurrentReplyText);
+        }
+    }
+
+    //Emit token to game thread
+    if (OnTokenGenerated)
+    {
+        OnTokenGenerated(Token);
+
+        if (OnPartialGenerated && !Partial.IsEmpty())
+        {
+            OnPartialGenerated(Partial);
+        }
+    }
+
+    //full response reply on finish
+    if (bEoS)
+    {
+        if (OnModelStateChanged)
+        {
+            OnModelStateChanged(ModelState);
+        }
+        if (OnResponseGenerated)
+        {
+            OnResponseGenerated(CurrentReplyText);
+        }
+    }
+}
+
 void FLlamaNative::RemoveLastNMessages(int32 MessageCount)
 {
     EnqueueBGTask([this, MessageCount](int64 TaskId)
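
Note: ImpersonateTemplatedPrompt and ImpersonateTemplatedToken mirror externally produced messages into the chat history and fire the same token/partial/response callbacks as local inference, without updating the model's internal state. A hedged usage sketch follows; the remote token source and the AMyChatActor handlers are illustrative, not part of the plugin.

// Illustrative: replaying a reply streamed from elsewhere (e.g. a server)
// through the component so listeners see the usual events.
void AMyChatActor::OnRemoteTokenReceived(const FString& Token, bool bIsLastToken)
{
    // Appends to (or starts) an assistant message and fires OnTokenGenerated /
    // OnPartialGenerated; when bEoS is true it also fires OnResponseGenerated.
    LlamaComponent->ImpersonateTemplatedToken(Token, EChatTemplateRole::Assistant, bIsLastToken);
}

void AMyChatActor::OnRemoteUserMessage(const FString& Text)
{
    // Insert a whole message without generating a local reply
    // (ImpersonateTemplatedPrompt forces bGenerateReply to false when a model is loaded).
    FLlamaChatPrompt ChatPrompt;
    ChatPrompt.Prompt = Text;
    ChatPrompt.Role = EChatTemplateRole::User;
    LlamaComponent->ImpersonateTemplatedPrompt(ChatPrompt);
}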

0 commit comments
