@@ -60,7 +60,7 @@ FLlamaNative::FLlamaNative()
     {
         if (ModelParams.Advanced.bLogGenerationStats)
         {
-            UE_LOG(LlamaLog, Log, TEXT("Generated %d tokens in %1.2fs (%1.2ftps)"), TokensGenerated, Duration, SpeedTps);
+            UE_LOG(LlamaLog, Log, TEXT("TGS - Generated %d tokens in %1.2fs (%1.2ftps)"), TokensGenerated, Duration, SpeedTps);
        }
 
         int32 UsedContext = UsedContextLength();
@@ -88,6 +88,11 @@ FLlamaNative::FLlamaNative()
 
     Internal->OnPromptProcessed = [this](int32 TokensProcessed, EChatTemplateRole RoleProcessed, float SpeedTps)
     {
+        if (ModelParams.Advanced.bLogGenerationStats)
+        {
+            UE_LOG(LlamaLog, Log, TEXT("PPS - Processed %d tokens at %1.2ftps"), TokensProcessed, SpeedTps);
+        }
+
         int32 UsedContext = UsedContextLength();
 
         // Sync history data with additional state updates
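
Note: both the TGS (token generation stats) and PPS (prompt processing stats) log lines above are gated on the same advanced flag. A minimal enabling sketch, assuming the params are applied before load; the SetModelParams setter and the Llama instance are hypothetical, only FLLMModelParams and its Advanced field appear in this diff:

    FLLMModelParams Params;
    Params.Advanced.bLogGenerationStats = true;  // emits both the TGS and PPS log lines
    Llama->SetModelParams(Params);               // hypothetical setter, not shown in this diff
    Llama->LoadModel(false, nullptr);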
@@ -266,6 +271,7 @@ void FLlamaNative::LoadModel(bool bForceReload, TFunction<void(const FString&, i
         // already loaded, we're done
         return ModelLoadedCallback(ModelParams.PathToModel, 0);
     }
+    bModelLoadInitiated = true;
 
     // Copy so these don't get modified during the enqueue op
     const FLLMModelParams ParamsAtLoad = ModelParams;
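
Note: the bModelLoadInitiated flag set here is not declared anywhere in this diff; presumably it is a new member in FLlamaNative.h. A minimal sketch of the assumed declaration:

    // Assumed addition to FLlamaNative.h (not shown in this diff): set when a
    // load has been enqueued but has not yet completed, so prompts issued during
    // startup are queued rather than rejected. Only ever touched from the game
    // thread, so a plain bool suffices.
    bool bModelLoadInitiated = false;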
@@ -284,6 +290,14 @@ void FLlamaNative::LoadModel(bool bForceReload, TFunction<void(const FString&, i
             const FString TemplateString = FLlamaString::ToUE(Internal->Template);
             const FString TemplateSource = FLlamaString::ToUE(Internal->TemplateSource);
 
+            // Before we release the BG thread, ensure we enqueue the system prompt.
+            // If we do it later, other queued calls will front-run it. This enables startup chaining to work correctly.
+            if (ParamsAtLoad.bAutoInsertSystemPromptOnLoad)
+            {
+                Internal->InsertTemplatedPrompt(FLlamaString::ToStd(ParamsAtLoad.SystemPrompt), EChatTemplateRole::System, false, false);
+            }
+
+            // Callback on game thread for data sync
             EnqueueGTTask([this, TemplateString, TemplateSource, ModelLoadedCallback]
             {
                 FJinjaChatTemplate ChatTemplate;
@@ -293,6 +307,8 @@ void FLlamaNative::LoadModel(bool bForceReload, TFunction<void(const FString&, i
                 ModelState.ChatTemplateInUse = ChatTemplate;
                 ModelState.bModelIsLoaded = true;
 
+                bModelLoadInitiated = false;
+
                 if (OnModelStateChanged)
                 {
                     OnModelStateChanged(ModelState);
@@ -308,15 +324,22 @@ void FLlamaNative::LoadModel(bool bForceReload, TFunction<void(const FString&, i
         {
             EnqueueGTTask([this, ModelLoadedCallback]
             {
+                bModelLoadInitiated = false;
+
                 // On error will be triggered earlier in the chain, but forward our model loading error status here
-                ModelLoadedCallback(ModelParams.PathToModel, 15);
+                if (ModelLoadedCallback)
+                {
+                    ModelLoadedCallback(ModelParams.PathToModel, 15);
+                }
             }, TaskId);
         }
     });
 }
 
 void FLlamaNative::UnloadModel(TFunction<void(int32 StatusCode)> ModelUnloadedCallback)
 {
+    bModelLoadInitiated = false;
+
     EnqueueBGTask([this, ModelUnloadedCallback](int64 TaskId)
     {
         if (IsModelLoaded())
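
Note: the new null check on ModelLoadedCallback matters for fire-and-forget loads. Calling an unbound TFunction asserts at runtime in Unreal, so an error during a callback-less load would previously have crashed on this path. A hedged usage sketch, with the Llama instance assumed:

    // No callback supplied; TFunction is constructible from nullptr, and the
    // guarded error path above now tolerates the unbound callback.
    Llama->LoadModel(false, nullptr);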
@@ -349,7 +372,7 @@ bool FLlamaNative::IsModelLoaded()
 
 void FLlamaNative::InsertTemplatedPrompt(const FLlamaChatPrompt& Prompt, TFunction<void(const FString& Response)> OnResponseFinished)
 {
-    if (!IsModelLoaded())
+    if (!IsModelLoaded() && !bModelLoadInitiated)
     {
         UE_LOG(LlamaLog, Warning, TEXT("Model isn't loaded, can't run prompt."));
         return;
@@ -386,7 +409,7 @@ void FLlamaNative::InsertTemplatedPrompt(const FLlamaChatPrompt& Prompt, TFuncti
 
 void FLlamaNative::InsertRawPrompt(const FString& Prompt, bool bGenerateReply, TFunction<void(const FString& Response)> OnResponseFinished)
 {
-    if (!IsModelLoaded())
+    if (!IsModelLoaded() && !bModelLoadInitiated)
     {
         UE_LOG(LlamaLog, Warning, TEXT("Model isn't loaded, can't run prompt."));
         return;
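
Note: relaxing the guard in both insert paths to also accept bModelLoadInitiated is what makes the startup chaining mentioned earlier work: a prompt issued right after LoadModel is queued behind the load on the BG thread instead of being rejected. A sketch of that pattern, assuming Llama is a live FLlamaNative instance:

    Llama->LoadModel(false, [](const FString& ModelPath, int32 StatusCode)
    {
        UE_LOG(LlamaLog, Log, TEXT("%s loaded, status %d"), *ModelPath, StatusCode);
    });

    // Safe to call immediately: the load has been initiated, so the guard passes
    // and the prompt enqueues behind the (still running) load task.
    FLlamaChatPrompt Prompt;
    Prompt.Role = EChatTemplateRole::User;
    Prompt.Prompt = TEXT("Hello!");
    Llama->InsertTemplatedPrompt(Prompt, [](const FString& Response)
    {
        UE_LOG(LlamaLog, Log, TEXT("Reply: %s"), *Response);
    });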
@@ -407,6 +430,124 @@ void FLlamaNative::InsertRawPrompt(const FString& Prompt, bool bGenerateReply, T
     });
 }
 
+void FLlamaNative::ImpersonateTemplatedPrompt(const FLlamaChatPrompt& Prompt)
+{
+    // modify model state
+    if (IsModelLoaded())
+    {
+        // insert it, but make sure we don't do any token generation
+        FLlamaChatPrompt ModifiedPrompt = Prompt;
+        ModifiedPrompt.bGenerateReply = false;
+
+        InsertTemplatedPrompt(ModifiedPrompt);
+    }
+    else
+    {
+        // no model, so just run this in sync mode
+        FStructuredChatMessage Message;
+        Message.Role = Prompt.Role;
+        Message.Content = Prompt.Prompt;
+
+        // modify our chat history state
+        ModelState.ChatHistory.History.Add(Message);
+
+        if (OnModelStateChanged)
+        {
+            OnModelStateChanged(ModelState);
+        }
+        // was this an assistant message? if so, emit the response-generated callback
+        if (Message.Role == EChatTemplateRole::Assistant)
+        {
+            if (OnResponseGenerated)
+            {
+                OnResponseGenerated(Prompt.Prompt);
+            }
+        }
+    }
+}
+
+void FLlamaNative::ImpersonateTemplatedToken(const FString& Token, EChatTemplateRole Role, bool bEoS)
+{
+    // Should be called on the game thread.
+
+    // NB: we don't support updating model internal state atm
+
+    // Check if we need to add a message before modifying it
+    bool bLastRoleWasMatchingRole = false;
+
+    if (ModelState.ChatHistory.History.Num() != 0)
+    {
+        FStructuredChatMessage& Message = ModelState.ChatHistory.History.Last();
+        bLastRoleWasMatchingRole = Message.Role == Role;
+    }
+
+    FString CurrentReplyText;
+
+    if (!bLastRoleWasMatchingRole)
+    {
+        FStructuredChatMessage Message;
+        Message.Role = Role;
+        Message.Content = Token;
+
+        ModelState.ChatHistory.History.Add(Message);
+
+        CurrentReplyText += Token;
+    }
+    else
+    {
+        FStructuredChatMessage& Message = ModelState.ChatHistory.History.Last();
+        Message.Content += Token;
+
+        CurrentReplyText += Message.Content;
+    }
+
+    FStructuredChatMessage& Message = ModelState.ChatHistory.History.Last();
+
+    FString Partial;
+
+    // Compute partials
+    if (ModelParams.Advanced.bEmitPartials)
+    {
+        bool bSplitFound = false;
+        // Check the new token for separators
+        for (const FString& Separator : ModelParams.Advanced.PartialsSeparators)
+        {
+            if (Token.Contains(Separator))
+            {
+                bSplitFound = true;
+            }
+        }
+        if (bSplitFound)
+        {
+            Partial = FLlamaString::GetLastSentence(CurrentReplyText);
+        }
+    }
+
+    // Emit token to game thread
+    if (OnTokenGenerated)
+    {
+        OnTokenGenerated(Token);
+
+        if (OnPartialGenerated && !Partial.IsEmpty())
+        {
+            OnPartialGenerated(Partial);
+        }
+    }
+
+    // full response reply on finish
+    if (bEoS)
+    {
+        if (OnModelStateChanged)
+        {
+            OnModelStateChanged(ModelState);
+        }
+        if (OnResponseGenerated)
+        {
+            OnResponseGenerated(CurrentReplyText);
+        }
+    }
+}
+
 void FLlamaNative::RemoveLastNMessages(int32 MessageCount)
 {
     EnqueueBGTask([this, MessageCount](int64 TaskId)
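
Note: ImpersonateTemplatedToken drives the usual history/partial/callback pipeline without running inference, which suits replaying tokens that were generated elsewhere (e.g. on a server). A hedged sketch of such a replay, with the token source assumed; per the comment in the function, it must run on the game thread:

    void ReplayRemoteAssistantTokens(FLlamaNative& Llama, const TArray<FString>& Tokens)
    {
        for (int32 i = 0; i < Tokens.Num(); ++i)
        {
            // Mark the last token as end-of-stream so the ModelState sync and
            // OnResponseGenerated fire exactly once, after the final append.
            const bool bEoS = (i == Tokens.Num() - 1);
            Llama.ImpersonateTemplatedToken(Tokens[i], EChatTemplateRole::Assistant, bEoS);
        }
    }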