Skip to content

Commit ab11fb9

Browse files
committed
measure tps via impersonation by stamping start/stop
- not perfectly accurate, but should be close
1 parent b06036c commit ab11fb9

File tree

2 files changed

+17
-0
lines changed

2 files changed

+17
-0
lines changed

Source/LlamaCore/Private/LlamaNative.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -491,12 +491,17 @@ void FLlamaNative::ImpersonateTemplatedToken(const FString& Token, EChatTemplate
491491

492492
ModelState.ChatHistory.History.Add(Message);
493493

494+
ThenTimeStamp = FPlatformTime::Seconds();
495+
ImpersonationTokenCount = 1;
496+
497+
494498
CurrentReplyText += Token;
495499
}
496500
else
497501
{
498502
FStructuredChatMessage& Message = ModelState.ChatHistory.History.Last();
499503
Message.Content += Token;
504+
ImpersonationTokenCount++;
500505

501506
CurrentReplyText += Message.Content;
502507
}
@@ -537,6 +542,14 @@ void FLlamaNative::ImpersonateTemplatedToken(const FString& Token, EChatTemplate
537542
//full response reply on finish
538543
if (bEoS)
539544
{
545+
double Duration = FPlatformTime::Seconds() - ThenTimeStamp;
546+
double TotalTokens = ImpersonationTokenCount;
547+
ImpersonationTokenCount = 0;
548+
549+
ModelState.LastPromptProcessingSpeed = 0; //this can't be measured without more input
550+
ModelState.LastTokenGenerationSpeed = TotalTokens / Duration;
551+
ModelState.LastRole = EChatTemplateRole::Assistant;
552+
540553
if (OnModelStateChanged)
541554
{
542555
OnModelStateChanged(ModelState);

Source/LlamaCore/Public/LlamaNative.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,10 @@ class LLAMACORE_API FLlamaNative
8686
FLLMModelState ModelState;
8787
bool bModelLoadInitiated = false; //tracking model load attempts
8888

89+
//Temp states
90+
double ThenTimeStamp = 0.f;
91+
int32 ImpersonationTokenCount = 0;
92+
8993
//BG State - do not read/write on GT
9094
FString CombinedPieceText; //accumulates tokens into full string during per-token inference.
9195

0 commit comments

Comments
 (0)