@@ -807,7 +807,7 @@ var _ = Describe("Simulator", func() {
807807				simulator .config .TimeToFirstTokenStdDev  =  timeToFirstTokenStdDev 
808808				simulator .config .KVCacheTransferLatency  =  kvCacheLatency 
809809				simulator .config .KVCacheTransferLatencyStdDev  =  kvCacheLatencyStdDev 
810- 				timeToFirst  :=  simulator .getTimeToFirstToken (1 , doREmotePrefill )
810+ 				timeToFirst  :=  simulator .getTimeToFirstToken (1 , 0 ,  doREmotePrefill )
811811				if  doREmotePrefill  {
812812					Expect (timeToFirst ).To (BeNumerically (">=" , int (float32 (kvCacheLatency )* 0.3 )))
813813					Expect (timeToFirst ).To (BeNumerically ("<=" , int (float32 (kvCacheLatency )* 1.7 )))
@@ -838,7 +838,7 @@ var _ = Describe("Simulator", func() {
838838			simulator .config .PrefillTimePerToken  =  200 
839839			simulator .config .PrefillTimeStdDev  =  80 
840840
841- 			ttft  :=  simulator .getTimeToFirstToken (128 , false )
841+ 			ttft  :=  simulator .getTimeToFirstToken (128 , 0 ,  false )
842842
843843			Expect (ttft ).To (BeNumerically ("==" , timeToFirstToken ))
844844		})
@@ -851,33 +851,60 @@ var _ = Describe("Simulator", func() {
851851			simulator .config .PrefillTimePerToken  =  200 
852852			simulator .config .PrefillTimeStdDev  =  80 
853853
854- 			ttft  :=  simulator .getTimeToFirstToken (128 , false )
854+ 			ttft  :=  simulator .getTimeToFirstToken (128 , 0 ,  false )
855855			Expect (ttft ).NotTo (BeNumerically ("==" , 0 ))
856856		})
857857
858- 		DescribeTable ("time to first token is against number of prompt tokens" ,
859- 			func (prefillOverhead  int , prefillTimePerToken  int , stdDev  int , nTokens  int ) {
858+ 		DescribeTable ("time to first token is against number of prompt tokens with std " ,
859+ 			func (prefillOverhead  int , prefillTimePerToken  int , stdDev  int , nTokens  int ,  nCachedTokens   int ) {
860860				simulator .config .TimeToFirstToken  =  0 
861861				simulator .config .PrefillOverhead  =  prefillOverhead 
862862				simulator .config .PrefillTimePerToken  =  prefillTimePerToken 
863863				simulator .config .PrefillTimeStdDev  =  stdDev 
864864
865- 				ttft  :=  simulator .getTimeToFirstToken (nTokens , false )
865+ 				ttft  :=  simulator .getTimeToFirstToken (nTokens , nCachedTokens ,  false )
866866
867- 				expectedTTFT  :=  prefillOverhead  +  prefillTimePerToken * nTokens 
867+ 				expectedTTFT  :=  prefillOverhead  +  prefillTimePerToken * ( nTokens - nCachedTokens ) 
868868				Expect (ttft ).To (BeNumerically (">=" , int (float64 (expectedTTFT )* 0.3 )))
869869				Expect (ttft ).To (BeNumerically ("<=" , int (float64 (expectedTTFT )* 1.7 )))
870+ 			},
871+ 			func (prefillOverhead  int , prefillTimePerToken , stdDev  int , nTokens  int , nCachedTokens  int ) string  {
872+ 				return  fmt .Sprintf ("prefillOverhead: %d, prefillTimePerToken: %d, stdDev: %d, nTokens: %d nCachedTokens: %d" ,
873+ 					prefillOverhead , prefillTimePerToken , stdDev , nTokens , nCachedTokens )
874+ 			},
875+ 			Entry ("single token" , 100 , 50 , 10 , 1 , 0 ),
876+ 			Entry ("single token big std" , 100 , 50 , 70 , 1 , 0 ),
877+ 			Entry ("stddev is 0" , 100 , 50 , 0 , 1 , 0 ),
878+ 			Entry ("medium overhead, 512 tokens" , 200 , 1000 , 150 , 512 , 0 ),
879+ 			Entry ("large overhead, 1024 tokens" , 2000 , 3000 , 800 , 1024 , 0 ),
880+ 			Entry ("very long prompt" , 150 , 200 , 70 , 20000 , 0 ),
881+ 			Entry ("medium overhead, 512 tokens, 256 cached" , 200 , 1000 , 150 , 512 , 256 ),
882+ 			Entry ("large overhead, 1024 tokens, 1008 cached" , 2000 , 3000 , 800 , 1024 , 1008 ),
883+ 			Entry ("very long prompt, 1024 cached" , 150 , 200 , 70 , 20000 , 1024 ),
884+ 		)
885+ 
886+ 		DescribeTable ("time to first token is against number of prompt tokens" ,
887+ 			func (prefillOverhead  int , prefillTimePerToken  int , nTokens  int , nCachedTokens  int ) {
888+ 				simulator .config .TimeToFirstToken  =  0 
889+ 				simulator .config .PrefillOverhead  =  prefillOverhead 
890+ 				simulator .config .PrefillTimePerToken  =  prefillTimePerToken 
891+ 				simulator .config .PrefillTimeStdDev  =  0 
870892
893+ 				ttft  :=  simulator .getTimeToFirstToken (nTokens , nCachedTokens , false )
894+ 				expectedTTFT  :=  prefillOverhead  +  prefillTimePerToken * (nTokens - nCachedTokens )
895+ 				Expect (ttft ).To (Equal (expectedTTFT ))
871896			},
872- 			func (prefillOverhead  int , prefillTimePerToken , stdDev  int , nTokens  int ) string  {
873- 				return  fmt .Sprintf ("prefillOverhead: %d, prefillTimePerToken: %d, stdDev : %d, nTokens : %d" ,
874- 					prefillOverhead , prefillTimePerToken , stdDev ,  nTokens )
897+ 			func (prefillOverhead  int , prefillTimePerToken , nTokens  int , nCachedTokens  int ) string  {
898+ 				return  fmt .Sprintf ("prefillOverhead: %d, prefillTimePerToken: %d, nTokens : %d nCachedTokens : %d" ,
899+ 					prefillOverhead , prefillTimePerToken , nTokens ,  nCachedTokens )
875900			},
876- 			Entry ("single token" , 100 , 50 , 70 , 1 ),
877- 			Entry ("stddev is 0" , 100 , 50 , 0 , 1 ),
878- 			Entry ("medium overhead, 512 tokens" , 200 , 1000 , 150 , 512 ),
879- 			Entry ("large overhead, 1024 tokens" , 2000 , 3000 , 1800 , 1024 ),
880- 			Entry ("very long prompt" , 150 , 200 , 100 , 20000 ),
901+ 			Entry ("single token" , 100 , 50 , 1 , 0 ),
902+ 			Entry ("medium overhead, 512 tokens" , 200 , 1000 , 512 , 0 ),
903+ 			Entry ("large overhead, 1024 tokens" , 2000 , 3000 , 1024 , 0 ),
904+ 			Entry ("very long prompt" , 150 , 200 , 20000 , 0 ),
905+ 			Entry ("medium overhead, 512 tokens, 256 cached" , 200 , 1000 , 512 , 256 ),
906+ 			Entry ("large overhead, 1024 tokens, 128 cached" , 2000 , 3000 , 1024 , 128 ),
907+ 			Entry ("very long prompt, 1024 cached" , 150 , 200 , 20000 , 1024 ),
881908		)
882909
883910		It ("when <kv-cache-transfer-latency> not 0, ignore <kv-cache-transfer-overhead>" , func () {
@@ -887,7 +914,7 @@ var _ = Describe("Simulator", func() {
887914			simulator .config .KVCacheTransferTimePerToken  =  100 
888915			simulator .config .KVCacheTransferTimeStdDev  =  0 
889916
890- 			ttft  :=  simulator .getTimeToFirstToken (128 , true )
917+ 			ttft  :=  simulator .getTimeToFirstToken (128 , 0 ,  true )
891918			Expect (ttft ).To (BeNumerically ("==" , 200 ))
892919		})
893920
@@ -898,7 +925,7 @@ var _ = Describe("Simulator", func() {
898925			simulator .config .KVCacheTransferTimePerToken  =  100 
899926			simulator .config .KVCacheTransferTimeStdDev  =  0 
900927
901- 			ttft  :=  simulator .getTimeToFirstToken (128 , true )
928+ 			ttft  :=  simulator .getTimeToFirstToken (128 , 0 ,  true )
902929			Expect (ttft ).To (BeNumerically ("==" , 12800 ))
903930		})
904931
@@ -909,7 +936,7 @@ var _ = Describe("Simulator", func() {
909936				simulator .config .KVCacheTransferTimePerToken  =  kvCacheTransTPT 
910937				simulator .config .KVCacheTransferTimeStdDev  =  stddev 
911938
912- 				ttft  :=  simulator .getTimeToFirstToken (nTokens , true )
939+ 				ttft  :=  simulator .getTimeToFirstToken (nTokens , 0 ,  true )
913940
914941				expectedTTFT  :=  kvCacheTransTPT  *  nTokens 
915942				Expect (ttft ).To (BeNumerically (">=" , int (float64 (expectedTTFT )* 0.3 )))
0 commit comments