@@ -304,6 +304,7 @@ class Recorder
304304 std::vector<float > reqLatencies;
305305 std::vector<float > ftLatencies;
306306 std::vector<float > genT2TLatencies;
307+ std::vector<float > userTokensPerSecond;
307308
308309 int totalOutputTokens{0 };
309310 int totalDecodingIter{0 };
@@ -325,6 +326,10 @@ class Recorder
325326 {
326327 genT2TLatencies.push_back (reqInfo.second .avgGenT2TLatency .value ());
327328 }
329+ if (reqInfo.second .avgGenT2TLatency .value () > 0 )
330+ {
331+ userTokensPerSecond.push_back (1000 .F / reqInfo.second .avgGenT2TLatency .value ());
332+ }
328333 }
329334 ++mNumSamples ;
330335 }
@@ -377,6 +382,18 @@ class Recorder
377382 mMinGenT2TLatency = genT2TLatencies.front ();
378383 }
379384
385+ if (!userTokensPerSecond.empty ())
386+ {
387+ mAvgUserTokensPerSecond = std::accumulate (userTokensPerSecond.begin (), userTokensPerSecond.end (), 0 .F )
388+ / userTokensPerSecond.size ();
389+ std::sort (userTokensPerSecond.begin (), userTokensPerSecond.end ());
390+ mP99UserTokensPerSecond = calcPercentile (userTokensPerSecond, 99 );
391+ mP90UserTokensPerSecond = calcPercentile (userTokensPerSecond, 90 );
392+ mP50UserTokensPerSecond = calcPercentile (userTokensPerSecond, 50 );
393+ mMaxUserTokensPerSecond = userTokensPerSecond.back ();
394+ mMinUserTokensPerSecond = userTokensPerSecond.front ();
395+ }
396+
380397 mAvgReqQueueingLatency
381398 = std::accumulate (mRequestsQueueingLatencies .begin (), mRequestsQueueingLatencies .end (), 0 .F )
382399 / mRequestsQueueingLatencies .size ();
@@ -423,6 +440,13 @@ class Recorder
423440 printf (" [BENCHMARK] p90_inter_token_latency(ms) %.2f\n " , mP90GenT2TLatency );
424441 printf (" [BENCHMARK] p50_inter_token_latency(ms) %.2f\n\n " , mP50GenT2TLatency );
425442
443+ printf (" [BENCHMARK] avg_user_tokens_per_second(tokens/sec/user) %.2f\n " , mAvgUserTokensPerSecond );
444+ printf (" [BENCHMARK] max_user_tokens_per_second(tokens/sec/user) %.2f\n " , mMaxUserTokensPerSecond );
445+ printf (" [BENCHMARK] min_user_tokens_per_second(tokens/sec/user) %.2f\n " , mMinUserTokensPerSecond );
446+ printf (" [BENCHMARK] p99_user_tokens_per_second(tokens/sec/user) %.2f\n " , mP99UserTokensPerSecond );
447+ printf (" [BENCHMARK] p90_user_tokens_per_second(tokens/sec/user) %.2f\n " , mP90UserTokensPerSecond );
448+ printf (" [BENCHMARK] p50_user_tokens_per_second(tokens/sec/user) %.2f\n\n " , mP50UserTokensPerSecond );
449+
426450 printf (" [BENCHMARK] avg_request_queueing_latency(ms) %.2f\n " , mAvgReqQueueingLatency );
427451 printf (" [BENCHMARK] max_request_queueing_latency(ms) %.2f\n " , mMaxReqQueueingLatency );
428452 printf (" [BENCHMARK] min_request_queueing_latency(ms) %.2f\n " , mMinReqQueueingLatency );
@@ -443,11 +467,26 @@ class Recorder
443467
444468 if (mStreaming )
445469 {
446- std::vector<std::string> streamingHeaders
447- = {" avg_time_to_first_token(ms)" , " max_time_to_first_token(ms)" , " min_time_to_first_token(ms)" ,
448- " p99_time_to_first_token(ms)" , " p90_time_to_first_token(ms)" , " p50_time_to_first_token(ms)" ,
449- " avg_inter_token_latency(ms)" , " max_inter_token_latency(ms)" , " min_inter_token_latency(ms)" ,
450- " p99_inter_token_latency(ms)" , " p90_inter_token_latency(ms)" , " p50_inter_token_latency(ms)" };
470+ std::vector<std::string> streamingHeaders = {
471+ " avg_time_to_first_token(ms)" ,
472+ " max_time_to_first_token(ms)" ,
473+ " min_time_to_first_token(ms)" ,
474+ " p99_time_to_first_token(ms)" ,
475+ " p90_time_to_first_token(ms)" ,
476+ " p50_time_to_first_token(ms)" ,
477+ " avg_inter_token_latency(ms)" ,
478+ " max_inter_token_latency(ms)" ,
479+ " min_inter_token_latency(ms)" ,
480+ " p99_inter_token_latency(ms)" ,
481+ " p90_inter_token_latency(ms)" ,
482+ " p50_inter_token_latency(ms)" ,
483+ " avg_user_tokens_per_second(tokens/sec/user)" ,
484+ " max_user_tokens_per_second(tokens/sec/user)" ,
485+ " min_user_tokens_per_second(tokens/sec/user)" ,
486+ " p99_user_tokens_per_second(tokens/sec/user)" ,
487+ " p90_user_tokens_per_second(tokens/sec/user)" ,
488+ " p50_user_tokens_per_second(tokens/sec/user)" ,
489+ };
451490
452491 headers.insert (headers.end (), streamingHeaders.begin (), streamingHeaders.end ());
453492 }
@@ -470,7 +509,10 @@ class Recorder
470509 outputFile << " ," << mAvgFtLatency << " ," << mMaxFtLatency << " ," << mMinFtLatency << " ,"
471510 << mP99FtLatency << " ," << mP90FtLatency << " ," << mP50FtLatency << " ,"
472511 << mAvgGenT2TLatency << " ," << mMaxGenT2TLatency << " ," << mMinGenT2TLatency << " ,"
473- << mP99GenT2TLatency << " ," << mP90GenT2TLatency << " ," << mP50GenT2TLatency ;
512+ << mP99GenT2TLatency << " ," << mP90GenT2TLatency << " ," << mP50GenT2TLatency << " ,"
513+ << mAvgUserTokensPerSecond << " ," << mMaxUserTokensPerSecond << " ,"
514+ << mMinUserTokensPerSecond << " ," << mP99UserTokensPerSecond << " ,"
515+ << mP90UserTokensPerSecond << " ," << mP50UserTokensPerSecond ;
474516 }
475517
476518 outputFile << " \n " ;
@@ -524,6 +566,7 @@ class Recorder
524566 float mSeqThroughput {};
525567 float mAvgSeqLatency {};
526568 float mAvgGenT2TLatency {};
569+ float mAvgUserTokensPerSecond {};
527570 float mAvgFtLatency {};
528571 float mTokenThroughput {};
529572 float mAcceptanceRate {};
@@ -542,6 +585,11 @@ class Recorder
542585 float mP50GenT2TLatency {};
543586 float mMaxGenT2TLatency {};
544587 float mMinGenT2TLatency {};
588+ float mP99UserTokensPerSecond {};
589+ float mP90UserTokensPerSecond {};
590+ float mP50UserTokensPerSecond {};
591+ float mMaxUserTokensPerSecond {};
592+ float mMinUserTokensPerSecond {};
545593 float mAvgReqQueueingLatency {};
546594 float mP99ReqQueueingLatency {};
547595 float mP90ReqQueueingLatency {};
@@ -1054,7 +1102,7 @@ int main(int argc, char* argv[])
10541102 " Operate in streaming mode. Note: it reflects time-to-first-token and inter-token-latency" ,
10551103 cxxopts::value<bool >()->default_value (" false" ));
10561104 options.add_options ()(
1057- " enable_kv_cache_reuse" , " Enables the KV cache reuse." , cxxopts::value<bool >()->default_value (" false " ));
1105+ " enable_kv_cache_reuse" , " Enables the KV cache reuse." , cxxopts::value<bool >()->default_value (" true " ));
10581106 options.add_options ()(
10591107 " enable_chunked_context" , " Whether to enable context chunking." , cxxopts::value<bool >()->default_value (" true" ));
10601108 options.add_options ()(
@@ -1096,6 +1144,11 @@ int main(int argc, char* argv[])
10961144 " Minimum token probability threshold for typical acceptance. Enables typical acceptance in Eagle" ,
10971145 cxxopts::value<float >());
10981146 options.add_options ()(" temperature" , " Sampling temperature for each request" , cxxopts::value<float >());
1147+ options.add_options ()(
1148+ " eagle_use_dynamic_tree" , " Whether to use Eagle-2" , cxxopts::value<bool >()->default_value (" false" ));
1149+ options.add_options ()(" eagle_dynamic_tree_max_top_k" ,
1150+ " The max topK for dynamic tree, also the number of draft tokens that will expand for each node" ,
1151+ cxxopts::value<SizeType32>());
10991152
11001153 options.add_options ()(" multi_block_mode" ,
11011154 " Distribute the work across multiple CUDA thread-blocks on the GPU for masked MHA kernel" ,
@@ -1305,7 +1358,8 @@ int main(int argc, char* argv[])
13051358 benchmarkParams.medusaChoices = parseVectorOfVectors (result[" medusa_choices" ].as <std::string>());
13061359 }
13071360 // Argument: Eagle choices for the Eagle speculative decoding.
1308- if (result.count (" eagle_choices" ) || result.count (" eagle_posterior_threshold" ))
1361+ if (result.count (" eagle_choices" ) || result.count (" eagle_posterior_threshold" )
1362+ || result.count (" eagle_use_dynamic_tree" ) || result.count (" eagle_dynamic_tree_max_top_k" ))
13091363 {
13101364 std::optional<float > posteriorThreshold;
13111365 if (result.count (" eagle_posterior_threshold" ))
@@ -1317,7 +1371,18 @@ int main(int argc, char* argv[])
13171371 {
13181372 choices = parseVectorOfVectors (result[" eagle_choices" ].as <std::string>());
13191373 }
1320- benchmarkParams.eagleConfig = texec::EagleConfig (choices, !posteriorThreshold.has_value (), posteriorThreshold);
1374+ bool eagleUseDynamicTree = false ;
1375+ if (result.count (" eagle_use_dynamic_tree" ))
1376+ {
1377+ eagleUseDynamicTree = result[" eagle_use_dynamic_tree" ].as <bool >();
1378+ }
1379+ std::optional<SizeType32> eagleDynamicTreeMaxTopK;
1380+ if (result.count (" eagle_dynamic_tree_max_top_k" ))
1381+ {
1382+ eagleDynamicTreeMaxTopK = result[" eagle_dynamic_tree_max_top_k" ].as <SizeType32>();
1383+ }
1384+ benchmarkParams.eagleConfig = texec::EagleConfig (
1385+ choices, !posteriorThreshold.has_value (), posteriorThreshold, eagleUseDynamicTree, eagleDynamicTreeMaxTopK);
13211386 }
13221387 if (result.count (" temperature" ))
13231388 {
0 commit comments