@@ -145,6 +145,7 @@ struct BenchmarkParams
145145{
146146 std::optional<SizeType32> maxTokensInPagedKvCache{std::nullopt };
147147 std::optional<float > freeGpuMemoryFraction{std::nullopt };
148+ std::optional<float > crossKvCacheFraction{std::nullopt };
148149 bool enableTrtOverlap{false };
149150 bool enableBlockReuse{false };
150151 bool enableChunkedContext{false };
@@ -159,6 +160,8 @@ struct BenchmarkParams
159160 std::optional<int > sinkTokenLength{std::nullopt };
160161 bool multiBlockMode{true };
161162 bool enableContextFMHAFP32Acc{false };
163+ bool cudaGraphMode{false };
164+ SizeType32 cudaGraphCacheSize{0 };
162165
163166 // lora / peft params
164167 std::optional<std::string> loraDir{std::nullopt };
@@ -470,7 +473,38 @@ class Recorder
470473 mRequestBenchInfos [requestId].firstTokenSeen = true ;
471474 }
472475
473- mRequestBenchInfos [requestId].outputLength += 1 ;
476+ mRequestBenchInfos [requestId].decodingIter += 1 ;
477+ }
478+
479+ void recordToken (uint64_t requestId, std::list<NamedTensor> const & responseTensors)
480+ {
481+ int32_t outputLength = 1 ;
482+ for (auto & tensor : responseTensors)
483+ {
484+ if (tensor.name == inference_request::kSequenceLengthTensorName )
485+ {
486+ // Tensor of shape nBeams, and we only need the first one
487+ outputLength = *(bufferCast<int32_t >(*(tensor.tensor )));
488+ break ;
489+ }
490+ }
491+
492+ mRequestBenchInfos [requestId].outputLength += outputLength;
493+ this ->recordToken (requestId);
494+ }
495+
496+ void recordToken (uint64_t requestId, texec::Response const & response)
497+ {
498+ auto outputTokenIds = response.getResult ().outputTokenIds ;
499+
500+ int32_t outputLength = 1 ;
501+ for (auto const & beam : outputTokenIds)
502+ {
503+ outputLength = std::max (static_cast <int32_t >(beam.size ()), outputLength);
504+ }
505+
506+ mRequestBenchInfos [requestId].outputLength += outputLength;
507+ this ->recordToken (requestId);
474508 }
475509
476510 void recordEnd (uint64_t requestId, std::list<NamedTensor> const & responseTensors, bool hasError)
@@ -500,7 +534,7 @@ class Recorder
500534 }
501535 else
502536 {
503- this ->recordToken (requestId);
537+ this ->recordToken (requestId, responseTensors );
504538 }
505539 }
506540
@@ -532,7 +566,7 @@ class Recorder
532566 }
533567 else
534568 {
535- this ->recordToken (requestId);
569+ this ->recordToken (requestId, response );
536570 }
537571 }
538572 }
@@ -818,11 +852,13 @@ class ExecutorServer
818852 texec::SchedulerConfig schedulerConfig (capacitySchedulerPolicy);
819853 texec::KvCacheConfig kvCacheConfig (benchmarkParams.enableBlockReuse , benchmarkParams.maxTokensInPagedKvCache ,
820854 benchmarkParams.maxAttentionWindowVec , benchmarkParams.sinkTokenLength ,
821- benchmarkParams.freeGpuMemoryFraction , benchmarkParams.kvHostCacheSize , benchmarkParams.kvOnboardBlocks );
855+ benchmarkParams.freeGpuMemoryFraction , benchmarkParams.kvHostCacheSize , benchmarkParams.kvOnboardBlocks ,
856+ benchmarkParams.crossKvCacheFraction );
822857 texec::PeftCacheConfig peftCacheConfig (0 , benchmarkParams.loraDeviceNumModLayers , 8 , 64 , 4 , 4 , 4 , 24 , 8 ,
823858 std::nullopt , benchmarkParams.loraHostCacheSize );
824- texec::ExtendedRuntimePerfKnobConfig extendedRuntimePerfKnobConfig (
825- benchmarkParams.multiBlockMode , benchmarkParams.enableContextFMHAFP32Acc );
859+ texec::ExtendedRuntimePerfKnobConfig extendedRuntimePerfKnobConfig (benchmarkParams.multiBlockMode ,
860+ benchmarkParams.enableContextFMHAFP32Acc , benchmarkParams.cudaGraphMode ,
861+ benchmarkParams.cudaGraphCacheSize );
826862 texec::ExecutorConfig executorConfig (
827863 maxBeamWidth, schedulerConfig, kvCacheConfig, benchmarkParams.enableChunkedContext , true );
828864 executorConfig.setGpuWeightsPercent (benchmarkParams.gpuWeightsPercent );
@@ -940,7 +976,7 @@ class ExecutorServer
940976 {
941977 if (!warmup && !response.hasError ())
942978 {
943- mRecorder ->recordToken (reqId);
979+ mRecorder ->recordToken (reqId, response );
944980 }
945981 }
946982 }
@@ -1228,7 +1264,7 @@ class GptServer
12281264 {
12291265 if (errMsg.empty ())
12301266 {
1231- mRecorder ->recordToken (requestId);
1267+ mRecorder ->recordToken (requestId, response_tensors );
12321268 }
12331269 }
12341270 }
@@ -1430,6 +1466,10 @@ void benchmarkGptManager(std::filesystem::path const& engineDir, TrtGptModelType
14301466 {
14311467 optionalParams.kvCacheConfig .freeGpuMemoryFraction = benchmarkParams.freeGpuMemoryFraction ;
14321468 }
1469+ if (benchmarkParams.crossKvCacheFraction )
1470+ {
1471+ optionalParams.kvCacheConfig .crossKvCacheFraction = benchmarkParams.crossKvCacheFraction ;
1472+ }
14331473 if (benchmarkParams.maxAttentionWindowVec )
14341474 {
14351475 optionalParams.kvCacheConfig .maxAttentionWindowVec = benchmarkParams.maxAttentionWindowVec ;
@@ -1458,8 +1498,8 @@ void benchmarkGptManager(std::filesystem::path const& engineDir, TrtGptModelType
14581498 : benchmarkParams.executorLookaheadConfig .has_value () ? texec::DecodingMode::Lookahead ()
14591499 : texec::DecodingMode::Auto (),
14601500 benchmarkParams.executorLookaheadConfig , benchmarkParams.medusaChoices );
1461- optionalParams.extendedRuntimePerfKnobConfig = texec::ExtendedRuntimePerfKnobConfig (
1462- benchmarkParams.multiBlockMode , benchmarkParams.enableContextFMHAFP32Acc );
 1501+ optionalParams.extendedRuntimePerfKnobConfig = texec::ExtendedRuntimePerfKnobConfig (benchmarkParams.multiBlockMode ,
 1502+ benchmarkParams.enableContextFMHAFP32Acc , benchmarkParams.cudaGraphMode , benchmarkParams.cudaGraphCacheSize );
14631503
14641504 auto const jsonConfig = GptJsonConfig::parse (engineDir / " config.json" );
14651505 auto const worldConfig = WorldConfig::mpi (jsonConfig.getGpusPerNode (), jsonConfig.getTensorParallelism (),
@@ -1874,6 +1914,8 @@ int main(int argc, char* argv[])
18741914 " random_seed" , " integer random seed for exponential time delays." , cxxopts::value<int >()->default_value (" 420" ));
18751915 options.add_options ()(
18761916 " kv_cache_free_gpu_mem_fraction" , " K-V Cache Free Gpu Mem Fraction." , cxxopts::value<float >());
1917+ options.add_options ()(
1918+ " cross_kv_cache_fraction" , " Cross K-V Cache Fraction (from 0.0 to 1.0)." , cxxopts::value<float >());
18771919 options.add_options ()(" request_rate" ,
18781920 " request rate in reqs/sec. Skipping this arg or negative value will trigger offline/0-delay." ,
18791921 cxxopts::value<float >());
@@ -1895,7 +1937,8 @@ int main(int argc, char* argv[])
18951937 options.add_options ()(" return_generation_logits" , " Whether to return generation logits." ,
18961938 cxxopts::value<bool >()->default_value (" false" ));
18971939
1898- options.add_options ()(" scheduler_policy" , " Choose scheduler policy between max_utilization/guaranteed_no_evict." ,
1940+ options.add_options ()(" scheduler_policy" ,
1941+ " Choose scheduler policy between max_utilization/guaranteed_no_evict/static_batch." ,
18991942 cxxopts::value<std::string>()->default_value (" guaranteed_no_evict" ));
19001943
19011944 options.add_options ()(" first_batch_delay" ,
@@ -1946,6 +1989,12 @@ int main(int argc, char* argv[])
19461989 cxxopts::value<bool >()->default_value (" true" ));
19471990 options.add_options ()(
19481991 " encoder_engine_dir" , " Directory that store the engines of the encoder models." , cxxopts::value<std::string>());
1992+ options.add_options ()(" cuda_graph_mode" , " When enabled, inference is executed with cuda graph." ,
1993+ cxxopts::value<bool >()->default_value (" false" ));
1994+ options.add_options ()(" cuda_graph_cache_size" ,
1995+ " Specify how many cuda graphs are cached in the runtime. Larger cache gives better perf, but consumes more GPU "
1996+ " memory." ,
1997+ cxxopts::value<SizeType32>()->default_value (" 0" ));
19491998
19501999 options.add_options ()(" enable_context_fmha_fp32_acc" , " Enable FMHA runner FP32 accumulation" ,
19512000 cxxopts::value<bool >()->default_value (" false" ));
@@ -2040,6 +2089,20 @@ int main(int argc, char* argv[])
20402089 {
20412090 benchmarkParams.freeGpuMemoryFraction = result[" kv_cache_free_gpu_mem_fraction" ].as <float >();
20422091 }
2092+ // Argument: K-V Cache Cross Attention Fraction. Only applicable to enc-dec models.
2093+ if (result.count (" encoder_engine_dir" ) && result.count (" decoder_engine_dir" ))
2094+ {
2095+ if (result.count (" cross_kv_cache_fraction" ))
2096+ {
2097+ benchmarkParams.crossKvCacheFraction = result[" cross_kv_cache_fraction" ].as <float >();
2098+ }
2099+ else
2100+ {
2101+ benchmarkParams.crossKvCacheFraction
 2102+ = 0.5f; // default value if not set; non enc-dec runs should not have this param set at all
2103+ }
2104+ }
2105+
20432106 // Argument: Enable TRT overlap
20442107 benchmarkParams.enableTrtOverlap = result[" enable_trt_overlap" ].as <bool >();
20452108
@@ -2131,6 +2194,12 @@ int main(int argc, char* argv[])
21312194 // Argument: enable_context_fmha_fp32_acc
21322195 benchmarkParams.enableContextFMHAFP32Acc = result[" enable_context_fmha_fp32_acc" ].as <bool >();
21332196
2197+ // Argument: cuda_graph_mode
2198+ benchmarkParams.cudaGraphMode = result[" cuda_graph_mode" ].as <bool >();
2199+
 2200+ // Argument: cuda_graph_cache_size
2201+ benchmarkParams.cudaGraphCacheSize = result[" cuda_graph_cache_size" ].as <SizeType32>();
2202+
21342203 std::optional<TokenIdType> padId;
21352204 // Argument: Padding token id
21362205 if (result.count (" pad_id" ))
@@ -2168,6 +2237,10 @@ int main(int argc, char* argv[])
21682237 {
21692238 capacitySchedulerPolicy = texec::CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT ;
21702239 }
2240+ else if (capacitySchedulerPolicyArg == " static_batch" )
2241+ {
2242+ capacitySchedulerPolicy = texec::CapacitySchedulerPolicy::kSTATIC_BATCH ;
2243+ }
21712244 else
21722245 {
21732246 TLLM_LOG_ERROR (" Unexpected scheduler policy: " + capacitySchedulerPolicyArg);
@@ -2246,14 +2319,14 @@ int main(int argc, char* argv[])
22462319 {
22472320 texec::ModelType executorModelType;
22482321 std::optional<std::string> decoderEngineDir = std::nullopt , encoderEngineDir = std::nullopt ;
2249- if (result.count (" encoder_engine_dir" ) && result.count (" engine_dir " ))
2322+ if (result.count (" encoder_engine_dir" ) && result.count (" decoder_engine_dir " ))
22502323 {
22512324 TLLM_CHECK_WITH_INFO (api == " executor" , " encoder-decoder only support executor api." );
22522325 TLLM_CHECK_WITH_INFO (
22532326 modelType == TrtGptModelType::InflightFusedBatching, " encoder-decoder only support inflight batching." );
22542327 executorModelType = texec::ModelType::kENCODER_DECODER ;
2255- decoderEngineDir = result[" engine_dir" ].as <std::string>();
22562328 encoderEngineDir = result[" encoder_engine_dir" ].as <std::string>();
2329+ decoderEngineDir = result[" decoder_engine_dir" ].as <std::string>();
22572330 }
22582331 else if (result.count (" engine_dir" ))
22592332 {
0 commit comments