NVTensorRtRtx: Support num_beams > 1 (microsoft#1688)

anujj · web-flow · commit 625815e71549 · 2025-08-18T12:34:20.000-07:00
- Pass num_beams through the config overlay - max batch shapes for NVTensorRtRtx = batch_size * num_beams - Adding @baijumeswani @kunal-vaishnavi @gaugarg-nv for review
diff --git a/examples/python/model-generate.py b/examples/python/model-generate.py
@@ -19,7 +19,7 @@ def main(args):
     batch_size = len(prompts)
 
     config = og.Config(args.model_path)
-    config.overlay(f'{{"search": {{"batch_size": {batch_size}}}}}')
+    config.overlay(f'{{"search": {{"batch_size": {batch_size}, "num_beams": {3}}}}}')
 
     if args.execution_provider != "follow_config":
         config.clear_providers()
@@ -45,7 +45,6 @@ def main(args):
     params = og.GeneratorParams(model)
 
     search_options = {name:getattr(args, name) for name in ['do_sample', 'max_length', 'min_length', 'top_p', 'top_k', 'temperature', 'repetition_penalty'] if name in args} 
-    search_options['num_beams'] = 3
 
     if (args.verbose): print(f'Args: {args}')
     if (args.verbose): print(f'Search options: {search_options}')
diff --git a/src/models/model.cpp b/src/models/model.cpp
@@ -352,7 +352,7 @@ void ConfigureNvTensorRtRTxProfile(const Config& config, OrtSessionOptions& sess
   const int num_layers = config.model.decoder.num_hidden_layers;
   const int num_kv_heads = config.model.decoder.num_key_value_heads;
   const int head_dim = config.model.decoder.head_size;
-  const int batch_size = config.search.batch_size;
+  const int batch_size = config.search.batch_size * config.search.num_beams;
 
   // Get max context length from config
   const int max_context_len = config.model.context_length;