NVIDIA
diff --git a/cpp/include/tensorrt_llm/executor/executor.h b/cpp/include/tensorrt_llm/executor/executor.h
Lines changed: 12 additions & 4 deletions
diff --git a/cpp/include/tensorrt_llm/layers/defaultDecodingParams.h b/cpp/include/tensorrt_llm/layers/defaultDecodingParams.h
Lines changed: 5 additions & 0 deletions
diff --git a/cpp/include/tensorrt_llm/runtime/samplingConfig.h b/cpp/include/tensorrt_llm/runtime/samplingConfig.h
Lines changed: 36 additions & 32 deletions
diff --git a/cpp/tensorrt_llm/batch_manager/trtGptModelV1.cpp b/cpp/tensorrt_llm/batch_manager/trtGptModelV1.cpp
Lines changed: 1 addition & 0 deletions
@@ -75,7 +75,8 @@ class SamplingConfig
         std::optional<SizeType32> const& earlyStopping = std::nullopt,
         std::optional<SizeType32> const& noRepeatNgramSize = std::nullopt,
         std::optional<SizeType32> const& numReturnSequences = std::nullopt,
-        std::optional<FloatType> const& minP = std::nullopt);
+        std::optional<FloatType> const& minP = std::nullopt,
+        std::optional<std::vector<SizeType32>> const& beamWidthArray = std::nullopt);
 
     bool operator==(SamplingConfig const& other) const;
 
@@ -100,6 +101,7 @@ class SamplingConfig
     [[nodiscard]] std::optional<SizeType32> getNoRepeatNgramSize() const;
     [[nodiscard]] std::optional<SizeType32> getNumReturnSequences() const;
     [[nodiscard]] std::optional<FloatType> getMinP() const;
+    [[nodiscard]] std::optional<std::vector<SizeType32>> getBeamWidthArray() const;
 
     void setBeamWidth(SizeType32 beamWidth);
     void setTopK(std::optional<SizeType32> const& topK);
@@ -121,6 +123,7 @@ class SamplingConfig
     void setNoRepeatNgramSize(std::optional<SizeType32> const& noRepeatNgramSize);
     void setNumReturnSequences(std::optional<SizeType32> const& numReturnSequences);
     void setMinP(std::optional<FloatType> const& minP);
+    void setBeamWidthArray(std::optional<std::vector<SizeType32>> const& beamWidthArray);
 
 private:
     static SizeType32 checkBeamWidth(SizeType32 beamWidth);
@@ -130,15 +133,18 @@ class SamplingConfig
     static std::optional<TokenIdType> const& checkTopPResetIds(std::optional<TokenIdType> const& topPResetIds);
     static std::optional<FloatType> const& checkTopPDecay(std::optional<FloatType> const& topPDecay);
     static std::optional<FloatType> const& checkTemperature(std::optional<FloatType> const& temperature);
-    static std::optional<FloatType> const& checkRepetitionPenalty(std::optional<FloatType> const& penalty);
     static std::optional<SizeType32> const& checkMinTokens(std::optional<SizeType32> const& minTokens);
-    static std::optional<SizeType32> const& checkNoRepeatNgramSize(std::optional<SizeType32> const& noRepeatNgramSize);
     static std::optional<FloatType> const& checkBeamSearchDiversityRate(
         std::optional<FloatType> const& beamSearchDiversityRate);
+    static std::optional<FloatType> const& checkRepetitionPenalty(std::optional<FloatType> const& repetitionpenalty);
+    static std::optional<FloatType> const& checkLengthPenalty(std::optional<FloatType> const& lengthPenalty);
+    static std::optional<SizeType32> const& checkEarlyStopping(std::optional<SizeType32> const& earlyStopping);
+    static std::optional<SizeType32> const& checkNoRepeatNgramSize(std::optional<SizeType32> const& noRepeatNgramSize);
     static std::optional<SizeType32> const& checkNumReturnSequences(
         std::optional<SizeType32> const& numReturnSequences, SizeType32 beamWidth);
     static std::optional<FloatType> const& checkMinP(std::optional<FloatType> const& minP);
-
+    static std::optional<std::vector<SizeType32>> const& checkBeamWidthArray(
+        std::optional<std::vector<SizeType32>> const& beamWidthArray, std::optional<SizeType32> const beamWidth);
     void updateNumReturnBeams();
 
     friend class Serialization;
@@ -188,6 +194,8 @@ class SamplingConfig
     /// @brief Controls the min_p scaling for sampling.
     /// It masks x which P_x < min_p * P_max, where P_x is probability of candidate x. Default is 0.f
     std::optional<FloatType> mMinP;
+    /// @brief Controls the beam width for each step for Variable-Beam-Width-Search.
+    std::optional<std::vector<SizeType32>> mBeamWidthArray;
 };
 
 /// @brief Configuration that controls the outputs of a Result
 
@@ -128,6 +128,11 @@ class DefaultDecodingParams
     {
         return 0.0f;
     }
+
+    [[nodiscard]] static std::vector<runtime::SizeType32> getBeamWidthArray()
+    {
+        return std::vector<runtime::SizeType32>{1};
+    }
 };
 } // namespace layers
 } // namespace tensorrt_llm
@@ -74,9 +74,6 @@ class SamplingConfig
         }
     }
 
-    template <typename T>
-    using Vec = std::vector<T>;
-
     template <typename T>
     bool validateVec(std::string name, OptVec<T> const& vec, T min, std::optional<T> max = std::nullopt)
     {
@@ -185,6 +182,9 @@ class SamplingConfig
             configs, [&configs](size_t ci) { return configs[ci].outputLogProbs; }, false);
         cumLogProbs = fuseValues<bool>(
             configs, [&configs](size_t ci) { return configs[ci].cumLogProbs; }, false);
+        beamWidthArray = fuseValues<std::vector<SizeType32>>(
+            configs, [&configs](size_t ci) { return configs[ci].beamWidthArray; },
+            layers::DefaultDecodingParams::getBeamWidthArray());
         // Only used for tests.
         draftAcceptanceThreshold = fuseValues<FloatType>(
             configs, [&configs](size_t ci) { return configs[ci].draftAcceptanceThreshold; }, 0);
@@ -193,22 +193,22 @@ class SamplingConfig
     }
 
     explicit SamplingConfig(executor::SamplingConfig const& samplingConfig,
-        std::optional<executor::ExternalDraftTokensConfig> const& externalDraftTokensConfig)
+        std::optional<executor::ExternalDraftTokensConfig> const& externalDraftTokensConfig = std::nullopt)
         : beamWidth{samplingConfig.getBeamWidth()}
         , numReturnSequences(samplingConfig.getNumReturnSequences())
     {
 
         if (externalDraftTokensConfig && externalDraftTokensConfig.value().getAcceptanceThreshold())
         {
             draftAcceptanceThreshold
-                = Vec<FloatType>{externalDraftTokensConfig.value().getAcceptanceThreshold().value()};
+                = std::vector<FloatType>{externalDraftTokensConfig.value().getAcceptanceThreshold().value()};
         }
 
 #define SET_FROM_OPTIONAL(varName, VarName, VarType)                                                                   \
                                                                                                                        \
     if (samplingConfig.get##VarName())                                                                                 \
     {                                                                                                                  \
-        varName = Vec<VarType>{samplingConfig.get##VarName().value()};                                                 \
+        varName = std::vector<VarType>{samplingConfig.get##VarName().value()};                                         \
     }
 
         SET_FROM_OPTIONAL(topK, TopK, SizeType32)
@@ -228,6 +228,7 @@ class SamplingConfig
         SET_FROM_OPTIONAL(earlyStopping, EarlyStopping, SizeType32)
         SET_FROM_OPTIONAL(noRepeatNgramSize, NoRepeatNgramSize, SizeType32)
         SET_FROM_OPTIONAL(minP, MinP, FloatType)
+        SET_FROM_OPTIONAL(beamWidthArray, BeamWidthArray, std::vector<SizeType32>)
 #undef SET_FROM_OPTIONAL
     }
 
@@ -266,16 +267,18 @@ class SamplingConfig
         valid &= validateVec("topK", topK, -1);
         valid &= validateVec("topP", topP, -fltEpsilon, {1.f});
         valid &= validateVec("topPMin", topPMin, 0.f, {1.f});
-        valid &= validateVec("topPDecay", topPDecay, 0.f, {1.f});
         valid &= validateVec("topPResetIds", topPResetIds, -1);
-
+        valid &= validateVec("topPDecay", topPDecay, 0.f, {1.f});
         valid &= validateVec("temperature", temperature, -fltEpsilon);
-        valid &= validateVec("repetitionPenalty", repetitionPenalty, 0.f);
         valid &= validateVec("minLength", minLength, -1);
+        valid &= validateVec("beamSearchDiversityRate", beamSearchDiversityRate, -fltEpsilon);
+        valid &= validateVec("repetitionPenalty", repetitionPenalty, 0.f);
+        // TODO: checking `lengthPenalty` leads to a failure in
+        // `test_openai_chat_example`; debug and re-enable it later.
+        // valid &= validateVec("lengthPenalty", lengthPenalty, 0.f);
         valid &= validateVec("noRepeatNgramSize", noRepeatNgramSize, 0);
         valid &= validateVec("minP", minP, -fltEpsilon, {1.f});
-
-        valid &= validateVec("beamSearchDiversityRate", beamSearchDiversityRate, -fltEpsilon);
+        // TODO: check `beamWidthArray`
 
         // Detect greedy sampling and overwrite params.
         if (temperature)
@@ -332,38 +335,39 @@ class SamplingConfig
     SizeType32 beamWidth;
     std::optional<SizeType32> numReturnSequences;
 
-    // penalties
-    OptVec<FloatType> temperature;         // [1] or [batch_size] on cpu
-    OptVec<FloatType> originalTemperature; // [1] or [batch_size] on cpu
-    OptVec<SizeType32> minLength;          // [1] or [batch_size] on cpu
-    OptVec<FloatType> repetitionPenalty;   // [1] or [batch_size] on cpu
-    OptVec<FloatType> presencePenalty;     // [1] or [batch_size] on cpu
-    OptVec<FloatType> frequencyPenalty;    // [1] or [batch_size] on cpu
-    OptVec<SizeType32> noRepeatNgramSize;  // [1] or [batch_size] on cpu
+    // penalties, [1] for one request, [batchSize] for one batch, the same for other parameters below
+    OptVec<FloatType> temperature;         // [1] or [batchSize]
+    OptVec<FloatType> originalTemperature; // [1] or [batchSize]
+    OptVec<SizeType32> minLength;          // [1] or [batchSize]
+    OptVec<FloatType> repetitionPenalty;   // [1] or [batchSize]
+    OptVec<FloatType> presencePenalty;     // [1] or [batchSize]
+    OptVec<FloatType> frequencyPenalty;    // [1] or [batchSize]
+    OptVec<SizeType32> noRepeatNgramSize;  // [1] or [batchSize]
 
     // probs
     OptVec<bool> outputLogProbs;
     OptVec<bool> cumLogProbs;
 
     // sampling layers
-    OptVec<SizeType32> topK;          // [1] or [batch_size] on cpu
-    OptVec<FloatType> topP;           // [1] or [batch_size] on cpu
-    OptVec<uint64_t> randomSeed;      // [1] or [batch_size] on cpu
-    OptVec<FloatType> topPDecay;      // [batch_size], must between [0, 1]
-    OptVec<FloatType> topPMin;        // [batch_size], must between [0, 1]
-    OptVec<TokenIdType> topPResetIds; // [batch_size]
-    OptVec<FloatType> minP;           // [1] or [batch_size] on cpu
+    OptVec<SizeType32> topK;          // [1] or [batchSize]
+    OptVec<FloatType> topP;           // [1] or [batchSize]
+    OptVec<uint64_t> randomSeed;      // [1] or [batchSize]
+    OptVec<FloatType> topPDecay;      // [1] or [batchSize], between [0, 1]
+    OptVec<FloatType> topPMin;        // [1] or [batchSize], between [0, 1]
+    OptVec<TokenIdType> topPResetIds; // [1] or [batchSize]
+    OptVec<FloatType> minP;           // [1] or [batchSize]
 
     // beam search layer
-    OptVec<FloatType> beamSearchDiversityRate; // [1] or [batch_size]
-    OptVec<FloatType> lengthPenalty;           // [1] or [batch_size]
-    OptVec<SizeType32> earlyStopping;          // [1] or [batch_size]
+    OptVec<FloatType> beamSearchDiversityRate;      // [1] or [batchSize]
+    OptVec<FloatType> lengthPenalty;                // [1] or [batchSize]
+    OptVec<SizeType32> earlyStopping;               // [1] or [batchSize]
+    OptVec<std::vector<SizeType32>> beamWidthArray; // [maxBeamWidthArrayLength] or [batchSize, maxBeamWidthArrayLength]
 
     // speculative decoding, only the first value is used (in gptDecoderBatched.cpp)
-    OptVec<FloatType> draftAcceptanceThreshold; // [1] or [batch_size]
+    OptVec<FloatType> draftAcceptanceThreshold; // [1] or [batchSize]
 
     // medusa params
-    OptVec<std::vector<runtime::SizeType32>> topKMedusaHeads; // [batchSize, maxMedusaHeads]
+    OptVec<std::vector<SizeType32>> topKMedusaHeads; // [batchSize, maxMedusaHeads]
 
     std::optional<bool> normalizeLogProbs;
 
@@ -379,7 +383,7 @@ class SamplingConfig
             && lengthPenalty == other.lengthPenalty && earlyStopping == other.earlyStopping
             && draftAcceptanceThreshold == other.draftAcceptanceThreshold && topKMedusaHeads == other.topKMedusaHeads
             && normalizeLogProbs == other.normalizeLogProbs && outputLogProbs == other.outputLogProbs
-            && cumLogProbs == other.cumLogProbs && minP == other.minP;
+            && cumLogProbs == other.cumLogProbs && minP == other.minP && beamWidthArray == other.beamWidthArray;
     }
 
     SizeType32 getNumReturnBeams() const
 
@@ -120,6 +120,7 @@ void addToSamplingConfig(SamplingConfig& batchSamplingConfig, SamplingConfig con
     TLLM_CHECK(batchSamplingConfig.beamSearchDiversityRate == addSamplingConfig.beamSearchDiversityRate);
     TLLM_CHECK(batchSamplingConfig.lengthPenalty == addSamplingConfig.lengthPenalty);
     TLLM_CHECK(batchSamplingConfig.earlyStopping == addSamplingConfig.earlyStopping);
+    TLLM_CHECK(batchSamplingConfig.beamWidthArray == addSamplingConfig.beamWidthArray);
 
     auto addOptional = [](auto& batch, auto const& add, char const* name)
     {
Original file line number	Diff line number	Diff line change
`@@ -128,6 +128,11 @@ class DefaultDecodingParams`
`128`	`128`	`{`
`129`	`129`	`return 0.0f;`
`130`	`130`	`}`
	`131`	`+`
	`132`	`+ [[nodiscard]] static std::vector<runtime::SizeType32> getBeamWidthArray()`
	`133`	`+ {`
	`134`	`+ return std::vector<runtime::SizeType32>{1};`
	`135`	`+ }`
`131`	`136`	`};`
`132`	`137`	`} // namespace layers`
`133`	`138`	`} // namespace tensorrt_llm`
Original file line number	Diff line number	Diff line change
`@@ -120,6 +120,7 @@ void addToSamplingConfig(SamplingConfig& batchSamplingConfig, SamplingConfig con`
`120`	`120`	`TLLM_CHECK(batchSamplingConfig.beamSearchDiversityRate == addSamplingConfig.beamSearchDiversityRate);`
`121`	`121`	`TLLM_CHECK(batchSamplingConfig.lengthPenalty == addSamplingConfig.lengthPenalty);`
`122`	`122`	`TLLM_CHECK(batchSamplingConfig.earlyStopping == addSamplingConfig.earlyStopping);`
	`123`	`+ TLLM_CHECK(batchSamplingConfig.beamWidthArray == addSamplingConfig.beamWidthArray);`
`123`	`124`
`124`	`125`	`auto addOptional = [](auto& batch, auto const& add, char const* name)`
`125`	`126`	`{`