Skip to content

Commit f3404b2

Browse files
Merge branch 'main' into user/nzmora/moe_unit_test_change
2 parents f4d3e80 + 34f845b commit f3404b2

File tree

1,619 files changed

+28282
-12444
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

1,619 files changed

+28282
-12444
lines changed

cpp/include/tensorrt_llm/batch_manager/decoderBuffers.h

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ class DecoderInputBuffers
3838
public:
3939
using SizeType32 = runtime::SizeType32;
4040
using TensorPtr = runtime::ITensor::SharedPtr;
41+
using TensorConstPtr = runtime::ITensor::SharedConstPtr;
4142

4243
explicit DecoderInputBuffers(
4344
SizeType32 maxBatchSize, SizeType32 maxDecoderSteps, runtime::BufferManager const& manager);
@@ -60,13 +61,22 @@ class DecoderInputBuffers
6061
//! Requests considered in decoder forward
6162
RequestVector decoderRequests;
6263

64+
//! Logits of decoder requests
65+
std::vector<TensorPtr> decoderLogits;
66+
67+
//! Maximum number of decoding steps of decoder requests.
68+
//! This is only more than 1 for external draft tokens speculative decoding.
69+
SizeType32 maxDecoderSteps{1};
70+
6371
//! Batch slots for all decoder steps, [maxDecoderSteps][maxBatchSize]
6472
std::vector<TensorPtr> forwardBatchSlots;
6573

66-
//! Logits of decoder requests
67-
std::vector<TensorPtr> logits;
74+
//! Logits for requests in forwardBatchSlots (in the same order).
75+
//! [maxDecoderSteps][batchSize][1, beamWidth, vocabSizePadded], on gpu
76+
std::vector<std::vector<TensorConstPtr>> batchLogits;
6877

69-
//! Logits for speculative decoding (Medusa)
78+
//! Logits for speculative decoding (Medusa).
79+
//! The vector is sparse, only slots in forwardBatchSlots are used.
7080
//! [maxBatchSize][maxAcceptedDraftTokensPerStep][maxDraftTokens + 1, vocabSizePadded]
7181
std::vector<std::vector<runtime::ITensor::SharedPtr>> predictedDraftLogits;
7282
};

cpp/include/tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.h

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -40,19 +40,17 @@ class MakeDecodingBatchInputOutput : Algorithm
4040
constexpr static auto name{"MakeDecodingBatchInputOutput"};
4141

4242
using SizeType32 = tensorrt_llm::runtime::SizeType32;
43-
using TensorPtr = runtime::decoder_batch::Input::TensorPtr;
43+
using TensorPtr = runtime::ITensor::SharedPtr;
4444
template <typename T>
4545
using OptionalRef = tensorrt_llm::common::OptionalRef<T>;
4646

4747
MakeDecodingBatchInputOutput() = default;
4848

49-
std::unique_ptr<runtime::decoder_batch::Input> operator()(DecoderInputBuffers& inputBuffers,
50-
runtime::decoder::DecoderState& decoderState, runtime::ModelConfig const& modelConfig,
51-
SizeType32 maxNumSequences, OptionalRef<RuntimeBuffers> fusedRuntimeBuffers) const;
49+
void operator()(DecoderInputBuffers& inputBuffers, runtime::decoder::DecoderState& decoderState,
50+
runtime::ModelConfig const& modelConfig, OptionalRef<RuntimeBuffers> fusedRuntimeBuffers) const;
5251

53-
[[nodiscard]] static std::unique_ptr<runtime::decoder_batch::Input> createDecoderBatchInputs(
54-
std::vector<SizeType32> const& activeSlots, runtime::decoder::DecoderState const& decoderState,
55-
std::vector<TensorPtr> const& logits, SizeType32 maxNumSequences, std::vector<TensorPtr> const& batchSlots);
52+
static void createDecoderBatchInputs(DecoderInputBuffers& inputBuffers, std::vector<SizeType32> const& activeSlots,
53+
runtime::decoder::DecoderState const& decoderState);
5654
};
5755

5856
} // namespace tensorrt_llm::batch_manager

cpp/include/tensorrt_llm/common/cudaUtils.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,9 @@
1919
#include "tensorrt_llm/common/cudaBf16Wrapper.h"
2020
#include "tensorrt_llm/common/cudaDriverWrapper.h"
2121
#include "tensorrt_llm/common/cudaFp8Utils.h"
22+
#if ENABLE_FP4
23+
#include <cuda_fp4.h>
24+
#endif
2225
#include "tensorrt_llm/common/logger.h"
2326
#include "tensorrt_llm/common/tllmException.h"
2427
#include <algorithm>
@@ -545,6 +548,9 @@ template void printArrayInfo(__nv_bfloat16 const* ptr, uint64_t nElement, std::s
545548
#ifdef ENABLE_FP8
546549
template void printArrayInfo(__nv_fp8_e4m3 const* ptr, uint64_t nElement, std::string name, bool const bPrintElement);
547550
#endif
551+
#ifdef ENABLE_FP4
552+
template void printArrayInfo(__nv_fp4_e2m1 const* ptr, uint64_t nElement, std::string name, bool const bPrintElement);
553+
#endif
548554
template void printArrayInfo(uint32_t const* ptr, uint64_t nElement, std::string name, bool const bPrintElement);
549555
template void printArrayInfo(uint64_t const* ptr, uint64_t nElement, std::string name, bool const bPrintElement);
550556
template void printArrayInfo(int const* ptr, uint64_t nElement, std::string name, bool const bPrintElement);

cpp/include/tensorrt_llm/runtime/gptDecoderBatched.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,8 +52,9 @@ class GptDecoderBatched : public IGptDecoderBatched
5252

5353
void disableLookahead(RequestVector const& genRequests, TensorPtr const& batchSlots) override;
5454

55-
CudaEvent forwardAsync(decoder::DecoderState const& decoderState, decoder_batch::Input const& input) override;
56-
void forward(decoder::DecoderState const& decoderState, decoder_batch::Input const& input) override;
55+
CudaEvent forwardAsync(
56+
decoder::DecoderState const& decoderState, batch_manager::DecoderInputBuffers const& input) override;
57+
void forward(decoder::DecoderState const& decoderState, batch_manager::DecoderInputBuffers const& input) override;
5758

5859
//! @brief Gather final beam search results for request `batchSlot`.
5960
//! Result will only be available after event returned.
@@ -77,7 +78,7 @@ class GptDecoderBatched : public IGptDecoderBatched
7778

7879
private:
7980
//! @brief Calls decoders for tokens per engine step
80-
void forwardDispatch(decoder::DecoderState const& decoderState, decoder_batch::Input const& input);
81+
void forwardDispatch(decoder::DecoderState const& decoderState, batch_manager::DecoderInputBuffers const& input);
8182

8283
private:
8384
CudaStreamPtr mRuntimeStream;

cpp/include/tensorrt_llm/runtime/iGptDecoderBatched.h

Lines changed: 7 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,9 @@
2727

2828
namespace tensorrt_llm::batch_manager
2929
{
30+
class DecoderInputBuffers;
3031
class LlmRequest;
31-
}
32+
} // namespace tensorrt_llm::batch_manager
3233

3334
namespace tensorrt_llm::runtime
3435
{
@@ -39,43 +40,6 @@ namespace decoder
3940
class DecoderState;
4041
}
4142

42-
namespace decoder_batch
43-
{
44-
45-
class Input
46-
{
47-
public:
48-
using TensorConstPtr = ITensor::SharedConstPtr;
49-
using TensorPtr = ITensor::SharedPtr;
50-
51-
explicit Input(std::vector<std::vector<TensorConstPtr>> const& logits, SizeType32 maxDecoderSteps)
52-
: logits{logits}
53-
, maxDecoderSteps{maxDecoderSteps}
54-
{
55-
TLLM_CHECK_WITH_INFO(
56-
logits.size() == static_cast<size_t>(maxDecoderSteps), "logits vector size does not match maxDecoderSteps");
57-
}
58-
59-
explicit Input(std::vector<TensorConstPtr> const& logits)
60-
: Input{{logits}, 1}
61-
{
62-
}
63-
64-
//! Mandatory parameters
65-
//! Logits
66-
// FIXME: remove first dimension of tensors
67-
//! [maxDecoderSteps][batchSize][1, beamWidth, vocabSizePadded], on gpu
68-
std::vector<std::vector<TensorConstPtr>> logits;
69-
70-
//! Maximum number of decoding tokens of active slots
71-
SizeType32 maxDecoderSteps;
72-
73-
//! Batch of active decoder slots, sorted by slots, [maxDecoderSteps][batchSize]
74-
std::vector<TensorPtr> batchSlots;
75-
};
76-
77-
} // namespace decoder_batch
78-
7943
//! GPT decoder class with support for in-flight batching
8044
class IGptDecoderBatched
8145
{
@@ -94,10 +58,13 @@ class IGptDecoderBatched
9458
virtual void disableLookahead(RequestVector const& genRequests, TensorPtr const& batchSlots) = 0;
9559

9660
//! @brief Run one step for all requests without blocking the host process and return the token for synchronization.
97-
virtual CudaEvent forwardAsync(decoder::DecoderState const& decoderState, decoder_batch::Input const& input) = 0;
61+
virtual CudaEvent forwardAsync(
62+
decoder::DecoderState const& decoderState, batch_manager::DecoderInputBuffers const& input)
63+
= 0;
9864

9965
//! @brief Run one step for all requests and wait for completion on the host.
100-
virtual void forward(decoder::DecoderState const& decoderState, decoder_batch::Input const& input) = 0;
66+
virtual void forward(decoder::DecoderState const& decoderState, batch_manager::DecoderInputBuffers const& input)
67+
= 0;
10168

10269
//! @brief Gather final beam search results for request `batchIdx`.
10370
//! Result will only be available after event returned

cpp/tensorrt_llm/batch_manager/guidedDecoder.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,7 @@ void GuidedDecoder::execute(DecoderInputBuffers const& decoderInputBuffers, Buff
182182
{
183183
auto const seqSlot = llmReq->mSeqSlot.value();
184184

185-
auto const& logits = decoderInputBuffers.logits.at(requestIdx);
185+
auto const& logits = decoderInputBuffers.decoderLogits.at(requestIdx);
186186
auto const logitsBitmask = ITensor::at(mLogitsBitmask, {seqSlot});
187187

188188
// Use void* to unify the code for different mLogitsDtype

cpp/tensorrt_llm/batch_manager/handleContextLogits.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ SizeType32 HandleContextLogits::operator()(DecoderInputBuffers& inputBuffers, Re
7979
auto& decoderRequests = inputBuffers.decoderRequests;
8080
decoderRequests.clear();
8181
decoderRequests.reserve(contextRequests.size());
82-
auto& allDecoderLogits = inputBuffers.logits;
82+
auto& allDecoderLogits = inputBuffers.decoderLogits;
8383
allDecoderLogits.clear();
8484
allDecoderLogits.reserve(contextRequests.size());
8585

cpp/tensorrt_llm/batch_manager/handleGenerationLogits.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ void HandleGenerationLogits::operator()(DecoderInputBuffers& inputBuffers, Reque
8585

8686
auto& decoderRequests = inputBuffers.decoderRequests;
8787
decoderRequests.reserve(decoderRequests.size() + generationRequests.size());
88-
auto& allDecoderLogits = inputBuffers.logits;
88+
auto& allDecoderLogits = inputBuffers.decoderLogits;
8989
allDecoderLogits.reserve(allDecoderLogits.size() + generationRequests.size());
9090

9191
for (auto const& llmReq : generationRequests)

cpp/tensorrt_llm/batch_manager/llmRequest.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,9 @@ void LlmRequest::createSerializedResult(
6969
/// Note that there is some dependency on the order of operations in this method. Modify with care!
7070
std::optional<executor::Result> LlmRequest::createResult(bool useFastLogits, int32_t mpiWorldRank)
7171
{
72-
if (!(isFinished() || (mIsStreaming && mState == LlmRequestState::kGENERATION_IN_PROGRESS)))
72+
auto const streamingInProgress = mIsStreaming
73+
&& (mState == LlmRequestState::kGENERATION_IN_PROGRESS || mState == LlmRequestState::kGENERATION_TO_COMPLETE);
74+
if (!(isFinished() || streamingInProgress))
7375
{
7476
return std::nullopt;
7577
}

cpp/tensorrt_llm/batch_manager/logitsPostProcessor.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ bool LogitsPostProcessor::operator()(DecoderInputBuffers& inputBuffers, bool rep
4949
for (size_t batchIdx = 0; batchIdx < inputBuffers.decoderRequests.size(); ++batchIdx)
5050
{
5151
auto const& llmReq = inputBuffers.decoderRequests.at(batchIdx);
52-
auto& logits = inputBuffers.logits.at(batchIdx);
52+
auto& logits = inputBuffers.decoderLogits.at(batchIdx);
5353

5454
// Invoke non-batched processor or collect arguments for batched processor
5555
if (llmReq->mLogitsPostProcessor)

0 commit comments

Comments
 (0)