NVIDIA
diff --git a/cpp/include/tensorrt_llm/batch_manager/createNewDecoderRequests.h b/cpp/include/tensorrt_llm/batch_manager/createNewDecoderRequests.h
Lines changed: 5 additions & 3 deletions
diff --git a/cpp/include/tensorrt_llm/runtime/gptDecoder.h b/cpp/include/tensorrt_llm/runtime/gptDecoder.h
Lines changed: 15 additions & 9 deletions
diff --git a/cpp/include/tensorrt_llm/runtime/gptDecoderBatched.h b/cpp/include/tensorrt_llm/runtime/gptDecoderBatched.h
Lines changed: 1 addition & 2 deletions
diff --git a/cpp/include/tensorrt_llm/runtime/iGptDecoderBatched.h b/cpp/include/tensorrt_llm/runtime/iGptDecoderBatched.h
Lines changed: 1 addition & 2 deletions
diff --git a/cpp/include/tensorrt_llm/runtime/request.h b/cpp/include/tensorrt_llm/runtime/request.h
Lines changed: 0 additions & 1 deletion
diff --git a/cpp/tensorrt_llm/batch_manager/createNewDecoderRequests.cpp b/cpp/tensorrt_llm/batch_manager/createNewDecoderRequests.cpp
Lines changed: 25 additions & 16 deletions
diff --git a/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp b/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp
Lines changed: 7 additions & 7 deletions
diff --git a/cpp/tensorrt_llm/pybind/batch_manager/algorithms.cpp b/cpp/tensorrt_llm/pybind/batch_manager/algorithms.cpp
Lines changed: 5 additions & 5 deletions
diff --git a/cpp/tensorrt_llm/pybind/runtime/bindings.cpp b/cpp/tensorrt_llm/pybind/runtime/bindings.cpp
Lines changed: 15 additions & 7 deletions
@@ -71,7 +71,8 @@ class CreateNewDecoderRequests : Algorithm
     {
     }
 
-    std::tuple<TensorPtr, std::vector<runtime::decoder_batch::Request>, std::vector<runtime::SamplingConfig>>
+    std::tuple<TensorPtr, std::vector<runtime::SamplingConfig>, std::vector<runtime::ITensor::SharedConstPtr>,
+        std::vector<executor::LookaheadDecodingConfig>>
     operator()(runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
         executor::DecodingConfig const& decodingConfig, RequestVector const& contextRequests,
         runtime::BufferManager const& bufferManager, nvinfer1::DataType logitsType, DecoderInputBuffers& inputBuffers,
@@ -113,8 +114,9 @@ class CreateNewDecoderRequests : Algorithm
     static void newRequestEagle(SizeType32 batchIdx, runtime::decoder_batch::Request const& request,
         runtime::ModelConfig const& modelConfig, DecodingOutput& jointDecodingOutput, CudaStream const& runtimeStream);
 
-    [[nodiscard]] std::vector<runtime::decoder_batch::Request> createDecoderRequests(
-        RequestVector const& finishedContextRequests, TensorPtr const& inputIds,
+    [[nodiscard]] std::tuple<std::vector<runtime::ITensor::SharedConstPtr>,
+        std::vector<executor::LookaheadDecodingConfig>>
+    createDecoderRequests(RequestVector const& finishedContextRequests, TensorPtr const& inputIds,
         executor::DecodingConfig const& decodingConfig, runtime::decoder::DecoderState& decoderState,
         runtime::BufferManager const& bufferManager, nvinfer1::DataType logitsType,
         runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
 
@@ -53,9 +53,12 @@ class IGptDecoder
 
     virtual ~IGptDecoder() = default;
 
+    /// @param explicitDraftTokensDType used only by ExplicitDraftTokens model to work around the lack of bf16 decoder.
     virtual void setup(SamplingConfig const& samplingConfig, size_t batchSize, TensorConstPtr const& batchSlots,
         std::optional<DecodingOutput> const& output = std::nullopt,
-        std::optional<std::vector<decoder_batch::Request> const> const& requests = std::nullopt)
+        std::optional<nvinfer1::DataType> explicitDraftTokensDType = std::nullopt,
+        std::optional<std::vector<TensorConstPtr>> const& lookaheadPrompt = std::nullopt,
+        std::optional<std::vector<executor::LookaheadDecodingConfig>> const& lookaheadAlgoConfigs = std::nullopt)
         = 0;
 
     virtual void forwardAsync(DecodingOutput& output, DecodingInput const& input) = 0;
@@ -69,7 +72,7 @@ class IGptDecoder
         = 0;
 
     static std::unique_ptr<IGptDecoder> create(executor::DecodingMode const& mode, nvinfer1::DataType dtype,
-        size_t maxBatchSize, size_t maxBeamWidth, size_t vocabSize, size_t vocabSizePadded, size_t maxSequenceLength,
+        size_t maxBatchSize, size_t maxBeamWidth, size_t vocabSize, size_t vocabSizePadded,
         BufferManager::CudaStreamPtr const& stream,
         std::shared_ptr<SpeculativeDecodingModule const> const& speculativeDecodingModule = nullptr);
 };
@@ -83,12 +86,15 @@ class GptDecoder : public virtual IGptDecoder
     using TensorPtr = std::shared_ptr<ITensor>;
 
     GptDecoder(executor::DecodingMode const& mode, size_t maxBatchSize, size_t maxBeamWidth, size_t vocabSize,
-        size_t vocabSizePadded, size_t maxSequenceLength, CudaStreamPtr const& stream,
+        size_t vocabSizePadded, CudaStreamPtr const& stream,
         std::shared_ptr<SpeculativeDecodingModule const> speculativeDecodingModule = nullptr);
 
     void setup(SamplingConfig const& samplingConfig, size_t batchSize, TensorConstPtr const& batchSlots,
         std::optional<DecodingOutput> const& output = std::nullopt,
-        std::optional<std::vector<decoder_batch::Request> const> const& requests = std::nullopt) override;
+        std::optional<nvinfer1::DataType> explicitDraftTokensDType = std::nullopt,
+        std::optional<std::vector<TensorConstPtr>> const& lookaheadPrompt = std::nullopt,
+        std::optional<std::vector<executor::LookaheadDecodingConfig>> const& lookaheadAlgoConfigs
+        = std::nullopt) override;
 
     void forwardAsync(DecodingOutput& output, DecodingInput const& input) override;
 
@@ -117,18 +123,18 @@ class GptDecoder : public virtual IGptDecoder
 };
 
 inline std::unique_ptr<IGptDecoder> IGptDecoder::create(executor::DecodingMode const& mode, nvinfer1::DataType dtype,
-    size_t maxBatchSize, size_t maxBeamWidth, size_t vocabSize, size_t vocabSizePadded, size_t maxSequenceLength,
+    size_t maxBatchSize, size_t maxBeamWidth, size_t vocabSize, size_t vocabSizePadded,
     BufferManager::CudaStreamPtr const& stream,
     std::shared_ptr<SpeculativeDecodingModule const> const& speculativeDecodingModule)
 {
     switch (dtype)
     {
     case nvinfer1::DataType::kFLOAT:
-        return std::make_unique<GptDecoder<float>>(mode, maxBatchSize, maxBeamWidth, vocabSize, vocabSizePadded,
-            maxSequenceLength, stream, speculativeDecodingModule);
+        return std::make_unique<GptDecoder<float>>(
+            mode, maxBatchSize, maxBeamWidth, vocabSize, vocabSizePadded, stream, speculativeDecodingModule);
     case nvinfer1::DataType::kHALF:
-        return std::make_unique<GptDecoder<half>>(mode, maxBatchSize, maxBeamWidth, vocabSize, vocabSizePadded,
-            maxSequenceLength, stream, speculativeDecodingModule);
+        return std::make_unique<GptDecoder<half>>(
+            mode, maxBatchSize, maxBeamWidth, vocabSize, vocabSizePadded, stream, speculativeDecodingModule);
     default:
         TLLM_THROW("Unsupported decoder data type: %d. Use either kFLOAT or kHALF.", static_cast<int>(dtype));
         return nullptr;
 
@@ -48,8 +48,7 @@ class GptDecoderBatched : public IGptDecoderBatched
     explicit GptDecoderBatched(CudaStreamPtr stream);
 
     void setup(executor::DecodingMode const& mode, SizeType32 maxBatchSize, SizeType32 maxBeamWidth,
-        SizeType32 maxSequenceLength, nvinfer1::DataType dtype, ModelConfig const& modelConfig,
-        WorldConfig const& worldConfig) override;
+        nvinfer1::DataType dtype, ModelConfig const& modelConfig, WorldConfig const& worldConfig) override;
 
     void disableLookahead(RequestVector const& genRequests, TensorPtr const& batchSlots) override;
 
 
@@ -119,8 +119,7 @@ class IGptDecoderBatched
 
     //! @brief Setup the decoder before calling `forward()`
     virtual void setup(executor::DecodingMode const& mode, SizeType32 maxBatchSize, SizeType32 maxBeamWidth,
-        SizeType32 maxSequenceLength, nvinfer1::DataType dtype, ModelConfig const& modelConfig,
-        WorldConfig const& worldConfig)
+        nvinfer1::DataType dtype, ModelConfig const& modelConfig, WorldConfig const& worldConfig)
         = 0;
 
     //! @brief Disable Lookahead decoding.
 
@@ -57,7 +57,6 @@ class Request
     std::optional<TensorPtr> draftLogits; // [generatedTokensPerEngineStep - 1, vocabSize] on gpu
     TensorPtr medusaPaths;                // [maxDecodingTokens, maxPathLen], on gpu
     TensorPtr medusaTreeIds;              // [maxDecodingTokens], on gpu
-    nvinfer1::DataType dtype;             // Request data type, only used by explicit draft tokens.
     std::optional<executor::LookaheadDecodingConfig> lookaheadRuntimeConfig;
     std::optional<executor::EagleConfig> eagleConfig;
 };
 
@@ -122,7 +122,8 @@ void copySequenceLengths(RequestVector const& contextRequests, DecoderInputBuffe
 
 } // namespace
 
-std::tuple<TensorPtr, std::vector<runtime::decoder_batch::Request>, std::vector<runtime::SamplingConfig>>
+std::tuple<TensorPtr, std::vector<runtime::SamplingConfig>, std::vector<runtime::ITensor::SharedConstPtr>,
+    std::vector<executor::LookaheadDecodingConfig>>
 CreateNewDecoderRequests::operator()(runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
     executor::DecodingConfig const& decodingConfig, RequestVector const& contextRequests,
     runtime::BufferManager const& bufferManager, nvinfer1::DataType logitsType, DecoderInputBuffers& inputBuffers,
@@ -139,9 +140,9 @@ CreateNewDecoderRequests::operator()(runtime::ModelConfig const& modelConfig, ru
     copySequenceLengths(finishedContextRequests, inputBuffers, *decoderState.getSequenceLengths(), beamWidth,
         bufferManager, runtimeStream);
 
-    auto decoderRequests = createDecoderRequests(finishedContextRequests, inputBuffers.inputsIds, decodingConfig,
-        decoderState, bufferManager, logitsType, modelConfig, worldConfig, runtimeStream, decoderStream,
-        maxSequenceLength, medusaBuffers);
+    auto [lookaheadPrompt, lookaheadAlgoConfigs] = createDecoderRequests(finishedContextRequests,
+        inputBuffers.inputsIds, decodingConfig, decoderState, bufferManager, logitsType, modelConfig, worldConfig,
+        runtimeStream, decoderStream, maxSequenceLength, medusaBuffers);
 
     auto const batchSize = finishedContextRequests.size();
 
@@ -155,7 +156,8 @@ CreateNewDecoderRequests::operator()(runtime::ModelConfig const& modelConfig, ru
     TensorPtr batchSlotsView = runtime::ITensor::slice(inputBuffers.setupBatchSlots, 0, batchSize);
 
     TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
-    return {std::move(batchSlotsView), std::move(decoderRequests), std::move(samplingConfigs)};
+    return {std::move(batchSlotsView), std::move(samplingConfigs), std::move(lookaheadPrompt),
+        std::move(lookaheadAlgoConfigs)};
 }
 
 void CreateNewDecoderRequests::newRequest(SizeType32 batchSlot, runtime::decoder_batch::Request const& request,
@@ -555,8 +557,8 @@ void CreateNewDecoderRequests::newRequestEagle(SizeType32 batchIdx, runtime::dec
     TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
 }
 
-[[nodiscard]] std::vector<runtime::decoder_batch::Request> CreateNewDecoderRequests::createDecoderRequests(
-    RequestVector const& finishedContextRequests, TensorPtr const& inputIds,
+std::tuple<std::vector<runtime::ITensor::SharedConstPtr>, std::vector<executor::LookaheadDecodingConfig>>
+CreateNewDecoderRequests::createDecoderRequests(RequestVector const& finishedContextRequests, TensorPtr const& inputIds,
     executor::DecodingConfig const& decodingConfig, runtime::decoder::DecoderState& decoderState,
     BufferManager const& bufferManager, nvinfer1::DataType logitsType, runtime::ModelConfig const& modelConfig,
     runtime::WorldConfig const& worldConfig, runtime::CudaStream const& runtimeStream,
@@ -574,6 +576,16 @@ void CreateNewDecoderRequests::newRequestEagle(SizeType32 batchIdx, runtime::dec
     std::vector<decoder_batch::Request> decoderRequests;
     decoderRequests.reserve(finishedContextRequests.size());
 
+    std::vector<runtime::ITensor::SharedConstPtr> lookaheadPrompt;
+    std::vector<executor::LookaheadDecodingConfig> lookaheadAlgoConfigs;
+    if (modelConfig.getSpeculativeDecodingMode().isLookaheadDecoding())
+    {
+        TLLM_CHECK_WITH_INFO(
+            decodingConfig.getLookaheadDecodingConfig().has_value(), "Lookahead decoding config must be provided");
+        lookaheadPrompt.reserve(finishedContextRequests.size());
+        lookaheadAlgoConfigs.reserve(finishedContextRequests.size());
+    }
+
     SizeType32 inputOffset{0};
     for (auto const& llmReq : finishedContextRequests)
     {
@@ -620,14 +632,11 @@ void CreateNewDecoderRequests::newRequestEagle(SizeType32 batchIdx, runtime::dec
         }
         else if (modelConfig.getSpeculativeDecodingMode().isLookaheadDecoding())
         {
-            decoderRequest.lookaheadRuntimeConfig = llmReq->getLookaheadConfig()
-                ? llmReq->getLookaheadConfig()
-                : decodingConfig.getLookaheadDecodingConfig();
-        }
-        else if (modelConfig.getSpeculativeDecodingMode().isExplicitDraftTokens())
-        {
-            // Only Explicit draft tokens model needs dtype to WAR the lack of bf16 decoder.
-            decoderRequest.dtype = modelConfig.getDataType();
+            lookaheadPrompt.emplace_back(ITensor::slice(decoderRequest.ids, 0, decoderRequest.inputLen));
+
+            auto const& lookaheadRuntimeConfig
+                = llmReq->getLookaheadConfig().value_or(decodingConfig.getLookaheadDecodingConfig().value());
+            lookaheadAlgoConfigs.emplace_back(lookaheadRuntimeConfig);
         }
         else if (modelConfig.getSpeculativeDecodingMode().isEagle())
         {
@@ -659,7 +668,7 @@ void CreateNewDecoderRequests::newRequestEagle(SizeType32 batchIdx, runtime::dec
         inputOffset += promptLen;
     }
 
-    return decoderRequests;
+    return {std::move(lookaheadPrompt), std::move(lookaheadAlgoConfigs)};
 }
 
 std::shared_ptr<runtime::ITensor> CreateNewDecoderRequests::retrieveDraftLogits(ModelConfig const& modelConfig,
 
@@ -1424,8 +1424,8 @@ void TrtGptModelInflightBatching::createDecoder(std::optional<executor::Decoding
         }
 
         mDecoder = std::make_unique<runtime::GptDecoderBatched>(mRuntime->getStreamPtr());
-        mDecoder->setup(decodingMode, getMaxNumSequences(), mOperatingBeamWidth, getMaxSequenceLen(), decoderType,
-            mModelConfig, mWorldConfig);
+        mDecoder->setup(
+            decodingMode, getMaxNumSequences(), mOperatingBeamWidth, decoderType, mModelConfig, mWorldConfig);
 
         mDecoderState = std::make_unique<runtime::decoder::DecoderState>(decoderType, mRuntime->getBufferManager());
         if (!mModelConfig.getSpeculativeDecodingMode().isNone())
@@ -1786,18 +1786,18 @@ void TrtGptModelInflightBatching::setupDecoderStep(
     {
         auto const logitsType = mRuntime->getEngine().getTensorDataType("logits");
 
-        auto [batchSlots, decoderRequests, samplingConfigs]
+        auto [batchSlots, samplingConfigs, lookaheadPrompt, lookaheadAlgoConfigs]
             = (*mCreateNewDecoderRequests)(mModelConfig, mWorldConfig, mDecodingConfig, contextRequests,
                 mRuntime->getBufferManager(), logitsType, inputBuffers, *mDecoderState, mRuntime->getStream(),
                 *mDecoder->getDecoderStream(), getMaxSequenceLen(), mOperatingBeamWidth, buffers.mMedusaBuffers);
 
-        if (!decoderRequests.empty())
+        auto const localBatchSize = batchSlots->getSize();
+        if (localBatchSize > 0)
         {
-            // Setup underlying decoder.
-            auto const localBatchSize = batchSlots->getSize();
             auto samplingConfig = SamplingConfig(samplingConfigs);
             mDecoder->getUnderlyingDecoder().setup(samplingConfig, localBatchSize, batchSlots,
-                {mDecoderState->getJointDecodingOutput()}, {decoderRequests});
+                {mDecoderState->getJointDecodingOutput()}, mModelConfig.getDataType(), lookaheadPrompt,
+                lookaheadAlgoConfigs);
 
             auto const& stream = mDecoder->getDecoderStream();
             CudaEvent event{};
 
@@ -158,12 +158,12 @@ void tensorrt_llm::pybind::batch_manager::algorithms::initBindings(pybind11::mod
                 tensorrt_llm::runtime::CudaStream const& decoderStream, SizeType32 maxSequenceLength,
                 SizeType32 beamWidth, OptionalRef<MedusaBuffers const> medusaBuffers = std::nullopt)
             {
-                auto [batchSlots, decoderRequests, samplingConfigs] = self(modelConfig, worldConfig, decodingConfig,
-                    contextRequests, bufferManager, logitsType, inputBuffers, decoderState, runtimeStream,
-                    decoderStream, maxSequenceLength, beamWidth, medusaBuffers);
+                auto [batchSlots, samplingConfigs, lookaheadPrompt, lookaheadAlgoConfigs] = self(modelConfig,
+                    worldConfig, decodingConfig, contextRequests, bufferManager, logitsType, inputBuffers, decoderState,
+                    runtimeStream, decoderStream, maxSequenceLength, beamWidth, medusaBuffers);
 
-                return std::tuple{
-                    runtime::Torch::tensor(batchSlots), std::move(decoderRequests), std::move(samplingConfigs)};
+                return std::tuple{runtime::Torch::tensor(batchSlots), std::move(samplingConfigs),
+                    std::move(lookaheadPrompt), std::move(lookaheadAlgoConfigs)};
             },
             py::arg("model_config"), py::arg("world_config"), py::arg("decoding_config"), py::arg("context_requests"),
             py::arg("buffer_manager"), py::arg("logits_type"), py::arg("decoder_input_buffers"),
 
@@ -38,13 +38,15 @@
 #include "tensorrt_llm/runtime/speculativeDecodingMode.h"
 #include "tensorrt_llm/runtime/tllmRuntime.h"
 #include "tensorrt_llm/runtime/torchView.h"
+
 #include <ATen/ATen.h>
 #include <c10/cuda/CUDAStream.h>
 #include <pybind11/stl.h>
 #include <pybind11/stl_bind.h>
 #include <torch/extension.h>
 
 namespace tr = tensorrt_llm::runtime;
+namespace te = tensorrt_llm::executor;
 
 class PyITensor : public tensorrt_llm::runtime::ITensor
 {
@@ -160,9 +162,12 @@ class PyIGptDecoder : public tr::IGptDecoder
     void setup(tr::SamplingConfig const& samplingConfig, size_t batchSize,
         tr::DecodingInput::TensorConstPtr const& batchSlots,
         std::optional<tr::DecodingOutput> const& output = std::nullopt,
-        std::optional<std::vector<tr::decoder_batch::Request> const> const& requests = std::nullopt) override
+        std::optional<nvinfer1::DataType> explicitDraftTokensDType = std::nullopt,
+        std::optional<std::vector<tr::ITensor::SharedConstPtr>> const& lookaheadPrompt = std::nullopt,
+        std::optional<std::vector<te::LookaheadDecodingConfig>> const& lookaheadAlgoConfigs = std::nullopt) override
     {
-        PYBIND11_OVERRIDE_PURE(void, IGptDecoder, setup, samplingConfig, batchSize, batchSlots, output, requests);
+        PYBIND11_OVERRIDE_PURE(void, IGptDecoder, setup, samplingConfig, batchSize, batchSlots, output,
+            explicitDraftTokensDType, lookaheadPrompt, lookaheadAlgoConfigs);
     }
 
     void forwardAsync(tr::DecodingOutput& output, tr::DecodingInput const& input) override
@@ -314,13 +319,17 @@ void initBindings(pybind11::module_& m)
             "setup",
             [](tr::IGptDecoder& self, tr::SamplingConfig const& samplingConfig, size_t batchSize,
                 at::Tensor const& batchSlots, std::optional<tr::DecodingOutput> const& output = std::nullopt,
-                std::optional<std::vector<tr::decoder_batch::Request> const> const& requests = std::nullopt)
+                std::optional<nvinfer1::DataType> explicitDraftTokensDType = std::nullopt,
+                std::optional<std::vector<tr::ITensor::SharedConstPtr>> const& lookaheadPrompt = std::nullopt,
+                std::optional<std::vector<te::LookaheadDecodingConfig>> const& lookaheadAlgoConfigs = std::nullopt)
             {
                 auto tensorPtrBatchSlots = tr::TorchView::of(batchSlots);
-                return self.setup(samplingConfig, batchSize, std::move(tensorPtrBatchSlots), output, requests);
+                self.setup(samplingConfig, batchSize, std::move(tensorPtrBatchSlots), output, explicitDraftTokensDType,
+                    lookaheadPrompt, lookaheadAlgoConfigs);
             },
             py::arg("sampling_config"), py::arg("batch_size"), py::arg("batch_slots"), py::arg("output") = std::nullopt,
-            py::arg("requests") = std::nullopt);
+            py::arg("explicit_draft_tokens_d_type") = std::nullopt, py::arg("lookahead_prompt") = std::nullopt,
+            py::arg("lookahead_algo_configs") = std::nullopt);
 
     py::class_<tr::decoder::DecoderState>(m, "DecoderState")
         .def(py::init<nvinfer1::DataType, tr::BufferManager const&>(), py::arg("dtype"), py::arg("buffer_manager"))
@@ -381,8 +390,7 @@ void initBindings(pybind11::module_& m)
     py::class_<tr::GptDecoderBatched>(m, "GptDecoderBatched")
         .def(py::init<tr::GptDecoderBatched::CudaStreamPtr>(), py::arg("stream"))
         .def("setup", &tr::GptDecoderBatched::setup, py::arg("mode"), py::arg("max_batch_size"),
-            py::arg("max_beam_width"), py::arg("max_sequence_length"), py::arg("dtype"), py::arg("model_config"),
-            py::arg("world_config"))
+            py::arg("max_beam_width"), py::arg("dtype"), py::arg("model_config"), py::arg("world_config"))
         .def("forward_async", &tr::GptDecoderBatched::forwardAsync, py::arg("decoder_state"), py::arg("output"),
             py::arg("input"))
         .def("underlying_decoder", &tr::GptDecoderBatched::getUnderlyingDecoder, py::return_value_policy::reference)