155 changes: 135 additions & 20 deletions benchmarks/cpp/gptManagerBenchmark.cpp
@@ -40,6 +40,8 @@
#include <string>
#include <thread>
#include <utility>
#include <fstream>
#include <filesystem>

using namespace tensorrt_llm::batch_manager;
using namespace tensorrt_llm::runtime;
@@ -53,6 +55,91 @@ namespace

using TensorPtr = ITensor::SharedPtr;

// Read the model data type from the engine's config.json; falls back to float32 when it cannot be determined.
std::string getModelDtype(std::optional<std::filesystem::path> const& engineDir, texec::ModelType modelType)
{
if (!engineDir.has_value())
{
return "float32"; // default fallback when no engine directory is given
}

std::filesystem::path configPath = engineDir.value() / "config.json";
if (!std::filesystem::exists(configPath))
{
TLLM_LOG_WARNING("Config file not found at %s, using default dtype float32", configPath.string().c_str());
return "float32";
}

try
{
std::ifstream configFile(configPath);
nlohmann::json config;
configFile >> config;

std::string dtype = "float32"; // default

// Check if this is an engine config or checkpoint config
if (config.contains("pretrained_config"))
{
// Engine format
if (config["pretrained_config"].contains("dtype"))
{
dtype = config["pretrained_config"]["dtype"].get<std::string>();
}
}
else if (config.contains("dtype"))
{
// Checkpoint format: dtype is a top-level field
dtype = config["dtype"].get<std::string>();
}
TLLM_LOG_INFO("Detected model dtype: %s", dtype.c_str());
return dtype;
}
catch (std::exception const& e)
{
TLLM_LOG_WARNING("Failed to read dtype from config: %s, using default float32", e.what());
return "float32";
}
}
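
For orientation, a minimal sketch of the config layout this helper inspects. The snippet below is illustrative only: it assumes nlohmann/json is available, uses a made-up dtype value, and omits the many other fields a real engine config contains.

#include <nlohmann/json.hpp>
#include <iostream>
#include <string>

int main()
{
    // Minimal shape of the engine config.json that getModelDtype() reads.
    auto config = nlohmann::json::parse(R"({"pretrained_config": {"dtype": "float16"}})");

    std::string dtype = "float32"; // same fallback as the helper above
    if (config.contains("pretrained_config") && config["pretrained_config"].contains("dtype"))
    {
        dtype = config["pretrained_config"]["dtype"].get<std::string>();
    }
    std::cout << dtype << '\n'; // prints: float16
    return 0;
}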

// Cast an fp32 inputFeat tensor to fp16 on the CPU (for fp16 models) and copy the result to the GPU.
texec::Tensor castInputFeatHalf(texec::Tensor const& inputFeat, std::string const& modelDtype)
{
// Only fp16 models need the cast; any other model dtype keeps the original fp32 features.
if (modelDtype != "float16")
{
return inputFeat;
}

auto currentDtype = inputFeat.getDataType();
auto memoryType = inputFeat.getMemoryType();

// Only cast CPU tensors from fp32 to fp16
if (currentDtype != texec::DataType::kFP32 || memoryType != texec::MemoryType::kCPU)
{
TLLM_LOG_WARNING("InputFeat tensor is not fp32 or not on CPU, skipping cast");
return inputFeat;
}

try
{
auto shape = inputFeat.getShape();
auto numElements = inputFeat.getSize();

// Create new fp16 tensor on CPU
texec::Tensor castedTensor = texec::Tensor::cpu(texec::DataType::kFP16, shape);

// Cast data from fp32 to fp16
auto const* sourceData = static_cast<float const*>(inputFeat.getData());
auto* targetData = static_cast<half*>(castedTensor.getData());

for (size_t i = 0; i < numElements; ++i)
{
targetData[i] = static_cast<half>(sourceData[i]);
}
TLLM_LOG_DEBUG("Cast inputFeat tensor from fp32 to fp16");

auto stream = std::make_shared<tensorrt_llm::runtime::CudaStream>();
texec::Tensor castedTensorGpu = castedTensor.copyToGpu(stream);
// Make sure the host-to-device copy has completed before the temporary stream goes out of scope.
stream->synchronize();

return castedTensorGpu;
}
catch (std::exception const& e)
{
TLLM_LOG_WARNING("Failed to cast inputFeat tensor to fp16: %s, keeping original", e.what());
return inputFeat;
}
}
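
A minimal usage sketch of the helper above, assuming only the texec::Tensor calls already used in this file; the shape and values are arbitrary.

texec::Tensor makeAndCastDummyFeat()
{
    // Build a small fp32 feature tensor on the host.
    auto feat = texec::Tensor::cpu(texec::DataType::kFP32, {4, 8});
    auto* data = static_cast<float*>(feat.getData());
    for (size_t i = 0; i < feat.getSize(); ++i)
    {
        data[i] = 0.5f * static_cast<float>(i);
    }
    // For a float16 model this returns a kFP16 tensor on the GPU; otherwise the original tensor.
    return castInputFeatHalf(feat, /*modelDtype=*/"float16");
}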

class LoraLib
{
public:
@@ -810,16 +897,19 @@ class ExecutorServer
namespace
{

texec::Request makeExecutorRequest(texec::VecTokens &inputTokenIds, int32_t outputLen, SizeType32 const& beamWidth,
texec::Request makeExecutorRequest(texec::VecTokens inputTokenIds, SizeType32 outputLen, SizeType32 const& beamWidth,
std::optional<SizeType32> const& eosId, std::optional<SizeType32> const& padId, SizeType32 num_vocabs = 1, bool streaming = false,
bool const& returnContextLogits = false, bool const& returnGenerationLogits = false,
std::optional<texec::LoraConfig> const& loraConfig = std::nullopt,
std::optional<texec::LookaheadDecodingConfig> const& lookaheadConfig = std::nullopt,
std::optional<texec::VecTokens> encoderInputTokenIds = std::nullopt,
std::optional<float> temperature = std::nullopt)
std::optional<texec::Tensor> encoderFeatures = std::nullopt,
std::optional<SizeType32> encoderOutLen = std::nullopt,
std::optional<float> temperature = std::nullopt, std::optional<float> cfgScale = std::nullopt)
{
auto samplingConfig = texec::SamplingConfig{beamWidth};
samplingConfig.setTemperature(temperature);
samplingConfig.setCfgScale(cfgScale);
auto outputConfig = texec::OutputConfig{false, returnContextLogits, returnGenerationLogits, false};
auto request = texec::Request(inputTokenIds, outputLen, streaming, samplingConfig, outputConfig, eosId, padId,
std::nullopt, // positionIds
@@ -834,7 +924,14 @@ texec::Request makeExecutorRequest(texec::VecTokens &inputTokenIds, int32_t outp
std::nullopt, // kvCacheRetentionConfig
std::nullopt, // logitsPostProcessorName
std::nullopt, // logitsPostProcessor
encoderInputTokenIds.has_value() ? encoderInputTokenIds : std::nullopt);
encoderInputTokenIds.has_value() && encoderInputTokenIds.value().size() > 0 ? encoderInputTokenIds : std::nullopt,
std::nullopt, // client id
false, // returnAllGeneratedTokens
tensorrt_llm::executor::Request::kDefaultPriority, // priority
tensorrt_llm::executor::RequestType::REQUEST_TYPE_CONTEXT_AND_GENERATION, // type
std::nullopt, // ContextPhaseParams
encoderFeatures.has_value() && encoderFeatures.value().getSize() > 0 ? encoderFeatures : std::nullopt,
encoderOutLen);
if (num_vocabs > 1) {
request.setNumVocabs(num_vocabs);
}
@@ -853,10 +950,25 @@ void benchmarkExecutor(std::optional<std::filesystem::path> const& decoderEngine
auto const& world = tensorrt_llm::mpi::MpiComm::world();
auto worldRank = world.getRank();

// Determine model dtype from config
std::string modelDtype = "float32"; // default
if (decoderEngineDir.has_value()) {
modelDtype = getModelDtype(decoderEngineDir, executorModelType);
}

// Load dataset
auto samples = parseWorkloadJson(datasetPath, maxNumSamples, maxPromptLen);
auto const numSamples = samples.size();

// Cast inputFeat tensors from fp32 to fp16 if model is fp16
for (auto& sample : samples)
{
if (sample.inputFeat.getSize() > 0)
{
sample.inputFeat = castInputFeatHalf(sample.inputFeat, modelDtype);
}
}

auto recorder = std::make_shared<Recorder>(opCsvFile, benchmarkParams.streaming, beamWidth, responsesJsonFile);
int32_t decoderStartTokenId = 0;
std::shared_ptr<ExecutorServer> executorServer;
@@ -955,20 +1067,24 @@ void benchmarkExecutor(std::optional<std::filesystem::path> const& decoderEngine
std::vector<texec::Request> requests;
for (auto i = 0; i < warmUp; ++i)
{
if (executorModelType == texec::ModelType::kENCODER_DECODER)
if (executorModelType == texec::ModelType::kENCODER_DECODER || samples[0].inputIds.empty())
{
if (samples[0].contextIds.empty()) {
samples[0].contextIds.push_back(decoderStartTokenId);
}
requests.emplace_back(makeExecutorRequest(samples[0].contextIds, samples[0].outputLen, beamWidth, eosId, padId, num_vocabs,
benchmarkParams.streaming, returnContextLogits, returnGenerationLogits, std::nullopt,
benchmarkParams.requestLookaheadConfig, samples[0].inputIds));
benchmarkParams.requestLookaheadConfig, samples[0].inputIds, samples[0].inputFeat, samples[0].inputLen,
benchmarkParams.temperature, benchmarkParams.cfgScale));
}
else
{
requests.emplace_back(makeExecutorRequest(samples[0].inputIds, samples[0].outputLen, beamWidth, eosId, padId, num_vocabs,
benchmarkParams.streaming, returnContextLogits, returnGenerationLogits, std::nullopt,
benchmarkParams.requestLookaheadConfig, std::nullopt, benchmarkParams.temperature));
benchmarkParams.requestLookaheadConfig, std::nullopt,
std::nullopt,
std::nullopt,
benchmarkParams.temperature, benchmarkParams.cfgScale));
}
}
executorServer->enqueue(std::move(requests), true);
@@ -990,20 +1106,22 @@ void benchmarkExecutor(std::optional<std::filesystem::path> const& decoderEngine
{
loraConfig = texec::LoraConfig(samples[i].taskId);
}
if (executorModelType == texec::ModelType::kENCODER_DECODER)
if (executorModelType == texec::ModelType::kENCODER_DECODER || samples[i].inputIds.empty())
{
if (samples[i].contextIds.empty()) {
samples[i].contextIds.push_back(decoderStartTokenId);
}
requests.emplace_back(makeExecutorRequest(samples[i].contextIds, samples[i].outputLen, beamWidth, eosId, padId, num_vocabs,
benchmarkParams.streaming, returnContextLogits, returnGenerationLogits, loraConfig,
benchmarkParams.requestLookaheadConfig, samples[i].inputIds));
benchmarkParams.requestLookaheadConfig, samples[i].inputIds, samples[i].inputFeat, samples[i].inputLen,
benchmarkParams.temperature, benchmarkParams.cfgScale));
}
else
{
requests.emplace_back(makeExecutorRequest(samples[i].inputIds, samples[i].outputLen, beamWidth, eosId, padId, num_vocabs,
benchmarkParams.streaming, returnContextLogits, returnGenerationLogits, loraConfig,
benchmarkParams.requestLookaheadConfig, std::nullopt, benchmarkParams.temperature));
benchmarkParams.requestLookaheadConfig, std::nullopt, std::nullopt, std::nullopt,
benchmarkParams.temperature, benchmarkParams.cfgScale));
}
}

@@ -1160,6 +1278,7 @@ int main(int argc, char* argv[])
"Minimum token probability threshold for typical acceptance. Enables typical acceptance in Eagle",
cxxopts::value<float>());
options.add_options()("temperature", "Sampling temperature for each request", cxxopts::value<float>());
options.add_options()("cfg_scale", "Scale of classifier-free guidance (CFG) for each request", cxxopts::value<float>());
options.add_options()(
"eagle_use_dynamic_tree", "Whether to use Eagle-2", cxxopts::value<bool>()->default_value("false"));
options.add_options()("eagle_dynamic_tree_max_top_k",
@@ -1280,18 +1399,10 @@ int main(int argc, char* argv[])
{
benchmarkParams.freeGpuMemoryFraction = result["kv_cache_free_gpu_mem_fraction"].as<float>();
}
// Argument: K-V Cache Cross Attention Fraction. Only applicable to enc-dec models.
if (result.count("encoder_engine_dir") && result.count("decoder_engine_dir"))
// Argument: K-V Cache Cross Attention Fraction. Only applicable to models with cross-attention.
if (result.count("cross_kv_cache_fraction"))
{
if (result.count("cross_kv_cache_fraction"))
{
benchmarkParams.crossKvCacheFraction = result["cross_kv_cache_fraction"].as<float>();
}
else
{
benchmarkParams.crossKvCacheFraction
= 0.5f; // default value if not set. but non enc-dec should not even have this param set
}
benchmarkParams.crossKvCacheFraction = result["cross_kv_cache_fraction"].as<float>();
}

// Argument: Enable dynamic tuning of batch size
@@ -1407,6 +1518,10 @@
{
benchmarkParams.temperature = result["temperature"].as<float>();
}
if (result.count("cfg_scale"))
{
benchmarkParams.cfgScale = result["cfg_scale"].as<float>();
}

if (result.count("executor_lookahead_config"))
{
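For orientation only: this diff does not show how the executor consumes cfgScale, but classifier-free guidance conventionally blends conditional and unconditional logits as in the sketch below. The function name and the exact blending rule are assumptions, not this repository's implementation.

#include <cstddef>
#include <vector>

std::vector<float> applyCfg(std::vector<float> const& condLogits, std::vector<float> const& uncondLogits, float cfgScale)
{
    // cfgScale == 1.0 reproduces the conditional logits; larger values push the
    // distribution further away from the unconditional one.
    std::vector<float> blended(condLogits.size());
    for (size_t i = 0; i < condLogits.size(); ++i)
    {
        blended[i] = uncondLogits[i] + cfgScale * (condLogits[i] - uncondLogits[i]);
    }
    return blended;
}
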
30 changes: 26 additions & 4 deletions benchmarks/cpp/utils/utils.cpp
@@ -22,6 +22,9 @@

#include <filesystem>
#include <fstream>
#include "tensorrt_llm/runtime/bufferManager.h"
#include "tensorrt_llm/runtime/iTensor.h"
#include <algorithm>
#include <cstring>

namespace tensorrt_llm::benchmark
{
@@ -78,16 +81,19 @@ Samples parseWorkloadJson(
auto constexpr ignoreComments = true;
TLLM_CHECK_WITH_INFO(std::filesystem::exists(datasetPath), "File does not exist: %s", datasetPath.c_str());
std::ifstream jsonStream(datasetPath);
auto json = nlohmann::json::parse(jsonStream, nullptr, allowExceptions, ignoreComments);

Samples samples;
auto json = nlohmann::json::parse(jsonStream, nullptr, allowExceptions, ignoreComments);

for (auto const& sample : json["samples"])
{
if (samples.size() >= maxNumSamples)
break;
int32_t taskId = sample.count("task_id") ? sample["task_id"].template get<int32_t>() : -1;
auto input_ids(sample["input_ids"].template get<std::vector<int32_t>>());
std::vector<int32_t> input_ids;
if (sample.count("input_ids")) {
input_ids = sample["input_ids"].template get<std::vector<int32_t>>();
}
// Fall back to the number of input tokens when the dataset does not provide an explicit input_len.
int32_t inputLen = sample.count("input_len") ? sample["input_len"].template get<int32_t>()
: static_cast<int32_t>(input_ids.size());
if (maxPromptLen && (input_ids.size() > maxPromptLen.value()))
{
input_ids.resize(maxPromptLen.value());
@@ -97,7 +103,23 @@
{
context_ids = sample["context_ids"].template get<std::vector<int32_t>>();
}
samples.emplace_back(Sample{std::move(input_ids), std::move(context_ids), sample["output_len"], taskId});
texec::Tensor inputFeat;
if (sample.count("input_feat"))
{
auto inputFeatVec = sample["input_feat"].template get<std::vector<float>>();

if (!inputFeatVec.empty())
{
TLLM_CHECK_WITH_INFO(inputLen > 0 && inputFeatVec.size() % static_cast<size_t>(inputLen) == 0,
"input_feat size %zu is not divisible by input_len %d", inputFeatVec.size(), inputLen);

int32_t hiddenDim = static_cast<int32_t>(inputFeatVec.size() / inputLen);
// Copy into an owning CPU tensor: Tensor::of would only wrap inputFeatVec's memory,
// which is freed at the end of this loop iteration.
inputFeat = texec::Tensor::cpu(texec::DataType::kFP32, {inputLen, hiddenDim});
std::memcpy(inputFeat.getData(), inputFeatVec.data(), inputFeatVec.size() * sizeof(float));
}
}

samples.emplace_back(Sample{std::move(input_ids), std::move(context_ids), inputFeat, inputLen,
sample["output_len"], taskId});
}

if (samples.size() < maxNumSamples)
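For orientation, one dataset entry in the shape the updated parser accepts; the field names come from the parsing code above, the values are made up, and hidden_dim is recovered as input_feat.size() / input_len. A hedged sketch assuming nlohmann::json is available:

auto sample = nlohmann::json::parse(R"({
    "input_ids": [],
    "context_ids": [42],
    "input_len": 2,
    "input_feat": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8],
    "output_len": 16,
    "task_id": -1
})");
// With input_len == 2 and eight feature values, the parser builds a {2, 4} fp32 tensor.
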
5 changes: 5 additions & 0 deletions benchmarks/cpp/utils/utils.h
@@ -17,6 +17,8 @@
*/

#include "tensorrt_llm/executor/executor.h"
#include "tensorrt_llm/executor/tensor.h"
#include "tensorrt_llm/runtime/iTensor.h"

#include <cstdint>
#include <cxxopts.hpp>
@@ -87,6 +89,7 @@ struct BenchmarkParams

std::optional<texec::EagleConfig> eagleConfig;
std::optional<float> temperature;
std::optional<float> cfgScale;

std::optional<texec::LookaheadDecodingConfig> executorLookaheadConfig;
std::optional<texec::LookaheadDecodingConfig> requestLookaheadConfig;
@@ -225,6 +228,8 @@ struct Sample
{
std::vector<int32_t> inputIds;
std::vector<int32_t> contextIds;
texec::Tensor inputFeat;
int32_t inputLen;
int32_t outputLen;
int32_t taskId;
};
@@ -46,7 +46,8 @@ class HandleGenerationLogits
HandleGenerationLogits() = default;

void operator()(tr::SizeType32 logitsIndex, RequestVector const& generationRequests, DecoderBuffers& decoderBuffers,
tr::ModelConfig const& modelConfig, tr::BufferManager const& manager, tr::ITensor::SharedPtr const& logits,
tr::ModelConfig const& modelConfig, tr::BufferManager const& manager,
tensorrt_llm::runtime::CudaStream const& stream, tr::ITensor::SharedPtr const& logits,
OptionalRef<RuntimeBuffers> genRuntimeBuffers, tr::SizeType32 vocabId = 0) const;
};

13 changes: 13 additions & 0 deletions cpp/include/tensorrt_llm/batch_manager/kvCacheUtils.h
@@ -39,6 +39,19 @@ class BlockRange
{
}

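// Build a BlockRange covering every sequence of a request: for the given beam, the cache
// block ids of each sequence slot are concatenated in sequence order.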
BlockRange(BaseKVCacheManager const& cacheManager, LlmRequest const& llmRequest, SizeType32 beam, SizeType32 poolIdx = 0)
: mManager(&cacheManager)
, mPool(cacheManager.getBlockManager().getPrimaryPool(poolIdx))
{
std::vector<SizeType32> blockIds;
for (SizeType32 i = 0; i < llmRequest.getNumSequences(); i++)
{
auto const requestId = llmRequest.getSeqSlotId(i);
auto const& thisBlockIds = cacheManager.getSequence(requestId).getCacheBlockIds().at(beam);
blockIds.insert(blockIds.end(), thisBlockIds.begin(), thisBlockIds.end());
}
mBlockIds = std::move(blockIds);
}

BlockRange(BaseKVCacheManager const& cacheManager, std::vector<SizeType32> blockIds, SizeType32 poolIdx = 0)
: mManager(&cacheManager)
, mPool(cacheManager.getBlockManager().getPrimaryPool(poolIdx))