155 changes: 135 additions & 20 deletions benchmarks/cpp/gptManagerBenchmark.cpp
@@ -40,6 +40,8 @@
#include <string>
#include <thread>
#include <utility>
#include <fstream>
#include <filesystem>

using namespace tensorrt_llm::batch_manager;
using namespace tensorrt_llm::runtime;
@@ -53,6 +55,91 @@ namespace

using TensorPtr = ITensor::SharedPtr;

// Read the model data type from the engine's config.json; falls back to float32 when it cannot be determined.
std::string getModelDtype(std::optional<std::filesystem::path> const& engineDir, texec::ModelType modelType)
{
if (!engineDir.has_value())
{
return "float32"; // default fallback when no engine directory is given
}

std::filesystem::path configPath = engineDir.value() / "config.json";
if (!std::filesystem::exists(configPath))
{
TLLM_LOG_WARNING("Config file not found at %s, using default dtype float32", configPath.string().c_str());
return "float32";
}

try
{
std::ifstream configFile(configPath);
nlohmann::json config;
configFile >> config;

std::string dtype = "float32"; // default

// Check if this is an engine config or checkpoint config
if (config.contains("pretrained_config"))
{
// Engine format
if (config["pretrained_config"].contains("dtype"))
{
dtype = config["pretrained_config"]["dtype"].get<std::string>();
}
}
else if (config.contains("dtype"))
{
// Checkpoint format: dtype is a top-level field
dtype = config["dtype"].get<std::string>();
}
TLLM_LOG_INFO("Detected model dtype: %s", dtype.c_str());
return dtype;
}
catch (std::exception const& e)
{
TLLM_LOG_WARNING("Failed to read dtype from config: %s, using default float32", e.what());
return "float32";
}
}
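
For orientation, a minimal sketch of the config layout this helper inspects. The snippet below is illustrative only: it assumes nlohmann/json is available, uses a made-up dtype value, and omits the many other fields a real engine config contains.

#include <nlohmann/json.hpp>
#include <iostream>
#include <string>

int main()
{
    // Minimal shape of the engine config.json that getModelDtype() reads.
    auto config = nlohmann::json::parse(R"({"pretrained_config": {"dtype": "float16"}})");

    std::string dtype = "float32"; // same fallback as the helper above
    if (config.contains("pretrained_config") && config["pretrained_config"].contains("dtype"))
    {
        dtype = config["pretrained_config"]["dtype"].get<std::string>();
    }
    std::cout << dtype << '\n'; // prints: float16
    return 0;
}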

// Cast an fp32 inputFeat tensor to fp16 on the CPU (for fp16 models) and copy the result to the GPU.
texec::Tensor castInputFeatHalf(texec::Tensor const& inputFeat, std::string const& modelDtype)
{
// Only fp16 models need the cast; any other model dtype keeps the original fp32 features.
if (modelDtype != "float16")
{
return inputFeat;
}

auto currentDtype = inputFeat.getDataType();
auto memoryType = inputFeat.getMemoryType();

// Only cast CPU tensors from fp32 to fp16
if (currentDtype != texec::DataType::kFP32 || memoryType != texec::MemoryType::kCPU)
{
TLLM_LOG_WARNING("InputFeat tensor is not fp32 or not on CPU, skipping cast");
return inputFeat;
}

try
{
auto shape = inputFeat.getShape();
auto numElements = inputFeat.getSize();

// Create new fp16 tensor on CPU
texec::Tensor castedTensor = texec::Tensor::cpu(texec::DataType::kFP16, shape);

// Cast data from fp32 to fp16
auto const* sourceData = static_cast<float const*>(inputFeat.getData());
auto* targetData = static_cast<half*>(castedTensor.getData());

for (size_t i = 0; i < numElements; ++i)
{
targetData[i] = static_cast<half>(sourceData[i]);
}
TLLM_LOG_DEBUG("Cast inputFeat tensor from fp32 to fp16");

auto stream = std::make_shared<tensorrt_llm::runtime::CudaStream>();
texec::Tensor castedTensorGpu = castedTensor.copyToGpu(stream);
// Make sure the host-to-device copy has completed before the temporary stream goes out of scope.
stream->synchronize();

return castedTensorGpu;
}
catch (std::exception const& e)
{
TLLM_LOG_WARNING("Failed to cast inputFeat tensor to fp16: %s, keeping original", e.what());
return inputFeat;
}
}
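
A minimal usage sketch of the helper above, assuming only the texec::Tensor calls already used in this file; the shape and values are arbitrary.

texec::Tensor makeAndCastDummyFeat()
{
    // Build a small fp32 feature tensor on the host.
    auto feat = texec::Tensor::cpu(texec::DataType::kFP32, {4, 8});
    auto* data = static_cast<float*>(feat.getData());
    for (size_t i = 0; i < feat.getSize(); ++i)
    {
        data[i] = 0.5f * static_cast<float>(i);
    }
    // For a float16 model this returns a kFP16 tensor on the GPU; otherwise the original tensor.
    return castInputFeatHalf(feat, /*modelDtype=*/"float16");
}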

class LoraLib
{
public:
@@ -810,16 +897,19 @@ class ExecutorServer
namespace
{

texec::Request makeExecutorRequest(texec::VecTokens &inputTokenIds, int32_t outputLen, SizeType32 const& beamWidth,
texec::Request makeExecutorRequest(texec::VecTokens inputTokenIds, SizeType32 outputLen, SizeType32 const& beamWidth,
std::optional<SizeType32> const& eosId, std::optional<SizeType32> const& padId, SizeType32 num_vocabs = 1, bool streaming = false,
bool const& returnContextLogits = false, bool const& returnGenerationLogits = false,
std::optional<texec::LoraConfig> const& loraConfig = std::nullopt,
std::optional<texec::LookaheadDecodingConfig> const& lookaheadConfig = std::nullopt,
std::optional<texec::VecTokens> encoderInputTokenIds = std::nullopt,
std::optional<float> temperature = std::nullopt)
std::optional<texec::Tensor> encoderFeatures = std::nullopt,
std::optional<SizeType32> encoderOutLen = std::nullopt,
std::optional<float> temperature = std::nullopt, std::optional<float> cfgScale = std::nullopt)
{
auto samplingConfig = texec::SamplingConfig{beamWidth};
samplingConfig.setTemperature(temperature);
samplingConfig.setCfgScale(cfgScale);
auto outputConfig = texec::OutputConfig{false, returnContextLogits, returnGenerationLogits, false};
auto request = texec::Request(inputTokenIds, outputLen, streaming, samplingConfig, outputConfig, eosId, padId,
std::nullopt, // positionIds
@@ -834,7 +924,14 @@ texec::Request makeExecutorRequest(texec::VecTokens &inputTokenIds, int32_t outp
std::nullopt, // kvCacheRetentionConfig
std::nullopt, // logitsPostProcessorName
std::nullopt, // logitsPostProcessor
encoderInputTokenIds.has_value() ? encoderInputTokenIds : std::nullopt);
encoderInputTokenIds.has_value() && encoderInputTokenIds.value().size() > 0 ? encoderInputTokenIds : std::nullopt,
std::nullopt, // client id
false, // returnAllGeneratedTokens
tensorrt_llm::executor::Request::kDefaultPriority, // priority
tensorrt_llm::executor::RequestType::REQUEST_TYPE_CONTEXT_AND_GENERATION, // type
std::nullopt, // ContextPhaseParams
encoderFeatures.has_value() && encoderFeatures.value().getSize() > 0 ? encoderFeatures : std::nullopt,
encoderOutLen);
if (num_vocabs > 1) {
request.setNumVocabs(num_vocabs);
}
@@ -853,10 +950,25 @@ void benchmarkExecutor(std::optional<std::filesystem::path> const& decoderEngine
auto const& world = tensorrt_llm::mpi::MpiComm::world();
auto worldRank = world.getRank();

// Determine model dtype from config
std::string modelDtype = "float32"; // default
if (decoderEngineDir.has_value()) {
modelDtype = getModelDtype(decoderEngineDir, executorModelType);
}

// Load dataset
auto samples = parseWorkloadJson(datasetPath, maxNumSamples, maxPromptLen);
auto const numSamples = samples.size();

// Cast inputFeat tensors from fp32 to fp16 if model is fp16
for (auto& sample : samples)
{
if (sample.inputFeat.getSize() > 0)
{
sample.inputFeat = castInputFeatHalf(sample.inputFeat, modelDtype);
}
}

auto recorder = std::make_shared<Recorder>(opCsvFile, benchmarkParams.streaming, beamWidth, responsesJsonFile);
int32_t decoderStartTokenId = 0;
std::shared_ptr<ExecutorServer> executorServer;
@@ -955,20 +1067,24 @@ void benchmarkExecutor(std::optional<std::filesystem::path> const& decoderEngine
std::vector<texec::Request> requests;
for (auto i = 0; i < warmUp; ++i)
{
if (executorModelType == texec::ModelType::kENCODER_DECODER)
if (executorModelType == texec::ModelType::kENCODER_DECODER || samples[0].inputIds.empty())
{
if (samples[0].contextIds.empty()) {
samples[0].contextIds.push_back(decoderStartTokenId);
}
requests.emplace_back(makeExecutorRequest(samples[0].contextIds, samples[0].outputLen, beamWidth, eosId, padId, num_vocabs,
benchmarkParams.streaming, returnContextLogits, returnGenerationLogits, std::nullopt,
benchmarkParams.requestLookaheadConfig, samples[0].inputIds));
benchmarkParams.requestLookaheadConfig, samples[0].inputIds, samples[0].inputFeat, samples[0].inputLen,
benchmarkParams.temperature, benchmarkParams.cfgScale));
}
else
{
requests.emplace_back(makeExecutorRequest(samples[0].inputIds, samples[0].outputLen, beamWidth, eosId, padId, num_vocabs,
benchmarkParams.streaming, returnContextLogits, returnGenerationLogits, std::nullopt,
benchmarkParams.requestLookaheadConfig, std::nullopt, benchmarkParams.temperature));
benchmarkParams.requestLookaheadConfig, std::nullopt,
std::nullopt,
std::nullopt,
benchmarkParams.temperature, benchmarkParams.cfgScale));
}
}
executorServer->enqueue(std::move(requests), true);
@@ -990,20 +1106,22 @@ void benchmarkExecutor(std::optional<std::filesystem::path> const& decoderEngine
{
loraConfig = texec::LoraConfig(samples[i].taskId);
}
if (executorModelType == texec::ModelType::kENCODER_DECODER)
if (executorModelType == texec::ModelType::kENCODER_DECODER || samples[i].inputIds.empty())
{
if (samples[i].contextIds.empty()) {
samples[i].contextIds.push_back(decoderStartTokenId);
}
requests.emplace_back(makeExecutorRequest(samples[i].contextIds, samples[i].outputLen, beamWidth, eosId, padId, num_vocabs,
benchmarkParams.streaming, returnContextLogits, returnGenerationLogits, loraConfig,
benchmarkParams.requestLookaheadConfig, samples[i].inputIds));
benchmarkParams.requestLookaheadConfig, samples[i].inputIds, samples[i].inputFeat, samples[i].inputLen,
benchmarkParams.temperature, benchmarkParams.cfgScale));
}
else
{
requests.emplace_back(makeExecutorRequest(samples[i].inputIds, samples[i].outputLen, beamWidth, eosId, padId, num_vocabs,
benchmarkParams.streaming, returnContextLogits, returnGenerationLogits, loraConfig,
benchmarkParams.requestLookaheadConfig, std::nullopt, benchmarkParams.temperature));
benchmarkParams.requestLookaheadConfig, std::nullopt, std::nullopt, std::nullopt,
benchmarkParams.temperature, benchmarkParams.cfgScale));
}
}

@@ -1160,6 +1278,7 @@ int main(int argc, char* argv[])
"Minimum token probability threshold for typical acceptance. Enables typical acceptance in Eagle",
cxxopts::value<float>());
options.add_options()("temperature", "Sampling temperature for each request", cxxopts::value<float>());
options.add_options()("cfg_scale", "Scale of classifier-free guidance (CFG) for each request", cxxopts::value<float>());
options.add_options()(
"eagle_use_dynamic_tree", "Whether to use Eagle-2", cxxopts::value<bool>()->default_value("false"));
options.add_options()("eagle_dynamic_tree_max_top_k",
@@ -1280,18 +1399,10 @@ int main(int argc, char* argv[])
{
benchmarkParams.freeGpuMemoryFraction = result["kv_cache_free_gpu_mem_fraction"].as<float>();
}
// Argument: K-V Cache Cross Attention Fraction. Only applicable to enc-dec models.
if (result.count("encoder_engine_dir") && result.count("decoder_engine_dir"))
// Argument: K-V Cache Cross Attention Fraction. Only applicable to models with cross-attention.
if (result.count("cross_kv_cache_fraction"))
{
if (result.count("cross_kv_cache_fraction"))
{
benchmarkParams.crossKvCacheFraction = result["cross_kv_cache_fraction"].as<float>();
}
else
{
benchmarkParams.crossKvCacheFraction
= 0.5f; // default value if not set. but non enc-dec should not even have this param set
}
benchmarkParams.crossKvCacheFraction = result["cross_kv_cache_fraction"].as<float>();
}

// Argument: Enable dynamic tuning of batch size
@@ -1407,6 +1518,10 @@
{
benchmarkParams.temperature = result["temperature"].as<float>();
}
if (result.count("cfg_scale"))
{
benchmarkParams.cfgScale = result["cfg_scale"].as<float>();
}

if (result.count("executor_lookahead_config"))
{
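For orientation only: this diff does not show how the executor consumes cfgScale, but classifier-free guidance conventionally blends conditional and unconditional logits as in the sketch below. The function name and the exact blending rule are assumptions, not this repository's implementation.

#include <cstddef>
#include <vector>

std::vector<float> applyCfg(std::vector<float> const& condLogits, std::vector<float> const& uncondLogits, float cfgScale)
{
    // cfgScale == 1.0 reproduces the conditional logits; larger values push the
    // distribution further away from the unconditional one.
    std::vector<float> blended(condLogits.size());
    for (size_t i = 0; i < condLogits.size(); ++i)
    {
        blended[i] = uncondLogits[i] + cfgScale * (condLogits[i] - uncondLogits[i]);
    }
    return blended;
}
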
30 changes: 26 additions & 4 deletions benchmarks/cpp/utils/utils.cpp
@@ -22,6 +22,9 @@

#include <filesystem>
#include <fstream>
#include "tensorrt_llm/runtime/bufferManager.h"
#include "tensorrt_llm/runtime/iTensor.h"
#include <algorithm>
#include <cstring>

namespace tensorrt_llm::benchmark
{
@@ -78,16 +81,19 @@ Samples parseWorkloadJson(
auto constexpr ignoreComments = true;
TLLM_CHECK_WITH_INFO(std::filesystem::exists(datasetPath), "File does not exist: %s", datasetPath.c_str());
std::ifstream jsonStream(datasetPath);
auto json = nlohmann::json::parse(jsonStream, nullptr, allowExceptions, ignoreComments);

Samples samples;
auto json = nlohmann::json::parse(jsonStream, nullptr, allowExceptions, ignoreComments);

for (auto const& sample : json["samples"])
{
if (samples.size() >= maxNumSamples)
break;
int32_t taskId = sample.count("task_id") ? sample["task_id"].template get<int32_t>() : -1;
auto input_ids(sample["input_ids"].template get<std::vector<int32_t>>());
std::vector<int32_t> input_ids;
if (sample.count("input_ids")) {
input_ids = sample["input_ids"].template get<std::vector<int32_t>>();
}
// Fall back to the number of input tokens when the dataset does not provide an explicit input_len.
int32_t inputLen = sample.count("input_len") ? sample["input_len"].template get<int32_t>()
: static_cast<int32_t>(input_ids.size());
if (maxPromptLen && (input_ids.size() > maxPromptLen.value()))
{
input_ids.resize(maxPromptLen.value());
@@ -97,7 +103,23 @@
{
context_ids = sample["context_ids"].template get<std::vector<int32_t>>();
}
samples.emplace_back(Sample{std::move(input_ids), std::move(context_ids), sample["output_len"], taskId});
texec::Tensor inputFeat;
if (sample.count("input_feat"))
{
auto inputFeatVec = sample["input_feat"].template get<std::vector<float>>();

if (!inputFeatVec.empty())
{
TLLM_CHECK_WITH_INFO(inputLen > 0 && inputFeatVec.size() % static_cast<size_t>(inputLen) == 0,
"input_feat size %zu is not divisible by input_len %d", inputFeatVec.size(), inputLen);

int32_t hiddenDim = static_cast<int32_t>(inputFeatVec.size() / inputLen);
// Copy into an owning CPU tensor: Tensor::of would only wrap inputFeatVec's memory,
// which is freed at the end of this loop iteration.
inputFeat = texec::Tensor::cpu(texec::DataType::kFP32, {inputLen, hiddenDim});
std::memcpy(inputFeat.getData(), inputFeatVec.data(), inputFeatVec.size() * sizeof(float));
}
}

samples.emplace_back(Sample{std::move(input_ids), std::move(context_ids), inputFeat, inputLen,
sample["output_len"], taskId});
}

if (samples.size() < maxNumSamples)
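For orientation, one dataset entry in the shape the updated parser accepts; the field names come from the parsing code above, the values are made up, and hidden_dim is recovered as input_feat.size() / input_len. A hedged sketch assuming nlohmann::json is available:

auto sample = nlohmann::json::parse(R"({
    "input_ids": [],
    "context_ids": [42],
    "input_len": 2,
    "input_feat": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8],
    "output_len": 16,
    "task_id": -1
})");
// With input_len == 2 and eight feature values, the parser builds a {2, 4} fp32 tensor.
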
5 changes: 5 additions & 0 deletions benchmarks/cpp/utils/utils.h
@@ -17,6 +17,8 @@
*/

#include "tensorrt_llm/executor/executor.h"
#include "tensorrt_llm/executor/tensor.h"
#include "tensorrt_llm/runtime/iTensor.h"

#include <cstdint>
#include <cxxopts.hpp>
@@ -87,6 +89,7 @@ struct BenchmarkParams

std::optional<texec::EagleConfig> eagleConfig;
std::optional<float> temperature;
std::optional<float> cfgScale;

std::optional<texec::LookaheadDecodingConfig> executorLookaheadConfig;
std::optional<texec::LookaheadDecodingConfig> requestLookaheadConfig;
@@ -225,6 +228,8 @@ struct Sample
{
std::vector<int32_t> inputIds;
std::vector<int32_t> contextIds;
texec::Tensor inputFeat;
int32_t inputLen;
int32_t outputLen;
int32_t taskId;
};
@@ -46,7 +46,8 @@ class HandleGenerationLogits
HandleGenerationLogits() = default;

void operator()(tr::SizeType32 logitsIndex, RequestVector const& generationRequests, DecoderBuffers& decoderBuffers,
tr::ModelConfig const& modelConfig, tr::BufferManager const& manager, tr::ITensor::SharedPtr const& logits,
tr::ModelConfig const& modelConfig, tr::BufferManager const& manager,
tensorrt_llm::runtime::CudaStream const& stream, tr::ITensor::SharedPtr const& logits,
OptionalRef<RuntimeBuffers> genRuntimeBuffers, tr::SizeType32 vocabId = 0) const;
};

13 changes: 13 additions & 0 deletions cpp/include/tensorrt_llm/batch_manager/kvCacheUtils.h
@@ -39,6 +39,19 @@ class BlockRange
{
}

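// Build a BlockRange covering every sequence of a request: for the given beam, the cache
// block ids of each sequence slot are concatenated in sequence order.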
BlockRange(BaseKVCacheManager const& cacheManager, LlmRequest const& llmRequest, SizeType32 beam, SizeType32 poolIdx = 0)
: mManager(&cacheManager)
, mPool(cacheManager.getBlockManager().getPrimaryPool(poolIdx))
{
std::vector<SizeType32> blockIds;
for (SizeType32 i = 0; i < llmRequest.getNumSequences(); i++)
{
auto const requestId = llmRequest.getSeqSlotId(i);
auto const& thisBlockIds = cacheManager.getSequence(requestId).getCacheBlockIds().at(beam);
blockIds.insert(blockIds.end(), thisBlockIds.begin(), thisBlockIds.end());
}
mBlockIds = std::move(blockIds);
}

BlockRange(BaseKVCacheManager const& cacheManager, std::vector<SizeType32> blockIds, SizeType32 poolIdx = 0)
: mManager(&cacheManager)
, mPool(cacheManager.getBlockManager().getPrimaryPool(poolIdx))