diff --git a/cpp/include/tensorrt_llm/executor/executor.h b/cpp/include/tensorrt_llm/executor/executor.h index dda8f52cc82..787fa0bb7e9 100644 --- a/cpp/include/tensorrt_llm/executor/executor.h +++ b/cpp/include/tensorrt_llm/executor/executor.h @@ -1468,7 +1468,8 @@ class CacheTransceiverConfig DEFAULT = 0, MPI = 1, UCX = 2, - NIXL = 3 + NIXL = 3, + MOONCAKE = 4 }; explicit CacheTransceiverConfig(std::optional backendType = std::nullopt, std::optional maxNumTokens = std::nullopt, std::optional kvTransferTimeoutMs = std::nullopt, diff --git a/cpp/include/tensorrt_llm/executor/transferAgent.h b/cpp/include/tensorrt_llm/executor/transferAgent.h index ac469fcb403..5f4ff1f0616 100644 --- a/cpp/include/tensorrt_llm/executor/transferAgent.h +++ b/cpp/include/tensorrt_llm/executor/transferAgent.h @@ -391,6 +391,14 @@ template "libtensorrt_llm_nixl_wrapper.so", "createNixlTransferAgent"); return func(std::forward(args)...); } + if (backend == "mooncake") + { + auto& loader = DynLibLoader::getInstance(); + using CreateMooncakeFuncType = std::unique_ptr (*)(BaseAgentConfig const*); + auto* func = loader.getFunctionPointer( + "libtensorrt_llm_mooncake_wrapper.so", "createMooncakeTransferAgent"); + return func(std::forward(args)...); + } TLLM_THROW("Unknown backend name."); } diff --git a/cpp/tensorrt_llm/CMakeLists.txt b/cpp/tensorrt_llm/CMakeLists.txt index a9e4a007290..76604ec2296 100644 --- a/cpp/tensorrt_llm/CMakeLists.txt +++ b/cpp/tensorrt_llm/CMakeLists.txt @@ -159,6 +159,10 @@ if(NIXL_ROOT) set(NIXL_WRAPPER_TARGET tensorrt_llm_nixl_wrapper) endif() +if(MOONCAKE_ROOT) + set(MOONCAKE_WRAPPER_TARGET tensorrt_llm_mooncake_wrapper) +endif() + add_subdirectory(executor) find_package(Threads REQUIRED) @@ -272,6 +276,11 @@ if(TARGET ${NIXL_WRAPPER_TARGET}) add_dependencies(${SHARED_TARGET} ${NIXL_WRAPPER_TARGET}) endif() +if(TARGET ${MOONCAKE_WRAPPER_TARGET}) + target_link_libraries(${MOONCAKE_WRAPPER_TARGET} INTERFACE ${SHARED_TARGET}) + add_dependencies(${SHARED_TARGET} ${MOONCAKE_WRAPPER_TARGET}) +endif() + if(NOT WIN32) # Load libraries at $PREFIX/lib from # $PREFIX/lib/python3.12/site-packages/tensorrt_llm/libs diff --git a/cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp b/cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp index bb253c969f3..7e4c26bfd78 100644 --- a/cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp +++ b/cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp @@ -81,6 +81,11 @@ std::unique_ptr CacheTransceiverFactory::createCacheTransc backendType = executor::CacheTransceiverConfig::BackendType::NIXL; TLLM_LOG_INFO("Enable NIXL KV cache transport."); } + else if (common::getEnvUseMooncakeKvCache()) + { + backendType = executor::CacheTransceiverConfig::BackendType::MOONCAKE; + TLLM_LOG_INFO("Enable MOONCAKE KV cache transport."); + } else if (common::getEnvUseMPIKvCache()) { backendType = executor::CacheTransceiverConfig::BackendType::MPI; @@ -203,9 +208,15 @@ CacheTransceiver::CacheTransceiver(kv_cache_manager::BaseKVCacheManager* cacheMa else if (backendType.value() == executor::CacheTransceiverConfig::BackendType::NIXL) { mManager = std::make_unique( - mCacheTransBufferManagerPtrs, *mCacheState); + mCacheTransBufferManagerPtrs, *mCacheState, "nixl"); TLLM_LOG_INFO("NIXL Connection Manager created"); } + else if (backendType.value() == executor::CacheTransceiverConfig::BackendType::MOONCAKE) + { + mManager = std::make_unique( + mCacheTransBufferManagerPtrs, *mCacheState, "mooncake"); + TLLM_LOG_INFO("MOONCAKE Connection Manager created"); + } else if (backendType.value() == 
executor::CacheTransceiverConfig::BackendType::MPI) { mMpiWorldComm = std::addressof(tensorrt_llm::mpi::MpiComm::world()); diff --git a/cpp/tensorrt_llm/common/envUtils.cpp b/cpp/tensorrt_llm/common/envUtils.cpp index fc85975acb3..4a082a4ff3a 100644 --- a/cpp/tensorrt_llm/common/envUtils.cpp +++ b/cpp/tensorrt_llm/common/envUtils.cpp @@ -281,6 +281,12 @@ bool getEnvUseNixlKvCache() return useNixlKvCache; } +bool getEnvUseMooncakeKvCache() +{ + static bool const useMooncakeKvCache = getBoolEnv("TRTLLM_USE_MOONCAKE_KVCACHE"); + return useMooncakeKvCache; +} + bool getEnvUseRoundRobinBlockDistForCP() { static bool const useRoundRobinBlockDistForCP = getBoolEnv("TRTLLM_USE_ROUND_ROBIN_BLOCK_DIST_FOR_CP"); @@ -343,6 +349,23 @@ std::string getEnvNixlBackend() return nixlBackend; } +std::string getEnvMooncakeInterface() +{ + static std::once_flag flag; + static std::string mooncakeInterface; + + std::call_once(flag, + [&]() + { + char const* mooncake_interface = std::getenv("TRTLLM_MOONCAKE_INTERFACE"); + if (mooncake_interface) + { + mooncakeInterface = mooncake_interface; + } + }); + return mooncakeInterface; +} + bool getEnvDisaggLayerwise() { static bool const disaggLayerwise = getBoolEnv("TRTLLM_DISAGG_LAYERWISE"); diff --git a/cpp/tensorrt_llm/common/envUtils.h b/cpp/tensorrt_llm/common/envUtils.h index 8a3af2458dd..f838f0e9ae0 100644 --- a/cpp/tensorrt_llm/common/envUtils.h +++ b/cpp/tensorrt_llm/common/envUtils.h @@ -83,8 +83,11 @@ inline void launchWithPdlWhenEnabled(char const* name, KernelFn kernelFn, dim3 g bool getEnvUseUCXKvCache(); bool getEnvUseMPIKvCache(); + bool getEnvUseNixlKvCache(); +bool getEnvUseMooncakeKvCache(); + bool getEnvUseRoundRobinBlockDistForCP(); std::string getEnvUCXInterface(); @@ -93,6 +96,8 @@ std::string getEnvNixlInterface(); std::string getEnvNixlBackend(); +std::string getEnvMooncakeInterface(); + bool getEnvDisaggLayerwise(); bool getEnvParallelCacheSend(); diff --git a/cpp/tensorrt_llm/common/ipUtils.cpp b/cpp/tensorrt_llm/common/ipUtils.cpp new file mode 100644 index 00000000000..e4e9767194e --- /dev/null +++ b/cpp/tensorrt_llm/common/ipUtils.cpp @@ -0,0 +1,226 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "ipUtils.h" +#include "tensorrt_llm/common/logger.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +TRTLLM_NAMESPACE_BEGIN + +namespace common +{ + +std::string getLocalIpByNic(std::string const& interface, int rank) +{ + struct ifaddrs* ifaddr = nullptr; + if (getifaddrs(&ifaddr) == -1) + { + TLLM_LOG_ERROR(rank, + "getLocalIpByNic: Can't get local ip from NIC Interface. 
Please check whether corresponding INTERFACE is " + "set " + "correctly."); + return std::string{}; + } + + for (struct ifaddrs* ifa = ifaddr; ifa != nullptr; ifa = ifa->ifa_next) + { + if (ifa->ifa_addr == nullptr) + { + continue; + } + + if (ifa->ifa_name == interface) + { + if (ifa->ifa_addr->sa_family == AF_INET) + { + char ip[INET_ADDRSTRLEN]{}; + void* addr = &((reinterpret_cast(ifa->ifa_addr))->sin_addr); + if ((inet_ntop(AF_INET, addr, ip, sizeof(ip)) != nullptr) && std::strcmp(ip, "0.0.0.0") != 0) + { + freeifaddrs(ifaddr); + return std::string(ip); + } + } + else if (ifa->ifa_addr->sa_family == AF_INET6) + { + char ip[INET6_ADDRSTRLEN]{}; + void* addr = &((reinterpret_cast(ifa->ifa_addr))->sin6_addr); + if ((inet_ntop(AF_INET6, addr, ip, sizeof(ip)) != nullptr) && std::strncmp(ip, "fe80::", 6) != 0 + && std::strcmp(ip, "::1") != 0) + { + freeifaddrs(ifaddr); + return std::string(ip); + } + } + } + } + + freeifaddrs(ifaddr); + TLLM_LOG_ERROR( + rank, "Can't get local ip from NIC Interface. Please check whether corresponding INTERFACE is set correctly."); + return std::string{}; +} + +std::string getLocalIpByHostname(int rank) +{ + char hostname[256]{}; + if (gethostname(hostname, sizeof(hostname)) == -1) + { + TLLM_LOG_ERROR(rank, "getLocalIpByHostname: Can't get hostname"); + return std::string{}; + } + + struct addrinfo hints = {}; + hints.ai_family = AF_UNSPEC; + hints.ai_socktype = SOCK_STREAM; + hints.ai_flags = AI_CANONNAME; + + struct addrinfo* res = nullptr; + if (getaddrinfo(hostname, nullptr, &hints, &res) != 0) + { + TLLM_LOG_WARNING(rank, "getLocalIpByHostname: Can't get address info for hostname"); + return std::string{}; + } + + for (struct addrinfo* p = res; p != nullptr; p = p->ai_next) + { + + if (p->ai_family == AF_INET) + { // IPv4 + char ip[INET_ADDRSTRLEN]{}; + struct sockaddr_in* ipv4 = reinterpret_cast(p->ai_addr); + void* addr = &(ipv4->sin_addr); + if ((inet_ntop(AF_INET, addr, ip, sizeof(ip)) != nullptr) && std::strcmp(ip, "127.0.0.1") != 0 + && std::strcmp(ip, "0.0.0.0") != 0) + { + freeaddrinfo(res); + return std::string(ip); + } + } + else if (p->ai_family == AF_INET6) + { // IPv6 + char ip[INET6_ADDRSTRLEN]{}; + struct sockaddr_in6* ipv6 = reinterpret_cast(p->ai_addr); + void* addr = &(ipv6->sin6_addr); + if ((inet_ntop(AF_INET6, addr, ip, sizeof(ip)) != nullptr) && std::strncmp(ip, "fe80::", 6) != 0 + && std::strcmp(ip, "::1") != 0) + { + freeaddrinfo(res); + return std::string(ip); + } + } + } + + freeaddrinfo(res); + TLLM_LOG_WARNING(rank, "getLocalIpByHostname: Can't get local ip from hostname"); + return std::string{}; +} + +std::string getLocalIpByRemoteOrHostName(int rank) +{ + + // Try IPv4 + struct sockaddr_in addr + { + }; + + addr.sin_family = AF_INET; + addr.sin_port = htons(80); + // using google's public dns server to get the local ip which can be accessed from remote + char const* dns_ip_v4 = "8.8.8.8"; + inet_pton(AF_INET, dns_ip_v4, &addr.sin_addr); + + int sock = socket(AF_INET, SOCK_DGRAM, 0); + if (sock != -1) + { + if (connect(sock, reinterpret_cast(&addr), sizeof(addr)) != -1) + { + socklen_t addr_len = sizeof(addr); + if (getsockname(sock, reinterpret_cast(&addr), &addr_len) != -1) + { + char ip[INET_ADDRSTRLEN]{}; + inet_ntop(AF_INET, &addr.sin_addr, ip, sizeof(ip)); + close(sock); + return std::string(ip); + } + } + close(sock); + } + + // Try IPv6 + struct sockaddr_in6 addr6 + { + }; + + addr6.sin6_family = AF_INET6; + addr6.sin6_port = htons(80); + // using google's public dns server + char const* dns_ipv6 = 
"2001:4860:4860::8888"; + inet_pton(AF_INET6, dns_ipv6, &addr6.sin6_addr); + + sock = socket(AF_INET6, SOCK_DGRAM, 0); + if (sock != -1) + { + if (connect(sock, reinterpret_cast(&addr6), sizeof(addr6)) != -1) + { + socklen_t addr_len = sizeof(addr6); + if (getsockname(sock, reinterpret_cast(&addr6), &addr_len) != -1) + { + char ip[INET6_ADDRSTRLEN]{}; + inet_ntop(AF_INET6, &addr6.sin6_addr, ip, sizeof(ip)); + close(sock); + return std::string(ip); + } + } + close(sock); + } + + // Try hostname + return getLocalIpByHostname(rank); +} + +std::string getLocalIp(std::string interface, int rank) +{ + std::string localIP = {}; + if (!interface.empty()) + { + localIP = getLocalIpByNic(interface, rank); + } + if (localIP.empty()) + { + localIP = getLocalIpByRemoteOrHostName(rank); + } + // check whether the localIP is valid + if (localIP.empty()) + { + TLLM_THROW("getLocalIp: Can't get local ip"); + } + return localIP; +} +} // namespace common + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/ipUtils.h b/cpp/tensorrt_llm/common/ipUtils.h new file mode 100644 index 00000000000..9e8081683df --- /dev/null +++ b/cpp/tensorrt_llm/common/ipUtils.h @@ -0,0 +1,28 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "tensorrt_llm/common/config.h" +#include + +TRTLLM_NAMESPACE_BEGIN + +namespace common +{ +std::string getLocalIp(std::string interface, int rank); +} // namespace common + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/executor/CMakeLists.txt b/cpp/tensorrt_llm/executor/CMakeLists.txt index e0e91d4b993..6639b582751 100644 --- a/cpp/tensorrt_llm/executor/CMakeLists.txt +++ b/cpp/tensorrt_llm/executor/CMakeLists.txt @@ -91,3 +91,4 @@ target_compile_definitions(${EXECUTOR_STATIC_TARGET} add_subdirectory(cache_transmission/ucx_utils) add_subdirectory(cache_transmission/nixl_utils) +add_subdirectory(cache_transmission/mooncake_utils) diff --git a/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.cpp b/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.cpp index b9dcc22a578..ee8e8e21b35 100644 --- a/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.cpp +++ b/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.cpp @@ -231,12 +231,12 @@ bool AgentConnection::recvReadySignal(DataContext const& ctx) const { ReadySignalInfo readySignalInfo{mAgentName, ctx, false}; mAgentConnectionManager->waitForReadySignal(mRemoteAgentName, readySignalInfo); - return true; + return readySignalInfo.mIsReady; } AgentConnectionManager::AgentConnectionManager( std::vector cacheTransBufferManagers, - CacheState cacheState) + CacheState cacheState, std::string const& backendType) : mCacheState(std::move(cacheState)) , mCacheTransBufferManagers(std::move(cacheTransBufferManagers)) , mRegMemDescs(MemoryType::kVRAM, {}) @@ -247,7 +247,7 @@ AgentConnectionManager::AgentConnectionManager( mAgentName = genUniqueAgentName(); // Create Agent BaseAgentConfig config{mAgentName, true}; - m_Agent = makeTransferAgent("nixl", &config); + m_Agent = makeTransferAgent(backendType, &config); TLLM_CHECK(!mCacheTransBufferManagers.empty()); std::vector memDescs; for (auto* cacheTransBufferManager : mCacheTransBufferManagers) diff --git a/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.h b/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.h index d5a780bf45b..6b8bd875e4a 100644 --- a/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.h +++ b/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.h @@ -277,7 +277,7 @@ class AgentConnectionManager : public ConnectionManager public: AgentConnectionManager( std::vector cacheTransBufferManagers, - CacheState cacheState); + CacheState cacheState, std::string const& backendType); ~AgentConnectionManager(); AgentConnection* recvConnect(DataContext const& ctx, void* data, size_t size) override; [[nodiscard]] std::vector getConnections(CommState const& state) override; diff --git a/cpp/tensorrt_llm/executor/cache_transmission/mooncake_utils/CMakeLists.txt b/cpp/tensorrt_llm/executor/cache_transmission/mooncake_utils/CMakeLists.txt new file mode 100644 index 00000000000..105d3b93f1f --- /dev/null +++ b/cpp/tensorrt_llm/executor/cache_transmission/mooncake_utils/CMakeLists.txt @@ -0,0 +1,40 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & +# AFFILIATES. All rights reserved. SPDX-License-Identifier: NVIDIA TensorRT +# Source Code License Agreement +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related documentation +# and any modifications thereto. 
Any use, reproduction, disclosure or +# distribution of this material and related documentation without an express +# license agreement from NVIDIA CORPORATION or its affiliates is strictly +# prohibited. + +# MOONCAKE is not supported on Rocky8 for now +set(IS_ROCKY8 FALSE) +if(EXISTS "/etc/redhat-release") + set(IS_ROCKY8 TRUE) +endif() + +if(MOONCAKE_ROOT AND NOT IS_ROCKY8) + find_library(TRANSFER_ENGINE_LIB transfer_engine ${MOONCAKE_ROOT}/lib) + find_path(TRANSFER_ENGINE_INCLUDE_DIR transfer_engine_c.h + ${MOONCAKE_ROOT}/include) + + message(STATUS "Find transfer engine results:") + message(STATUS " TRANSFER_ENGINE_LIB = ${TRANSFER_ENGINE_LIB}") + message( + STATUS " TRANSFER_ENGINE_INCLUDE_DIR = ${TRANSFER_ENGINE_INCLUDE_DIR}") + + if(TRANSFER_ENGINE_LIB AND TRANSFER_ENGINE_INCLUDE_DIR) + set(MOONCAKE_WRAPPER_TARGET "tensorrt_llm_mooncake_wrapper") + + add_library(${MOONCAKE_WRAPPER_TARGET} SHARED transferAgent.cpp) + target_compile_options(${MOONCAKE_WRAPPER_TARGET} PRIVATE -Wno-error) + + target_include_directories(${MOONCAKE_WRAPPER_TARGET} + PRIVATE ${TRANSFER_ENGINE_INCLUDE_DIR}) + + target_link_libraries(${MOONCAKE_WRAPPER_TARGET} + PRIVATE ${TRANSFER_ENGINE_LIB} CUDA::cudart) + endif() +endif() diff --git a/cpp/tensorrt_llm/executor/cache_transmission/mooncake_utils/transferAgent.cpp b/cpp/tensorrt_llm/executor/cache_transmission/mooncake_utils/transferAgent.cpp new file mode 100644 index 00000000000..eabbca98c3c --- /dev/null +++ b/cpp/tensorrt_llm/executor/cache_transmission/mooncake_utils/transferAgent.cpp @@ -0,0 +1,546 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "tensorrt_llm/executor/cache_transmission/mooncake_utils/transferAgent.h" +#include "tensorrt_llm/common/envUtils.h" +#include "tensorrt_llm/common/ipUtils.h" +#include "tensorrt_llm/common/logger.h" +#include "tensorrt_llm/executor/transferAgent.h" +#include "tensorrt_llm/runtime/utils/mpiUtils.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace tensorrt_llm::executor::kv_cache +{ + +MooncakeTransferStatus::MooncakeTransferStatus(transfer_engine_t engine, uint64_t batchId, size_t requestCount) + : mEngine{engine} + , mBatchId{batchId} + , mRequestCount{requestCount} +{ + TLLM_CHECK(mEngine); +} + +void MooncakeTransferStatus::wait() const +{ + while (!isCompleted()) + { + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } +} + +[[nodiscard]] bool MooncakeTransferStatus::isCompleted() const +{ + if (mBatchFreed) + { + return true; + } + + bool has_failed = false; + for (size_t index = 0; index < mRequestCount; ++index) + { + transfer_status_t status; + int rc = getTransferStatus(mEngine, mBatchId, index, &status); + if (rc || status.status == STATUS_FAILED) + { + has_failed = true; + if (rc) + { + TLLM_LOG_ERROR( + "Failed to get transfer status for batch %lu, task %zu: error code %d", mBatchId, index, rc); + } + else + { + TLLM_LOG_ERROR("Transfer failed for batch %lu, task %zu: status %d", mBatchId, index, status.status); + } + } + else if (status.status == STATUS_PENDING || status.status == STATUS_WAITING) + { + TLLM_LOG_DEBUG("Transfer is pending for batch %lu, task %zu", mBatchId, index); + return false; + } + } + if (!has_failed) + { + // Each batchId has the batch size, and cannot process more requests + // than the batch size. So, free the batch id here to workaround the issue + // where the same batchId could be used to post multiple transfer. + freeBatchID(mEngine, mBatchId); + mBatchFreed = true; + TLLM_LOG_DEBUG("Batch ID %lu freed, future calls will return true directly", mBatchId); + } + // Currently, we cannot distinguish between failed and completed from return value. 
+ TLLM_LOG_DEBUG("Transfer is completed for batch %lu", mBatchId); + return true; +} + +const std::string MooncakeBase64Helper::STANDARD_CHARS + = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz" + "0123456789+/"; + +std::string MooncakeBase64Helper::encode(std::vector const& data) +{ + return encodeInternal(data, STANDARD_CHARS); +} + +std::string MooncakeBase64Helper::encode(std::string const& data) +{ + std::vector vec(data.begin(), data.end()); + return encode(vec); +} + +std::vector MooncakeBase64Helper::decode(std::string const& encoded) +{ + return decodeInternal(encoded, STANDARD_CHARS); +} + +std::string MooncakeBase64Helper::decodeToString(std::string const& encoded) +{ + auto vec = decode(encoded); + return std::string(vec.begin(), vec.end()); +} + +std::string MooncakeBase64Helper::encodeInternal(std::vector const& data, std::string const& chars) +{ + std::string encoded; + size_t i = 0; + size_t j = 0; + std::array charArray3{}; + std::array charArray4{}; + size_t dataLen = data.size(); + uint8_t const* bytes = data.data(); + + while (dataLen--) + { + charArray3[i++] = *(bytes++); + if (i == 3) + { + charArray4[0] = (charArray3[0] & 0xfc) >> 2; + charArray4[1] = ((charArray3[0] & 0x03) << 4) + ((charArray3[1] & 0xf0) >> 4); + charArray4[2] = ((charArray3[1] & 0x0f) << 2) + ((charArray3[2] & 0xc0) >> 6); + charArray4[3] = charArray3[2] & 0x3f; + + for (i = 0; i < 4; i++) + { + encoded += chars[charArray4[i]]; + } + i = 0; + } + } + + if (i > 0) + { + for (j = i; j < 3; j++) + { + charArray3[j] = '\0'; + } + + charArray4[0] = (charArray3[0] & 0xfc) >> 2; + charArray4[1] = ((charArray3[0] & 0x03) << 4) + ((charArray3[1] & 0xf0) >> 4); + charArray4[2] = ((charArray3[1] & 0x0f) << 2) + ((charArray3[2] & 0xc0) >> 6); + charArray4[3] = charArray3[2] & 0x3f; + + for (j = 0; j < i + 1; j++) + { + encoded += chars[charArray4[j]]; + } + + while (i++ < 3) + { + encoded += '='; + } + } + + return encoded; +} + +std::vector MooncakeBase64Helper::decodeInternal(std::string const& encoded, std::string const& chars) +{ + size_t encodedLen = encoded.size(); + size_t i = 0; + size_t j = 0; + size_t in_ = 0; + std::array charArray3{}; + std::array charArray4{}; + std::vector decoded; + + std::string cleanEncoded; + for (char c : encoded) + { + if (!isWhitespace(c)) + { + cleanEncoded += c; + } + } + + encodedLen = cleanEncoded.size(); + + while (encodedLen-- && cleanEncoded[in_] != '=' && isBase64(cleanEncoded[in_], chars)) + { + charArray4[i++] = cleanEncoded[in_]; + in_++; + if (i == 4) + { + for (i = 0; i < 4; i++) + { + charArray4[i] = chars.find(charArray4[i]); + } + + charArray3[0] = (charArray4[0] << 2) + ((charArray4[1] & 0x30) >> 4); + charArray3[1] = ((charArray4[1] & 0xf) << 4) + ((charArray4[2] & 0x3c) >> 2); + charArray3[2] = ((charArray4[2] & 0x3) << 6) + charArray4[3]; + + for (i = 0; i < 3; i++) + { + decoded.push_back(charArray3[i]); + } + i = 0; + } + } + + if (i > 0) + { + for (j = i; j < 4; j++) + { + charArray4[j] = 0; + } + + for (j = 0; j < 4; j++) + { + charArray4[j] = chars.find(charArray4[j]); + } + + charArray3[0] = (charArray4[0] << 2) + ((charArray4[1] & 0x30) >> 4); + charArray3[1] = ((charArray4[1] & 0xf) << 4) + ((charArray4[2] & 0x3c) >> 2); + charArray3[2] = ((charArray4[2] & 0x3) << 6) + charArray4[3]; + + for (j = 0; j < i - 1; j++) + { + decoded.push_back(charArray3[j]); + } + } + + return decoded; +} + +bool MooncakeBase64Helper::isBase64(uint8_t c, std::string const& chars) +{ + return (isalnum(c) || (c == chars[62]) || (c == chars[63])); +} + 
+bool MooncakeBase64Helper::isWhitespace(uint8_t c) +{ + return (c == ' ' || c == '\n' || c == '\r' || c == '\t'); +} + +MooncakeTransferAgent::MooncakeTransferAgent(BaseAgentConfig const& config) +{ + mLocalAgentName = config.mName; + std::string segmentName = "127.0.0.1"; + + if (getenv("TLLM_MOONCAKE_IP_ADDR")) + { + segmentName = std::string(getenv("TLLM_MOONCAKE_IP_ADDR")); + } + else + { + auto ip = common::getLocalIp(common::getEnvMooncakeInterface(), mpi::MpiComm::session().getRank()); + if (!ip.empty()) + segmentName = ip; + } + + mEngine = createTransferEngine("P2PHANDSHAKE", segmentName.c_str(), "", 0, true); +} + +void MooncakeTransferAgent::registerMemory(RegisterDescs const& descs) +{ + TLLM_LOG_DEBUG("MooncakeTransferAgent::registerMemory"); + + std::lock_guard lock(mMutex); + for (auto const& desc : descs.getDescs()) + { + auto it = mMemRegInfo.find(desc.getAddr()); + if (it != mMemRegInfo.end()) + { + it->second->addRef(); + continue; + } + + int err = registerLocalMemory(mEngine, reinterpret_cast(desc.getAddr()), desc.getLen(), "*", 1); + + TLLM_CHECK_WITH_INFO(err == 0, "registerLocalMemory failed, addr: %p, len: %lu", + reinterpret_cast(desc.getAddr()), desc.getLen()); + + auto mooncakeDesc = std::make_shared(desc); + mMemRegInfo[desc.getAddr()] = std::move(mooncakeDesc); + } +} + +void MooncakeTransferAgent::deregisterMemory(RegisterDescs const& descs) +{ + TLLM_LOG_DEBUG("MooncakeTransferAgent::deregisterMemory"); + + std::lock_guard lock(mMutex); + for (auto const& desc : descs.getDescs()) + { + auto it = mMemRegInfo.find(desc.getAddr()); + if (it != mMemRegInfo.end()) + { + auto const& mooncakeDesc = it->second; + mooncakeDesc->releaseRef(); + if (mooncakeDesc->getRefCount()) + continue; + + int err = unregisterLocalMemory(mEngine, reinterpret_cast(desc.getAddr())); + + TLLM_CHECK_WITH_INFO( + err == 0, "unregisterLocalMemory failed, addr: %p", reinterpret_cast(desc.getAddr())); + + mMemRegInfo.erase(desc.getAddr()); + } + } +} + +void MooncakeTransferAgent::loadRemoteAgent(std::string const& name, AgentDesc const& agentDesc) +{ + TLLM_LOG_DEBUG("MooncakeTransferAgent::loadRemoteAgent"); + + // Do the same thing as loadRemoteAgent(std::string const& name, ConnectionInfoType const& connectionInfo) + loadRemoteAgent(name, std::move(agentDesc.getBackendAgentDesc())); +} + +void MooncakeTransferAgent::loadRemoteAgent(std::string const& name, ConnectionInfoType const& connectionInfo) +{ + TLLM_LOG_DEBUG(mpi::MpiComm::world().getRank(), + "MooncakeTransferAgent::loadRemoteAgent loadRemoteAgent to %s remoteagent name: %s", connectionInfo.c_str(), + name.c_str()); + + std::lock_guard lock(mMutex); + auto segmentId = openSegment(mEngine, connectionInfo.c_str()); + + TLLM_CHECK_WITH_INFO( + segmentId >= 0, "loadRemoteAgent openSegment failed, connectionInfo: %s", connectionInfo.c_str()); + + mConnectedAgents[name].segmentId = segmentId; +} + +void MooncakeTransferAgent::invalidateRemoteAgent(std::string const& name) +{ + TLLM_LOG_DEBUG("MooncakeTransferAgent::invalidateRemoteAgent"); +} + +AgentDesc MooncakeTransferAgent::getLocalAgentDesc() +{ + TLLM_LOG_DEBUG("MooncakeTransferAgent::getLocalAgentDesc"); + + // Using connection info as agent desc + const static size_t kBufLen = 64; + char connectionInfo[kBufLen]; + + int ret = getLocalIpAndPort(mEngine, connectionInfo, kBufLen); + + TLLM_CHECK_WITH_INFO(ret == 0, "MooncakeTransferAgent::getLocalAgentDesc::getLocalIpAndPort failed"); + + return AgentDesc{std::string(connectionInfo)}; +} + +ConnectionInfoType 
MooncakeTransferAgent::getLocalConnectionInfo() +{ + TLLM_LOG_DEBUG("MooncakeTransferAgent::getLocalConnectionInfo"); + + const static size_t kBufLen = 64; + char connectionInfo[kBufLen]; + + int ret = getLocalIpAndPort(mEngine, connectionInfo, kBufLen); + + TLLM_CHECK_WITH_INFO(ret == 0, "MooncakeTransferAgent::getLocalAgentDesc::getLocalConnectionInfo failed"); + + return std::string(connectionInfo); +} + +[[nodiscard]] std::unique_ptr MooncakeTransferAgent::submitTransferRequests( + TransferRequest const& request) +{ + TLLM_LOG_DEBUG("MooncakeTransferAgent::submitTransferRequests"); + + bool hasNotif = false; + std::string syncMessage; + + if (request.getSyncMessage().has_value()) + { + hasNotif = true; + syncMessage = request.getSyncMessage().value(); + } + + const static size_t kMaxRequestCount = 1024; + uint64_t batchId = allocateBatchID(mEngine, kMaxRequestCount); + + TLLM_CHECK_WITH_INFO(batchId != INVALID_BATCH, "allocateBatchID failed"); + + int segmentId; + { + std::lock_guard lock(mMutex); + std::string remoteName = request.getRemoteName(); + + auto it = mConnectedAgents.find(remoteName); + if (it == mConnectedAgents.end()) + { + std::string error = "Remote agent " + remoteName + "not found"; + TLLM_THROW(error); + } + + auto const& agentInfo = it->second; + segmentId = agentInfo.segmentId; + } + + auto localDescs = request.getSrcDescs().getDescs(); + auto remoteDescs = request.getDstDescs().getDescs(); + + TLLM_CHECK_WITH_INFO(localDescs.size() == remoteDescs.size(), "Number of local and remote memory must match"); + + size_t requestCount = localDescs.size(); + std::vector transferRequests(requestCount); + + for (size_t index = 0; index < requestCount; ++index) + { + TLLM_CHECK_WITH_INFO( + localDescs[index].getLen() == remoteDescs[index].getLen(), "Length of local and remote memory must match"); + + transferRequests[index].opcode = (request.getOp() == TransferOp::kREAD) ? 
OPCODE_READ : OPCODE_WRITE; + transferRequests[index].source = reinterpret_cast(localDescs[index].getAddr()); + transferRequests[index].target_offset = remoteDescs[index].getAddr(); + transferRequests[index].length = localDescs[index].getLen(); + transferRequests[index].target_id = segmentId; + } + + int rc = 0; + if (hasNotif) + { + notify_msg_t notifyMsg; + notifyMsg.name = const_cast(mLocalAgentName.c_str()); + notifyMsg.msg = const_cast(syncMessage.c_str()); + rc = submitTransferWithNotify(mEngine, batchId, transferRequests.data(), requestCount, notifyMsg); + } + else + { + rc = submitTransfer(mEngine, batchId, transferRequests.data(), requestCount); + } + + TLLM_CHECK_WITH_INFO(rc == 0, "submitTransfer failed with status: %d", rc); + + return std::make_unique(mEngine, batchId, requestCount); +} + +void MooncakeTransferAgent::notifySyncMessage(std::string const& name, SyncMessage const& syncMessage) +{ + TLLM_LOG_DEBUG("MooncakeTransferAgent::notifySyncMessage"); + int segmentId; + { + std::lock_guard lock(mMutex); + auto it = mConnectedAgents.find(name); + + if (it == mConnectedAgents.end()) + { + TLLM_LOG_WARNING("Remote agent %s not found", name.c_str()); + return; + } + + auto const& agentInfo = it->second; + segmentId = agentInfo.segmentId; + } + + notify_msg_t notifyMsg; + notifyMsg.name = const_cast(mLocalAgentName.c_str()); + std::string encoded = MooncakeBase64Helper::encode(syncMessage); + notifyMsg.msg = const_cast(encoded.c_str()); + + TLLM_LOG_DEBUG("MooncakeTransferAgent::notifySyncMessage notifyMsg.name: %s, notifyMsg.msg: %s", notifyMsg.name, + notifyMsg.msg); + + int ret = genNotifyInEngine(mEngine, segmentId, notifyMsg); + + TLLM_CHECK_WITH_INFO(ret == 0, "genNotifyInEngine failed with status: %d", ret); +} + +[[nodiscard]] std::unordered_map> MooncakeTransferAgent::getNotifiedSyncMessages() +{ + std::unordered_map> notifs; + int size = 0; + + notify_msg_t* notifyMsgs = getNotifsFromEngine(mEngine, &size); + + TLLM_CHECK_WITH_INFO(size >= 0, "getNotifsFromEngine returned negative size: %d", size); + + for (int i = 0; i < size; i++) + { + if (notifyMsgs[i].msg == nullptr) + { + TLLM_LOG_WARNING("Message pointer is null for: %s", notifyMsgs[i].name); + continue; + } + + std::string decoded = MooncakeBase64Helper::decodeToString(notifyMsgs[i].msg); + notifs[notifyMsgs[i].name].emplace_back(std::move(decoded)); + + TLLM_LOG_DEBUG("MooncakeTransferAgent::getNotifiedSyncMessages getNotifsFromEngine: %s, %s", notifyMsgs[i].name, + notifyMsgs[i].msg); + } + + freeNotifsMsgBuf(notifyMsgs, size); + return notifs; +} + +bool MooncakeTransferAgent::checkRemoteDescs(std::string const& name, MemoryDescs const& memoryDescs) +{ + TLLM_LOG_DEBUG("MooncakeTransferAgent::checkRemoteDescs"); + return true; +} + +MooncakeTransferAgent::~MooncakeTransferAgent() +{ + destroyTransferEngine(mEngine); + TLLM_LOG_DEBUG("MooncakeTransferAgent::~MooncakeTransferAgent"); +} + +#if defined(__clang__) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wreturn-type-c-linkage" +#endif + +extern "C" +{ + std::unique_ptr createMooncakeTransferAgent(BaseAgentConfig const* config) + { + TLLM_CHECK(config); + return std::make_unique(*config); + } +} + +} // namespace tensorrt_llm::executor::kv_cache diff --git a/cpp/tensorrt_llm/executor/cache_transmission/mooncake_utils/transferAgent.h b/cpp/tensorrt_llm/executor/cache_transmission/mooncake_utils/transferAgent.h new file mode 100644 index 00000000000..0aeeedeae17 --- /dev/null +++ 
b/cpp/tensorrt_llm/executor/cache_transmission/mooncake_utils/transferAgent.h @@ -0,0 +1,165 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +#include "tensorrt_llm/executor/transferAgent.h" +#include "transfer_engine_c.h" + +namespace tensorrt_llm::executor::kv_cache +{ + +class MooncakeTransferStatus final : public TransferStatus +{ +public: + MooncakeTransferStatus(transfer_engine_t engine, uint64_t batchId, size_t requestCount); + + [[nodiscard]] bool isCompleted() const override; + + void wait() const override; + +private: + transfer_engine_t mEngine; + uint64_t mBatchId; + size_t mRequestCount; + mutable bool mBatchFreed = false; +}; + +class MooncakeMemoryDesc +{ +public: + MooncakeMemoryDesc(MemoryDesc desc) + : mDesc{std::move(desc)} + , mRefCnt{0} + { + } + + MooncakeMemoryDesc(MooncakeMemoryDesc const& other) + : mDesc{other.mDesc} + , mRefCnt{0} + { + } + + MooncakeMemoryDesc& operator=(MooncakeMemoryDesc const&) = delete; + + ~MooncakeMemoryDesc() = default; + + void addRef() noexcept + { + ++mRefCnt; + } + + int releaseRef() noexcept + { + return --mRefCnt; + } + + int getRefCount() const noexcept + { + return mRefCnt; + } + + MemoryDesc const& getDesc() const noexcept + { + return mDesc; + } + +private: + MemoryDesc mDesc; + int mRefCnt; +}; + +class MooncakeBase64Helper +{ +public: + static std::string encode(std::vector const& data); + static std::string encode(std::string const& data); + + static std::vector decode(std::string const& encoded); + static std::string decodeToString(std::string const& encoded); + +private: + static const std::string STANDARD_CHARS; + + static std::string encodeInternal(std::vector const& data, std::string const& chars); + static std::vector decodeInternal(std::string const& encoded, std::string const& chars); + + static inline bool isBase64(uint8_t c, std::string const& chars); + static inline bool isWhitespace(uint8_t c); +}; + +class MooncakeTransferAgent final : public BaseTransferAgent +{ +public: + MooncakeTransferAgent(BaseAgentConfig const& config); + ~MooncakeTransferAgent(); + + void registerMemory(RegisterDescs const& descs) override; + + void deregisterMemory(RegisterDescs const& descs) override; + + void loadRemoteAgent(std::string const& name, AgentDesc const& agentDesc) override; + + void loadRemoteAgent(std::string const& name, ConnectionInfoType const& connectionInfo) override; + + void invalidateRemoteAgent(std::string const& name) override; + + AgentDesc getLocalAgentDesc() override; + + ConnectionInfoType getLocalConnectionInfo() override; + + [[nodiscard]] std::unique_ptr submitTransferRequests(TransferRequest const& request) override; + + void notifySyncMessage(std::string const& name, SyncMessage const& syncMessage) override; + + [[nodiscard]] std::unordered_map> getNotifiedSyncMessages() override; + + 
bool checkRemoteDescs(std::string const& name, MemoryDescs const& memoryDescs) override; + +private: + struct AgentInfo + { + int segmentId; + }; + + mutable std::mutex mMutex; + transfer_engine_t mEngine; + std::unordered_map> mMemRegInfo; + std::unordered_map mConnectedAgents; + std::string mLocalAgentName; +}; + +#if defined(__clang__) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wreturn-type-c-linkage" +#endif + +extern "C" +{ + [[nodiscard]] std::unique_ptr createMooncakeTransferAgent(BaseAgentConfig const* config); +} + +#if defined(__clang__) +#pragma clang diagnostic pop +#endif + +} // namespace tensorrt_llm::executor::kv_cache diff --git a/cpp/tensorrt_llm/nanobind/executor/executorConfig.cpp b/cpp/tensorrt_llm/nanobind/executor/executorConfig.cpp index bed5db70f74..051586b7fe3 100644 --- a/cpp/tensorrt_llm/nanobind/executor/executorConfig.cpp +++ b/cpp/tensorrt_llm/nanobind/executor/executorConfig.cpp @@ -449,6 +449,7 @@ void initConfigBindings(nb::module_& m) .value("MPI", tle::CacheTransceiverConfig::BackendType::MPI) .value("UCX", tle::CacheTransceiverConfig::BackendType::UCX) .value("NIXL", tle::CacheTransceiverConfig::BackendType::NIXL) + .value("MOONCAKE", tle::CacheTransceiverConfig::BackendType::MOONCAKE) .def("from_string", [](std::string const& str) { @@ -460,6 +461,8 @@ void initConfigBindings(nb::module_& m) return tle::CacheTransceiverConfig::BackendType::UCX; if (str == "NIXL" || str == "nixl") return tle::CacheTransceiverConfig::BackendType::NIXL; + if (str == "MOONCAKE" || str == "mooncake") + return tle::CacheTransceiverConfig::BackendType::MOONCAKE; throw std::runtime_error("Invalid backend type: " + str); }); diff --git a/cpp/tensorrt_llm/pybind/executor/executorConfig.cpp b/cpp/tensorrt_llm/pybind/executor/executorConfig.cpp index 79194232560..4fe20a6c664 100644 --- a/cpp/tensorrt_llm/pybind/executor/executorConfig.cpp +++ b/cpp/tensorrt_llm/pybind/executor/executorConfig.cpp @@ -431,6 +431,7 @@ void initConfigBindings(pybind11::module_& m) .value("MPI", tle::CacheTransceiverConfig::BackendType::MPI) .value("UCX", tle::CacheTransceiverConfig::BackendType::UCX) .value("NIXL", tle::CacheTransceiverConfig::BackendType::NIXL) + .value("MOONCAKE", tle::CacheTransceiverConfig::BackendType::MOONCAKE) .def("from_string", [](std::string const& str) { @@ -442,6 +443,8 @@ void initConfigBindings(pybind11::module_& m) return tle::CacheTransceiverConfig::BackendType::UCX; if (str == "NIXL" || str == "nixl") return tle::CacheTransceiverConfig::BackendType::NIXL; + if (str == "MOONCAKE" || str == "mooncake") + return tle::CacheTransceiverConfig::BackendType::MOONCAKE; throw std::runtime_error("Invalid backend type: " + str); }); diff --git a/cpp/tests/unit_tests/executor/CMakeLists.txt b/cpp/tests/unit_tests/executor/CMakeLists.txt index de3a694d21d..069363c5edb 100644 --- a/cpp/tests/unit_tests/executor/CMakeLists.txt +++ b/cpp/tests/unit_tests/executor/CMakeLists.txt @@ -38,10 +38,31 @@ add_gtest(ucxCommTest ucxCommTest.cpp) target_link_libraries(ucxCommTest PRIVATE ${Python3_LIBRARIES}) target_link_libraries(serializeUtilsTest PRIVATE ${Python3_LIBRARIES}) -if(NIXL_ROOT) - add_gtest(transferAgentTest transferAgentTest.cpp) +# Skip MOONCAKE related tests on Rocky8 +set(IS_ROCKY8 FALSE) +if(EXISTS "/etc/redhat-release") + set(IS_ROCKY8 TRUE) +endif() + +if(NIXL_ROOT OR (MOONCAKE_ROOT AND NOT IS_ROCKY8)) add_gtest(agentCommTest agentCommTest.cpp) - target_link_libraries(transferAgentTest PRIVATE tensorrt_llm_nixl_wrapper) - 
target_link_libraries(agentCommTest PRIVATE tensorrt_llm_nixl_wrapper - ${Python3_LIBRARIES}) + add_gtest(transferAgentTest transferAgentTest.cpp) + + if(NIXL_ROOT) + target_link_libraries(transferAgentTest PRIVATE tensorrt_llm_nixl_wrapper) + target_link_libraries(agentCommTest PRIVATE tensorrt_llm_nixl_wrapper + ${Python3_LIBRARIES}) + target_compile_definitions(transferAgentTest PRIVATE TEST_NIXL_BACKEND=1) + target_compile_definitions(agentCommTest PRIVATE TEST_NIXL_BACKEND=1) + endif() + + if(MOONCAKE_ROOT) + target_link_libraries(transferAgentTest + PRIVATE tensorrt_llm_mooncake_wrapper) + target_link_libraries(agentCommTest PRIVATE tensorrt_llm_mooncake_wrapper + ${Python3_LIBRARIES}) + target_compile_definitions(transferAgentTest + PRIVATE TEST_MOONCAKE_BACKEND=1) + target_compile_definitions(agentCommTest PRIVATE TEST_MOONCAKE_BACKEND=1) + endif() endif() diff --git a/cpp/tests/unit_tests/executor/agentCommTest.cpp b/cpp/tests/unit_tests/executor/agentCommTest.cpp index ccd54ab926f..025a3a8bc6a 100644 --- a/cpp/tests/unit_tests/executor/agentCommTest.cpp +++ b/cpp/tests/unit_tests/executor/agentCommTest.cpp @@ -22,22 +22,54 @@ using namespace tensorrt_llm::batch_manager::kv_cache_manager; using namespace tensorrt_llm::runtime; using namespace tensorrt_llm::executor::kv_cache; -bool needSkipTest(std::string& skipReason) +std::vector getAvailableBackends() +{ + std::vector backends; + +#ifdef TEST_NIXL_BACKEND + backends.push_back("nixl"); +#endif + +#ifdef TEST_MOONCAKE_BACKEND + backends.push_back("mooncake"); +#endif + + return backends; +} + +bool needSkipTest(std::string const& backend, std::string& skipReason) { bool skip = false; try { auto& loader = tensorrt_llm::executor::kv_cache::DynLibLoader::getInstance(); - using CreateNixlFuncType = std::unique_ptr (*)( - tensorrt_llm::executor::kv_cache::BaseAgentConfig const*); - auto* func = loader.getFunctionPointer( - "libtensorrt_llm_nixl_wrapper.so", "createNixlTransferAgent"); + if (backend == "nixl") + { + using CreateNixlFuncType = std::unique_ptr (*)( + tensorrt_llm::executor::kv_cache::BaseAgentConfig const*); + auto* func = loader.getFunctionPointer( + "libtensorrt_llm_nixl_wrapper.so", "createNixlTransferAgent"); + } + else if (backend == "mooncake") + { + using CreateMooncakeFuncType = std::unique_ptr (*)( + tensorrt_llm::executor::kv_cache::BaseAgentConfig const*); + auto* func = loader.getFunctionPointer( + "libtensorrt_llm_mooncake_wrapper.so", "createMooncakeTransferAgent"); + } + else + { + skip = true; + skipReason = "Unknown backend: " + backend; + } } catch (std::exception const& e) { std::string error = e.what(); - if (error.find("libtensorrt_llm_nixl_wrapper.so") != std::string::npos) + std::string libName + = (backend == "nixl") ? 
"libtensorrt_llm_nixl_wrapper.so" : "libtensorrt_llm_mooncake_wrapper.so"; + if (error.find(libName) != std::string::npos) { skip = true; skipReason = error; @@ -46,17 +78,26 @@ bool needSkipTest(std::string& skipReason) return skip; } -class AgentCommTest : public ::testing::Test +class AgentCommTest : public ::testing::TestWithParam { protected: void SetUp() override { + backend = GetParam(); std::string skipReason; - if (needSkipTest(skipReason)) + if (needSkipTest(backend, skipReason)) { GTEST_SKIP() << skipReason; } - setenv("TRTLLM_USE_NIXL_KVCACHE", "1", 1); + + if (backend == "nixl") + { + setenv("TRTLLM_USE_NIXL_KVCACHE", "1", 1); + } + else if (backend == "mooncake") + { + setenv("TRTLLM_USE_MOONCAKE_KVCACHE", "1", 1); + } auto constexpr numLayers = 8; auto constexpr numHeads = 16; @@ -106,15 +147,16 @@ class AgentCommTest : public ::testing::Test mCacheState.reset(); } + std::string backend; std::unique_ptr mTransBufferManager; std::unique_ptr mCacheManager; std::unique_ptr mCacheState; }; -TEST_F(AgentCommTest, AgentConnectionManagerBasic) +TEST_P(AgentCommTest, AgentConnectionManagerBasic) { std::vector bufferManagers{mTransBufferManager.get()}; - auto connectionManager = std::make_unique(bufferManagers, *mCacheState); + auto connectionManager = std::make_unique(bufferManagers, *mCacheState, backend); ASSERT_TRUE(connectionManager != nullptr); ASSERT_EQ(connectionManager->getCacheTransBufferManagers().size(), bufferManagers.size()); ASSERT_TRUE(connectionManager->getCacheTransBufferManagers().front() != nullptr); @@ -126,11 +168,11 @@ TEST_F(AgentCommTest, AgentConnectionManagerBasic) ASSERT_EQ(commState.getAgentState().size(), 1); } -TEST_F(AgentCommTest, AgentConnectionManagerConnect) +TEST_P(AgentCommTest, AgentConnectionManagerConnect) { std::vector bufferManagers{mTransBufferManager.get()}; - auto connectionManager0 = std::make_unique(bufferManagers, *mCacheState); - auto connectionManager1 = std::make_unique(bufferManagers, *mCacheState); + auto connectionManager0 = std::make_unique(bufferManagers, *mCacheState, backend); + auto connectionManager1 = std::make_unique(bufferManagers, *mCacheState, backend); auto agentName0 = connectionManager0->getAgentName(); auto agentName1 = connectionManager1->getAgentName(); ASSERT_TRUE(!agentName0.empty()); @@ -189,3 +231,6 @@ TEST_F(AgentCommTest, AgentConnectionManagerConnect) } TLLM_LOG_INFO("after finish"); } + +INSTANTIATE_TEST_SUITE_P(AvailableBackends, AgentCommTest, ::testing::ValuesIn(getAvailableBackends()), + [](::testing::TestParamInfo const& info) { return info.param; }); diff --git a/cpp/tests/unit_tests/executor/transferAgentTest.cpp b/cpp/tests/unit_tests/executor/transferAgentTest.cpp index 0f21449f30a..7218611a0e4 100644 --- a/cpp/tests/unit_tests/executor/transferAgentTest.cpp +++ b/cpp/tests/unit_tests/executor/transferAgentTest.cpp @@ -22,11 +22,27 @@ #include #include +#include namespace fs = std::filesystem; using namespace tensorrt_llm::executor::kv_cache; +std::vector getAvailableBackends() +{ + std::vector backends; + +#ifdef TEST_NIXL_BACKEND + backends.push_back("nixl"); +#endif + +#ifdef TEST_MOONCAKE_BACKEND + backends.push_back("mooncake"); +#endif + + return backends; +} + class RegisteredHostMemory { public: @@ -54,100 +70,105 @@ class RegisteredHostMemory BaseTransferAgent* mAgentPtr{}; }; -class TransferAgentTest : public ::testing::Test // NOLINT(cppcoreguidelines-pro-type-member-init) +class TransferAgentTest : public ::testing::TestWithParam // NOLINT(cppcoreguidelines-pro-type-member-init) { 
public: - void SetUp() override {} + void SetUp() override + { + backend = GetParam(); + } void TearDown() override {} [[nodiscard]] std::unique_ptr makeTransferAgent(BaseAgentConfig const& config) { - return tensorrt_llm::executor::kv_cache::makeTransferAgent("nixl", &config); + return tensorrt_llm::executor::kv_cache::makeTransferAgent(backend, &config); } + + std::string backend; }; -TEST_F(TransferAgentTest, Basic) +TEST_P(TransferAgentTest, Basic) { std::string const agent0{"agent0"}, agent1{"agent1"}; BaseAgentConfig config0{agent0, true}, config1{agent1, true}; - auto nixlAgent0 = makeTransferAgent(config0); - auto nixlAgent1 = makeTransferAgent(config1); + auto xferAgent0 = makeTransferAgent(config0); + auto xferAgent1 = makeTransferAgent(config1); - TLLM_CHECK(nixlAgent0); - TLLM_CHECK(nixlAgent1); + TLLM_CHECK(xferAgent0); + TLLM_CHECK(xferAgent1); std::vector memory0(100, 10); std::vector memory1(100, 1); - RegisteredHostMemory regMem0(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory0}}}, nixlAgent0.get()); - RegisteredHostMemory regMem1(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory1}}}, nixlAgent1.get()); + RegisteredHostMemory regMem0(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory0}}}, xferAgent0.get()); + RegisteredHostMemory regMem1(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory1}}}, xferAgent1.get()); - // nixlAgent0->loadRemoteAgent(agent1); - auto connectionInfo = nixlAgent1->getLocalConnectionInfo(); - nixlAgent0->loadRemoteAgent(agent1, connectionInfo); + // xferAgent0->loadRemoteAgent(agent1); + auto connectionInfo = xferAgent1->getLocalConnectionInfo(); + xferAgent0->loadRemoteAgent(agent1, connectionInfo); bool checked = false; do { - checked = nixlAgent0->checkRemoteDescs(agent1, regMem1.getDescs()); - // wait for regMem is unpacked by nixlAgent0 + checked = xferAgent0->checkRemoteDescs(agent1, regMem1.getDescs()); + // wait for regMem is unpacked by xferAgent0 } while (!checked); TransferRequest writeReq{TransferOp::kWRITE, regMem0.getDescs(), regMem1.getDescs(), agent1}; - auto status = nixlAgent0->submitTransferRequests(writeReq); + auto status = xferAgent0->submitTransferRequests(writeReq); status->wait(); TLLM_CHECK(memory0 == memory1); - nixlAgent0->invalidateRemoteAgent(agent1); + xferAgent0->invalidateRemoteAgent(agent1); } -TEST_F(TransferAgentTest, Basic2) +TEST_P(TransferAgentTest, Basic2) { std::string const agent0{"agent0"}, agent1{"agent1"}; BaseAgentConfig config0{agent0, true}, config1{agent1, true}; - auto nixlAgent0 = makeTransferAgent(config0); - auto nixlAgent1 = makeTransferAgent(config1); + auto xferAgent0 = makeTransferAgent(config0); + auto xferAgent1 = makeTransferAgent(config1); - TLLM_CHECK(nixlAgent0); - TLLM_CHECK(nixlAgent1); + TLLM_CHECK(xferAgent0); + TLLM_CHECK(xferAgent1); std::vector memory0(100, 10); std::vector memory1(100, 1); - RegisteredHostMemory regMem0(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory0}}}, nixlAgent0.get()); - RegisteredHostMemory regMem1(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory1}}}, nixlAgent1.get()); + RegisteredHostMemory regMem0(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory0}}}, xferAgent0.get()); + RegisteredHostMemory regMem1(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory1}}}, xferAgent1.get()); - // nixlAgent0->loadRemoteAgent(agent1); - auto connectionInfo = nixlAgent1->getLocalConnectionInfo(); - nixlAgent0->loadRemoteAgent(agent1, connectionInfo); + // xferAgent0->loadRemoteAgent(agent1); + auto connectionInfo = xferAgent1->getLocalConnectionInfo(); + 
xferAgent0->loadRemoteAgent(agent1, connectionInfo); bool checked = false; do { - checked = nixlAgent0->checkRemoteDescs(agent1, regMem1.getDescs()); + checked = xferAgent0->checkRemoteDescs(agent1, regMem1.getDescs()); } while (!checked); TransferRequest readReq{TransferOp::kREAD, regMem0.getDescs(), regMem1.getDescs(), agent1}; - auto status = nixlAgent0->submitTransferRequests(readReq); + auto status = xferAgent0->submitTransferRequests(readReq); status->wait(); TLLM_CHECK(memory0 == memory1); - nixlAgent0->invalidateRemoteAgent(agent1); + xferAgent0->invalidateRemoteAgent(agent1); } -TEST_F(TransferAgentTest, DeviceMemory) +TEST_P(TransferAgentTest, DeviceMemory) { std::string const agent0{"agent0"}, agent1{"agent1"}; BaseAgentConfig config0{agent0, true}, config1{agent1, true}; - auto nixlAgent0 = makeTransferAgent(config0); - auto nixlAgent1 = makeTransferAgent(config1); + auto xferAgent0 = makeTransferAgent(config0); + auto xferAgent1 = makeTransferAgent(config1); - TLLM_CHECK(nixlAgent0); - TLLM_CHECK(nixlAgent1); + TLLM_CHECK(xferAgent0); + TLLM_CHECK(xferAgent1); char* dev_ptr0; char* dev_ptr1; size_t size = 100; @@ -159,20 +180,20 @@ TEST_F(TransferAgentTest, DeviceMemory) cudaMemcpy(dev_ptr0, memory0.data(), size, cudaMemcpyHostToDevice); cudaMemcpy(dev_ptr1, memory1.data(), size, cudaMemcpyHostToDevice); RegisteredHostMemory regMem0( - MemoryDescs{MemoryType::kVRAM, {MemoryDesc{dev_ptr0, size, deviceId}}}, nixlAgent0.get()); + MemoryDescs{MemoryType::kVRAM, {MemoryDesc{dev_ptr0, size, deviceId}}}, xferAgent0.get()); RegisteredHostMemory regMem1( - MemoryDescs{MemoryType::kVRAM, {MemoryDesc{dev_ptr1, size, deviceId}}}, nixlAgent1.get()); + MemoryDescs{MemoryType::kVRAM, {MemoryDesc{dev_ptr1, size, deviceId}}}, xferAgent1.get()); - // nixlAgent0->loadRemoteAgent(agent1); - auto connectionInfo = nixlAgent1->getLocalConnectionInfo(); - nixlAgent0->loadRemoteAgent(agent1, connectionInfo); + // xferAgent0->loadRemoteAgent(agent1); + auto connectionInfo = xferAgent1->getLocalConnectionInfo(); + xferAgent0->loadRemoteAgent(agent1, connectionInfo); bool checked = false; do { - checked = nixlAgent0->checkRemoteDescs(agent1, regMem1.getDescs()); + checked = xferAgent0->checkRemoteDescs(agent1, regMem1.getDescs()); } while (!checked); TransferRequest writeReq{TransferOp::kWRITE, regMem0.getDescs(), regMem1.getDescs(), agent1}; - auto status = nixlAgent0->submitTransferRequests(writeReq); + auto status = xferAgent0->submitTransferRequests(writeReq); status->wait(); cudaMemcpy(memory0.data(), dev_ptr0, size, cudaMemcpyDeviceToHost); @@ -181,98 +202,99 @@ TEST_F(TransferAgentTest, DeviceMemory) TLLM_CHECK(memory0 == memory1); TLLM_CUDA_CHECK(cudaFree(dev_ptr0)); TLLM_CUDA_CHECK(cudaFree(dev_ptr1)); - nixlAgent0->invalidateRemoteAgent(agent1); + xferAgent0->invalidateRemoteAgent(agent1); } -TEST_F(TransferAgentTest, Connect) +TEST_P(TransferAgentTest, Connect) { std::string const agent0{"agent0"}, agent1{"agent1"}, agent2{"agent2"}; BaseAgentConfig config0{agent0, true}, config1{agent1, true}, config2{agent2, true}; - auto nixlAgent0 = makeTransferAgent(config0); - auto nixlAgent1 = makeTransferAgent(config1); - auto nixlAgent2 = makeTransferAgent(config2); + auto xferAgent0 = makeTransferAgent(config0); + auto xferAgent1 = makeTransferAgent(config1); + auto xferAgent2 = makeTransferAgent(config2); - TLLM_CHECK(nixlAgent0); - TLLM_CHECK(nixlAgent1); + TLLM_CHECK(xferAgent0); + TLLM_CHECK(xferAgent1); std::vector memory0(100, 10); std::vector memory1(100, 1); MemoryDescs 
memDescs0{MemoryType::kDRAM, {MemoryDesc{memory0}}};
     MemoryDescs memDescs1{MemoryType::kDRAM, {MemoryDesc{memory1}}};
-    nixlAgent0->registerMemory(memDescs0);
-    nixlAgent1->registerMemory(memDescs1);
-    nixlAgent2->registerMemory(memDescs0);
+    xferAgent0->registerMemory(memDescs0);
+    xferAgent1->registerMemory(memDescs1);
+    xferAgent2->registerMemory(memDescs0);

-    // nixlAgent0->loadRemoteAgent(agent1);
-    auto connectionInfo = nixlAgent1->getLocalConnectionInfo();
-    nixlAgent0->loadRemoteAgent(agent1, connectionInfo);
+    // xferAgent0->loadRemoteAgent(agent1);
+    auto connectionInfo = xferAgent1->getLocalConnectionInfo();
+    xferAgent0->loadRemoteAgent(agent1, connectionInfo);

     bool checked = false;
     do
     {
-        checked = nixlAgent0->checkRemoteDescs(agent1, memDescs1);
+        checked = xferAgent0->checkRemoteDescs(agent1, memDescs1);
     } while (!checked);

     TransferRequest writeReq{TransferOp::kWRITE, memDescs0, memDescs1, agent1};
-    auto status = nixlAgent0->submitTransferRequests(writeReq);
+    auto status = xferAgent0->submitTransferRequests(writeReq);
     status->wait();
     TLLM_CHECK(memory0 == memory1);

-    nixlAgent2->loadRemoteAgent(agent1, connectionInfo);
+    xferAgent2->loadRemoteAgent(agent1, connectionInfo);
     checked = false;
     do
     {
-        checked = nixlAgent2->checkRemoteDescs(agent1, memDescs1);
+        checked = xferAgent2->checkRemoteDescs(agent1, memDescs1);
     } while (!checked);

     TransferRequest writeReq2{TransferOp::kWRITE, memDescs0, memDescs1, agent1};
-    auto status2 = nixlAgent2->submitTransferRequests(writeReq2);
+    auto status2 = xferAgent2->submitTransferRequests(writeReq2);
     status2->wait();
     TLLM_CHECK(memory0 == memory1);

-    nixlAgent0->invalidateRemoteAgent(agent1);
-    nixlAgent2->invalidateRemoteAgent(agent1);
-    nixlAgent0->deregisterMemory(memDescs0);
-    nixlAgent1->deregisterMemory(memDescs1);
-    nixlAgent2->deregisterMemory(memDescs0);
+    xferAgent0->invalidateRemoteAgent(agent1);
+    xferAgent2->invalidateRemoteAgent(agent1);
+    xferAgent0->deregisterMemory(memDescs0);
+    xferAgent1->deregisterMemory(memDescs1);
+    xferAgent2->deregisterMemory(memDescs0);
 }

-TEST_F(TransferAgentTest, SyncMessage)
+TEST_P(TransferAgentTest, SyncMessage)
 {
     constexpr std::size_t MAX_QUERY_TIMES = std::numeric_limits::max();
     std::string const agent0{"agent0"}, agent1{"agent1"};
     BaseAgentConfig config0{agent0, true}, config1{agent1, true};
-    auto nixlAgent0 = makeTransferAgent(config0);
-    auto nixlAgent1 = makeTransferAgent(config1);
+    auto xferAgent0 = makeTransferAgent(config0);
+    auto xferAgent1 = makeTransferAgent(config1);

-    TLLM_CHECK(nixlAgent0);
-    TLLM_CHECK(nixlAgent1);
+    TLLM_CHECK(xferAgent0);
+    TLLM_CHECK(xferAgent1);

     std::vector memory0(100, 10);
     std::vector memory1(100, 1);
-    RegisteredHostMemory regMem0(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory0}}}, nixlAgent0.get());
-    RegisteredHostMemory regMem1(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory1}}}, nixlAgent0.get());
+    RegisteredHostMemory regMem0(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory0}}}, xferAgent0.get());
+    RegisteredHostMemory regMem1(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory1}}}, xferAgent0.get());

-    RegisteredHostMemory regMem2(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory0}}}, nixlAgent1.get());
-    RegisteredHostMemory regMem3(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory1}}}, nixlAgent1.get());
+    RegisteredHostMemory regMem2(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory0}}}, xferAgent1.get());
+    RegisteredHostMemory regMem3(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory1}}}, xferAgent1.get());

-    // nixlAgent0->loadRemoteAgent(agent1);
-    auto connectionInfo = nixlAgent1->getLocalConnectionInfo();
-    nixlAgent0->loadRemoteAgent(agent1, connectionInfo);
+    // xferAgent0->loadRemoteAgent(agent1);
+    auto connectionInfo = xferAgent1->getLocalConnectionInfo();
+    xferAgent0->loadRemoteAgent(agent1, connectionInfo);

     bool checked = false;
     do
     {
-        checked = nixlAgent0->checkRemoteDescs(agent1, regMem3.getDescs());
+        checked = xferAgent0->checkRemoteDescs(agent1, regMem3.getDescs());
     } while (!checked);

     auto syncMessage = std::string("agent_sync_message");
     TransferRequest writeReq{TransferOp::kWRITE, regMem0.getDescs(), regMem3.getDescs(), agent1};
-    auto status = nixlAgent0->submitTransferRequests(writeReq);
-    nixlAgent0->notifySyncMessage(agent1, syncMessage);
+    auto status = xferAgent0->submitTransferRequests(writeReq);
+    xferAgent0->notifySyncMessage(agent1, syncMessage);

-    auto notif = nixlAgent1->getNotifiedSyncMessages();
+    auto notif = xferAgent1->getNotifiedSyncMessages();
     for (std::size_t i = 0; i < MAX_QUERY_TIMES && notif.size() == 0; i++)
     {
-        notif = nixlAgent1->getNotifiedSyncMessages();
+        notif = xferAgent1->getNotifiedSyncMessages();
     }
+    status->wait();
     TLLM_CHECK(status->isCompleted());
     TLLM_CHECK(notif.size() == 1);
     TLLM_CHECK(notif[agent0].size() == 1);
@@ -281,25 +303,25 @@ TEST_F(TransferAgentTest, SyncMessage)
     TLLM_CHECK(memory0 == memory1);

     std::string syncMessage2 = "two_agent_sync_message";
-    nixlAgent0->notifySyncMessage(agent1, syncMessage2);
-    auto notif2 = nixlAgent1->getNotifiedSyncMessages();
+    xferAgent0->notifySyncMessage(agent1, syncMessage2);
+    auto notif2 = xferAgent1->getNotifiedSyncMessages();
     for (std::size_t i = 0; i < MAX_QUERY_TIMES && notif2.size() == 0; i++)
     {
-        notif2 = nixlAgent1->getNotifiedSyncMessages();
+        notif2 = xferAgent1->getNotifiedSyncMessages();
     }
     TLLM_CHECK(notif2.size() == 1);
     TLLM_CHECK(notif2[agent0].size() == 1);
     TLLM_CHECK(notif2[agent0][0] == syncMessage2);

-    // nixlAgent1->loadRemoteAgent(agent0);
-    auto connectionInfo2 = nixlAgent0->getLocalConnectionInfo();
-    nixlAgent1->loadRemoteAgent(agent0, connectionInfo2);
+    // xferAgent1->loadRemoteAgent(agent0);
+    auto connectionInfo2 = xferAgent0->getLocalConnectionInfo();
+    xferAgent1->loadRemoteAgent(agent0, connectionInfo2);
     std::string syncMessage3 = "three_agent_sync_message";
-    nixlAgent1->notifySyncMessage(agent0, syncMessage3);
-    auto notif3 = nixlAgent0->getNotifiedSyncMessages();
+    xferAgent1->notifySyncMessage(agent0, syncMessage3);
+    auto notif3 = xferAgent0->getNotifiedSyncMessages();
     for (std::size_t i = 0; i < MAX_QUERY_TIMES && notif3.size() == 0; i++)
     {
-        notif3 = nixlAgent0->getNotifiedSyncMessages();
+        notif3 = xferAgent0->getNotifiedSyncMessages();
     }
     TLLM_CHECK(notif3.size() == 1);
     TLLM_CHECK(notif3[agent1].size() == 1);
@@ -308,19 +330,20 @@ TEST_F(TransferAgentTest, SyncMessage)
     bool checked2 = false;
     do
     {
-        checked2 = nixlAgent0->checkRemoteDescs(agent1, regMem1.getDescs());
+        checked2 = xferAgent0->checkRemoteDescs(agent1, regMem1.getDescs());
     } while (!checked2);

     std::string syncMessage4 = "four_agent_sync_message";
     TransferRequest writeReq1{TransferOp::kWRITE, regMem2.getDescs(), regMem1.getDescs(), agent0};
-    auto status1 = nixlAgent1->submitTransferRequests(writeReq1);
-    nixlAgent1->notifySyncMessage(agent0, syncMessage4);
+    auto status1 = xferAgent1->submitTransferRequests(writeReq1);
+    xferAgent1->notifySyncMessage(agent0, syncMessage4);

-    auto notif4 = nixlAgent0->getNotifiedSyncMessages();
+    auto notif4 = xferAgent0->getNotifiedSyncMessages();
     for (std::size_t i = 0; i < MAX_QUERY_TIMES && notif4.size() == 0; i++)
     {
-        notif4 = nixlAgent0->getNotifiedSyncMessages();
+        notif4 = xferAgent0->getNotifiedSyncMessages();
     }
+    status1->wait();
     TLLM_CHECK(status1->isCompleted());
     TLLM_CHECK(notif4.size() == 1);
     TLLM_CHECK(notif4[agent1].size() == 1);
@@ -335,11 +358,11 @@ TEST_F(TransferAgentTest, SyncMessage)
     std::stringstream ss;
     Serialization::serialize(state, ss);
     std::string serializedState = ss.str();
-    nixlAgent0->notifySyncMessage(agent1, serializedState);
-    auto notif5 = nixlAgent1->getNotifiedSyncMessages();
+    xferAgent0->notifySyncMessage(agent1, serializedState);
+    auto notif5 = xferAgent1->getNotifiedSyncMessages();
     for (size_t i = 0; i < MAX_QUERY_TIMES && notif5.size() == 0; i++)
     {
-        notif5 = nixlAgent1->getNotifiedSyncMessages();
+        notif5 = xferAgent1->getNotifiedSyncMessages();
     }
     TLLM_CHECK(notif5.size() == 1);
     TLLM_CHECK(notif5[agent0].size() == 1);
@@ -348,10 +371,16 @@ TEST_F(TransferAgentTest, SyncMessage)
     auto state2 = Serialization::deserializeCommState(ss2);
     TLLM_CHECK(state2 == state);

-    nixlAgent0->invalidateRemoteAgent(agent1);
-    nixlAgent1->invalidateRemoteAgent(agent0);
+    xferAgent0->invalidateRemoteAgent(agent1);
+    xferAgent1->invalidateRemoteAgent(agent0);
 }

+INSTANTIATE_TEST_SUITE_P(AvailableBackends, TransferAgentTest, ::testing::ValuesIn(getAvailableBackends()),
+    [](::testing::TestParamInfo const& info) { return info.param; });
+
+// Skip LoopbackAgentTest for mooncake backend for now
+#ifdef TEST_NIXL_BACKEND
+
 class LoopbackAgentTest : public ::testing::Test, public ::testing::WithParamInterface // NOLINT(cppcoreguidelines-pro-type-member-init)
 {
@@ -466,3 +495,5 @@ TEST_P(LoopbackAgentTest, GpuToFile)
 }

 INSTANTIATE_TEST_SUITE_P(, LoopbackAgentTest, ::testing::Values(true, false));
+
+#endif // TEST_NIXL_BACKEND
diff --git a/cpp/tests/unit_tests/multi_gpu/cacheTransceiverTest.cpp b/cpp/tests/unit_tests/multi_gpu/cacheTransceiverTest.cpp
index 17ca989eee5..41dd8e7a92c 100644
--- a/cpp/tests/unit_tests/multi_gpu/cacheTransceiverTest.cpp
+++ b/cpp/tests/unit_tests/multi_gpu/cacheTransceiverTest.cpp
@@ -46,6 +46,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -713,7 +714,7 @@ class AsymmetricalCacheTest : public ::testing::TestWithParam
             (bufferManagers, *mCacheState);
+                = std::make_unique(bufferManagers, *mCacheState, "nixl");
+        }
+        else if (isMooncake)
+        {
+            mConnectionManager = std::make_unique(
+                bufferManagers, *mCacheState, "mooncake");
         }
         else
         {
@@ -783,7 +797,7 @@ class AsymmetricalCacheTest : public ::testing::TestWithParam
         contextRankVec(mContextRankSize);
         std::iota(contextRankVec.begin(), contextRankVec.end(), 0);
-        if (isUcx || isNixl)
+        if (isUcx || isNixl || isMooncake)
         {
             auto commState = mConnectionManager->getCommState();
             namespace su = tensorrt_llm::executor::serialize_utils;
@@ -1286,9 +1300,9 @@ TEST_P(AsymmetricalCacheTest, TestCase)
     int indexerDimPerHead = std::get<17>(param);
     int indexerKCacheQuantBlockSize = std::get<18>(param);
-    if (genCp > 1 && tensorrt_llm::common::getEnvUseNixlKvCache())
+    if (genCp > 1 && (tensorrt_llm::common::getEnvUseNixlKvCache() || tensorrt_llm::common::getEnvUseMooncakeKvCache()))
     {
-        GTEST_SKIP() << "Temporarily skipping cache transceiver tests with NIXL backend for CP.";
+        GTEST_SKIP() << "Temporarily skipping cache transceiver tests with NIXL and MOONCAKE backend for CP.";
     }
     std::vector lenList = {30, 10, 60, 80};
     if (genCp > 1)
@@ -1410,9 +1424,9 @@ TEST_P(AsymmetricalCacheTestWithDP, TestCase)
     int indexerDimPerHead =
std::get<17>(param); int indexerKCacheQuantBlockSize = std::get<18>(param); - if (genCp > 1 && tensorrt_llm::common::getEnvUseNixlKvCache()) + if (genCp > 1 && (tensorrt_llm::common::getEnvUseNixlKvCache() || tensorrt_llm::common::getEnvUseMooncakeKvCache())) { - GTEST_SKIP() << "Temporarily skipping cache transceiver tests with NIXL backend for CP."; + GTEST_SKIP() << "Temporarily skipping cache transceiver tests with NIXL and MOONCAKE backend for CP."; } setUpCommunicator(contextTp, contextPp, contextCp, genTp, genPp, genCp, isMLA, contextDP, generationDP); diff --git a/docs/source/deployment-guide/config_table.rst b/docs/source/deployment-guide/config_table.rst index d28fed25a8e..c2e1e5b55df 100644 --- a/docs/source/deployment-guide/config_table.rst +++ b/docs/source/deployment-guide/config_table.rst @@ -167,162 +167,162 @@ - 4 - `1k1k_tp4_conc4.yaml `_ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc4.yaml`` - * - 8xB200_NVL - - Low Latency - - 1024 / 1024 - - 4 - - `1k1k_tp8_conc4.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc4.yaml`` * - 4xB200_NVL - Low Latency - 1024 / 1024 - 8 - `1k1k_tp4_conc8.yaml `_ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc8.yaml`` - * - 8xB200_NVL - - Low Latency - - 1024 / 1024 - - 8 - - `1k1k_tp8_conc8.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc8.yaml`` * - 4xB200_NVL - Low Latency - 1024 / 1024 - 16 - `1k1k_tp4_conc16.yaml `_ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc16.yaml`` - * - 8xB200_NVL - - Low Latency - - 1024 / 1024 - - 16 - - `1k1k_tp8_conc16.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc16.yaml`` * - 4xB200_NVL - - Low Latency + - Balanced - 1024 / 1024 - 32 - `1k1k_tp4_conc32.yaml `_ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc32.yaml`` - * - 8xB200_NVL - - High Throughput - - 1024 / 1024 - - 32 - - `1k1k_tp8_conc32.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc32.yaml`` * - 4xB200_NVL - High Throughput - 1024 / 1024 - 64 - `1k1k_tp4_conc64.yaml `_ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc64.yaml`` - * - 8xB200_NVL - - High Throughput - - 1024 / 1024 - - 64 - - `1k1k_tp8_conc64.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc64.yaml`` * - 4xB200_NVL - High Throughput - 1024 / 1024 - 128 - `1k1k_tp4_conc128.yaml `_ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options 
${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc128.yaml`` - * - 8xB200_NVL - - High Throughput - - 1024 / 1024 - - 128 - - `1k1k_tp8_conc128.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc128.yaml`` * - 4xB200_NVL - - High Throughput + - Max Throughput - 1024 / 1024 - 256 - `1k1k_tp4_conc256.yaml `_ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc256.yaml`` - * - 8xB200_NVL - - Max Throughput - - 1024 / 1024 - - 256 - - `1k1k_tp8_conc256.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc256.yaml`` * - 4xB200_NVL - Min Latency - 8192 / 1024 - 4 - `8k1k_tp4_conc4.yaml `_ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc4.yaml`` - * - 8xB200_NVL - - Low Latency - - 8192 / 1024 - - 4 - - `8k1k_tp8_conc4.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc4.yaml`` * - 4xB200_NVL - Low Latency - 8192 / 1024 - 8 - `8k1k_tp4_conc8.yaml `_ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc8.yaml`` - * - 8xB200_NVL - - Low Latency - - 8192 / 1024 - - 8 - - `8k1k_tp8_conc8.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc8.yaml`` * - 4xB200_NVL - Low Latency - 8192 / 1024 - 16 - `8k1k_tp4_conc16.yaml `_ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc16.yaml`` - * - 8xB200_NVL - - Low Latency - - 8192 / 1024 - - 16 - - `8k1k_tp8_conc16.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc16.yaml`` * - 4xB200_NVL - - Low Latency + - Balanced - 8192 / 1024 - 32 - `8k1k_tp4_conc32.yaml `_ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc32.yaml`` - * - 8xB200_NVL - - High Throughput - - 8192 / 1024 - - 32 - - `8k1k_tp8_conc32.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc32.yaml`` * - 4xB200_NVL - High Throughput - 8192 / 1024 - 64 - `8k1k_tp4_conc64.yaml `_ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc64.yaml`` - * - 8xB200_NVL - - High Throughput - - 8192 / 1024 - - 64 - - `8k1k_tp8_conc64.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc64.yaml`` * - 4xB200_NVL - High Throughput - 8192 / 1024 - 128 - `8k1k_tp4_conc128.yaml `_ - ``trtllm-serve 
nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc128.yaml`` + * - 4xB200_NVL + - Max Throughput + - 8192 / 1024 + - 256 + - `8k1k_tp4_conc256.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc256.yaml`` + * - 8xB200_NVL + - Min Latency + - 1024 / 1024 + - 4 + - `1k1k_tp8_conc4.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc4.yaml`` + * - 8xB200_NVL + - Low Latency + - 1024 / 1024 + - 8 + - `1k1k_tp8_conc8.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc8.yaml`` + * - 8xB200_NVL + - Low Latency + - 1024 / 1024 + - 16 + - `1k1k_tp8_conc16.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc16.yaml`` + * - 8xB200_NVL + - Balanced + - 1024 / 1024 + - 32 + - `1k1k_tp8_conc32.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc32.yaml`` + * - 8xB200_NVL + - High Throughput + - 1024 / 1024 + - 64 + - `1k1k_tp8_conc64.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc64.yaml`` + * - 8xB200_NVL + - High Throughput + - 1024 / 1024 + - 128 + - `1k1k_tp8_conc128.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc128.yaml`` + * - 8xB200_NVL + - Max Throughput + - 1024 / 1024 + - 256 + - `1k1k_tp8_conc256.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc256.yaml`` + * - 8xB200_NVL + - Min Latency + - 8192 / 1024 + - 4 + - `8k1k_tp8_conc4.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc4.yaml`` + * - 8xB200_NVL + - Low Latency + - 8192 / 1024 + - 8 + - `8k1k_tp8_conc8.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc8.yaml`` + * - 8xB200_NVL + - Low Latency + - 8192 / 1024 + - 16 + - `8k1k_tp8_conc16.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc16.yaml`` + * - 8xB200_NVL + - Balanced + - 8192 / 1024 + - 32 + - `8k1k_tp8_conc32.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc32.yaml`` + * - 8xB200_NVL + - High Throughput + - 8192 / 1024 + - 64 + - `8k1k_tp8_conc64.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc64.yaml`` * - 8xB200_NVL - High Throughput - 
8192 / 1024 - 128 - `8k1k_tp8_conc128.yaml `_ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc128.yaml`` - * - 4xB200_NVL - - High Throughput - - 8192 / 1024 - - 256 - - `8k1k_tp4_conc256.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc256.yaml`` * - 8xB200_NVL - Max Throughput - 8192 / 1024 @@ -356,714 +356,714 @@ - 4 - `1k1k_tp1_conc4.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc4.yaml`` - * - 2xB200_NVL - - Low Latency - - 1024 / 1024 - - 4 - - `1k1k_tp2_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc4.yaml`` - * - 4xB200_NVL - - Low Latency - - 1024 / 1024 - - 4 - - `1k1k_tp4_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc4.yaml`` - * - 8xB200_NVL - - Low Latency - - 1024 / 1024 - - 4 - - `1k1k_tp8_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc4.yaml`` * - B200_NVL - Low Latency - 1024 / 1024 - 8 - `1k1k_tp1_conc8.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc8.yaml`` - * - 2xB200_NVL - - Low Latency - - 1024 / 1024 - - 8 - - `1k1k_tp2_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc8.yaml`` - * - 4xB200_NVL - - Low Latency - - 1024 / 1024 - - 8 - - `1k1k_tp4_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc8.yaml`` - * - 8xB200_NVL - - Low Latency - - 1024 / 1024 - - 8 - - `1k1k_tp8_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc8.yaml`` * - B200_NVL - - Low Latency + - Balanced - 1024 / 1024 - 16 - `1k1k_tp1_conc16.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc16.yaml`` - * - 2xB200_NVL - - Low Latency - - 1024 / 1024 - - 16 - - `1k1k_tp2_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc16.yaml`` - * - 4xB200_NVL - - High Throughput - - 1024 / 1024 - - 16 - - `1k1k_tp4_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc16.yaml`` - * - 8xB200_NVL - - High Throughput - - 1024 / 1024 - - 16 - - `1k1k_tp8_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc16.yaml`` * - B200_NVL - High Throughput - 1024 / 1024 - 32 - `1k1k_tp1_conc32.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc32.yaml`` - * - 2xB200_NVL - - High 
Throughput - - 1024 / 1024 - - 32 - - `1k1k_tp2_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc32.yaml`` - * - 4xB200_NVL - - High Throughput - - 1024 / 1024 - - 32 - - `1k1k_tp4_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc32.yaml`` - * - 8xB200_NVL - - High Throughput - - 1024 / 1024 - - 32 - - `1k1k_tp8_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc32.yaml`` * - B200_NVL - - High Throughput + - Max Throughput - 1024 / 1024 - 64 - `1k1k_tp1_conc64.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc64.yaml`` - * - 2xB200_NVL - - High Throughput - - 1024 / 1024 - - 64 - - `1k1k_tp2_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc64.yaml`` - * - 4xB200_NVL - - High Throughput - - 1024 / 1024 - - 64 - - `1k1k_tp4_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc64.yaml`` - * - 8xB200_NVL - - Max Throughput - - 1024 / 1024 - - 64 - - `1k1k_tp8_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc64.yaml`` * - B200_NVL - Min Latency - 1024 / 8192 - 4 - `1k8k_tp1_conc4.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc4.yaml`` - * - 2xB200_NVL - - Low Latency - - 1024 / 8192 - - 4 - - `1k8k_tp2_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc4.yaml`` - * - 4xB200_NVL - - Low Latency - - 1024 / 8192 - - 4 - - `1k8k_tp4_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc4.yaml`` - * - 8xB200_NVL - - Low Latency - - 1024 / 8192 - - 4 - - `1k8k_tp8_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc4.yaml`` * - B200_NVL - Low Latency - 1024 / 8192 - 8 - `1k8k_tp1_conc8.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc8.yaml`` - * - 2xB200_NVL - - Low Latency - - 1024 / 8192 - - 8 - - `1k8k_tp2_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc8.yaml`` - * - 4xB200_NVL - - Low Latency - - 1024 / 8192 - - 8 - - `1k8k_tp4_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc8.yaml`` - * - 8xB200_NVL - - Low Latency - - 1024 / 8192 - - 8 - - `1k8k_tp8_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc8.yaml`` * - B200_NVL - - Low Latency + - Balanced - 1024 / 8192 - 
16 - `1k8k_tp1_conc16.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc16.yaml`` - * - 2xB200_NVL - - Low Latency - - 1024 / 8192 - - 16 - - `1k8k_tp2_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc16.yaml`` - * - 4xB200_NVL - - High Throughput - - 1024 / 8192 - - 16 - - `1k8k_tp4_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc16.yaml`` - * - 8xB200_NVL - - High Throughput - - 1024 / 8192 - - 16 - - `1k8k_tp8_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc16.yaml`` * - B200_NVL - High Throughput - 1024 / 8192 - 32 - `1k8k_tp1_conc32.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc32.yaml`` - * - 2xB200_NVL - - High Throughput - - 1024 / 8192 - - 32 - - `1k8k_tp2_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc32.yaml`` - * - 4xB200_NVL - - High Throughput - - 1024 / 8192 - - 32 - - `1k8k_tp4_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc32.yaml`` - * - 8xB200_NVL - - High Throughput - - 1024 / 8192 - - 32 - - `1k8k_tp8_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc32.yaml`` * - B200_NVL - - High Throughput + - Max Throughput - 1024 / 8192 - 64 - `1k8k_tp1_conc64.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc64.yaml`` - * - 2xB200_NVL - - High Throughput - - 1024 / 8192 - - 64 - - `1k8k_tp2_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc64.yaml`` - * - 4xB200_NVL - - High Throughput - - 1024 / 8192 - - 64 - - `1k8k_tp4_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc64.yaml`` - * - 8xB200_NVL - - Max Throughput - - 1024 / 8192 - - 64 - - `1k8k_tp8_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc64.yaml`` * - B200_NVL - Min Latency - 8192 / 1024 - 4 - `8k1k_tp1_conc4.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc4.yaml`` - * - 2xB200_NVL - - Low Latency - - 8192 / 1024 - - 4 - - `8k1k_tp2_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc4.yaml`` - * - 4xB200_NVL - - Low Latency - - 8192 / 1024 - - 4 - - `8k1k_tp4_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc4.yaml`` - * - 8xB200_NVL - - Low Latency - - 8192 / 1024 - - 4 - - 
`8k1k_tp8_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc4.yaml`` * - B200_NVL - Low Latency - 8192 / 1024 - 8 - `8k1k_tp1_conc8.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc8.yaml`` - * - 2xB200_NVL - - Low Latency - - 8192 / 1024 - - 8 - - `8k1k_tp2_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc8.yaml`` - * - 4xB200_NVL - - Low Latency - - 8192 / 1024 - - 8 - - `8k1k_tp4_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc8.yaml`` - * - 8xB200_NVL - - Low Latency - - 8192 / 1024 - - 8 - - `8k1k_tp8_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc8.yaml`` * - B200_NVL - - Low Latency + - Balanced - 8192 / 1024 - 16 - `8k1k_tp1_conc16.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc16.yaml`` - * - 2xB200_NVL - - Low Latency - - 8192 / 1024 - - 16 - - `8k1k_tp2_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc16.yaml`` - * - 4xB200_NVL - - High Throughput - - 8192 / 1024 - - 16 - - `8k1k_tp4_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc16.yaml`` - * - 8xB200_NVL - - High Throughput - - 8192 / 1024 - - 16 - - `8k1k_tp8_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc16.yaml`` * - B200_NVL - High Throughput - 8192 / 1024 - 32 - `8k1k_tp1_conc32.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc32.yaml`` - * - 2xB200_NVL - - High Throughput - - 8192 / 1024 - - 32 - - `8k1k_tp2_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc32.yaml`` - * - 4xB200_NVL - - High Throughput - - 8192 / 1024 - - 32 - - `8k1k_tp4_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc32.yaml`` - * - 8xB200_NVL - - High Throughput - - 8192 / 1024 - - 32 - - `8k1k_tp8_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc32.yaml`` * - B200_NVL - - High Throughput + - Max Throughput - 8192 / 1024 - 64 - `8k1k_tp1_conc64.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc64.yaml`` * - 2xB200_NVL - - High Throughput - - 8192 / 1024 - - 64 - - `8k1k_tp2_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc64.yaml`` - * - 4xB200_NVL - - High Throughput - - 8192 / 1024 - - 64 - - 
`8k1k_tp4_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc64.yaml`` - * - 8xB200_NVL - - Max Throughput - - 8192 / 1024 - - 64 - - `8k1k_tp8_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc64.yaml`` - * - H200_SXM - Min Latency - 1024 / 1024 - 4 - - `1k1k_tp1_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc4.yaml`` - * - 2xH200_SXM + - `1k1k_tp2_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc4.yaml`` + * - 2xB200_NVL - Low Latency - 1024 / 1024 - - 4 - - `1k1k_tp2_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc4.yaml`` - * - 4xH200_SXM - - Low Latency + - 8 + - `1k1k_tp2_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc8.yaml`` + * - 2xB200_NVL + - Balanced - 1024 / 1024 - - 4 - - `1k1k_tp4_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc4.yaml`` - * - 8xH200_SXM - - Low Latency + - 16 + - `1k1k_tp2_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc16.yaml`` + * - 2xB200_NVL + - High Throughput - 1024 / 1024 - - 4 - - `1k1k_tp8_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc4.yaml`` - * - H200_SXM - - Low Latency + - 32 + - `1k1k_tp2_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc32.yaml`` + * - 2xB200_NVL + - Max Throughput - 1024 / 1024 - - 8 - - `1k1k_tp1_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc8.yaml`` - * - 2xH200_SXM + - 64 + - `1k1k_tp2_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc64.yaml`` + * - 2xB200_NVL + - Min Latency + - 1024 / 8192 + - 4 + - `1k8k_tp2_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc4.yaml`` + * - 2xB200_NVL - Low Latency - - 1024 / 1024 + - 1024 / 8192 - 8 - - `1k1k_tp2_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc8.yaml`` - * - 4xH200_SXM - - Low Latency + - `1k8k_tp2_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc8.yaml`` + * - 2xB200_NVL + - Balanced + - 1024 / 8192 + - 16 + - `1k8k_tp2_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc16.yaml`` + * - 
2xB200_NVL + - High Throughput + - 1024 / 8192 + - 32 + - `1k8k_tp2_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc32.yaml`` + * - 2xB200_NVL + - Max Throughput + - 1024 / 8192 + - 64 + - `1k8k_tp2_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc64.yaml`` + * - 2xB200_NVL + - Min Latency + - 8192 / 1024 + - 4 + - `8k1k_tp2_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc4.yaml`` + * - 2xB200_NVL + - Low Latency + - 8192 / 1024 + - 8 + - `8k1k_tp2_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc8.yaml`` + * - 2xB200_NVL + - Balanced + - 8192 / 1024 + - 16 + - `8k1k_tp2_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc16.yaml`` + * - 2xB200_NVL + - High Throughput + - 8192 / 1024 + - 32 + - `8k1k_tp2_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc32.yaml`` + * - 2xB200_NVL + - Max Throughput + - 8192 / 1024 + - 64 + - `8k1k_tp2_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc64.yaml`` + * - 4xB200_NVL + - Min Latency + - 1024 / 1024 + - 4 + - `1k1k_tp4_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc4.yaml`` + * - 4xB200_NVL + - Low Latency + - 1024 / 1024 + - 8 + - `1k1k_tp4_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc8.yaml`` + * - 4xB200_NVL + - Balanced + - 1024 / 1024 + - 16 + - `1k1k_tp4_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc16.yaml`` + * - 4xB200_NVL + - High Throughput + - 1024 / 1024 + - 32 + - `1k1k_tp4_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc32.yaml`` + * - 4xB200_NVL + - Max Throughput + - 1024 / 1024 + - 64 + - `1k1k_tp4_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc64.yaml`` + * - 4xB200_NVL + - Min Latency + - 1024 / 8192 + - 4 + - `1k8k_tp4_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc4.yaml`` + * - 4xB200_NVL + - Low Latency + - 1024 / 8192 + - 8 + - `1k8k_tp4_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc8.yaml`` + * - 4xB200_NVL + - Balanced + - 1024 / 8192 + - 16 + - `1k8k_tp4_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc16.yaml`` + * - 4xB200_NVL + - High 
Throughput + - 1024 / 8192 + - 32 + - `1k8k_tp4_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc32.yaml`` + * - 4xB200_NVL + - Max Throughput + - 1024 / 8192 + - 64 + - `1k8k_tp4_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc64.yaml`` + * - 4xB200_NVL + - Min Latency + - 8192 / 1024 + - 4 + - `8k1k_tp4_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc4.yaml`` + * - 4xB200_NVL + - Low Latency + - 8192 / 1024 + - 8 + - `8k1k_tp4_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc8.yaml`` + * - 4xB200_NVL + - Balanced + - 8192 / 1024 + - 16 + - `8k1k_tp4_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc16.yaml`` + * - 4xB200_NVL + - High Throughput + - 8192 / 1024 + - 32 + - `8k1k_tp4_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc32.yaml`` + * - 4xB200_NVL + - Max Throughput + - 8192 / 1024 + - 64 + - `8k1k_tp4_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc64.yaml`` + * - 8xB200_NVL + - Min Latency + - 1024 / 1024 + - 4 + - `1k1k_tp8_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc4.yaml`` + * - 8xB200_NVL + - Low Latency + - 1024 / 1024 + - 8 + - `1k1k_tp8_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc8.yaml`` + * - 8xB200_NVL + - Balanced + - 1024 / 1024 + - 16 + - `1k1k_tp8_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc16.yaml`` + * - 8xB200_NVL + - High Throughput + - 1024 / 1024 + - 32 + - `1k1k_tp8_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc32.yaml`` + * - 8xB200_NVL + - Max Throughput + - 1024 / 1024 + - 64 + - `1k1k_tp8_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc64.yaml`` + * - 8xB200_NVL + - Min Latency + - 1024 / 8192 + - 4 + - `1k8k_tp8_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc4.yaml`` + * - 8xB200_NVL + - Low Latency + - 1024 / 8192 + - 8 + - `1k8k_tp8_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc8.yaml`` + * - 8xB200_NVL + - Balanced + - 1024 / 8192 + - 16 + - `1k8k_tp8_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc16.yaml`` + * - 8xB200_NVL + - High Throughput + - 1024 
/ 8192 + - 32 + - `1k8k_tp8_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc32.yaml`` + * - 8xB200_NVL + - Max Throughput + - 1024 / 8192 + - 64 + - `1k8k_tp8_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc64.yaml`` + * - 8xB200_NVL + - Min Latency + - 8192 / 1024 + - 4 + - `8k1k_tp8_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc4.yaml`` + * - 8xB200_NVL + - Low Latency + - 8192 / 1024 + - 8 + - `8k1k_tp8_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc8.yaml`` + * - 8xB200_NVL + - Balanced + - 8192 / 1024 + - 16 + - `8k1k_tp8_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc16.yaml`` + * - 8xB200_NVL + - High Throughput + - 8192 / 1024 + - 32 + - `8k1k_tp8_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc32.yaml`` + * - 8xB200_NVL + - Max Throughput + - 8192 / 1024 + - 64 + - `8k1k_tp8_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc64.yaml`` + * - H200_SXM + - Min Latency + - 1024 / 1024 + - 4 + - `1k1k_tp1_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc4.yaml`` + * - H200_SXM + - Low Latency + - 1024 / 1024 + - 8 + - `1k1k_tp1_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc8.yaml`` + * - H200_SXM + - Balanced + - 1024 / 1024 + - 16 + - `1k1k_tp1_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc16.yaml`` + * - H200_SXM + - High Throughput + - 1024 / 1024 + - 32 + - `1k1k_tp1_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc32.yaml`` + * - H200_SXM + - Max Throughput + - 1024 / 1024 + - 64 + - `1k1k_tp1_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc64.yaml`` + * - H200_SXM + - Min Latency + - 1024 / 8192 + - 4 + - `1k8k_tp1_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc4.yaml`` + * - H200_SXM + - Low Latency + - 1024 / 8192 + - 8 + - `1k8k_tp1_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc8.yaml`` + * - H200_SXM + - Balanced + - 1024 / 8192 + - 16 + - `1k8k_tp1_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc16.yaml`` + * - H200_SXM + - High Throughput + - 1024 / 8192 + - 32 + - 
`1k8k_tp1_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc32.yaml`` + * - H200_SXM + - Max Throughput + - 1024 / 8192 + - 64 + - `1k8k_tp1_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc64.yaml`` + * - H200_SXM + - Min Latency + - 8192 / 1024 + - 4 + - `8k1k_tp1_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc4.yaml`` + * - H200_SXM + - Low Latency + - 8192 / 1024 + - 8 + - `8k1k_tp1_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc8.yaml`` + * - H200_SXM + - Balanced + - 8192 / 1024 + - 16 + - `8k1k_tp1_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc16.yaml`` + * - H200_SXM + - High Throughput + - 8192 / 1024 + - 32 + - `8k1k_tp1_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc32.yaml`` + * - H200_SXM + - Max Throughput + - 8192 / 1024 + - 64 + - `8k1k_tp1_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc64.yaml`` + * - 2xH200_SXM + - Min Latency + - 1024 / 1024 + - 4 + - `1k1k_tp2_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc4.yaml`` + * - 2xH200_SXM + - Low Latency + - 1024 / 1024 + - 8 + - `1k1k_tp2_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc8.yaml`` + * - 2xH200_SXM + - Balanced + - 1024 / 1024 + - 16 + - `1k1k_tp2_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc16.yaml`` + * - 2xH200_SXM + - High Throughput + - 1024 / 1024 + - 32 + - `1k1k_tp2_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc32.yaml`` + * - 2xH200_SXM + - Max Throughput + - 1024 / 1024 + - 64 + - `1k1k_tp2_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc64.yaml`` + * - 2xH200_SXM + - Min Latency + - 1024 / 8192 + - 4 + - `1k8k_tp2_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc4.yaml`` + * - 2xH200_SXM + - Low Latency + - 1024 / 8192 + - 8 + - `1k8k_tp2_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc8.yaml`` + * - 2xH200_SXM + - Balanced + - 1024 / 8192 + - 16 + - `1k8k_tp2_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc16.yaml`` + * - 2xH200_SXM + - High Throughput + - 1024 / 8192 + - 32 + - `1k8k_tp2_conc32.yaml `_ + - 
``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc32.yaml`` + * - 2xH200_SXM + - Max Throughput + - 1024 / 8192 + - 64 + - `1k8k_tp2_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc64.yaml`` + * - 2xH200_SXM + - Min Latency + - 8192 / 1024 + - 4 + - `8k1k_tp2_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc4.yaml`` + * - 2xH200_SXM + - Low Latency + - 8192 / 1024 + - 8 + - `8k1k_tp2_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc8.yaml`` + * - 2xH200_SXM + - Balanced + - 8192 / 1024 + - 16 + - `8k1k_tp2_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc16.yaml`` + * - 2xH200_SXM + - High Throughput + - 8192 / 1024 + - 32 + - `8k1k_tp2_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc32.yaml`` + * - 2xH200_SXM + - Max Throughput + - 8192 / 1024 + - 64 + - `8k1k_tp2_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc64.yaml`` + * - 4xH200_SXM + - Min Latency + - 1024 / 1024 + - 4 + - `1k1k_tp4_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc4.yaml`` + * - 4xH200_SXM + - Low Latency + - 1024 / 1024 + - 8 + - `1k1k_tp4_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc8.yaml`` + * - 4xH200_SXM + - Balanced + - 1024 / 1024 + - 16 + - `1k1k_tp4_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc16.yaml`` + * - 4xH200_SXM + - High Throughput + - 1024 / 1024 + - 32 + - `1k1k_tp4_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc32.yaml`` + * - 4xH200_SXM + - Max Throughput + - 1024 / 1024 + - 64 + - `1k1k_tp4_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc64.yaml`` + * - 4xH200_SXM + - Min Latency + - 1024 / 8192 + - 4 + - `1k8k_tp4_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc4.yaml`` + * - 4xH200_SXM + - Low Latency + - 1024 / 8192 + - 8 + - `1k8k_tp4_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc8.yaml`` + * - 4xH200_SXM + - Balanced + - 1024 / 8192 + - 16 + - `1k8k_tp4_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc16.yaml`` + * - 4xH200_SXM + - High Throughput + - 1024 / 8192 + - 32 + - `1k8k_tp4_conc32.yaml `_ + - ``trtllm-serve 
openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc32.yaml`` + * - 4xH200_SXM + - Max Throughput + - 1024 / 8192 + - 64 + - `1k8k_tp4_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc64.yaml`` + * - 4xH200_SXM + - Min Latency + - 8192 / 1024 + - 4 + - `8k1k_tp4_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc4.yaml`` + * - 4xH200_SXM + - Low Latency + - 8192 / 1024 + - 8 + - `8k1k_tp4_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc8.yaml`` + * - 4xH200_SXM + - Balanced + - 8192 / 1024 + - 16 + - `8k1k_tp4_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc16.yaml`` + * - 4xH200_SXM + - High Throughput + - 8192 / 1024 + - 32 + - `8k1k_tp4_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc32.yaml`` + * - 4xH200_SXM + - Max Throughput + - 8192 / 1024 + - 64 + - `8k1k_tp4_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc64.yaml`` + * - 8xH200_SXM + - Min Latency - 1024 / 1024 - - 8 - - `1k1k_tp4_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc8.yaml`` + - 4 + - `1k1k_tp8_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc4.yaml`` * - 8xH200_SXM - Low Latency - 1024 / 1024 - 8 - `1k1k_tp8_conc8.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc8.yaml`` - * - H200_SXM - - Low Latency - - 1024 / 1024 - - 16 - - `1k1k_tp1_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc16.yaml`` - * - 2xH200_SXM - - Low Latency - - 1024 / 1024 - - 16 - - `1k1k_tp2_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc16.yaml`` - * - 4xH200_SXM - - High Throughput - - 1024 / 1024 - - 16 - - `1k1k_tp4_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc16.yaml`` * - 8xH200_SXM - - High Throughput + - Balanced - 1024 / 1024 - 16 - `1k1k_tp8_conc16.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc16.yaml`` - * - H200_SXM - - High Throughput - - 1024 / 1024 - - 32 - - `1k1k_tp1_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc32.yaml`` - * - 2xH200_SXM - - High Throughput - - 1024 / 1024 - - 32 - - `1k1k_tp2_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options 
${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc32.yaml`` - * - 4xH200_SXM - - High Throughput - - 1024 / 1024 - - 32 - - `1k1k_tp4_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc32.yaml`` * - 8xH200_SXM - High Throughput - 1024 / 1024 - 32 - `1k1k_tp8_conc32.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc32.yaml`` - * - H200_SXM - - High Throughput - - 1024 / 1024 - - 64 - - `1k1k_tp1_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc64.yaml`` - * - 2xH200_SXM - - High Throughput - - 1024 / 1024 - - 64 - - `1k1k_tp2_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc64.yaml`` - * - 4xH200_SXM - - High Throughput - - 1024 / 1024 - - 64 - - `1k1k_tp4_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc64.yaml`` * - 8xH200_SXM - Max Throughput - 1024 / 1024 - 64 - `1k1k_tp8_conc64.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc64.yaml`` - * - H200_SXM - - Min Latency - - 1024 / 8192 - - 4 - - `1k8k_tp1_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc4.yaml`` - * - 2xH200_SXM - - Low Latency - - 1024 / 8192 - - 4 - - `1k8k_tp2_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc4.yaml`` - * - 4xH200_SXM - - Low Latency - - 1024 / 8192 - - 4 - - `1k8k_tp4_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc4.yaml`` * - 8xH200_SXM - - Low Latency + - Min Latency - 1024 / 8192 - 4 - `1k8k_tp8_conc4.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc4.yaml`` - * - H200_SXM - - Low Latency - - 1024 / 8192 - - 8 - - `1k8k_tp1_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc8.yaml`` - * - 2xH200_SXM - - Low Latency - - 1024 / 8192 - - 8 - - `1k8k_tp2_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc8.yaml`` - * - 4xH200_SXM - - Low Latency - - 1024 / 8192 - - 8 - - `1k8k_tp4_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc8.yaml`` * - 8xH200_SXM - Low Latency - 1024 / 8192 - 8 - `1k8k_tp8_conc8.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc8.yaml`` - * - H200_SXM - - Low Latency - - 1024 / 8192 - - 16 - - `1k8k_tp1_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options 
${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc16.yaml`` - * - 2xH200_SXM - - Low Latency - - 1024 / 8192 - - 16 - - `1k8k_tp2_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc16.yaml`` - * - 4xH200_SXM - - High Throughput - - 1024 / 8192 - - 16 - - `1k8k_tp4_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc16.yaml`` * - 8xH200_SXM - - High Throughput + - Balanced - 1024 / 8192 - 16 - `1k8k_tp8_conc16.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc16.yaml`` - * - H200_SXM - - High Throughput - - 1024 / 8192 - - 32 - - `1k8k_tp1_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc32.yaml`` - * - 2xH200_SXM - - High Throughput - - 1024 / 8192 - - 32 - - `1k8k_tp2_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc32.yaml`` - * - 4xH200_SXM - - High Throughput - - 1024 / 8192 - - 32 - - `1k8k_tp4_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc32.yaml`` * - 8xH200_SXM - High Throughput - 1024 / 8192 - 32 - `1k8k_tp8_conc32.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc32.yaml`` - * - H200_SXM - - High Throughput - - 1024 / 8192 - - 64 - - `1k8k_tp1_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc64.yaml`` - * - 2xH200_SXM - - High Throughput - - 1024 / 8192 - - 64 - - `1k8k_tp2_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc64.yaml`` - * - 4xH200_SXM - - High Throughput - - 1024 / 8192 - - 64 - - `1k8k_tp4_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc64.yaml`` * - 8xH200_SXM - Max Throughput - 1024 / 8192 - 64 - `1k8k_tp8_conc64.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc64.yaml`` - * - H200_SXM - - Min Latency - - 8192 / 1024 - - 4 - - `8k1k_tp1_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc4.yaml`` - * - 2xH200_SXM - - Low Latency - - 8192 / 1024 - - 4 - - `8k1k_tp2_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc4.yaml`` - * - 4xH200_SXM - - Low Latency - - 8192 / 1024 - - 4 - - `8k1k_tp4_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc4.yaml`` * - 8xH200_SXM - - Low Latency + - Min Latency - 8192 / 1024 - 4 - `8k1k_tp8_conc4.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options 
${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc4.yaml`` - * - H200_SXM - - Low Latency - - 8192 / 1024 - - 8 - - `8k1k_tp1_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc8.yaml`` - * - 2xH200_SXM - - Low Latency - - 8192 / 1024 - - 8 - - `8k1k_tp2_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc8.yaml`` - * - 4xH200_SXM - - Low Latency - - 8192 / 1024 - - 8 - - `8k1k_tp4_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc8.yaml`` * - 8xH200_SXM - Low Latency - 8192 / 1024 - 8 - `8k1k_tp8_conc8.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc8.yaml`` - * - H200_SXM - - Low Latency - - 8192 / 1024 - - 16 - - `8k1k_tp1_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc16.yaml`` - * - 2xH200_SXM - - Low Latency - - 8192 / 1024 - - 16 - - `8k1k_tp2_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc16.yaml`` - * - 4xH200_SXM - - High Throughput - - 8192 / 1024 - - 16 - - `8k1k_tp4_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc16.yaml`` * - 8xH200_SXM - - High Throughput + - Balanced - 8192 / 1024 - 16 - `8k1k_tp8_conc16.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc16.yaml`` - * - H200_SXM - - High Throughput - - 8192 / 1024 - - 32 - - `8k1k_tp1_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc32.yaml`` - * - 2xH200_SXM - - High Throughput - - 8192 / 1024 - - 32 - - `8k1k_tp2_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc32.yaml`` - * - 4xH200_SXM - - High Throughput - - 8192 / 1024 - - 32 - - `8k1k_tp4_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc32.yaml`` * - 8xH200_SXM - High Throughput - 8192 / 1024 - 32 - `8k1k_tp8_conc32.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc32.yaml`` - * - H200_SXM - - High Throughput - - 8192 / 1024 - - 64 - - `8k1k_tp1_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc64.yaml`` - * - 2xH200_SXM - - High Throughput - - 8192 / 1024 - - 64 - - `8k1k_tp2_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc64.yaml`` - * - 4xH200_SXM - - High Throughput - - 8192 / 1024 - - 64 - - `8k1k_tp4_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options 
${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc64.yaml`` * - 8xH200_SXM - Max Throughput - 8192 / 1024 diff --git a/examples/configs/database/database.py b/examples/configs/database/database.py index e0c73a8ef1c..f2c6d45b0b2 100644 --- a/examples/configs/database/database.py +++ b/examples/configs/database/database.py @@ -15,15 +15,20 @@ from pathlib import Path -from typing import Any, Dict, Iterator, List +from typing import Any, Dict, Iterator, List, Tuple import yaml from pydantic import BaseModel, Field, RootModel +REPO_ROOT = Path(__file__).parent.parent.parent.parent DATABASE_LIST_PATH = Path(__file__).parent / "lookup.yaml" +LOW_LATENCY_CONCURRENCY_THRESHOLD = 8 +HIGH_THROUGHPUT_CONCURRENCY_THRESHOLD = 32 +KEY_PROFILES = {"Min Latency", "Balanced", "Max Throughput"} -class RecipeConstraints(BaseModel): + +class Recipe(BaseModel): """Recipe record for scenario list.""" model: str = Field(description="Model name") @@ -36,29 +41,68 @@ class RecipeConstraints(BaseModel): def load_config(self) -> Dict[str, Any]: """Load and return the YAML config at config_path.""" - with open(self.config_path) as f: - data = yaml.safe_load(f) - return data if data is not None else {} - - -class Recipe(BaseModel): - """Recipe that describes a single scenario.""" - - constraints: RecipeConstraints = Field(description="Recipe constraints") - env_overrides: Dict[str, Any] = Field(description="Environment overrides", default_factory=dict) - config: Dict[str, Any] = Field(description="Configuration overrides", default_factory=dict) + config_relative_path = Path(self.config_path) + # Ensure config path is within the repo root + if config_relative_path.is_absolute() or ".." in config_relative_path.parts: + raise ValueError(f"Invalid config path: {self.config_path}") + full_path = REPO_ROOT / self.config_path + if not full_path.exists(): + raise FileNotFoundError(f"Config not found: {full_path}") + with open(full_path, encoding="utf-8") as f: + return yaml.safe_load(f) -class RecipeList(RootModel[List[RecipeConstraints]]): +class RecipeList(RootModel[List[Recipe]]): @classmethod def from_yaml(cls, yaml_path: Path) -> "RecipeList": """Load and validate recipe list from YAML file.""" - with open(yaml_path) as f: + with open(yaml_path, encoding="utf-8") as f: data = yaml.safe_load(f) return cls(data) - def __iter__(self) -> Iterator[RecipeConstraints]: + def __iter__(self) -> Iterator[Recipe]: return iter(self.root) def __len__(self) -> int: return len(self.root) + + +def assign_profile(num_recipes: int, idx: int, concurrency: int) -> str: + """Assign performance profile to a recipe based on its position in a concurrency-sorted list.""" + if num_recipes == 1: + if concurrency <= LOW_LATENCY_CONCURRENCY_THRESHOLD: + return "Low Latency" + elif concurrency >= HIGH_THROUGHPUT_CONCURRENCY_THRESHOLD: + return "High Throughput" + else: + return "Balanced" + elif idx == 0: + return "Min Latency" + elif idx == num_recipes - 1: + return "Max Throughput" + elif idx in ((num_recipes - 1) // 2, num_recipes // 2): + return "Balanced" + elif idx < num_recipes // 2: + return "Low Latency" + else: + return "High Throughput" + + +def select_key_recipes(recipes: List[Recipe]) -> List[Tuple[Recipe, str]]: + """Select key recipes (min latency, balanced, max throughput) from a list of recipes.""" + if not recipes: + return [] + + sorted_recipes = sorted(recipes, key=lambda r: r.concurrency) + n = len(sorted_recipes) + + result = [] + seen_profiles = set() + for idx, recipe in enumerate(sorted_recipes): + 
profile = assign_profile(n, idx, recipe.concurrency) + # For n==1, keep whatever profile is assigned + # For n>=2, only keep key profiles and dedupe (for even n, two indices get "Balanced") + if n == 1 or (profile in KEY_PROFILES and profile not in seen_profiles): + result.append((recipe, profile)) + seen_profiles.add(profile) + return result diff --git a/jenkins/Build.groovy b/jenkins/Build.groovy index bfa3af44129..261c0a6d3a0 100644 --- a/jenkins/Build.groovy +++ b/jenkins/Build.groovy @@ -60,12 +60,12 @@ def BUILD_CONFIGS = [ // Vanilla TARNAME is used for packaging in runLLMPackage // cmake-vars cannot be empty, so passing (default) multi-device configuration. (CONFIG_LINUX_X86_64_VANILLA) : [ - (WHEEL_EXTRA_ARGS) : "--extra-cmake-vars ENABLE_MULTI_DEVICE=1 --extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl --micro_benchmarks", + (WHEEL_EXTRA_ARGS) : "--extra-cmake-vars ENABLE_MULTI_DEVICE=1 --extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl --extra-cmake-vars MOONCAKE_ROOT=/usr/local/Mooncake --micro_benchmarks", (TARNAME) : "TensorRT-LLM.tar.gz", (WHEEL_ARCHS): "80-real;86-real;89-real;90-real;100-real;103-real;120-real", ], (CONFIG_LINUX_X86_64_PYBIND) : [ - (WHEEL_EXTRA_ARGS) : "--binding_type pybind --extra-cmake-vars ENABLE_MULTI_DEVICE=1 --extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl --micro_benchmarks", + (WHEEL_EXTRA_ARGS) : "--binding_type pybind --extra-cmake-vars ENABLE_MULTI_DEVICE=1 --extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl --extra-cmake-vars MOONCAKE_ROOT=/usr/local/Mooncake --micro_benchmarks", (TARNAME) : "pybind-TensorRT-LLM.tar.gz", (WHEEL_ARCHS): "80-real;86-real;89-real;90-real;100-real;103-real;120-real", ], @@ -80,13 +80,13 @@ def BUILD_CONFIGS = [ (WHEEL_ARCHS): "80-real;86-real;89-real;90-real;100-real;103-real;120-real", ], (CONFIG_LINUX_AARCH64): [ - (WHEEL_EXTRA_ARGS) : "--extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl", + (WHEEL_EXTRA_ARGS) : "--extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl --extra-cmake-vars MOONCAKE_ROOT=/usr/local/Mooncake", (TARNAME) : "TensorRT-LLM-GH200.tar.gz", (WHEEL_ARCHS): "90-real;100-real;103-real;120-real", (BUILD_JOBS_FOR_CONFIG): "4", // TODO: Remove after fix the build OOM issue on SBSA ], (CONFIG_LINUX_AARCH64_PYBIND): [ - (WHEEL_EXTRA_ARGS) : "--binding_type pybind --extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl", + (WHEEL_EXTRA_ARGS) : "--binding_type pybind --extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl --extra-cmake-vars MOONCAKE_ROOT=/usr/local/Mooncake", (TARNAME) : "pybind-TensorRT-LLM-GH200.tar.gz", (WHEEL_ARCHS): "90-real;100-real;103-real;120-real", (BUILD_JOBS_FOR_CONFIG): "4", // TODO: Remove after fix the build OOM issue on SBSA diff --git a/scripts/build_wheel.py b/scripts/build_wheel.py index 03aae586175..ed2d1b88fd7 100755 --- a/scripts/build_wheel.py +++ b/scripts/build_wheel.py @@ -458,6 +458,7 @@ def main(*, trt_root: str = '/usr/local/tensorrt', nccl_root: str = None, nixl_root: str = None, + mooncake_root: str = None, internal_cutlass_kernels_root: str = None, clean: bool = False, clean_wheel: bool = False, @@ -559,6 +560,11 @@ def main(*, if nixl_root is not None: cmake_def_args.append(f"-DNIXL_ROOT={nixl_root}") + if mooncake_root is not None: + if on_windows: + raise 
RuntimeError("Mooncake is not supported on Windows.") + cmake_def_args.append(f"-DMOONCAKE_ROOT={mooncake_root}") + build_dir = get_build_dir(build_dir, build_type) first_build = not Path(build_dir, "CMakeFiles").exists() @@ -819,6 +825,14 @@ def symlink_remove_dst_tree(src, dst, dirs_exist_ok=True): build_run( f"find {nixl_dir} -type f -name '*.so*' -exec patchelf --set-rpath \'$ORIGIN:$ORIGIN/plugins:$ORIGIN/../:$ORIGIN/../ucx/:$ORIGIN/../../ucx/\' {{}} \\;" ) + if os.path.exists( + build_dir / + "tensorrt_llm/executor/cache_transmission/mooncake_utils/libtensorrt_llm_mooncake_wrapper.so" + ): + install_file( + build_dir / + "tensorrt_llm/executor/cache_transmission/mooncake_utils/libtensorrt_llm_mooncake_wrapper.so", + lib_dir / "libtensorrt_llm_mooncake_wrapper.so") install_file( build_dir / "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/libdecoder_attention_0.so", @@ -1041,6 +1055,10 @@ def add_arguments(parser: ArgumentParser): help="Directory containing NCCL headers and libraries") parser.add_argument("--nixl_root", help="Directory containing NIXL headers and libraries") + parser.add_argument( + "--mooncake_root", + help= + "Directory containing Mooncake transfer engine headers and libraries") parser.add_argument( "--internal-cutlass-kernels-root", default="", diff --git a/scripts/generate_config_database_tests.py b/scripts/generate_config_database_tests.py new file mode 100644 index 00000000000..c198e975a47 --- /dev/null +++ b/scripts/generate_config_database_tests.py @@ -0,0 +1,226 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Generate a performance regression test list from the config database. + +This script: +1. Reads recipes from the examples/configs/database directory +2. Generates test config files per GPU type (e.g., config_database_b200_nvl.yaml) +3. Generates llm_config_database.yml test list with condition blocks grouped by GPU name and count +""" + +import copy +from collections import defaultdict +from pathlib import Path + +import yaml + +from examples.configs.database.database import ( + DATABASE_LIST_PATH, + Recipe, + RecipeList, + select_key_recipes, +) + +REPO_ROOT = Path(__file__).parent.parent +PERF_SANITY_DIR = REPO_ROOT / "tests" / "scripts" / "perf-sanity" +TEST_LIST_PATH = ( + REPO_ROOT / "tests" / "integration" / "test_lists" / "qa" / "llm_config_database.yml" +) +ITERATIONS = 10 + +# GPU type to condition wildcards mapping for test list +# Note: cpu is used to distinguish between e.g. 
H200_SXM and GH200 +GPU_WILDCARDS = { + "B200_NVL": {"gpu": ["*b200*"], "cpu": "x86_64", "linux_distribution_name": "ubuntu*"}, + "H200_SXM": {"gpu": ["*h200*"], "cpu": "x86_64", "linux_distribution_name": "ubuntu*"}, + "H100_SXM": {"gpu": ["*h100*"], "cpu": "x86_64", "linux_distribution_name": "ubuntu*"}, + "GH200": {"gpu": ["*gh200*"], "cpu": "aarch64", "linux_distribution_name": "ubuntu*"}, + "GB200": {"gpu": ["*gb200*"], "cpu": "aarch64", "linux_distribution_name": "ubuntu*"}, +} + + +def generate_server_name(recipe: Recipe) -> str: + """Generate a unique server name from recipe.""" + model_slug = recipe.model.replace("/", "_").replace("-", "_").replace(".", "_") + return f"{model_slug}_{recipe.isl}_{recipe.osl}_conc{recipe.concurrency}_gpu{recipe.num_gpus}" + + +def generate_client_name(recipe: Recipe) -> str: + """Generate client config name.""" + return f"con{recipe.concurrency}_isl{recipe.isl}_osl{recipe.osl}" + + +def recipe_to_server_config(recipe: Recipe, llm_api_config: dict) -> dict: + """Convert a recipe + LLM API config to aggr_server format.""" + server_config = { + "name": generate_server_name(recipe), + "model_name": recipe.model, + "gpus": recipe.num_gpus, + # Enable scenario-only matching for baseline comparison + "match_mode": "scenario", + } + + # Copy LLM API config fields + for key, value in llm_api_config.items(): + server_config[key] = value + + # Disable KV cache reuse to ensure consistency + if "kv_cache_config" not in server_config: + server_config["kv_cache_config"] = {} + server_config["kv_cache_config"]["enable_block_reuse"] = False + + # Add client configs + server_config["client_configs"] = [ + { + "name": generate_client_name(recipe), + "concurrency": recipe.concurrency, + "iterations": ITERATIONS, + "isl": recipe.isl, + "osl": recipe.osl, + "random_range_ratio": 0.0, # Fixed ISL/OSL for reproducibility + "backend": "openai", + "streaming": True, + } + ] + + return server_config + + +def group_recipes_by_scenario(recipes: RecipeList) -> dict: + """Group recipes by scenario key (model, gpu, isl, osl, num_gpus).""" + groups = defaultdict(list) + for recipe in recipes: + key = (recipe.model, recipe.gpu, recipe.isl, recipe.osl, recipe.num_gpus) + groups[key].append(recipe) + return groups + + +def filter_to_key_recipes(recipes: RecipeList) -> list[Recipe]: + """Filter recipes to only key configs (min latency, balanced, max throughput).""" + scenario_groups = group_recipes_by_scenario(recipes) + key_recipes = [] + for scenario_recipes in scenario_groups.values(): + for recipe, _ in select_key_recipes(scenario_recipes): + key_recipes.append(recipe) + return key_recipes + + +def group_recipes_by_gpu(recipes: list[Recipe]) -> dict[str, list[Recipe]]: + """Group recipes by GPU type.""" + groups = defaultdict(list) + for recipe in recipes: + groups[recipe.gpu].append(recipe) + return groups + + +def group_recipes_by_num_gpus(recipes: list[Recipe]) -> dict[int, list[Recipe]]: + """Group recipes by num_gpus within a GPU type.""" + groups = defaultdict(list) + for recipe in recipes: + groups[recipe.num_gpus].append(recipe) + return groups + + +def generate_aggr_config(recipes: list[Recipe]) -> dict[str, list[dict]]: + """Generate aggr_server config from recipes.""" + server_configs = [] + + for recipe in recipes: + llm_api_config = recipe.load_config() + server_config = recipe_to_server_config(recipe, llm_api_config) + server_configs.append(server_config) + + return {"server_configs": server_configs} + + +def generate_condition_entry( + gpu_name: str, num_gpus: int, 
config_name: str, server_names: list +) -> dict: + # using copy.deepcopy to avoid creating YAML anchors + wildcards = copy.deepcopy(GPU_WILDCARDS[gpu_name]) + condition = { + "wildcards": wildcards, + "ranges": {"system_gpu_count": {"gte": num_gpus}}, + } + + tests = [ + f"perf/test_perf.py::test_perf[perf_sanity_upload-{config_name}-{name}]" + for name in server_names + ] + return {"condition": condition, "tests": tests} + + +def generate_tests(test_list_path: Path = TEST_LIST_PATH, test_config_dir: Path = PERF_SANITY_DIR): + test_list_path.parent.mkdir(parents=True, exist_ok=True) + + all_recipes = RecipeList.from_yaml(DATABASE_LIST_PATH) + recipes = filter_to_key_recipes(all_recipes) + print(f"Selected {len(recipes)} key recipes from {len(all_recipes)} total") + + gpu_groups = group_recipes_by_gpu(recipes) + condition_entries = [] + config_files = {} + + for gpu_name in sorted(gpu_groups.keys()): + gpu_recipes = gpu_groups[gpu_name] + config_name = f"config_database_{gpu_name.lower()}" + config_path = test_config_dir / f"{config_name}.yaml" + + aggr_config = generate_aggr_config(gpu_recipes) + config_content = yaml.dump( + aggr_config, default_flow_style=False, sort_keys=False, width=120 + ) + + with open(config_path, "w", encoding="utf-8") as f: + f.write(config_content) + print(f"Generated {config_path}") + + config_files[config_path] = config_content + + # Generate condition entries grouped by num_gpus + num_gpus_groups = group_recipes_by_num_gpus(gpu_recipes) + for num_gpus in sorted(num_gpus_groups.keys()): + server_names = [generate_server_name(r) for r in num_gpus_groups[num_gpus]] + entry = generate_condition_entry(gpu_name, num_gpus, config_name, server_names) + condition_entries.append(entry) + + test_list = { + "version": "0.0.1", + "llm_config_database": condition_entries, + } + + header = """# =============================================================================== +# Config Database Performance Tests (AUTO-GENERATED) +# =============================================================================== +# Generated by: scripts/generate_config_database_tests.py +# +# These tests use scenario-only matching (match_mode: scenario) for baselines. +# Baselines are matched by (model, gpu, isl, osl, concurrency, num_gpus) instead +# of full config fields, allowing configs to evolve while maintaining comparison. +# +# To regenerate: +# python scripts/generate_config_database_tests.py +# =============================================================================== + +""" + with open(test_list_path, "w", encoding="utf-8") as f: + f.write(header) + yaml.dump(test_list, f, default_flow_style=False, sort_keys=False, width=120) + print(f"Generated {test_list_path}") + + +if __name__ == "__main__": + generate_tests() diff --git a/scripts/generate_config_table.py b/scripts/generate_config_table.py index 2d423c0811f..3c68c7edcb3 100644 --- a/scripts/generate_config_table.py +++ b/scripts/generate_config_table.py @@ -19,7 +19,7 @@ from collections import defaultdict from pathlib import Path -from examples.configs.database.database import DATABASE_LIST_PATH, RecipeList +from examples.configs.database.database import DATABASE_LIST_PATH, RecipeList, assign_profile SCRIPT_DIR = Path(__file__).parent.resolve() REPO_ROOT = SCRIPT_DIR.parent @@ -38,9 +38,6 @@ }, } -LOW_LATENCY_CONCURRENCY_THRESHOLD = 8 -HIGH_THROUGHPUT_CONCURRENCY_THRESHOLD = 32 - def generate_rst(yaml_path, output_file=None): """Generate RST table from YAML config database. 
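For reviewers, a minimal usage sketch of the profile-assignment helper that database.py now exports and that both generate_config_table.py and generate_config_database_tests.py reuse. This is illustrative only and not part of the patch; it assumes the repository root is on PYTHONPATH (as scripts/generate_config_database_tests.py itself assumes when importing examples.configs.database.database) and uses a hypothetical five-step concurrency ladder.

# Illustrative sketch only -- not part of the patch.
# Assumes the repo root is on PYTHONPATH so the database module is importable,
# mirroring the import used by scripts/generate_config_database_tests.py.
from examples.configs.database.database import assign_profile

# A hypothetical scenario group with five recipes, already sorted by concurrency.
concurrencies = [4, 8, 16, 32, 64]
n = len(concurrencies)

for idx, conc in enumerate(concurrencies):
    # With more than one recipe per scenario, assign_profile maps the position
    # in the concurrency-sorted list (not the raw concurrency) to a profile label.
    print(conc, assign_profile(n, idx, conc))

# Expected output:
#   4 Min Latency
#   8 Low Latency
#   16 Balanced
#   32 High Throughput
#   64 Max Throughput

select_key_recipes then keeps only the Min Latency, Balanced, and Max Throughput entries from such a group (indices 0, 2, and 4 above), which is what filter_to_key_recipes in the test generator relies on; a single-recipe scenario instead falls back to the concurrency thresholds (<= 8 gives Low Latency, >= 32 gives High Throughput, otherwise Balanced).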
@@ -51,10 +48,10 @@ def generate_rst(yaml_path, output_file=None): """ recipe_list = RecipeList.from_yaml(Path(yaml_path)) - # Group by model -> (gpu, isl, osl) -> list of recipes + # Group by model -> (gpu, num_gpus, isl, osl) -> list of recipes model_groups = defaultdict(lambda: defaultdict(list)) for recipe in recipe_list: - key = (recipe.gpu, recipe.isl, recipe.osl) + key = (recipe.gpu, recipe.num_gpus, recipe.isl, recipe.osl) model_groups[recipe.model][key].append(recipe) lines = [] @@ -97,7 +94,8 @@ def generate_rst(yaml_path, output_file=None): subgroups = model_groups[model] sorted_keys = sorted( - subgroups.keys(), key=lambda k: (str(k[0]), int(k[1] or 0), int(k[2] or 0)) + subgroups.keys(), + key=lambda k: (str(k[0]), int(k[1] or 0), int(k[2] or 0), int(k[3] or 0)), ) for key in sorted_keys: @@ -114,23 +112,7 @@ def generate_rst(yaml_path, output_file=None): conc = entry.concurrency config_path = entry.config_path - if n == 1: - if conc <= LOW_LATENCY_CONCURRENCY_THRESHOLD: - profile = "Low Latency" - elif conc >= HIGH_THROUGHPUT_CONCURRENCY_THRESHOLD: - profile = "High Throughput" - else: - profile = "Balanced" - elif idx == 0: - profile = "Min Latency" - elif idx == n - 1: - profile = "Max Throughput" - elif idx in ((n - 1) // 2, n // 2): - profile = "Balanced" - elif idx < n // 2: - profile = "Low Latency" - else: - profile = "High Throughput" + profile = assign_profile(n, idx, conc) full_config_path = config_path command = f"trtllm-serve {model} --extra_llm_api_options ${{TRTLLM_DIR}}/{full_config_path}" diff --git a/scripts/generate_lock_file.py b/scripts/generate_lock_file.py index 9b37858c0e1..5a0992902c5 100755 --- a/scripts/generate_lock_file.py +++ b/scripts/generate_lock_file.py @@ -156,9 +156,10 @@ def generate_metadata_json(): packages = packages[:-1] for package in packages: + package = re.sub(r'\s#.*$', '', package).rstrip() # WAR: ignore lines with "-f": No tool exists to parse complex requirements.txt - if '-f' in package or \ - "#" in package or \ + if not package or \ + '-f' in package or \ package.startswith('--'): continue diff --git a/security_scanning/docs/poetry.lock b/security_scanning/docs/poetry.lock index ac1ce39f45b..f2f8e40c409 100644 --- a/security_scanning/docs/poetry.lock +++ b/security_scanning/docs/poetry.lock @@ -900,13 +900,13 @@ files = [ [[package]] name = "soupsieve" -version = "2.8" +version = "2.8.1" description = "A modern CSS selector implementation for Beautiful Soup." optional = false python-versions = ">=3.9" files = [ - {file = "soupsieve-2.8-py3-none-any.whl", hash = "sha256:0cc76456a30e20f5d7f2e14a98a4ae2ee4e5abdc7c5ea0aafe795f344bc7984c"}, - {file = "soupsieve-2.8.tar.gz", hash = "sha256:e2dd4a40a628cb5f28f6d4b0db8800b8f581b65bb380b97de22ba5ca8d72572f"}, + {file = "soupsieve-2.8.1-py3-none-any.whl", hash = "sha256:a11fe2a6f3d76ab3cf2de04eb339c1be5b506a8a47f2ceb6d139803177f85434"}, + {file = "soupsieve-2.8.1.tar.gz", hash = "sha256:4cf733bc50fa805f5df4b8ef4740fc0e0fa6218cf3006269afd3f9d6d80fd350"}, ] [[package]] diff --git a/security_scanning/examples/models/core/qwen/poetry.lock b/security_scanning/examples/models/core/qwen/poetry.lock index 261179a6251..a2004681e08 100644 --- a/security_scanning/examples/models/core/qwen/poetry.lock +++ b/security_scanning/examples/models/core/qwen/poetry.lock @@ -2927,30 +2927,30 @@ six = ">=1.14.0" [[package]] name = "ruff" -version = "0.14.9" +version = "0.14.10" description = "An extremely fast Python linter and code formatter, written in Rust." 
optional = false python-versions = ">=3.7" files = [ - {file = "ruff-0.14.9-py3-none-linux_armv6l.whl", hash = "sha256:f1ec5de1ce150ca6e43691f4a9ef5c04574ad9ca35c8b3b0e18877314aba7e75"}, - {file = "ruff-0.14.9-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:ed9d7417a299fc6030b4f26333bf1117ed82a61ea91238558c0268c14e00d0c2"}, - {file = "ruff-0.14.9-py3-none-macosx_11_0_arm64.whl", hash = "sha256:d5dc3473c3f0e4a1008d0ef1d75cee24a48e254c8bed3a7afdd2b4392657ed2c"}, - {file = "ruff-0.14.9-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:84bf7c698fc8f3cb8278830fb6b5a47f9bcc1ed8cb4f689b9dd02698fa840697"}, - {file = "ruff-0.14.9-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:aa733093d1f9d88a5d98988d8834ef5d6f9828d03743bf5e338bf980a19fce27"}, - {file = "ruff-0.14.9-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6a1cfb04eda979b20c8c19550c8b5f498df64ff8da151283311ce3199e8b3648"}, - {file = "ruff-0.14.9-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:1e5cb521e5ccf0008bd74d5595a4580313844a42b9103b7388eca5a12c970743"}, - {file = "ruff-0.14.9-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cd429a8926be6bba4befa8cdcf3f4dd2591c413ea5066b1e99155ed245ae42bb"}, - {file = "ruff-0.14.9-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ab208c1b7a492e37caeaf290b1378148f75e13c2225af5d44628b95fd7834273"}, - {file = "ruff-0.14.9-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:72034534e5b11e8a593f517b2f2f2b273eb68a30978c6a2d40473ad0aaa4cb4a"}, - {file = "ruff-0.14.9-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:712ff04f44663f1b90a1195f51525836e3413c8a773574a7b7775554269c30ed"}, - {file = "ruff-0.14.9-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:a111fee1db6f1d5d5810245295527cda1d367c5aa8f42e0fca9a78ede9b4498b"}, - {file = "ruff-0.14.9-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:8769efc71558fecc25eb295ddec7d1030d41a51e9dcf127cbd63ec517f22d567"}, - {file = "ruff-0.14.9-py3-none-musllinux_1_2_i686.whl", hash = "sha256:347e3bf16197e8a2de17940cd75fd6491e25c0aa7edf7d61aa03f146a1aa885a"}, - {file = "ruff-0.14.9-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:7715d14e5bccf5b660f54516558aa94781d3eb0838f8e706fb60e3ff6eff03a8"}, - {file = "ruff-0.14.9-py3-none-win32.whl", hash = "sha256:df0937f30aaabe83da172adaf8937003ff28172f59ca9f17883b4213783df197"}, - {file = "ruff-0.14.9-py3-none-win_amd64.whl", hash = "sha256:c0b53a10e61df15a42ed711ec0bda0c582039cf6c754c49c020084c55b5b0bc2"}, - {file = "ruff-0.14.9-py3-none-win_arm64.whl", hash = "sha256:8e821c366517a074046d92f0e9213ed1c13dbc5b37a7fc20b07f79b64d62cc84"}, - {file = "ruff-0.14.9.tar.gz", hash = "sha256:35f85b25dd586381c0cc053f48826109384c81c00ad7ef1bd977bfcc28119d5b"}, + {file = "ruff-0.14.10-py3-none-linux_armv6l.whl", hash = "sha256:7a3ce585f2ade3e1f29ec1b92df13e3da262178df8c8bdf876f48fa0e8316c49"}, + {file = "ruff-0.14.10-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:674f9be9372907f7257c51f1d4fc902cb7cf014b9980152b802794317941f08f"}, + {file = "ruff-0.14.10-py3-none-macosx_11_0_arm64.whl", hash = "sha256:d85713d522348837ef9df8efca33ccb8bd6fcfc86a2cde3ccb4bc9d28a18003d"}, + {file = "ruff-0.14.10-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6987ebe0501ae4f4308d7d24e2d0fe3d7a98430f5adfd0f1fead050a740a3a77"}, + {file = "ruff-0.14.10-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = 
"sha256:16a01dfb7b9e4eee556fbfd5392806b1b8550c9b4a9f6acd3dbe6812b193c70a"}, + {file = "ruff-0.14.10-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7165d31a925b7a294465fa81be8c12a0e9b60fb02bf177e79067c867e71f8b1f"}, + {file = "ruff-0.14.10-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:c561695675b972effb0c0a45db233f2c816ff3da8dcfbe7dfc7eed625f218935"}, + {file = "ruff-0.14.10-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4bb98fcbbc61725968893682fd4df8966a34611239c9fd07a1f6a07e7103d08e"}, + {file = "ruff-0.14.10-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f24b47993a9d8cb858429e97bdf8544c78029f09b520af615c1d261bf827001d"}, + {file = "ruff-0.14.10-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:59aabd2e2c4fd614d2862e7939c34a532c04f1084476d6833dddef4afab87e9f"}, + {file = "ruff-0.14.10-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:213db2b2e44be8625002dbea33bb9c60c66ea2c07c084a00d55732689d697a7f"}, + {file = "ruff-0.14.10-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:b914c40ab64865a17a9a5b67911d14df72346a634527240039eb3bd650e5979d"}, + {file = "ruff-0.14.10-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:1484983559f026788e3a5c07c81ef7d1e97c1c78ed03041a18f75df104c45405"}, + {file = "ruff-0.14.10-py3-none-musllinux_1_2_i686.whl", hash = "sha256:c70427132db492d25f982fffc8d6c7535cc2fd2c83fc8888f05caaa248521e60"}, + {file = "ruff-0.14.10-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:5bcf45b681e9f1ee6445d317ce1fa9d6cba9a6049542d1c3d5b5958986be8830"}, + {file = "ruff-0.14.10-py3-none-win32.whl", hash = "sha256:104c49fc7ab73f3f3a758039adea978869a918f31b73280db175b43a2d9b51d6"}, + {file = "ruff-0.14.10-py3-none-win_amd64.whl", hash = "sha256:466297bd73638c6bdf06485683e812db1c00c7ac96d4ddd0294a338c62fdc154"}, + {file = "ruff-0.14.10-py3-none-win_arm64.whl", hash = "sha256:e51d046cf6dda98a4633b8a8a771451107413b0f07183b2bef03f075599e44e6"}, + {file = "ruff-0.14.10.tar.gz", hash = "sha256:9a2e830f075d1a42cd28420d7809ace390832a490ed0966fe373ba288e77aaf4"}, ] [[package]] diff --git a/security_scanning/metadata.json b/security_scanning/metadata.json index 2356583a718..084a98983f2 100644 --- a/security_scanning/metadata.json +++ b/security_scanning/metadata.json @@ -1,4 +1,4 @@ { - "commit_hash": "c1cfb61b1b0940e9212b68e7ee72d42c6126e242", - "timestamp": "2025-12-18T02:42:21Z" + "commit_hash": "a7ac5a6bca6eab92723ec2d4abacee940e56ad22", + "timestamp": "2025-12-19T02:39:13Z" } diff --git a/security_scanning/poetry.lock b/security_scanning/poetry.lock index 286d967e6fa..fce2b50e726 100644 --- a/security_scanning/poetry.lock +++ b/security_scanning/poetry.lock @@ -740,6 +740,82 @@ files = [ {file = "colored-2.3.1.tar.gz", hash = "sha256:fe6e888e12dc16643daa0b108f785df6d0b48420084b5d0a567de27bb09a14d8"}, ] +[[package]] +name = "contourpy" +version = "1.3.2" +description = "Python library for calculating contours of 2D quadrilateral grids" +optional = false +python-versions = ">=3.10" +files = [ + {file = "contourpy-1.3.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ba38e3f9f330af820c4b27ceb4b9c7feee5fe0493ea53a8720f4792667465934"}, + {file = "contourpy-1.3.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:dc41ba0714aa2968d1f8674ec97504a8f7e334f48eeacebcaa6256213acb0989"}, + {file = "contourpy-1.3.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9be002b31c558d1ddf1b9b415b162c603405414bacd6932d031c5b5a8b757f0d"}, + {file = 
"contourpy-1.3.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8d2e74acbcba3bfdb6d9d8384cdc4f9260cae86ed9beee8bd5f54fee49a430b9"}, + {file = "contourpy-1.3.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e259bced5549ac64410162adc973c5e2fb77f04df4a439d00b478e57a0e65512"}, + {file = "contourpy-1.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ad687a04bc802cbe8b9c399c07162a3c35e227e2daccf1668eb1f278cb698631"}, + {file = "contourpy-1.3.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:cdd22595308f53ef2f891040ab2b93d79192513ffccbd7fe19be7aa773a5e09f"}, + {file = "contourpy-1.3.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:b4f54d6a2defe9f257327b0f243612dd051cc43825587520b1bf74a31e2f6ef2"}, + {file = "contourpy-1.3.2-cp310-cp310-win32.whl", hash = "sha256:f939a054192ddc596e031e50bb13b657ce318cf13d264f095ce9db7dc6ae81c0"}, + {file = "contourpy-1.3.2-cp310-cp310-win_amd64.whl", hash = "sha256:c440093bbc8fc21c637c03bafcbef95ccd963bc6e0514ad887932c18ca2a759a"}, + {file = "contourpy-1.3.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6a37a2fb93d4df3fc4c0e363ea4d16f83195fc09c891bc8ce072b9d084853445"}, + {file = "contourpy-1.3.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b7cd50c38f500bbcc9b6a46643a40e0913673f869315d8e70de0438817cb7773"}, + {file = "contourpy-1.3.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d6658ccc7251a4433eebd89ed2672c2ed96fba367fd25ca9512aa92a4b46c4f1"}, + {file = "contourpy-1.3.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:70771a461aaeb335df14deb6c97439973d253ae70660ca085eec25241137ef43"}, + {file = "contourpy-1.3.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:65a887a6e8c4cd0897507d814b14c54a8c2e2aa4ac9f7686292f9769fcf9a6ab"}, + {file = "contourpy-1.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3859783aefa2b8355697f16642695a5b9792e7a46ab86da1118a4a23a51a33d7"}, + {file = "contourpy-1.3.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:eab0f6db315fa4d70f1d8ab514e527f0366ec021ff853d7ed6a2d33605cf4b83"}, + {file = "contourpy-1.3.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d91a3ccc7fea94ca0acab82ceb77f396d50a1f67412efe4c526f5d20264e6ecd"}, + {file = "contourpy-1.3.2-cp311-cp311-win32.whl", hash = "sha256:1c48188778d4d2f3d48e4643fb15d8608b1d01e4b4d6b0548d9b336c28fc9b6f"}, + {file = "contourpy-1.3.2-cp311-cp311-win_amd64.whl", hash = "sha256:5ebac872ba09cb8f2131c46b8739a7ff71de28a24c869bcad554477eb089a878"}, + {file = "contourpy-1.3.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:4caf2bcd2969402bf77edc4cb6034c7dd7c0803213b3523f111eb7460a51b8d2"}, + {file = "contourpy-1.3.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:82199cb78276249796419fe36b7386bd8d2cc3f28b3bc19fe2454fe2e26c4c15"}, + {file = "contourpy-1.3.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:106fab697af11456fcba3e352ad50effe493a90f893fca6c2ca5c033820cea92"}, + {file = "contourpy-1.3.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d14f12932a8d620e307f715857107b1d1845cc44fdb5da2bc8e850f5ceba9f87"}, + {file = "contourpy-1.3.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:532fd26e715560721bb0d5fc7610fce279b3699b018600ab999d1be895b09415"}, + {file = "contourpy-1.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:f26b383144cf2d2c29f01a1e8170f50dacf0eac02d64139dcd709a8ac4eb3cfe"}, + {file = "contourpy-1.3.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:c49f73e61f1f774650a55d221803b101d966ca0c5a2d6d5e4320ec3997489441"}, + {file = "contourpy-1.3.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3d80b2c0300583228ac98d0a927a1ba6a2ba6b8a742463c564f1d419ee5b211e"}, + {file = "contourpy-1.3.2-cp312-cp312-win32.whl", hash = "sha256:90df94c89a91b7362e1142cbee7568f86514412ab8a2c0d0fca72d7e91b62912"}, + {file = "contourpy-1.3.2-cp312-cp312-win_amd64.whl", hash = "sha256:8c942a01d9163e2e5cfb05cb66110121b8d07ad438a17f9e766317bcb62abf73"}, + {file = "contourpy-1.3.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:de39db2604ae755316cb5967728f4bea92685884b1e767b7c24e983ef5f771cb"}, + {file = "contourpy-1.3.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3f9e896f447c5c8618f1edb2bafa9a4030f22a575ec418ad70611450720b5b08"}, + {file = "contourpy-1.3.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:71e2bd4a1c4188f5c2b8d274da78faab884b59df20df63c34f74aa1813c4427c"}, + {file = "contourpy-1.3.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:de425af81b6cea33101ae95ece1f696af39446db9682a0b56daaa48cfc29f38f"}, + {file = "contourpy-1.3.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:977e98a0e0480d3fe292246417239d2d45435904afd6d7332d8455981c408b85"}, + {file = "contourpy-1.3.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:434f0adf84911c924519d2b08fc10491dd282b20bdd3fa8f60fd816ea0b48841"}, + {file = "contourpy-1.3.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:c66c4906cdbc50e9cba65978823e6e00b45682eb09adbb78c9775b74eb222422"}, + {file = "contourpy-1.3.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:8b7fc0cd78ba2f4695fd0a6ad81a19e7e3ab825c31b577f384aa9d7817dc3bef"}, + {file = "contourpy-1.3.2-cp313-cp313-win32.whl", hash = "sha256:15ce6ab60957ca74cff444fe66d9045c1fd3e92c8936894ebd1f3eef2fff075f"}, + {file = "contourpy-1.3.2-cp313-cp313-win_amd64.whl", hash = "sha256:e1578f7eafce927b168752ed7e22646dad6cd9bca673c60bff55889fa236ebf9"}, + {file = "contourpy-1.3.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:0475b1f6604896bc7c53bb070e355e9321e1bc0d381735421a2d2068ec56531f"}, + {file = "contourpy-1.3.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:c85bb486e9be652314bb5b9e2e3b0d1b2e643d5eec4992c0fbe8ac71775da739"}, + {file = "contourpy-1.3.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:745b57db7758f3ffc05a10254edd3182a2a83402a89c00957a8e8a22f5582823"}, + {file = "contourpy-1.3.2-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:970e9173dbd7eba9b4e01aab19215a48ee5dd3f43cef736eebde064a171f89a5"}, + {file = "contourpy-1.3.2-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c6c4639a9c22230276b7bffb6a850dfc8258a2521305e1faefe804d006b2e532"}, + {file = "contourpy-1.3.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc829960f34ba36aad4302e78eabf3ef16a3a100863f0d4eeddf30e8a485a03b"}, + {file = "contourpy-1.3.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:d32530b534e986374fc19eaa77fcb87e8a99e5431499949b828312bdcd20ac52"}, + {file = "contourpy-1.3.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:e298e7e70cf4eb179cc1077be1c725b5fd131ebc81181bf0c03525c8abc297fd"}, + {file = "contourpy-1.3.2-cp313-cp313t-win32.whl", hash = 
"sha256:d0e589ae0d55204991450bb5c23f571c64fe43adaa53f93fc902a84c96f52fe1"}, + {file = "contourpy-1.3.2-cp313-cp313t-win_amd64.whl", hash = "sha256:78e9253c3de756b3f6a5174d024c4835acd59eb3f8e2ca13e775dbffe1558f69"}, + {file = "contourpy-1.3.2-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:fd93cc7f3139b6dd7aab2f26a90dde0aa9fc264dbf70f6740d498a70b860b82c"}, + {file = "contourpy-1.3.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:107ba8a6a7eec58bb475329e6d3b95deba9440667c4d62b9b6063942b61d7f16"}, + {file = "contourpy-1.3.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:ded1706ed0c1049224531b81128efbd5084598f18d8a2d9efae833edbd2b40ad"}, + {file = "contourpy-1.3.2-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:5f5964cdad279256c084b69c3f412b7801e15356b16efa9d78aa974041903da0"}, + {file = "contourpy-1.3.2-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:49b65a95d642d4efa8f64ba12558fcb83407e58a2dfba9d796d77b63ccfcaff5"}, + {file = "contourpy-1.3.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:8c5acb8dddb0752bf252e01a3035b21443158910ac16a3b0d20e7fed7d534ce5"}, + {file = "contourpy-1.3.2.tar.gz", hash = "sha256:b6945942715a034c671b7fc54f9588126b0b8bf23db2696e3ca8328f3ff0ab54"}, +] + +[package.dependencies] +numpy = ">=1.23" + +[package.extras] +bokeh = ["bokeh", "selenium"] +docs = ["furo", "sphinx (>=7.2)", "sphinx-copybutton"] +mypy = ["bokeh", "contourpy[bokeh,docs]", "docutils-stubs", "mypy (==1.15.0)", "types-Pillow"] +test = ["Pillow", "contourpy[test-no-images]", "matplotlib"] +test-no-images = ["pytest", "pytest-cov", "pytest-rerunfailures", "pytest-xdist", "wurlitzer"] + [[package]] name = "cuda-bindings" version = "13.1.1" @@ -841,6 +917,21 @@ opencl = ["nvidia-cuda-opencl (==13.0.85.*)"] profiler = ["nvidia-cuda-profiler-api (==13.0.85.*)"] sanitizer = ["nvidia-cuda-sanitizer-api (==13.0.85.*)"] +[[package]] +name = "cycler" +version = "0.12.1" +description = "Composable style cycles" +optional = false +python-versions = ">=3.8" +files = [ + {file = "cycler-0.12.1-py3-none-any.whl", hash = "sha256:85cef7cff222d8644161529808465972e51340599459b8ac3ccbac5a854e0d30"}, + {file = "cycler-0.12.1.tar.gz", hash = "sha256:88bb128f02ba341da8ef447245a9e138fae777f6a23943da4540077d3601eb1c"}, +] + +[package.extras] +docs = ["ipython", "matplotlib", "numpydoc", "sphinx"] +tests = ["pytest", "pytest-cov", "pytest-xdist"] + [[package]] name = "datasets" version = "3.1.0" @@ -1084,6 +1175,78 @@ tabulate = "*" torch = "*" tqdm = "*" +[[package]] +name = "fonttools" +version = "4.61.1" +description = "Tools to manipulate font files" +optional = false +python-versions = ">=3.10" +files = [ + {file = "fonttools-4.61.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7c7db70d57e5e1089a274cbb2b1fd635c9a24de809a231b154965d415d6c6d24"}, + {file = "fonttools-4.61.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5fe9fd43882620017add5eabb781ebfbc6998ee49b35bd7f8f79af1f9f99a958"}, + {file = "fonttools-4.61.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d8db08051fc9e7d8bc622f2112511b8107d8f27cd89e2f64ec45e9825e8288da"}, + {file = "fonttools-4.61.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a76d4cb80f41ba94a6691264be76435e5f72f2cb3cab0b092a6212855f71c2f6"}, + {file = "fonttools-4.61.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:a13fc8aeb24bad755eea8f7f9d409438eb94e82cf86b08fe77a03fbc8f6a96b1"}, + {file = 
"fonttools-4.61.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:b846a1fcf8beadeb9ea4f44ec5bdde393e2f1569e17d700bfc49cd69bde75881"}, + {file = "fonttools-4.61.1-cp310-cp310-win32.whl", hash = "sha256:78a7d3ab09dc47ac1a363a493e6112d8cabed7ba7caad5f54dbe2f08676d1b47"}, + {file = "fonttools-4.61.1-cp310-cp310-win_amd64.whl", hash = "sha256:eff1ac3cc66c2ac7cda1e64b4e2f3ffef474b7335f92fc3833fc632d595fcee6"}, + {file = "fonttools-4.61.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c6604b735bb12fef8e0efd5578c9fb5d3d8532d5001ea13a19cddf295673ee09"}, + {file = "fonttools-4.61.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5ce02f38a754f207f2f06557523cd39a06438ba3aafc0639c477ac409fc64e37"}, + {file = "fonttools-4.61.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:77efb033d8d7ff233385f30c62c7c79271c8885d5c9657d967ede124671bbdfb"}, + {file = "fonttools-4.61.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:75c1a6dfac6abd407634420c93864a1e274ebc1c7531346d9254c0d8f6ca00f9"}, + {file = "fonttools-4.61.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0de30bfe7745c0d1ffa2b0b7048fb7123ad0d71107e10ee090fa0b16b9452e87"}, + {file = "fonttools-4.61.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:58b0ee0ab5b1fc9921eccfe11d1435added19d6494dde14e323f25ad2bc30c56"}, + {file = "fonttools-4.61.1-cp311-cp311-win32.whl", hash = "sha256:f79b168428351d11e10c5aeb61a74e1851ec221081299f4cf56036a95431c43a"}, + {file = "fonttools-4.61.1-cp311-cp311-win_amd64.whl", hash = "sha256:fe2efccb324948a11dd09d22136fe2ac8a97d6c1347cf0b58a911dcd529f66b7"}, + {file = "fonttools-4.61.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:f3cb4a569029b9f291f88aafc927dd53683757e640081ca8c412781ea144565e"}, + {file = "fonttools-4.61.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:41a7170d042e8c0024703ed13b71893519a1a6d6e18e933e3ec7507a2c26a4b2"}, + {file = "fonttools-4.61.1-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:10d88e55330e092940584774ee5e8a6971b01fc2f4d3466a1d6c158230880796"}, + {file = "fonttools-4.61.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:15acc09befd16a0fb8a8f62bc147e1a82817542d72184acca9ce6e0aeda9fa6d"}, + {file = "fonttools-4.61.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e6bcdf33aec38d16508ce61fd81838f24c83c90a1d1b8c68982857038673d6b8"}, + {file = "fonttools-4.61.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5fade934607a523614726119164ff621e8c30e8fa1ffffbbd358662056ba69f0"}, + {file = "fonttools-4.61.1-cp312-cp312-win32.whl", hash = "sha256:75da8f28eff26defba42c52986de97b22106cb8f26515b7c22443ebc9c2d3261"}, + {file = "fonttools-4.61.1-cp312-cp312-win_amd64.whl", hash = "sha256:497c31ce314219888c0e2fce5ad9178ca83fe5230b01a5006726cdf3ac9f24d9"}, + {file = "fonttools-4.61.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:8c56c488ab471628ff3bfa80964372fc13504ece601e0d97a78ee74126b2045c"}, + {file = "fonttools-4.61.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:dc492779501fa723b04d0ab1f5be046797fee17d27700476edc7ee9ae535a61e"}, + {file = "fonttools-4.61.1-cp313-cp313-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:64102ca87e84261419c3747a0d20f396eb024bdbeb04c2bfb37e2891f5fadcb5"}, + {file = 
"fonttools-4.61.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4c1b526c8d3f615a7b1867f38a9410849c8f4aef078535742198e942fba0e9bd"}, + {file = "fonttools-4.61.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:41ed4b5ec103bd306bb68f81dc166e77409e5209443e5773cb4ed837bcc9b0d3"}, + {file = "fonttools-4.61.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:b501c862d4901792adaec7c25b1ecc749e2662543f68bb194c42ba18d6eec98d"}, + {file = "fonttools-4.61.1-cp313-cp313-win32.whl", hash = "sha256:4d7092bb38c53bbc78e9255a59158b150bcdc115a1e3b3ce0b5f267dc35dd63c"}, + {file = "fonttools-4.61.1-cp313-cp313-win_amd64.whl", hash = "sha256:21e7c8d76f62ab13c9472ccf74515ca5b9a761d1bde3265152a6dc58700d895b"}, + {file = "fonttools-4.61.1-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:fff4f534200a04b4a36e7ae3cb74493afe807b517a09e99cb4faa89a34ed6ecd"}, + {file = "fonttools-4.61.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:d9203500f7c63545b4ce3799319fe4d9feb1a1b89b28d3cb5abd11b9dd64147e"}, + {file = "fonttools-4.61.1-cp314-cp314-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:fa646ecec9528bef693415c79a86e733c70a4965dd938e9a226b0fc64c9d2e6c"}, + {file = "fonttools-4.61.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:11f35ad7805edba3aac1a3710d104592df59f4b957e30108ae0ba6c10b11dd75"}, + {file = "fonttools-4.61.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:b931ae8f62db78861b0ff1ac017851764602288575d65b8e8ff1963fed419063"}, + {file = "fonttools-4.61.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:b148b56f5de675ee16d45e769e69f87623a4944f7443850bf9a9376e628a89d2"}, + {file = "fonttools-4.61.1-cp314-cp314-win32.whl", hash = "sha256:9b666a475a65f4e839d3d10473fad6d47e0a9db14a2f4a224029c5bfde58ad2c"}, + {file = "fonttools-4.61.1-cp314-cp314-win_amd64.whl", hash = "sha256:4f5686e1fe5fce75d82d93c47a438a25bf0d1319d2843a926f741140b2b16e0c"}, + {file = "fonttools-4.61.1-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:e76ce097e3c57c4bcb67c5aa24a0ecdbd9f74ea9219997a707a4061fbe2707aa"}, + {file = "fonttools-4.61.1-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:9cfef3ab326780c04d6646f68d4b4742aae222e8b8ea1d627c74e38afcbc9d91"}, + {file = "fonttools-4.61.1-cp314-cp314t-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:a75c301f96db737e1c5ed5fd7d77d9c34466de16095a266509e13da09751bd19"}, + {file = "fonttools-4.61.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:91669ccac46bbc1d09e9273546181919064e8df73488ea087dcac3e2968df9ba"}, + {file = "fonttools-4.61.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:c33ab3ca9d3ccd581d58e989d67554e42d8d4ded94ab3ade3508455fe70e65f7"}, + {file = "fonttools-4.61.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:664c5a68ec406f6b1547946683008576ef8b38275608e1cee6c061828171c118"}, + {file = "fonttools-4.61.1-cp314-cp314t-win32.whl", hash = "sha256:aed04cabe26f30c1647ef0e8fbb207516fd40fe9472e9439695f5c6998e60ac5"}, + {file = "fonttools-4.61.1-cp314-cp314t-win_amd64.whl", hash = "sha256:2180f14c141d2f0f3da43f3a81bc8aa4684860f6b0e6f9e165a4831f24e6a23b"}, + {file = "fonttools-4.61.1-py3-none-any.whl", hash = "sha256:17d2bf5d541add43822bcf0c43d7d847b160c9bb01d15d5007d84e2217aaa371"}, + {file = "fonttools-4.61.1.tar.gz", hash = 
"sha256:6675329885c44657f826ef01d9e4fb33b9158e9d93c537d84ad8399539bc6f69"}, +] + +[package.extras] +all = ["brotli (>=1.0.1)", "brotlicffi (>=0.8.0)", "lxml (>=4.0)", "lz4 (>=1.7.4.2)", "matplotlib", "munkres", "pycairo", "scipy", "skia-pathops (>=0.5.0)", "sympy", "uharfbuzz (>=0.45.0)", "unicodedata2 (>=17.0.0)", "xattr", "zopfli (>=0.1.4)"] +graphite = ["lz4 (>=1.7.4.2)"] +interpolatable = ["munkres", "pycairo", "scipy"] +lxml = ["lxml (>=4.0)"] +pathops = ["skia-pathops (>=0.5.0)"] +plot = ["matplotlib"] +repacker = ["uharfbuzz (>=0.45.0)"] +symfont = ["sympy"] +type1 = ["xattr"] +unicode = ["unicodedata2 (>=17.0.0)"] +woff = ["brotli (>=1.0.1)", "brotlicffi (>=0.8.0)", "zopfli (>=0.1.4)"] + [[package]] name = "frozenlist" version = "1.8.0" @@ -1707,6 +1870,116 @@ files = [ [package.dependencies] referencing = ">=0.31.0" +[[package]] +name = "kiwisolver" +version = "1.4.9" +description = "A fast implementation of the Cassowary constraint solver" +optional = false +python-versions = ">=3.10" +files = [ + {file = "kiwisolver-1.4.9-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:b4b4d74bda2b8ebf4da5bd42af11d02d04428b2c32846e4c2c93219df8a7987b"}, + {file = "kiwisolver-1.4.9-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:fb3b8132019ea572f4611d770991000d7f58127560c4889729248eb5852a102f"}, + {file = "kiwisolver-1.4.9-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:84fd60810829c27ae375114cd379da1fa65e6918e1da405f356a775d49a62bcf"}, + {file = "kiwisolver-1.4.9-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:b78efa4c6e804ecdf727e580dbb9cba85624d2e1c6b5cb059c66290063bd99a9"}, + {file = "kiwisolver-1.4.9-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d4efec7bcf21671db6a3294ff301d2fc861c31faa3c8740d1a94689234d1b415"}, + {file = "kiwisolver-1.4.9-cp310-cp310-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:90f47e70293fc3688b71271100a1a5453aa9944a81d27ff779c108372cf5567b"}, + {file = "kiwisolver-1.4.9-cp310-cp310-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:8fdca1def57a2e88ef339de1737a1449d6dbf5fab184c54a1fca01d541317154"}, + {file = "kiwisolver-1.4.9-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:9cf554f21be770f5111a1690d42313e140355e687e05cf82cb23d0a721a64a48"}, + {file = "kiwisolver-1.4.9-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:fc1795ac5cd0510207482c3d1d3ed781143383b8cfd36f5c645f3897ce066220"}, + {file = "kiwisolver-1.4.9-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:ccd09f20ccdbbd341b21a67ab50a119b64a403b09288c27481575105283c1586"}, + {file = "kiwisolver-1.4.9-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:540c7c72324d864406a009d72f5d6856f49693db95d1fbb46cf86febef873634"}, + {file = "kiwisolver-1.4.9-cp310-cp310-win_amd64.whl", hash = "sha256:ede8c6d533bc6601a47ad4046080d36b8fc99f81e6f1c17b0ac3c2dc91ac7611"}, + {file = "kiwisolver-1.4.9-cp310-cp310-win_arm64.whl", hash = "sha256:7b4da0d01ac866a57dd61ac258c5607b4cd677f63abaec7b148354d2b2cdd536"}, + {file = "kiwisolver-1.4.9-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:eb14a5da6dc7642b0f3a18f13654847cd8b7a2550e2645a5bda677862b03ba16"}, + {file = "kiwisolver-1.4.9-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:39a219e1c81ae3b103643d2aedb90f1ef22650deb266ff12a19e7773f3e5f089"}, + {file = "kiwisolver-1.4.9-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2405a7d98604b87f3fc28b1716783534b1b4b8510d8142adca34ee0bc3c87543"}, + {file = 
"kiwisolver-1.4.9-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:dc1ae486f9abcef254b5618dfb4113dd49f94c68e3e027d03cf0143f3f772b61"}, + {file = "kiwisolver-1.4.9-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8a1f570ce4d62d718dce3f179ee78dac3b545ac16c0c04bb363b7607a949c0d1"}, + {file = "kiwisolver-1.4.9-cp311-cp311-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:cb27e7b78d716c591e88e0a09a2139c6577865d7f2e152488c2cc6257f460872"}, + {file = "kiwisolver-1.4.9-cp311-cp311-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:15163165efc2f627eb9687ea5f3a28137217d217ac4024893d753f46bce9de26"}, + {file = "kiwisolver-1.4.9-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:bdee92c56a71d2b24c33a7d4c2856bd6419d017e08caa7802d2963870e315028"}, + {file = "kiwisolver-1.4.9-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:412f287c55a6f54b0650bd9b6dce5aceddb95864a1a90c87af16979d37c89771"}, + {file = "kiwisolver-1.4.9-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:2c93f00dcba2eea70af2be5f11a830a742fe6b579a1d4e00f47760ef13be247a"}, + {file = "kiwisolver-1.4.9-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f117e1a089d9411663a3207ba874f31be9ac8eaa5b533787024dc07aeb74f464"}, + {file = "kiwisolver-1.4.9-cp311-cp311-win_amd64.whl", hash = "sha256:be6a04e6c79819c9a8c2373317d19a96048e5a3f90bec587787e86a1153883c2"}, + {file = "kiwisolver-1.4.9-cp311-cp311-win_arm64.whl", hash = "sha256:0ae37737256ba2de764ddc12aed4956460277f00c4996d51a197e72f62f5eec7"}, + {file = "kiwisolver-1.4.9-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:ac5a486ac389dddcc5bef4f365b6ae3ffff2c433324fb38dd35e3fab7c957999"}, + {file = "kiwisolver-1.4.9-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:f2ba92255faa7309d06fe44c3a4a97efe1c8d640c2a79a5ef728b685762a6fd2"}, + {file = "kiwisolver-1.4.9-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4a2899935e724dd1074cb568ce7ac0dce28b2cd6ab539c8e001a8578eb106d14"}, + {file = "kiwisolver-1.4.9-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f6008a4919fdbc0b0097089f67a1eb55d950ed7e90ce2cc3e640abadd2757a04"}, + {file = "kiwisolver-1.4.9-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:67bb8b474b4181770f926f7b7d2f8c0248cbcb78b660fdd41a47054b28d2a752"}, + {file = "kiwisolver-1.4.9-cp312-cp312-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:2327a4a30d3ee07d2fbe2e7933e8a37c591663b96ce42a00bc67461a87d7df77"}, + {file = "kiwisolver-1.4.9-cp312-cp312-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:7a08b491ec91b1d5053ac177afe5290adacf1f0f6307d771ccac5de30592d198"}, + {file = "kiwisolver-1.4.9-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:d8fc5c867c22b828001b6a38d2eaeb88160bf5783c6cb4a5e440efc981ce286d"}, + {file = "kiwisolver-1.4.9-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:3b3115b2581ea35bb6d1f24a4c90af37e5d9b49dcff267eeed14c3893c5b86ab"}, + {file = "kiwisolver-1.4.9-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:858e4c22fb075920b96a291928cb7dea5644e94c0ee4fcd5af7e865655e4ccf2"}, + {file = "kiwisolver-1.4.9-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ed0fecd28cc62c54b262e3736f8bb2512d8dcfdc2bcf08be5f47f96bf405b145"}, + {file = "kiwisolver-1.4.9-cp312-cp312-win_amd64.whl", hash = "sha256:f68208a520c3d86ea51acf688a3e3002615a7f0238002cccc17affecc86a8a54"}, + {file = "kiwisolver-1.4.9-cp312-cp312-win_arm64.whl", hash = 
"sha256:2c1a4f57df73965f3f14df20b80ee29e6a7930a57d2d9e8491a25f676e197c60"}, + {file = "kiwisolver-1.4.9-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:a5d0432ccf1c7ab14f9949eec60c5d1f924f17c037e9f8b33352fa05799359b8"}, + {file = "kiwisolver-1.4.9-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:efb3a45b35622bb6c16dbfab491a8f5a391fe0e9d45ef32f4df85658232ca0e2"}, + {file = "kiwisolver-1.4.9-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1a12cf6398e8a0a001a059747a1cbf24705e18fe413bc22de7b3d15c67cffe3f"}, + {file = "kiwisolver-1.4.9-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:b67e6efbf68e077dd71d1a6b37e43e1a99d0bff1a3d51867d45ee8908b931098"}, + {file = "kiwisolver-1.4.9-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5656aa670507437af0207645273ccdfee4f14bacd7f7c67a4306d0dcaeaf6eed"}, + {file = "kiwisolver-1.4.9-cp313-cp313-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:bfc08add558155345129c7803b3671cf195e6a56e7a12f3dde7c57d9b417f525"}, + {file = "kiwisolver-1.4.9-cp313-cp313-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:40092754720b174e6ccf9e845d0d8c7d8e12c3d71e7fc35f55f3813e96376f78"}, + {file = "kiwisolver-1.4.9-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:497d05f29a1300d14e02e6441cf0f5ee81c1ff5a304b0d9fb77423974684e08b"}, + {file = "kiwisolver-1.4.9-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:bdd1a81a1860476eb41ac4bc1e07b3f07259e6d55bbf739b79c8aaedcf512799"}, + {file = "kiwisolver-1.4.9-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:e6b93f13371d341afee3be9f7c5964e3fe61d5fa30f6a30eb49856935dfe4fc3"}, + {file = "kiwisolver-1.4.9-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:d75aa530ccfaa593da12834b86a0724f58bff12706659baa9227c2ccaa06264c"}, + {file = "kiwisolver-1.4.9-cp313-cp313-win_amd64.whl", hash = "sha256:dd0a578400839256df88c16abddf9ba14813ec5f21362e1fe65022e00c883d4d"}, + {file = "kiwisolver-1.4.9-cp313-cp313-win_arm64.whl", hash = "sha256:d4188e73af84ca82468f09cadc5ac4db578109e52acb4518d8154698d3a87ca2"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:5a0f2724dfd4e3b3ac5a82436a8e6fd16baa7d507117e4279b660fe8ca38a3a1"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:1b11d6a633e4ed84fc0ddafd4ebfd8ea49b3f25082c04ad12b8315c11d504dc1"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:61874cdb0a36016354853593cffc38e56fc9ca5aa97d2c05d3dcf6922cd55a11"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:60c439763a969a6af93b4881db0eed8fadf93ee98e18cbc35bc8da868d0c4f0c"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:92a2f997387a1b79a75e7803aa7ded2cfbe2823852ccf1ba3bcf613b62ae3197"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a31d512c812daea6d8b3be3b2bfcbeb091dbb09177706569bcfc6240dcf8b41c"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:52a15b0f35dad39862d376df10c5230155243a2c1a436e39eb55623ccbd68185"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:a30fd6fdef1430fd9e1ba7b3398b5ee4e2887783917a687d86ba69985fb08748"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:cc9617b46837c6468197b5945e196ee9ca43057bb7d9d1ae688101e4e1dddf64"}, + 
{file = "kiwisolver-1.4.9-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:0ab74e19f6a2b027ea4f845a78827969af45ce790e6cb3e1ebab71bdf9f215ff"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:dba5ee5d3981160c28d5490f0d1b7ed730c22470ff7f6cc26cfcfaacb9896a07"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-win_arm64.whl", hash = "sha256:0749fd8f4218ad2e851e11cc4dc05c7cbc0cbc4267bdfdb31782e65aace4ee9c"}, + {file = "kiwisolver-1.4.9-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:9928fe1eb816d11ae170885a74d074f57af3a0d65777ca47e9aeb854a1fba386"}, + {file = "kiwisolver-1.4.9-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:d0005b053977e7b43388ddec89fa567f43d4f6d5c2c0affe57de5ebf290dc552"}, + {file = "kiwisolver-1.4.9-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:2635d352d67458b66fd0667c14cb1d4145e9560d503219034a18a87e971ce4f3"}, + {file = "kiwisolver-1.4.9-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:767c23ad1c58c9e827b649a9ab7809fd5fd9db266a9cf02b0e926ddc2c680d58"}, + {file = "kiwisolver-1.4.9-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:72d0eb9fba308b8311685c2268cf7d0a0639a6cd027d8128659f72bdd8a024b4"}, + {file = "kiwisolver-1.4.9-cp314-cp314-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:f68e4f3eeca8fb22cc3d731f9715a13b652795ef657a13df1ad0c7dc0e9731df"}, + {file = "kiwisolver-1.4.9-cp314-cp314-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d84cd4061ae292d8ac367b2c3fa3aad11cb8625a95d135fe93f286f914f3f5a6"}, + {file = "kiwisolver-1.4.9-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:a60ea74330b91bd22a29638940d115df9dc00af5035a9a2a6ad9399ffb4ceca5"}, + {file = "kiwisolver-1.4.9-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:ce6a3a4e106cf35c2d9c4fa17c05ce0b180db622736845d4315519397a77beaf"}, + {file = "kiwisolver-1.4.9-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:77937e5e2a38a7b48eef0585114fe7930346993a88060d0bf886086d2aa49ef5"}, + {file = "kiwisolver-1.4.9-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:24c175051354f4a28c5d6a31c93906dc653e2bf234e8a4bbfb964892078898ce"}, + {file = "kiwisolver-1.4.9-cp314-cp314-win_amd64.whl", hash = "sha256:0763515d4df10edf6d06a3c19734e2566368980d21ebec439f33f9eb936c07b7"}, + {file = "kiwisolver-1.4.9-cp314-cp314-win_arm64.whl", hash = "sha256:0e4e2bf29574a6a7b7f6cb5fa69293b9f96c928949ac4a53ba3f525dffb87f9c"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:d976bbb382b202f71c67f77b0ac11244021cfa3f7dfd9e562eefcea2df711548"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:2489e4e5d7ef9a1c300a5e0196e43d9c739f066ef23270607d45aba368b91f2d"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:e2ea9f7ab7fbf18fffb1b5434ce7c69a07582f7acc7717720f1d69f3e806f90c"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:b34e51affded8faee0dfdb705416153819d8ea9250bbbf7ea1b249bdeb5f1122"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d8aacd3d4b33b772542b2e01beb50187536967b514b00003bdda7589722d2a64"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:7cf974dd4e35fa315563ac99d6287a1024e4dc2077b8a7d7cd3d2fb65d283134"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = 
"sha256:85bd218b5ecfbee8c8a82e121802dcb519a86044c9c3b2e4aef02fa05c6da370"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:0856e241c2d3df4efef7c04a1e46b1936b6120c9bcf36dd216e3acd84bc4fb21"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:9af39d6551f97d31a4deebeac6f45b156f9755ddc59c07b402c148f5dbb6482a"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:bb4ae2b57fc1d8cbd1cf7b1d9913803681ffa903e7488012be5b76dedf49297f"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:aedff62918805fb62d43a4aa2ecd4482c380dc76cd31bd7c8878588a61bd0369"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-win_amd64.whl", hash = "sha256:1fa333e8b2ce4d9660f2cda9c0e1b6bafcfb2457a9d259faa82289e73ec24891"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-win_arm64.whl", hash = "sha256:4a48a2ce79d65d363597ef7b567ce3d14d68783d2b2263d98db3d9477805ba32"}, + {file = "kiwisolver-1.4.9-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:4d1d9e582ad4d63062d34077a9a1e9f3c34088a2ec5135b1f7190c07cf366527"}, + {file = "kiwisolver-1.4.9-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:deed0c7258ceb4c44ad5ec7d9918f9f14fd05b2be86378d86cf50e63d1e7b771"}, + {file = "kiwisolver-1.4.9-pp310-pypy310_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0a590506f303f512dff6b7f75fd2fd18e16943efee932008fe7140e5fa91d80e"}, + {file = "kiwisolver-1.4.9-pp310-pypy310_pp73-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e09c2279a4d01f099f52d5c4b3d9e208e91edcbd1a175c9662a8b16e000fece9"}, + {file = "kiwisolver-1.4.9-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:c9e7cdf45d594ee04d5be1b24dd9d49f3d1590959b2271fb30b5ca2b262c00fb"}, + {file = "kiwisolver-1.4.9-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:720e05574713db64c356e86732c0f3c5252818d05f9df320f0ad8380641acea5"}, + {file = "kiwisolver-1.4.9-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:17680d737d5335b552994a2008fab4c851bcd7de33094a82067ef3a576ff02fa"}, + {file = "kiwisolver-1.4.9-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:85b5352f94e490c028926ea567fc569c52ec79ce131dadb968d3853e809518c2"}, + {file = "kiwisolver-1.4.9-pp311-pypy311_pp73-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:464415881e4801295659462c49461a24fb107c140de781d55518c4b80cb6790f"}, + {file = "kiwisolver-1.4.9-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:fb940820c63a9590d31d88b815e7a3aa5915cad3ce735ab45f0c730b39547de1"}, + {file = "kiwisolver-1.4.9.tar.gz", hash = "sha256:c3b22c26c6fd6811b0ae8363b95ca8ce4ea3c202d3d0975b2914310ceb1bcc4d"}, +] + [[package]] name = "lark" version = "1.3.1" @@ -2017,6 +2290,84 @@ files = [ {file = "markupsafe-3.0.3.tar.gz", hash = "sha256:722695808f4b6457b320fdc131280796bdceb04ab50fe1795cd540799ebe1698"}, ] +[[package]] +name = "matplotlib" +version = "3.10.8" +description = "Python plotting package" +optional = false +python-versions = ">=3.10" +files = [ + {file = "matplotlib-3.10.8-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:00270d217d6b20d14b584c521f810d60c5c78406dc289859776550df837dcda7"}, + {file = "matplotlib-3.10.8-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:37b3c1cc42aa184b3f738cfa18c1c1d72fd496d85467a6cf7b807936d39aa656"}, + {file = "matplotlib-3.10.8-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ee40c27c795bda6a5292e9cff9890189d32f7e3a0bf04e0e3c9430c4a00c37df"}, + {file = 
"matplotlib-3.10.8-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a48f2b74020919552ea25d222d5cc6af9ca3f4eb43a93e14d068457f545c2a17"}, + {file = "matplotlib-3.10.8-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:f254d118d14a7f99d616271d6c3c27922c092dac11112670b157798b89bf4933"}, + {file = "matplotlib-3.10.8-cp310-cp310-win_amd64.whl", hash = "sha256:f9b587c9c7274c1613a30afabf65a272114cd6cdbe67b3406f818c79d7ab2e2a"}, + {file = "matplotlib-3.10.8-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:6be43b667360fef5c754dda5d25a32e6307a03c204f3c0fc5468b78fa87b4160"}, + {file = "matplotlib-3.10.8-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a2b336e2d91a3d7006864e0990c83b216fcdca64b5a6484912902cef87313d78"}, + {file = "matplotlib-3.10.8-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:efb30e3baaea72ce5928e32bab719ab4770099079d66726a62b11b1ef7273be4"}, + {file = "matplotlib-3.10.8-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d56a1efd5bfd61486c8bc968fa18734464556f0fb8e51690f4ac25d85cbbbbc2"}, + {file = "matplotlib-3.10.8-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:238b7ce5717600615c895050239ec955d91f321c209dd110db988500558e70d6"}, + {file = "matplotlib-3.10.8-cp311-cp311-win_amd64.whl", hash = "sha256:18821ace09c763ec93aef5eeff087ee493a24051936d7b9ebcad9662f66501f9"}, + {file = "matplotlib-3.10.8-cp311-cp311-win_arm64.whl", hash = "sha256:bab485bcf8b1c7d2060b4fcb6fc368a9e6f4cd754c9c2fea281f4be21df394a2"}, + {file = "matplotlib-3.10.8-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:64fcc24778ca0404ce0cb7b6b77ae1f4c7231cdd60e6778f999ee05cbd581b9a"}, + {file = "matplotlib-3.10.8-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b9a5ca4ac220a0cdd1ba6bcba3608547117d30468fefce49bb26f55c1a3d5c58"}, + {file = "matplotlib-3.10.8-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3ab4aabc72de4ff77b3ec33a6d78a68227bf1123465887f9905ba79184a1cc04"}, + {file = "matplotlib-3.10.8-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:24d50994d8c5816ddc35411e50a86ab05f575e2530c02752e02538122613371f"}, + {file = "matplotlib-3.10.8-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:99eefd13c0dc3b3c1b4d561c1169e65fe47aab7b8158754d7c084088e2329466"}, + {file = "matplotlib-3.10.8-cp312-cp312-win_amd64.whl", hash = "sha256:dd80ecb295460a5d9d260df63c43f4afbdd832d725a531f008dad1664f458adf"}, + {file = "matplotlib-3.10.8-cp312-cp312-win_arm64.whl", hash = "sha256:3c624e43ed56313651bc18a47f838b60d7b8032ed348911c54906b130b20071b"}, + {file = "matplotlib-3.10.8-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:3f2e409836d7f5ac2f1c013110a4d50b9f7edc26328c108915f9075d7d7a91b6"}, + {file = "matplotlib-3.10.8-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:56271f3dac49a88d7fca5060f004d9d22b865f743a12a23b1e937a0be4818ee1"}, + {file = "matplotlib-3.10.8-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a0a7f52498f72f13d4a25ea70f35f4cb60642b466cbb0a9be951b5bc3f45a486"}, + {file = "matplotlib-3.10.8-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:646d95230efb9ca614a7a594d4fcacde0ac61d25e37dd51710b36477594963ce"}, + {file = "matplotlib-3.10.8-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f89c151aab2e2e23cb3fe0acad1e8b82841fd265379c4cecd0f3fcb34c15e0f6"}, + {file = "matplotlib-3.10.8-cp313-cp313-win_amd64.whl", hash = "sha256:e8ea3e2d4066083e264e75c829078f9e149fa119d27e19acd503de65e0b13149"}, + 
{file = "matplotlib-3.10.8-cp313-cp313-win_arm64.whl", hash = "sha256:c108a1d6fa78a50646029cb6d49808ff0fc1330fda87fa6f6250c6b5369b6645"}, + {file = "matplotlib-3.10.8-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:ad3d9833a64cf48cc4300f2b406c3d0f4f4724a91c0bd5640678a6ba7c102077"}, + {file = "matplotlib-3.10.8-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:eb3823f11823deade26ce3b9f40dcb4a213da7a670013929f31d5f5ed1055b22"}, + {file = "matplotlib-3.10.8-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d9050fee89a89ed57b4fb2c1bfac9a3d0c57a0d55aed95949eedbc42070fea39"}, + {file = "matplotlib-3.10.8-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b44d07310e404ba95f8c25aa5536f154c0a8ec473303535949e52eb71d0a1565"}, + {file = "matplotlib-3.10.8-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:0a33deb84c15ede243aead39f77e990469fff93ad1521163305095b77b72ce4a"}, + {file = "matplotlib-3.10.8-cp313-cp313t-win_amd64.whl", hash = "sha256:3a48a78d2786784cc2413e57397981fb45c79e968d99656706018d6e62e57958"}, + {file = "matplotlib-3.10.8-cp313-cp313t-win_arm64.whl", hash = "sha256:15d30132718972c2c074cd14638c7f4592bd98719e2308bccea40e0538bc0cb5"}, + {file = "matplotlib-3.10.8-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:b53285e65d4fa4c86399979e956235deb900be5baa7fc1218ea67fbfaeaadd6f"}, + {file = "matplotlib-3.10.8-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:32f8dce744be5569bebe789e46727946041199030db8aeb2954d26013a0eb26b"}, + {file = "matplotlib-3.10.8-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4cf267add95b1c88300d96ca837833d4112756045364f5c734a2276038dae27d"}, + {file = "matplotlib-3.10.8-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2cf5bd12cecf46908f286d7838b2abc6c91cda506c0445b8223a7c19a00df008"}, + {file = "matplotlib-3.10.8-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:41703cc95688f2516b480f7f339d8851a6035f18e100ee6a32bc0b8536a12a9c"}, + {file = "matplotlib-3.10.8-cp314-cp314-win_amd64.whl", hash = "sha256:83d282364ea9f3e52363da262ce32a09dfe241e4080dcedda3c0db059d3c1f11"}, + {file = "matplotlib-3.10.8-cp314-cp314-win_arm64.whl", hash = "sha256:2c1998e92cd5999e295a731bcb2911c75f597d937341f3030cc24ef2733d78a8"}, + {file = "matplotlib-3.10.8-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:b5a2b97dbdc7d4f353ebf343744f1d1f1cca8aa8bfddb4262fcf4306c3761d50"}, + {file = "matplotlib-3.10.8-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:3f5c3e4da343bba819f0234186b9004faba952cc420fbc522dc4e103c1985908"}, + {file = "matplotlib-3.10.8-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5f62550b9a30afde8c1c3ae450e5eb547d579dd69b25c2fc7a1c67f934c1717a"}, + {file = "matplotlib-3.10.8-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:495672de149445ec1b772ff2c9ede9b769e3cb4f0d0aa7fa730d7f59e2d4e1c1"}, + {file = "matplotlib-3.10.8-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:595ba4d8fe983b88f0eec8c26a241e16d6376fe1979086232f481f8f3f67494c"}, + {file = "matplotlib-3.10.8-cp314-cp314t-win_amd64.whl", hash = "sha256:25d380fe8b1dc32cf8f0b1b448470a77afb195438bafdf1d858bfb876f3edf7b"}, + {file = "matplotlib-3.10.8-cp314-cp314t-win_arm64.whl", hash = "sha256:113bb52413ea508ce954a02c10ffd0d565f9c3bc7f2eddc27dfe1731e71c7b5f"}, + {file = "matplotlib-3.10.8-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:f97aeb209c3d2511443f8797e3e5a569aebb040d4f8bc79aa3ee78a8fb9e3dd8"}, + {file = 
"matplotlib-3.10.8-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:fb061f596dad3a0f52b60dc6a5dec4a0c300dec41e058a7efe09256188d170b7"}, + {file = "matplotlib-3.10.8-pp310-pypy310_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:12d90df9183093fcd479f4172ac26b322b1248b15729cb57f42f71f24c7e37a3"}, + {file = "matplotlib-3.10.8-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:6da7c2ce169267d0d066adcf63758f0604aa6c3eebf67458930f9d9b79ad1db1"}, + {file = "matplotlib-3.10.8-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:9153c3292705be9f9c64498a8872118540c3f4123d1a1c840172edf262c8be4a"}, + {file = "matplotlib-3.10.8-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1ae029229a57cd1e8fe542485f27e7ca7b23aa9e8944ddb4985d0bc444f1eca2"}, + {file = "matplotlib-3.10.8.tar.gz", hash = "sha256:2299372c19d56bcd35cf05a2738308758d32b9eaed2371898d8f5bd33f084aa3"}, +] + +[package.dependencies] +contourpy = ">=1.0.1" +cycler = ">=0.10" +fonttools = ">=4.22.0" +kiwisolver = ">=1.3.1" +numpy = ">=1.23" +packaging = ">=20.0" +pillow = ">=8" +pyparsing = ">=3" +python-dateutil = ">=2.7" + +[package.extras] +dev = ["meson-python (>=0.13.1,<0.17.0)", "pybind11 (>=2.13.2,!=2.13.3)", "setuptools (>=64)", "setuptools_scm (>=7)"] + [[package]] name = "mdurl" version = "0.1.2" @@ -2551,6 +2902,75 @@ files = [ {file = "ninja-1.13.0.tar.gz", hash = "sha256:4a40ce995ded54d9dc24f8ea37ff3bf62ad192b547f6c7126e7e25045e76f978"}, ] +[[package]] +name = "numexpr" +version = "2.13.1" +description = "Fast numerical expression evaluator for NumPy" +optional = false +python-versions = ">=3.10" +files = [ + {file = "numexpr-2.13.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:bdbc2b93ac59667f0ba725b24cd3b5559c300e91e179d09c74ebaf8c8961eef6"}, + {file = "numexpr-2.13.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ad6b5dfc191c766e3ec89d2e3f956f7ef3181a1f8bf2bb00ec48fb3bf97b44ac"}, + {file = "numexpr-2.13.1-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a12dbd4c07a8303c6f01cdade531d75c9b4f5b8f72cbe5821d8f9197ee6fba47"}, + {file = "numexpr-2.13.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2de5c8ca2f25690d48e475d53a3524876164227cf4044743818f5704c28a8639"}, + {file = "numexpr-2.13.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:533ec2d77fc059e3868e9798ef2f13ab57161517cd2e0c521bb33d1dc99068ca"}, + {file = "numexpr-2.13.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:a75ddffc36f6b7a679fbc7df492685aed7e8888aec80ec2cd8e30f21fc019caa"}, + {file = "numexpr-2.13.1-cp310-cp310-win32.whl", hash = "sha256:790af35095626ad2d02201c56ac2d49ae45fc95a02af85f40808752ed32ee103"}, + {file = "numexpr-2.13.1-cp310-cp310-win_amd64.whl", hash = "sha256:aadf3118b6ef87294277ffb77a9562970228341aaaa4b78de634a43ea8ea2c6e"}, + {file = "numexpr-2.13.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:bdf62745e072c670151c0705bddfe3f33c341dacb7eb255ddb1e8d2a257bfef5"}, + {file = "numexpr-2.13.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:91cf0521d8fed3f804640c4a6d22b5d9813d7e64b32c38215de163c7f092f7cc"}, + {file = "numexpr-2.13.1-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:58e2f111756fff63e27e495473d950e4c98bbebca55aa1572798b59110d6c84b"}, + {file = "numexpr-2.13.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5a5a37b74561ed8dbd5f9be182d94419fa53f452e2d7d3e8d6dbef35a20f19f7"}, + {file = 
"numexpr-2.13.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:78cb76676e63f02dcf507e3c563888018a68b6a2e2cd444628e09df270dfd0b2"}, + {file = "numexpr-2.13.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d29b3351de4c43b56d2ef7f138ab7a8988e797291bcbbd56d545e4e7902f254a"}, + {file = "numexpr-2.13.1-cp311-cp311-win32.whl", hash = "sha256:912488ddbd500937bb6f4dfc010bdb3bf757a76e0b93db2f2c56db49ef6b9351"}, + {file = "numexpr-2.13.1-cp311-cp311-win_amd64.whl", hash = "sha256:66d0292f3b9dc5faadb4dd8a89d733321ff01c9699aee0c3cdbf513c9505e39c"}, + {file = "numexpr-2.13.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:6aa48c2f2bfa142dfe260441486452be8f70b5551c17bc846fccf76123d4a226"}, + {file = "numexpr-2.13.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:67a3dd8b51e94251f535a9a404f1ac939a3ebeb9398caad20ae9d0de37c6d3b3"}, + {file = "numexpr-2.13.1-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ca152998d44ea30b45ad6b8a050ac4a9408b61a17508df87ad0d919335d79b44"}, + {file = "numexpr-2.13.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b4280c8f7cc024846be8fdd6582572bb0b6bad98fb2a68a367ef5e6e2e130d5f"}, + {file = "numexpr-2.13.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:b86e1daa4e27d6bf6304008ed4630a055babf863db2ec8f282b4058bbfe466bd"}, + {file = "numexpr-2.13.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:30d189fc52ee4a33b869a0592553cd2ed686c20cded21b2ddf347a4d143f1bea"}, + {file = "numexpr-2.13.1-cp312-cp312-win32.whl", hash = "sha256:e926b59d385de2396935b362143ac2c282176875cf8ee7baba0a150b58421b5c"}, + {file = "numexpr-2.13.1-cp312-cp312-win_amd64.whl", hash = "sha256:8230a8f7cd4e6ba4022643c85e119aa4ca90412267ef20acdf1f54fb3136680d"}, + {file = "numexpr-2.13.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0e4314ee477a2cfb9ecf4b15f2ef24bf7859f62b35de3caef297136ff25bb0b0"}, + {file = "numexpr-2.13.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:d82d088f67647861b61a7b0e0148fd7487000a20909d65734821dd27e0839a68"}, + {file = "numexpr-2.13.1-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c615b13976e6332336a052d5b03be1fed231bc1afe07699f4c7cc116c7c3092c"}, + {file = "numexpr-2.13.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4874124bccc3c2462558ad2a75029bcc2d1c63ee4914b263bb06339e757efb85"}, + {file = "numexpr-2.13.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:0fc7b5b0f8d7ba6c81e948b1d967a56097194c894e4f57852ed8639fc653def2"}, + {file = "numexpr-2.13.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e22104ab53f0933b5b522829149990cb74e0a8ec4b69ff0e6545eb4641b3f013"}, + {file = "numexpr-2.13.1-cp313-cp313-win32.whl", hash = "sha256:824aea72663ec123e042341cea4a2a2b3c71f315e4bc58ee5035ffc7f945bd29"}, + {file = "numexpr-2.13.1-cp313-cp313-win_amd64.whl", hash = "sha256:9c7b1c3e9f398a5b062d9740c48ca454238bf1be433f0f75fe68619527bb7f1a"}, + {file = "numexpr-2.13.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:366a7887c2bad86e6f64666e178886f606cf8e81a6871df450d19f0f83421501"}, + {file = "numexpr-2.13.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:33ff9f071d06aaa0276cb5e2369efd517fe155ea091e43790f1f8bfd85e64d29"}, + {file = "numexpr-2.13.1-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c29a204b1d35941c088ec39a79c2e83e382729e4066b4b1f882aa5f70bf929a8"}, + {file = "numexpr-2.13.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:40e02db74d66c5b0a81c925838f42ec2d58cc99b49cbaf682f06ac03d9ff4102"}, + {file = "numexpr-2.13.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:36bd9a2b9bda42506377c7510c61f76e08d50da77ffb86a7a15cc5d57c56bb0f"}, + {file = "numexpr-2.13.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:b9203651668a3994cf3fe52e079ff6be1c74bf775622edbc226e94f3d8ec8ec4"}, + {file = "numexpr-2.13.1-cp313-cp313t-win32.whl", hash = "sha256:b73774176b15fe88242e7ed174b5be5f2e3e830d2cd663234b1495628a30854c"}, + {file = "numexpr-2.13.1-cp313-cp313t-win_amd64.whl", hash = "sha256:b9e6228db24b7faa96fbb2beee55f90fc8b0fe167cf288f8481c53ff5e95865a"}, + {file = "numexpr-2.13.1-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:cbadcbd2cf0822d595ccf5345c69478e9fe42d556b9823e6b0636a3efdf990f0"}, + {file = "numexpr-2.13.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:a189d514e8aa321ef1c650a2873000c08f843b3e3e66d69072005996ac25809c"}, + {file = "numexpr-2.13.1-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b6b01e9301bed8f89f6d561d79dcaa8731a75cc50efc072526cfbc07df74226c"}, + {file = "numexpr-2.13.1-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d7749e8c0ff0bae41a534e56fab667e529f528645a0216bb64260773ae8cb697"}, + {file = "numexpr-2.13.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:0b0f326542185c23fca53e10fee3c39bdadc8d69a03c613938afaf3eea31e77f"}, + {file = "numexpr-2.13.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:33cc6d662a606cc5184c7faef1d7b176474a8c46b8b0d2df9ff0fa67ed56425f"}, + {file = "numexpr-2.13.1-cp314-cp314-win32.whl", hash = "sha256:71f442fd01ebfa77fce1bac37f671aed3c0d47a55e460beac54b89e767fbc0fa"}, + {file = "numexpr-2.13.1-cp314-cp314-win_amd64.whl", hash = "sha256:208cd9422d87333e24deb2fe492941cd13b65dc8b9ce665de045a0be89e9a254"}, + {file = "numexpr-2.13.1-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:37d31824b9c021078046bb2aa36aa1da23edaa7a6a8636ee998bf89a2f104722"}, + {file = "numexpr-2.13.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:15cee07c74e4792993cd2ecd46c5683815e8758ac56e1d4d236d2c9eb9e8ae01"}, + {file = "numexpr-2.13.1-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:65cb46136f068ede2fc415c5f3d722f2c7dde3eda04ceafcfbcac03933f5d997"}, + {file = "numexpr-2.13.1-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:abc3c1601380c90659b9ac0241357c5788ab58de148f56c5f98adffe293c308c"}, + {file = "numexpr-2.13.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:2836e900377ce27e99c043a35e008bc911c51781cea47623612a4e498dfa9592"}, + {file = "numexpr-2.13.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:f4e4c5b38bb5695fff119672c3462d9a36875256947bafb2df4117b3271fd6a3"}, + {file = "numexpr-2.13.1-cp314-cp314t-win32.whl", hash = "sha256:156591eb23684542fd53ca1cbefff872c47c429a200655ef7e59dd8c03eeeaef"}, + {file = "numexpr-2.13.1-cp314-cp314t-win_amd64.whl", hash = "sha256:a2cc21b2d2e59db63006f190dbf20f5485dd846770870504ff2a72c8d0406e4e"}, + {file = "numexpr-2.13.1.tar.gz", hash = "sha256:ecb722249c2d6ed7fefe8504bb17e056481a5f31233c23a7ee02085c3d661fa1"}, +] + +[package.dependencies] +numpy = ">=1.23.0" + [[package]] name = "numpy" version = "1.26.4" @@ -4060,6 +4480,20 @@ nvidia-ml-py = ">=12.0.0" [package.extras] test = ["pytest (>=3.6)", "pytest-cov", "pytest-runner"] +[[package]] +name = "pyparsing" +version = "3.2.5" +description = "pyparsing - Classes and methods to define and execute parsing grammars" +optional = 
false +python-versions = ">=3.9" +files = [ + {file = "pyparsing-3.2.5-py3-none-any.whl", hash = "sha256:e38a4f02064cf41fe6593d328d0512495ad1f3d8a91c4f73fc401b3079a59a5e"}, + {file = "pyparsing-3.2.5.tar.gz", hash = "sha256:2df8d5b7b2802ef88e8d016a2eb9c7aeaa923529cd251ed0fe4608275d4105b6"}, +] + +[package.extras] +diagrams = ["jinja2", "railroad-diagrams"] + [[package]] name = "pyproject-hooks" version = "1.2.0" @@ -5210,19 +5644,56 @@ opt-einsum = ["opt-einsum (>=3.3)"] optree = ["optree (>=0.13.0)"] pyyaml = ["pyyaml"] +[[package]] +name = "torch-c-dlpack-ext" +version = "0.1.3" +description = "torch c dlpack ext" +optional = false +python-versions = ">=3.9" +files = [ + {file = "torch_c_dlpack_ext-0.1.3-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:49f8a1eaea21443c338df7bcf93f9026274b910ab23850777a88db040608c0a1"}, + {file = "torch_c_dlpack_ext-0.1.3-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6e2cb08aa7591a08b4992fc99b10e86b46a65d9a46c34d9697e8fab03bfcaf46"}, + {file = "torch_c_dlpack_ext-0.1.3-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c9ae36f7d4ccd4a9806528fa8dc8f0e3cfc47530adff8c7b6a72762bc97643b0"}, + {file = "torch_c_dlpack_ext-0.1.3-cp310-cp310-win_amd64.whl", hash = "sha256:f92a0582cfa28418924f94bd6b89f662555d73dcc7ca0de1cad78a4f04ebca26"}, + {file = "torch_c_dlpack_ext-0.1.3-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:770fd7618973f70bfea288d5c419bdf974fc578e84248341524bb1ed20b969fd"}, + {file = "torch_c_dlpack_ext-0.1.3-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:71de2233ff974f09379e84699af88e83aeb63dd885627123f745780ff592d15c"}, + {file = "torch_c_dlpack_ext-0.1.3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:78b963243b5b0e7d463fab365f31ec1569223845942f6591ab2ac067ad0f0338"}, + {file = "torch_c_dlpack_ext-0.1.3-cp311-cp311-win_amd64.whl", hash = "sha256:b0244f282e0e74f2cefa843caeb601f5acfd88342029b0ca901dd40ab883818b"}, + {file = "torch_c_dlpack_ext-0.1.3-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:2b7d64453fa62c75551f2413cde55748a3461af475da386b2e709239555e07c3"}, + {file = "torch_c_dlpack_ext-0.1.3-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:cd69fb034cd638eb0908767d74e5d0ea87df18d366b18d66c2c3472b29c80e5e"}, + {file = "torch_c_dlpack_ext-0.1.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8ebf732b5079912e0b85f32a75bae6932f021fbc13c2dff1c9f7cea437b71345"}, + {file = "torch_c_dlpack_ext-0.1.3-cp312-cp312-win_amd64.whl", hash = "sha256:69685ac393f1f402c289ac04435120d518bde890388474fe2f8a58e7d290eb50"}, + {file = "torch_c_dlpack_ext-0.1.3-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:5f87b18064c017edb240b1766e858d18fe9472c11180a2811216293376ba6ef0"}, + {file = "torch_c_dlpack_ext-0.1.3-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2afc7165195f4a256aab16147040d63a0cc55b7c530946d9726125268a54303a"}, + {file = "torch_c_dlpack_ext-0.1.3-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:96743df478df006b21ae18111f4a2528abcc46131389b8d99176c37c30559474"}, + {file = "torch_c_dlpack_ext-0.1.3-cp313-cp313-win_amd64.whl", hash = "sha256:74f491fe1ec64ff631a4844ef87339a1e825d375d87bad79ec8e9b922292a043"}, + {file = "torch_c_dlpack_ext-0.1.3-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:61d17b3be0c43c846e8ff4c54e5f05a35daeb8453fb14cec05742fcce41bada7"}, + {file = 
"torch_c_dlpack_ext-0.1.3-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:aa8bf3a52fc13306866282e204ee6979a0cabaf64c8ef8d6ee700d4c4b2519a1"}, + {file = "torch_c_dlpack_ext-0.1.3-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5fa48bb2e613c3a1fec135edbde1c7923a20b7dc3a5a3f2d17be7e0a7d333b18"}, + {file = "torch_c_dlpack_ext-0.1.3-cp314-cp314-win_amd64.whl", hash = "sha256:d7344b830359c4ef3165c10a82de96daf711a38c21b18b82c30d9d8dcd3e4529"}, + {file = "torch_c_dlpack_ext-0.1.3-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:b81bfa08d3dc791f808610e1abf0603c745b8c82681009a089b3dae650b6ff61"}, + {file = "torch_c_dlpack_ext-0.1.3-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d079d66404ec3911c02d4fd4cd41f42ef56f1ebdd5ecd68bcc2f425cbd12d08e"}, + {file = "torch_c_dlpack_ext-0.1.3-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d03bf108eab58b2c6dbe7e94211f670422c961b1e1e32fbaec442d5359ac02bf"}, + {file = "torch_c_dlpack_ext-0.1.3-cp39-cp39-win_amd64.whl", hash = "sha256:5ee661e6b910e67200ba7c911436a5af8be288f938883971a0cf5632645183c8"}, + {file = "torch_c_dlpack_ext-0.1.3.tar.gz", hash = "sha256:4b5da66432af7224dcf02aad4f13cc416eeef5331cd153588b7e081a193f4972"}, +] + +[package.dependencies] +torch = "*" + [[package]] name = "torchao" -version = "0.14.1" +version = "0.15.0" description = "Package for applying ao techniques to GPU models" optional = false python-versions = "*" files = [ - {file = "torchao-0.14.1-cp310-abi3-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:50f68db5e41952e88daa383fc2f358541e617654f388f508d5c7580c3bee9447"}, - {file = "torchao-0.14.1-py3-none-any.whl", hash = "sha256:c9896e14531817bc2ca6847b3fe71c42592ab80a43628b36668b2d6d6713fb5b"}, + {file = "torchao-0.15.0-cp310-abi3-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1cbe813201314ba6329a650a76944502f3e8ec4b1b44523f3f48676810d8d1f6"}, + {file = "torchao-0.15.0-py3-none-any.whl", hash = "sha256:3f3812676048ef8a2a0e9d492d12d8971ba7a7ebb16f54aa56f690414e130d2c"}, ] [package.extras] -dev = ["bitsandbytes", "blobfile", "cmake (>=3.19.0,<4.0.0)", "diskcache", "expecttest", "fire", "hypothesis", "importlib_metadata", "lm_eval", "matplotlib", "ninja", "packaging", "pandas", "parameterized", "pre-commit", "pycocotools", "pytest", "ruff (==0.11.6)", "sentencepiece", "tabulate", "tiktoken", "tqdm", "transformers", "unittest-xml-reporting"] +dev = ["bitsandbytes", "blobfile", "cmake (>=3.19.0,<4.0.0)", "diskcache", "expecttest", "fire", "hypothesis", "importlib_metadata", "lm_eval", "matplotlib", "ninja", "packaging", "pandas", "parameterized", "pre-commit", "pycocotools", "pytest (==8.4.2)", "ruff (==0.11.6)", "sentencepiece", "tabulate", "tiktoken", "tqdm", "transformers", "unittest-xml-reporting"] [[package]] name = "torchprofile" @@ -5856,4 +6327,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.0" python-versions = ">=3.10,<3.13" -content-hash = "955bcecb84ae2d8555ba7c10772099be9a6451a8a00f61d5aa3b86d2666a4ef6" +content-hash = "d44d9d44355bac8ca580030e7e4eeb0a7cfdff7cf25045ffd8f38d077b27306c" diff --git a/security_scanning/pyproject.toml b/security_scanning/pyproject.toml index 4253fe2ac00..a9a9aa0e2a2 100644 --- a/security_scanning/pyproject.toml +++ b/security_scanning/pyproject.toml @@ -62,6 +62,7 @@ llguidance = "0.7.29" jsonschema = "^4.25.1" backoff = "^2.2.1" nvtx = "^0.2.14" +matplotlib = "^3.10.8" meson = "^1.10.0" ninja = "^1.13.0" etcd3 = {git = 
"https://github.com/kragniz/python-etcd3.git", rev = "e58a899579ba416449c4e225b61f039457c8072a"} @@ -73,7 +74,9 @@ blobfile = "^3.1.0" openai-harmony = "0.0.4" nvidia-cutlass-dsl = "4.3.1" plotly = "^6.5.0" +numexpr = "<2.14.0" partial-json-parser = "^0.2.1.1.post7" +torch-c-dlpack-ext = "0.1.3" mistral-common = "1.8.6" torchao = ">=0.14.1" diff --git a/security_scanning/triton_backend/poetry.lock b/security_scanning/triton_backend/poetry.lock index 159351cf115..b530fa57c39 100644 --- a/security_scanning/triton_backend/poetry.lock +++ b/security_scanning/triton_backend/poetry.lock @@ -842,17 +842,17 @@ testing = ["black (==22.3)", "datasets", "numpy", "pytest", "pytest-asyncio", "r [[package]] name = "torchao" -version = "0.14.1" +version = "0.15.0" description = "Package for applying ao techniques to GPU models" optional = false python-versions = "*" files = [ - {file = "torchao-0.14.1-cp310-abi3-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:50f68db5e41952e88daa383fc2f358541e617654f388f508d5c7580c3bee9447"}, - {file = "torchao-0.14.1-py3-none-any.whl", hash = "sha256:c9896e14531817bc2ca6847b3fe71c42592ab80a43628b36668b2d6d6713fb5b"}, + {file = "torchao-0.15.0-cp310-abi3-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1cbe813201314ba6329a650a76944502f3e8ec4b1b44523f3f48676810d8d1f6"}, + {file = "torchao-0.15.0-py3-none-any.whl", hash = "sha256:3f3812676048ef8a2a0e9d492d12d8971ba7a7ebb16f54aa56f690414e130d2c"}, ] [package.extras] -dev = ["bitsandbytes", "blobfile", "cmake (>=3.19.0,<4.0.0)", "diskcache", "expecttest", "fire", "hypothesis", "importlib_metadata", "lm_eval", "matplotlib", "ninja", "packaging", "pandas", "parameterized", "pre-commit", "pycocotools", "pytest", "ruff (==0.11.6)", "sentencepiece", "tabulate", "tiktoken", "tqdm", "transformers", "unittest-xml-reporting"] +dev = ["bitsandbytes", "blobfile", "cmake (>=3.19.0,<4.0.0)", "diskcache", "expecttest", "fire", "hypothesis", "importlib_metadata", "lm_eval", "matplotlib", "ninja", "packaging", "pandas", "parameterized", "pre-commit", "pycocotools", "pytest (==8.4.2)", "ruff (==0.11.6)", "sentencepiece", "tabulate", "tiktoken", "tqdm", "transformers", "unittest-xml-reporting"] [[package]] name = "tqdm" diff --git a/setup.py b/setup.py index 5c61029aad8..094ca01467e 100644 --- a/setup.py +++ b/setup.py @@ -114,9 +114,9 @@ def has_ext_modules(self): 'libs/libnvinfer_plugin_tensorrt_llm.so', 'libs/libtensorrt_llm_ucx_wrapper.so', 'libs/libdecoder_attention_0.so', 'libs/libtensorrt_llm_nixl_wrapper.so', 'libs/nixl/**/*', - 'libs/ucx/**/*', 'libs/libpg_utils.so', - 'libs/libdecoder_attention_1.so', 'libs/nvshmem/License.txt', - 'libs/nvshmem/nvshmem_bootstrap_uid.so.3', + 'libs/libtensorrt_llm_mooncake_wrapper.so', 'libs/ucx/**/*', + 'libs/libpg_utils.so', 'libs/libdecoder_attention_1.so', + 'libs/nvshmem/License.txt', 'libs/nvshmem/nvshmem_bootstrap_uid.so.3', 'libs/nvshmem/nvshmem_transport_ibgda.so.103', 'bindings.*.so', 'deep_ep/LICENSE', 'deep_ep/*.py', 'deep_ep_cpp_tllm.*.so', "include/**/*", 'deep_gemm/LICENSE', 'deep_gemm/include/**/*', diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/rms_norm.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/rms_norm.py index 4265217453d..7ce9b7befa8 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/rms_norm.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/rms_norm.py @@ -2,6 +2,8 @@ import flashinfer import torch +import torch.nn.functional as F +from einops import rearrange from ...flashinfer_utils import get_env_enable_pdl from 
...modules.mamba.layernorm_gated import _layer_norm_fwd @@ -159,3 +161,35 @@ def _triton_rmsnorm_gated_meta( assert gate.shape == x.shape, "gate must match x shape" return x.new_empty(x.shape, dtype=torch.float32) + + +# Forked from: +# https://github.com/state-spaces/mamba/blob/6b32be06d026e170b3fdaf3ae6282c5a6ff57b06/mamba_ssm/ops/triton/layernorm_gated.py +# NOTES: +# 1. At time of writing (09/25/2025), the nano nemotron v2 modeling code expects `mamba_ssm` +# to be installed so as to be able to make use of its grouped gated RMS norm operation. +# We therefore replace it with one that uses einops + pytorch. +def gated_rms_norm_ref( + x, weight, bias, z=None, eps=1e-6, group_size=None, norm_before_gate=True, upcast=True +): + dtype = x.dtype + # N = x.shape[-1] + weight = weight.float() + bias = bias.float() if bias is not None else None + if upcast: + x = x.float() + z = z.float() if z is not None else z + if z is not None and not norm_before_gate: + x = x * F.silu(z) + if group_size is None: + rstd = 1 / torch.sqrt((x.square()).mean(dim=-1, keepdim=True) + eps) + out = (x * rstd * weight) + bias if bias is not None else (x * rstd * weight) + else: + x_group = rearrange(x, "... (g d) -> ... g d", d=group_size) + rstd = 1 / torch.sqrt((x_group.square()).mean(dim=-1, keepdim=True) + eps) + out = rearrange(x_group * rstd, "... g d -> ... (g d)") * weight + if bias is not None: + out = out + bias + if z is not None and norm_before_gate: + out *= F.silu(z) + return out.to(dtype) diff --git a/tensorrt_llm/_torch/auto_deploy/models/__init__.py b/tensorrt_llm/_torch/auto_deploy/models/__init__.py index 6eae19f23c5..327d084bf0b 100644 --- a/tensorrt_llm/_torch/auto_deploy/models/__init__.py +++ b/tensorrt_llm/_torch/auto_deploy/models/__init__.py @@ -1,4 +1,2 @@ -# TODO: When getting rid of the nemotron H patches, import `modeling_nemotron_h` here to ensure the -# custom model implementation is registered. from . 
import custom, hf, nemotron_flash, patches from .factory import * diff --git a/tensorrt_llm/_torch/auto_deploy/models/custom/__init__.py b/tensorrt_llm/_torch/auto_deploy/models/custom/__init__.py index fef9fdb1660..e32f72f56f9 100644 --- a/tensorrt_llm/_torch/auto_deploy/models/custom/__init__.py +++ b/tensorrt_llm/_torch/auto_deploy/models/custom/__init__.py @@ -1 +1,8 @@ from .modeling_nemotron_flash import NemotronFlashForCausalLM, NemotronFlashPreTrainedTokenizerFast +from .modeling_nemotron_h import NemotronHForCausalLM + +__all__ = ( + "NemotronFlashForCausalLM", + "NemotronFlashPreTrainedTokenizerFast", + "NemotronHForCausalLM", +) diff --git a/tensorrt_llm/_torch/auto_deploy/models/modeling_nemotron_h.py b/tensorrt_llm/_torch/auto_deploy/models/custom/modeling_nemotron_h.py similarity index 83% rename from tensorrt_llm/_torch/auto_deploy/models/modeling_nemotron_h.py rename to tensorrt_llm/_torch/auto_deploy/models/custom/modeling_nemotron_h.py index 6a54617497e..3756c054f76 100644 --- a/tensorrt_llm/_torch/auto_deploy/models/modeling_nemotron_h.py +++ b/tensorrt_llm/_torch/auto_deploy/models/custom/modeling_nemotron_h.py @@ -25,17 +25,14 @@ import torch import torch.nn.functional as F import torch.utils.checkpoint -from einops import rearrange from torch import nn from transformers.activations import ACT2FN from transformers.generation import GenerationMixin from transformers.modeling_utils import PreTrainedModel from transformers.utils import ModelOutput -from tensorrt_llm._torch.auto_deploy.models.patches.nemotron_h import ( - _nemotron_h_moe_forward, - _nemotron_h_topk_router_forward, -) +from tensorrt_llm._torch.auto_deploy.custom_ops.rms_norm import gated_rms_norm_ref +from tensorrt_llm._torch.auto_deploy.models.hf import AutoModelForCausalLMFactory class MambaRMSNormGated(torch.nn.Module): @@ -46,7 +43,7 @@ def __init__(self, hidden_size, group_size, eps=1e-5): self.group_size = group_size def forward(self, hidden_states, gate=None): - return _rms_norm_ref( + return gated_rms_norm_ref( x=hidden_states, weight=self.weight, bias=None, @@ -57,38 +54,6 @@ def forward(self, hidden_states, gate=None): ) -# Forked from: -# https://github.com/state-spaces/mamba/blob/6b32be06d026e170b3fdaf3ae6282c5a6ff57b06/mamba_ssm/ops/triton/layernorm_gated.py -# NOTES: -# 1. At time of writing (09/25/2025), the nano nemotron v2 modeling code expects `mamba_ssm` -# to be installed so as to be able to make use of its grouped gated RMS norm operation. -# We therefore replace it with one that uses einops + pytorch. -def _rms_norm_ref( - x, weight, bias, z=None, eps=1e-6, group_size=None, norm_before_gate=True, upcast=True -): - dtype = x.dtype - # N = x.shape[-1] - weight = weight.float() - bias = bias.float() if bias is not None else None - if upcast: - x = x.float() - z = z.float() if z is not None else z - if z is not None and not norm_before_gate: - x = x * F.silu(z) - if group_size is None: - rstd = 1 / torch.sqrt((x.square()).mean(dim=-1, keepdim=True) + eps) - out = (x * rstd * weight) + bias if bias is not None else (x * rstd * weight) - else: - x_group = rearrange(x, "... (g d) -> ... g d", d=group_size) - rstd = 1 / torch.sqrt((x_group.square()).mean(dim=-1, keepdim=True) + eps) - out = rearrange(x_group * rstd, "... g d -> ... 
(g d)") * weight - if bias is not None: - out = out + bias - if z is not None and norm_before_gate: - out *= F.silu(z) - return out.to(dtype) - - class NemotronHMamba2Mixer(nn.Module): """ Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`. @@ -149,9 +114,9 @@ def __init__(self, config, layer_idx: int): self.A_log._no_weight_decay = True # Instead of recomputing `torch.exp(self.A_log.float())` on every forward pass, we will register a hook # that sets this appropriately when loading weights. - # NOTE: we explicitly do NOT make this a `nn.Parameter` so that it does not appear in the state dict of - # this module, or an equivalent graph module trace from it. - self._minus_A = -A.float() + # NOTE: we explicitly register this as a non-persistent buffer so that it does not appear in the state dict of + # this module, or an equivalent graph module trace from it, but still gets included in e.g. `to()` calls. + self.register_buffer("_minus_A", -A.float(), persistent=False) self.norm = MambaRMSNormGated( self.intermediate_size, eps=self.layer_norm_epsilon, @@ -317,8 +282,43 @@ def __init__(self, config, layer_idx: Optional[int] = None): layer_idx=layer_idx, ) - # TODO: inline code from `_nemotron_h_moe_forward` when removing patches. - forward = _nemotron_h_moe_forward + def forward(self, hidden_states: torch.Tensor): + residuals = hidden_states + orig_shape = hidden_states.shape + topk_indices, topk_weights = self.gate(hidden_states) + x_flat = hidden_states.view(-1, hidden_states.shape[-1]) + + # NOTE: So far we've seen that the dispatch order in eager code is the same as the node order in the exported + # graph. + # We dispatch shared expert first so that we can easily fork the execution of the routed experts + # (using the custom op below) to an auxiliary stream. + shared_out = self.shared_experts(residuals) + # Check if this is a latent MOE (has fc1_latent_proj and fc2_latent_proj) + has_latent_proj = hasattr(self, "fc1_latent_proj") and hasattr(self, "fc2_latent_proj") + + if has_latent_proj: + # Latent MOE: project to latent space before routing + x_flat = self.fc1_latent_proj(x_flat) + + # Route through experts (operates in latent space if latent MOE, full space otherwise) + out_flat = torch.ops.auto_deploy.torch_moe( + x_flat, + topk_indices, + topk_weights, + w1_weight=[e.up_proj.weight for e in self.experts], + w2_weight=[e.down_proj.weight for e in self.experts], + w3_weight=[], + act_fn="relu2", + mlp_style="mlp", + ) + + if has_latent_proj: + # Latent MOE: project back from latent space + out_flat = self.fc2_latent_proj(out_flat) + + routed_out = out_flat.view(*orig_shape) + out = shared_out + routed_out + return out class NemotronHTopkRouter(nn.Module): @@ -339,22 +339,33 @@ def __init__(self, config): "e_score_correction_bias", torch.zeros(self.n_routed_experts, dtype=torch.float32) ) - forward = _nemotron_h_topk_router_forward + def forward(self, hidden_states): + """ + Forward pass for NemotronHTopkRouter using the optimized noaux_tc_op kernel. 
+ This replaces the original forward method which used pure PyTorch operations + with optimized CUDA kernels: + """ + hidden_states = hidden_states.view(-1, self.config.hidden_size) + if self.weight.dtype == torch.float32: + router_logits = F.linear(hidden_states.type(torch.float32), self.weight) + else: + router_logits = torch.ops.trtllm.dsv3_router_gemm_op( + hidden_states, self.weight.t(), bias=None, out_dtype=torch.float32 + ) -# Copied from transformers.models.llama.modeling_llama.repeat_kv -def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """ - This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, - num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand( - batch, num_key_value_heads, n_rep, slen, head_dim - ) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + # Use the fused noaux_tc_op kernel which applies sigmoid internally + # and performs group-based top-k selection with normalization + topk_weights, topk_indices = torch.ops.trtllm.noaux_tc_op( + router_logits, + self.e_score_correction_bias, + self.n_group, + self.topk_group, + self.top_k, + self.routed_scaling_factor, + ) + + return topk_indices, topk_weights class NemotronHAttention(nn.Module): @@ -369,8 +380,23 @@ def __init__(self, config, layer_idx: Optional[int] = None): self.hidden_size = config.hidden_size self.num_heads = config.num_attention_heads - if config.head_dim is not None: - self.head_dim = config.head_dim + + # At some point during NemotronH development, what used to be called `attention_head_dim` + # was renamed to `head_dim`. Since no configuration class's code (nor the modeling code, + # for that matter) was ever upstreamed into `transformers`, we have to resort to the below + # hack in order to support multiple iterations of NemotronH models. + if hasattr(config, "head_dim"): + head_dim = config.head_dim + elif hasattr(config, "attention_head_dim"): + head_dim = config.attention_head_dim + else: + raise AttributeError( + "Expected either `head_dim` or `attention_head_dim` to be present in the config " + "class, found neither." + ) + + if head_dim is not None: + self.head_dim = head_dim else: self.head_dim = config.hidden_size // config.num_attention_heads self.num_key_value_heads = config.num_key_value_heads @@ -594,7 +620,4 @@ def forward( return NemotronHCausalLMOutput(logits) -# TODO: uncomment after removing patches (and make sure it is imported in `__init__.py`). 
-# from tensorrt_llm._torch.auto_deploy.models.hf import AutoModelForCausalLMFactory -# -# AutoModelForCausalLMFactory.register_custom_model_cls("NemotronHConfig", NemotronHForCausalLM) +AutoModelForCausalLMFactory.register_custom_model_cls("NemotronHConfig", NemotronHForCausalLM) diff --git a/tensorrt_llm/_torch/auto_deploy/models/patches/nemotron_h.py b/tensorrt_llm/_torch/auto_deploy/models/patches/nemotron_h.py index 095e47f299d..e69de29bb2d 100644 --- a/tensorrt_llm/_torch/auto_deploy/models/patches/nemotron_h.py +++ b/tensorrt_llm/_torch/auto_deploy/models/patches/nemotron_h.py @@ -1,200 +0,0 @@ -import contextlib -import importlib.util -import sys -import types -from typing import Callable, Dict, List, Optional, Tuple - -import torch -import torch.nn.functional as F -from einops import rearrange -from transformers import AutoModelForCausalLM - -from tensorrt_llm._torch.auto_deploy.models.patches.bamba import _bamba_mixer_torch_forward - - -# Forked from: -# https://github.com/state-spaces/mamba/blob/6b32be06d026e170b3fdaf3ae6282c5a6ff57b06/mamba_ssm/ops/triton/layernorm_gated.py -# NOTES: -# 1. At time of writing (09/25/2025), the nano nemotron v2 modeling code expects `mamba_ssm` -# to be installed so as to be able to make use of its grouped gated RMS norm operation. -# We therefore replace it with one that uses einops + pytorch. -def _rms_norm_ref( - x, weight, bias, z=None, eps=1e-6, group_size=None, norm_before_gate=True, upcast=True -): - dtype = x.dtype - # N = x.shape[-1] - weight = weight.float() - bias = bias.float() if bias is not None else None - if upcast: - x = x.float() - z = z.float() if z is not None else z - if z is not None and not norm_before_gate: - x = x * F.silu(z) - if group_size is None: - rstd = 1 / torch.sqrt((x.square()).mean(dim=-1, keepdim=True) + eps) - out = (x * rstd * weight) + bias if bias is not None else (x * rstd * weight) - else: - x_group = rearrange(x, "... (g d) -> ... g d", d=group_size) - rstd = 1 / torch.sqrt((x_group.square()).mean(dim=-1, keepdim=True) + eps) - out = rearrange(x_group * rstd, "... g d -> ... (g d)") * weight - if bias is not None: - out = out + bias - if z is not None and norm_before_gate: - out *= F.silu(z) - return out.to(dtype) - - -# The original implementation looks at `cache_position[0]` to decide what to do which does not -# play well with export. Plus, we do not want it to be updated anyway. 
-def _nemotron_h_model_update_mamba_mask(self, attention_mask, cache_position): - return None - - -def _nemotron_h_model_update_causal_mask(self, attention_mask, input_tensor, cache_position): - # Force attention to use causal mode without explicit masks - return None - - -def _nemotron_h_block_forward( - self, - hidden_states, - cache_params=None, - cache_position: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.Tensor] = None, -): - device = hidden_states.device - with contextlib.ExitStack() as stack: - if device.type == "cuda": - stack.enter_context(torch.cuda.stream(torch.cuda.default_stream(device))) - # * Use torch.cuda.stream() to avoid NaN issues when using multiple GPUs - residual = hidden_states - hidden_states = self.norm(hidden_states.to(dtype=self.norm.weight.dtype)) - if self.residual_in_fp32: - residual = residual.to(torch.float32) - - if self.block_type == "mamba": - hidden_states = self.mixer( - hidden_states, cache_params=cache_params, cache_position=cache_position - ) - elif self.block_type == "attention": - hidden_states = self.mixer(hidden_states, cache_position=cache_position) - hidden_states = hidden_states[0] - elif self.block_type in ["mlp", "moe"]: - hidden_states = self.mixer(hidden_states) - else: - raise ValueError(f"Invalid block_type: {self.block_type}") - - hidden_states = residual + hidden_states - return hidden_states - - -def _nemotron_h_topk_router_forward(self, hidden_states): - """ - Forward pass for NemotronHTopkRouter using the optimized noaux_tc_op kernel. - - This replaces the original forward method which used pure PyTorch operations - with optimized CUDA kernels: - """ - hidden_states = hidden_states.view(-1, self.config.hidden_size) - if self.weight.dtype == torch.float32: - router_logits = F.linear(hidden_states.type(torch.float32), self.weight) - else: - router_logits = torch.ops.trtllm.dsv3_router_gemm_op( - hidden_states, self.weight.t(), bias=None, out_dtype=torch.float32 - ) - - # Use the fused noaux_tc_op kernel which applies sigmoid internally - # and performs group-based top-k selection with normalization - topk_weights, topk_indices = torch.ops.trtllm.noaux_tc_op( - router_logits, - self.e_score_correction_bias, - self.n_group, - self.topk_group, - self.top_k, - self.routed_scaling_factor, - ) - - return topk_indices, topk_weights - - -# Note: we assume experts have no bias for now -def _nemotron_h_moe_forward(self, hidden_states: torch.Tensor): - """ - Uses NemotronH router (returns indices, weights) and dispatches through auto_deploy::torch_moe - with act_fn='relu2'. Handles both latent MOE and direct MOE architectures. - """ - - residuals = hidden_states - orig_shape = hidden_states.shape - topk_indices, topk_weights = self.gate(hidden_states) - x_flat = hidden_states.view(-1, hidden_states.shape[-1]) - - # NOTE: So far we've seen that the dispatch order in eager code is the same as the node order in the exported graph. - # We dispatch shared expert first so that we can easily fork the execution of the routed experts - # (using the custom op below) to an auxiliary stream. 
- shared_out = self.shared_experts(residuals) - # Check if this is a latent MOE (has fc1_latent_proj and fc2_latent_proj) - has_latent_proj = hasattr(self, "fc1_latent_proj") and hasattr(self, "fc2_latent_proj") - - if has_latent_proj: - # Latent MOE: project to latent space before routing - x_flat = self.fc1_latent_proj(x_flat) - - # Route through experts (operates in latent space if latent MOE, full space otherwise) - out_flat = torch.ops.auto_deploy.torch_moe( - x_flat, - topk_indices, - topk_weights, - w1_weight=[e.up_proj.weight for e in self.experts], - w2_weight=[e.down_proj.weight for e in self.experts], - w3_weight=[], - act_fn="relu2", - mlp_style="mlp", - ) - - if has_latent_proj: - # Latent MOE: project back from latent space - out_flat = self.fc2_latent_proj(out_flat) - - routed_out = out_flat.view(*orig_shape) - out = shared_out + routed_out - return out - - -_from_config_original = AutoModelForCausalLM.from_config - -CUSTOM_MODULE_PATCHES: Dict[str, List[Tuple[str, Callable]]] = { - "NemotronHMamba2Mixer": [("forward", _bamba_mixer_torch_forward)], - "NemotronHModel": [ - ("_update_causal_mask", _nemotron_h_model_update_causal_mask), - ("_update_mamba_mask", _nemotron_h_model_update_mamba_mask), - ], - "NemotronHBlock": [("forward", _nemotron_h_block_forward)], - "NemotronHMOE": [("forward", _nemotron_h_moe_forward)], - "NemotronHTopkRouter": [("forward", _nemotron_h_topk_router_forward)], -} - - -def get_model_from_config_patched(config, **kwargs): - model = _from_config_original(config, **kwargs) - # Patch modules - for _, module in model.named_modules(): - if (module_name := type(module).__name__) in CUSTOM_MODULE_PATCHES.keys(): - patches = CUSTOM_MODULE_PATCHES[module_name] - for method_name, method_patch in patches: - setattr(module, method_name, types.MethodType(method_patch, module)) - - return model - - -# TODO: figure out how this can be incorporated into the export patch system -AutoModelForCausalLM.from_config = get_model_from_config_patched - -# TODO: figure out how this can be incorporated into the export patch system -# Only patch if the module isn't available -_mamba_ssm_module = "mamba_ssm" -_mamba_ssm_submodule = f"{_mamba_ssm_module}.ops.triton.layernorm_gated" -if importlib.util.find_spec(_mamba_ssm_module) is None: - stub_mod = types.ModuleType(_mamba_ssm_submodule) - stub_mod.rmsnorm_fn = _rms_norm_ref - sys.modules[_mamba_ssm_submodule] = stub_mod diff --git a/tensorrt_llm/_torch/auto_deploy/transform/library/quantization.py b/tensorrt_llm/_torch/auto_deploy/transform/library/quantization.py index 28c61e74dd4..2fdaaf55067 100644 --- a/tensorrt_llm/_torch/auto_deploy/transform/library/quantization.py +++ b/tensorrt_llm/_torch/auto_deploy/transform/library/quantization.py @@ -123,7 +123,7 @@ def _apply( cnt += 1 return gm, TransformInfo( - skipped=False, num_matches=cnt, is_clean=False, has_valid_shapes=True + skipped=False, num_matches=cnt, is_clean=False, has_valid_shapes=(cnt == 0) ) def _insert_quantized_linear( diff --git a/tensorrt_llm/_torch/auto_deploy/transform/library/rms_norm.py b/tensorrt_llm/_torch/auto_deploy/transform/library/rms_norm.py index 36c2e683bf6..860b5b7de5b 100644 --- a/tensorrt_llm/_torch/auto_deploy/transform/library/rms_norm.py +++ b/tensorrt_llm/_torch/auto_deploy/transform/library/rms_norm.py @@ -7,8 +7,8 @@ from pydantic import Field from torch.fx import GraphModule +from ...custom_ops.rms_norm import gated_rms_norm_ref from ...models.factory import ModelFactory -from ...models.patches.nemotron_h import _rms_norm_ref from 
...shim.interface import CachedSequenceInterface # It is important to import ADPatternMatcherPass from pattern_matcher.py, not from torch._inductor.pattern_matcher @@ -225,7 +225,7 @@ def _gated_rmsnorm_pattern_ref( eps: float = 1e-5, group_size: int = 512, ) -> torch.Tensor: - y = _rms_norm_ref( + y = gated_rms_norm_ref( x, weight, bias=None, diff --git a/tensorrt_llm/_torch/pyexecutor/kv_cache_transceiver.py b/tensorrt_llm/_torch/pyexecutor/kv_cache_transceiver.py index 73ee3f5c7b5..5616be77081 100644 --- a/tensorrt_llm/_torch/pyexecutor/kv_cache_transceiver.py +++ b/tensorrt_llm/_torch/pyexecutor/kv_cache_transceiver.py @@ -42,6 +42,7 @@ def create_kv_cache_transceiver( cache_transceiver_config.backend = "NIXL" # Ordered by priority env_vars = [("TRTLLM_USE_UCX_KVCACHE", "UCX"), + ("TRTLLM_USE_MOONCAKE_KVCACHE", "MOONCAKE"), ("TRTLLM_USE_MPI_KVCACHE", "MPI")] for env_var, be_type in env_vars: if getenv(env_var) == "1": diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py index c2d5f23f50c..2f22f493406 100644 --- a/tensorrt_llm/llmapi/llm_args.py +++ b/tensorrt_llm/llmapi/llm_args.py @@ -1739,10 +1739,11 @@ class CacheTransceiverConfig(StrictBaseModel, PybindMirror): Configuration for the cache transceiver. """ - backend: Optional[Literal["DEFAULT", "UCX", "NIXL", "MPI"]] = Field( - default=None, - description= - "The communication backend type to use for the cache transceiver.") + backend: Optional[Literal[ + "DEFAULT", "UCX", "NIXL", "MOONCAKE", "MPI"]] = Field( + default=None, + description= + "The communication backend type to use for the cache transceiver.") max_tokens_in_buffer: Optional[int] = Field( default=None, diff --git a/tests/integration/defs/accuracy/test_disaggregated_serving.py b/tests/integration/defs/accuracy/test_disaggregated_serving.py index 31f04f99685..894114c0f4a 100644 --- a/tests/integration/defs/accuracy/test_disaggregated_serving.py +++ b/tests/integration/defs/accuracy/test_disaggregated_serving.py @@ -863,10 +863,7 @@ def test_auto_dtype_with_helix(self): "disable_overlap_scheduler": True, "kv_cache_config": kv_cache_config, "enable_chunked_prefill": False, - "cuda_graph_config": { - "enable_padding": True, - "batch_sizes": [1, 2, 4, 8, 16, 32, 64, 128] - }, + "cuda_graph_config": None, "cache_transceiver_config": { "backend": "UCX" }, diff --git a/tests/integration/defs/cpp/test_multi_gpu.py b/tests/integration/defs/cpp/test_multi_gpu.py index 7cf92efaadb..1124178cccc 100644 --- a/tests/integration/defs/cpp/test_multi_gpu.py +++ b/tests/integration/defs/cpp/test_multi_gpu.py @@ -25,6 +25,7 @@ class KVCacheType(Enum): MPI = auto() UCX = auto() NIXL = auto() + MOONCAKE = auto() def get_multi_gpu_env(kv_cache_type=KVCacheType.NONE, llama_multi_gpu=False): @@ -37,6 +38,9 @@ def get_multi_gpu_env(kv_cache_type=KVCacheType.NONE, llama_multi_gpu=False): env["TRTLLM_USE_UCX_KVCACHE"] = "1" case KVCacheType.NIXL: env["TRTLLM_USE_NIXL_KVCACHE"] = "1" + case KVCacheType.MOONCAKE: + env["TRTLLM_USE_MOONCAKE_KVCACHE"] = "1" + env["MC_FORCE_TCP"] = "1" case KVCacheType.NONE: pass case _: @@ -502,8 +506,9 @@ def test_fused_gemm_allreduce(build_google_tests, nprocs, build_dir): @pytest.mark.parametrize("build_google_tests", ["80", "86", "89", "90"], indirect=True) -@pytest.mark.parametrize("kvcache_type", [KVCacheType.NIXL, KVCacheType.UCX], - ids=["nixl_kvcache", "ucx_kvcache"]) +@pytest.mark.parametrize( + "kvcache_type", [KVCacheType.NIXL, KVCacheType.UCX, KVCacheType.MOONCAKE], + ids=["nixl_kvcache", "ucx_kvcache", "mooncake_kvcache"]) 
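The MOONCAKE backend plugs into the same environment-variable dispatch that already handles UCX, NIXL and MPI: `create_kv_cache_transceiver` falls back to NIXL and then walks a priority-ordered list of `TRTLLM_USE_*_KVCACHE` variables, and the new multi-GPU test case sets `TRTLLM_USE_MOONCAKE_KVCACHE=1` (plus `MC_FORCE_TCP=1`) to exercise it. A minimal sketch of that selection logic, assuming a hypothetical `resolve_backend` helper (the env-var list and default mirror the diff):

```python
from os import getenv

# Ordered by priority, mirroring the list in create_kv_cache_transceiver().
_KVCACHE_ENV_VARS = [
    ("TRTLLM_USE_UCX_KVCACHE", "UCX"),
    ("TRTLLM_USE_MOONCAKE_KVCACHE", "MOONCAKE"),
    ("TRTLLM_USE_MPI_KVCACHE", "MPI"),
]


def resolve_backend(default: str = "NIXL") -> str:
    """Return the first backend whose env var is set to "1", else the default."""
    for env_var, backend in _KVCACHE_ENV_VARS:
        if getenv(env_var) == "1":
            return backend
    return default


# Under the new mooncake test case (TRTLLM_USE_MOONCAKE_KVCACHE=1, MC_FORCE_TCP=1),
# resolve_backend() evaluates to "MOONCAKE"; with no overrides it stays "NIXL".
```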
@pytest.mark.parametrize("nprocs", [2, 8], ids=["2proc", "8proc"]) def test_cache_transceiver(build_google_tests, nprocs, kvcache_type, build_dir): diff --git a/tests/integration/defs/perf/open_search_db_utils.py b/tests/integration/defs/perf/open_search_db_utils.py index 5824670d6f1..2687a730ce3 100644 --- a/tests/integration/defs/perf/open_search_db_utils.py +++ b/tests/integration/defs/perf/open_search_db_utils.py @@ -58,6 +58,20 @@ "d_p99_e2el", ] +# Fields for scenario-only matching for recipe tests. +# Unlike regular tests that match on all config fields, recipes match only on the benchmark +# scenario, allowing the underlying config to change while still comparing against baselines +# for the same scenario. +SCENARIO_MATCH_FIELDS = [ + "s_runtime", + "s_model_name", + "s_gpu_type", + "l_isl", + "l_osl", + "l_concurrency", + "l_num_gpus", +] + def add_id(data): OpenSearchDB.add_id_of_json(data) diff --git a/tests/integration/defs/perf/test_perf.py b/tests/integration/defs/perf/test_perf.py index f55113f05c1..82891ca8470 100644 --- a/tests/integration/defs/perf/test_perf.py +++ b/tests/integration/defs/perf/test_perf.py @@ -29,7 +29,8 @@ print_warning) from ..conftest import get_llm_root, llm_models_root, trt_environment -from .open_search_db_utils import (add_id, get_history_data, get_job_info, +from .open_search_db_utils import (SCENARIO_MATCH_FIELDS, add_id, + get_history_data, get_job_info, post_new_perf_data, prepare_baseline_data, prepare_regressive_test_cases, write_regressive_test_cases) @@ -597,6 +598,11 @@ def __init__(self, server_config_data: dict, env_vars: str = ""): self.speculative_model_dir = speculative_config.get( 'speculative_model_dir', "") + # match_mode: "config" (default, 40+ fields) or "scenario" (benchmark scenario fields for recipe testing) + # When match_mode is "scenario", baselines are matched by scenario identity + # (model, gpu, isl, osl, concurrency, num_gpus) instead of full config fields. 
+ self.match_mode = server_config_data.get('match_mode', "config") + # Store filtered config for extra_llm_api_config (exclude name, model_name, gpus, client_configs) self.extra_llm_api_config_data = { k: v @@ -2438,9 +2444,12 @@ def prefix_server_config_dict(config_dict: dict, new_data_dict[cmd_idx] = new_data cmd_idx += 1 if not match_keys: - match_keys.append("s_runtime") - match_keys.extend(server_config_dict.keys()) - match_keys.extend(client_config_dict.keys()) + if server_config.match_mode == "scenario": + match_keys = SCENARIO_MATCH_FIELDS.copy() + else: + match_keys.append("s_runtime") + match_keys.extend(server_config_dict.keys()) + match_keys.extend(client_config_dict.keys()) elif self._config.runtime == "multi_node_disagg_server": if self._config.disagg_configs[0][ diff --git a/tests/integration/test_lists/qa/README.md b/tests/integration/test_lists/qa/README.md index 1a15c87ccfb..a0e3afb3dcf 100644 --- a/tests/integration/test_lists/qa/README.md +++ b/tests/integration/test_lists/qa/README.md @@ -59,6 +59,7 @@ This directory contains various test configuration files: - `llm_perf_full.yml` - Main performance test configuration - `llm_perf_cluster.yml` - Cluster-based performance tests - `llm_perf_sanity.yml` - Performance sanity checks +- `llm_config_database.yml` - Performance regression tests for the config database in `examples/configs/database` (auto-generated by `scripts/generate_config_database_tests.py`) - `llm_perf_nim.yml` - NIM-specific performance tests - `llm_trt_integration_perf.yml` - Integration performance tests - `llm_trt_integration_perf_sanity.yml` - Integration performance sanity checks @@ -77,7 +78,7 @@ QA tests are executed on a regular schedule: - **Weekly**: Automated regression testing - **Release**: Comprehensive validation before each release - **Full Cycle Testing**: - run all gpu with llm_function_core.txt + run NIM specific gpu with llm_function_nim.txt + run all gpu with llm_function_core.txt, run NIM specific gpu with llm_function_nim.txt, and run config database tests with llm_config_database.yml - **Sanity Cycle Testing**: run all gpu with llm_function_core_sanity.txt - **NIM Cycle Testing**: diff --git a/tests/integration/test_lists/qa/llm_config_database.yml b/tests/integration/test_lists/qa/llm_config_database.yml new file mode 100644 index 00000000000..15f14162b7b --- /dev/null +++ b/tests/integration/test_lists/qa/llm_config_database.yml @@ -0,0 +1,191 @@ +# =============================================================================== +# Config Database Performance Tests (AUTO-GENERATED) +# =============================================================================== +# Generated by: scripts/generate_config_database_tests.py +# +# These tests use scenario-only matching (match_mode: scenario) for baselines. +# Baselines are matched by (model, gpu, isl, osl, concurrency, num_gpus) instead +# of full config fields, allowing configs to evolve while maintaining comparison. 
+# +# To regenerate: +# python scripts/generate_config_database_tests.py +# =============================================================================== + +version: 0.0.1 +llm_config_database: +- condition: + wildcards: + gpu: + - '*b200*' + cpu: x86_64 + linux_distribution_name: ubuntu* + ranges: + system_gpu_count: + gte: 1 + tests: + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc4_gpu1] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc16_gpu1] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc64_gpu1] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc4_gpu1] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc16_gpu1] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc64_gpu1] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc4_gpu1] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc16_gpu1] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc64_gpu1] +- condition: + wildcards: + gpu: + - '*b200*' + cpu: x86_64 + linux_distribution_name: ubuntu* + ranges: + system_gpu_count: + gte: 2 + tests: + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc4_gpu2] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc16_gpu2] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc64_gpu2] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc4_gpu2] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc16_gpu2] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc64_gpu2] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc4_gpu2] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc16_gpu2] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc64_gpu2] +- condition: + wildcards: + gpu: + - '*b200*' + cpu: x86_64 + linux_distribution_name: ubuntu* + ranges: + system_gpu_count: + gte: 4 + tests: + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc4_gpu4] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc32_gpu4] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc256_gpu4] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc4_gpu4] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc32_gpu4] + - 
perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc256_gpu4] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc4_gpu4] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc16_gpu4] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc64_gpu4] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc4_gpu4] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc16_gpu4] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc64_gpu4] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc4_gpu4] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc16_gpu4] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc64_gpu4] +- condition: + wildcards: + gpu: + - '*b200*' + cpu: x86_64 + linux_distribution_name: ubuntu* + ranges: + system_gpu_count: + gte: 8 + tests: + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc4_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc32_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc256_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc4_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc32_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc256_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-deepseek_ai_DeepSeek_R1_0528_1024_1024_conc4_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-deepseek_ai_DeepSeek_R1_0528_1024_1024_conc16_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-deepseek_ai_DeepSeek_R1_0528_1024_1024_conc64_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-deepseek_ai_DeepSeek_R1_0528_8192_1024_conc4_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-deepseek_ai_DeepSeek_R1_0528_8192_1024_conc16_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-deepseek_ai_DeepSeek_R1_0528_8192_1024_conc64_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc4_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc16_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc64_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc4_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc16_gpu8] + - 
perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc64_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc4_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc16_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc64_gpu8] +- condition: + wildcards: + gpu: + - '*h200*' + cpu: x86_64 + linux_distribution_name: ubuntu* + ranges: + system_gpu_count: + gte: 1 + tests: + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc4_gpu1] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc16_gpu1] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc64_gpu1] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc4_gpu1] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc16_gpu1] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc64_gpu1] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc4_gpu1] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc16_gpu1] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc64_gpu1] +- condition: + wildcards: + gpu: + - '*h200*' + cpu: x86_64 + linux_distribution_name: ubuntu* + ranges: + system_gpu_count: + gte: 2 + tests: + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc4_gpu2] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc16_gpu2] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc64_gpu2] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc4_gpu2] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc16_gpu2] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc64_gpu2] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc4_gpu2] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc16_gpu2] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc64_gpu2] +- condition: + wildcards: + gpu: + - '*h200*' + cpu: x86_64 + linux_distribution_name: ubuntu* + ranges: + system_gpu_count: + gte: 4 + tests: + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc4_gpu4] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc16_gpu4] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc64_gpu4] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc4_gpu4] + - 
perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc16_gpu4] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc64_gpu4] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc4_gpu4] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc16_gpu4] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc64_gpu4] +- condition: + wildcards: + gpu: + - '*h200*' + cpu: x86_64 + linux_distribution_name: ubuntu* + ranges: + system_gpu_count: + gte: 8 + tests: + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-deepseek_ai_DeepSeek_R1_0528_1024_1024_conc4_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-deepseek_ai_DeepSeek_R1_0528_1024_1024_conc16_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-deepseek_ai_DeepSeek_R1_0528_1024_1024_conc64_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-deepseek_ai_DeepSeek_R1_0528_8192_1024_conc4_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-deepseek_ai_DeepSeek_R1_0528_8192_1024_conc16_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-deepseek_ai_DeepSeek_R1_0528_8192_1024_conc64_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc4_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc16_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc64_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc4_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc16_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc64_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc4_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc16_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc64_gpu8] diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200.yml b/tests/integration/test_lists/test-db/l0_dgx_b200.yml index 382dd135531..13a078c463a 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_b200.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_b200.yml @@ -181,4 +181,3 @@ l0_dgx_b200: - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[pp4-fp8kv=True-attn_backend=TRTLLM-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTEDSL-mtp_nextn=2-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp4_tp2pp2 - - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix diff --git a/tests/integration/test_lists/test-db/l0_dgx_h100.yml b/tests/integration/test_lists/test-db/l0_dgx_h100.yml index 63817ed9afd..4e90db0050b 
100644 --- a/tests/integration/test_lists/test-db/l0_dgx_h100.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_h100.yml @@ -231,6 +231,7 @@ l0_dgx_h100: - cpp/test_multi_gpu.py::test_cache_transceiver[2proc-ucx_kvcache-90] ISOLATION - cpp/test_multi_gpu.py::test_cache_transceiver[8proc-nixl_kvcache-90] ISOLATION - cpp/test_multi_gpu.py::test_cache_transceiver[8proc-ucx_kvcache-90] ISOLATION + - cpp/test_multi_gpu.py::test_cache_transceiver[8proc-mooncake_kvcache-90] ISOLATION - cpp/test_multi_gpu.py::test_user_buffer[2proc-90] - cpp/test_multi_gpu.py::test_enc_dec[t5-90] - cpp/test_multi_gpu.py::test_llama_executor[llama-orchestrator-90] diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_gpus.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_gpus.yml index 40fe6ed6750..503a3024a00 100644 --- a/tests/integration/test_lists/test-db/l0_gb200_multi_gpus.yml +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_gpus.yml @@ -69,6 +69,7 @@ l0_gb200_multi_gpus: - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep4-cutlass] - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[no_cuda_graph_overlap-cutlass] - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep4-trtllm] + - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix - condition: ranges: system_gpu_count: diff --git a/tests/integration/test_lists/test-db/l0_sanity_check.yml b/tests/integration/test_lists/test-db/l0_sanity_check.yml index 894bc21b1e7..f88ac773375 100644 --- a/tests/integration/test_lists/test-db/l0_sanity_check.yml +++ b/tests/integration/test_lists/test-db/l0_sanity_check.yml @@ -34,3 +34,5 @@ l0_sanity_check: - examples/test_llm_api_with_mpi.py::test_llm_api_single_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0] ISOLATION - unittest/others/test_kv_cache_transceiver.py::test_kv_cache_transceiver_single_process[NIXL-mha-ctx_fp16_gen_fp16] - unittest/others/test_kv_cache_transceiver.py::test_kv_cache_transceiver_single_process[UCX-mha-ctx_fp16_gen_fp16] + - unittest/others/test_kv_cache_transceiver.py::test_cancel_request_in_transmission[mha] + - unittest/others/test_kv_cache_transceiver.py::test_cancel_request_in_transmission[mla] diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 2b3cc7427f7..4f6d8e75a44 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -407,7 +407,6 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp] SKIP (https://nvbugs/5715568) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=0-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5721661) accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_mtp] SKIP (https://nvbugs/5715568) -unittest/_torch/modules/test_fused_moe.py::test_fused_moe_w4a8_nvfp4_fp8[CUTLASS] SKIP (https://nvbugs/5721912) unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_flashinfer_attention_op.py::test_flashinfer_attention_op_context_input_pos[cuda-dtype0-4-8-seq6] SKIP (https://nvbugs/5721907) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5722629) 
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_2gpus[cutlass-two_model-overlap_scheduler] SKIP (https://nvbugs/5702826) @@ -436,7 +435,6 @@ full:sm89/accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] SKIP (https://nvbugs/5721672) accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp4-fp8kv=True-attn_backend=FLASHINFER-torch_compile=True] SKIP (https://nvbugs/5741304) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5740377, https://nvbugs/5740075) -accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix SKIP (https://nvbugs/5741331) disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/5722653) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugs/5740087) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-ep4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5740075) @@ -459,14 +457,9 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] SKIP (https://nvbugs/5740075) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp2pp2-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5740075) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5740075) -test_e2e.py::test_openai_mmencoder_example SKIP (https://nvbugs/5747911) test_e2e.py::test_trtllm_serve_multimodal_example SKIP (https://nvbugs/5747920) examples/test_whisper.py::test_llm_whisper_general[large-v3-disable_gemm_plugin-enable_attention_plugin-disable_weight_only-float16-nb:1-use_cpp_runtime] SKIP (https://nvbugs/5747930) test_e2e.py::test_trtllm_serve_example SKIP (https://nvbugs/5747938) -unittest/_torch/auto_deploy/unit/singlegpu/models/test_nemotron_h_patches.py::test_nemotronh_moe_patch_forward[dtype0-2-6-nvidia/NVIDIA-Nemotron-Nano-31B-A3-v3] SKIP (https://nvbugs/5747867) -unittest/_torch/auto_deploy/unit/singlegpu/models/test_nemotron_h_patches.py::test_nemotronh_moe_patch_forward[dtype0-1-8-nvidia/NVIDIA-Nemotron-Nano-31B-A3-v3] SKIP (https://nvbugs/5747867) -unittest/_torch/auto_deploy/unit/singlegpu/models/test_nemotron_h_patches.py::test_nemotronh_moe_custom_implementation[dtype0-2-6-nvidia/NVIDIA-Nemotron-Nano-31B-A3-v3] SKIP (https://nvbugs/5747867) -unittest/_torch/auto_deploy/unit/singlegpu/models/test_nemotron_h_patches.py::test_nemotronh_moe_custom_implementation[dtype0-1-8-nvidia/NVIDIA-Nemotron-Nano-31B-A3-v3] SKIP (https://nvbugs/5747867) unittest/_torch/auto_deploy/unit/singlegpu/test_ad_build_small_single.py::test_build_ad[meta-llama/Llama-4-Scout-17B-16E-Instruct-llm_extra_args8] SKIP (https://nvbugs/5747878) 
unittest/_torch/auto_deploy/unit/singlegpu/test_ad_build_small_single.py::test_build_ad[meta-llama/Llama-4-Scout-17B-16E-Instruct-llm_extra_args9] SKIP (https://nvbugs/5747878) triton_server/test_triton.py::test_opt[opt] SKIP (https://nvbugs/5739981) @@ -482,6 +475,7 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_ep accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4_chunked_prefill[tp4ep4-cuda_graph=True] SKIP (https://nvbugs/5702793) accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4[tp4-cuda_graph=True] SKIP (https://nvbugs/5702793) disaggregated/test_auto_scaling.py::test_minimal_instances[etcd-round_robin] SKIP (https://nvbugs/5748564) +disaggregated/test_auto_scaling.py::test_minimal_instances[http-round_robin] SKIP (https://nvbugs/5755963) unittest/llmapi/apps/test_disagg_serving_perf_metrics.py SKIP (https://nvbugs/5752516) unittest/_torch/attention/test_trtllm_flashinfer_symbol_collision.py::test_flashinfer_fused_moe_matches_torch_moe SKIP (https://nvbugs/5752521) unittest/llmapi/apps/_test_openai_responses.py::test_reasoning_effort[DeepSeek-R1-Distill-Qwen-1.5B] SKIP (https://nvbugs/5753250) diff --git a/tests/scripts/perf-sanity/config_database_b200_nvl.yaml b/tests/scripts/perf-sanity/config_database_b200_nvl.yaml new file mode 100644 index 00000000000..3ad69455a4e --- /dev/null +++ b/tests/scripts/perf-sanity/config_database_b200_nvl.yaml @@ -0,0 +1,1839 @@ +server_configs: +- name: nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc4_gpu4 + model_name: nvidia/DeepSeek-R1-0528-FP4-v2 + gpus: 4 + match_mode: scenario + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + enable_attention_dp: false + print_iter_log: true + kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false + stream_interval: 10 + moe_config: + backend: TRTLLM + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + trust_remote_code: true + backend: pytorch + max_num_tokens: 1152 + max_seq_len: 2068 + client_configs: + - name: con4_isl1024_osl1024 + concurrency: 4 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc32_gpu4 + model_name: nvidia/DeepSeek-R1-0528-FP4-v2 + gpus: 4 + match_mode: scenario + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + enable_attention_dp: false + print_iter_log: true + kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false + stream_interval: 10 + moe_config: + backend: TRTLLM + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + trust_remote_code: true + backend: pytorch + max_num_tokens: 1152 + max_seq_len: 2068 + client_configs: + - name: con32_isl1024_osl1024 + concurrency: 32 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc256_gpu4 + model_name: nvidia/DeepSeek-R1-0528-FP4-v2 + gpus: 4 + match_mode: scenario + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + enable_attention_dp: true + print_iter_log: true + kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false + stream_interval: 10 + moe_config: + backend: CUTLASS + attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + trust_remote_code: true + backend: pytorch + max_num_tokens: 1344 + max_seq_len: 2068 + 
client_configs: + - name: con256_isl1024_osl1024 + concurrency: 256 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc4_gpu8 + model_name: nvidia/DeepSeek-R1-0528-FP4-v2 + gpus: 8 + match_mode: scenario + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + enable_attention_dp: false + print_iter_log: true + kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false + stream_interval: 10 + moe_config: + backend: TRTLLM + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 1152 + max_seq_len: 2068 + client_configs: + - name: con4_isl1024_osl1024 + concurrency: 4 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc32_gpu8 + model_name: nvidia/DeepSeek-R1-0528-FP4-v2 + gpus: 8 + match_mode: scenario + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + enable_attention_dp: false + print_iter_log: true + kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false + stream_interval: 10 + moe_config: + backend: TRTLLM + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 1152 + max_seq_len: 2068 + client_configs: + - name: con32_isl1024_osl1024 + concurrency: 32 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc256_gpu8 + model_name: nvidia/DeepSeek-R1-0528-FP4-v2 + gpus: 8 + match_mode: scenario + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + enable_attention_dp: true + print_iter_log: true + kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false + stream_interval: 10 + moe_config: + backend: CUTLASS + attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 1344 + max_seq_len: 2068 + client_configs: + - name: con256_isl1024_osl1024 + concurrency: 256 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc4_gpu4 + model_name: nvidia/DeepSeek-R1-0528-FP4-v2 + gpus: 4 + match_mode: scenario + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + enable_attention_dp: false + print_iter_log: true + kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false + stream_interval: 10 + moe_config: + backend: TRTLLM + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + trust_remote_code: true + backend: pytorch + max_num_tokens: 8320 + max_seq_len: 9416 + client_configs: + - name: con4_isl8192_osl1024 + concurrency: 4 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc32_gpu4 + model_name: nvidia/DeepSeek-R1-0528-FP4-v2 + gpus: 4 + match_mode: scenario + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + enable_attention_dp: false + print_iter_log: true + kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false + stream_interval: 10 + moe_config: + backend: TRTLLM + 
tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + trust_remote_code: true + backend: pytorch + max_num_tokens: 8320 + max_seq_len: 9416 + client_configs: + - name: con32_isl8192_osl1024 + concurrency: 32 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc256_gpu4 + model_name: nvidia/DeepSeek-R1-0528-FP4-v2 + gpus: 4 + match_mode: scenario + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + enable_attention_dp: true + print_iter_log: true + kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false + stream_interval: 10 + moe_config: + backend: CUTLASS + attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + trust_remote_code: true + backend: pytorch + max_num_tokens: 8512 + max_seq_len: 9416 + client_configs: + - name: con256_isl8192_osl1024 + concurrency: 256 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc4_gpu8 + model_name: nvidia/DeepSeek-R1-0528-FP4-v2 + gpus: 8 + match_mode: scenario + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + enable_attention_dp: false + print_iter_log: true + kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false + stream_interval: 10 + moe_config: + backend: TRTLLM + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 8320 + max_seq_len: 9416 + client_configs: + - name: con4_isl8192_osl1024 + concurrency: 4 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc32_gpu8 + model_name: nvidia/DeepSeek-R1-0528-FP4-v2 + gpus: 8 + match_mode: scenario + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + enable_attention_dp: false + print_iter_log: true + kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false + stream_interval: 10 + moe_config: + backend: TRTLLM + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 8320 + max_seq_len: 9416 + client_configs: + - name: con32_isl8192_osl1024 + concurrency: 32 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc256_gpu8 + model_name: nvidia/DeepSeek-R1-0528-FP4-v2 + gpus: 8 + match_mode: scenario + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + enable_attention_dp: true + print_iter_log: true + kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false + stream_interval: 10 + moe_config: + backend: CUTLASS + attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 8512 + max_seq_len: 9416 + client_configs: + - name: con256_isl8192_osl1024 + concurrency: 256 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: deepseek_ai_DeepSeek_R1_0528_1024_1024_conc4_gpu8 + model_name: deepseek-ai/DeepSeek-R1-0528 + gpus: 8 + match_mode: scenario + cuda_graph_config: + enable_padding: true + 
max_batch_size: 256 + enable_attention_dp: false + print_iter_log: true + kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false + stream_interval: 10 + moe_config: + backend: DEEPGEMM + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 1152 + max_seq_len: 2068 + client_configs: + - name: con4_isl1024_osl1024 + concurrency: 4 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: deepseek_ai_DeepSeek_R1_0528_1024_1024_conc16_gpu8 + model_name: deepseek-ai/DeepSeek-R1-0528 + gpus: 8 + match_mode: scenario + cuda_graph_config: + enable_padding: true + max_batch_size: 256 + enable_attention_dp: false + print_iter_log: true + kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false + stream_interval: 10 + moe_config: + backend: DEEPGEMM + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 1152 + max_seq_len: 2068 + client_configs: + - name: con16_isl1024_osl1024 + concurrency: 16 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: deepseek_ai_DeepSeek_R1_0528_1024_1024_conc64_gpu8 + model_name: deepseek-ai/DeepSeek-R1-0528 + gpus: 8 + match_mode: scenario + cuda_graph_config: + enable_padding: true + max_batch_size: 256 + enable_attention_dp: false + print_iter_log: true + kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false + stream_interval: 10 + moe_config: + backend: DEEPGEMM + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 1152 + max_seq_len: 2068 + client_configs: + - name: con64_isl1024_osl1024 + concurrency: 64 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: deepseek_ai_DeepSeek_R1_0528_8192_1024_conc4_gpu8 + model_name: deepseek-ai/DeepSeek-R1-0528 + gpus: 8 + match_mode: scenario + cuda_graph_config: + enable_padding: true + max_batch_size: 256 + enable_attention_dp: false + print_iter_log: true + kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false + stream_interval: 10 + moe_config: + backend: DEEPGEMM + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 8320 + max_seq_len: 9416 + client_configs: + - name: con4_isl8192_osl1024 + concurrency: 4 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: deepseek_ai_DeepSeek_R1_0528_8192_1024_conc16_gpu8 + model_name: deepseek-ai/DeepSeek-R1-0528 + gpus: 8 + match_mode: scenario + cuda_graph_config: + enable_padding: true + max_batch_size: 256 + enable_attention_dp: false + print_iter_log: true + kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false + stream_interval: 10 + moe_config: + backend: DEEPGEMM + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 8320 + max_seq_len: 9416 + client_configs: + - name: con16_isl8192_osl1024 + concurrency: 16 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: deepseek_ai_DeepSeek_R1_0528_8192_1024_conc64_gpu8 + model_name: deepseek-ai/DeepSeek-R1-0528 + gpus: 8 + match_mode: scenario + cuda_graph_config: 
+ enable_padding: true + max_batch_size: 256 + enable_attention_dp: true + print_iter_log: true + kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false + stream_interval: 10 + moe_config: + backend: DEEPGEMM + attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 8320 + max_seq_len: 9416 + client_configs: + - name: con64_isl8192_osl1024 + concurrency: 64 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_1024_conc4_gpu1 + model_name: openai/gpt-oss-120b + gpus: 1 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 2068 + client_configs: + - name: con4_isl1024_osl1024 + concurrency: 4 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_1024_conc16_gpu1 + model_name: openai/gpt-oss-120b + gpus: 1 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 2068 + client_configs: + - name: con16_isl1024_osl1024 + concurrency: 16 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_1024_conc64_gpu1 + model_name: openai/gpt-oss-120b + gpus: 1 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 2068 + client_configs: + - name: con64_isl1024_osl1024 + concurrency: 64 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_1024_conc4_gpu2 + model_name: openai/gpt-oss-120b + gpus: 2 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + 
trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 2068 + client_configs: + - name: con4_isl1024_osl1024 + concurrency: 4 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_1024_conc16_gpu2 + model_name: openai/gpt-oss-120b + gpus: 2 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 2068 + client_configs: + - name: con16_isl1024_osl1024 + concurrency: 16 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_1024_conc64_gpu2 + model_name: openai/gpt-oss-120b + gpus: 2 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 2068 + client_configs: + - name: con64_isl1024_osl1024 + concurrency: 64 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_1024_conc4_gpu4 + model_name: openai/gpt-oss-120b + gpus: 4 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 2068 + client_configs: + - name: con4_isl1024_osl1024 + concurrency: 4 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_1024_conc16_gpu4 + model_name: openai/gpt-oss-120b + gpus: 4 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 2068 + client_configs: + - name: con16_isl1024_osl1024 + concurrency: 16 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_1024_conc64_gpu4 + model_name: openai/gpt-oss-120b + gpus: 4 + match_mode: scenario + env_overrides: 
+ TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 2068 + client_configs: + - name: con64_isl1024_osl1024 + concurrency: 64 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_1024_conc4_gpu8 + model_name: openai/gpt-oss-120b + gpus: 8 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 2068 + client_configs: + - name: con4_isl1024_osl1024 + concurrency: 4 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_1024_conc16_gpu8 + model_name: openai/gpt-oss-120b + gpus: 8 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 2068 + client_configs: + - name: con16_isl1024_osl1024 + concurrency: 16 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_1024_conc64_gpu8 + model_name: openai/gpt-oss-120b + gpus: 8 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 2068 + client_configs: + - name: con64_isl1024_osl1024 + concurrency: 64 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_8192_conc4_gpu1 + model_name: openai/gpt-oss-120b + gpus: 1 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 1 + 
moe_expert_parallel_size: 1 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con4_isl1024_osl8192 + concurrency: 4 + iterations: 10 + isl: 1024 + osl: 8192 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_8192_conc16_gpu1 + model_name: openai/gpt-oss-120b + gpus: 1 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con16_isl1024_osl8192 + concurrency: 16 + iterations: 10 + isl: 1024 + osl: 8192 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_8192_conc64_gpu1 + model_name: openai/gpt-oss-120b + gpus: 1 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con64_isl1024_osl8192 + concurrency: 64 + iterations: 10 + isl: 1024 + osl: 8192 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_8192_conc4_gpu2 + model_name: openai/gpt-oss-120b + gpus: 2 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con4_isl1024_osl8192 + concurrency: 4 + iterations: 10 + isl: 1024 + osl: 8192 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_8192_conc16_gpu2 + model_name: openai/gpt-oss-120b + gpus: 2 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con16_isl1024_osl8192 + concurrency: 16 + iterations: 10 + isl: 1024 + osl: 8192 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_8192_conc64_gpu2 + model_name: openai/gpt-oss-120b + gpus: 2 + 
match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con64_isl1024_osl8192 + concurrency: 64 + iterations: 10 + isl: 1024 + osl: 8192 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_8192_conc4_gpu4 + model_name: openai/gpt-oss-120b + gpus: 4 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con4_isl1024_osl8192 + concurrency: 4 + iterations: 10 + isl: 1024 + osl: 8192 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_8192_conc16_gpu4 + model_name: openai/gpt-oss-120b + gpus: 4 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con16_isl1024_osl8192 + concurrency: 16 + iterations: 10 + isl: 1024 + osl: 8192 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_8192_conc64_gpu4 + model_name: openai/gpt-oss-120b + gpus: 4 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con64_isl1024_osl8192 + concurrency: 64 + iterations: 10 + isl: 1024 + osl: 8192 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_8192_conc4_gpu8 + model_name: openai/gpt-oss-120b + gpus: 8 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + 
tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con4_isl1024_osl8192 + concurrency: 4 + iterations: 10 + isl: 1024 + osl: 8192 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_8192_conc16_gpu8 + model_name: openai/gpt-oss-120b + gpus: 8 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con16_isl1024_osl8192 + concurrency: 16 + iterations: 10 + isl: 1024 + osl: 8192 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_8192_conc64_gpu8 + model_name: openai/gpt-oss-120b + gpus: 8 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con64_isl1024_osl8192 + concurrency: 64 + iterations: 10 + isl: 1024 + osl: 8192 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_8192_1024_conc4_gpu1 + model_name: openai/gpt-oss-120b + gpus: 1 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con4_isl8192_osl1024 + concurrency: 4 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_8192_1024_conc16_gpu1 + model_name: openai/gpt-oss-120b + gpus: 1 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con16_isl8192_osl1024 + concurrency: 16 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_8192_1024_conc64_gpu1 + model_name: 
openai/gpt-oss-120b + gpus: 1 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con64_isl8192_osl1024 + concurrency: 64 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_8192_1024_conc4_gpu2 + model_name: openai/gpt-oss-120b + gpus: 2 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con4_isl8192_osl1024 + concurrency: 4 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_8192_1024_conc16_gpu2 + model_name: openai/gpt-oss-120b + gpus: 2 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con16_isl8192_osl1024 + concurrency: 16 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_8192_1024_conc64_gpu2 + model_name: openai/gpt-oss-120b + gpus: 2 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con64_isl8192_osl1024 + concurrency: 64 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_8192_1024_conc4_gpu4 + model_name: openai/gpt-oss-120b + gpus: 4 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: 
+ backend: TRTLLM + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con4_isl8192_osl1024 + concurrency: 4 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_8192_1024_conc16_gpu4 + model_name: openai/gpt-oss-120b + gpus: 4 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con16_isl8192_osl1024 + concurrency: 16 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_8192_1024_conc64_gpu4 + model_name: openai/gpt-oss-120b + gpus: 4 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con64_isl8192_osl1024 + concurrency: 64 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_8192_1024_conc4_gpu8 + model_name: openai/gpt-oss-120b + gpus: 8 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con4_isl8192_osl1024 + concurrency: 4 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_8192_1024_conc16_gpu8 + model_name: openai/gpt-oss-120b + gpus: 8 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con16_isl8192_osl1024 + concurrency: 16 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_8192_1024_conc64_gpu8 + 
model_name: openai/gpt-oss-120b + gpus: 8 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con64_isl8192_osl1024 + concurrency: 64 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true diff --git a/tests/scripts/perf-sanity/config_database_h200_sxm.yaml b/tests/scripts/perf-sanity/config_database_h200_sxm.yaml new file mode 100644 index 00000000000..9d2f8481ce9 --- /dev/null +++ b/tests/scripts/perf-sanity/config_database_h200_sxm.yaml @@ -0,0 +1,1415 @@ +server_configs: +- name: deepseek_ai_DeepSeek_R1_0528_1024_1024_conc4_gpu8 + model_name: deepseek-ai/DeepSeek-R1-0528 + gpus: 8 + match_mode: scenario + cuda_graph_config: + enable_padding: true + max_batch_size: 128 + enable_attention_dp: false + print_iter_log: true + kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.75 + enable_block_reuse: false + stream_interval: 10 + moe_config: + backend: CUTLASS + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 1152 + max_seq_len: 2068 + client_configs: + - name: con4_isl1024_osl1024 + concurrency: 4 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: deepseek_ai_DeepSeek_R1_0528_1024_1024_conc16_gpu8 + model_name: deepseek-ai/DeepSeek-R1-0528 + gpus: 8 + match_mode: scenario + cuda_graph_config: + enable_padding: true + max_batch_size: 128 + enable_attention_dp: false + print_iter_log: true + kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.75 + enable_block_reuse: false + stream_interval: 10 + moe_config: + backend: CUTLASS + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 1152 + max_seq_len: 2068 + client_configs: + - name: con16_isl1024_osl1024 + concurrency: 16 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: deepseek_ai_DeepSeek_R1_0528_1024_1024_conc64_gpu8 + model_name: deepseek-ai/DeepSeek-R1-0528 + gpus: 8 + match_mode: scenario + cuda_graph_config: + enable_padding: true + max_batch_size: 128 + enable_attention_dp: false + print_iter_log: true + kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.75 + enable_block_reuse: false + stream_interval: 10 + moe_config: + backend: CUTLASS + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 1152 + max_seq_len: 2068 + client_configs: + - name: con64_isl1024_osl1024 + concurrency: 64 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: deepseek_ai_DeepSeek_R1_0528_8192_1024_conc4_gpu8 + model_name: deepseek-ai/DeepSeek-R1-0528 + gpus: 8 + match_mode: scenario + cuda_graph_config: + enable_padding: true + max_batch_size: 128 + enable_attention_dp: false + print_iter_log: true + kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.75 + enable_block_reuse: false + stream_interval: 10 + 
moe_config: + backend: CUTLASS + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 8320 + max_seq_len: 9416 + client_configs: + - name: con4_isl8192_osl1024 + concurrency: 4 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: deepseek_ai_DeepSeek_R1_0528_8192_1024_conc16_gpu8 + model_name: deepseek-ai/DeepSeek-R1-0528 + gpus: 8 + match_mode: scenario + cuda_graph_config: + enable_padding: true + max_batch_size: 128 + enable_attention_dp: false + print_iter_log: true + kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.75 + enable_block_reuse: false + stream_interval: 10 + moe_config: + backend: CUTLASS + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 8320 + max_seq_len: 9416 + client_configs: + - name: con16_isl8192_osl1024 + concurrency: 16 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: deepseek_ai_DeepSeek_R1_0528_8192_1024_conc64_gpu8 + model_name: deepseek-ai/DeepSeek-R1-0528 + gpus: 8 + match_mode: scenario + cuda_graph_config: + enable_padding: true + max_batch_size: 128 + enable_attention_dp: true + print_iter_log: true + kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.75 + enable_block_reuse: false + stream_interval: 10 + moe_config: + backend: CUTLASS + attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 8320 + max_seq_len: 9416 + client_configs: + - name: con64_isl8192_osl1024 + concurrency: 64 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_1024_conc4_gpu1 + model_name: openai/gpt-oss-120b + gpus: 1 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 2068 + client_configs: + - name: con4_isl1024_osl1024 + concurrency: 4 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_1024_conc16_gpu1 + model_name: openai/gpt-oss-120b + gpus: 1 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 2068 + client_configs: + - name: con16_isl1024_osl1024 + concurrency: 16 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_1024_conc64_gpu1 + model_name: openai/gpt-oss-120b + gpus: 1 + match_mode: scenario + env_overrides: + 
TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 2068 + client_configs: + - name: con64_isl1024_osl1024 + concurrency: 64 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_1024_conc4_gpu2 + model_name: openai/gpt-oss-120b + gpus: 2 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 2068 + client_configs: + - name: con4_isl1024_osl1024 + concurrency: 4 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_1024_conc16_gpu2 + model_name: openai/gpt-oss-120b + gpus: 2 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 2068 + client_configs: + - name: con16_isl1024_osl1024 + concurrency: 16 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_1024_conc64_gpu2 + model_name: openai/gpt-oss-120b + gpus: 2 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 2068 + client_configs: + - name: con64_isl1024_osl1024 + concurrency: 64 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_1024_conc4_gpu4 + model_name: openai/gpt-oss-120b + gpus: 4 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 2068 + client_configs: + - name: 
con4_isl1024_osl1024 + concurrency: 4 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_1024_conc16_gpu4 + model_name: openai/gpt-oss-120b + gpus: 4 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 2068 + client_configs: + - name: con16_isl1024_osl1024 + concurrency: 16 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_1024_conc64_gpu4 + model_name: openai/gpt-oss-120b + gpus: 4 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 2068 + client_configs: + - name: con64_isl1024_osl1024 + concurrency: 64 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_1024_conc4_gpu8 + model_name: openai/gpt-oss-120b + gpus: 8 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 2068 + client_configs: + - name: con4_isl1024_osl1024 + concurrency: 4 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_1024_conc16_gpu8 + model_name: openai/gpt-oss-120b + gpus: 8 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 2068 + client_configs: + - name: con16_isl1024_osl1024 + concurrency: 16 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_1024_conc64_gpu8 + model_name: openai/gpt-oss-120b + gpus: 8 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 
+ moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 2068 + client_configs: + - name: con64_isl1024_osl1024 + concurrency: 64 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_8192_conc4_gpu1 + model_name: openai/gpt-oss-120b + gpus: 1 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con4_isl1024_osl8192 + concurrency: 4 + iterations: 10 + isl: 1024 + osl: 8192 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_8192_conc16_gpu1 + model_name: openai/gpt-oss-120b + gpus: 1 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con16_isl1024_osl8192 + concurrency: 16 + iterations: 10 + isl: 1024 + osl: 8192 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_8192_conc64_gpu1 + model_name: openai/gpt-oss-120b + gpus: 1 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con64_isl1024_osl8192 + concurrency: 64 + iterations: 10 + isl: 1024 + osl: 8192 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_8192_conc4_gpu2 + model_name: openai/gpt-oss-120b + gpus: 2 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con4_isl1024_osl8192 + concurrency: 4 + iterations: 10 + isl: 1024 + osl: 8192 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_8192_conc16_gpu2 + model_name: 
openai/gpt-oss-120b + gpus: 2 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con16_isl1024_osl8192 + concurrency: 16 + iterations: 10 + isl: 1024 + osl: 8192 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_8192_conc64_gpu2 + model_name: openai/gpt-oss-120b + gpus: 2 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con64_isl1024_osl8192 + concurrency: 64 + iterations: 10 + isl: 1024 + osl: 8192 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_8192_conc4_gpu4 + model_name: openai/gpt-oss-120b + gpus: 4 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con4_isl1024_osl8192 + concurrency: 4 + iterations: 10 + isl: 1024 + osl: 8192 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_8192_conc16_gpu4 + model_name: openai/gpt-oss-120b + gpus: 4 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con16_isl1024_osl8192 + concurrency: 16 + iterations: 10 + isl: 1024 + osl: 8192 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_8192_conc64_gpu4 + model_name: openai/gpt-oss-120b + gpus: 4 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + trust_remote_code: true + backend: 
pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con64_isl1024_osl8192 + concurrency: 64 + iterations: 10 + isl: 1024 + osl: 8192 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_8192_conc4_gpu8 + model_name: openai/gpt-oss-120b + gpus: 8 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con4_isl1024_osl8192 + concurrency: 4 + iterations: 10 + isl: 1024 + osl: 8192 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_8192_conc16_gpu8 + model_name: openai/gpt-oss-120b + gpus: 8 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con16_isl1024_osl8192 + concurrency: 16 + iterations: 10 + isl: 1024 + osl: 8192 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_8192_conc64_gpu8 + model_name: openai/gpt-oss-120b + gpus: 8 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con64_isl1024_osl8192 + concurrency: 64 + iterations: 10 + isl: 1024 + osl: 8192 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_8192_1024_conc4_gpu1 + model_name: openai/gpt-oss-120b + gpus: 1 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con4_isl8192_osl1024 + concurrency: 4 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_8192_1024_conc16_gpu1 + model_name: openai/gpt-oss-120b + gpus: 1 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + enable_attention_dp: false + 
kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con16_isl8192_osl1024 + concurrency: 16 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_8192_1024_conc64_gpu1 + model_name: openai/gpt-oss-120b + gpus: 1 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con64_isl8192_osl1024 + concurrency: 64 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_8192_1024_conc4_gpu2 + model_name: openai/gpt-oss-120b + gpus: 2 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con4_isl8192_osl1024 + concurrency: 4 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_8192_1024_conc16_gpu2 + model_name: openai/gpt-oss-120b + gpus: 2 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con16_isl8192_osl1024 + concurrency: 16 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_8192_1024_conc64_gpu2 + model_name: openai/gpt-oss-120b + gpus: 2 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con64_isl8192_osl1024 + concurrency: 64 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + 
backend: openai + streaming: true +- name: openai_gpt_oss_120b_8192_1024_conc4_gpu4 + model_name: openai/gpt-oss-120b + gpus: 4 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con4_isl8192_osl1024 + concurrency: 4 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_8192_1024_conc16_gpu4 + model_name: openai/gpt-oss-120b + gpus: 4 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con16_isl8192_osl1024 + concurrency: 16 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_8192_1024_conc64_gpu4 + model_name: openai/gpt-oss-120b + gpus: 4 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con64_isl8192_osl1024 + concurrency: 64 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_8192_1024_conc4_gpu8 + model_name: openai/gpt-oss-120b + gpus: 8 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con4_isl8192_osl1024 + concurrency: 4 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_8192_1024_conc16_gpu8 + model_name: openai/gpt-oss-120b + gpus: 8 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + 
tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con16_isl8192_osl1024 + concurrency: 16 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_8192_1024_conc64_gpu8 + model_name: openai/gpt-oss-120b + gpus: 8 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con64_isl8192_osl1024 + concurrency: 64 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_mamba_rms_norm.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_mamba_rms_norm.py index 35b293686d2..59952a6c89f 100644 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_mamba_rms_norm.py +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_mamba_rms_norm.py @@ -1,8 +1,10 @@ import pytest import torch -import tensorrt_llm._torch.auto_deploy.custom_ops # noqa: F401 -from tensorrt_llm._torch.auto_deploy.models.patches.nemotron_h import _rms_norm_ref +from tensorrt_llm._torch.auto_deploy.custom_ops.rms_norm import ( + gated_rms_norm_ref, + triton_rmsnorm_gated, +) @pytest.mark.skipif( @@ -19,12 +21,12 @@ def test_custom_op_matches_ref(B, T, H, group, use_gate, dtype): z = torch.randn_like(x) if use_gate else None w = torch.ones(H, dtype=dtype, device=device) - y_ref = _rms_norm_ref( + y_ref = gated_rms_norm_ref( x, w, bias=None, z=z, eps=1e-5, group_size=group, norm_before_gate=False, upcast=True ) # Custom op (currently returns fp32). Cast it back to x.dtype for apples-to-apples with ref. 
- y_op_fp32 = torch.ops.auto_deploy.triton_rmsnorm_gated(x, w, z, 1e-5, group, False) + y_op_fp32 = triton_rmsnorm_gated(x, w, z, 1e-5, group, False) y_op = y_op_fp32.to(x.dtype) assert y_ref.dtype == x.dtype and y_op.dtype == x.dtype diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_hf.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_hf.py index 82f3774511d..d6e63b6433d 100644 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_hf.py +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_hf.py @@ -205,10 +205,12 @@ def test_custom_model_mapping_in_parent_does_not_affect_parent(): class Child(AutoModelForCausalLMFactory): pass + parent_mapping = copy.copy(AutoModelForCausalLMFactory._custom_model_mapping) + custom_model_cls = MagicMock(spec=AutoModelForCausalLM) custom_model_cls.configure_mock(_from_config=MagicMock(side_effect=MyError)) Child.register_custom_model_cls( config_cls_name=FooConfig.__name__, custom_model_cls=custom_model_cls ) - assert AutoModelForCausalLMFactory._custom_model_mapping == {} + assert AutoModelForCausalLMFactory._custom_model_mapping == parent_mapping diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_hybrid_patches.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_hybrid_patches.py index ceabe6c1b98..6ea5c0efa17 100644 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_hybrid_patches.py +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_hybrid_patches.py @@ -1,5 +1,3 @@ -import copy - import pytest import torch from _model_test_utils import get_small_model_config @@ -7,8 +5,6 @@ from tensorrt_llm._torch.auto_deploy.export import apply_export_patches, torch_export_to_gm from tensorrt_llm._torch.auto_deploy.llm_args import AutoDeployConfig -from tensorrt_llm._torch.auto_deploy.models.hf import AutoModelForCausalLMFactory -from tensorrt_llm._torch.auto_deploy.models.modeling_nemotron_h import NemotronHForCausalLM from tensorrt_llm._torch.auto_deploy.utils._graph import move_to_device # NOTE: find example inputs with the same tokenization length to avoid seq concat. @@ -16,37 +12,15 @@ EXAMPLE_INPUT2 = "Tiger is a cat with the following properties:" -@pytest.fixture -def setup_custom_model_cls_registry(request): - # TODO: remove all this when the patches in `bamba.py` and `nemotron_h.py` can be removed. - old_mapping = copy.copy(AutoModelForCausalLMFactory._custom_model_mapping) - AutoModelForCausalLMFactory._custom_model_mapping = {} - - register_custom_model = request.node.callspec.params.get("register_custom_model", False) - if register_custom_model: - AutoModelForCausalLMFactory.register_custom_model_cls( - config_cls_name="NemotronHConfig", - custom_model_cls=NemotronHForCausalLM, - ) - yield - AutoModelForCausalLMFactory._custom_model_mapping = old_mapping - - @pytest.mark.parametrize( - "model_dir,run_verify_generation,register_custom_model", + "model_dir,run_verify_generation", [ - ("ibm-ai-platform/Bamba-9B-v2", True, False), - # This tests the incumbent patching approach. - ("nvidia/NVIDIA-Nemotron-Nano-12B-v2", True, False), - # This tests the new custom model implementation. - ("nvidia/NVIDIA-Nemotron-Nano-12B-v2", True, True), + ("ibm-ai-platform/Bamba-9B-v2", True), ], ) def test_bamba_patches( model_dir: str, run_verify_generation: bool, - register_custom_model: bool, - setup_custom_model_cls_registry, ): # NOTE: set to False if you want to locally test the full model. 
use_small_config: bool = True @@ -124,13 +98,14 @@ def _run_torch_export_to_gm(): move_to_device(gm, "cuda") factory._to_maybe_random(model, "cuda") model.load_state_dict(gm.state_dict()) + gm.load_state_dict(model.state_dict()) else: factory.load_or_random_init(model, device="cuda") gm = _run_torch_export_to_gm() move_to_device(gm, "cuda") if run_verify_generation: - _verify_generation(factory, model, tokenizer) + _verify_generation(model, tokenizer) # let's do a comparison of every state dict item between the model and the gm torch.testing.assert_close(model.state_dict(), gm.state_dict(), rtol=0.0, atol=0.0) @@ -157,7 +132,7 @@ def _run_torch_export_to_gm(): ) -def _verify_generation(factory, model, tokenizer): +def _verify_generation(model, tokenizer): print("====== WITHOUT PATCH ======") _generate(tokenizer, model) with apply_export_patches(patch_list=["bamba"]): diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_modeling_nemotron_h.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_modeling_nemotron_h.py new file mode 100644 index 00000000000..94b22ed14fc --- /dev/null +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_modeling_nemotron_h.py @@ -0,0 +1,235 @@ +import importlib.util +import sys +import types +from unittest import mock + +import pytest +import torch +from _model_test_utils import get_small_model_config +from torch.export import Dim +from transformers import AutoConfig, AutoModelForCausalLM +from utils.llm_data import llm_models_root + +from tensorrt_llm._torch.auto_deploy.export import torch_export_to_gm +from tensorrt_llm._torch.auto_deploy.llm_args import AutoDeployConfig +from tensorrt_llm._torch.auto_deploy.models.custom.modeling_nemotron_h import NemotronHForCausalLM +from tensorrt_llm._torch.auto_deploy.utils._graph import move_to_device + +_BATCH_AND_SEQUENCE_TEST_CASES = ((2, 6), (1, 8)) + + +@pytest.fixture(scope="function", autouse=True) +def set_seed(): + torch.manual_seed(42) + + +@pytest.fixture(autouse=True) +def stub_mamba_ssm_if_missing(): + """Stub `mamba_ssm` package. + + The `modeling_nemotron_h.py` code in all recent nemotron checkpoints have a hard dependency + on `mamba_ssm.ops.triton.layernorm_gated.rmsnorm_fn`. This fixture stubs it, such that we + at least can get past the import stage of the remote modeling code. + """ + module = "mamba_ssm" + submodule = f"{module}.ops.triton.layernorm_gated" + + if importlib.util.find_spec(module) is not None: + yield + return + + stub_mod = types.ModuleType(submodule) + stub_mod.rmsnorm_fn = None + + with mock.patch.dict(sys.modules, {submodule: stub_mod}): + yield + + +def _load_nemotron_moe_layer(model_name_or_path: str, custom_model_cls=None): + """ + Build a tiny NemotronH model (1 layer, small dims) and return the first NemotronHMOE module. 
+ """ + cfg = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=True) + + cfg.use_cache = False + + cfg.torch_dtype = "bfloat16" + cfg.hidden_size = 32 + cfg.intermediate_size = 64 + cfg.moe_intermediate_size = 64 + cfg.moe_shared_expert_intermediate_size = 64 + cfg.mamba_head_dim = 40 + cfg.mamba_num_heads = 4 + cfg.n_groups = 2 + cfg.num_attention_heads = 4 + cfg.num_hidden_layers = 9 + cfg.num_key_value_heads = 2 + cfg.ssm_state_size = 32 + + if custom_model_cls is None: + model = AutoModelForCausalLM.from_config(cfg, trust_remote_code=True) + else: + model = custom_model_cls._from_config(cfg) + model.eval() + + nemotron_moe = None + for _, mod in model.named_modules(): + if type(mod).__name__ == "NemotronHMOE": + nemotron_moe = mod + break + + if nemotron_moe is None: + raise RuntimeError("NemotronHMOE layer not found. Check your model id or config.") + + _set_gate_weights(nemotron_moe) + + return nemotron_moe + + +def _set_gate_weights(module): + # This helper function is necessary because the `weight` parameter of the `NemotronHTopkRouter` + # is initialized as `torch.empty` in the original model code, which no manner of random seed + # setting will have any effect on. We therefore set it like the below to ensure the + # reproducibility of the tests. + for _, mod in module.named_modules(): + if type(mod).__name__ == "NemotronHTopkRouter": + if hasattr(mod, "weight"): + mod.weight = torch.nn.Parameter(torch.randn_like(mod.weight)) + + +@pytest.mark.parametrize( + "model_name", + [ + llm_models_root() / "NVIDIA-Nemotron-3-Nano-30B-A3B-BF16", + ], +) +@pytest.mark.parametrize("B,S", _BATCH_AND_SEQUENCE_TEST_CASES) +@pytest.mark.parametrize("dtype", [torch.bfloat16]) +@torch.no_grad() +def test_nemotronh_moe_custom_implementation(model_name, B, S, dtype): + device = "cuda" + + module = _load_nemotron_moe_layer(model_name) + module.to(device) + + H = module.config.hidden_size + x = torch.randn(B, S, H, device=device, dtype=dtype) + + ref = module(x) + + new_module = _load_nemotron_moe_layer(model_name, custom_model_cls=NemotronHForCausalLM) + new_module.to(device) + new_module.load_state_dict(module.state_dict()) + + test = new_module(x) + + rtol = 0.05 + atol = 0.05 + + torch.testing.assert_close(test, ref, rtol=rtol, atol=atol) + + +@pytest.mark.parametrize( + "model_dir,model_on_meta_during_export", + [ + ("nvidia/NVIDIA-Nemotron-Nano-12B-v2", True), + ("nvidia/NVIDIA-Nemotron-Nano-12B-v2", False), + ], +) +def test_custom_model_implementation_can_be_exported( + model_dir: str, + model_on_meta_during_export: bool, +): + # NOTE: set to False if you want to locally test the full model. + use_small_config: bool = True + + common_kwargs = { + "world_size": 0, + "runtime": "demollm", + "model_factory": "AutoModelForCausalLM", + "max_seq_len": 512, + "transforms": { + "insert_cached_attention": {"backend": "flashinfer"}, + "compile_model": {"backend": "torch-simple"}, + }, + } + + if use_small_config: + llm_args = get_small_model_config(model_dir, **common_kwargs)["args"] + else: + llm_args = { + "model": model_dir, + **common_kwargs, + "model_kwargs": { + "dtype": "bfloat16", + }, + } + llm_args = AutoDeployConfig(**llm_args) + + factory = llm_args.create_factory() + model = factory.build_model("meta") + tokenizer = factory.init_tokenizer() + + # 1. Export wants min batch size of 2 (to avoid specialization during export). + # 2. 
Can't get `padding` / `truncation` to work without other steps so just use the prompts + # with the same tokenized length in order for the tokenizer not to complain when creating + # the tensor. + message = [ + "Mamba is a snake with the following properties:", + "Tiger is a cat with the following properties:", + ] + inputs = tokenizer(message, return_tensors="pt", return_token_type_ids=False).to("cuda") + + input_ids = inputs["input_ids"] + position_ids = torch.arange(input_ids.shape[1], device=input_ids.device).repeat( + input_ids.shape[0], 1 + ) + dynamic_shapes = ( + {0: Dim("batch_size", min=0, max=8), 1: Dim("seq_len", min=0, max=512)}, + { + 0: Dim("batch_size", min=0, max=8), + 1: Dim("seq_len", min=0, max=512), + }, + ) + + def _run_torch_export_to_gm(): + return torch_export_to_gm( + model, + args=tuple(), + kwargs={"input_ids": input_ids, "position_ids": position_ids}, + dynamic_shapes=dynamic_shapes, + ) + + if model_on_meta_during_export: + gm = _run_torch_export_to_gm() + factory.load_or_random_init(gm, device="cuda") + move_to_device(gm, "cuda") + factory._to_maybe_random(model, "cuda") + # In order to ensure the `_minus_A` (non-persistent buffer) is correct, we need to run the + # model's load state pre/post hooks by loading the state dicts after initialization. + # NOTE: this is done under the hood by `torch_export_to_gm`, so we only need this in this + # `if` clause. + model.load_state_dict(gm.state_dict()) + gm.load_state_dict(model.state_dict()) + else: + factory.load_or_random_init(model, device="cuda") + gm = _run_torch_export_to_gm() + move_to_device(gm, "cuda") + + # let's do a comparison of every state dict item between the model and the gm + torch.testing.assert_close(model.state_dict(), gm.state_dict(), rtol=0.0, atol=0.0) + torch.testing.assert_close( + dict(model.named_buffers()), dict(gm.named_buffers()), rtol=0.0, atol=0.0 + ) + + with torch.inference_mode(): + out_original = model(input_ids=input_ids, position_ids=position_ids) + out_gm = gm(input_ids=input_ids, position_ids=position_ids) + + atol, rtol = 1e-3, 1e-3 + torch.testing.assert_close( + out_gm, + out_original, + rtol=rtol, + atol=atol, + ) diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_nemotron_h_patches.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_nemotron_h_patches.py deleted file mode 100644 index 3ef4e8eb54f..00000000000 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_nemotron_h_patches.py +++ /dev/null @@ -1,158 +0,0 @@ -import functools -import types - -import pytest -import torch -from _model_test_utils import _hf_model_dir_or_hub_id -from transformers import AutoConfig - -from tensorrt_llm._torch.auto_deploy.models.modeling_nemotron_h import NemotronHForCausalLM -from tensorrt_llm._torch.auto_deploy.models.patches.nemotron_h import ( - _from_config_original, - _nemotron_h_moe_forward, -) - -_BATCH_AND_SEQUENCE_TEST_CASES = ((2, 6), (1, 8)) - - -@pytest.fixture(scope="function", autouse=True) -def set_seed(): - torch.manual_seed(42) - - -def skip_on_no_hf_access(func): - """Decorator for skipping tests that fail due to HF access issues. - - This allows us to share the same test code for CI (where access may be restricted, especially for private - repositories) and locally. 
- """ - - @functools.wraps(func) - def wrapper(*args, **kwargs): - try: - return func(*args, **kwargs) - except OSError as e: - if "not a valid model identifier" in str(e): - pytest.skip("Test skipped due to (no) HF access.") - raise - - return wrapper - - -def _load_nemotron_moe_layer(model_name_or_path: str, custom_model_cls=None): - """ - Build a tiny NemotronH model (1 layer, small dims) and return the first NemotronHMOE module. - """ - cfg = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=True) - - cfg.use_cache = False - - cfg.torch_dtype = "bfloat16" - cfg.hidden_size = 32 - cfg.intermediate_size = 64 - cfg.moe_intermediate_size = 64 - cfg.moe_shared_expert_intermediate_size = 64 - cfg.mamba_head_dim = 40 - cfg.mamba_num_heads = 4 - cfg.n_groups = 2 - cfg.num_attention_heads = 4 - cfg.num_hidden_layers = 9 - cfg.num_key_value_heads = 2 - cfg.ssm_state_size = 32 - - if custom_model_cls is None: - model = _from_config_original(cfg, trust_remote_code=True) - else: - model = custom_model_cls._from_config(cfg) - model.eval() - - nemotron_moe = None - for _, mod in model.named_modules(): - if type(mod).__name__ == "NemotronHMOE": - nemotron_moe = mod - break - - if nemotron_moe is None: - raise RuntimeError("NemotronHMOE layer not found. Check your model id or config.") - - _set_gate_weights(nemotron_moe) - - return nemotron_moe - - -def _set_gate_weights(module): - # This helper function is necessary because the `weight` parameter of the `NemotronHTopkRouter` - # is initialized as `torch.empty` in the original model code, which no manner of random seed - # setting will have any effect on. We therefore set it like the below to ensure the - # reproducibility of the tests. - for _, mod in module.named_modules(): - if type(mod).__name__ == "NemotronHTopkRouter": - if hasattr(mod, "weight"): - mod.weight = torch.nn.Parameter(torch.randn_like(mod.weight)) - - -@pytest.mark.parametrize( - "model_name", - [ - _hf_model_dir_or_hub_id( - "NVIDIA-Nemotron-Nano-31B-A3-v3", "nvidia/NVIDIA-Nemotron-Nano-31B-A3-v3" - ), - ], -) -@pytest.mark.parametrize("B,S", _BATCH_AND_SEQUENCE_TEST_CASES) -@pytest.mark.parametrize("dtype", [torch.bfloat16]) -@torch.no_grad() -@skip_on_no_hf_access -def test_nemotronh_moe_patch_forward(model_name, B, S, dtype): - device = "cuda" - - module = _load_nemotron_moe_layer(model_name) - module.to(device) - - H = module.config.hidden_size - x = torch.randn(B, S, H, device=device, dtype=dtype) - - ref = module(x) - - module.forward = types.MethodType(_nemotron_h_moe_forward, module) - test = module(x) - - rtol = 0.05 - atol = 0.05 - - torch.testing.assert_close(test, ref, rtol=rtol, atol=atol) - - -@pytest.mark.parametrize( - "model_name", - [ - _hf_model_dir_or_hub_id( - "NVIDIA-Nemotron-Nano-31B-A3-v3", "nvidia/NVIDIA-Nemotron-Nano-31B-A3-v3" - ), - ], -) -@pytest.mark.parametrize("B,S", _BATCH_AND_SEQUENCE_TEST_CASES) -@pytest.mark.parametrize("dtype", [torch.bfloat16]) -@torch.no_grad() -@skip_on_no_hf_access -def test_nemotronh_moe_custom_implementation(model_name, B, S, dtype): - device = "cuda" - - module = _load_nemotron_moe_layer(model_name) - module.to(device) - - H = module.config.hidden_size - x = torch.randn(B, S, H, device=device, dtype=dtype) - - ref = module(x) - - new_module = _load_nemotron_moe_layer(model_name, custom_model_cls=NemotronHForCausalLM) - new_module.to(device) - new_module.load_state_dict(module.state_dict()) - - test = new_module(x) - - rtol = 0.05 - atol = 0.05 - - torch.testing.assert_close(test, ref, rtol=rtol, atol=atol) 
diff --git a/tests/unittest/llmapi/apps/_test_disagg_serving_multi_nodes.py b/tests/unittest/llmapi/apps/_test_disagg_serving_multi_nodes.py index 0c52852b9ec..ff9dd92e0ca 100644 --- a/tests/unittest/llmapi/apps/_test_disagg_serving_multi_nodes.py +++ b/tests/unittest/llmapi/apps/_test_disagg_serving_multi_nodes.py @@ -104,7 +104,7 @@ def gen_tp_pp_size(request): def worker(model_name: str, ctx_tp_pp_size: tuple, gen_tp_pp_size: tuple): extra_config = { "cache_transceiver_config": { - "backend": "UCX" + "backend": "DEFAULT" }, "kv_cache_config": { "free_gpu_memory_fraction": 0.5, diff --git a/tests/unittest/llmapi/apps/_test_openai_mmencoder.py b/tests/unittest/llmapi/apps/_test_openai_mmencoder.py index 1ca1beec2ab..312f9232d40 100644 --- a/tests/unittest/llmapi/apps/_test_openai_mmencoder.py +++ b/tests/unittest/llmapi/apps/_test_openai_mmencoder.py @@ -5,6 +5,7 @@ import pytest import requests import yaml +from utils.llm_data import llm_models_root from ..test_llm import get_model_path from .openai_server import RemoteMMEncoderServer @@ -69,7 +70,8 @@ def async_client(server: RemoteMMEncoderServer): def test_multimodal_content_mm_encoder(client: openai.OpenAI, model_name: str): content_text = "Describe the natural environment in the image." - image_url = "https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/seashore.png" + image_url = str(llm_models_root() / "multimodals" / "test_data" / + "seashore.png") messages = [{ "role": "user", diff --git a/tests/unittest/tools/test_config_database_sync.py b/tests/unittest/tools/test_config_database_sync.py new file mode 100644 index 00000000000..92a42431669 --- /dev/null +++ b/tests/unittest/tools/test_config_database_sync.py @@ -0,0 +1,127 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +import tempfile +import unittest +from pathlib import Path + +# Add scripts directory to path +REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../..")) +SCRIPTS_DIR = os.path.join(REPO_ROOT, "scripts") +sys.path.insert(0, SCRIPTS_DIR) + +from generate_config_database_tests import ( # noqa: E402 + PERF_SANITY_DIR, + TEST_LIST_PATH, + generate_tests, +) +from generate_config_table import generate_rst # noqa: E402 + + +class TestConfigDatabaseSync(unittest.TestCase): + def test_config_table_sync(self): + """Test that the config_table.rst file is synchronized with the lookup.yaml database. + + Ensures that the RST file is up-to-date with the YAML database. 
+ """ + if generate_rst is None: + self.skipTest("generate_config_table not available") + + # Define paths + yaml_path = os.path.join(REPO_ROOT, "examples/configs/database/lookup.yaml") + rst_path = os.path.join(REPO_ROOT, "docs/source/deployment-guide/config_table.rst") + + # Ensure files exist + self.assertTrue(os.path.exists(yaml_path), f"YAML file not found: {yaml_path}") + self.assertTrue(os.path.exists(rst_path), f"RST file not found: {rst_path}") + + # Read existing RST content + with open(rst_path, "r") as f: + existing_content = f.read() + + # Generate new RST content + with tempfile.NamedTemporaryFile(mode="w+", delete=True) as tmp: + generate_rst(yaml_path, output_file=tmp.name) + tmp.seek(0) + generated_content = tmp.read() + + # Compare content + self.assertEqual( + existing_content.strip(), + generated_content.strip(), + "config_table.rst is not synchronized with lookup.yaml. " + "Please run 'python3 scripts/generate_config_table.py' from the repo root to update it.", + ) + + def test_config_database_tests_sync(self): + """Test that config database test files are synchronized with lookup.yaml. + + Ensures that both the test list YAML and per-GPU config files are up-to-date. + """ + self.assertTrue(TEST_LIST_PATH.exists(), f"Test list not found: {TEST_LIST_PATH}") + + with open(TEST_LIST_PATH) as f: + existing_test_list = f.read() + + existing_config_files = {} + for config_path in PERF_SANITY_DIR.glob("config_database_*.yaml"): + with open(config_path) as f: + existing_config_files[config_path.name] = f.read() + + # Generate to temp directory + with tempfile.TemporaryDirectory() as tmp_dir: + tmp_config_dir = Path(tmp_dir) / "configs" + tmp_test_list_path = Path(tmp_dir) / "test_list.yml" + tmp_config_dir.mkdir(parents=True, exist_ok=True) + + generate_tests(tmp_test_list_path, tmp_config_dir) + + with open(tmp_test_list_path) as f: + generated_test_list = f.read() + + self.assertEqual( + existing_test_list.strip(), + generated_test_list.strip(), + f"{TEST_LIST_PATH} is not synchronized with lookup.yaml. " + "Please run 'python3 scripts/generate_config_database_tests.py' from the repo root.", + ) + + generated_config_files = {} + for config_path in tmp_config_dir.glob("config_database_*.yaml"): + with open(config_path) as f: + generated_config_files[config_path.name] = f.read() + + # Check same set of files + self.assertEqual( + set(existing_config_files.keys()), + set(generated_config_files.keys()), + "Mismatch in config database config files. " + "Please run 'python scripts/generate_config_database_tests.py' from the repo root.", + ) + + # Compare each config file + for filename in existing_config_files: + self.assertEqual( + existing_config_files[filename].strip(), + generated_config_files[filename].strip(), + f"{filename} is not synchronized with lookup.yaml. " + "Please run 'python scripts/generate_config_database_tests.py' from the repo root.", + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/unittest/tools/test_generate_config_table.py b/tests/unittest/tools/test_generate_config_table.py deleted file mode 100644 index a2dcf66783f..00000000000 --- a/tests/unittest/tools/test_generate_config_table.py +++ /dev/null @@ -1,66 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import sys -import tempfile -import unittest - -# Add scripts directory to path -REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../..")) -SCRIPTS_DIR = os.path.join(REPO_ROOT, "scripts") -sys.path.insert(0, SCRIPTS_DIR) - -from generate_config_table import generate_rst # noqa: E402 - - -class TestConfigTableSync(unittest.TestCase): - def test_config_table_sync(self): - """Test that the config_table.rst file is synchronized with the lookup.yaml database. - - Ensures that the RST file is up-to-date with the YAML database. - """ - if generate_rst is None: - self.skipTest("generate_config_table not available") - - # Define paths - yaml_path = os.path.join(REPO_ROOT, "examples/configs/database/lookup.yaml") - rst_path = os.path.join(REPO_ROOT, "docs/source/deployment-guide/config_table.rst") - - # Ensure files exist - self.assertTrue(os.path.exists(yaml_path), f"YAML file not found: {yaml_path}") - self.assertTrue(os.path.exists(rst_path), f"RST file not found: {rst_path}") - - # Read existing RST content - with open(rst_path, "r") as f: - existing_content = f.read() - - # Generate new RST content - with tempfile.NamedTemporaryFile(mode="w+", delete=True) as tmp: - generate_rst(yaml_path, output_file=tmp.name) - tmp.seek(0) - generated_content = tmp.read() - - # Compare content - self.assertEqual( - existing_content.strip(), - generated_content.strip(), - "config_table.rst is not synchronized with lookup.yaml. " - "Please run 'python3 scripts/generate_config_table.py' from the repo root to update it.", - ) - - -if __name__ == "__main__": - unittest.main()
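For context, a hedged sketch (not part of this patch) of the generic "regenerate into a temp location and diff against the checked-in file" pattern that the new test_config_database_sync.py applies to config_table.rst and the generated config-database test lists. generate_artifact and the checked-in path are hypothetical stand-ins, not real repository files or generators.

# Illustrative sketch only -- not part of this diff. `generate_artifact` stands in for a
# real generator such as generate_rst() or generate_tests(); the checked-in path is made up.
import tempfile
import unittest
from pathlib import Path


def generate_artifact(output_file: str) -> None:
    # Hypothetical generator: writes the content that should already be checked in.
    Path(output_file).write_text("generated content\n")


class TestArtifactInSync(unittest.TestCase):
    def test_checked_in_file_matches_generator_output(self):
        checked_in = Path("docs/generated_artifact.txt")  # hypothetical checked-in file
        self.assertTrue(checked_in.exists(), f"File not found: {checked_in}")

        # Regenerate into a temporary file and read it back, as the real tests do.
        with tempfile.NamedTemporaryFile(mode="w+", suffix=".txt") as tmp:
            generate_artifact(tmp.name)
            tmp.seek(0)
            regenerated = tmp.read()

        self.assertEqual(
            checked_in.read_text().strip(),
            regenerated.strip(),
            "Checked-in file is stale; re-run the generator script to update it.",
        )


if __name__ == "__main__":
    unittest.main()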