diff --git a/cpp/include/tensorrt_llm/executor/executor.h b/cpp/include/tensorrt_llm/executor/executor.h index dda8f52cc82..787fa0bb7e9 100644 --- a/cpp/include/tensorrt_llm/executor/executor.h +++ b/cpp/include/tensorrt_llm/executor/executor.h @@ -1468,7 +1468,8 @@ class CacheTransceiverConfig DEFAULT = 0, MPI = 1, UCX = 2, - NIXL = 3 + NIXL = 3, + MOONCAKE = 4 }; explicit CacheTransceiverConfig(std::optional backendType = std::nullopt, std::optional maxNumTokens = std::nullopt, std::optional kvTransferTimeoutMs = std::nullopt, diff --git a/cpp/include/tensorrt_llm/executor/transferAgent.h b/cpp/include/tensorrt_llm/executor/transferAgent.h index ac469fcb403..5f4ff1f0616 100644 --- a/cpp/include/tensorrt_llm/executor/transferAgent.h +++ b/cpp/include/tensorrt_llm/executor/transferAgent.h @@ -391,6 +391,14 @@ template "libtensorrt_llm_nixl_wrapper.so", "createNixlTransferAgent"); return func(std::forward(args)...); } + if (backend == "mooncake") + { + auto& loader = DynLibLoader::getInstance(); + using CreateMooncakeFuncType = std::unique_ptr (*)(BaseAgentConfig const*); + auto* func = loader.getFunctionPointer( + "libtensorrt_llm_mooncake_wrapper.so", "createMooncakeTransferAgent"); + return func(std::forward(args)...); + } TLLM_THROW("Unknown backend name."); } diff --git a/cpp/tensorrt_llm/CMakeLists.txt b/cpp/tensorrt_llm/CMakeLists.txt index a9e4a007290..76604ec2296 100644 --- a/cpp/tensorrt_llm/CMakeLists.txt +++ b/cpp/tensorrt_llm/CMakeLists.txt @@ -159,6 +159,10 @@ if(NIXL_ROOT) set(NIXL_WRAPPER_TARGET tensorrt_llm_nixl_wrapper) endif() +if(MOONCAKE_ROOT) + set(MOONCAKE_WRAPPER_TARGET tensorrt_llm_mooncake_wrapper) +endif() + add_subdirectory(executor) find_package(Threads REQUIRED) @@ -272,6 +276,11 @@ if(TARGET ${NIXL_WRAPPER_TARGET}) add_dependencies(${SHARED_TARGET} ${NIXL_WRAPPER_TARGET}) endif() +if(TARGET ${MOONCAKE_WRAPPER_TARGET}) + target_link_libraries(${MOONCAKE_WRAPPER_TARGET} INTERFACE ${SHARED_TARGET}) + add_dependencies(${SHARED_TARGET} ${MOONCAKE_WRAPPER_TARGET}) +endif() + if(NOT WIN32) # Load libraries at $PREFIX/lib from # $PREFIX/lib/python3.12/site-packages/tensorrt_llm/libs diff --git a/cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp b/cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp index bb253c969f3..7e4c26bfd78 100644 --- a/cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp +++ b/cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp @@ -81,6 +81,11 @@ std::unique_ptr CacheTransceiverFactory::createCacheTransc backendType = executor::CacheTransceiverConfig::BackendType::NIXL; TLLM_LOG_INFO("Enable NIXL KV cache transport."); } + else if (common::getEnvUseMooncakeKvCache()) + { + backendType = executor::CacheTransceiverConfig::BackendType::MOONCAKE; + TLLM_LOG_INFO("Enable MOONCAKE KV cache transport."); + } else if (common::getEnvUseMPIKvCache()) { backendType = executor::CacheTransceiverConfig::BackendType::MPI; @@ -203,9 +208,15 @@ CacheTransceiver::CacheTransceiver(kv_cache_manager::BaseKVCacheManager* cacheMa else if (backendType.value() == executor::CacheTransceiverConfig::BackendType::NIXL) { mManager = std::make_unique( - mCacheTransBufferManagerPtrs, *mCacheState); + mCacheTransBufferManagerPtrs, *mCacheState, "nixl"); TLLM_LOG_INFO("NIXL Connection Manager created"); } + else if (backendType.value() == executor::CacheTransceiverConfig::BackendType::MOONCAKE) + { + mManager = std::make_unique( + mCacheTransBufferManagerPtrs, *mCacheState, "mooncake"); + TLLM_LOG_INFO("MOONCAKE Connection Manager created"); + } else if (backendType.value() == 
executor::CacheTransceiverConfig::BackendType::MPI) { mMpiWorldComm = std::addressof(tensorrt_llm::mpi::MpiComm::world()); diff --git a/cpp/tensorrt_llm/common/envUtils.cpp b/cpp/tensorrt_llm/common/envUtils.cpp index fc85975acb3..4a082a4ff3a 100644 --- a/cpp/tensorrt_llm/common/envUtils.cpp +++ b/cpp/tensorrt_llm/common/envUtils.cpp @@ -281,6 +281,12 @@ bool getEnvUseNixlKvCache() return useNixlKvCache; } +bool getEnvUseMooncakeKvCache() +{ + static bool const useMooncakeKvCache = getBoolEnv("TRTLLM_USE_MOONCAKE_KVCACHE"); + return useMooncakeKvCache; +} + bool getEnvUseRoundRobinBlockDistForCP() { static bool const useRoundRobinBlockDistForCP = getBoolEnv("TRTLLM_USE_ROUND_ROBIN_BLOCK_DIST_FOR_CP"); @@ -343,6 +349,23 @@ std::string getEnvNixlBackend() return nixlBackend; } +std::string getEnvMooncakeInterface() +{ + static std::once_flag flag; + static std::string mooncakeInterface; + + std::call_once(flag, + [&]() + { + char const* mooncake_interface = std::getenv("TRTLLM_MOONCAKE_INTERFACE"); + if (mooncake_interface) + { + mooncakeInterface = mooncake_interface; + } + }); + return mooncakeInterface; +} + bool getEnvDisaggLayerwise() { static bool const disaggLayerwise = getBoolEnv("TRTLLM_DISAGG_LAYERWISE"); diff --git a/cpp/tensorrt_llm/common/envUtils.h b/cpp/tensorrt_llm/common/envUtils.h index 8a3af2458dd..f838f0e9ae0 100644 --- a/cpp/tensorrt_llm/common/envUtils.h +++ b/cpp/tensorrt_llm/common/envUtils.h @@ -83,8 +83,11 @@ inline void launchWithPdlWhenEnabled(char const* name, KernelFn kernelFn, dim3 g bool getEnvUseUCXKvCache(); bool getEnvUseMPIKvCache(); + bool getEnvUseNixlKvCache(); +bool getEnvUseMooncakeKvCache(); + bool getEnvUseRoundRobinBlockDistForCP(); std::string getEnvUCXInterface(); @@ -93,6 +96,8 @@ std::string getEnvNixlInterface(); std::string getEnvNixlBackend(); +std::string getEnvMooncakeInterface(); + bool getEnvDisaggLayerwise(); bool getEnvParallelCacheSend(); diff --git a/cpp/tensorrt_llm/common/ipUtils.cpp b/cpp/tensorrt_llm/common/ipUtils.cpp new file mode 100644 index 00000000000..e4e9767194e --- /dev/null +++ b/cpp/tensorrt_llm/common/ipUtils.cpp @@ -0,0 +1,226 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "ipUtils.h" +#include "tensorrt_llm/common/logger.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +TRTLLM_NAMESPACE_BEGIN + +namespace common +{ + +std::string getLocalIpByNic(std::string const& interface, int rank) +{ + struct ifaddrs* ifaddr = nullptr; + if (getifaddrs(&ifaddr) == -1) + { + TLLM_LOG_ERROR(rank, + "getLocalIpByNic: Can't get local ip from NIC Interface. 
Please check whether corresponding INTERFACE is " + "set " + "correctly."); + return std::string{}; + } + + for (struct ifaddrs* ifa = ifaddr; ifa != nullptr; ifa = ifa->ifa_next) + { + if (ifa->ifa_addr == nullptr) + { + continue; + } + + if (ifa->ifa_name == interface) + { + if (ifa->ifa_addr->sa_family == AF_INET) + { + char ip[INET_ADDRSTRLEN]{}; + void* addr = &((reinterpret_cast(ifa->ifa_addr))->sin_addr); + if ((inet_ntop(AF_INET, addr, ip, sizeof(ip)) != nullptr) && std::strcmp(ip, "0.0.0.0") != 0) + { + freeifaddrs(ifaddr); + return std::string(ip); + } + } + else if (ifa->ifa_addr->sa_family == AF_INET6) + { + char ip[INET6_ADDRSTRLEN]{}; + void* addr = &((reinterpret_cast(ifa->ifa_addr))->sin6_addr); + if ((inet_ntop(AF_INET6, addr, ip, sizeof(ip)) != nullptr) && std::strncmp(ip, "fe80::", 6) != 0 + && std::strcmp(ip, "::1") != 0) + { + freeifaddrs(ifaddr); + return std::string(ip); + } + } + } + } + + freeifaddrs(ifaddr); + TLLM_LOG_ERROR( + rank, "Can't get local ip from NIC Interface. Please check whether corresponding INTERFACE is set correctly."); + return std::string{}; +} + +std::string getLocalIpByHostname(int rank) +{ + char hostname[256]{}; + if (gethostname(hostname, sizeof(hostname)) == -1) + { + TLLM_LOG_ERROR(rank, "getLocalIpByHostname: Can't get hostname"); + return std::string{}; + } + + struct addrinfo hints = {}; + hints.ai_family = AF_UNSPEC; + hints.ai_socktype = SOCK_STREAM; + hints.ai_flags = AI_CANONNAME; + + struct addrinfo* res = nullptr; + if (getaddrinfo(hostname, nullptr, &hints, &res) != 0) + { + TLLM_LOG_WARNING(rank, "getLocalIpByHostname: Can't get address info for hostname"); + return std::string{}; + } + + for (struct addrinfo* p = res; p != nullptr; p = p->ai_next) + { + + if (p->ai_family == AF_INET) + { // IPv4 + char ip[INET_ADDRSTRLEN]{}; + struct sockaddr_in* ipv4 = reinterpret_cast(p->ai_addr); + void* addr = &(ipv4->sin_addr); + if ((inet_ntop(AF_INET, addr, ip, sizeof(ip)) != nullptr) && std::strcmp(ip, "127.0.0.1") != 0 + && std::strcmp(ip, "0.0.0.0") != 0) + { + freeaddrinfo(res); + return std::string(ip); + } + } + else if (p->ai_family == AF_INET6) + { // IPv6 + char ip[INET6_ADDRSTRLEN]{}; + struct sockaddr_in6* ipv6 = reinterpret_cast(p->ai_addr); + void* addr = &(ipv6->sin6_addr); + if ((inet_ntop(AF_INET6, addr, ip, sizeof(ip)) != nullptr) && std::strncmp(ip, "fe80::", 6) != 0 + && std::strcmp(ip, "::1") != 0) + { + freeaddrinfo(res); + return std::string(ip); + } + } + } + + freeaddrinfo(res); + TLLM_LOG_WARNING(rank, "getLocalIpByHostname: Can't get local ip from hostname"); + return std::string{}; +} + +std::string getLocalIpByRemoteOrHostName(int rank) +{ + + // Try IPv4 + struct sockaddr_in addr + { + }; + + addr.sin_family = AF_INET; + addr.sin_port = htons(80); + // using google's public dns server to get the local ip which can be accessed from remote + char const* dns_ip_v4 = "8.8.8.8"; + inet_pton(AF_INET, dns_ip_v4, &addr.sin_addr); + + int sock = socket(AF_INET, SOCK_DGRAM, 0); + if (sock != -1) + { + if (connect(sock, reinterpret_cast(&addr), sizeof(addr)) != -1) + { + socklen_t addr_len = sizeof(addr); + if (getsockname(sock, reinterpret_cast(&addr), &addr_len) != -1) + { + char ip[INET_ADDRSTRLEN]{}; + inet_ntop(AF_INET, &addr.sin_addr, ip, sizeof(ip)); + close(sock); + return std::string(ip); + } + } + close(sock); + } + + // Try IPv6 + struct sockaddr_in6 addr6 + { + }; + + addr6.sin6_family = AF_INET6; + addr6.sin6_port = htons(80); + // using google's public dns server + char const* dns_ipv6 = 
"2001:4860:4860::8888"; + inet_pton(AF_INET6, dns_ipv6, &addr6.sin6_addr); + + sock = socket(AF_INET6, SOCK_DGRAM, 0); + if (sock != -1) + { + if (connect(sock, reinterpret_cast(&addr6), sizeof(addr6)) != -1) + { + socklen_t addr_len = sizeof(addr6); + if (getsockname(sock, reinterpret_cast(&addr6), &addr_len) != -1) + { + char ip[INET6_ADDRSTRLEN]{}; + inet_ntop(AF_INET6, &addr6.sin6_addr, ip, sizeof(ip)); + close(sock); + return std::string(ip); + } + } + close(sock); + } + + // Try hostname + return getLocalIpByHostname(rank); +} + +std::string getLocalIp(std::string interface, int rank) +{ + std::string localIP = {}; + if (!interface.empty()) + { + localIP = getLocalIpByNic(interface, rank); + } + if (localIP.empty()) + { + localIP = getLocalIpByRemoteOrHostName(rank); + } + // check whether the localIP is valid + if (localIP.empty()) + { + TLLM_THROW("getLocalIp: Can't get local ip"); + } + return localIP; +} +} // namespace common + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/ipUtils.h b/cpp/tensorrt_llm/common/ipUtils.h new file mode 100644 index 00000000000..9e8081683df --- /dev/null +++ b/cpp/tensorrt_llm/common/ipUtils.h @@ -0,0 +1,28 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "tensorrt_llm/common/config.h" +#include + +TRTLLM_NAMESPACE_BEGIN + +namespace common +{ +std::string getLocalIp(std::string interface, int rank); +} // namespace common + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/executor/CMakeLists.txt b/cpp/tensorrt_llm/executor/CMakeLists.txt index e0e91d4b993..6639b582751 100644 --- a/cpp/tensorrt_llm/executor/CMakeLists.txt +++ b/cpp/tensorrt_llm/executor/CMakeLists.txt @@ -91,3 +91,4 @@ target_compile_definitions(${EXECUTOR_STATIC_TARGET} add_subdirectory(cache_transmission/ucx_utils) add_subdirectory(cache_transmission/nixl_utils) +add_subdirectory(cache_transmission/mooncake_utils) diff --git a/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.cpp b/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.cpp index b9dcc22a578..ee8e8e21b35 100644 --- a/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.cpp +++ b/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.cpp @@ -231,12 +231,12 @@ bool AgentConnection::recvReadySignal(DataContext const& ctx) const { ReadySignalInfo readySignalInfo{mAgentName, ctx, false}; mAgentConnectionManager->waitForReadySignal(mRemoteAgentName, readySignalInfo); - return true; + return readySignalInfo.mIsReady; } AgentConnectionManager::AgentConnectionManager( std::vector cacheTransBufferManagers, - CacheState cacheState) + CacheState cacheState, std::string const& backendType) : mCacheState(std::move(cacheState)) , mCacheTransBufferManagers(std::move(cacheTransBufferManagers)) , mRegMemDescs(MemoryType::kVRAM, {}) @@ -247,7 +247,7 @@ AgentConnectionManager::AgentConnectionManager( mAgentName = genUniqueAgentName(); // Create Agent BaseAgentConfig config{mAgentName, true}; - m_Agent = makeTransferAgent("nixl", &config); + m_Agent = makeTransferAgent(backendType, &config); TLLM_CHECK(!mCacheTransBufferManagers.empty()); std::vector memDescs; for (auto* cacheTransBufferManager : mCacheTransBufferManagers) diff --git a/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.h b/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.h index d5a780bf45b..6b8bd875e4a 100644 --- a/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.h +++ b/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.h @@ -277,7 +277,7 @@ class AgentConnectionManager : public ConnectionManager public: AgentConnectionManager( std::vector cacheTransBufferManagers, - CacheState cacheState); + CacheState cacheState, std::string const& backendType); ~AgentConnectionManager(); AgentConnection* recvConnect(DataContext const& ctx, void* data, size_t size) override; [[nodiscard]] std::vector getConnections(CommState const& state) override; diff --git a/cpp/tensorrt_llm/executor/cache_transmission/mooncake_utils/CMakeLists.txt b/cpp/tensorrt_llm/executor/cache_transmission/mooncake_utils/CMakeLists.txt new file mode 100644 index 00000000000..105d3b93f1f --- /dev/null +++ b/cpp/tensorrt_llm/executor/cache_transmission/mooncake_utils/CMakeLists.txt @@ -0,0 +1,40 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & +# AFFILIATES. All rights reserved. SPDX-License-Identifier: NVIDIA TensorRT +# Source Code License Agreement +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related documentation +# and any modifications thereto. 
Any use, reproduction, disclosure or +# distribution of this material and related documentation without an express +# license agreement from NVIDIA CORPORATION or its affiliates is strictly +# prohibited. + +# MOONCAKE is not supported on Rocky8 for now +set(IS_ROCKY8 FALSE) +if(EXISTS "/etc/redhat-release") + set(IS_ROCKY8 TRUE) +endif() + +if(MOONCAKE_ROOT AND NOT IS_ROCKY8) + find_library(TRANSFER_ENGINE_LIB transfer_engine ${MOONCAKE_ROOT}/lib) + find_path(TRANSFER_ENGINE_INCLUDE_DIR transfer_engine_c.h + ${MOONCAKE_ROOT}/include) + + message(STATUS "Find transfer engine results:") + message(STATUS " TRANSFER_ENGINE_LIB = ${TRANSFER_ENGINE_LIB}") + message( + STATUS " TRANSFER_ENGINE_INCLUDE_DIR = ${TRANSFER_ENGINE_INCLUDE_DIR}") + + if(TRANSFER_ENGINE_LIB AND TRANSFER_ENGINE_INCLUDE_DIR) + set(MOONCAKE_WRAPPER_TARGET "tensorrt_llm_mooncake_wrapper") + + add_library(${MOONCAKE_WRAPPER_TARGET} SHARED transferAgent.cpp) + target_compile_options(${MOONCAKE_WRAPPER_TARGET} PRIVATE -Wno-error) + + target_include_directories(${MOONCAKE_WRAPPER_TARGET} + PRIVATE ${TRANSFER_ENGINE_INCLUDE_DIR}) + + target_link_libraries(${MOONCAKE_WRAPPER_TARGET} + PRIVATE ${TRANSFER_ENGINE_LIB} CUDA::cudart) + endif() +endif() diff --git a/cpp/tensorrt_llm/executor/cache_transmission/mooncake_utils/transferAgent.cpp b/cpp/tensorrt_llm/executor/cache_transmission/mooncake_utils/transferAgent.cpp new file mode 100644 index 00000000000..eabbca98c3c --- /dev/null +++ b/cpp/tensorrt_llm/executor/cache_transmission/mooncake_utils/transferAgent.cpp @@ -0,0 +1,546 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "tensorrt_llm/executor/cache_transmission/mooncake_utils/transferAgent.h" +#include "tensorrt_llm/common/envUtils.h" +#include "tensorrt_llm/common/ipUtils.h" +#include "tensorrt_llm/common/logger.h" +#include "tensorrt_llm/executor/transferAgent.h" +#include "tensorrt_llm/runtime/utils/mpiUtils.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace tensorrt_llm::executor::kv_cache +{ + +MooncakeTransferStatus::MooncakeTransferStatus(transfer_engine_t engine, uint64_t batchId, size_t requestCount) + : mEngine{engine} + , mBatchId{batchId} + , mRequestCount{requestCount} +{ + TLLM_CHECK(mEngine); +} + +void MooncakeTransferStatus::wait() const +{ + while (!isCompleted()) + { + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } +} + +[[nodiscard]] bool MooncakeTransferStatus::isCompleted() const +{ + if (mBatchFreed) + { + return true; + } + + bool has_failed = false; + for (size_t index = 0; index < mRequestCount; ++index) + { + transfer_status_t status; + int rc = getTransferStatus(mEngine, mBatchId, index, &status); + if (rc || status.status == STATUS_FAILED) + { + has_failed = true; + if (rc) + { + TLLM_LOG_ERROR( + "Failed to get transfer status for batch %lu, task %zu: error code %d", mBatchId, index, rc); + } + else + { + TLLM_LOG_ERROR("Transfer failed for batch %lu, task %zu: status %d", mBatchId, index, status.status); + } + } + else if (status.status == STATUS_PENDING || status.status == STATUS_WAITING) + { + TLLM_LOG_DEBUG("Transfer is pending for batch %lu, task %zu", mBatchId, index); + return false; + } + } + if (!has_failed) + { + // Each batchId has the batch size, and cannot process more requests + // than the batch size. So, free the batch id here to workaround the issue + // where the same batchId could be used to post multiple transfer. + freeBatchID(mEngine, mBatchId); + mBatchFreed = true; + TLLM_LOG_DEBUG("Batch ID %lu freed, future calls will return true directly", mBatchId); + } + // Currently, we cannot distinguish between failed and completed from return value. 
+ TLLM_LOG_DEBUG("Transfer is completed for batch %lu", mBatchId); + return true; +} + +const std::string MooncakeBase64Helper::STANDARD_CHARS + = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz" + "0123456789+/"; + +std::string MooncakeBase64Helper::encode(std::vector const& data) +{ + return encodeInternal(data, STANDARD_CHARS); +} + +std::string MooncakeBase64Helper::encode(std::string const& data) +{ + std::vector vec(data.begin(), data.end()); + return encode(vec); +} + +std::vector MooncakeBase64Helper::decode(std::string const& encoded) +{ + return decodeInternal(encoded, STANDARD_CHARS); +} + +std::string MooncakeBase64Helper::decodeToString(std::string const& encoded) +{ + auto vec = decode(encoded); + return std::string(vec.begin(), vec.end()); +} + +std::string MooncakeBase64Helper::encodeInternal(std::vector const& data, std::string const& chars) +{ + std::string encoded; + size_t i = 0; + size_t j = 0; + std::array charArray3{}; + std::array charArray4{}; + size_t dataLen = data.size(); + uint8_t const* bytes = data.data(); + + while (dataLen--) + { + charArray3[i++] = *(bytes++); + if (i == 3) + { + charArray4[0] = (charArray3[0] & 0xfc) >> 2; + charArray4[1] = ((charArray3[0] & 0x03) << 4) + ((charArray3[1] & 0xf0) >> 4); + charArray4[2] = ((charArray3[1] & 0x0f) << 2) + ((charArray3[2] & 0xc0) >> 6); + charArray4[3] = charArray3[2] & 0x3f; + + for (i = 0; i < 4; i++) + { + encoded += chars[charArray4[i]]; + } + i = 0; + } + } + + if (i > 0) + { + for (j = i; j < 3; j++) + { + charArray3[j] = '\0'; + } + + charArray4[0] = (charArray3[0] & 0xfc) >> 2; + charArray4[1] = ((charArray3[0] & 0x03) << 4) + ((charArray3[1] & 0xf0) >> 4); + charArray4[2] = ((charArray3[1] & 0x0f) << 2) + ((charArray3[2] & 0xc0) >> 6); + charArray4[3] = charArray3[2] & 0x3f; + + for (j = 0; j < i + 1; j++) + { + encoded += chars[charArray4[j]]; + } + + while (i++ < 3) + { + encoded += '='; + } + } + + return encoded; +} + +std::vector MooncakeBase64Helper::decodeInternal(std::string const& encoded, std::string const& chars) +{ + size_t encodedLen = encoded.size(); + size_t i = 0; + size_t j = 0; + size_t in_ = 0; + std::array charArray3{}; + std::array charArray4{}; + std::vector decoded; + + std::string cleanEncoded; + for (char c : encoded) + { + if (!isWhitespace(c)) + { + cleanEncoded += c; + } + } + + encodedLen = cleanEncoded.size(); + + while (encodedLen-- && cleanEncoded[in_] != '=' && isBase64(cleanEncoded[in_], chars)) + { + charArray4[i++] = cleanEncoded[in_]; + in_++; + if (i == 4) + { + for (i = 0; i < 4; i++) + { + charArray4[i] = chars.find(charArray4[i]); + } + + charArray3[0] = (charArray4[0] << 2) + ((charArray4[1] & 0x30) >> 4); + charArray3[1] = ((charArray4[1] & 0xf) << 4) + ((charArray4[2] & 0x3c) >> 2); + charArray3[2] = ((charArray4[2] & 0x3) << 6) + charArray4[3]; + + for (i = 0; i < 3; i++) + { + decoded.push_back(charArray3[i]); + } + i = 0; + } + } + + if (i > 0) + { + for (j = i; j < 4; j++) + { + charArray4[j] = 0; + } + + for (j = 0; j < 4; j++) + { + charArray4[j] = chars.find(charArray4[j]); + } + + charArray3[0] = (charArray4[0] << 2) + ((charArray4[1] & 0x30) >> 4); + charArray3[1] = ((charArray4[1] & 0xf) << 4) + ((charArray4[2] & 0x3c) >> 2); + charArray3[2] = ((charArray4[2] & 0x3) << 6) + charArray4[3]; + + for (j = 0; j < i - 1; j++) + { + decoded.push_back(charArray3[j]); + } + } + + return decoded; +} + +bool MooncakeBase64Helper::isBase64(uint8_t c, std::string const& chars) +{ + return (isalnum(c) || (c == chars[62]) || (c == chars[63])); +} + 
+bool MooncakeBase64Helper::isWhitespace(uint8_t c) +{ + return (c == ' ' || c == '\n' || c == '\r' || c == '\t'); +} + +MooncakeTransferAgent::MooncakeTransferAgent(BaseAgentConfig const& config) +{ + mLocalAgentName = config.mName; + std::string segmentName = "127.0.0.1"; + + if (getenv("TLLM_MOONCAKE_IP_ADDR")) + { + segmentName = std::string(getenv("TLLM_MOONCAKE_IP_ADDR")); + } + else + { + auto ip = common::getLocalIp(common::getEnvMooncakeInterface(), mpi::MpiComm::session().getRank()); + if (!ip.empty()) + segmentName = ip; + } + + mEngine = createTransferEngine("P2PHANDSHAKE", segmentName.c_str(), "", 0, true); +} + +void MooncakeTransferAgent::registerMemory(RegisterDescs const& descs) +{ + TLLM_LOG_DEBUG("MooncakeTransferAgent::registerMemory"); + + std::lock_guard lock(mMutex); + for (auto const& desc : descs.getDescs()) + { + auto it = mMemRegInfo.find(desc.getAddr()); + if (it != mMemRegInfo.end()) + { + it->second->addRef(); + continue; + } + + int err = registerLocalMemory(mEngine, reinterpret_cast(desc.getAddr()), desc.getLen(), "*", 1); + + TLLM_CHECK_WITH_INFO(err == 0, "registerLocalMemory failed, addr: %p, len: %lu", + reinterpret_cast(desc.getAddr()), desc.getLen()); + + auto mooncakeDesc = std::make_shared(desc); + mMemRegInfo[desc.getAddr()] = std::move(mooncakeDesc); + } +} + +void MooncakeTransferAgent::deregisterMemory(RegisterDescs const& descs) +{ + TLLM_LOG_DEBUG("MooncakeTransferAgent::deregisterMemory"); + + std::lock_guard lock(mMutex); + for (auto const& desc : descs.getDescs()) + { + auto it = mMemRegInfo.find(desc.getAddr()); + if (it != mMemRegInfo.end()) + { + auto const& mooncakeDesc = it->second; + mooncakeDesc->releaseRef(); + if (mooncakeDesc->getRefCount()) + continue; + + int err = unregisterLocalMemory(mEngine, reinterpret_cast(desc.getAddr())); + + TLLM_CHECK_WITH_INFO( + err == 0, "unregisterLocalMemory failed, addr: %p", reinterpret_cast(desc.getAddr())); + + mMemRegInfo.erase(desc.getAddr()); + } + } +} + +void MooncakeTransferAgent::loadRemoteAgent(std::string const& name, AgentDesc const& agentDesc) +{ + TLLM_LOG_DEBUG("MooncakeTransferAgent::loadRemoteAgent"); + + // Do the same thing as loadRemoteAgent(std::string const& name, ConnectionInfoType const& connectionInfo) + loadRemoteAgent(name, std::move(agentDesc.getBackendAgentDesc())); +} + +void MooncakeTransferAgent::loadRemoteAgent(std::string const& name, ConnectionInfoType const& connectionInfo) +{ + TLLM_LOG_DEBUG(mpi::MpiComm::world().getRank(), + "MooncakeTransferAgent::loadRemoteAgent loadRemoteAgent to %s remoteagent name: %s", connectionInfo.c_str(), + name.c_str()); + + std::lock_guard lock(mMutex); + auto segmentId = openSegment(mEngine, connectionInfo.c_str()); + + TLLM_CHECK_WITH_INFO( + segmentId >= 0, "loadRemoteAgent openSegment failed, connectionInfo: %s", connectionInfo.c_str()); + + mConnectedAgents[name].segmentId = segmentId; +} + +void MooncakeTransferAgent::invalidateRemoteAgent(std::string const& name) +{ + TLLM_LOG_DEBUG("MooncakeTransferAgent::invalidateRemoteAgent"); +} + +AgentDesc MooncakeTransferAgent::getLocalAgentDesc() +{ + TLLM_LOG_DEBUG("MooncakeTransferAgent::getLocalAgentDesc"); + + // Using connection info as agent desc + const static size_t kBufLen = 64; + char connectionInfo[kBufLen]; + + int ret = getLocalIpAndPort(mEngine, connectionInfo, kBufLen); + + TLLM_CHECK_WITH_INFO(ret == 0, "MooncakeTransferAgent::getLocalAgentDesc::getLocalIpAndPort failed"); + + return AgentDesc{std::string(connectionInfo)}; +} + +ConnectionInfoType 
MooncakeTransferAgent::getLocalConnectionInfo() +{ + TLLM_LOG_DEBUG("MooncakeTransferAgent::getLocalConnectionInfo"); + + const static size_t kBufLen = 64; + char connectionInfo[kBufLen]; + + int ret = getLocalIpAndPort(mEngine, connectionInfo, kBufLen); + + TLLM_CHECK_WITH_INFO(ret == 0, "MooncakeTransferAgent::getLocalAgentDesc::getLocalConnectionInfo failed"); + + return std::string(connectionInfo); +} + +[[nodiscard]] std::unique_ptr MooncakeTransferAgent::submitTransferRequests( + TransferRequest const& request) +{ + TLLM_LOG_DEBUG("MooncakeTransferAgent::submitTransferRequests"); + + bool hasNotif = false; + std::string syncMessage; + + if (request.getSyncMessage().has_value()) + { + hasNotif = true; + syncMessage = request.getSyncMessage().value(); + } + + const static size_t kMaxRequestCount = 1024; + uint64_t batchId = allocateBatchID(mEngine, kMaxRequestCount); + + TLLM_CHECK_WITH_INFO(batchId != INVALID_BATCH, "allocateBatchID failed"); + + int segmentId; + { + std::lock_guard lock(mMutex); + std::string remoteName = request.getRemoteName(); + + auto it = mConnectedAgents.find(remoteName); + if (it == mConnectedAgents.end()) + { + std::string error = "Remote agent " + remoteName + "not found"; + TLLM_THROW(error); + } + + auto const& agentInfo = it->second; + segmentId = agentInfo.segmentId; + } + + auto localDescs = request.getSrcDescs().getDescs(); + auto remoteDescs = request.getDstDescs().getDescs(); + + TLLM_CHECK_WITH_INFO(localDescs.size() == remoteDescs.size(), "Number of local and remote memory must match"); + + size_t requestCount = localDescs.size(); + std::vector transferRequests(requestCount); + + for (size_t index = 0; index < requestCount; ++index) + { + TLLM_CHECK_WITH_INFO( + localDescs[index].getLen() == remoteDescs[index].getLen(), "Length of local and remote memory must match"); + + transferRequests[index].opcode = (request.getOp() == TransferOp::kREAD) ? 
OPCODE_READ : OPCODE_WRITE; + transferRequests[index].source = reinterpret_cast(localDescs[index].getAddr()); + transferRequests[index].target_offset = remoteDescs[index].getAddr(); + transferRequests[index].length = localDescs[index].getLen(); + transferRequests[index].target_id = segmentId; + } + + int rc = 0; + if (hasNotif) + { + notify_msg_t notifyMsg; + notifyMsg.name = const_cast(mLocalAgentName.c_str()); + notifyMsg.msg = const_cast(syncMessage.c_str()); + rc = submitTransferWithNotify(mEngine, batchId, transferRequests.data(), requestCount, notifyMsg); + } + else + { + rc = submitTransfer(mEngine, batchId, transferRequests.data(), requestCount); + } + + TLLM_CHECK_WITH_INFO(rc == 0, "submitTransfer failed with status: %d", rc); + + return std::make_unique(mEngine, batchId, requestCount); +} + +void MooncakeTransferAgent::notifySyncMessage(std::string const& name, SyncMessage const& syncMessage) +{ + TLLM_LOG_DEBUG("MooncakeTransferAgent::notifySyncMessage"); + int segmentId; + { + std::lock_guard lock(mMutex); + auto it = mConnectedAgents.find(name); + + if (it == mConnectedAgents.end()) + { + TLLM_LOG_WARNING("Remote agent %s not found", name.c_str()); + return; + } + + auto const& agentInfo = it->second; + segmentId = agentInfo.segmentId; + } + + notify_msg_t notifyMsg; + notifyMsg.name = const_cast(mLocalAgentName.c_str()); + std::string encoded = MooncakeBase64Helper::encode(syncMessage); + notifyMsg.msg = const_cast(encoded.c_str()); + + TLLM_LOG_DEBUG("MooncakeTransferAgent::notifySyncMessage notifyMsg.name: %s, notifyMsg.msg: %s", notifyMsg.name, + notifyMsg.msg); + + int ret = genNotifyInEngine(mEngine, segmentId, notifyMsg); + + TLLM_CHECK_WITH_INFO(ret == 0, "genNotifyInEngine failed with status: %d", ret); +} + +[[nodiscard]] std::unordered_map> MooncakeTransferAgent::getNotifiedSyncMessages() +{ + std::unordered_map> notifs; + int size = 0; + + notify_msg_t* notifyMsgs = getNotifsFromEngine(mEngine, &size); + + TLLM_CHECK_WITH_INFO(size >= 0, "getNotifsFromEngine returned negative size: %d", size); + + for (int i = 0; i < size; i++) + { + if (notifyMsgs[i].msg == nullptr) + { + TLLM_LOG_WARNING("Message pointer is null for: %s", notifyMsgs[i].name); + continue; + } + + std::string decoded = MooncakeBase64Helper::decodeToString(notifyMsgs[i].msg); + notifs[notifyMsgs[i].name].emplace_back(std::move(decoded)); + + TLLM_LOG_DEBUG("MooncakeTransferAgent::getNotifiedSyncMessages getNotifsFromEngine: %s, %s", notifyMsgs[i].name, + notifyMsgs[i].msg); + } + + freeNotifsMsgBuf(notifyMsgs, size); + return notifs; +} + +bool MooncakeTransferAgent::checkRemoteDescs(std::string const& name, MemoryDescs const& memoryDescs) +{ + TLLM_LOG_DEBUG("MooncakeTransferAgent::checkRemoteDescs"); + return true; +} + +MooncakeTransferAgent::~MooncakeTransferAgent() +{ + destroyTransferEngine(mEngine); + TLLM_LOG_DEBUG("MooncakeTransferAgent::~MooncakeTransferAgent"); +} + +#if defined(__clang__) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wreturn-type-c-linkage" +#endif + +extern "C" +{ + std::unique_ptr createMooncakeTransferAgent(BaseAgentConfig const* config) + { + TLLM_CHECK(config); + return std::make_unique(*config); + } +} + +} // namespace tensorrt_llm::executor::kv_cache diff --git a/cpp/tensorrt_llm/executor/cache_transmission/mooncake_utils/transferAgent.h b/cpp/tensorrt_llm/executor/cache_transmission/mooncake_utils/transferAgent.h new file mode 100644 index 00000000000..0aeeedeae17 --- /dev/null +++ 
b/cpp/tensorrt_llm/executor/cache_transmission/mooncake_utils/transferAgent.h @@ -0,0 +1,165 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +#include "tensorrt_llm/executor/transferAgent.h" +#include "transfer_engine_c.h" + +namespace tensorrt_llm::executor::kv_cache +{ + +class MooncakeTransferStatus final : public TransferStatus +{ +public: + MooncakeTransferStatus(transfer_engine_t engine, uint64_t batchId, size_t requestCount); + + [[nodiscard]] bool isCompleted() const override; + + void wait() const override; + +private: + transfer_engine_t mEngine; + uint64_t mBatchId; + size_t mRequestCount; + mutable bool mBatchFreed = false; +}; + +class MooncakeMemoryDesc +{ +public: + MooncakeMemoryDesc(MemoryDesc desc) + : mDesc{std::move(desc)} + , mRefCnt{0} + { + } + + MooncakeMemoryDesc(MooncakeMemoryDesc const& other) + : mDesc{other.mDesc} + , mRefCnt{0} + { + } + + MooncakeMemoryDesc& operator=(MooncakeMemoryDesc const&) = delete; + + ~MooncakeMemoryDesc() = default; + + void addRef() noexcept + { + ++mRefCnt; + } + + int releaseRef() noexcept + { + return --mRefCnt; + } + + int getRefCount() const noexcept + { + return mRefCnt; + } + + MemoryDesc const& getDesc() const noexcept + { + return mDesc; + } + +private: + MemoryDesc mDesc; + int mRefCnt; +}; + +class MooncakeBase64Helper +{ +public: + static std::string encode(std::vector const& data); + static std::string encode(std::string const& data); + + static std::vector decode(std::string const& encoded); + static std::string decodeToString(std::string const& encoded); + +private: + static const std::string STANDARD_CHARS; + + static std::string encodeInternal(std::vector const& data, std::string const& chars); + static std::vector decodeInternal(std::string const& encoded, std::string const& chars); + + static inline bool isBase64(uint8_t c, std::string const& chars); + static inline bool isWhitespace(uint8_t c); +}; + +class MooncakeTransferAgent final : public BaseTransferAgent +{ +public: + MooncakeTransferAgent(BaseAgentConfig const& config); + ~MooncakeTransferAgent(); + + void registerMemory(RegisterDescs const& descs) override; + + void deregisterMemory(RegisterDescs const& descs) override; + + void loadRemoteAgent(std::string const& name, AgentDesc const& agentDesc) override; + + void loadRemoteAgent(std::string const& name, ConnectionInfoType const& connectionInfo) override; + + void invalidateRemoteAgent(std::string const& name) override; + + AgentDesc getLocalAgentDesc() override; + + ConnectionInfoType getLocalConnectionInfo() override; + + [[nodiscard]] std::unique_ptr submitTransferRequests(TransferRequest const& request) override; + + void notifySyncMessage(std::string const& name, SyncMessage const& syncMessage) override; + + [[nodiscard]] std::unordered_map> getNotifiedSyncMessages() override; + + 
bool checkRemoteDescs(std::string const& name, MemoryDescs const& memoryDescs) override; + +private: + struct AgentInfo + { + int segmentId; + }; + + mutable std::mutex mMutex; + transfer_engine_t mEngine; + std::unordered_map> mMemRegInfo; + std::unordered_map mConnectedAgents; + std::string mLocalAgentName; +}; + +#if defined(__clang__) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wreturn-type-c-linkage" +#endif + +extern "C" +{ + [[nodiscard]] std::unique_ptr createMooncakeTransferAgent(BaseAgentConfig const* config); +} + +#if defined(__clang__) +#pragma clang diagnostic pop +#endif + +} // namespace tensorrt_llm::executor::kv_cache diff --git a/cpp/tensorrt_llm/nanobind/executor/executorConfig.cpp b/cpp/tensorrt_llm/nanobind/executor/executorConfig.cpp index bed5db70f74..051586b7fe3 100644 --- a/cpp/tensorrt_llm/nanobind/executor/executorConfig.cpp +++ b/cpp/tensorrt_llm/nanobind/executor/executorConfig.cpp @@ -449,6 +449,7 @@ void initConfigBindings(nb::module_& m) .value("MPI", tle::CacheTransceiverConfig::BackendType::MPI) .value("UCX", tle::CacheTransceiverConfig::BackendType::UCX) .value("NIXL", tle::CacheTransceiverConfig::BackendType::NIXL) + .value("MOONCAKE", tle::CacheTransceiverConfig::BackendType::MOONCAKE) .def("from_string", [](std::string const& str) { @@ -460,6 +461,8 @@ void initConfigBindings(nb::module_& m) return tle::CacheTransceiverConfig::BackendType::UCX; if (str == "NIXL" || str == "nixl") return tle::CacheTransceiverConfig::BackendType::NIXL; + if (str == "MOONCAKE" || str == "mooncake") + return tle::CacheTransceiverConfig::BackendType::MOONCAKE; throw std::runtime_error("Invalid backend type: " + str); }); diff --git a/cpp/tensorrt_llm/pybind/executor/executorConfig.cpp b/cpp/tensorrt_llm/pybind/executor/executorConfig.cpp index 79194232560..4fe20a6c664 100644 --- a/cpp/tensorrt_llm/pybind/executor/executorConfig.cpp +++ b/cpp/tensorrt_llm/pybind/executor/executorConfig.cpp @@ -431,6 +431,7 @@ void initConfigBindings(pybind11::module_& m) .value("MPI", tle::CacheTransceiverConfig::BackendType::MPI) .value("UCX", tle::CacheTransceiverConfig::BackendType::UCX) .value("NIXL", tle::CacheTransceiverConfig::BackendType::NIXL) + .value("MOONCAKE", tle::CacheTransceiverConfig::BackendType::MOONCAKE) .def("from_string", [](std::string const& str) { @@ -442,6 +443,8 @@ void initConfigBindings(pybind11::module_& m) return tle::CacheTransceiverConfig::BackendType::UCX; if (str == "NIXL" || str == "nixl") return tle::CacheTransceiverConfig::BackendType::NIXL; + if (str == "MOONCAKE" || str == "mooncake") + return tle::CacheTransceiverConfig::BackendType::MOONCAKE; throw std::runtime_error("Invalid backend type: " + str); }); diff --git a/cpp/tests/unit_tests/executor/CMakeLists.txt b/cpp/tests/unit_tests/executor/CMakeLists.txt index de3a694d21d..069363c5edb 100644 --- a/cpp/tests/unit_tests/executor/CMakeLists.txt +++ b/cpp/tests/unit_tests/executor/CMakeLists.txt @@ -38,10 +38,31 @@ add_gtest(ucxCommTest ucxCommTest.cpp) target_link_libraries(ucxCommTest PRIVATE ${Python3_LIBRARIES}) target_link_libraries(serializeUtilsTest PRIVATE ${Python3_LIBRARIES}) -if(NIXL_ROOT) - add_gtest(transferAgentTest transferAgentTest.cpp) +# Skip MOONCAKE related tests on Rocky8 +set(IS_ROCKY8 FALSE) +if(EXISTS "/etc/redhat-release") + set(IS_ROCKY8 TRUE) +endif() + +if(NIXL_ROOT OR (MOONCAKE_ROOT AND NOT IS_ROCKY8)) add_gtest(agentCommTest agentCommTest.cpp) - target_link_libraries(transferAgentTest PRIVATE tensorrt_llm_nixl_wrapper) - 
target_link_libraries(agentCommTest PRIVATE tensorrt_llm_nixl_wrapper - ${Python3_LIBRARIES}) + add_gtest(transferAgentTest transferAgentTest.cpp) + + if(NIXL_ROOT) + target_link_libraries(transferAgentTest PRIVATE tensorrt_llm_nixl_wrapper) + target_link_libraries(agentCommTest PRIVATE tensorrt_llm_nixl_wrapper + ${Python3_LIBRARIES}) + target_compile_definitions(transferAgentTest PRIVATE TEST_NIXL_BACKEND=1) + target_compile_definitions(agentCommTest PRIVATE TEST_NIXL_BACKEND=1) + endif() + + if(MOONCAKE_ROOT) + target_link_libraries(transferAgentTest + PRIVATE tensorrt_llm_mooncake_wrapper) + target_link_libraries(agentCommTest PRIVATE tensorrt_llm_mooncake_wrapper + ${Python3_LIBRARIES}) + target_compile_definitions(transferAgentTest + PRIVATE TEST_MOONCAKE_BACKEND=1) + target_compile_definitions(agentCommTest PRIVATE TEST_MOONCAKE_BACKEND=1) + endif() endif() diff --git a/cpp/tests/unit_tests/executor/agentCommTest.cpp b/cpp/tests/unit_tests/executor/agentCommTest.cpp index ccd54ab926f..025a3a8bc6a 100644 --- a/cpp/tests/unit_tests/executor/agentCommTest.cpp +++ b/cpp/tests/unit_tests/executor/agentCommTest.cpp @@ -22,22 +22,54 @@ using namespace tensorrt_llm::batch_manager::kv_cache_manager; using namespace tensorrt_llm::runtime; using namespace tensorrt_llm::executor::kv_cache; -bool needSkipTest(std::string& skipReason) +std::vector getAvailableBackends() +{ + std::vector backends; + +#ifdef TEST_NIXL_BACKEND + backends.push_back("nixl"); +#endif + +#ifdef TEST_MOONCAKE_BACKEND + backends.push_back("mooncake"); +#endif + + return backends; +} + +bool needSkipTest(std::string const& backend, std::string& skipReason) { bool skip = false; try { auto& loader = tensorrt_llm::executor::kv_cache::DynLibLoader::getInstance(); - using CreateNixlFuncType = std::unique_ptr (*)( - tensorrt_llm::executor::kv_cache::BaseAgentConfig const*); - auto* func = loader.getFunctionPointer( - "libtensorrt_llm_nixl_wrapper.so", "createNixlTransferAgent"); + if (backend == "nixl") + { + using CreateNixlFuncType = std::unique_ptr (*)( + tensorrt_llm::executor::kv_cache::BaseAgentConfig const*); + auto* func = loader.getFunctionPointer( + "libtensorrt_llm_nixl_wrapper.so", "createNixlTransferAgent"); + } + else if (backend == "mooncake") + { + using CreateMooncakeFuncType = std::unique_ptr (*)( + tensorrt_llm::executor::kv_cache::BaseAgentConfig const*); + auto* func = loader.getFunctionPointer( + "libtensorrt_llm_mooncake_wrapper.so", "createMooncakeTransferAgent"); + } + else + { + skip = true; + skipReason = "Unknown backend: " + backend; + } } catch (std::exception const& e) { std::string error = e.what(); - if (error.find("libtensorrt_llm_nixl_wrapper.so") != std::string::npos) + std::string libName + = (backend == "nixl") ? 
"libtensorrt_llm_nixl_wrapper.so" : "libtensorrt_llm_mooncake_wrapper.so"; + if (error.find(libName) != std::string::npos) { skip = true; skipReason = error; @@ -46,17 +78,26 @@ bool needSkipTest(std::string& skipReason) return skip; } -class AgentCommTest : public ::testing::Test +class AgentCommTest : public ::testing::TestWithParam { protected: void SetUp() override { + backend = GetParam(); std::string skipReason; - if (needSkipTest(skipReason)) + if (needSkipTest(backend, skipReason)) { GTEST_SKIP() << skipReason; } - setenv("TRTLLM_USE_NIXL_KVCACHE", "1", 1); + + if (backend == "nixl") + { + setenv("TRTLLM_USE_NIXL_KVCACHE", "1", 1); + } + else if (backend == "mooncake") + { + setenv("TRTLLM_USE_MOONCAKE_KVCACHE", "1", 1); + } auto constexpr numLayers = 8; auto constexpr numHeads = 16; @@ -106,15 +147,16 @@ class AgentCommTest : public ::testing::Test mCacheState.reset(); } + std::string backend; std::unique_ptr mTransBufferManager; std::unique_ptr mCacheManager; std::unique_ptr mCacheState; }; -TEST_F(AgentCommTest, AgentConnectionManagerBasic) +TEST_P(AgentCommTest, AgentConnectionManagerBasic) { std::vector bufferManagers{mTransBufferManager.get()}; - auto connectionManager = std::make_unique(bufferManagers, *mCacheState); + auto connectionManager = std::make_unique(bufferManagers, *mCacheState, backend); ASSERT_TRUE(connectionManager != nullptr); ASSERT_EQ(connectionManager->getCacheTransBufferManagers().size(), bufferManagers.size()); ASSERT_TRUE(connectionManager->getCacheTransBufferManagers().front() != nullptr); @@ -126,11 +168,11 @@ TEST_F(AgentCommTest, AgentConnectionManagerBasic) ASSERT_EQ(commState.getAgentState().size(), 1); } -TEST_F(AgentCommTest, AgentConnectionManagerConnect) +TEST_P(AgentCommTest, AgentConnectionManagerConnect) { std::vector bufferManagers{mTransBufferManager.get()}; - auto connectionManager0 = std::make_unique(bufferManagers, *mCacheState); - auto connectionManager1 = std::make_unique(bufferManagers, *mCacheState); + auto connectionManager0 = std::make_unique(bufferManagers, *mCacheState, backend); + auto connectionManager1 = std::make_unique(bufferManagers, *mCacheState, backend); auto agentName0 = connectionManager0->getAgentName(); auto agentName1 = connectionManager1->getAgentName(); ASSERT_TRUE(!agentName0.empty()); @@ -189,3 +231,6 @@ TEST_F(AgentCommTest, AgentConnectionManagerConnect) } TLLM_LOG_INFO("after finish"); } + +INSTANTIATE_TEST_SUITE_P(AvailableBackends, AgentCommTest, ::testing::ValuesIn(getAvailableBackends()), + [](::testing::TestParamInfo const& info) { return info.param; }); diff --git a/cpp/tests/unit_tests/executor/transferAgentTest.cpp b/cpp/tests/unit_tests/executor/transferAgentTest.cpp index 0f21449f30a..7218611a0e4 100644 --- a/cpp/tests/unit_tests/executor/transferAgentTest.cpp +++ b/cpp/tests/unit_tests/executor/transferAgentTest.cpp @@ -22,11 +22,27 @@ #include #include +#include namespace fs = std::filesystem; using namespace tensorrt_llm::executor::kv_cache; +std::vector getAvailableBackends() +{ + std::vector backends; + +#ifdef TEST_NIXL_BACKEND + backends.push_back("nixl"); +#endif + +#ifdef TEST_MOONCAKE_BACKEND + backends.push_back("mooncake"); +#endif + + return backends; +} + class RegisteredHostMemory { public: @@ -54,100 +70,105 @@ class RegisteredHostMemory BaseTransferAgent* mAgentPtr{}; }; -class TransferAgentTest : public ::testing::Test // NOLINT(cppcoreguidelines-pro-type-member-init) +class TransferAgentTest : public ::testing::TestWithParam // NOLINT(cppcoreguidelines-pro-type-member-init) { 
public: - void SetUp() override {} + void SetUp() override + { + backend = GetParam(); + } void TearDown() override {} [[nodiscard]] std::unique_ptr makeTransferAgent(BaseAgentConfig const& config) { - return tensorrt_llm::executor::kv_cache::makeTransferAgent("nixl", &config); + return tensorrt_llm::executor::kv_cache::makeTransferAgent(backend, &config); } + + std::string backend; }; -TEST_F(TransferAgentTest, Basic) +TEST_P(TransferAgentTest, Basic) { std::string const agent0{"agent0"}, agent1{"agent1"}; BaseAgentConfig config0{agent0, true}, config1{agent1, true}; - auto nixlAgent0 = makeTransferAgent(config0); - auto nixlAgent1 = makeTransferAgent(config1); + auto xferAgent0 = makeTransferAgent(config0); + auto xferAgent1 = makeTransferAgent(config1); - TLLM_CHECK(nixlAgent0); - TLLM_CHECK(nixlAgent1); + TLLM_CHECK(xferAgent0); + TLLM_CHECK(xferAgent1); std::vector memory0(100, 10); std::vector memory1(100, 1); - RegisteredHostMemory regMem0(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory0}}}, nixlAgent0.get()); - RegisteredHostMemory regMem1(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory1}}}, nixlAgent1.get()); + RegisteredHostMemory regMem0(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory0}}}, xferAgent0.get()); + RegisteredHostMemory regMem1(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory1}}}, xferAgent1.get()); - // nixlAgent0->loadRemoteAgent(agent1); - auto connectionInfo = nixlAgent1->getLocalConnectionInfo(); - nixlAgent0->loadRemoteAgent(agent1, connectionInfo); + // xferAgent0->loadRemoteAgent(agent1); + auto connectionInfo = xferAgent1->getLocalConnectionInfo(); + xferAgent0->loadRemoteAgent(agent1, connectionInfo); bool checked = false; do { - checked = nixlAgent0->checkRemoteDescs(agent1, regMem1.getDescs()); - // wait for regMem is unpacked by nixlAgent0 + checked = xferAgent0->checkRemoteDescs(agent1, regMem1.getDescs()); + // wait for regMem is unpacked by xferAgent0 } while (!checked); TransferRequest writeReq{TransferOp::kWRITE, regMem0.getDescs(), regMem1.getDescs(), agent1}; - auto status = nixlAgent0->submitTransferRequests(writeReq); + auto status = xferAgent0->submitTransferRequests(writeReq); status->wait(); TLLM_CHECK(memory0 == memory1); - nixlAgent0->invalidateRemoteAgent(agent1); + xferAgent0->invalidateRemoteAgent(agent1); } -TEST_F(TransferAgentTest, Basic2) +TEST_P(TransferAgentTest, Basic2) { std::string const agent0{"agent0"}, agent1{"agent1"}; BaseAgentConfig config0{agent0, true}, config1{agent1, true}; - auto nixlAgent0 = makeTransferAgent(config0); - auto nixlAgent1 = makeTransferAgent(config1); + auto xferAgent0 = makeTransferAgent(config0); + auto xferAgent1 = makeTransferAgent(config1); - TLLM_CHECK(nixlAgent0); - TLLM_CHECK(nixlAgent1); + TLLM_CHECK(xferAgent0); + TLLM_CHECK(xferAgent1); std::vector memory0(100, 10); std::vector memory1(100, 1); - RegisteredHostMemory regMem0(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory0}}}, nixlAgent0.get()); - RegisteredHostMemory regMem1(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory1}}}, nixlAgent1.get()); + RegisteredHostMemory regMem0(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory0}}}, xferAgent0.get()); + RegisteredHostMemory regMem1(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory1}}}, xferAgent1.get()); - // nixlAgent0->loadRemoteAgent(agent1); - auto connectionInfo = nixlAgent1->getLocalConnectionInfo(); - nixlAgent0->loadRemoteAgent(agent1, connectionInfo); + // xferAgent0->loadRemoteAgent(agent1); + auto connectionInfo = xferAgent1->getLocalConnectionInfo(); + 
xferAgent0->loadRemoteAgent(agent1, connectionInfo); bool checked = false; do { - checked = nixlAgent0->checkRemoteDescs(agent1, regMem1.getDescs()); + checked = xferAgent0->checkRemoteDescs(agent1, regMem1.getDescs()); } while (!checked); TransferRequest readReq{TransferOp::kREAD, regMem0.getDescs(), regMem1.getDescs(), agent1}; - auto status = nixlAgent0->submitTransferRequests(readReq); + auto status = xferAgent0->submitTransferRequests(readReq); status->wait(); TLLM_CHECK(memory0 == memory1); - nixlAgent0->invalidateRemoteAgent(agent1); + xferAgent0->invalidateRemoteAgent(agent1); } -TEST_F(TransferAgentTest, DeviceMemory) +TEST_P(TransferAgentTest, DeviceMemory) { std::string const agent0{"agent0"}, agent1{"agent1"}; BaseAgentConfig config0{agent0, true}, config1{agent1, true}; - auto nixlAgent0 = makeTransferAgent(config0); - auto nixlAgent1 = makeTransferAgent(config1); + auto xferAgent0 = makeTransferAgent(config0); + auto xferAgent1 = makeTransferAgent(config1); - TLLM_CHECK(nixlAgent0); - TLLM_CHECK(nixlAgent1); + TLLM_CHECK(xferAgent0); + TLLM_CHECK(xferAgent1); char* dev_ptr0; char* dev_ptr1; size_t size = 100; @@ -159,20 +180,20 @@ TEST_F(TransferAgentTest, DeviceMemory) cudaMemcpy(dev_ptr0, memory0.data(), size, cudaMemcpyHostToDevice); cudaMemcpy(dev_ptr1, memory1.data(), size, cudaMemcpyHostToDevice); RegisteredHostMemory regMem0( - MemoryDescs{MemoryType::kVRAM, {MemoryDesc{dev_ptr0, size, deviceId}}}, nixlAgent0.get()); + MemoryDescs{MemoryType::kVRAM, {MemoryDesc{dev_ptr0, size, deviceId}}}, xferAgent0.get()); RegisteredHostMemory regMem1( - MemoryDescs{MemoryType::kVRAM, {MemoryDesc{dev_ptr1, size, deviceId}}}, nixlAgent1.get()); + MemoryDescs{MemoryType::kVRAM, {MemoryDesc{dev_ptr1, size, deviceId}}}, xferAgent1.get()); - // nixlAgent0->loadRemoteAgent(agent1); - auto connectionInfo = nixlAgent1->getLocalConnectionInfo(); - nixlAgent0->loadRemoteAgent(agent1, connectionInfo); + // xferAgent0->loadRemoteAgent(agent1); + auto connectionInfo = xferAgent1->getLocalConnectionInfo(); + xferAgent0->loadRemoteAgent(agent1, connectionInfo); bool checked = false; do { - checked = nixlAgent0->checkRemoteDescs(agent1, regMem1.getDescs()); + checked = xferAgent0->checkRemoteDescs(agent1, regMem1.getDescs()); } while (!checked); TransferRequest writeReq{TransferOp::kWRITE, regMem0.getDescs(), regMem1.getDescs(), agent1}; - auto status = nixlAgent0->submitTransferRequests(writeReq); + auto status = xferAgent0->submitTransferRequests(writeReq); status->wait(); cudaMemcpy(memory0.data(), dev_ptr0, size, cudaMemcpyDeviceToHost); @@ -181,98 +202,99 @@ TEST_F(TransferAgentTest, DeviceMemory) TLLM_CHECK(memory0 == memory1); TLLM_CUDA_CHECK(cudaFree(dev_ptr0)); TLLM_CUDA_CHECK(cudaFree(dev_ptr1)); - nixlAgent0->invalidateRemoteAgent(agent1); + xferAgent0->invalidateRemoteAgent(agent1); } -TEST_F(TransferAgentTest, Connect) +TEST_P(TransferAgentTest, Connect) { std::string const agent0{"agent0"}, agent1{"agent1"}, agent2{"agent2"}; BaseAgentConfig config0{agent0, true}, config1{agent1, true}, config2{agent2, true}; - auto nixlAgent0 = makeTransferAgent(config0); - auto nixlAgent1 = makeTransferAgent(config1); - auto nixlAgent2 = makeTransferAgent(config2); + auto xferAgent0 = makeTransferAgent(config0); + auto xferAgent1 = makeTransferAgent(config1); + auto xferAgent2 = makeTransferAgent(config2); - TLLM_CHECK(nixlAgent0); - TLLM_CHECK(nixlAgent1); + TLLM_CHECK(xferAgent0); + TLLM_CHECK(xferAgent1); std::vector memory0(100, 10); std::vector memory1(100, 1); MemoryDescs 
memDescs0{MemoryType::kDRAM, {MemoryDesc{memory0}}};
     MemoryDescs memDescs1{MemoryType::kDRAM, {MemoryDesc{memory1}}};
-    nixlAgent0->registerMemory(memDescs0);
-    nixlAgent1->registerMemory(memDescs1);
-    nixlAgent2->registerMemory(memDescs0);
+    xferAgent0->registerMemory(memDescs0);
+    xferAgent1->registerMemory(memDescs1);
+    xferAgent2->registerMemory(memDescs0);

-    // nixlAgent0->loadRemoteAgent(agent1);
-    auto connectionInfo = nixlAgent1->getLocalConnectionInfo();
-    nixlAgent0->loadRemoteAgent(agent1, connectionInfo);
+    // xferAgent0->loadRemoteAgent(agent1);
+    auto connectionInfo = xferAgent1->getLocalConnectionInfo();
+    xferAgent0->loadRemoteAgent(agent1, connectionInfo);

     bool checked = false;
     do
     {
-        checked = nixlAgent0->checkRemoteDescs(agent1, memDescs1);
+        checked = xferAgent0->checkRemoteDescs(agent1, memDescs1);
     } while (!checked);

     TransferRequest writeReq{TransferOp::kWRITE, memDescs0, memDescs1, agent1};
-    auto status = nixlAgent0->submitTransferRequests(writeReq);
+    auto status = xferAgent0->submitTransferRequests(writeReq);
     status->wait();
     TLLM_CHECK(memory0 == memory1);

-    nixlAgent2->loadRemoteAgent(agent1, connectionInfo);
+    xferAgent2->loadRemoteAgent(agent1, connectionInfo);
     checked = false;
     do
     {
-        checked = nixlAgent2->checkRemoteDescs(agent1, memDescs1);
+        checked = xferAgent2->checkRemoteDescs(agent1, memDescs1);
     } while (!checked);

     TransferRequest writeReq2{TransferOp::kWRITE, memDescs0, memDescs1, agent1};
-    auto status2 = nixlAgent2->submitTransferRequests(writeReq2);
+    auto status2 = xferAgent2->submitTransferRequests(writeReq2);
     status2->wait();
     TLLM_CHECK(memory0 == memory1);

-    nixlAgent0->invalidateRemoteAgent(agent1);
-    nixlAgent2->invalidateRemoteAgent(agent1);
-    nixlAgent0->deregisterMemory(memDescs0);
-    nixlAgent1->deregisterMemory(memDescs1);
-    nixlAgent2->deregisterMemory(memDescs0);
+    xferAgent0->invalidateRemoteAgent(agent1);
+    xferAgent2->invalidateRemoteAgent(agent1);
+    xferAgent0->deregisterMemory(memDescs0);
+    xferAgent1->deregisterMemory(memDescs1);
+    xferAgent2->deregisterMemory(memDescs0);
 }

-TEST_F(TransferAgentTest, SyncMessage)
+TEST_P(TransferAgentTest, SyncMessage)
 {
     constexpr std::size_t MAX_QUERY_TIMES = std::numeric_limits::max();
     std::string const agent0{"agent0"}, agent1{"agent1"};
     BaseAgentConfig config0{agent0, true}, config1{agent1, true};
-    auto nixlAgent0 = makeTransferAgent(config0);
-    auto nixlAgent1 = makeTransferAgent(config1);
+    auto xferAgent0 = makeTransferAgent(config0);
+    auto xferAgent1 = makeTransferAgent(config1);

-    TLLM_CHECK(nixlAgent0);
-    TLLM_CHECK(nixlAgent1);
+    TLLM_CHECK(xferAgent0);
+    TLLM_CHECK(xferAgent1);

     std::vector memory0(100, 10);
     std::vector memory1(100, 1);
-    RegisteredHostMemory regMem0(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory0}}}, nixlAgent0.get());
-    RegisteredHostMemory regMem1(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory1}}}, nixlAgent0.get());
+    RegisteredHostMemory regMem0(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory0}}}, xferAgent0.get());
+    RegisteredHostMemory regMem1(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory1}}}, xferAgent0.get());

-    RegisteredHostMemory regMem2(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory0}}}, nixlAgent1.get());
-    RegisteredHostMemory regMem3(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory1}}}, nixlAgent1.get());
+    RegisteredHostMemory regMem2(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory0}}}, xferAgent1.get());
+    RegisteredHostMemory regMem3(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory1}}}, xferAgent1.get());

-    // nixlAgent0->loadRemoteAgent(agent1);
-    auto connectionInfo = nixlAgent1->getLocalConnectionInfo();
-    nixlAgent0->loadRemoteAgent(agent1, connectionInfo);
+    // xferAgent0->loadRemoteAgent(agent1);
+    auto connectionInfo = xferAgent1->getLocalConnectionInfo();
+    xferAgent0->loadRemoteAgent(agent1, connectionInfo);

     bool checked = false;
     do
     {
-        checked = nixlAgent0->checkRemoteDescs(agent1, regMem3.getDescs());
+        checked = xferAgent0->checkRemoteDescs(agent1, regMem3.getDescs());
     } while (!checked);

     auto syncMessage = std::string("agent_sync_message");
     TransferRequest writeReq{TransferOp::kWRITE, regMem0.getDescs(), regMem3.getDescs(), agent1};
-    auto status = nixlAgent0->submitTransferRequests(writeReq);
-    nixlAgent0->notifySyncMessage(agent1, syncMessage);
+    auto status = xferAgent0->submitTransferRequests(writeReq);
+    xferAgent0->notifySyncMessage(agent1, syncMessage);

-    auto notif = nixlAgent1->getNotifiedSyncMessages();
+    auto notif = xferAgent1->getNotifiedSyncMessages();
     for (std::size_t i = 0; i < MAX_QUERY_TIMES && notif.size() == 0; i++)
     {
-        notif = nixlAgent1->getNotifiedSyncMessages();
+        notif = xferAgent1->getNotifiedSyncMessages();
     }
+    status->wait();
     TLLM_CHECK(status->isCompleted());
     TLLM_CHECK(notif.size() == 1);
     TLLM_CHECK(notif[agent0].size() == 1);
@@ -281,25 +303,25 @@ TEST_F(TransferAgentTest, SyncMessage)
     TLLM_CHECK(memory0 == memory1);

     std::string syncMessage2 = "two_agent_sync_message";
-    nixlAgent0->notifySyncMessage(agent1, syncMessage2);
-    auto notif2 = nixlAgent1->getNotifiedSyncMessages();
+    xferAgent0->notifySyncMessage(agent1, syncMessage2);
+    auto notif2 = xferAgent1->getNotifiedSyncMessages();
     for (std::size_t i = 0; i < MAX_QUERY_TIMES && notif2.size() == 0; i++)
     {
-        notif2 = nixlAgent1->getNotifiedSyncMessages();
+        notif2 = xferAgent1->getNotifiedSyncMessages();
     }
     TLLM_CHECK(notif2.size() == 1);
     TLLM_CHECK(notif2[agent0].size() == 1);
     TLLM_CHECK(notif2[agent0][0] == syncMessage2);

-    // nixlAgent1->loadRemoteAgent(agent0);
-    auto connectionInfo2 = nixlAgent0->getLocalConnectionInfo();
-    nixlAgent1->loadRemoteAgent(agent0, connectionInfo2);
+    // xferAgent1->loadRemoteAgent(agent0);
+    auto connectionInfo2 = xferAgent0->getLocalConnectionInfo();
+    xferAgent1->loadRemoteAgent(agent0, connectionInfo2);
     std::string syncMessage3 = "three_agent_sync_message";
-    nixlAgent1->notifySyncMessage(agent0, syncMessage3);
-    auto notif3 = nixlAgent0->getNotifiedSyncMessages();
+    xferAgent1->notifySyncMessage(agent0, syncMessage3);
+    auto notif3 = xferAgent0->getNotifiedSyncMessages();
     for (std::size_t i = 0; i < MAX_QUERY_TIMES && notif3.size() == 0; i++)
     {
-        notif3 = nixlAgent0->getNotifiedSyncMessages();
+        notif3 = xferAgent0->getNotifiedSyncMessages();
     }
     TLLM_CHECK(notif3.size() == 1);
     TLLM_CHECK(notif3[agent1].size() == 1);
@@ -308,19 +330,20 @@ TEST_F(TransferAgentTest, SyncMessage)
     bool checked2 = false;
     do
     {
-        checked2 = nixlAgent0->checkRemoteDescs(agent1, regMem1.getDescs());
+        checked2 = xferAgent0->checkRemoteDescs(agent1, regMem1.getDescs());
     } while (!checked2);

     std::string syncMessage4 = "four_agent_sync_message";
     TransferRequest writeReq1{TransferOp::kWRITE, regMem2.getDescs(), regMem1.getDescs(), agent0};
-    auto status1 = nixlAgent1->submitTransferRequests(writeReq1);
-    nixlAgent1->notifySyncMessage(agent0, syncMessage4);
+    auto status1 = xferAgent1->submitTransferRequests(writeReq1);
+    xferAgent1->notifySyncMessage(agent0, syncMessage4);

-    auto notif4 = nixlAgent0->getNotifiedSyncMessages();
+    auto notif4 = xferAgent0->getNotifiedSyncMessages();
     for (std::size_t i = 0; i < MAX_QUERY_TIMES && notif4.size() == 0; i++)
     {
-        notif4 = nixlAgent0->getNotifiedSyncMessages();
+        notif4 = xferAgent0->getNotifiedSyncMessages();
     }
+    status1->wait();
     TLLM_CHECK(status1->isCompleted());
     TLLM_CHECK(notif4.size() == 1);
     TLLM_CHECK(notif4[agent1].size() == 1);
@@ -335,11 +358,11 @@ TEST_F(TransferAgentTest, SyncMessage)
     std::stringstream ss;
     Serialization::serialize(state, ss);
     std::string serializedState = ss.str();
-    nixlAgent0->notifySyncMessage(agent1, serializedState);
-    auto notif5 = nixlAgent1->getNotifiedSyncMessages();
+    xferAgent0->notifySyncMessage(agent1, serializedState);
+    auto notif5 = xferAgent1->getNotifiedSyncMessages();
     for (size_t i = 0; i < MAX_QUERY_TIMES && notif5.size() == 0; i++)
     {
-        notif5 = nixlAgent1->getNotifiedSyncMessages();
+        notif5 = xferAgent1->getNotifiedSyncMessages();
     }
     TLLM_CHECK(notif5.size() == 1);
     TLLM_CHECK(notif5[agent0].size() == 1);
@@ -348,10 +371,16 @@ TEST_F(TransferAgentTest, SyncMessage)
     auto state2 = Serialization::deserializeCommState(ss2);
     TLLM_CHECK(state2 == state);

-    nixlAgent0->invalidateRemoteAgent(agent1);
-    nixlAgent1->invalidateRemoteAgent(agent0);
+    xferAgent0->invalidateRemoteAgent(agent1);
+    xferAgent1->invalidateRemoteAgent(agent0);
 }

+INSTANTIATE_TEST_SUITE_P(AvailableBackends, TransferAgentTest, ::testing::ValuesIn(getAvailableBackends()),
+    [](::testing::TestParamInfo const& info) { return info.param; });
+
+// Skip LoopbackAgentTest for mooncake backend for now
+#ifdef TEST_NIXL_BACKEND
+
 class LoopbackAgentTest : public ::testing::Test, public ::testing::WithParamInterface // NOLINT(cppcoreguidelines-pro-type-member-init)
 {
@@ -466,3 +495,5 @@ TEST_P(LoopbackAgentTest, GpuToFile)
 }

 INSTANTIATE_TEST_SUITE_P(, LoopbackAgentTest, ::testing::Values(true, false));
+
+#endif // TEST_NIXL_BACKEND
diff --git a/cpp/tests/unit_tests/multi_gpu/cacheTransceiverTest.cpp b/cpp/tests/unit_tests/multi_gpu/cacheTransceiverTest.cpp
index 17ca989eee5..41dd8e7a92c 100644
--- a/cpp/tests/unit_tests/multi_gpu/cacheTransceiverTest.cpp
+++ b/cpp/tests/unit_tests/multi_gpu/cacheTransceiverTest.cpp
@@ -46,6 +46,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -713,7 +714,7 @@ class AsymmetricalCacheTest : public ::testing::TestWithParam
             (bufferManagers, *mCacheState);
+                = std::make_unique(bufferManagers, *mCacheState, "nixl");
+        }
+        else if (isMooncake)
+        {
+            mConnectionManager = std::make_unique(
+                bufferManagers, *mCacheState, "mooncake");
         }
         else
         {
@@ -783,7 +797,7 @@ class AsymmetricalCacheTest : public ::testing::TestWithParam
         contextRankVec(mContextRankSize);
         std::iota(contextRankVec.begin(), contextRankVec.end(), 0);
-        if (isUcx || isNixl)
+        if (isUcx || isNixl || isMooncake)
         {
             auto commState = mConnectionManager->getCommState();
             namespace su = tensorrt_llm::executor::serialize_utils;
@@ -1286,9 +1300,9 @@ TEST_P(AsymmetricalCacheTest, TestCase)
     int indexerDimPerHead = std::get<17>(param);
     int indexerKCacheQuantBlockSize = std::get<18>(param);
-    if (genCp > 1 && tensorrt_llm::common::getEnvUseNixlKvCache())
+    if (genCp > 1 && (tensorrt_llm::common::getEnvUseNixlKvCache() || tensorrt_llm::common::getEnvUseMooncakeKvCache()))
     {
-        GTEST_SKIP() << "Temporarily skipping cache transceiver tests with NIXL backend for CP.";
+        GTEST_SKIP() << "Temporarily skipping cache transceiver tests with NIXL and MOONCAKE backend for CP.";
     }
     std::vector lenList = {30, 10, 60, 80};
     if (genCp > 1)
@@ -1410,9 +1424,9 @@ TEST_P(AsymmetricalCacheTestWithDP, TestCase)
     int indexerDimPerHead =
std::get<17>(param); int indexerKCacheQuantBlockSize = std::get<18>(param); - if (genCp > 1 && tensorrt_llm::common::getEnvUseNixlKvCache()) + if (genCp > 1 && (tensorrt_llm::common::getEnvUseNixlKvCache() || tensorrt_llm::common::getEnvUseMooncakeKvCache())) { - GTEST_SKIP() << "Temporarily skipping cache transceiver tests with NIXL backend for CP."; + GTEST_SKIP() << "Temporarily skipping cache transceiver tests with NIXL and MOONCAKE backend for CP."; } setUpCommunicator(contextTp, contextPp, contextCp, genTp, genPp, genCp, isMLA, contextDP, generationDP); diff --git a/docs/source/deployment-guide/config_table.rst b/docs/source/deployment-guide/config_table.rst index d28fed25a8e..c2e1e5b55df 100644 --- a/docs/source/deployment-guide/config_table.rst +++ b/docs/source/deployment-guide/config_table.rst @@ -167,162 +167,162 @@ - 4 - `1k1k_tp4_conc4.yaml `_ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc4.yaml`` - * - 8xB200_NVL - - Low Latency - - 1024 / 1024 - - 4 - - `1k1k_tp8_conc4.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc4.yaml`` * - 4xB200_NVL - Low Latency - 1024 / 1024 - 8 - `1k1k_tp4_conc8.yaml `_ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc8.yaml`` - * - 8xB200_NVL - - Low Latency - - 1024 / 1024 - - 8 - - `1k1k_tp8_conc8.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc8.yaml`` * - 4xB200_NVL - Low Latency - 1024 / 1024 - 16 - `1k1k_tp4_conc16.yaml `_ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc16.yaml`` - * - 8xB200_NVL - - Low Latency - - 1024 / 1024 - - 16 - - `1k1k_tp8_conc16.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc16.yaml`` * - 4xB200_NVL - - Low Latency + - Balanced - 1024 / 1024 - 32 - `1k1k_tp4_conc32.yaml `_ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc32.yaml`` - * - 8xB200_NVL - - High Throughput - - 1024 / 1024 - - 32 - - `1k1k_tp8_conc32.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc32.yaml`` * - 4xB200_NVL - High Throughput - 1024 / 1024 - 64 - `1k1k_tp4_conc64.yaml `_ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc64.yaml`` - * - 8xB200_NVL - - High Throughput - - 1024 / 1024 - - 64 - - `1k1k_tp8_conc64.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc64.yaml`` * - 4xB200_NVL - High Throughput - 1024 / 1024 - 128 - `1k1k_tp4_conc128.yaml `_ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options 
${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc128.yaml`` - * - 8xB200_NVL - - High Throughput - - 1024 / 1024 - - 128 - - `1k1k_tp8_conc128.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc128.yaml`` * - 4xB200_NVL - - High Throughput + - Max Throughput - 1024 / 1024 - 256 - `1k1k_tp4_conc256.yaml `_ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc256.yaml`` - * - 8xB200_NVL - - Max Throughput - - 1024 / 1024 - - 256 - - `1k1k_tp8_conc256.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc256.yaml`` * - 4xB200_NVL - Min Latency - 8192 / 1024 - 4 - `8k1k_tp4_conc4.yaml `_ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc4.yaml`` - * - 8xB200_NVL - - Low Latency - - 8192 / 1024 - - 4 - - `8k1k_tp8_conc4.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc4.yaml`` * - 4xB200_NVL - Low Latency - 8192 / 1024 - 8 - `8k1k_tp4_conc8.yaml `_ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc8.yaml`` - * - 8xB200_NVL - - Low Latency - - 8192 / 1024 - - 8 - - `8k1k_tp8_conc8.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc8.yaml`` * - 4xB200_NVL - Low Latency - 8192 / 1024 - 16 - `8k1k_tp4_conc16.yaml `_ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc16.yaml`` - * - 8xB200_NVL - - Low Latency - - 8192 / 1024 - - 16 - - `8k1k_tp8_conc16.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc16.yaml`` * - 4xB200_NVL - - Low Latency + - Balanced - 8192 / 1024 - 32 - `8k1k_tp4_conc32.yaml `_ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc32.yaml`` - * - 8xB200_NVL - - High Throughput - - 8192 / 1024 - - 32 - - `8k1k_tp8_conc32.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc32.yaml`` * - 4xB200_NVL - High Throughput - 8192 / 1024 - 64 - `8k1k_tp4_conc64.yaml `_ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc64.yaml`` - * - 8xB200_NVL - - High Throughput - - 8192 / 1024 - - 64 - - `8k1k_tp8_conc64.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc64.yaml`` * - 4xB200_NVL - High Throughput - 8192 / 1024 - 128 - `8k1k_tp4_conc128.yaml `_ - ``trtllm-serve 
nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc128.yaml`` + * - 4xB200_NVL + - Max Throughput + - 8192 / 1024 + - 256 + - `8k1k_tp4_conc256.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc256.yaml`` + * - 8xB200_NVL + - Min Latency + - 1024 / 1024 + - 4 + - `1k1k_tp8_conc4.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc4.yaml`` + * - 8xB200_NVL + - Low Latency + - 1024 / 1024 + - 8 + - `1k1k_tp8_conc8.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc8.yaml`` + * - 8xB200_NVL + - Low Latency + - 1024 / 1024 + - 16 + - `1k1k_tp8_conc16.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc16.yaml`` + * - 8xB200_NVL + - Balanced + - 1024 / 1024 + - 32 + - `1k1k_tp8_conc32.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc32.yaml`` + * - 8xB200_NVL + - High Throughput + - 1024 / 1024 + - 64 + - `1k1k_tp8_conc64.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc64.yaml`` + * - 8xB200_NVL + - High Throughput + - 1024 / 1024 + - 128 + - `1k1k_tp8_conc128.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc128.yaml`` + * - 8xB200_NVL + - Max Throughput + - 1024 / 1024 + - 256 + - `1k1k_tp8_conc256.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc256.yaml`` + * - 8xB200_NVL + - Min Latency + - 8192 / 1024 + - 4 + - `8k1k_tp8_conc4.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc4.yaml`` + * - 8xB200_NVL + - Low Latency + - 8192 / 1024 + - 8 + - `8k1k_tp8_conc8.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc8.yaml`` + * - 8xB200_NVL + - Low Latency + - 8192 / 1024 + - 16 + - `8k1k_tp8_conc16.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc16.yaml`` + * - 8xB200_NVL + - Balanced + - 8192 / 1024 + - 32 + - `8k1k_tp8_conc32.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc32.yaml`` + * - 8xB200_NVL + - High Throughput + - 8192 / 1024 + - 64 + - `8k1k_tp8_conc64.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc64.yaml`` * - 8xB200_NVL - High Throughput - 
8192 / 1024 - 128 - `8k1k_tp8_conc128.yaml `_ - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc128.yaml`` - * - 4xB200_NVL - - High Throughput - - 8192 / 1024 - - 256 - - `8k1k_tp4_conc256.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc256.yaml`` * - 8xB200_NVL - Max Throughput - 8192 / 1024 @@ -356,714 +356,714 @@ - 4 - `1k1k_tp1_conc4.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc4.yaml`` - * - 2xB200_NVL - - Low Latency - - 1024 / 1024 - - 4 - - `1k1k_tp2_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc4.yaml`` - * - 4xB200_NVL - - Low Latency - - 1024 / 1024 - - 4 - - `1k1k_tp4_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc4.yaml`` - * - 8xB200_NVL - - Low Latency - - 1024 / 1024 - - 4 - - `1k1k_tp8_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc4.yaml`` * - B200_NVL - Low Latency - 1024 / 1024 - 8 - `1k1k_tp1_conc8.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc8.yaml`` - * - 2xB200_NVL - - Low Latency - - 1024 / 1024 - - 8 - - `1k1k_tp2_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc8.yaml`` - * - 4xB200_NVL - - Low Latency - - 1024 / 1024 - - 8 - - `1k1k_tp4_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc8.yaml`` - * - 8xB200_NVL - - Low Latency - - 1024 / 1024 - - 8 - - `1k1k_tp8_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc8.yaml`` * - B200_NVL - - Low Latency + - Balanced - 1024 / 1024 - 16 - `1k1k_tp1_conc16.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc16.yaml`` - * - 2xB200_NVL - - Low Latency - - 1024 / 1024 - - 16 - - `1k1k_tp2_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc16.yaml`` - * - 4xB200_NVL - - High Throughput - - 1024 / 1024 - - 16 - - `1k1k_tp4_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc16.yaml`` - * - 8xB200_NVL - - High Throughput - - 1024 / 1024 - - 16 - - `1k1k_tp8_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc16.yaml`` * - B200_NVL - High Throughput - 1024 / 1024 - 32 - `1k1k_tp1_conc32.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc32.yaml`` - * - 2xB200_NVL - - High 
Throughput - - 1024 / 1024 - - 32 - - `1k1k_tp2_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc32.yaml`` - * - 4xB200_NVL - - High Throughput - - 1024 / 1024 - - 32 - - `1k1k_tp4_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc32.yaml`` - * - 8xB200_NVL - - High Throughput - - 1024 / 1024 - - 32 - - `1k1k_tp8_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc32.yaml`` * - B200_NVL - - High Throughput + - Max Throughput - 1024 / 1024 - 64 - `1k1k_tp1_conc64.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc64.yaml`` - * - 2xB200_NVL - - High Throughput - - 1024 / 1024 - - 64 - - `1k1k_tp2_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc64.yaml`` - * - 4xB200_NVL - - High Throughput - - 1024 / 1024 - - 64 - - `1k1k_tp4_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc64.yaml`` - * - 8xB200_NVL - - Max Throughput - - 1024 / 1024 - - 64 - - `1k1k_tp8_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc64.yaml`` * - B200_NVL - Min Latency - 1024 / 8192 - 4 - `1k8k_tp1_conc4.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc4.yaml`` - * - 2xB200_NVL - - Low Latency - - 1024 / 8192 - - 4 - - `1k8k_tp2_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc4.yaml`` - * - 4xB200_NVL - - Low Latency - - 1024 / 8192 - - 4 - - `1k8k_tp4_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc4.yaml`` - * - 8xB200_NVL - - Low Latency - - 1024 / 8192 - - 4 - - `1k8k_tp8_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc4.yaml`` * - B200_NVL - Low Latency - 1024 / 8192 - 8 - `1k8k_tp1_conc8.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc8.yaml`` - * - 2xB200_NVL - - Low Latency - - 1024 / 8192 - - 8 - - `1k8k_tp2_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc8.yaml`` - * - 4xB200_NVL - - Low Latency - - 1024 / 8192 - - 8 - - `1k8k_tp4_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc8.yaml`` - * - 8xB200_NVL - - Low Latency - - 1024 / 8192 - - 8 - - `1k8k_tp8_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc8.yaml`` * - B200_NVL - - Low Latency + - Balanced - 1024 / 8192 - 
16 - `1k8k_tp1_conc16.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc16.yaml`` - * - 2xB200_NVL - - Low Latency - - 1024 / 8192 - - 16 - - `1k8k_tp2_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc16.yaml`` - * - 4xB200_NVL - - High Throughput - - 1024 / 8192 - - 16 - - `1k8k_tp4_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc16.yaml`` - * - 8xB200_NVL - - High Throughput - - 1024 / 8192 - - 16 - - `1k8k_tp8_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc16.yaml`` * - B200_NVL - High Throughput - 1024 / 8192 - 32 - `1k8k_tp1_conc32.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc32.yaml`` - * - 2xB200_NVL - - High Throughput - - 1024 / 8192 - - 32 - - `1k8k_tp2_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc32.yaml`` - * - 4xB200_NVL - - High Throughput - - 1024 / 8192 - - 32 - - `1k8k_tp4_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc32.yaml`` - * - 8xB200_NVL - - High Throughput - - 1024 / 8192 - - 32 - - `1k8k_tp8_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc32.yaml`` * - B200_NVL - - High Throughput + - Max Throughput - 1024 / 8192 - 64 - `1k8k_tp1_conc64.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc64.yaml`` - * - 2xB200_NVL - - High Throughput - - 1024 / 8192 - - 64 - - `1k8k_tp2_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc64.yaml`` - * - 4xB200_NVL - - High Throughput - - 1024 / 8192 - - 64 - - `1k8k_tp4_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc64.yaml`` - * - 8xB200_NVL - - Max Throughput - - 1024 / 8192 - - 64 - - `1k8k_tp8_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc64.yaml`` * - B200_NVL - Min Latency - 8192 / 1024 - 4 - `8k1k_tp1_conc4.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc4.yaml`` - * - 2xB200_NVL - - Low Latency - - 8192 / 1024 - - 4 - - `8k1k_tp2_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc4.yaml`` - * - 4xB200_NVL - - Low Latency - - 8192 / 1024 - - 4 - - `8k1k_tp4_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc4.yaml`` - * - 8xB200_NVL - - Low Latency - - 8192 / 1024 - - 4 - - 
`8k1k_tp8_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc4.yaml`` * - B200_NVL - Low Latency - 8192 / 1024 - 8 - `8k1k_tp1_conc8.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc8.yaml`` - * - 2xB200_NVL - - Low Latency - - 8192 / 1024 - - 8 - - `8k1k_tp2_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc8.yaml`` - * - 4xB200_NVL - - Low Latency - - 8192 / 1024 - - 8 - - `8k1k_tp4_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc8.yaml`` - * - 8xB200_NVL - - Low Latency - - 8192 / 1024 - - 8 - - `8k1k_tp8_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc8.yaml`` * - B200_NVL - - Low Latency + - Balanced - 8192 / 1024 - 16 - `8k1k_tp1_conc16.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc16.yaml`` - * - 2xB200_NVL - - Low Latency - - 8192 / 1024 - - 16 - - `8k1k_tp2_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc16.yaml`` - * - 4xB200_NVL - - High Throughput - - 8192 / 1024 - - 16 - - `8k1k_tp4_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc16.yaml`` - * - 8xB200_NVL - - High Throughput - - 8192 / 1024 - - 16 - - `8k1k_tp8_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc16.yaml`` * - B200_NVL - High Throughput - 8192 / 1024 - 32 - `8k1k_tp1_conc32.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc32.yaml`` - * - 2xB200_NVL - - High Throughput - - 8192 / 1024 - - 32 - - `8k1k_tp2_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc32.yaml`` - * - 4xB200_NVL - - High Throughput - - 8192 / 1024 - - 32 - - `8k1k_tp4_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc32.yaml`` - * - 8xB200_NVL - - High Throughput - - 8192 / 1024 - - 32 - - `8k1k_tp8_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc32.yaml`` * - B200_NVL - - High Throughput + - Max Throughput - 8192 / 1024 - 64 - `8k1k_tp1_conc64.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc64.yaml`` * - 2xB200_NVL - - High Throughput - - 8192 / 1024 - - 64 - - `8k1k_tp2_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc64.yaml`` - * - 4xB200_NVL - - High Throughput - - 8192 / 1024 - - 64 - - 
`8k1k_tp4_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc64.yaml`` - * - 8xB200_NVL - - Max Throughput - - 8192 / 1024 - - 64 - - `8k1k_tp8_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc64.yaml`` - * - H200_SXM - Min Latency - 1024 / 1024 - 4 - - `1k1k_tp1_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc4.yaml`` - * - 2xH200_SXM + - `1k1k_tp2_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc4.yaml`` + * - 2xB200_NVL - Low Latency - 1024 / 1024 - - 4 - - `1k1k_tp2_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc4.yaml`` - * - 4xH200_SXM - - Low Latency + - 8 + - `1k1k_tp2_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc8.yaml`` + * - 2xB200_NVL + - Balanced - 1024 / 1024 - - 4 - - `1k1k_tp4_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc4.yaml`` - * - 8xH200_SXM - - Low Latency + - 16 + - `1k1k_tp2_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc16.yaml`` + * - 2xB200_NVL + - High Throughput - 1024 / 1024 - - 4 - - `1k1k_tp8_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc4.yaml`` - * - H200_SXM - - Low Latency + - 32 + - `1k1k_tp2_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc32.yaml`` + * - 2xB200_NVL + - Max Throughput - 1024 / 1024 - - 8 - - `1k1k_tp1_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc8.yaml`` - * - 2xH200_SXM + - 64 + - `1k1k_tp2_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc64.yaml`` + * - 2xB200_NVL + - Min Latency + - 1024 / 8192 + - 4 + - `1k8k_tp2_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc4.yaml`` + * - 2xB200_NVL - Low Latency - - 1024 / 1024 + - 1024 / 8192 - 8 - - `1k1k_tp2_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc8.yaml`` - * - 4xH200_SXM - - Low Latency + - `1k8k_tp2_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc8.yaml`` + * - 2xB200_NVL + - Balanced + - 1024 / 8192 + - 16 + - `1k8k_tp2_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc16.yaml`` + * - 
2xB200_NVL + - High Throughput + - 1024 / 8192 + - 32 + - `1k8k_tp2_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc32.yaml`` + * - 2xB200_NVL + - Max Throughput + - 1024 / 8192 + - 64 + - `1k8k_tp2_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc64.yaml`` + * - 2xB200_NVL + - Min Latency + - 8192 / 1024 + - 4 + - `8k1k_tp2_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc4.yaml`` + * - 2xB200_NVL + - Low Latency + - 8192 / 1024 + - 8 + - `8k1k_tp2_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc8.yaml`` + * - 2xB200_NVL + - Balanced + - 8192 / 1024 + - 16 + - `8k1k_tp2_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc16.yaml`` + * - 2xB200_NVL + - High Throughput + - 8192 / 1024 + - 32 + - `8k1k_tp2_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc32.yaml`` + * - 2xB200_NVL + - Max Throughput + - 8192 / 1024 + - 64 + - `8k1k_tp2_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc64.yaml`` + * - 4xB200_NVL + - Min Latency + - 1024 / 1024 + - 4 + - `1k1k_tp4_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc4.yaml`` + * - 4xB200_NVL + - Low Latency + - 1024 / 1024 + - 8 + - `1k1k_tp4_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc8.yaml`` + * - 4xB200_NVL + - Balanced + - 1024 / 1024 + - 16 + - `1k1k_tp4_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc16.yaml`` + * - 4xB200_NVL + - High Throughput + - 1024 / 1024 + - 32 + - `1k1k_tp4_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc32.yaml`` + * - 4xB200_NVL + - Max Throughput + - 1024 / 1024 + - 64 + - `1k1k_tp4_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc64.yaml`` + * - 4xB200_NVL + - Min Latency + - 1024 / 8192 + - 4 + - `1k8k_tp4_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc4.yaml`` + * - 4xB200_NVL + - Low Latency + - 1024 / 8192 + - 8 + - `1k8k_tp4_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc8.yaml`` + * - 4xB200_NVL + - Balanced + - 1024 / 8192 + - 16 + - `1k8k_tp4_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc16.yaml`` + * - 4xB200_NVL + - High 
Throughput + - 1024 / 8192 + - 32 + - `1k8k_tp4_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc32.yaml`` + * - 4xB200_NVL + - Max Throughput + - 1024 / 8192 + - 64 + - `1k8k_tp4_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc64.yaml`` + * - 4xB200_NVL + - Min Latency + - 8192 / 1024 + - 4 + - `8k1k_tp4_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc4.yaml`` + * - 4xB200_NVL + - Low Latency + - 8192 / 1024 + - 8 + - `8k1k_tp4_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc8.yaml`` + * - 4xB200_NVL + - Balanced + - 8192 / 1024 + - 16 + - `8k1k_tp4_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc16.yaml`` + * - 4xB200_NVL + - High Throughput + - 8192 / 1024 + - 32 + - `8k1k_tp4_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc32.yaml`` + * - 4xB200_NVL + - Max Throughput + - 8192 / 1024 + - 64 + - `8k1k_tp4_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc64.yaml`` + * - 8xB200_NVL + - Min Latency + - 1024 / 1024 + - 4 + - `1k1k_tp8_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc4.yaml`` + * - 8xB200_NVL + - Low Latency + - 1024 / 1024 + - 8 + - `1k1k_tp8_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc8.yaml`` + * - 8xB200_NVL + - Balanced + - 1024 / 1024 + - 16 + - `1k1k_tp8_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc16.yaml`` + * - 8xB200_NVL + - High Throughput + - 1024 / 1024 + - 32 + - `1k1k_tp8_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc32.yaml`` + * - 8xB200_NVL + - Max Throughput + - 1024 / 1024 + - 64 + - `1k1k_tp8_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc64.yaml`` + * - 8xB200_NVL + - Min Latency + - 1024 / 8192 + - 4 + - `1k8k_tp8_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc4.yaml`` + * - 8xB200_NVL + - Low Latency + - 1024 / 8192 + - 8 + - `1k8k_tp8_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc8.yaml`` + * - 8xB200_NVL + - Balanced + - 1024 / 8192 + - 16 + - `1k8k_tp8_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc16.yaml`` + * - 8xB200_NVL + - High Throughput + - 1024 
/ 8192 + - 32 + - `1k8k_tp8_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc32.yaml`` + * - 8xB200_NVL + - Max Throughput + - 1024 / 8192 + - 64 + - `1k8k_tp8_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc64.yaml`` + * - 8xB200_NVL + - Min Latency + - 8192 / 1024 + - 4 + - `8k1k_tp8_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc4.yaml`` + * - 8xB200_NVL + - Low Latency + - 8192 / 1024 + - 8 + - `8k1k_tp8_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc8.yaml`` + * - 8xB200_NVL + - Balanced + - 8192 / 1024 + - 16 + - `8k1k_tp8_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc16.yaml`` + * - 8xB200_NVL + - High Throughput + - 8192 / 1024 + - 32 + - `8k1k_tp8_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc32.yaml`` + * - 8xB200_NVL + - Max Throughput + - 8192 / 1024 + - 64 + - `8k1k_tp8_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc64.yaml`` + * - H200_SXM + - Min Latency + - 1024 / 1024 + - 4 + - `1k1k_tp1_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc4.yaml`` + * - H200_SXM + - Low Latency + - 1024 / 1024 + - 8 + - `1k1k_tp1_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc8.yaml`` + * - H200_SXM + - Balanced + - 1024 / 1024 + - 16 + - `1k1k_tp1_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc16.yaml`` + * - H200_SXM + - High Throughput + - 1024 / 1024 + - 32 + - `1k1k_tp1_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc32.yaml`` + * - H200_SXM + - Max Throughput + - 1024 / 1024 + - 64 + - `1k1k_tp1_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc64.yaml`` + * - H200_SXM + - Min Latency + - 1024 / 8192 + - 4 + - `1k8k_tp1_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc4.yaml`` + * - H200_SXM + - Low Latency + - 1024 / 8192 + - 8 + - `1k8k_tp1_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc8.yaml`` + * - H200_SXM + - Balanced + - 1024 / 8192 + - 16 + - `1k8k_tp1_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc16.yaml`` + * - H200_SXM + - High Throughput + - 1024 / 8192 + - 32 + - 
`1k8k_tp1_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc32.yaml`` + * - H200_SXM + - Max Throughput + - 1024 / 8192 + - 64 + - `1k8k_tp1_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc64.yaml`` + * - H200_SXM + - Min Latency + - 8192 / 1024 + - 4 + - `8k1k_tp1_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc4.yaml`` + * - H200_SXM + - Low Latency + - 8192 / 1024 + - 8 + - `8k1k_tp1_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc8.yaml`` + * - H200_SXM + - Balanced + - 8192 / 1024 + - 16 + - `8k1k_tp1_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc16.yaml`` + * - H200_SXM + - High Throughput + - 8192 / 1024 + - 32 + - `8k1k_tp1_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc32.yaml`` + * - H200_SXM + - Max Throughput + - 8192 / 1024 + - 64 + - `8k1k_tp1_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc64.yaml`` + * - 2xH200_SXM + - Min Latency + - 1024 / 1024 + - 4 + - `1k1k_tp2_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc4.yaml`` + * - 2xH200_SXM + - Low Latency + - 1024 / 1024 + - 8 + - `1k1k_tp2_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc8.yaml`` + * - 2xH200_SXM + - Balanced + - 1024 / 1024 + - 16 + - `1k1k_tp2_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc16.yaml`` + * - 2xH200_SXM + - High Throughput + - 1024 / 1024 + - 32 + - `1k1k_tp2_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc32.yaml`` + * - 2xH200_SXM + - Max Throughput + - 1024 / 1024 + - 64 + - `1k1k_tp2_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc64.yaml`` + * - 2xH200_SXM + - Min Latency + - 1024 / 8192 + - 4 + - `1k8k_tp2_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc4.yaml`` + * - 2xH200_SXM + - Low Latency + - 1024 / 8192 + - 8 + - `1k8k_tp2_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc8.yaml`` + * - 2xH200_SXM + - Balanced + - 1024 / 8192 + - 16 + - `1k8k_tp2_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc16.yaml`` + * - 2xH200_SXM + - High Throughput + - 1024 / 8192 + - 32 + - `1k8k_tp2_conc32.yaml `_ + - 
``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc32.yaml`` + * - 2xH200_SXM + - Max Throughput + - 1024 / 8192 + - 64 + - `1k8k_tp2_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc64.yaml`` + * - 2xH200_SXM + - Min Latency + - 8192 / 1024 + - 4 + - `8k1k_tp2_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc4.yaml`` + * - 2xH200_SXM + - Low Latency + - 8192 / 1024 + - 8 + - `8k1k_tp2_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc8.yaml`` + * - 2xH200_SXM + - Balanced + - 8192 / 1024 + - 16 + - `8k1k_tp2_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc16.yaml`` + * - 2xH200_SXM + - High Throughput + - 8192 / 1024 + - 32 + - `8k1k_tp2_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc32.yaml`` + * - 2xH200_SXM + - Max Throughput + - 8192 / 1024 + - 64 + - `8k1k_tp2_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc64.yaml`` + * - 4xH200_SXM + - Min Latency + - 1024 / 1024 + - 4 + - `1k1k_tp4_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc4.yaml`` + * - 4xH200_SXM + - Low Latency + - 1024 / 1024 + - 8 + - `1k1k_tp4_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc8.yaml`` + * - 4xH200_SXM + - Balanced + - 1024 / 1024 + - 16 + - `1k1k_tp4_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc16.yaml`` + * - 4xH200_SXM + - High Throughput + - 1024 / 1024 + - 32 + - `1k1k_tp4_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc32.yaml`` + * - 4xH200_SXM + - Max Throughput + - 1024 / 1024 + - 64 + - `1k1k_tp4_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc64.yaml`` + * - 4xH200_SXM + - Min Latency + - 1024 / 8192 + - 4 + - `1k8k_tp4_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc4.yaml`` + * - 4xH200_SXM + - Low Latency + - 1024 / 8192 + - 8 + - `1k8k_tp4_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc8.yaml`` + * - 4xH200_SXM + - Balanced + - 1024 / 8192 + - 16 + - `1k8k_tp4_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc16.yaml`` + * - 4xH200_SXM + - High Throughput + - 1024 / 8192 + - 32 + - `1k8k_tp4_conc32.yaml `_ + - ``trtllm-serve 
openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc32.yaml`` + * - 4xH200_SXM + - Max Throughput + - 1024 / 8192 + - 64 + - `1k8k_tp4_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc64.yaml`` + * - 4xH200_SXM + - Min Latency + - 8192 / 1024 + - 4 + - `8k1k_tp4_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc4.yaml`` + * - 4xH200_SXM + - Low Latency + - 8192 / 1024 + - 8 + - `8k1k_tp4_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc8.yaml`` + * - 4xH200_SXM + - Balanced + - 8192 / 1024 + - 16 + - `8k1k_tp4_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc16.yaml`` + * - 4xH200_SXM + - High Throughput + - 8192 / 1024 + - 32 + - `8k1k_tp4_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc32.yaml`` + * - 4xH200_SXM + - Max Throughput + - 8192 / 1024 + - 64 + - `8k1k_tp4_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc64.yaml`` + * - 8xH200_SXM + - Min Latency - 1024 / 1024 - - 8 - - `1k1k_tp4_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc8.yaml`` + - 4 + - `1k1k_tp8_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc4.yaml`` * - 8xH200_SXM - Low Latency - 1024 / 1024 - 8 - `1k1k_tp8_conc8.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc8.yaml`` - * - H200_SXM - - Low Latency - - 1024 / 1024 - - 16 - - `1k1k_tp1_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc16.yaml`` - * - 2xH200_SXM - - Low Latency - - 1024 / 1024 - - 16 - - `1k1k_tp2_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc16.yaml`` - * - 4xH200_SXM - - High Throughput - - 1024 / 1024 - - 16 - - `1k1k_tp4_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc16.yaml`` * - 8xH200_SXM - - High Throughput + - Balanced - 1024 / 1024 - 16 - `1k1k_tp8_conc16.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc16.yaml`` - * - H200_SXM - - High Throughput - - 1024 / 1024 - - 32 - - `1k1k_tp1_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc32.yaml`` - * - 2xH200_SXM - - High Throughput - - 1024 / 1024 - - 32 - - `1k1k_tp2_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options 
${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc32.yaml`` - * - 4xH200_SXM - - High Throughput - - 1024 / 1024 - - 32 - - `1k1k_tp4_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc32.yaml`` * - 8xH200_SXM - High Throughput - 1024 / 1024 - 32 - `1k1k_tp8_conc32.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc32.yaml`` - * - H200_SXM - - High Throughput - - 1024 / 1024 - - 64 - - `1k1k_tp1_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc64.yaml`` - * - 2xH200_SXM - - High Throughput - - 1024 / 1024 - - 64 - - `1k1k_tp2_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc64.yaml`` - * - 4xH200_SXM - - High Throughput - - 1024 / 1024 - - 64 - - `1k1k_tp4_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc64.yaml`` * - 8xH200_SXM - Max Throughput - 1024 / 1024 - 64 - `1k1k_tp8_conc64.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc64.yaml`` - * - H200_SXM - - Min Latency - - 1024 / 8192 - - 4 - - `1k8k_tp1_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc4.yaml`` - * - 2xH200_SXM - - Low Latency - - 1024 / 8192 - - 4 - - `1k8k_tp2_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc4.yaml`` - * - 4xH200_SXM - - Low Latency - - 1024 / 8192 - - 4 - - `1k8k_tp4_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc4.yaml`` * - 8xH200_SXM - - Low Latency + - Min Latency - 1024 / 8192 - 4 - `1k8k_tp8_conc4.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc4.yaml`` - * - H200_SXM - - Low Latency - - 1024 / 8192 - - 8 - - `1k8k_tp1_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc8.yaml`` - * - 2xH200_SXM - - Low Latency - - 1024 / 8192 - - 8 - - `1k8k_tp2_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc8.yaml`` - * - 4xH200_SXM - - Low Latency - - 1024 / 8192 - - 8 - - `1k8k_tp4_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc8.yaml`` * - 8xH200_SXM - Low Latency - 1024 / 8192 - 8 - `1k8k_tp8_conc8.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc8.yaml`` - * - H200_SXM - - Low Latency - - 1024 / 8192 - - 16 - - `1k8k_tp1_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options 
${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc16.yaml`` - * - 2xH200_SXM - - Low Latency - - 1024 / 8192 - - 16 - - `1k8k_tp2_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc16.yaml`` - * - 4xH200_SXM - - High Throughput - - 1024 / 8192 - - 16 - - `1k8k_tp4_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc16.yaml`` * - 8xH200_SXM - - High Throughput + - Balanced - 1024 / 8192 - 16 - `1k8k_tp8_conc16.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc16.yaml`` - * - H200_SXM - - High Throughput - - 1024 / 8192 - - 32 - - `1k8k_tp1_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc32.yaml`` - * - 2xH200_SXM - - High Throughput - - 1024 / 8192 - - 32 - - `1k8k_tp2_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc32.yaml`` - * - 4xH200_SXM - - High Throughput - - 1024 / 8192 - - 32 - - `1k8k_tp4_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc32.yaml`` * - 8xH200_SXM - High Throughput - 1024 / 8192 - 32 - `1k8k_tp8_conc32.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc32.yaml`` - * - H200_SXM - - High Throughput - - 1024 / 8192 - - 64 - - `1k8k_tp1_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc64.yaml`` - * - 2xH200_SXM - - High Throughput - - 1024 / 8192 - - 64 - - `1k8k_tp2_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc64.yaml`` - * - 4xH200_SXM - - High Throughput - - 1024 / 8192 - - 64 - - `1k8k_tp4_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc64.yaml`` * - 8xH200_SXM - Max Throughput - 1024 / 8192 - 64 - `1k8k_tp8_conc64.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc64.yaml`` - * - H200_SXM - - Min Latency - - 8192 / 1024 - - 4 - - `8k1k_tp1_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc4.yaml`` - * - 2xH200_SXM - - Low Latency - - 8192 / 1024 - - 4 - - `8k1k_tp2_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc4.yaml`` - * - 4xH200_SXM - - Low Latency - - 8192 / 1024 - - 4 - - `8k1k_tp4_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc4.yaml`` * - 8xH200_SXM - - Low Latency + - Min Latency - 8192 / 1024 - 4 - `8k1k_tp8_conc4.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options 
${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc4.yaml`` - * - H200_SXM - - Low Latency - - 8192 / 1024 - - 8 - - `8k1k_tp1_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc8.yaml`` - * - 2xH200_SXM - - Low Latency - - 8192 / 1024 - - 8 - - `8k1k_tp2_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc8.yaml`` - * - 4xH200_SXM - - Low Latency - - 8192 / 1024 - - 8 - - `8k1k_tp4_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc8.yaml`` * - 8xH200_SXM - Low Latency - 8192 / 1024 - 8 - `8k1k_tp8_conc8.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc8.yaml`` - * - H200_SXM - - Low Latency - - 8192 / 1024 - - 16 - - `8k1k_tp1_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc16.yaml`` - * - 2xH200_SXM - - Low Latency - - 8192 / 1024 - - 16 - - `8k1k_tp2_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc16.yaml`` - * - 4xH200_SXM - - High Throughput - - 8192 / 1024 - - 16 - - `8k1k_tp4_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc16.yaml`` * - 8xH200_SXM - - High Throughput + - Balanced - 8192 / 1024 - 16 - `8k1k_tp8_conc16.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc16.yaml`` - * - H200_SXM - - High Throughput - - 8192 / 1024 - - 32 - - `8k1k_tp1_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc32.yaml`` - * - 2xH200_SXM - - High Throughput - - 8192 / 1024 - - 32 - - `8k1k_tp2_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc32.yaml`` - * - 4xH200_SXM - - High Throughput - - 8192 / 1024 - - 32 - - `8k1k_tp4_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc32.yaml`` * - 8xH200_SXM - High Throughput - 8192 / 1024 - 32 - `8k1k_tp8_conc32.yaml `_ - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc32.yaml`` - * - H200_SXM - - High Throughput - - 8192 / 1024 - - 64 - - `8k1k_tp1_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc64.yaml`` - * - 2xH200_SXM - - High Throughput - - 8192 / 1024 - - 64 - - `8k1k_tp2_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc64.yaml`` - * - 4xH200_SXM - - High Throughput - - 8192 / 1024 - - 64 - - `8k1k_tp4_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options 
${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc64.yaml`` * - 8xH200_SXM - Max Throughput - 8192 / 1024 diff --git a/examples/configs/database/database.py b/examples/configs/database/database.py index e0c73a8ef1c..f2c6d45b0b2 100644 --- a/examples/configs/database/database.py +++ b/examples/configs/database/database.py @@ -15,15 +15,20 @@ from pathlib import Path -from typing import Any, Dict, Iterator, List +from typing import Any, Dict, Iterator, List, Tuple import yaml from pydantic import BaseModel, Field, RootModel +REPO_ROOT = Path(__file__).parent.parent.parent.parent DATABASE_LIST_PATH = Path(__file__).parent / "lookup.yaml" +LOW_LATENCY_CONCURRENCY_THRESHOLD = 8 +HIGH_THROUGHPUT_CONCURRENCY_THRESHOLD = 32 +KEY_PROFILES = {"Min Latency", "Balanced", "Max Throughput"} -class RecipeConstraints(BaseModel): + +class Recipe(BaseModel): """Recipe record for scenario list.""" model: str = Field(description="Model name") @@ -36,29 +41,68 @@ class RecipeConstraints(BaseModel): def load_config(self) -> Dict[str, Any]: """Load and return the YAML config at config_path.""" - with open(self.config_path) as f: - data = yaml.safe_load(f) - return data if data is not None else {} - - -class Recipe(BaseModel): - """Recipe that describes a single scenario.""" - - constraints: RecipeConstraints = Field(description="Recipe constraints") - env_overrides: Dict[str, Any] = Field(description="Environment overrides", default_factory=dict) - config: Dict[str, Any] = Field(description="Configuration overrides", default_factory=dict) + config_relative_path = Path(self.config_path) + # Ensure config path is within the repo root + if config_relative_path.is_absolute() or ".." in config_relative_path.parts: + raise ValueError(f"Invalid config path: {self.config_path}") + full_path = REPO_ROOT / self.config_path + if not full_path.exists(): + raise FileNotFoundError(f"Config not found: {full_path}") + with open(full_path, encoding="utf-8") as f: + return yaml.safe_load(f) -class RecipeList(RootModel[List[RecipeConstraints]]): +class RecipeList(RootModel[List[Recipe]]): @classmethod def from_yaml(cls, yaml_path: Path) -> "RecipeList": """Load and validate recipe list from YAML file.""" - with open(yaml_path) as f: + with open(yaml_path, encoding="utf-8") as f: data = yaml.safe_load(f) return cls(data) - def __iter__(self) -> Iterator[RecipeConstraints]: + def __iter__(self) -> Iterator[Recipe]: return iter(self.root) def __len__(self) -> int: return len(self.root) + + +def assign_profile(num_recipes: int, idx: int, concurrency: int) -> str: + """Assign performance profile to a recipe based on its position in a concurrency-sorted list.""" + if num_recipes == 1: + if concurrency <= LOW_LATENCY_CONCURRENCY_THRESHOLD: + return "Low Latency" + elif concurrency >= HIGH_THROUGHPUT_CONCURRENCY_THRESHOLD: + return "High Throughput" + else: + return "Balanced" + elif idx == 0: + return "Min Latency" + elif idx == num_recipes - 1: + return "Max Throughput" + elif idx in ((num_recipes - 1) // 2, num_recipes // 2): + return "Balanced" + elif idx < num_recipes // 2: + return "Low Latency" + else: + return "High Throughput" + + +def select_key_recipes(recipes: List[Recipe]) -> List[Tuple[Recipe, str]]: + """Select key recipes (min latency, balanced, max throughput) from a list of recipes.""" + if not recipes: + return [] + + sorted_recipes = sorted(recipes, key=lambda r: r.concurrency) + n = len(sorted_recipes) + + result = [] + seen_profiles = set() + for idx, recipe in enumerate(sorted_recipes): + 
profile = assign_profile(n, idx, recipe.concurrency) + # For n==1, keep whatever profile is assigned + # For n>=2, only keep key profiles and dedupe (for even n, two indices get "Balanced") + if n == 1 or (profile in KEY_PROFILES and profile not in seen_profiles): + result.append((recipe, profile)) + seen_profiles.add(profile) + return result diff --git a/jenkins/Build.groovy b/jenkins/Build.groovy index bfa3af44129..261c0a6d3a0 100644 --- a/jenkins/Build.groovy +++ b/jenkins/Build.groovy @@ -60,12 +60,12 @@ def BUILD_CONFIGS = [ // Vanilla TARNAME is used for packaging in runLLMPackage // cmake-vars cannot be empty, so passing (default) multi-device configuration. (CONFIG_LINUX_X86_64_VANILLA) : [ - (WHEEL_EXTRA_ARGS) : "--extra-cmake-vars ENABLE_MULTI_DEVICE=1 --extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl --micro_benchmarks", + (WHEEL_EXTRA_ARGS) : "--extra-cmake-vars ENABLE_MULTI_DEVICE=1 --extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl --extra-cmake-vars MOONCAKE_ROOT=/usr/local/Mooncake --micro_benchmarks", (TARNAME) : "TensorRT-LLM.tar.gz", (WHEEL_ARCHS): "80-real;86-real;89-real;90-real;100-real;103-real;120-real", ], (CONFIG_LINUX_X86_64_PYBIND) : [ - (WHEEL_EXTRA_ARGS) : "--binding_type pybind --extra-cmake-vars ENABLE_MULTI_DEVICE=1 --extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl --micro_benchmarks", + (WHEEL_EXTRA_ARGS) : "--binding_type pybind --extra-cmake-vars ENABLE_MULTI_DEVICE=1 --extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl --extra-cmake-vars MOONCAKE_ROOT=/usr/local/Mooncake --micro_benchmarks", (TARNAME) : "pybind-TensorRT-LLM.tar.gz", (WHEEL_ARCHS): "80-real;86-real;89-real;90-real;100-real;103-real;120-real", ], @@ -80,13 +80,13 @@ def BUILD_CONFIGS = [ (WHEEL_ARCHS): "80-real;86-real;89-real;90-real;100-real;103-real;120-real", ], (CONFIG_LINUX_AARCH64): [ - (WHEEL_EXTRA_ARGS) : "--extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl", + (WHEEL_EXTRA_ARGS) : "--extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl --extra-cmake-vars MOONCAKE_ROOT=/usr/local/Mooncake", (TARNAME) : "TensorRT-LLM-GH200.tar.gz", (WHEEL_ARCHS): "90-real;100-real;103-real;120-real", (BUILD_JOBS_FOR_CONFIG): "4", // TODO: Remove after fix the build OOM issue on SBSA ], (CONFIG_LINUX_AARCH64_PYBIND): [ - (WHEEL_EXTRA_ARGS) : "--binding_type pybind --extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl", + (WHEEL_EXTRA_ARGS) : "--binding_type pybind --extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl --extra-cmake-vars MOONCAKE_ROOT=/usr/local/Mooncake", (TARNAME) : "pybind-TensorRT-LLM-GH200.tar.gz", (WHEEL_ARCHS): "90-real;100-real;103-real;120-real", (BUILD_JOBS_FOR_CONFIG): "4", // TODO: Remove after fix the build OOM issue on SBSA diff --git a/scripts/build_wheel.py b/scripts/build_wheel.py index 03aae586175..ed2d1b88fd7 100755 --- a/scripts/build_wheel.py +++ b/scripts/build_wheel.py @@ -458,6 +458,7 @@ def main(*, trt_root: str = '/usr/local/tensorrt', nccl_root: str = None, nixl_root: str = None, + mooncake_root: str = None, internal_cutlass_kernels_root: str = None, clean: bool = False, clean_wheel: bool = False, @@ -559,6 +560,11 @@ def main(*, if nixl_root is not None: cmake_def_args.append(f"-DNIXL_ROOT={nixl_root}") + if mooncake_root is not None: + if on_windows: + raise 
RuntimeError("Mooncake is not supported on Windows.") + cmake_def_args.append(f"-DMOONCAKE_ROOT={mooncake_root}") + build_dir = get_build_dir(build_dir, build_type) first_build = not Path(build_dir, "CMakeFiles").exists() @@ -819,6 +825,14 @@ def symlink_remove_dst_tree(src, dst, dirs_exist_ok=True): build_run( f"find {nixl_dir} -type f -name '*.so*' -exec patchelf --set-rpath \'$ORIGIN:$ORIGIN/plugins:$ORIGIN/../:$ORIGIN/../ucx/:$ORIGIN/../../ucx/\' {{}} \\;" ) + if os.path.exists( + build_dir / + "tensorrt_llm/executor/cache_transmission/mooncake_utils/libtensorrt_llm_mooncake_wrapper.so" + ): + install_file( + build_dir / + "tensorrt_llm/executor/cache_transmission/mooncake_utils/libtensorrt_llm_mooncake_wrapper.so", + lib_dir / "libtensorrt_llm_mooncake_wrapper.so") install_file( build_dir / "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/libdecoder_attention_0.so", @@ -1041,6 +1055,10 @@ def add_arguments(parser: ArgumentParser): help="Directory containing NCCL headers and libraries") parser.add_argument("--nixl_root", help="Directory containing NIXL headers and libraries") + parser.add_argument( + "--mooncake_root", + help= + "Directory containing Mooncake transfer engine headers and libraries") parser.add_argument( "--internal-cutlass-kernels-root", default="", diff --git a/scripts/generate_config_database_tests.py b/scripts/generate_config_database_tests.py new file mode 100644 index 00000000000..c198e975a47 --- /dev/null +++ b/scripts/generate_config_database_tests.py @@ -0,0 +1,226 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Generate a performance regression test list from the config database. + +This script: +1. Reads recipes from the examples/configs/database directory +2. Generates test config files per GPU type (e.g., config_database_b200_nvl.yaml) +3. Generates llm_config_database.yml test list with condition blocks grouped by GPU name and count +""" + +import copy +from collections import defaultdict +from pathlib import Path + +import yaml + +from examples.configs.database.database import ( + DATABASE_LIST_PATH, + Recipe, + RecipeList, + select_key_recipes, +) + +REPO_ROOT = Path(__file__).parent.parent +PERF_SANITY_DIR = REPO_ROOT / "tests" / "scripts" / "perf-sanity" +TEST_LIST_PATH = ( + REPO_ROOT / "tests" / "integration" / "test_lists" / "qa" / "llm_config_database.yml" +) +ITERATIONS = 10 + +# GPU type to condition wildcards mapping for test list +# Note: cpu is used to distinguish between e.g. 
H200_SXM and GH200 +GPU_WILDCARDS = { + "B200_NVL": {"gpu": ["*b200*"], "cpu": "x86_64", "linux_distribution_name": "ubuntu*"}, + "H200_SXM": {"gpu": ["*h200*"], "cpu": "x86_64", "linux_distribution_name": "ubuntu*"}, + "H100_SXM": {"gpu": ["*h100*"], "cpu": "x86_64", "linux_distribution_name": "ubuntu*"}, + "GH200": {"gpu": ["*gh200*"], "cpu": "aarch64", "linux_distribution_name": "ubuntu*"}, + "GB200": {"gpu": ["*gb200*"], "cpu": "aarch64", "linux_distribution_name": "ubuntu*"}, +} + + +def generate_server_name(recipe: Recipe) -> str: + """Generate a unique server name from recipe.""" + model_slug = recipe.model.replace("/", "_").replace("-", "_").replace(".", "_") + return f"{model_slug}_{recipe.isl}_{recipe.osl}_conc{recipe.concurrency}_gpu{recipe.num_gpus}" + + +def generate_client_name(recipe: Recipe) -> str: + """Generate client config name.""" + return f"con{recipe.concurrency}_isl{recipe.isl}_osl{recipe.osl}" + + +def recipe_to_server_config(recipe: Recipe, llm_api_config: dict) -> dict: + """Convert a recipe + LLM API config to aggr_server format.""" + server_config = { + "name": generate_server_name(recipe), + "model_name": recipe.model, + "gpus": recipe.num_gpus, + # Enable scenario-only matching for baseline comparison + "match_mode": "scenario", + } + + # Copy LLM API config fields + for key, value in llm_api_config.items(): + server_config[key] = value + + # Disable KV cache reuse to ensure consistency + if "kv_cache_config" not in server_config: + server_config["kv_cache_config"] = {} + server_config["kv_cache_config"]["enable_block_reuse"] = False + + # Add client configs + server_config["client_configs"] = [ + { + "name": generate_client_name(recipe), + "concurrency": recipe.concurrency, + "iterations": ITERATIONS, + "isl": recipe.isl, + "osl": recipe.osl, + "random_range_ratio": 0.0, # Fixed ISL/OSL for reproducibility + "backend": "openai", + "streaming": True, + } + ] + + return server_config + + +def group_recipes_by_scenario(recipes: RecipeList) -> dict: + """Group recipes by scenario key (model, gpu, isl, osl, num_gpus).""" + groups = defaultdict(list) + for recipe in recipes: + key = (recipe.model, recipe.gpu, recipe.isl, recipe.osl, recipe.num_gpus) + groups[key].append(recipe) + return groups + + +def filter_to_key_recipes(recipes: RecipeList) -> list[Recipe]: + """Filter recipes to only key configs (min latency, balanced, max throughput).""" + scenario_groups = group_recipes_by_scenario(recipes) + key_recipes = [] + for scenario_recipes in scenario_groups.values(): + for recipe, _ in select_key_recipes(scenario_recipes): + key_recipes.append(recipe) + return key_recipes + + +def group_recipes_by_gpu(recipes: list[Recipe]) -> dict[str, list[Recipe]]: + """Group recipes by GPU type.""" + groups = defaultdict(list) + for recipe in recipes: + groups[recipe.gpu].append(recipe) + return groups + + +def group_recipes_by_num_gpus(recipes: list[Recipe]) -> dict[int, list[Recipe]]: + """Group recipes by num_gpus within a GPU type.""" + groups = defaultdict(list) + for recipe in recipes: + groups[recipe.num_gpus].append(recipe) + return groups + + +def generate_aggr_config(recipes: list[Recipe]) -> dict[str, list[dict]]: + """Generate aggr_server config from recipes.""" + server_configs = [] + + for recipe in recipes: + llm_api_config = recipe.load_config() + server_config = recipe_to_server_config(recipe, llm_api_config) + server_configs.append(server_config) + + return {"server_configs": server_configs} + + +def generate_condition_entry( + gpu_name: str, num_gpus: int, 
config_name: str, server_names: list +) -> dict: + # using copy.deepcopy to avoid creating YAML anchors + wildcards = copy.deepcopy(GPU_WILDCARDS[gpu_name]) + condition = { + "wildcards": wildcards, + "ranges": {"system_gpu_count": {"gte": num_gpus}}, + } + + tests = [ + f"perf/test_perf.py::test_perf[perf_sanity_upload-{config_name}-{name}]" + for name in server_names + ] + return {"condition": condition, "tests": tests} + + +def generate_tests(test_list_path: Path = TEST_LIST_PATH, test_config_dir: Path = PERF_SANITY_DIR): + test_list_path.parent.mkdir(parents=True, exist_ok=True) + + all_recipes = RecipeList.from_yaml(DATABASE_LIST_PATH) + recipes = filter_to_key_recipes(all_recipes) + print(f"Selected {len(recipes)} key recipes from {len(all_recipes)} total") + + gpu_groups = group_recipes_by_gpu(recipes) + condition_entries = [] + config_files = {} + + for gpu_name in sorted(gpu_groups.keys()): + gpu_recipes = gpu_groups[gpu_name] + config_name = f"config_database_{gpu_name.lower()}" + config_path = test_config_dir / f"{config_name}.yaml" + + aggr_config = generate_aggr_config(gpu_recipes) + config_content = yaml.dump( + aggr_config, default_flow_style=False, sort_keys=False, width=120 + ) + + with open(config_path, "w", encoding="utf-8") as f: + f.write(config_content) + print(f"Generated {config_path}") + + config_files[config_path] = config_content + + # Generate condition entries grouped by num_gpus + num_gpus_groups = group_recipes_by_num_gpus(gpu_recipes) + for num_gpus in sorted(num_gpus_groups.keys()): + server_names = [generate_server_name(r) for r in num_gpus_groups[num_gpus]] + entry = generate_condition_entry(gpu_name, num_gpus, config_name, server_names) + condition_entries.append(entry) + + test_list = { + "version": "0.0.1", + "llm_config_database": condition_entries, + } + + header = """# =============================================================================== +# Config Database Performance Tests (AUTO-GENERATED) +# =============================================================================== +# Generated by: scripts/generate_config_database_tests.py +# +# These tests use scenario-only matching (match_mode: scenario) for baselines. +# Baselines are matched by (model, gpu, isl, osl, concurrency, num_gpus) instead +# of full config fields, allowing configs to evolve while maintaining comparison. +# +# To regenerate: +# python scripts/generate_config_database_tests.py +# =============================================================================== + +""" + with open(test_list_path, "w", encoding="utf-8") as f: + f.write(header) + yaml.dump(test_list, f, default_flow_style=False, sort_keys=False, width=120) + print(f"Generated {test_list_path}") + + +if __name__ == "__main__": + generate_tests() diff --git a/scripts/generate_config_table.py b/scripts/generate_config_table.py index 2d423c0811f..3c68c7edcb3 100644 --- a/scripts/generate_config_table.py +++ b/scripts/generate_config_table.py @@ -19,7 +19,7 @@ from collections import defaultdict from pathlib import Path -from examples.configs.database.database import DATABASE_LIST_PATH, RecipeList +from examples.configs.database.database import DATABASE_LIST_PATH, RecipeList, assign_profile SCRIPT_DIR = Path(__file__).parent.resolve() REPO_ROOT = SCRIPT_DIR.parent @@ -38,9 +38,6 @@ }, } -LOW_LATENCY_CONCURRENCY_THRESHOLD = 8 -HIGH_THROUGHPUT_CONCURRENCY_THRESHOLD = 32 - def generate_rst(yaml_path, output_file=None): """Generate RST table from YAML config database. 
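For reviewers, a minimal usage sketch of the profile-assignment helper that database.py now exports and that both generate_config_table.py and generate_config_database_tests.py reuse. This is illustrative only and not part of the patch; it assumes the repository root is on PYTHONPATH (as scripts/generate_config_database_tests.py itself assumes when importing examples.configs.database.database) and uses a hypothetical five-step concurrency ladder.

# Illustrative sketch only -- not part of the patch.
# Assumes the repo root is on PYTHONPATH so the database module is importable,
# mirroring the import used by scripts/generate_config_database_tests.py.
from examples.configs.database.database import assign_profile

# A hypothetical scenario group with five recipes, already sorted by concurrency.
concurrencies = [4, 8, 16, 32, 64]
n = len(concurrencies)

for idx, conc in enumerate(concurrencies):
    # With more than one recipe per scenario, assign_profile maps the position
    # in the concurrency-sorted list (not the raw concurrency) to a profile label.
    print(conc, assign_profile(n, idx, conc))

# Expected output:
#   4 Min Latency
#   8 Low Latency
#   16 Balanced
#   32 High Throughput
#   64 Max Throughput

select_key_recipes then keeps only the Min Latency, Balanced, and Max Throughput entries from such a group (indices 0, 2, and 4 above), which is what filter_to_key_recipes in the test generator relies on; a single-recipe scenario instead falls back to the concurrency thresholds (<= 8 gives Low Latency, >= 32 gives High Throughput, otherwise Balanced).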
@@ -51,10 +48,10 @@ def generate_rst(yaml_path, output_file=None): """ recipe_list = RecipeList.from_yaml(Path(yaml_path)) - # Group by model -> (gpu, isl, osl) -> list of recipes + # Group by model -> (gpu, num_gpus, isl, osl) -> list of recipes model_groups = defaultdict(lambda: defaultdict(list)) for recipe in recipe_list: - key = (recipe.gpu, recipe.isl, recipe.osl) + key = (recipe.gpu, recipe.num_gpus, recipe.isl, recipe.osl) model_groups[recipe.model][key].append(recipe) lines = [] @@ -97,7 +94,8 @@ def generate_rst(yaml_path, output_file=None): subgroups = model_groups[model] sorted_keys = sorted( - subgroups.keys(), key=lambda k: (str(k[0]), int(k[1] or 0), int(k[2] or 0)) + subgroups.keys(), + key=lambda k: (str(k[0]), int(k[1] or 0), int(k[2] or 0), int(k[3] or 0)), ) for key in sorted_keys: @@ -114,23 +112,7 @@ def generate_rst(yaml_path, output_file=None): conc = entry.concurrency config_path = entry.config_path - if n == 1: - if conc <= LOW_LATENCY_CONCURRENCY_THRESHOLD: - profile = "Low Latency" - elif conc >= HIGH_THROUGHPUT_CONCURRENCY_THRESHOLD: - profile = "High Throughput" - else: - profile = "Balanced" - elif idx == 0: - profile = "Min Latency" - elif idx == n - 1: - profile = "Max Throughput" - elif idx in ((n - 1) // 2, n // 2): - profile = "Balanced" - elif idx < n // 2: - profile = "Low Latency" - else: - profile = "High Throughput" + profile = assign_profile(n, idx, conc) full_config_path = config_path command = f"trtllm-serve {model} --extra_llm_api_options ${{TRTLLM_DIR}}/{full_config_path}" diff --git a/scripts/generate_lock_file.py b/scripts/generate_lock_file.py index 9b37858c0e1..5a0992902c5 100755 --- a/scripts/generate_lock_file.py +++ b/scripts/generate_lock_file.py @@ -156,9 +156,10 @@ def generate_metadata_json(): packages = packages[:-1] for package in packages: + package = re.sub(r'\s#.*$', '', package).rstrip() # WAR: ignore lines with "-f": No tool exists to parse complex requirements.txt - if '-f' in package or \ - "#" in package or \ + if not package or \ + '-f' in package or \ package.startswith('--'): continue diff --git a/security_scanning/docs/poetry.lock b/security_scanning/docs/poetry.lock index ac1ce39f45b..f2f8e40c409 100644 --- a/security_scanning/docs/poetry.lock +++ b/security_scanning/docs/poetry.lock @@ -900,13 +900,13 @@ files = [ [[package]] name = "soupsieve" -version = "2.8" +version = "2.8.1" description = "A modern CSS selector implementation for Beautiful Soup." optional = false python-versions = ">=3.9" files = [ - {file = "soupsieve-2.8-py3-none-any.whl", hash = "sha256:0cc76456a30e20f5d7f2e14a98a4ae2ee4e5abdc7c5ea0aafe795f344bc7984c"}, - {file = "soupsieve-2.8.tar.gz", hash = "sha256:e2dd4a40a628cb5f28f6d4b0db8800b8f581b65bb380b97de22ba5ca8d72572f"}, + {file = "soupsieve-2.8.1-py3-none-any.whl", hash = "sha256:a11fe2a6f3d76ab3cf2de04eb339c1be5b506a8a47f2ceb6d139803177f85434"}, + {file = "soupsieve-2.8.1.tar.gz", hash = "sha256:4cf733bc50fa805f5df4b8ef4740fc0e0fa6218cf3006269afd3f9d6d80fd350"}, ] [[package]] diff --git a/security_scanning/examples/models/core/qwen/poetry.lock b/security_scanning/examples/models/core/qwen/poetry.lock index 261179a6251..a2004681e08 100644 --- a/security_scanning/examples/models/core/qwen/poetry.lock +++ b/security_scanning/examples/models/core/qwen/poetry.lock @@ -2927,30 +2927,30 @@ six = ">=1.14.0" [[package]] name = "ruff" -version = "0.14.9" +version = "0.14.10" description = "An extremely fast Python linter and code formatter, written in Rust." 
optional = false python-versions = ">=3.7" files = [ - {file = "ruff-0.14.9-py3-none-linux_armv6l.whl", hash = "sha256:f1ec5de1ce150ca6e43691f4a9ef5c04574ad9ca35c8b3b0e18877314aba7e75"}, - {file = "ruff-0.14.9-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:ed9d7417a299fc6030b4f26333bf1117ed82a61ea91238558c0268c14e00d0c2"}, - {file = "ruff-0.14.9-py3-none-macosx_11_0_arm64.whl", hash = "sha256:d5dc3473c3f0e4a1008d0ef1d75cee24a48e254c8bed3a7afdd2b4392657ed2c"}, - {file = "ruff-0.14.9-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:84bf7c698fc8f3cb8278830fb6b5a47f9bcc1ed8cb4f689b9dd02698fa840697"}, - {file = "ruff-0.14.9-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:aa733093d1f9d88a5d98988d8834ef5d6f9828d03743bf5e338bf980a19fce27"}, - {file = "ruff-0.14.9-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6a1cfb04eda979b20c8c19550c8b5f498df64ff8da151283311ce3199e8b3648"}, - {file = "ruff-0.14.9-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:1e5cb521e5ccf0008bd74d5595a4580313844a42b9103b7388eca5a12c970743"}, - {file = "ruff-0.14.9-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cd429a8926be6bba4befa8cdcf3f4dd2591c413ea5066b1e99155ed245ae42bb"}, - {file = "ruff-0.14.9-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ab208c1b7a492e37caeaf290b1378148f75e13c2225af5d44628b95fd7834273"}, - {file = "ruff-0.14.9-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:72034534e5b11e8a593f517b2f2f2b273eb68a30978c6a2d40473ad0aaa4cb4a"}, - {file = "ruff-0.14.9-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:712ff04f44663f1b90a1195f51525836e3413c8a773574a7b7775554269c30ed"}, - {file = "ruff-0.14.9-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:a111fee1db6f1d5d5810245295527cda1d367c5aa8f42e0fca9a78ede9b4498b"}, - {file = "ruff-0.14.9-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:8769efc71558fecc25eb295ddec7d1030d41a51e9dcf127cbd63ec517f22d567"}, - {file = "ruff-0.14.9-py3-none-musllinux_1_2_i686.whl", hash = "sha256:347e3bf16197e8a2de17940cd75fd6491e25c0aa7edf7d61aa03f146a1aa885a"}, - {file = "ruff-0.14.9-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:7715d14e5bccf5b660f54516558aa94781d3eb0838f8e706fb60e3ff6eff03a8"}, - {file = "ruff-0.14.9-py3-none-win32.whl", hash = "sha256:df0937f30aaabe83da172adaf8937003ff28172f59ca9f17883b4213783df197"}, - {file = "ruff-0.14.9-py3-none-win_amd64.whl", hash = "sha256:c0b53a10e61df15a42ed711ec0bda0c582039cf6c754c49c020084c55b5b0bc2"}, - {file = "ruff-0.14.9-py3-none-win_arm64.whl", hash = "sha256:8e821c366517a074046d92f0e9213ed1c13dbc5b37a7fc20b07f79b64d62cc84"}, - {file = "ruff-0.14.9.tar.gz", hash = "sha256:35f85b25dd586381c0cc053f48826109384c81c00ad7ef1bd977bfcc28119d5b"}, + {file = "ruff-0.14.10-py3-none-linux_armv6l.whl", hash = "sha256:7a3ce585f2ade3e1f29ec1b92df13e3da262178df8c8bdf876f48fa0e8316c49"}, + {file = "ruff-0.14.10-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:674f9be9372907f7257c51f1d4fc902cb7cf014b9980152b802794317941f08f"}, + {file = "ruff-0.14.10-py3-none-macosx_11_0_arm64.whl", hash = "sha256:d85713d522348837ef9df8efca33ccb8bd6fcfc86a2cde3ccb4bc9d28a18003d"}, + {file = "ruff-0.14.10-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6987ebe0501ae4f4308d7d24e2d0fe3d7a98430f5adfd0f1fead050a740a3a77"}, + {file = "ruff-0.14.10-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = 
"sha256:16a01dfb7b9e4eee556fbfd5392806b1b8550c9b4a9f6acd3dbe6812b193c70a"}, + {file = "ruff-0.14.10-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7165d31a925b7a294465fa81be8c12a0e9b60fb02bf177e79067c867e71f8b1f"}, + {file = "ruff-0.14.10-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:c561695675b972effb0c0a45db233f2c816ff3da8dcfbe7dfc7eed625f218935"}, + {file = "ruff-0.14.10-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4bb98fcbbc61725968893682fd4df8966a34611239c9fd07a1f6a07e7103d08e"}, + {file = "ruff-0.14.10-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f24b47993a9d8cb858429e97bdf8544c78029f09b520af615c1d261bf827001d"}, + {file = "ruff-0.14.10-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:59aabd2e2c4fd614d2862e7939c34a532c04f1084476d6833dddef4afab87e9f"}, + {file = "ruff-0.14.10-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:213db2b2e44be8625002dbea33bb9c60c66ea2c07c084a00d55732689d697a7f"}, + {file = "ruff-0.14.10-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:b914c40ab64865a17a9a5b67911d14df72346a634527240039eb3bd650e5979d"}, + {file = "ruff-0.14.10-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:1484983559f026788e3a5c07c81ef7d1e97c1c78ed03041a18f75df104c45405"}, + {file = "ruff-0.14.10-py3-none-musllinux_1_2_i686.whl", hash = "sha256:c70427132db492d25f982fffc8d6c7535cc2fd2c83fc8888f05caaa248521e60"}, + {file = "ruff-0.14.10-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:5bcf45b681e9f1ee6445d317ce1fa9d6cba9a6049542d1c3d5b5958986be8830"}, + {file = "ruff-0.14.10-py3-none-win32.whl", hash = "sha256:104c49fc7ab73f3f3a758039adea978869a918f31b73280db175b43a2d9b51d6"}, + {file = "ruff-0.14.10-py3-none-win_amd64.whl", hash = "sha256:466297bd73638c6bdf06485683e812db1c00c7ac96d4ddd0294a338c62fdc154"}, + {file = "ruff-0.14.10-py3-none-win_arm64.whl", hash = "sha256:e51d046cf6dda98a4633b8a8a771451107413b0f07183b2bef03f075599e44e6"}, + {file = "ruff-0.14.10.tar.gz", hash = "sha256:9a2e830f075d1a42cd28420d7809ace390832a490ed0966fe373ba288e77aaf4"}, ] [[package]] diff --git a/security_scanning/metadata.json b/security_scanning/metadata.json index 2356583a718..084a98983f2 100644 --- a/security_scanning/metadata.json +++ b/security_scanning/metadata.json @@ -1,4 +1,4 @@ { - "commit_hash": "c1cfb61b1b0940e9212b68e7ee72d42c6126e242", - "timestamp": "2025-12-18T02:42:21Z" + "commit_hash": "a7ac5a6bca6eab92723ec2d4abacee940e56ad22", + "timestamp": "2025-12-19T02:39:13Z" } diff --git a/security_scanning/poetry.lock b/security_scanning/poetry.lock index 286d967e6fa..fce2b50e726 100644 --- a/security_scanning/poetry.lock +++ b/security_scanning/poetry.lock @@ -740,6 +740,82 @@ files = [ {file = "colored-2.3.1.tar.gz", hash = "sha256:fe6e888e12dc16643daa0b108f785df6d0b48420084b5d0a567de27bb09a14d8"}, ] +[[package]] +name = "contourpy" +version = "1.3.2" +description = "Python library for calculating contours of 2D quadrilateral grids" +optional = false +python-versions = ">=3.10" +files = [ + {file = "contourpy-1.3.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ba38e3f9f330af820c4b27ceb4b9c7feee5fe0493ea53a8720f4792667465934"}, + {file = "contourpy-1.3.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:dc41ba0714aa2968d1f8674ec97504a8f7e334f48eeacebcaa6256213acb0989"}, + {file = "contourpy-1.3.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9be002b31c558d1ddf1b9b415b162c603405414bacd6932d031c5b5a8b757f0d"}, + {file = 
"contourpy-1.3.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8d2e74acbcba3bfdb6d9d8384cdc4f9260cae86ed9beee8bd5f54fee49a430b9"}, + {file = "contourpy-1.3.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e259bced5549ac64410162adc973c5e2fb77f04df4a439d00b478e57a0e65512"}, + {file = "contourpy-1.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ad687a04bc802cbe8b9c399c07162a3c35e227e2daccf1668eb1f278cb698631"}, + {file = "contourpy-1.3.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:cdd22595308f53ef2f891040ab2b93d79192513ffccbd7fe19be7aa773a5e09f"}, + {file = "contourpy-1.3.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:b4f54d6a2defe9f257327b0f243612dd051cc43825587520b1bf74a31e2f6ef2"}, + {file = "contourpy-1.3.2-cp310-cp310-win32.whl", hash = "sha256:f939a054192ddc596e031e50bb13b657ce318cf13d264f095ce9db7dc6ae81c0"}, + {file = "contourpy-1.3.2-cp310-cp310-win_amd64.whl", hash = "sha256:c440093bbc8fc21c637c03bafcbef95ccd963bc6e0514ad887932c18ca2a759a"}, + {file = "contourpy-1.3.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6a37a2fb93d4df3fc4c0e363ea4d16f83195fc09c891bc8ce072b9d084853445"}, + {file = "contourpy-1.3.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b7cd50c38f500bbcc9b6a46643a40e0913673f869315d8e70de0438817cb7773"}, + {file = "contourpy-1.3.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d6658ccc7251a4433eebd89ed2672c2ed96fba367fd25ca9512aa92a4b46c4f1"}, + {file = "contourpy-1.3.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:70771a461aaeb335df14deb6c97439973d253ae70660ca085eec25241137ef43"}, + {file = "contourpy-1.3.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:65a887a6e8c4cd0897507d814b14c54a8c2e2aa4ac9f7686292f9769fcf9a6ab"}, + {file = "contourpy-1.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3859783aefa2b8355697f16642695a5b9792e7a46ab86da1118a4a23a51a33d7"}, + {file = "contourpy-1.3.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:eab0f6db315fa4d70f1d8ab514e527f0366ec021ff853d7ed6a2d33605cf4b83"}, + {file = "contourpy-1.3.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d91a3ccc7fea94ca0acab82ceb77f396d50a1f67412efe4c526f5d20264e6ecd"}, + {file = "contourpy-1.3.2-cp311-cp311-win32.whl", hash = "sha256:1c48188778d4d2f3d48e4643fb15d8608b1d01e4b4d6b0548d9b336c28fc9b6f"}, + {file = "contourpy-1.3.2-cp311-cp311-win_amd64.whl", hash = "sha256:5ebac872ba09cb8f2131c46b8739a7ff71de28a24c869bcad554477eb089a878"}, + {file = "contourpy-1.3.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:4caf2bcd2969402bf77edc4cb6034c7dd7c0803213b3523f111eb7460a51b8d2"}, + {file = "contourpy-1.3.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:82199cb78276249796419fe36b7386bd8d2cc3f28b3bc19fe2454fe2e26c4c15"}, + {file = "contourpy-1.3.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:106fab697af11456fcba3e352ad50effe493a90f893fca6c2ca5c033820cea92"}, + {file = "contourpy-1.3.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d14f12932a8d620e307f715857107b1d1845cc44fdb5da2bc8e850f5ceba9f87"}, + {file = "contourpy-1.3.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:532fd26e715560721bb0d5fc7610fce279b3699b018600ab999d1be895b09415"}, + {file = "contourpy-1.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:f26b383144cf2d2c29f01a1e8170f50dacf0eac02d64139dcd709a8ac4eb3cfe"}, + {file = "contourpy-1.3.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:c49f73e61f1f774650a55d221803b101d966ca0c5a2d6d5e4320ec3997489441"}, + {file = "contourpy-1.3.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3d80b2c0300583228ac98d0a927a1ba6a2ba6b8a742463c564f1d419ee5b211e"}, + {file = "contourpy-1.3.2-cp312-cp312-win32.whl", hash = "sha256:90df94c89a91b7362e1142cbee7568f86514412ab8a2c0d0fca72d7e91b62912"}, + {file = "contourpy-1.3.2-cp312-cp312-win_amd64.whl", hash = "sha256:8c942a01d9163e2e5cfb05cb66110121b8d07ad438a17f9e766317bcb62abf73"}, + {file = "contourpy-1.3.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:de39db2604ae755316cb5967728f4bea92685884b1e767b7c24e983ef5f771cb"}, + {file = "contourpy-1.3.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3f9e896f447c5c8618f1edb2bafa9a4030f22a575ec418ad70611450720b5b08"}, + {file = "contourpy-1.3.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:71e2bd4a1c4188f5c2b8d274da78faab884b59df20df63c34f74aa1813c4427c"}, + {file = "contourpy-1.3.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:de425af81b6cea33101ae95ece1f696af39446db9682a0b56daaa48cfc29f38f"}, + {file = "contourpy-1.3.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:977e98a0e0480d3fe292246417239d2d45435904afd6d7332d8455981c408b85"}, + {file = "contourpy-1.3.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:434f0adf84911c924519d2b08fc10491dd282b20bdd3fa8f60fd816ea0b48841"}, + {file = "contourpy-1.3.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:c66c4906cdbc50e9cba65978823e6e00b45682eb09adbb78c9775b74eb222422"}, + {file = "contourpy-1.3.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:8b7fc0cd78ba2f4695fd0a6ad81a19e7e3ab825c31b577f384aa9d7817dc3bef"}, + {file = "contourpy-1.3.2-cp313-cp313-win32.whl", hash = "sha256:15ce6ab60957ca74cff444fe66d9045c1fd3e92c8936894ebd1f3eef2fff075f"}, + {file = "contourpy-1.3.2-cp313-cp313-win_amd64.whl", hash = "sha256:e1578f7eafce927b168752ed7e22646dad6cd9bca673c60bff55889fa236ebf9"}, + {file = "contourpy-1.3.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:0475b1f6604896bc7c53bb070e355e9321e1bc0d381735421a2d2068ec56531f"}, + {file = "contourpy-1.3.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:c85bb486e9be652314bb5b9e2e3b0d1b2e643d5eec4992c0fbe8ac71775da739"}, + {file = "contourpy-1.3.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:745b57db7758f3ffc05a10254edd3182a2a83402a89c00957a8e8a22f5582823"}, + {file = "contourpy-1.3.2-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:970e9173dbd7eba9b4e01aab19215a48ee5dd3f43cef736eebde064a171f89a5"}, + {file = "contourpy-1.3.2-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c6c4639a9c22230276b7bffb6a850dfc8258a2521305e1faefe804d006b2e532"}, + {file = "contourpy-1.3.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc829960f34ba36aad4302e78eabf3ef16a3a100863f0d4eeddf30e8a485a03b"}, + {file = "contourpy-1.3.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:d32530b534e986374fc19eaa77fcb87e8a99e5431499949b828312bdcd20ac52"}, + {file = "contourpy-1.3.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:e298e7e70cf4eb179cc1077be1c725b5fd131ebc81181bf0c03525c8abc297fd"}, + {file = "contourpy-1.3.2-cp313-cp313t-win32.whl", hash = 
"sha256:d0e589ae0d55204991450bb5c23f571c64fe43adaa53f93fc902a84c96f52fe1"}, + {file = "contourpy-1.3.2-cp313-cp313t-win_amd64.whl", hash = "sha256:78e9253c3de756b3f6a5174d024c4835acd59eb3f8e2ca13e775dbffe1558f69"}, + {file = "contourpy-1.3.2-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:fd93cc7f3139b6dd7aab2f26a90dde0aa9fc264dbf70f6740d498a70b860b82c"}, + {file = "contourpy-1.3.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:107ba8a6a7eec58bb475329e6d3b95deba9440667c4d62b9b6063942b61d7f16"}, + {file = "contourpy-1.3.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:ded1706ed0c1049224531b81128efbd5084598f18d8a2d9efae833edbd2b40ad"}, + {file = "contourpy-1.3.2-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:5f5964cdad279256c084b69c3f412b7801e15356b16efa9d78aa974041903da0"}, + {file = "contourpy-1.3.2-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:49b65a95d642d4efa8f64ba12558fcb83407e58a2dfba9d796d77b63ccfcaff5"}, + {file = "contourpy-1.3.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:8c5acb8dddb0752bf252e01a3035b21443158910ac16a3b0d20e7fed7d534ce5"}, + {file = "contourpy-1.3.2.tar.gz", hash = "sha256:b6945942715a034c671b7fc54f9588126b0b8bf23db2696e3ca8328f3ff0ab54"}, +] + +[package.dependencies] +numpy = ">=1.23" + +[package.extras] +bokeh = ["bokeh", "selenium"] +docs = ["furo", "sphinx (>=7.2)", "sphinx-copybutton"] +mypy = ["bokeh", "contourpy[bokeh,docs]", "docutils-stubs", "mypy (==1.15.0)", "types-Pillow"] +test = ["Pillow", "contourpy[test-no-images]", "matplotlib"] +test-no-images = ["pytest", "pytest-cov", "pytest-rerunfailures", "pytest-xdist", "wurlitzer"] + [[package]] name = "cuda-bindings" version = "13.1.1" @@ -841,6 +917,21 @@ opencl = ["nvidia-cuda-opencl (==13.0.85.*)"] profiler = ["nvidia-cuda-profiler-api (==13.0.85.*)"] sanitizer = ["nvidia-cuda-sanitizer-api (==13.0.85.*)"] +[[package]] +name = "cycler" +version = "0.12.1" +description = "Composable style cycles" +optional = false +python-versions = ">=3.8" +files = [ + {file = "cycler-0.12.1-py3-none-any.whl", hash = "sha256:85cef7cff222d8644161529808465972e51340599459b8ac3ccbac5a854e0d30"}, + {file = "cycler-0.12.1.tar.gz", hash = "sha256:88bb128f02ba341da8ef447245a9e138fae777f6a23943da4540077d3601eb1c"}, +] + +[package.extras] +docs = ["ipython", "matplotlib", "numpydoc", "sphinx"] +tests = ["pytest", "pytest-cov", "pytest-xdist"] + [[package]] name = "datasets" version = "3.1.0" @@ -1084,6 +1175,78 @@ tabulate = "*" torch = "*" tqdm = "*" +[[package]] +name = "fonttools" +version = "4.61.1" +description = "Tools to manipulate font files" +optional = false +python-versions = ">=3.10" +files = [ + {file = "fonttools-4.61.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7c7db70d57e5e1089a274cbb2b1fd635c9a24de809a231b154965d415d6c6d24"}, + {file = "fonttools-4.61.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5fe9fd43882620017add5eabb781ebfbc6998ee49b35bd7f8f79af1f9f99a958"}, + {file = "fonttools-4.61.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d8db08051fc9e7d8bc622f2112511b8107d8f27cd89e2f64ec45e9825e8288da"}, + {file = "fonttools-4.61.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a76d4cb80f41ba94a6691264be76435e5f72f2cb3cab0b092a6212855f71c2f6"}, + {file = "fonttools-4.61.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:a13fc8aeb24bad755eea8f7f9d409438eb94e82cf86b08fe77a03fbc8f6a96b1"}, + {file = 
"fonttools-4.61.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:b846a1fcf8beadeb9ea4f44ec5bdde393e2f1569e17d700bfc49cd69bde75881"}, + {file = "fonttools-4.61.1-cp310-cp310-win32.whl", hash = "sha256:78a7d3ab09dc47ac1a363a493e6112d8cabed7ba7caad5f54dbe2f08676d1b47"}, + {file = "fonttools-4.61.1-cp310-cp310-win_amd64.whl", hash = "sha256:eff1ac3cc66c2ac7cda1e64b4e2f3ffef474b7335f92fc3833fc632d595fcee6"}, + {file = "fonttools-4.61.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c6604b735bb12fef8e0efd5578c9fb5d3d8532d5001ea13a19cddf295673ee09"}, + {file = "fonttools-4.61.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5ce02f38a754f207f2f06557523cd39a06438ba3aafc0639c477ac409fc64e37"}, + {file = "fonttools-4.61.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:77efb033d8d7ff233385f30c62c7c79271c8885d5c9657d967ede124671bbdfb"}, + {file = "fonttools-4.61.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:75c1a6dfac6abd407634420c93864a1e274ebc1c7531346d9254c0d8f6ca00f9"}, + {file = "fonttools-4.61.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0de30bfe7745c0d1ffa2b0b7048fb7123ad0d71107e10ee090fa0b16b9452e87"}, + {file = "fonttools-4.61.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:58b0ee0ab5b1fc9921eccfe11d1435added19d6494dde14e323f25ad2bc30c56"}, + {file = "fonttools-4.61.1-cp311-cp311-win32.whl", hash = "sha256:f79b168428351d11e10c5aeb61a74e1851ec221081299f4cf56036a95431c43a"}, + {file = "fonttools-4.61.1-cp311-cp311-win_amd64.whl", hash = "sha256:fe2efccb324948a11dd09d22136fe2ac8a97d6c1347cf0b58a911dcd529f66b7"}, + {file = "fonttools-4.61.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:f3cb4a569029b9f291f88aafc927dd53683757e640081ca8c412781ea144565e"}, + {file = "fonttools-4.61.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:41a7170d042e8c0024703ed13b71893519a1a6d6e18e933e3ec7507a2c26a4b2"}, + {file = "fonttools-4.61.1-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:10d88e55330e092940584774ee5e8a6971b01fc2f4d3466a1d6c158230880796"}, + {file = "fonttools-4.61.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:15acc09befd16a0fb8a8f62bc147e1a82817542d72184acca9ce6e0aeda9fa6d"}, + {file = "fonttools-4.61.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e6bcdf33aec38d16508ce61fd81838f24c83c90a1d1b8c68982857038673d6b8"}, + {file = "fonttools-4.61.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5fade934607a523614726119164ff621e8c30e8fa1ffffbbd358662056ba69f0"}, + {file = "fonttools-4.61.1-cp312-cp312-win32.whl", hash = "sha256:75da8f28eff26defba42c52986de97b22106cb8f26515b7c22443ebc9c2d3261"}, + {file = "fonttools-4.61.1-cp312-cp312-win_amd64.whl", hash = "sha256:497c31ce314219888c0e2fce5ad9178ca83fe5230b01a5006726cdf3ac9f24d9"}, + {file = "fonttools-4.61.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:8c56c488ab471628ff3bfa80964372fc13504ece601e0d97a78ee74126b2045c"}, + {file = "fonttools-4.61.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:dc492779501fa723b04d0ab1f5be046797fee17d27700476edc7ee9ae535a61e"}, + {file = "fonttools-4.61.1-cp313-cp313-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:64102ca87e84261419c3747a0d20f396eb024bdbeb04c2bfb37e2891f5fadcb5"}, + {file = 
"fonttools-4.61.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4c1b526c8d3f615a7b1867f38a9410849c8f4aef078535742198e942fba0e9bd"}, + {file = "fonttools-4.61.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:41ed4b5ec103bd306bb68f81dc166e77409e5209443e5773cb4ed837bcc9b0d3"}, + {file = "fonttools-4.61.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:b501c862d4901792adaec7c25b1ecc749e2662543f68bb194c42ba18d6eec98d"}, + {file = "fonttools-4.61.1-cp313-cp313-win32.whl", hash = "sha256:4d7092bb38c53bbc78e9255a59158b150bcdc115a1e3b3ce0b5f267dc35dd63c"}, + {file = "fonttools-4.61.1-cp313-cp313-win_amd64.whl", hash = "sha256:21e7c8d76f62ab13c9472ccf74515ca5b9a761d1bde3265152a6dc58700d895b"}, + {file = "fonttools-4.61.1-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:fff4f534200a04b4a36e7ae3cb74493afe807b517a09e99cb4faa89a34ed6ecd"}, + {file = "fonttools-4.61.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:d9203500f7c63545b4ce3799319fe4d9feb1a1b89b28d3cb5abd11b9dd64147e"}, + {file = "fonttools-4.61.1-cp314-cp314-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:fa646ecec9528bef693415c79a86e733c70a4965dd938e9a226b0fc64c9d2e6c"}, + {file = "fonttools-4.61.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:11f35ad7805edba3aac1a3710d104592df59f4b957e30108ae0ba6c10b11dd75"}, + {file = "fonttools-4.61.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:b931ae8f62db78861b0ff1ac017851764602288575d65b8e8ff1963fed419063"}, + {file = "fonttools-4.61.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:b148b56f5de675ee16d45e769e69f87623a4944f7443850bf9a9376e628a89d2"}, + {file = "fonttools-4.61.1-cp314-cp314-win32.whl", hash = "sha256:9b666a475a65f4e839d3d10473fad6d47e0a9db14a2f4a224029c5bfde58ad2c"}, + {file = "fonttools-4.61.1-cp314-cp314-win_amd64.whl", hash = "sha256:4f5686e1fe5fce75d82d93c47a438a25bf0d1319d2843a926f741140b2b16e0c"}, + {file = "fonttools-4.61.1-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:e76ce097e3c57c4bcb67c5aa24a0ecdbd9f74ea9219997a707a4061fbe2707aa"}, + {file = "fonttools-4.61.1-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:9cfef3ab326780c04d6646f68d4b4742aae222e8b8ea1d627c74e38afcbc9d91"}, + {file = "fonttools-4.61.1-cp314-cp314t-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:a75c301f96db737e1c5ed5fd7d77d9c34466de16095a266509e13da09751bd19"}, + {file = "fonttools-4.61.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:91669ccac46bbc1d09e9273546181919064e8df73488ea087dcac3e2968df9ba"}, + {file = "fonttools-4.61.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:c33ab3ca9d3ccd581d58e989d67554e42d8d4ded94ab3ade3508455fe70e65f7"}, + {file = "fonttools-4.61.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:664c5a68ec406f6b1547946683008576ef8b38275608e1cee6c061828171c118"}, + {file = "fonttools-4.61.1-cp314-cp314t-win32.whl", hash = "sha256:aed04cabe26f30c1647ef0e8fbb207516fd40fe9472e9439695f5c6998e60ac5"}, + {file = "fonttools-4.61.1-cp314-cp314t-win_amd64.whl", hash = "sha256:2180f14c141d2f0f3da43f3a81bc8aa4684860f6b0e6f9e165a4831f24e6a23b"}, + {file = "fonttools-4.61.1-py3-none-any.whl", hash = "sha256:17d2bf5d541add43822bcf0c43d7d847b160c9bb01d15d5007d84e2217aaa371"}, + {file = "fonttools-4.61.1.tar.gz", hash = 
"sha256:6675329885c44657f826ef01d9e4fb33b9158e9d93c537d84ad8399539bc6f69"}, +] + +[package.extras] +all = ["brotli (>=1.0.1)", "brotlicffi (>=0.8.0)", "lxml (>=4.0)", "lz4 (>=1.7.4.2)", "matplotlib", "munkres", "pycairo", "scipy", "skia-pathops (>=0.5.0)", "sympy", "uharfbuzz (>=0.45.0)", "unicodedata2 (>=17.0.0)", "xattr", "zopfli (>=0.1.4)"] +graphite = ["lz4 (>=1.7.4.2)"] +interpolatable = ["munkres", "pycairo", "scipy"] +lxml = ["lxml (>=4.0)"] +pathops = ["skia-pathops (>=0.5.0)"] +plot = ["matplotlib"] +repacker = ["uharfbuzz (>=0.45.0)"] +symfont = ["sympy"] +type1 = ["xattr"] +unicode = ["unicodedata2 (>=17.0.0)"] +woff = ["brotli (>=1.0.1)", "brotlicffi (>=0.8.0)", "zopfli (>=0.1.4)"] + [[package]] name = "frozenlist" version = "1.8.0" @@ -1707,6 +1870,116 @@ files = [ [package.dependencies] referencing = ">=0.31.0" +[[package]] +name = "kiwisolver" +version = "1.4.9" +description = "A fast implementation of the Cassowary constraint solver" +optional = false +python-versions = ">=3.10" +files = [ + {file = "kiwisolver-1.4.9-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:b4b4d74bda2b8ebf4da5bd42af11d02d04428b2c32846e4c2c93219df8a7987b"}, + {file = "kiwisolver-1.4.9-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:fb3b8132019ea572f4611d770991000d7f58127560c4889729248eb5852a102f"}, + {file = "kiwisolver-1.4.9-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:84fd60810829c27ae375114cd379da1fa65e6918e1da405f356a775d49a62bcf"}, + {file = "kiwisolver-1.4.9-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:b78efa4c6e804ecdf727e580dbb9cba85624d2e1c6b5cb059c66290063bd99a9"}, + {file = "kiwisolver-1.4.9-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d4efec7bcf21671db6a3294ff301d2fc861c31faa3c8740d1a94689234d1b415"}, + {file = "kiwisolver-1.4.9-cp310-cp310-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:90f47e70293fc3688b71271100a1a5453aa9944a81d27ff779c108372cf5567b"}, + {file = "kiwisolver-1.4.9-cp310-cp310-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:8fdca1def57a2e88ef339de1737a1449d6dbf5fab184c54a1fca01d541317154"}, + {file = "kiwisolver-1.4.9-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:9cf554f21be770f5111a1690d42313e140355e687e05cf82cb23d0a721a64a48"}, + {file = "kiwisolver-1.4.9-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:fc1795ac5cd0510207482c3d1d3ed781143383b8cfd36f5c645f3897ce066220"}, + {file = "kiwisolver-1.4.9-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:ccd09f20ccdbbd341b21a67ab50a119b64a403b09288c27481575105283c1586"}, + {file = "kiwisolver-1.4.9-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:540c7c72324d864406a009d72f5d6856f49693db95d1fbb46cf86febef873634"}, + {file = "kiwisolver-1.4.9-cp310-cp310-win_amd64.whl", hash = "sha256:ede8c6d533bc6601a47ad4046080d36b8fc99f81e6f1c17b0ac3c2dc91ac7611"}, + {file = "kiwisolver-1.4.9-cp310-cp310-win_arm64.whl", hash = "sha256:7b4da0d01ac866a57dd61ac258c5607b4cd677f63abaec7b148354d2b2cdd536"}, + {file = "kiwisolver-1.4.9-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:eb14a5da6dc7642b0f3a18f13654847cd8b7a2550e2645a5bda677862b03ba16"}, + {file = "kiwisolver-1.4.9-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:39a219e1c81ae3b103643d2aedb90f1ef22650deb266ff12a19e7773f3e5f089"}, + {file = "kiwisolver-1.4.9-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2405a7d98604b87f3fc28b1716783534b1b4b8510d8142adca34ee0bc3c87543"}, + {file = 
"kiwisolver-1.4.9-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:dc1ae486f9abcef254b5618dfb4113dd49f94c68e3e027d03cf0143f3f772b61"}, + {file = "kiwisolver-1.4.9-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8a1f570ce4d62d718dce3f179ee78dac3b545ac16c0c04bb363b7607a949c0d1"}, + {file = "kiwisolver-1.4.9-cp311-cp311-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:cb27e7b78d716c591e88e0a09a2139c6577865d7f2e152488c2cc6257f460872"}, + {file = "kiwisolver-1.4.9-cp311-cp311-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:15163165efc2f627eb9687ea5f3a28137217d217ac4024893d753f46bce9de26"}, + {file = "kiwisolver-1.4.9-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:bdee92c56a71d2b24c33a7d4c2856bd6419d017e08caa7802d2963870e315028"}, + {file = "kiwisolver-1.4.9-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:412f287c55a6f54b0650bd9b6dce5aceddb95864a1a90c87af16979d37c89771"}, + {file = "kiwisolver-1.4.9-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:2c93f00dcba2eea70af2be5f11a830a742fe6b579a1d4e00f47760ef13be247a"}, + {file = "kiwisolver-1.4.9-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f117e1a089d9411663a3207ba874f31be9ac8eaa5b533787024dc07aeb74f464"}, + {file = "kiwisolver-1.4.9-cp311-cp311-win_amd64.whl", hash = "sha256:be6a04e6c79819c9a8c2373317d19a96048e5a3f90bec587787e86a1153883c2"}, + {file = "kiwisolver-1.4.9-cp311-cp311-win_arm64.whl", hash = "sha256:0ae37737256ba2de764ddc12aed4956460277f00c4996d51a197e72f62f5eec7"}, + {file = "kiwisolver-1.4.9-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:ac5a486ac389dddcc5bef4f365b6ae3ffff2c433324fb38dd35e3fab7c957999"}, + {file = "kiwisolver-1.4.9-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:f2ba92255faa7309d06fe44c3a4a97efe1c8d640c2a79a5ef728b685762a6fd2"}, + {file = "kiwisolver-1.4.9-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4a2899935e724dd1074cb568ce7ac0dce28b2cd6ab539c8e001a8578eb106d14"}, + {file = "kiwisolver-1.4.9-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f6008a4919fdbc0b0097089f67a1eb55d950ed7e90ce2cc3e640abadd2757a04"}, + {file = "kiwisolver-1.4.9-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:67bb8b474b4181770f926f7b7d2f8c0248cbcb78b660fdd41a47054b28d2a752"}, + {file = "kiwisolver-1.4.9-cp312-cp312-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:2327a4a30d3ee07d2fbe2e7933e8a37c591663b96ce42a00bc67461a87d7df77"}, + {file = "kiwisolver-1.4.9-cp312-cp312-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:7a08b491ec91b1d5053ac177afe5290adacf1f0f6307d771ccac5de30592d198"}, + {file = "kiwisolver-1.4.9-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:d8fc5c867c22b828001b6a38d2eaeb88160bf5783c6cb4a5e440efc981ce286d"}, + {file = "kiwisolver-1.4.9-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:3b3115b2581ea35bb6d1f24a4c90af37e5d9b49dcff267eeed14c3893c5b86ab"}, + {file = "kiwisolver-1.4.9-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:858e4c22fb075920b96a291928cb7dea5644e94c0ee4fcd5af7e865655e4ccf2"}, + {file = "kiwisolver-1.4.9-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ed0fecd28cc62c54b262e3736f8bb2512d8dcfdc2bcf08be5f47f96bf405b145"}, + {file = "kiwisolver-1.4.9-cp312-cp312-win_amd64.whl", hash = "sha256:f68208a520c3d86ea51acf688a3e3002615a7f0238002cccc17affecc86a8a54"}, + {file = "kiwisolver-1.4.9-cp312-cp312-win_arm64.whl", hash = 
"sha256:2c1a4f57df73965f3f14df20b80ee29e6a7930a57d2d9e8491a25f676e197c60"}, + {file = "kiwisolver-1.4.9-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:a5d0432ccf1c7ab14f9949eec60c5d1f924f17c037e9f8b33352fa05799359b8"}, + {file = "kiwisolver-1.4.9-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:efb3a45b35622bb6c16dbfab491a8f5a391fe0e9d45ef32f4df85658232ca0e2"}, + {file = "kiwisolver-1.4.9-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1a12cf6398e8a0a001a059747a1cbf24705e18fe413bc22de7b3d15c67cffe3f"}, + {file = "kiwisolver-1.4.9-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:b67e6efbf68e077dd71d1a6b37e43e1a99d0bff1a3d51867d45ee8908b931098"}, + {file = "kiwisolver-1.4.9-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5656aa670507437af0207645273ccdfee4f14bacd7f7c67a4306d0dcaeaf6eed"}, + {file = "kiwisolver-1.4.9-cp313-cp313-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:bfc08add558155345129c7803b3671cf195e6a56e7a12f3dde7c57d9b417f525"}, + {file = "kiwisolver-1.4.9-cp313-cp313-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:40092754720b174e6ccf9e845d0d8c7d8e12c3d71e7fc35f55f3813e96376f78"}, + {file = "kiwisolver-1.4.9-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:497d05f29a1300d14e02e6441cf0f5ee81c1ff5a304b0d9fb77423974684e08b"}, + {file = "kiwisolver-1.4.9-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:bdd1a81a1860476eb41ac4bc1e07b3f07259e6d55bbf739b79c8aaedcf512799"}, + {file = "kiwisolver-1.4.9-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:e6b93f13371d341afee3be9f7c5964e3fe61d5fa30f6a30eb49856935dfe4fc3"}, + {file = "kiwisolver-1.4.9-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:d75aa530ccfaa593da12834b86a0724f58bff12706659baa9227c2ccaa06264c"}, + {file = "kiwisolver-1.4.9-cp313-cp313-win_amd64.whl", hash = "sha256:dd0a578400839256df88c16abddf9ba14813ec5f21362e1fe65022e00c883d4d"}, + {file = "kiwisolver-1.4.9-cp313-cp313-win_arm64.whl", hash = "sha256:d4188e73af84ca82468f09cadc5ac4db578109e52acb4518d8154698d3a87ca2"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:5a0f2724dfd4e3b3ac5a82436a8e6fd16baa7d507117e4279b660fe8ca38a3a1"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:1b11d6a633e4ed84fc0ddafd4ebfd8ea49b3f25082c04ad12b8315c11d504dc1"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:61874cdb0a36016354853593cffc38e56fc9ca5aa97d2c05d3dcf6922cd55a11"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:60c439763a969a6af93b4881db0eed8fadf93ee98e18cbc35bc8da868d0c4f0c"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:92a2f997387a1b79a75e7803aa7ded2cfbe2823852ccf1ba3bcf613b62ae3197"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a31d512c812daea6d8b3be3b2bfcbeb091dbb09177706569bcfc6240dcf8b41c"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:52a15b0f35dad39862d376df10c5230155243a2c1a436e39eb55623ccbd68185"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:a30fd6fdef1430fd9e1ba7b3398b5ee4e2887783917a687d86ba69985fb08748"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:cc9617b46837c6468197b5945e196ee9ca43057bb7d9d1ae688101e4e1dddf64"}, + 
{file = "kiwisolver-1.4.9-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:0ab74e19f6a2b027ea4f845a78827969af45ce790e6cb3e1ebab71bdf9f215ff"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:dba5ee5d3981160c28d5490f0d1b7ed730c22470ff7f6cc26cfcfaacb9896a07"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-win_arm64.whl", hash = "sha256:0749fd8f4218ad2e851e11cc4dc05c7cbc0cbc4267bdfdb31782e65aace4ee9c"}, + {file = "kiwisolver-1.4.9-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:9928fe1eb816d11ae170885a74d074f57af3a0d65777ca47e9aeb854a1fba386"}, + {file = "kiwisolver-1.4.9-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:d0005b053977e7b43388ddec89fa567f43d4f6d5c2c0affe57de5ebf290dc552"}, + {file = "kiwisolver-1.4.9-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:2635d352d67458b66fd0667c14cb1d4145e9560d503219034a18a87e971ce4f3"}, + {file = "kiwisolver-1.4.9-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:767c23ad1c58c9e827b649a9ab7809fd5fd9db266a9cf02b0e926ddc2c680d58"}, + {file = "kiwisolver-1.4.9-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:72d0eb9fba308b8311685c2268cf7d0a0639a6cd027d8128659f72bdd8a024b4"}, + {file = "kiwisolver-1.4.9-cp314-cp314-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:f68e4f3eeca8fb22cc3d731f9715a13b652795ef657a13df1ad0c7dc0e9731df"}, + {file = "kiwisolver-1.4.9-cp314-cp314-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d84cd4061ae292d8ac367b2c3fa3aad11cb8625a95d135fe93f286f914f3f5a6"}, + {file = "kiwisolver-1.4.9-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:a60ea74330b91bd22a29638940d115df9dc00af5035a9a2a6ad9399ffb4ceca5"}, + {file = "kiwisolver-1.4.9-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:ce6a3a4e106cf35c2d9c4fa17c05ce0b180db622736845d4315519397a77beaf"}, + {file = "kiwisolver-1.4.9-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:77937e5e2a38a7b48eef0585114fe7930346993a88060d0bf886086d2aa49ef5"}, + {file = "kiwisolver-1.4.9-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:24c175051354f4a28c5d6a31c93906dc653e2bf234e8a4bbfb964892078898ce"}, + {file = "kiwisolver-1.4.9-cp314-cp314-win_amd64.whl", hash = "sha256:0763515d4df10edf6d06a3c19734e2566368980d21ebec439f33f9eb936c07b7"}, + {file = "kiwisolver-1.4.9-cp314-cp314-win_arm64.whl", hash = "sha256:0e4e2bf29574a6a7b7f6cb5fa69293b9f96c928949ac4a53ba3f525dffb87f9c"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:d976bbb382b202f71c67f77b0ac11244021cfa3f7dfd9e562eefcea2df711548"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:2489e4e5d7ef9a1c300a5e0196e43d9c739f066ef23270607d45aba368b91f2d"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:e2ea9f7ab7fbf18fffb1b5434ce7c69a07582f7acc7717720f1d69f3e806f90c"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:b34e51affded8faee0dfdb705416153819d8ea9250bbbf7ea1b249bdeb5f1122"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d8aacd3d4b33b772542b2e01beb50187536967b514b00003bdda7589722d2a64"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:7cf974dd4e35fa315563ac99d6287a1024e4dc2077b8a7d7cd3d2fb65d283134"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = 
"sha256:85bd218b5ecfbee8c8a82e121802dcb519a86044c9c3b2e4aef02fa05c6da370"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:0856e241c2d3df4efef7c04a1e46b1936b6120c9bcf36dd216e3acd84bc4fb21"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:9af39d6551f97d31a4deebeac6f45b156f9755ddc59c07b402c148f5dbb6482a"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:bb4ae2b57fc1d8cbd1cf7b1d9913803681ffa903e7488012be5b76dedf49297f"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:aedff62918805fb62d43a4aa2ecd4482c380dc76cd31bd7c8878588a61bd0369"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-win_amd64.whl", hash = "sha256:1fa333e8b2ce4d9660f2cda9c0e1b6bafcfb2457a9d259faa82289e73ec24891"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-win_arm64.whl", hash = "sha256:4a48a2ce79d65d363597ef7b567ce3d14d68783d2b2263d98db3d9477805ba32"}, + {file = "kiwisolver-1.4.9-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:4d1d9e582ad4d63062d34077a9a1e9f3c34088a2ec5135b1f7190c07cf366527"}, + {file = "kiwisolver-1.4.9-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:deed0c7258ceb4c44ad5ec7d9918f9f14fd05b2be86378d86cf50e63d1e7b771"}, + {file = "kiwisolver-1.4.9-pp310-pypy310_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0a590506f303f512dff6b7f75fd2fd18e16943efee932008fe7140e5fa91d80e"}, + {file = "kiwisolver-1.4.9-pp310-pypy310_pp73-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e09c2279a4d01f099f52d5c4b3d9e208e91edcbd1a175c9662a8b16e000fece9"}, + {file = "kiwisolver-1.4.9-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:c9e7cdf45d594ee04d5be1b24dd9d49f3d1590959b2271fb30b5ca2b262c00fb"}, + {file = "kiwisolver-1.4.9-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:720e05574713db64c356e86732c0f3c5252818d05f9df320f0ad8380641acea5"}, + {file = "kiwisolver-1.4.9-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:17680d737d5335b552994a2008fab4c851bcd7de33094a82067ef3a576ff02fa"}, + {file = "kiwisolver-1.4.9-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:85b5352f94e490c028926ea567fc569c52ec79ce131dadb968d3853e809518c2"}, + {file = "kiwisolver-1.4.9-pp311-pypy311_pp73-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:464415881e4801295659462c49461a24fb107c140de781d55518c4b80cb6790f"}, + {file = "kiwisolver-1.4.9-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:fb940820c63a9590d31d88b815e7a3aa5915cad3ce735ab45f0c730b39547de1"}, + {file = "kiwisolver-1.4.9.tar.gz", hash = "sha256:c3b22c26c6fd6811b0ae8363b95ca8ce4ea3c202d3d0975b2914310ceb1bcc4d"}, +] + [[package]] name = "lark" version = "1.3.1" @@ -2017,6 +2290,84 @@ files = [ {file = "markupsafe-3.0.3.tar.gz", hash = "sha256:722695808f4b6457b320fdc131280796bdceb04ab50fe1795cd540799ebe1698"}, ] +[[package]] +name = "matplotlib" +version = "3.10.8" +description = "Python plotting package" +optional = false +python-versions = ">=3.10" +files = [ + {file = "matplotlib-3.10.8-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:00270d217d6b20d14b584c521f810d60c5c78406dc289859776550df837dcda7"}, + {file = "matplotlib-3.10.8-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:37b3c1cc42aa184b3f738cfa18c1c1d72fd496d85467a6cf7b807936d39aa656"}, + {file = "matplotlib-3.10.8-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ee40c27c795bda6a5292e9cff9890189d32f7e3a0bf04e0e3c9430c4a00c37df"}, + {file = 
"matplotlib-3.10.8-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a48f2b74020919552ea25d222d5cc6af9ca3f4eb43a93e14d068457f545c2a17"}, + {file = "matplotlib-3.10.8-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:f254d118d14a7f99d616271d6c3c27922c092dac11112670b157798b89bf4933"}, + {file = "matplotlib-3.10.8-cp310-cp310-win_amd64.whl", hash = "sha256:f9b587c9c7274c1613a30afabf65a272114cd6cdbe67b3406f818c79d7ab2e2a"}, + {file = "matplotlib-3.10.8-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:6be43b667360fef5c754dda5d25a32e6307a03c204f3c0fc5468b78fa87b4160"}, + {file = "matplotlib-3.10.8-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a2b336e2d91a3d7006864e0990c83b216fcdca64b5a6484912902cef87313d78"}, + {file = "matplotlib-3.10.8-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:efb30e3baaea72ce5928e32bab719ab4770099079d66726a62b11b1ef7273be4"}, + {file = "matplotlib-3.10.8-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d56a1efd5bfd61486c8bc968fa18734464556f0fb8e51690f4ac25d85cbbbbc2"}, + {file = "matplotlib-3.10.8-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:238b7ce5717600615c895050239ec955d91f321c209dd110db988500558e70d6"}, + {file = "matplotlib-3.10.8-cp311-cp311-win_amd64.whl", hash = "sha256:18821ace09c763ec93aef5eeff087ee493a24051936d7b9ebcad9662f66501f9"}, + {file = "matplotlib-3.10.8-cp311-cp311-win_arm64.whl", hash = "sha256:bab485bcf8b1c7d2060b4fcb6fc368a9e6f4cd754c9c2fea281f4be21df394a2"}, + {file = "matplotlib-3.10.8-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:64fcc24778ca0404ce0cb7b6b77ae1f4c7231cdd60e6778f999ee05cbd581b9a"}, + {file = "matplotlib-3.10.8-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b9a5ca4ac220a0cdd1ba6bcba3608547117d30468fefce49bb26f55c1a3d5c58"}, + {file = "matplotlib-3.10.8-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3ab4aabc72de4ff77b3ec33a6d78a68227bf1123465887f9905ba79184a1cc04"}, + {file = "matplotlib-3.10.8-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:24d50994d8c5816ddc35411e50a86ab05f575e2530c02752e02538122613371f"}, + {file = "matplotlib-3.10.8-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:99eefd13c0dc3b3c1b4d561c1169e65fe47aab7b8158754d7c084088e2329466"}, + {file = "matplotlib-3.10.8-cp312-cp312-win_amd64.whl", hash = "sha256:dd80ecb295460a5d9d260df63c43f4afbdd832d725a531f008dad1664f458adf"}, + {file = "matplotlib-3.10.8-cp312-cp312-win_arm64.whl", hash = "sha256:3c624e43ed56313651bc18a47f838b60d7b8032ed348911c54906b130b20071b"}, + {file = "matplotlib-3.10.8-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:3f2e409836d7f5ac2f1c013110a4d50b9f7edc26328c108915f9075d7d7a91b6"}, + {file = "matplotlib-3.10.8-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:56271f3dac49a88d7fca5060f004d9d22b865f743a12a23b1e937a0be4818ee1"}, + {file = "matplotlib-3.10.8-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a0a7f52498f72f13d4a25ea70f35f4cb60642b466cbb0a9be951b5bc3f45a486"}, + {file = "matplotlib-3.10.8-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:646d95230efb9ca614a7a594d4fcacde0ac61d25e37dd51710b36477594963ce"}, + {file = "matplotlib-3.10.8-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f89c151aab2e2e23cb3fe0acad1e8b82841fd265379c4cecd0f3fcb34c15e0f6"}, + {file = "matplotlib-3.10.8-cp313-cp313-win_amd64.whl", hash = "sha256:e8ea3e2d4066083e264e75c829078f9e149fa119d27e19acd503de65e0b13149"}, + 
{file = "matplotlib-3.10.8-cp313-cp313-win_arm64.whl", hash = "sha256:c108a1d6fa78a50646029cb6d49808ff0fc1330fda87fa6f6250c6b5369b6645"}, + {file = "matplotlib-3.10.8-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:ad3d9833a64cf48cc4300f2b406c3d0f4f4724a91c0bd5640678a6ba7c102077"}, + {file = "matplotlib-3.10.8-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:eb3823f11823deade26ce3b9f40dcb4a213da7a670013929f31d5f5ed1055b22"}, + {file = "matplotlib-3.10.8-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d9050fee89a89ed57b4fb2c1bfac9a3d0c57a0d55aed95949eedbc42070fea39"}, + {file = "matplotlib-3.10.8-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b44d07310e404ba95f8c25aa5536f154c0a8ec473303535949e52eb71d0a1565"}, + {file = "matplotlib-3.10.8-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:0a33deb84c15ede243aead39f77e990469fff93ad1521163305095b77b72ce4a"}, + {file = "matplotlib-3.10.8-cp313-cp313t-win_amd64.whl", hash = "sha256:3a48a78d2786784cc2413e57397981fb45c79e968d99656706018d6e62e57958"}, + {file = "matplotlib-3.10.8-cp313-cp313t-win_arm64.whl", hash = "sha256:15d30132718972c2c074cd14638c7f4592bd98719e2308bccea40e0538bc0cb5"}, + {file = "matplotlib-3.10.8-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:b53285e65d4fa4c86399979e956235deb900be5baa7fc1218ea67fbfaeaadd6f"}, + {file = "matplotlib-3.10.8-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:32f8dce744be5569bebe789e46727946041199030db8aeb2954d26013a0eb26b"}, + {file = "matplotlib-3.10.8-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4cf267add95b1c88300d96ca837833d4112756045364f5c734a2276038dae27d"}, + {file = "matplotlib-3.10.8-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2cf5bd12cecf46908f286d7838b2abc6c91cda506c0445b8223a7c19a00df008"}, + {file = "matplotlib-3.10.8-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:41703cc95688f2516b480f7f339d8851a6035f18e100ee6a32bc0b8536a12a9c"}, + {file = "matplotlib-3.10.8-cp314-cp314-win_amd64.whl", hash = "sha256:83d282364ea9f3e52363da262ce32a09dfe241e4080dcedda3c0db059d3c1f11"}, + {file = "matplotlib-3.10.8-cp314-cp314-win_arm64.whl", hash = "sha256:2c1998e92cd5999e295a731bcb2911c75f597d937341f3030cc24ef2733d78a8"}, + {file = "matplotlib-3.10.8-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:b5a2b97dbdc7d4f353ebf343744f1d1f1cca8aa8bfddb4262fcf4306c3761d50"}, + {file = "matplotlib-3.10.8-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:3f5c3e4da343bba819f0234186b9004faba952cc420fbc522dc4e103c1985908"}, + {file = "matplotlib-3.10.8-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5f62550b9a30afde8c1c3ae450e5eb547d579dd69b25c2fc7a1c67f934c1717a"}, + {file = "matplotlib-3.10.8-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:495672de149445ec1b772ff2c9ede9b769e3cb4f0d0aa7fa730d7f59e2d4e1c1"}, + {file = "matplotlib-3.10.8-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:595ba4d8fe983b88f0eec8c26a241e16d6376fe1979086232f481f8f3f67494c"}, + {file = "matplotlib-3.10.8-cp314-cp314t-win_amd64.whl", hash = "sha256:25d380fe8b1dc32cf8f0b1b448470a77afb195438bafdf1d858bfb876f3edf7b"}, + {file = "matplotlib-3.10.8-cp314-cp314t-win_arm64.whl", hash = "sha256:113bb52413ea508ce954a02c10ffd0d565f9c3bc7f2eddc27dfe1731e71c7b5f"}, + {file = "matplotlib-3.10.8-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:f97aeb209c3d2511443f8797e3e5a569aebb040d4f8bc79aa3ee78a8fb9e3dd8"}, + {file = 
"matplotlib-3.10.8-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:fb061f596dad3a0f52b60dc6a5dec4a0c300dec41e058a7efe09256188d170b7"}, + {file = "matplotlib-3.10.8-pp310-pypy310_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:12d90df9183093fcd479f4172ac26b322b1248b15729cb57f42f71f24c7e37a3"}, + {file = "matplotlib-3.10.8-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:6da7c2ce169267d0d066adcf63758f0604aa6c3eebf67458930f9d9b79ad1db1"}, + {file = "matplotlib-3.10.8-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:9153c3292705be9f9c64498a8872118540c3f4123d1a1c840172edf262c8be4a"}, + {file = "matplotlib-3.10.8-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1ae029229a57cd1e8fe542485f27e7ca7b23aa9e8944ddb4985d0bc444f1eca2"}, + {file = "matplotlib-3.10.8.tar.gz", hash = "sha256:2299372c19d56bcd35cf05a2738308758d32b9eaed2371898d8f5bd33f084aa3"}, +] + +[package.dependencies] +contourpy = ">=1.0.1" +cycler = ">=0.10" +fonttools = ">=4.22.0" +kiwisolver = ">=1.3.1" +numpy = ">=1.23" +packaging = ">=20.0" +pillow = ">=8" +pyparsing = ">=3" +python-dateutil = ">=2.7" + +[package.extras] +dev = ["meson-python (>=0.13.1,<0.17.0)", "pybind11 (>=2.13.2,!=2.13.3)", "setuptools (>=64)", "setuptools_scm (>=7)"] + [[package]] name = "mdurl" version = "0.1.2" @@ -2551,6 +2902,75 @@ files = [ {file = "ninja-1.13.0.tar.gz", hash = "sha256:4a40ce995ded54d9dc24f8ea37ff3bf62ad192b547f6c7126e7e25045e76f978"}, ] +[[package]] +name = "numexpr" +version = "2.13.1" +description = "Fast numerical expression evaluator for NumPy" +optional = false +python-versions = ">=3.10" +files = [ + {file = "numexpr-2.13.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:bdbc2b93ac59667f0ba725b24cd3b5559c300e91e179d09c74ebaf8c8961eef6"}, + {file = "numexpr-2.13.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ad6b5dfc191c766e3ec89d2e3f956f7ef3181a1f8bf2bb00ec48fb3bf97b44ac"}, + {file = "numexpr-2.13.1-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a12dbd4c07a8303c6f01cdade531d75c9b4f5b8f72cbe5821d8f9197ee6fba47"}, + {file = "numexpr-2.13.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2de5c8ca2f25690d48e475d53a3524876164227cf4044743818f5704c28a8639"}, + {file = "numexpr-2.13.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:533ec2d77fc059e3868e9798ef2f13ab57161517cd2e0c521bb33d1dc99068ca"}, + {file = "numexpr-2.13.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:a75ddffc36f6b7a679fbc7df492685aed7e8888aec80ec2cd8e30f21fc019caa"}, + {file = "numexpr-2.13.1-cp310-cp310-win32.whl", hash = "sha256:790af35095626ad2d02201c56ac2d49ae45fc95a02af85f40808752ed32ee103"}, + {file = "numexpr-2.13.1-cp310-cp310-win_amd64.whl", hash = "sha256:aadf3118b6ef87294277ffb77a9562970228341aaaa4b78de634a43ea8ea2c6e"}, + {file = "numexpr-2.13.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:bdf62745e072c670151c0705bddfe3f33c341dacb7eb255ddb1e8d2a257bfef5"}, + {file = "numexpr-2.13.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:91cf0521d8fed3f804640c4a6d22b5d9813d7e64b32c38215de163c7f092f7cc"}, + {file = "numexpr-2.13.1-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:58e2f111756fff63e27e495473d950e4c98bbebca55aa1572798b59110d6c84b"}, + {file = "numexpr-2.13.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5a5a37b74561ed8dbd5f9be182d94419fa53f452e2d7d3e8d6dbef35a20f19f7"}, + {file = 
"numexpr-2.13.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:78cb76676e63f02dcf507e3c563888018a68b6a2e2cd444628e09df270dfd0b2"}, + {file = "numexpr-2.13.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d29b3351de4c43b56d2ef7f138ab7a8988e797291bcbbd56d545e4e7902f254a"}, + {file = "numexpr-2.13.1-cp311-cp311-win32.whl", hash = "sha256:912488ddbd500937bb6f4dfc010bdb3bf757a76e0b93db2f2c56db49ef6b9351"}, + {file = "numexpr-2.13.1-cp311-cp311-win_amd64.whl", hash = "sha256:66d0292f3b9dc5faadb4dd8a89d733321ff01c9699aee0c3cdbf513c9505e39c"}, + {file = "numexpr-2.13.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:6aa48c2f2bfa142dfe260441486452be8f70b5551c17bc846fccf76123d4a226"}, + {file = "numexpr-2.13.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:67a3dd8b51e94251f535a9a404f1ac939a3ebeb9398caad20ae9d0de37c6d3b3"}, + {file = "numexpr-2.13.1-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ca152998d44ea30b45ad6b8a050ac4a9408b61a17508df87ad0d919335d79b44"}, + {file = "numexpr-2.13.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b4280c8f7cc024846be8fdd6582572bb0b6bad98fb2a68a367ef5e6e2e130d5f"}, + {file = "numexpr-2.13.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:b86e1daa4e27d6bf6304008ed4630a055babf863db2ec8f282b4058bbfe466bd"}, + {file = "numexpr-2.13.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:30d189fc52ee4a33b869a0592553cd2ed686c20cded21b2ddf347a4d143f1bea"}, + {file = "numexpr-2.13.1-cp312-cp312-win32.whl", hash = "sha256:e926b59d385de2396935b362143ac2c282176875cf8ee7baba0a150b58421b5c"}, + {file = "numexpr-2.13.1-cp312-cp312-win_amd64.whl", hash = "sha256:8230a8f7cd4e6ba4022643c85e119aa4ca90412267ef20acdf1f54fb3136680d"}, + {file = "numexpr-2.13.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0e4314ee477a2cfb9ecf4b15f2ef24bf7859f62b35de3caef297136ff25bb0b0"}, + {file = "numexpr-2.13.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:d82d088f67647861b61a7b0e0148fd7487000a20909d65734821dd27e0839a68"}, + {file = "numexpr-2.13.1-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c615b13976e6332336a052d5b03be1fed231bc1afe07699f4c7cc116c7c3092c"}, + {file = "numexpr-2.13.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4874124bccc3c2462558ad2a75029bcc2d1c63ee4914b263bb06339e757efb85"}, + {file = "numexpr-2.13.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:0fc7b5b0f8d7ba6c81e948b1d967a56097194c894e4f57852ed8639fc653def2"}, + {file = "numexpr-2.13.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e22104ab53f0933b5b522829149990cb74e0a8ec4b69ff0e6545eb4641b3f013"}, + {file = "numexpr-2.13.1-cp313-cp313-win32.whl", hash = "sha256:824aea72663ec123e042341cea4a2a2b3c71f315e4bc58ee5035ffc7f945bd29"}, + {file = "numexpr-2.13.1-cp313-cp313-win_amd64.whl", hash = "sha256:9c7b1c3e9f398a5b062d9740c48ca454238bf1be433f0f75fe68619527bb7f1a"}, + {file = "numexpr-2.13.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:366a7887c2bad86e6f64666e178886f606cf8e81a6871df450d19f0f83421501"}, + {file = "numexpr-2.13.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:33ff9f071d06aaa0276cb5e2369efd517fe155ea091e43790f1f8bfd85e64d29"}, + {file = "numexpr-2.13.1-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c29a204b1d35941c088ec39a79c2e83e382729e4066b4b1f882aa5f70bf929a8"}, + {file = "numexpr-2.13.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:40e02db74d66c5b0a81c925838f42ec2d58cc99b49cbaf682f06ac03d9ff4102"}, + {file = "numexpr-2.13.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:36bd9a2b9bda42506377c7510c61f76e08d50da77ffb86a7a15cc5d57c56bb0f"}, + {file = "numexpr-2.13.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:b9203651668a3994cf3fe52e079ff6be1c74bf775622edbc226e94f3d8ec8ec4"}, + {file = "numexpr-2.13.1-cp313-cp313t-win32.whl", hash = "sha256:b73774176b15fe88242e7ed174b5be5f2e3e830d2cd663234b1495628a30854c"}, + {file = "numexpr-2.13.1-cp313-cp313t-win_amd64.whl", hash = "sha256:b9e6228db24b7faa96fbb2beee55f90fc8b0fe167cf288f8481c53ff5e95865a"}, + {file = "numexpr-2.13.1-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:cbadcbd2cf0822d595ccf5345c69478e9fe42d556b9823e6b0636a3efdf990f0"}, + {file = "numexpr-2.13.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:a189d514e8aa321ef1c650a2873000c08f843b3e3e66d69072005996ac25809c"}, + {file = "numexpr-2.13.1-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b6b01e9301bed8f89f6d561d79dcaa8731a75cc50efc072526cfbc07df74226c"}, + {file = "numexpr-2.13.1-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d7749e8c0ff0bae41a534e56fab667e529f528645a0216bb64260773ae8cb697"}, + {file = "numexpr-2.13.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:0b0f326542185c23fca53e10fee3c39bdadc8d69a03c613938afaf3eea31e77f"}, + {file = "numexpr-2.13.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:33cc6d662a606cc5184c7faef1d7b176474a8c46b8b0d2df9ff0fa67ed56425f"}, + {file = "numexpr-2.13.1-cp314-cp314-win32.whl", hash = "sha256:71f442fd01ebfa77fce1bac37f671aed3c0d47a55e460beac54b89e767fbc0fa"}, + {file = "numexpr-2.13.1-cp314-cp314-win_amd64.whl", hash = "sha256:208cd9422d87333e24deb2fe492941cd13b65dc8b9ce665de045a0be89e9a254"}, + {file = "numexpr-2.13.1-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:37d31824b9c021078046bb2aa36aa1da23edaa7a6a8636ee998bf89a2f104722"}, + {file = "numexpr-2.13.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:15cee07c74e4792993cd2ecd46c5683815e8758ac56e1d4d236d2c9eb9e8ae01"}, + {file = "numexpr-2.13.1-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:65cb46136f068ede2fc415c5f3d722f2c7dde3eda04ceafcfbcac03933f5d997"}, + {file = "numexpr-2.13.1-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:abc3c1601380c90659b9ac0241357c5788ab58de148f56c5f98adffe293c308c"}, + {file = "numexpr-2.13.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:2836e900377ce27e99c043a35e008bc911c51781cea47623612a4e498dfa9592"}, + {file = "numexpr-2.13.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:f4e4c5b38bb5695fff119672c3462d9a36875256947bafb2df4117b3271fd6a3"}, + {file = "numexpr-2.13.1-cp314-cp314t-win32.whl", hash = "sha256:156591eb23684542fd53ca1cbefff872c47c429a200655ef7e59dd8c03eeeaef"}, + {file = "numexpr-2.13.1-cp314-cp314t-win_amd64.whl", hash = "sha256:a2cc21b2d2e59db63006f190dbf20f5485dd846770870504ff2a72c8d0406e4e"}, + {file = "numexpr-2.13.1.tar.gz", hash = "sha256:ecb722249c2d6ed7fefe8504bb17e056481a5f31233c23a7ee02085c3d661fa1"}, +] + +[package.dependencies] +numpy = ">=1.23.0" + [[package]] name = "numpy" version = "1.26.4" @@ -4060,6 +4480,20 @@ nvidia-ml-py = ">=12.0.0" [package.extras] test = ["pytest (>=3.6)", "pytest-cov", "pytest-runner"] +[[package]] +name = "pyparsing" +version = "3.2.5" +description = "pyparsing - Classes and methods to define and execute parsing grammars" +optional = 
false +python-versions = ">=3.9" +files = [ + {file = "pyparsing-3.2.5-py3-none-any.whl", hash = "sha256:e38a4f02064cf41fe6593d328d0512495ad1f3d8a91c4f73fc401b3079a59a5e"}, + {file = "pyparsing-3.2.5.tar.gz", hash = "sha256:2df8d5b7b2802ef88e8d016a2eb9c7aeaa923529cd251ed0fe4608275d4105b6"}, +] + +[package.extras] +diagrams = ["jinja2", "railroad-diagrams"] + [[package]] name = "pyproject-hooks" version = "1.2.0" @@ -5210,19 +5644,56 @@ opt-einsum = ["opt-einsum (>=3.3)"] optree = ["optree (>=0.13.0)"] pyyaml = ["pyyaml"] +[[package]] +name = "torch-c-dlpack-ext" +version = "0.1.3" +description = "torch c dlpack ext" +optional = false +python-versions = ">=3.9" +files = [ + {file = "torch_c_dlpack_ext-0.1.3-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:49f8a1eaea21443c338df7bcf93f9026274b910ab23850777a88db040608c0a1"}, + {file = "torch_c_dlpack_ext-0.1.3-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6e2cb08aa7591a08b4992fc99b10e86b46a65d9a46c34d9697e8fab03bfcaf46"}, + {file = "torch_c_dlpack_ext-0.1.3-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c9ae36f7d4ccd4a9806528fa8dc8f0e3cfc47530adff8c7b6a72762bc97643b0"}, + {file = "torch_c_dlpack_ext-0.1.3-cp310-cp310-win_amd64.whl", hash = "sha256:f92a0582cfa28418924f94bd6b89f662555d73dcc7ca0de1cad78a4f04ebca26"}, + {file = "torch_c_dlpack_ext-0.1.3-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:770fd7618973f70bfea288d5c419bdf974fc578e84248341524bb1ed20b969fd"}, + {file = "torch_c_dlpack_ext-0.1.3-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:71de2233ff974f09379e84699af88e83aeb63dd885627123f745780ff592d15c"}, + {file = "torch_c_dlpack_ext-0.1.3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:78b963243b5b0e7d463fab365f31ec1569223845942f6591ab2ac067ad0f0338"}, + {file = "torch_c_dlpack_ext-0.1.3-cp311-cp311-win_amd64.whl", hash = "sha256:b0244f282e0e74f2cefa843caeb601f5acfd88342029b0ca901dd40ab883818b"}, + {file = "torch_c_dlpack_ext-0.1.3-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:2b7d64453fa62c75551f2413cde55748a3461af475da386b2e709239555e07c3"}, + {file = "torch_c_dlpack_ext-0.1.3-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:cd69fb034cd638eb0908767d74e5d0ea87df18d366b18d66c2c3472b29c80e5e"}, + {file = "torch_c_dlpack_ext-0.1.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8ebf732b5079912e0b85f32a75bae6932f021fbc13c2dff1c9f7cea437b71345"}, + {file = "torch_c_dlpack_ext-0.1.3-cp312-cp312-win_amd64.whl", hash = "sha256:69685ac393f1f402c289ac04435120d518bde890388474fe2f8a58e7d290eb50"}, + {file = "torch_c_dlpack_ext-0.1.3-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:5f87b18064c017edb240b1766e858d18fe9472c11180a2811216293376ba6ef0"}, + {file = "torch_c_dlpack_ext-0.1.3-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2afc7165195f4a256aab16147040d63a0cc55b7c530946d9726125268a54303a"}, + {file = "torch_c_dlpack_ext-0.1.3-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:96743df478df006b21ae18111f4a2528abcc46131389b8d99176c37c30559474"}, + {file = "torch_c_dlpack_ext-0.1.3-cp313-cp313-win_amd64.whl", hash = "sha256:74f491fe1ec64ff631a4844ef87339a1e825d375d87bad79ec8e9b922292a043"}, + {file = "torch_c_dlpack_ext-0.1.3-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:61d17b3be0c43c846e8ff4c54e5f05a35daeb8453fb14cec05742fcce41bada7"}, + {file = 
"torch_c_dlpack_ext-0.1.3-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:aa8bf3a52fc13306866282e204ee6979a0cabaf64c8ef8d6ee700d4c4b2519a1"}, + {file = "torch_c_dlpack_ext-0.1.3-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5fa48bb2e613c3a1fec135edbde1c7923a20b7dc3a5a3f2d17be7e0a7d333b18"}, + {file = "torch_c_dlpack_ext-0.1.3-cp314-cp314-win_amd64.whl", hash = "sha256:d7344b830359c4ef3165c10a82de96daf711a38c21b18b82c30d9d8dcd3e4529"}, + {file = "torch_c_dlpack_ext-0.1.3-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:b81bfa08d3dc791f808610e1abf0603c745b8c82681009a089b3dae650b6ff61"}, + {file = "torch_c_dlpack_ext-0.1.3-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d079d66404ec3911c02d4fd4cd41f42ef56f1ebdd5ecd68bcc2f425cbd12d08e"}, + {file = "torch_c_dlpack_ext-0.1.3-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d03bf108eab58b2c6dbe7e94211f670422c961b1e1e32fbaec442d5359ac02bf"}, + {file = "torch_c_dlpack_ext-0.1.3-cp39-cp39-win_amd64.whl", hash = "sha256:5ee661e6b910e67200ba7c911436a5af8be288f938883971a0cf5632645183c8"}, + {file = "torch_c_dlpack_ext-0.1.3.tar.gz", hash = "sha256:4b5da66432af7224dcf02aad4f13cc416eeef5331cd153588b7e081a193f4972"}, +] + +[package.dependencies] +torch = "*" + [[package]] name = "torchao" -version = "0.14.1" +version = "0.15.0" description = "Package for applying ao techniques to GPU models" optional = false python-versions = "*" files = [ - {file = "torchao-0.14.1-cp310-abi3-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:50f68db5e41952e88daa383fc2f358541e617654f388f508d5c7580c3bee9447"}, - {file = "torchao-0.14.1-py3-none-any.whl", hash = "sha256:c9896e14531817bc2ca6847b3fe71c42592ab80a43628b36668b2d6d6713fb5b"}, + {file = "torchao-0.15.0-cp310-abi3-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1cbe813201314ba6329a650a76944502f3e8ec4b1b44523f3f48676810d8d1f6"}, + {file = "torchao-0.15.0-py3-none-any.whl", hash = "sha256:3f3812676048ef8a2a0e9d492d12d8971ba7a7ebb16f54aa56f690414e130d2c"}, ] [package.extras] -dev = ["bitsandbytes", "blobfile", "cmake (>=3.19.0,<4.0.0)", "diskcache", "expecttest", "fire", "hypothesis", "importlib_metadata", "lm_eval", "matplotlib", "ninja", "packaging", "pandas", "parameterized", "pre-commit", "pycocotools", "pytest", "ruff (==0.11.6)", "sentencepiece", "tabulate", "tiktoken", "tqdm", "transformers", "unittest-xml-reporting"] +dev = ["bitsandbytes", "blobfile", "cmake (>=3.19.0,<4.0.0)", "diskcache", "expecttest", "fire", "hypothesis", "importlib_metadata", "lm_eval", "matplotlib", "ninja", "packaging", "pandas", "parameterized", "pre-commit", "pycocotools", "pytest (==8.4.2)", "ruff (==0.11.6)", "sentencepiece", "tabulate", "tiktoken", "tqdm", "transformers", "unittest-xml-reporting"] [[package]] name = "torchprofile" @@ -5856,4 +6327,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.0" python-versions = ">=3.10,<3.13" -content-hash = "955bcecb84ae2d8555ba7c10772099be9a6451a8a00f61d5aa3b86d2666a4ef6" +content-hash = "d44d9d44355bac8ca580030e7e4eeb0a7cfdff7cf25045ffd8f38d077b27306c" diff --git a/security_scanning/pyproject.toml b/security_scanning/pyproject.toml index 4253fe2ac00..a9a9aa0e2a2 100644 --- a/security_scanning/pyproject.toml +++ b/security_scanning/pyproject.toml @@ -62,6 +62,7 @@ llguidance = "0.7.29" jsonschema = "^4.25.1" backoff = "^2.2.1" nvtx = "^0.2.14" +matplotlib = "^3.10.8" meson = "^1.10.0" ninja = "^1.13.0" etcd3 = {git = 
"https://github.com/kragniz/python-etcd3.git", rev = "e58a899579ba416449c4e225b61f039457c8072a"} @@ -73,7 +74,9 @@ blobfile = "^3.1.0" openai-harmony = "0.0.4" nvidia-cutlass-dsl = "4.3.1" plotly = "^6.5.0" +numexpr = "<2.14.0" partial-json-parser = "^0.2.1.1.post7" +torch-c-dlpack-ext = "0.1.3" mistral-common = "1.8.6" torchao = ">=0.14.1" diff --git a/security_scanning/triton_backend/poetry.lock b/security_scanning/triton_backend/poetry.lock index 159351cf115..b530fa57c39 100644 --- a/security_scanning/triton_backend/poetry.lock +++ b/security_scanning/triton_backend/poetry.lock @@ -842,17 +842,17 @@ testing = ["black (==22.3)", "datasets", "numpy", "pytest", "pytest-asyncio", "r [[package]] name = "torchao" -version = "0.14.1" +version = "0.15.0" description = "Package for applying ao techniques to GPU models" optional = false python-versions = "*" files = [ - {file = "torchao-0.14.1-cp310-abi3-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:50f68db5e41952e88daa383fc2f358541e617654f388f508d5c7580c3bee9447"}, - {file = "torchao-0.14.1-py3-none-any.whl", hash = "sha256:c9896e14531817bc2ca6847b3fe71c42592ab80a43628b36668b2d6d6713fb5b"}, + {file = "torchao-0.15.0-cp310-abi3-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1cbe813201314ba6329a650a76944502f3e8ec4b1b44523f3f48676810d8d1f6"}, + {file = "torchao-0.15.0-py3-none-any.whl", hash = "sha256:3f3812676048ef8a2a0e9d492d12d8971ba7a7ebb16f54aa56f690414e130d2c"}, ] [package.extras] -dev = ["bitsandbytes", "blobfile", "cmake (>=3.19.0,<4.0.0)", "diskcache", "expecttest", "fire", "hypothesis", "importlib_metadata", "lm_eval", "matplotlib", "ninja", "packaging", "pandas", "parameterized", "pre-commit", "pycocotools", "pytest", "ruff (==0.11.6)", "sentencepiece", "tabulate", "tiktoken", "tqdm", "transformers", "unittest-xml-reporting"] +dev = ["bitsandbytes", "blobfile", "cmake (>=3.19.0,<4.0.0)", "diskcache", "expecttest", "fire", "hypothesis", "importlib_metadata", "lm_eval", "matplotlib", "ninja", "packaging", "pandas", "parameterized", "pre-commit", "pycocotools", "pytest (==8.4.2)", "ruff (==0.11.6)", "sentencepiece", "tabulate", "tiktoken", "tqdm", "transformers", "unittest-xml-reporting"] [[package]] name = "tqdm" diff --git a/setup.py b/setup.py index 5c61029aad8..094ca01467e 100644 --- a/setup.py +++ b/setup.py @@ -114,9 +114,9 @@ def has_ext_modules(self): 'libs/libnvinfer_plugin_tensorrt_llm.so', 'libs/libtensorrt_llm_ucx_wrapper.so', 'libs/libdecoder_attention_0.so', 'libs/libtensorrt_llm_nixl_wrapper.so', 'libs/nixl/**/*', - 'libs/ucx/**/*', 'libs/libpg_utils.so', - 'libs/libdecoder_attention_1.so', 'libs/nvshmem/License.txt', - 'libs/nvshmem/nvshmem_bootstrap_uid.so.3', + 'libs/libtensorrt_llm_mooncake_wrapper.so', 'libs/ucx/**/*', + 'libs/libpg_utils.so', 'libs/libdecoder_attention_1.so', + 'libs/nvshmem/License.txt', 'libs/nvshmem/nvshmem_bootstrap_uid.so.3', 'libs/nvshmem/nvshmem_transport_ibgda.so.103', 'bindings.*.so', 'deep_ep/LICENSE', 'deep_ep/*.py', 'deep_ep_cpp_tllm.*.so', "include/**/*", 'deep_gemm/LICENSE', 'deep_gemm/include/**/*', diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/rms_norm.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/rms_norm.py index 4265217453d..7ce9b7befa8 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/rms_norm.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/rms_norm.py @@ -2,6 +2,8 @@ import flashinfer import torch +import torch.nn.functional as F +from einops import rearrange from ...flashinfer_utils import get_env_enable_pdl from 
...modules.mamba.layernorm_gated import _layer_norm_fwd @@ -159,3 +161,35 @@ def _triton_rmsnorm_gated_meta( assert gate.shape == x.shape, "gate must match x shape" return x.new_empty(x.shape, dtype=torch.float32) + + +# Forked from: +# https://github.com/state-spaces/mamba/blob/6b32be06d026e170b3fdaf3ae6282c5a6ff57b06/mamba_ssm/ops/triton/layernorm_gated.py +# NOTES: +# 1. At time of writing (09/25/2025), the nano nemotron v2 modeling code expects `mamba_ssm` +# to be installed so as to be able to make use of its grouped gated RMS norm operation. +# We therefore replace it with one that uses einops + pytorch. +def gated_rms_norm_ref( + x, weight, bias, z=None, eps=1e-6, group_size=None, norm_before_gate=True, upcast=True +): + dtype = x.dtype + # N = x.shape[-1] + weight = weight.float() + bias = bias.float() if bias is not None else None + if upcast: + x = x.float() + z = z.float() if z is not None else z + if z is not None and not norm_before_gate: + x = x * F.silu(z) + if group_size is None: + rstd = 1 / torch.sqrt((x.square()).mean(dim=-1, keepdim=True) + eps) + out = (x * rstd * weight) + bias if bias is not None else (x * rstd * weight) + else: + x_group = rearrange(x, "... (g d) -> ... g d", d=group_size) + rstd = 1 / torch.sqrt((x_group.square()).mean(dim=-1, keepdim=True) + eps) + out = rearrange(x_group * rstd, "... g d -> ... (g d)") * weight + if bias is not None: + out = out + bias + if z is not None and norm_before_gate: + out *= F.silu(z) + return out.to(dtype) diff --git a/tensorrt_llm/_torch/auto_deploy/models/__init__.py b/tensorrt_llm/_torch/auto_deploy/models/__init__.py index 6eae19f23c5..327d084bf0b 100644 --- a/tensorrt_llm/_torch/auto_deploy/models/__init__.py +++ b/tensorrt_llm/_torch/auto_deploy/models/__init__.py @@ -1,4 +1,2 @@ -# TODO: When getting rid of the nemotron H patches, import `modeling_nemotron_h` here to ensure the -# custom model implementation is registered. from . 
import custom, hf, nemotron_flash, patches from .factory import * diff --git a/tensorrt_llm/_torch/auto_deploy/models/custom/__init__.py b/tensorrt_llm/_torch/auto_deploy/models/custom/__init__.py index fef9fdb1660..e32f72f56f9 100644 --- a/tensorrt_llm/_torch/auto_deploy/models/custom/__init__.py +++ b/tensorrt_llm/_torch/auto_deploy/models/custom/__init__.py @@ -1 +1,8 @@ from .modeling_nemotron_flash import NemotronFlashForCausalLM, NemotronFlashPreTrainedTokenizerFast +from .modeling_nemotron_h import NemotronHForCausalLM + +__all__ = ( + "NemotronFlashForCausalLM", + "NemotronFlashPreTrainedTokenizerFast", + "NemotronHForCausalLM", +) diff --git a/tensorrt_llm/_torch/auto_deploy/models/modeling_nemotron_h.py b/tensorrt_llm/_torch/auto_deploy/models/custom/modeling_nemotron_h.py similarity index 83% rename from tensorrt_llm/_torch/auto_deploy/models/modeling_nemotron_h.py rename to tensorrt_llm/_torch/auto_deploy/models/custom/modeling_nemotron_h.py index 6a54617497e..3756c054f76 100644 --- a/tensorrt_llm/_torch/auto_deploy/models/modeling_nemotron_h.py +++ b/tensorrt_llm/_torch/auto_deploy/models/custom/modeling_nemotron_h.py @@ -25,17 +25,14 @@ import torch import torch.nn.functional as F import torch.utils.checkpoint -from einops import rearrange from torch import nn from transformers.activations import ACT2FN from transformers.generation import GenerationMixin from transformers.modeling_utils import PreTrainedModel from transformers.utils import ModelOutput -from tensorrt_llm._torch.auto_deploy.models.patches.nemotron_h import ( - _nemotron_h_moe_forward, - _nemotron_h_topk_router_forward, -) +from tensorrt_llm._torch.auto_deploy.custom_ops.rms_norm import gated_rms_norm_ref +from tensorrt_llm._torch.auto_deploy.models.hf import AutoModelForCausalLMFactory class MambaRMSNormGated(torch.nn.Module): @@ -46,7 +43,7 @@ def __init__(self, hidden_size, group_size, eps=1e-5): self.group_size = group_size def forward(self, hidden_states, gate=None): - return _rms_norm_ref( + return gated_rms_norm_ref( x=hidden_states, weight=self.weight, bias=None, @@ -57,38 +54,6 @@ def forward(self, hidden_states, gate=None): ) -# Forked from: -# https://github.com/state-spaces/mamba/blob/6b32be06d026e170b3fdaf3ae6282c5a6ff57b06/mamba_ssm/ops/triton/layernorm_gated.py -# NOTES: -# 1. At time of writing (09/25/2025), the nano nemotron v2 modeling code expects `mamba_ssm` -# to be installed so as to be able to make use of its grouped gated RMS norm operation. -# We therefore replace it with one that uses einops + pytorch. -def _rms_norm_ref( - x, weight, bias, z=None, eps=1e-6, group_size=None, norm_before_gate=True, upcast=True -): - dtype = x.dtype - # N = x.shape[-1] - weight = weight.float() - bias = bias.float() if bias is not None else None - if upcast: - x = x.float() - z = z.float() if z is not None else z - if z is not None and not norm_before_gate: - x = x * F.silu(z) - if group_size is None: - rstd = 1 / torch.sqrt((x.square()).mean(dim=-1, keepdim=True) + eps) - out = (x * rstd * weight) + bias if bias is not None else (x * rstd * weight) - else: - x_group = rearrange(x, "... (g d) -> ... g d", d=group_size) - rstd = 1 / torch.sqrt((x_group.square()).mean(dim=-1, keepdim=True) + eps) - out = rearrange(x_group * rstd, "... g d -> ... 
(g d)") * weight - if bias is not None: - out = out + bias - if z is not None and norm_before_gate: - out *= F.silu(z) - return out.to(dtype) - - class NemotronHMamba2Mixer(nn.Module): """ Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`. @@ -149,9 +114,9 @@ def __init__(self, config, layer_idx: int): self.A_log._no_weight_decay = True # Instead of recomputing `torch.exp(self.A_log.float())` on every forward pass, we will register a hook # that sets this appropriately when loading weights. - # NOTE: we explicitly do NOT make this a `nn.Parameter` so that it does not appear in the state dict of - # this module, or an equivalent graph module trace from it. - self._minus_A = -A.float() + # NOTE: we explicitly register this as a non-persistent buffer so that it does not appear in the state dict of + # this module, or an equivalent graph module trace from it, but still gets included in e.g. `to()` calls. + self.register_buffer("_minus_A", -A.float(), persistent=False) self.norm = MambaRMSNormGated( self.intermediate_size, eps=self.layer_norm_epsilon, @@ -317,8 +282,43 @@ def __init__(self, config, layer_idx: Optional[int] = None): layer_idx=layer_idx, ) - # TODO: inline code from `_nemotron_h_moe_forward` when removing patches. - forward = _nemotron_h_moe_forward + def forward(self, hidden_states: torch.Tensor): + residuals = hidden_states + orig_shape = hidden_states.shape + topk_indices, topk_weights = self.gate(hidden_states) + x_flat = hidden_states.view(-1, hidden_states.shape[-1]) + + # NOTE: So far we've seen that the dispatch order in eager code is the same as the node order in the exported + # graph. + # We dispatch shared expert first so that we can easily fork the execution of the routed experts + # (using the custom op below) to an auxiliary stream. + shared_out = self.shared_experts(residuals) + # Check if this is a latent MOE (has fc1_latent_proj and fc2_latent_proj) + has_latent_proj = hasattr(self, "fc1_latent_proj") and hasattr(self, "fc2_latent_proj") + + if has_latent_proj: + # Latent MOE: project to latent space before routing + x_flat = self.fc1_latent_proj(x_flat) + + # Route through experts (operates in latent space if latent MOE, full space otherwise) + out_flat = torch.ops.auto_deploy.torch_moe( + x_flat, + topk_indices, + topk_weights, + w1_weight=[e.up_proj.weight for e in self.experts], + w2_weight=[e.down_proj.weight for e in self.experts], + w3_weight=[], + act_fn="relu2", + mlp_style="mlp", + ) + + if has_latent_proj: + # Latent MOE: project back from latent space + out_flat = self.fc2_latent_proj(out_flat) + + routed_out = out_flat.view(*orig_shape) + out = shared_out + routed_out + return out class NemotronHTopkRouter(nn.Module): @@ -339,22 +339,33 @@ def __init__(self, config): "e_score_correction_bias", torch.zeros(self.n_routed_experts, dtype=torch.float32) ) - forward = _nemotron_h_topk_router_forward + def forward(self, hidden_states): + """ + Forward pass for NemotronHTopkRouter using the optimized noaux_tc_op kernel. 
+ This replaces the original forward method which used pure PyTorch operations + with optimized CUDA kernels: + """ + hidden_states = hidden_states.view(-1, self.config.hidden_size) + if self.weight.dtype == torch.float32: + router_logits = F.linear(hidden_states.type(torch.float32), self.weight) + else: + router_logits = torch.ops.trtllm.dsv3_router_gemm_op( + hidden_states, self.weight.t(), bias=None, out_dtype=torch.float32 + ) -# Copied from transformers.models.llama.modeling_llama.repeat_kv -def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """ - This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, - num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand( - batch, num_key_value_heads, n_rep, slen, head_dim - ) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + # Use the fused noaux_tc_op kernel which applies sigmoid internally + # and performs group-based top-k selection with normalization + topk_weights, topk_indices = torch.ops.trtllm.noaux_tc_op( + router_logits, + self.e_score_correction_bias, + self.n_group, + self.topk_group, + self.top_k, + self.routed_scaling_factor, + ) + + return topk_indices, topk_weights class NemotronHAttention(nn.Module): @@ -369,8 +380,23 @@ def __init__(self, config, layer_idx: Optional[int] = None): self.hidden_size = config.hidden_size self.num_heads = config.num_attention_heads - if config.head_dim is not None: - self.head_dim = config.head_dim + + # At some point during NemotronH development, what used to be called `attention_head_dim` + # was renamed to `head_dim`. Since no configuration class's code (nor the modeling code, + # for that matter) was ever upstreamed into `transformers`, we have to resort to the below + # hack in order to support multiple iterations of NemotronH models. + if hasattr(config, "head_dim"): + head_dim = config.head_dim + elif hasattr(config, "attention_head_dim"): + head_dim = config.attention_head_dim + else: + raise AttributeError( + "Expected either `head_dim` or `attention_head_dim` to be present in the config " + "class, found neither." + ) + + if head_dim is not None: + self.head_dim = head_dim else: self.head_dim = config.hidden_size // config.num_attention_heads self.num_key_value_heads = config.num_key_value_heads @@ -594,7 +620,4 @@ def forward( return NemotronHCausalLMOutput(logits) -# TODO: uncomment after removing patches (and make sure it is imported in `__init__.py`). 
-# from tensorrt_llm._torch.auto_deploy.models.hf import AutoModelForCausalLMFactory -# -# AutoModelForCausalLMFactory.register_custom_model_cls("NemotronHConfig", NemotronHForCausalLM) +AutoModelForCausalLMFactory.register_custom_model_cls("NemotronHConfig", NemotronHForCausalLM) diff --git a/tensorrt_llm/_torch/auto_deploy/models/patches/nemotron_h.py b/tensorrt_llm/_torch/auto_deploy/models/patches/nemotron_h.py index 095e47f299d..e69de29bb2d 100644 --- a/tensorrt_llm/_torch/auto_deploy/models/patches/nemotron_h.py +++ b/tensorrt_llm/_torch/auto_deploy/models/patches/nemotron_h.py @@ -1,200 +0,0 @@ -import contextlib -import importlib.util -import sys -import types -from typing import Callable, Dict, List, Optional, Tuple - -import torch -import torch.nn.functional as F -from einops import rearrange -from transformers import AutoModelForCausalLM - -from tensorrt_llm._torch.auto_deploy.models.patches.bamba import _bamba_mixer_torch_forward - - -# Forked from: -# https://github.com/state-spaces/mamba/blob/6b32be06d026e170b3fdaf3ae6282c5a6ff57b06/mamba_ssm/ops/triton/layernorm_gated.py -# NOTES: -# 1. At time of writing (09/25/2025), the nano nemotron v2 modeling code expects `mamba_ssm` -# to be installed so as to be able to make use of its grouped gated RMS norm operation. -# We therefore replace it with one that uses einops + pytorch. -def _rms_norm_ref( - x, weight, bias, z=None, eps=1e-6, group_size=None, norm_before_gate=True, upcast=True -): - dtype = x.dtype - # N = x.shape[-1] - weight = weight.float() - bias = bias.float() if bias is not None else None - if upcast: - x = x.float() - z = z.float() if z is not None else z - if z is not None and not norm_before_gate: - x = x * F.silu(z) - if group_size is None: - rstd = 1 / torch.sqrt((x.square()).mean(dim=-1, keepdim=True) + eps) - out = (x * rstd * weight) + bias if bias is not None else (x * rstd * weight) - else: - x_group = rearrange(x, "... (g d) -> ... g d", d=group_size) - rstd = 1 / torch.sqrt((x_group.square()).mean(dim=-1, keepdim=True) + eps) - out = rearrange(x_group * rstd, "... g d -> ... (g d)") * weight - if bias is not None: - out = out + bias - if z is not None and norm_before_gate: - out *= F.silu(z) - return out.to(dtype) - - -# The original implementation looks at `cache_position[0]` to decide what to do which does not -# play well with export. Plus, we do not want it to be updated anyway. 
-def _nemotron_h_model_update_mamba_mask(self, attention_mask, cache_position): - return None - - -def _nemotron_h_model_update_causal_mask(self, attention_mask, input_tensor, cache_position): - # Force attention to use causal mode without explicit masks - return None - - -def _nemotron_h_block_forward( - self, - hidden_states, - cache_params=None, - cache_position: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.Tensor] = None, -): - device = hidden_states.device - with contextlib.ExitStack() as stack: - if device.type == "cuda": - stack.enter_context(torch.cuda.stream(torch.cuda.default_stream(device))) - # * Use torch.cuda.stream() to avoid NaN issues when using multiple GPUs - residual = hidden_states - hidden_states = self.norm(hidden_states.to(dtype=self.norm.weight.dtype)) - if self.residual_in_fp32: - residual = residual.to(torch.float32) - - if self.block_type == "mamba": - hidden_states = self.mixer( - hidden_states, cache_params=cache_params, cache_position=cache_position - ) - elif self.block_type == "attention": - hidden_states = self.mixer(hidden_states, cache_position=cache_position) - hidden_states = hidden_states[0] - elif self.block_type in ["mlp", "moe"]: - hidden_states = self.mixer(hidden_states) - else: - raise ValueError(f"Invalid block_type: {self.block_type}") - - hidden_states = residual + hidden_states - return hidden_states - - -def _nemotron_h_topk_router_forward(self, hidden_states): - """ - Forward pass for NemotronHTopkRouter using the optimized noaux_tc_op kernel. - - This replaces the original forward method which used pure PyTorch operations - with optimized CUDA kernels: - """ - hidden_states = hidden_states.view(-1, self.config.hidden_size) - if self.weight.dtype == torch.float32: - router_logits = F.linear(hidden_states.type(torch.float32), self.weight) - else: - router_logits = torch.ops.trtllm.dsv3_router_gemm_op( - hidden_states, self.weight.t(), bias=None, out_dtype=torch.float32 - ) - - # Use the fused noaux_tc_op kernel which applies sigmoid internally - # and performs group-based top-k selection with normalization - topk_weights, topk_indices = torch.ops.trtllm.noaux_tc_op( - router_logits, - self.e_score_correction_bias, - self.n_group, - self.topk_group, - self.top_k, - self.routed_scaling_factor, - ) - - return topk_indices, topk_weights - - -# Note: we assume experts have no bias for now -def _nemotron_h_moe_forward(self, hidden_states: torch.Tensor): - """ - Uses NemotronH router (returns indices, weights) and dispatches through auto_deploy::torch_moe - with act_fn='relu2'. Handles both latent MOE and direct MOE architectures. - """ - - residuals = hidden_states - orig_shape = hidden_states.shape - topk_indices, topk_weights = self.gate(hidden_states) - x_flat = hidden_states.view(-1, hidden_states.shape[-1]) - - # NOTE: So far we've seen that the dispatch order in eager code is the same as the node order in the exported graph. - # We dispatch shared expert first so that we can easily fork the execution of the routed experts - # (using the custom op below) to an auxiliary stream. 
- shared_out = self.shared_experts(residuals) - # Check if this is a latent MOE (has fc1_latent_proj and fc2_latent_proj) - has_latent_proj = hasattr(self, "fc1_latent_proj") and hasattr(self, "fc2_latent_proj") - - if has_latent_proj: - # Latent MOE: project to latent space before routing - x_flat = self.fc1_latent_proj(x_flat) - - # Route through experts (operates in latent space if latent MOE, full space otherwise) - out_flat = torch.ops.auto_deploy.torch_moe( - x_flat, - topk_indices, - topk_weights, - w1_weight=[e.up_proj.weight for e in self.experts], - w2_weight=[e.down_proj.weight for e in self.experts], - w3_weight=[], - act_fn="relu2", - mlp_style="mlp", - ) - - if has_latent_proj: - # Latent MOE: project back from latent space - out_flat = self.fc2_latent_proj(out_flat) - - routed_out = out_flat.view(*orig_shape) - out = shared_out + routed_out - return out - - -_from_config_original = AutoModelForCausalLM.from_config - -CUSTOM_MODULE_PATCHES: Dict[str, List[Tuple[str, Callable]]] = { - "NemotronHMamba2Mixer": [("forward", _bamba_mixer_torch_forward)], - "NemotronHModel": [ - ("_update_causal_mask", _nemotron_h_model_update_causal_mask), - ("_update_mamba_mask", _nemotron_h_model_update_mamba_mask), - ], - "NemotronHBlock": [("forward", _nemotron_h_block_forward)], - "NemotronHMOE": [("forward", _nemotron_h_moe_forward)], - "NemotronHTopkRouter": [("forward", _nemotron_h_topk_router_forward)], -} - - -def get_model_from_config_patched(config, **kwargs): - model = _from_config_original(config, **kwargs) - # Patch modules - for _, module in model.named_modules(): - if (module_name := type(module).__name__) in CUSTOM_MODULE_PATCHES.keys(): - patches = CUSTOM_MODULE_PATCHES[module_name] - for method_name, method_patch in patches: - setattr(module, method_name, types.MethodType(method_patch, module)) - - return model - - -# TODO: figure out how this can be incorporated into the export patch system -AutoModelForCausalLM.from_config = get_model_from_config_patched - -# TODO: figure out how this can be incorporated into the export patch system -# Only patch if the module isn't available -_mamba_ssm_module = "mamba_ssm" -_mamba_ssm_submodule = f"{_mamba_ssm_module}.ops.triton.layernorm_gated" -if importlib.util.find_spec(_mamba_ssm_module) is None: - stub_mod = types.ModuleType(_mamba_ssm_submodule) - stub_mod.rmsnorm_fn = _rms_norm_ref - sys.modules[_mamba_ssm_submodule] = stub_mod diff --git a/tensorrt_llm/_torch/auto_deploy/transform/library/quantization.py b/tensorrt_llm/_torch/auto_deploy/transform/library/quantization.py index 28c61e74dd4..2fdaaf55067 100644 --- a/tensorrt_llm/_torch/auto_deploy/transform/library/quantization.py +++ b/tensorrt_llm/_torch/auto_deploy/transform/library/quantization.py @@ -123,7 +123,7 @@ def _apply( cnt += 1 return gm, TransformInfo( - skipped=False, num_matches=cnt, is_clean=False, has_valid_shapes=True + skipped=False, num_matches=cnt, is_clean=False, has_valid_shapes=(cnt == 0) ) def _insert_quantized_linear( diff --git a/tensorrt_llm/_torch/auto_deploy/transform/library/rms_norm.py b/tensorrt_llm/_torch/auto_deploy/transform/library/rms_norm.py index 36c2e683bf6..860b5b7de5b 100644 --- a/tensorrt_llm/_torch/auto_deploy/transform/library/rms_norm.py +++ b/tensorrt_llm/_torch/auto_deploy/transform/library/rms_norm.py @@ -7,8 +7,8 @@ from pydantic import Field from torch.fx import GraphModule +from ...custom_ops.rms_norm import gated_rms_norm_ref from ...models.factory import ModelFactory -from ...models.patches.nemotron_h import _rms_norm_ref from 
...shim.interface import CachedSequenceInterface # It is important to import ADPatternMatcherPass from pattern_matcher.py, not from torch._inductor.pattern_matcher @@ -225,7 +225,7 @@ def _gated_rmsnorm_pattern_ref( eps: float = 1e-5, group_size: int = 512, ) -> torch.Tensor: - y = _rms_norm_ref( + y = gated_rms_norm_ref( x, weight, bias=None, diff --git a/tensorrt_llm/_torch/pyexecutor/kv_cache_transceiver.py b/tensorrt_llm/_torch/pyexecutor/kv_cache_transceiver.py index 73ee3f5c7b5..5616be77081 100644 --- a/tensorrt_llm/_torch/pyexecutor/kv_cache_transceiver.py +++ b/tensorrt_llm/_torch/pyexecutor/kv_cache_transceiver.py @@ -42,6 +42,7 @@ def create_kv_cache_transceiver( cache_transceiver_config.backend = "NIXL" # Ordered by priority env_vars = [("TRTLLM_USE_UCX_KVCACHE", "UCX"), + ("TRTLLM_USE_MOONCAKE_KVCACHE", "MOONCAKE"), ("TRTLLM_USE_MPI_KVCACHE", "MPI")] for env_var, be_type in env_vars: if getenv(env_var) == "1": diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py index c2d5f23f50c..2f22f493406 100644 --- a/tensorrt_llm/llmapi/llm_args.py +++ b/tensorrt_llm/llmapi/llm_args.py @@ -1739,10 +1739,11 @@ class CacheTransceiverConfig(StrictBaseModel, PybindMirror): Configuration for the cache transceiver. """ - backend: Optional[Literal["DEFAULT", "UCX", "NIXL", "MPI"]] = Field( - default=None, - description= - "The communication backend type to use for the cache transceiver.") + backend: Optional[Literal[ + "DEFAULT", "UCX", "NIXL", "MOONCAKE", "MPI"]] = Field( + default=None, + description= + "The communication backend type to use for the cache transceiver.") max_tokens_in_buffer: Optional[int] = Field( default=None, diff --git a/tests/integration/defs/accuracy/test_disaggregated_serving.py b/tests/integration/defs/accuracy/test_disaggregated_serving.py index 31f04f99685..894114c0f4a 100644 --- a/tests/integration/defs/accuracy/test_disaggregated_serving.py +++ b/tests/integration/defs/accuracy/test_disaggregated_serving.py @@ -863,10 +863,7 @@ def test_auto_dtype_with_helix(self): "disable_overlap_scheduler": True, "kv_cache_config": kv_cache_config, "enable_chunked_prefill": False, - "cuda_graph_config": { - "enable_padding": True, - "batch_sizes": [1, 2, 4, 8, 16, 32, 64, 128] - }, + "cuda_graph_config": None, "cache_transceiver_config": { "backend": "UCX" }, diff --git a/tests/integration/defs/cpp/test_multi_gpu.py b/tests/integration/defs/cpp/test_multi_gpu.py index 7cf92efaadb..1124178cccc 100644 --- a/tests/integration/defs/cpp/test_multi_gpu.py +++ b/tests/integration/defs/cpp/test_multi_gpu.py @@ -25,6 +25,7 @@ class KVCacheType(Enum): MPI = auto() UCX = auto() NIXL = auto() + MOONCAKE = auto() def get_multi_gpu_env(kv_cache_type=KVCacheType.NONE, llama_multi_gpu=False): @@ -37,6 +38,9 @@ def get_multi_gpu_env(kv_cache_type=KVCacheType.NONE, llama_multi_gpu=False): env["TRTLLM_USE_UCX_KVCACHE"] = "1" case KVCacheType.NIXL: env["TRTLLM_USE_NIXL_KVCACHE"] = "1" + case KVCacheType.MOONCAKE: + env["TRTLLM_USE_MOONCAKE_KVCACHE"] = "1" + env["MC_FORCE_TCP"] = "1" case KVCacheType.NONE: pass case _: @@ -502,8 +506,9 @@ def test_fused_gemm_allreduce(build_google_tests, nprocs, build_dir): @pytest.mark.parametrize("build_google_tests", ["80", "86", "89", "90"], indirect=True) -@pytest.mark.parametrize("kvcache_type", [KVCacheType.NIXL, KVCacheType.UCX], - ids=["nixl_kvcache", "ucx_kvcache"]) +@pytest.mark.parametrize( + "kvcache_type", [KVCacheType.NIXL, KVCacheType.UCX, KVCacheType.MOONCAKE], + ids=["nixl_kvcache", "ucx_kvcache", "mooncake_kvcache"]) 
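The MOONCAKE backend plugs into the same environment-variable dispatch that already handles UCX, NIXL and MPI: `create_kv_cache_transceiver` falls back to NIXL and then walks a priority-ordered list of `TRTLLM_USE_*_KVCACHE` variables, and the new multi-GPU test case sets `TRTLLM_USE_MOONCAKE_KVCACHE=1` (plus `MC_FORCE_TCP=1`) to exercise it. A minimal sketch of that selection logic, assuming a hypothetical `resolve_backend` helper (the env-var list and default mirror the diff):

```python
from os import getenv

# Ordered by priority, mirroring the list in create_kv_cache_transceiver().
_KVCACHE_ENV_VARS = [
    ("TRTLLM_USE_UCX_KVCACHE", "UCX"),
    ("TRTLLM_USE_MOONCAKE_KVCACHE", "MOONCAKE"),
    ("TRTLLM_USE_MPI_KVCACHE", "MPI"),
]


def resolve_backend(default: str = "NIXL") -> str:
    """Return the first backend whose env var is set to "1", else the default."""
    for env_var, backend in _KVCACHE_ENV_VARS:
        if getenv(env_var) == "1":
            return backend
    return default


# Under the new mooncake test case (TRTLLM_USE_MOONCAKE_KVCACHE=1, MC_FORCE_TCP=1),
# resolve_backend() evaluates to "MOONCAKE"; with no overrides it stays "NIXL".
```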
@pytest.mark.parametrize("nprocs", [2, 8], ids=["2proc", "8proc"]) def test_cache_transceiver(build_google_tests, nprocs, kvcache_type, build_dir): diff --git a/tests/integration/defs/perf/open_search_db_utils.py b/tests/integration/defs/perf/open_search_db_utils.py index 5824670d6f1..2687a730ce3 100644 --- a/tests/integration/defs/perf/open_search_db_utils.py +++ b/tests/integration/defs/perf/open_search_db_utils.py @@ -58,6 +58,20 @@ "d_p99_e2el", ] +# Fields for scenario-only matching for recipe tests. +# Unlike regular tests that match on all config fields, recipes match only on the benchmark +# scenario, allowing the underlying config to change while still comparing against baselines +# for the same scenario. +SCENARIO_MATCH_FIELDS = [ + "s_runtime", + "s_model_name", + "s_gpu_type", + "l_isl", + "l_osl", + "l_concurrency", + "l_num_gpus", +] + def add_id(data): OpenSearchDB.add_id_of_json(data) diff --git a/tests/integration/defs/perf/test_perf.py b/tests/integration/defs/perf/test_perf.py index f55113f05c1..82891ca8470 100644 --- a/tests/integration/defs/perf/test_perf.py +++ b/tests/integration/defs/perf/test_perf.py @@ -29,7 +29,8 @@ print_warning) from ..conftest import get_llm_root, llm_models_root, trt_environment -from .open_search_db_utils import (add_id, get_history_data, get_job_info, +from .open_search_db_utils import (SCENARIO_MATCH_FIELDS, add_id, + get_history_data, get_job_info, post_new_perf_data, prepare_baseline_data, prepare_regressive_test_cases, write_regressive_test_cases) @@ -597,6 +598,11 @@ def __init__(self, server_config_data: dict, env_vars: str = ""): self.speculative_model_dir = speculative_config.get( 'speculative_model_dir', "") + # match_mode: "config" (default, 40+ fields) or "scenario" (benchmark scenario fields for recipe testing) + # When match_mode is "scenario", baselines are matched by scenario identity + # (model, gpu, isl, osl, concurrency, num_gpus) instead of full config fields. 
+ self.match_mode = server_config_data.get('match_mode', "config") + # Store filtered config for extra_llm_api_config (exclude name, model_name, gpus, client_configs) self.extra_llm_api_config_data = { k: v @@ -2438,9 +2444,12 @@ def prefix_server_config_dict(config_dict: dict, new_data_dict[cmd_idx] = new_data cmd_idx += 1 if not match_keys: - match_keys.append("s_runtime") - match_keys.extend(server_config_dict.keys()) - match_keys.extend(client_config_dict.keys()) + if server_config.match_mode == "scenario": + match_keys = SCENARIO_MATCH_FIELDS.copy() + else: + match_keys.append("s_runtime") + match_keys.extend(server_config_dict.keys()) + match_keys.extend(client_config_dict.keys()) elif self._config.runtime == "multi_node_disagg_server": if self._config.disagg_configs[0][ diff --git a/tests/integration/test_lists/qa/README.md b/tests/integration/test_lists/qa/README.md index 1a15c87ccfb..a0e3afb3dcf 100644 --- a/tests/integration/test_lists/qa/README.md +++ b/tests/integration/test_lists/qa/README.md @@ -59,6 +59,7 @@ This directory contains various test configuration files: - `llm_perf_full.yml` - Main performance test configuration - `llm_perf_cluster.yml` - Cluster-based performance tests - `llm_perf_sanity.yml` - Performance sanity checks +- `llm_config_database.yml` - Performance regression tests for the config database in `examples/configs/database` (auto-generated by `scripts/generate_config_database_tests.py`) - `llm_perf_nim.yml` - NIM-specific performance tests - `llm_trt_integration_perf.yml` - Integration performance tests - `llm_trt_integration_perf_sanity.yml` - Integration performance sanity checks @@ -77,7 +78,7 @@ QA tests are executed on a regular schedule: - **Weekly**: Automated regression testing - **Release**: Comprehensive validation before each release - **Full Cycle Testing**: - run all gpu with llm_function_core.txt + run NIM specific gpu with llm_function_nim.txt + run all gpu with llm_function_core.txt, run NIM specific gpu with llm_function_nim.txt, and run config database tests with llm_config_database.yml - **Sanity Cycle Testing**: run all gpu with llm_function_core_sanity.txt - **NIM Cycle Testing**: diff --git a/tests/integration/test_lists/qa/llm_config_database.yml b/tests/integration/test_lists/qa/llm_config_database.yml new file mode 100644 index 00000000000..15f14162b7b --- /dev/null +++ b/tests/integration/test_lists/qa/llm_config_database.yml @@ -0,0 +1,191 @@ +# =============================================================================== +# Config Database Performance Tests (AUTO-GENERATED) +# =============================================================================== +# Generated by: scripts/generate_config_database_tests.py +# +# These tests use scenario-only matching (match_mode: scenario) for baselines. +# Baselines are matched by (model, gpu, isl, osl, concurrency, num_gpus) instead +# of full config fields, allowing configs to evolve while maintaining comparison. 
+# +# To regenerate: +# python scripts/generate_config_database_tests.py +# =============================================================================== + +version: 0.0.1 +llm_config_database: +- condition: + wildcards: + gpu: + - '*b200*' + cpu: x86_64 + linux_distribution_name: ubuntu* + ranges: + system_gpu_count: + gte: 1 + tests: + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc4_gpu1] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc16_gpu1] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc64_gpu1] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc4_gpu1] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc16_gpu1] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc64_gpu1] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc4_gpu1] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc16_gpu1] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc64_gpu1] +- condition: + wildcards: + gpu: + - '*b200*' + cpu: x86_64 + linux_distribution_name: ubuntu* + ranges: + system_gpu_count: + gte: 2 + tests: + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc4_gpu2] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc16_gpu2] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc64_gpu2] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc4_gpu2] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc16_gpu2] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc64_gpu2] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc4_gpu2] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc16_gpu2] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc64_gpu2] +- condition: + wildcards: + gpu: + - '*b200*' + cpu: x86_64 + linux_distribution_name: ubuntu* + ranges: + system_gpu_count: + gte: 4 + tests: + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc4_gpu4] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc32_gpu4] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc256_gpu4] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc4_gpu4] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc32_gpu4] + - 
perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc256_gpu4] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc4_gpu4] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc16_gpu4] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc64_gpu4] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc4_gpu4] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc16_gpu4] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc64_gpu4] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc4_gpu4] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc16_gpu4] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc64_gpu4] +- condition: + wildcards: + gpu: + - '*b200*' + cpu: x86_64 + linux_distribution_name: ubuntu* + ranges: + system_gpu_count: + gte: 8 + tests: + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc4_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc32_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc256_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc4_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc32_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc256_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-deepseek_ai_DeepSeek_R1_0528_1024_1024_conc4_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-deepseek_ai_DeepSeek_R1_0528_1024_1024_conc16_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-deepseek_ai_DeepSeek_R1_0528_1024_1024_conc64_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-deepseek_ai_DeepSeek_R1_0528_8192_1024_conc4_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-deepseek_ai_DeepSeek_R1_0528_8192_1024_conc16_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-deepseek_ai_DeepSeek_R1_0528_8192_1024_conc64_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc4_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc16_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_1024_conc64_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc4_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc16_gpu8] + - 
perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_1024_8192_conc64_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc4_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc16_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_b200_nvl-openai_gpt_oss_120b_8192_1024_conc64_gpu8] +- condition: + wildcards: + gpu: + - '*h200*' + cpu: x86_64 + linux_distribution_name: ubuntu* + ranges: + system_gpu_count: + gte: 1 + tests: + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc4_gpu1] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc16_gpu1] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc64_gpu1] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc4_gpu1] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc16_gpu1] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc64_gpu1] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc4_gpu1] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc16_gpu1] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc64_gpu1] +- condition: + wildcards: + gpu: + - '*h200*' + cpu: x86_64 + linux_distribution_name: ubuntu* + ranges: + system_gpu_count: + gte: 2 + tests: + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc4_gpu2] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc16_gpu2] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc64_gpu2] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc4_gpu2] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc16_gpu2] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc64_gpu2] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc4_gpu2] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc16_gpu2] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc64_gpu2] +- condition: + wildcards: + gpu: + - '*h200*' + cpu: x86_64 + linux_distribution_name: ubuntu* + ranges: + system_gpu_count: + gte: 4 + tests: + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc4_gpu4] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc16_gpu4] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc64_gpu4] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc4_gpu4] + - 
perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc16_gpu4] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc64_gpu4] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc4_gpu4] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc16_gpu4] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc64_gpu4] +- condition: + wildcards: + gpu: + - '*h200*' + cpu: x86_64 + linux_distribution_name: ubuntu* + ranges: + system_gpu_count: + gte: 8 + tests: + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-deepseek_ai_DeepSeek_R1_0528_1024_1024_conc4_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-deepseek_ai_DeepSeek_R1_0528_1024_1024_conc16_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-deepseek_ai_DeepSeek_R1_0528_1024_1024_conc64_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-deepseek_ai_DeepSeek_R1_0528_8192_1024_conc4_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-deepseek_ai_DeepSeek_R1_0528_8192_1024_conc16_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-deepseek_ai_DeepSeek_R1_0528_8192_1024_conc64_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc4_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc16_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_1024_conc64_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc4_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc16_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_1024_8192_conc64_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc4_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc16_gpu8] + - perf/test_perf.py::test_perf[perf_sanity_upload-config_database_h200_sxm-openai_gpt_oss_120b_8192_1024_conc64_gpu8] diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200.yml b/tests/integration/test_lists/test-db/l0_dgx_b200.yml index 382dd135531..13a078c463a 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_b200.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_b200.yml @@ -181,4 +181,3 @@ l0_dgx_b200: - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[pp4-fp8kv=True-attn_backend=TRTLLM-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTEDSL-mtp_nextn=2-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp4_tp2pp2 - - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix diff --git a/tests/integration/test_lists/test-db/l0_dgx_h100.yml b/tests/integration/test_lists/test-db/l0_dgx_h100.yml index 63817ed9afd..4e90db0050b 
100644 --- a/tests/integration/test_lists/test-db/l0_dgx_h100.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_h100.yml @@ -231,6 +231,7 @@ l0_dgx_h100: - cpp/test_multi_gpu.py::test_cache_transceiver[2proc-ucx_kvcache-90] ISOLATION - cpp/test_multi_gpu.py::test_cache_transceiver[8proc-nixl_kvcache-90] ISOLATION - cpp/test_multi_gpu.py::test_cache_transceiver[8proc-ucx_kvcache-90] ISOLATION + - cpp/test_multi_gpu.py::test_cache_transceiver[8proc-mooncake_kvcache-90] ISOLATION - cpp/test_multi_gpu.py::test_user_buffer[2proc-90] - cpp/test_multi_gpu.py::test_enc_dec[t5-90] - cpp/test_multi_gpu.py::test_llama_executor[llama-orchestrator-90] diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_gpus.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_gpus.yml index 40fe6ed6750..503a3024a00 100644 --- a/tests/integration/test_lists/test-db/l0_gb200_multi_gpus.yml +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_gpus.yml @@ -69,6 +69,7 @@ l0_gb200_multi_gpus: - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep4-cutlass] - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[no_cuda_graph_overlap-cutlass] - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep4-trtllm] + - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix - condition: ranges: system_gpu_count: diff --git a/tests/integration/test_lists/test-db/l0_sanity_check.yml b/tests/integration/test_lists/test-db/l0_sanity_check.yml index 894bc21b1e7..f88ac773375 100644 --- a/tests/integration/test_lists/test-db/l0_sanity_check.yml +++ b/tests/integration/test_lists/test-db/l0_sanity_check.yml @@ -34,3 +34,5 @@ l0_sanity_check: - examples/test_llm_api_with_mpi.py::test_llm_api_single_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0] ISOLATION - unittest/others/test_kv_cache_transceiver.py::test_kv_cache_transceiver_single_process[NIXL-mha-ctx_fp16_gen_fp16] - unittest/others/test_kv_cache_transceiver.py::test_kv_cache_transceiver_single_process[UCX-mha-ctx_fp16_gen_fp16] + - unittest/others/test_kv_cache_transceiver.py::test_cancel_request_in_transmission[mha] + - unittest/others/test_kv_cache_transceiver.py::test_cancel_request_in_transmission[mla] diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 2b3cc7427f7..4f6d8e75a44 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -407,7 +407,6 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp] SKIP (https://nvbugs/5715568) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=0-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5721661) accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_mtp] SKIP (https://nvbugs/5715568) -unittest/_torch/modules/test_fused_moe.py::test_fused_moe_w4a8_nvfp4_fp8[CUTLASS] SKIP (https://nvbugs/5721912) unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_flashinfer_attention_op.py::test_flashinfer_attention_op_context_input_pos[cuda-dtype0-4-8-seq6] SKIP (https://nvbugs/5721907) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5722629) 
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_2gpus[cutlass-two_model-overlap_scheduler] SKIP (https://nvbugs/5702826) @@ -436,7 +435,6 @@ full:sm89/accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] SKIP (https://nvbugs/5721672) accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp4-fp8kv=True-attn_backend=FLASHINFER-torch_compile=True] SKIP (https://nvbugs/5741304) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5740377, https://nvbugs/5740075) -accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype_with_helix SKIP (https://nvbugs/5741331) disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/5722653) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugs/5740087) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-ep4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5740075) @@ -459,14 +457,9 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] SKIP (https://nvbugs/5740075) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp2pp2-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5740075) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5740075) -test_e2e.py::test_openai_mmencoder_example SKIP (https://nvbugs/5747911) test_e2e.py::test_trtllm_serve_multimodal_example SKIP (https://nvbugs/5747920) examples/test_whisper.py::test_llm_whisper_general[large-v3-disable_gemm_plugin-enable_attention_plugin-disable_weight_only-float16-nb:1-use_cpp_runtime] SKIP (https://nvbugs/5747930) test_e2e.py::test_trtllm_serve_example SKIP (https://nvbugs/5747938) -unittest/_torch/auto_deploy/unit/singlegpu/models/test_nemotron_h_patches.py::test_nemotronh_moe_patch_forward[dtype0-2-6-nvidia/NVIDIA-Nemotron-Nano-31B-A3-v3] SKIP (https://nvbugs/5747867) -unittest/_torch/auto_deploy/unit/singlegpu/models/test_nemotron_h_patches.py::test_nemotronh_moe_patch_forward[dtype0-1-8-nvidia/NVIDIA-Nemotron-Nano-31B-A3-v3] SKIP (https://nvbugs/5747867) -unittest/_torch/auto_deploy/unit/singlegpu/models/test_nemotron_h_patches.py::test_nemotronh_moe_custom_implementation[dtype0-2-6-nvidia/NVIDIA-Nemotron-Nano-31B-A3-v3] SKIP (https://nvbugs/5747867) -unittest/_torch/auto_deploy/unit/singlegpu/models/test_nemotron_h_patches.py::test_nemotronh_moe_custom_implementation[dtype0-1-8-nvidia/NVIDIA-Nemotron-Nano-31B-A3-v3] SKIP (https://nvbugs/5747867) unittest/_torch/auto_deploy/unit/singlegpu/test_ad_build_small_single.py::test_build_ad[meta-llama/Llama-4-Scout-17B-16E-Instruct-llm_extra_args8] SKIP (https://nvbugs/5747878) 
unittest/_torch/auto_deploy/unit/singlegpu/test_ad_build_small_single.py::test_build_ad[meta-llama/Llama-4-Scout-17B-16E-Instruct-llm_extra_args9] SKIP (https://nvbugs/5747878) triton_server/test_triton.py::test_opt[opt] SKIP (https://nvbugs/5739981) @@ -482,6 +475,7 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_ep accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4_chunked_prefill[tp4ep4-cuda_graph=True] SKIP (https://nvbugs/5702793) accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4[tp4-cuda_graph=True] SKIP (https://nvbugs/5702793) disaggregated/test_auto_scaling.py::test_minimal_instances[etcd-round_robin] SKIP (https://nvbugs/5748564) +disaggregated/test_auto_scaling.py::test_minimal_instances[http-round_robin] SKIP (https://nvbugs/5755963) unittest/llmapi/apps/test_disagg_serving_perf_metrics.py SKIP (https://nvbugs/5752516) unittest/_torch/attention/test_trtllm_flashinfer_symbol_collision.py::test_flashinfer_fused_moe_matches_torch_moe SKIP (https://nvbugs/5752521) unittest/llmapi/apps/_test_openai_responses.py::test_reasoning_effort[DeepSeek-R1-Distill-Qwen-1.5B] SKIP (https://nvbugs/5753250) diff --git a/tests/scripts/perf-sanity/config_database_b200_nvl.yaml b/tests/scripts/perf-sanity/config_database_b200_nvl.yaml new file mode 100644 index 00000000000..3ad69455a4e --- /dev/null +++ b/tests/scripts/perf-sanity/config_database_b200_nvl.yaml @@ -0,0 +1,1839 @@ +server_configs: +- name: nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc4_gpu4 + model_name: nvidia/DeepSeek-R1-0528-FP4-v2 + gpus: 4 + match_mode: scenario + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + enable_attention_dp: false + print_iter_log: true + kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false + stream_interval: 10 + moe_config: + backend: TRTLLM + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + trust_remote_code: true + backend: pytorch + max_num_tokens: 1152 + max_seq_len: 2068 + client_configs: + - name: con4_isl1024_osl1024 + concurrency: 4 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc32_gpu4 + model_name: nvidia/DeepSeek-R1-0528-FP4-v2 + gpus: 4 + match_mode: scenario + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + enable_attention_dp: false + print_iter_log: true + kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false + stream_interval: 10 + moe_config: + backend: TRTLLM + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + trust_remote_code: true + backend: pytorch + max_num_tokens: 1152 + max_seq_len: 2068 + client_configs: + - name: con32_isl1024_osl1024 + concurrency: 32 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc256_gpu4 + model_name: nvidia/DeepSeek-R1-0528-FP4-v2 + gpus: 4 + match_mode: scenario + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + enable_attention_dp: true + print_iter_log: true + kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false + stream_interval: 10 + moe_config: + backend: CUTLASS + attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + trust_remote_code: true + backend: pytorch + max_num_tokens: 1344 + max_seq_len: 2068 + 
client_configs: + - name: con256_isl1024_osl1024 + concurrency: 256 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc4_gpu8 + model_name: nvidia/DeepSeek-R1-0528-FP4-v2 + gpus: 8 + match_mode: scenario + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + enable_attention_dp: false + print_iter_log: true + kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false + stream_interval: 10 + moe_config: + backend: TRTLLM + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 1152 + max_seq_len: 2068 + client_configs: + - name: con4_isl1024_osl1024 + concurrency: 4 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc32_gpu8 + model_name: nvidia/DeepSeek-R1-0528-FP4-v2 + gpus: 8 + match_mode: scenario + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + enable_attention_dp: false + print_iter_log: true + kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false + stream_interval: 10 + moe_config: + backend: TRTLLM + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 1152 + max_seq_len: 2068 + client_configs: + - name: con32_isl1024_osl1024 + concurrency: 32 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: nvidia_DeepSeek_R1_0528_FP4_v2_1024_1024_conc256_gpu8 + model_name: nvidia/DeepSeek-R1-0528-FP4-v2 + gpus: 8 + match_mode: scenario + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + enable_attention_dp: true + print_iter_log: true + kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false + stream_interval: 10 + moe_config: + backend: CUTLASS + attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 1344 + max_seq_len: 2068 + client_configs: + - name: con256_isl1024_osl1024 + concurrency: 256 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc4_gpu4 + model_name: nvidia/DeepSeek-R1-0528-FP4-v2 + gpus: 4 + match_mode: scenario + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + enable_attention_dp: false + print_iter_log: true + kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false + stream_interval: 10 + moe_config: + backend: TRTLLM + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + trust_remote_code: true + backend: pytorch + max_num_tokens: 8320 + max_seq_len: 9416 + client_configs: + - name: con4_isl8192_osl1024 + concurrency: 4 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc32_gpu4 + model_name: nvidia/DeepSeek-R1-0528-FP4-v2 + gpus: 4 + match_mode: scenario + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + enable_attention_dp: false + print_iter_log: true + kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false + stream_interval: 10 + moe_config: + backend: TRTLLM + 
tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + trust_remote_code: true + backend: pytorch + max_num_tokens: 8320 + max_seq_len: 9416 + client_configs: + - name: con32_isl8192_osl1024 + concurrency: 32 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc256_gpu4 + model_name: nvidia/DeepSeek-R1-0528-FP4-v2 + gpus: 4 + match_mode: scenario + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + enable_attention_dp: true + print_iter_log: true + kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false + stream_interval: 10 + moe_config: + backend: CUTLASS + attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + trust_remote_code: true + backend: pytorch + max_num_tokens: 8512 + max_seq_len: 9416 + client_configs: + - name: con256_isl8192_osl1024 + concurrency: 256 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc4_gpu8 + model_name: nvidia/DeepSeek-R1-0528-FP4-v2 + gpus: 8 + match_mode: scenario + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + enable_attention_dp: false + print_iter_log: true + kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false + stream_interval: 10 + moe_config: + backend: TRTLLM + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 8320 + max_seq_len: 9416 + client_configs: + - name: con4_isl8192_osl1024 + concurrency: 4 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc32_gpu8 + model_name: nvidia/DeepSeek-R1-0528-FP4-v2 + gpus: 8 + match_mode: scenario + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + enable_attention_dp: false + print_iter_log: true + kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false + stream_interval: 10 + moe_config: + backend: TRTLLM + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 8320 + max_seq_len: 9416 + client_configs: + - name: con32_isl8192_osl1024 + concurrency: 32 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: nvidia_DeepSeek_R1_0528_FP4_v2_8192_1024_conc256_gpu8 + model_name: nvidia/DeepSeek-R1-0528-FP4-v2 + gpus: 8 + match_mode: scenario + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + enable_attention_dp: true + print_iter_log: true + kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false + stream_interval: 10 + moe_config: + backend: CUTLASS + attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 8512 + max_seq_len: 9416 + client_configs: + - name: con256_isl8192_osl1024 + concurrency: 256 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: deepseek_ai_DeepSeek_R1_0528_1024_1024_conc4_gpu8 + model_name: deepseek-ai/DeepSeek-R1-0528 + gpus: 8 + match_mode: scenario + cuda_graph_config: + enable_padding: true + 
max_batch_size: 256 + enable_attention_dp: false + print_iter_log: true + kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false + stream_interval: 10 + moe_config: + backend: DEEPGEMM + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 1152 + max_seq_len: 2068 + client_configs: + - name: con4_isl1024_osl1024 + concurrency: 4 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: deepseek_ai_DeepSeek_R1_0528_1024_1024_conc16_gpu8 + model_name: deepseek-ai/DeepSeek-R1-0528 + gpus: 8 + match_mode: scenario + cuda_graph_config: + enable_padding: true + max_batch_size: 256 + enable_attention_dp: false + print_iter_log: true + kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false + stream_interval: 10 + moe_config: + backend: DEEPGEMM + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 1152 + max_seq_len: 2068 + client_configs: + - name: con16_isl1024_osl1024 + concurrency: 16 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: deepseek_ai_DeepSeek_R1_0528_1024_1024_conc64_gpu8 + model_name: deepseek-ai/DeepSeek-R1-0528 + gpus: 8 + match_mode: scenario + cuda_graph_config: + enable_padding: true + max_batch_size: 256 + enable_attention_dp: false + print_iter_log: true + kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false + stream_interval: 10 + moe_config: + backend: DEEPGEMM + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 1152 + max_seq_len: 2068 + client_configs: + - name: con64_isl1024_osl1024 + concurrency: 64 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: deepseek_ai_DeepSeek_R1_0528_8192_1024_conc4_gpu8 + model_name: deepseek-ai/DeepSeek-R1-0528 + gpus: 8 + match_mode: scenario + cuda_graph_config: + enable_padding: true + max_batch_size: 256 + enable_attention_dp: false + print_iter_log: true + kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false + stream_interval: 10 + moe_config: + backend: DEEPGEMM + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 8320 + max_seq_len: 9416 + client_configs: + - name: con4_isl8192_osl1024 + concurrency: 4 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: deepseek_ai_DeepSeek_R1_0528_8192_1024_conc16_gpu8 + model_name: deepseek-ai/DeepSeek-R1-0528 + gpus: 8 + match_mode: scenario + cuda_graph_config: + enable_padding: true + max_batch_size: 256 + enable_attention_dp: false + print_iter_log: true + kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false + stream_interval: 10 + moe_config: + backend: DEEPGEMM + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 8320 + max_seq_len: 9416 + client_configs: + - name: con16_isl8192_osl1024 + concurrency: 16 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: deepseek_ai_DeepSeek_R1_0528_8192_1024_conc64_gpu8 + model_name: deepseek-ai/DeepSeek-R1-0528 + gpus: 8 + match_mode: scenario + cuda_graph_config: 
+ enable_padding: true + max_batch_size: 256 + enable_attention_dp: true + print_iter_log: true + kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.8 + enable_block_reuse: false + stream_interval: 10 + moe_config: + backend: DEEPGEMM + attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 8320 + max_seq_len: 9416 + client_configs: + - name: con64_isl8192_osl1024 + concurrency: 64 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_1024_conc4_gpu1 + model_name: openai/gpt-oss-120b + gpus: 1 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 2068 + client_configs: + - name: con4_isl1024_osl1024 + concurrency: 4 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_1024_conc16_gpu1 + model_name: openai/gpt-oss-120b + gpus: 1 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 2068 + client_configs: + - name: con16_isl1024_osl1024 + concurrency: 16 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_1024_conc64_gpu1 + model_name: openai/gpt-oss-120b + gpus: 1 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 2068 + client_configs: + - name: con64_isl1024_osl1024 + concurrency: 64 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_1024_conc4_gpu2 + model_name: openai/gpt-oss-120b + gpus: 2 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + 
trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 2068 + client_configs: + - name: con4_isl1024_osl1024 + concurrency: 4 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_1024_conc16_gpu2 + model_name: openai/gpt-oss-120b + gpus: 2 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 2068 + client_configs: + - name: con16_isl1024_osl1024 + concurrency: 16 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_1024_conc64_gpu2 + model_name: openai/gpt-oss-120b + gpus: 2 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 2068 + client_configs: + - name: con64_isl1024_osl1024 + concurrency: 64 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_1024_conc4_gpu4 + model_name: openai/gpt-oss-120b + gpus: 4 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 2068 + client_configs: + - name: con4_isl1024_osl1024 + concurrency: 4 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_1024_conc16_gpu4 + model_name: openai/gpt-oss-120b + gpus: 4 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 2068 + client_configs: + - name: con16_isl1024_osl1024 + concurrency: 16 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_1024_conc64_gpu4 + model_name: openai/gpt-oss-120b + gpus: 4 + match_mode: scenario + env_overrides: 
+ TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 2068 + client_configs: + - name: con64_isl1024_osl1024 + concurrency: 64 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_1024_conc4_gpu8 + model_name: openai/gpt-oss-120b + gpus: 8 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 2068 + client_configs: + - name: con4_isl1024_osl1024 + concurrency: 4 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_1024_conc16_gpu8 + model_name: openai/gpt-oss-120b + gpus: 8 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 2068 + client_configs: + - name: con16_isl1024_osl1024 + concurrency: 16 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_1024_conc64_gpu8 + model_name: openai/gpt-oss-120b + gpus: 8 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 2068 + client_configs: + - name: con64_isl1024_osl1024 + concurrency: 64 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_8192_conc4_gpu1 + model_name: openai/gpt-oss-120b + gpus: 1 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 1 + 
moe_expert_parallel_size: 1 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con4_isl1024_osl8192 + concurrency: 4 + iterations: 10 + isl: 1024 + osl: 8192 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_8192_conc16_gpu1 + model_name: openai/gpt-oss-120b + gpus: 1 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con16_isl1024_osl8192 + concurrency: 16 + iterations: 10 + isl: 1024 + osl: 8192 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_8192_conc64_gpu1 + model_name: openai/gpt-oss-120b + gpus: 1 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con64_isl1024_osl8192 + concurrency: 64 + iterations: 10 + isl: 1024 + osl: 8192 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_8192_conc4_gpu2 + model_name: openai/gpt-oss-120b + gpus: 2 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con4_isl1024_osl8192 + concurrency: 4 + iterations: 10 + isl: 1024 + osl: 8192 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_8192_conc16_gpu2 + model_name: openai/gpt-oss-120b + gpus: 2 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con16_isl1024_osl8192 + concurrency: 16 + iterations: 10 + isl: 1024 + osl: 8192 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_8192_conc64_gpu2 + model_name: openai/gpt-oss-120b + gpus: 2 + 
match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con64_isl1024_osl8192 + concurrency: 64 + iterations: 10 + isl: 1024 + osl: 8192 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_8192_conc4_gpu4 + model_name: openai/gpt-oss-120b + gpus: 4 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con4_isl1024_osl8192 + concurrency: 4 + iterations: 10 + isl: 1024 + osl: 8192 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_8192_conc16_gpu4 + model_name: openai/gpt-oss-120b + gpus: 4 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con16_isl1024_osl8192 + concurrency: 16 + iterations: 10 + isl: 1024 + osl: 8192 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_8192_conc64_gpu4 + model_name: openai/gpt-oss-120b + gpus: 4 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con64_isl1024_osl8192 + concurrency: 64 + iterations: 10 + isl: 1024 + osl: 8192 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_8192_conc4_gpu8 + model_name: openai/gpt-oss-120b + gpus: 8 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + 
tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con4_isl1024_osl8192 + concurrency: 4 + iterations: 10 + isl: 1024 + osl: 8192 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_8192_conc16_gpu8 + model_name: openai/gpt-oss-120b + gpus: 8 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con16_isl1024_osl8192 + concurrency: 16 + iterations: 10 + isl: 1024 + osl: 8192 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_8192_conc64_gpu8 + model_name: openai/gpt-oss-120b + gpus: 8 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con64_isl1024_osl8192 + concurrency: 64 + iterations: 10 + isl: 1024 + osl: 8192 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_8192_1024_conc4_gpu1 + model_name: openai/gpt-oss-120b + gpus: 1 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con4_isl8192_osl1024 + concurrency: 4 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_8192_1024_conc16_gpu1 + model_name: openai/gpt-oss-120b + gpus: 1 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con16_isl8192_osl1024 + concurrency: 16 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_8192_1024_conc64_gpu1 + model_name: 
openai/gpt-oss-120b + gpus: 1 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con64_isl8192_osl1024 + concurrency: 64 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_8192_1024_conc4_gpu2 + model_name: openai/gpt-oss-120b + gpus: 2 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con4_isl8192_osl1024 + concurrency: 4 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_8192_1024_conc16_gpu2 + model_name: openai/gpt-oss-120b + gpus: 2 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con16_isl8192_osl1024 + concurrency: 16 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_8192_1024_conc64_gpu2 + model_name: openai/gpt-oss-120b + gpus: 2 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con64_isl8192_osl1024 + concurrency: 64 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_8192_1024_conc4_gpu4 + model_name: openai/gpt-oss-120b + gpus: 4 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: 
+ backend: TRTLLM + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con4_isl8192_osl1024 + concurrency: 4 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_8192_1024_conc16_gpu4 + model_name: openai/gpt-oss-120b + gpus: 4 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con16_isl8192_osl1024 + concurrency: 16 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_8192_1024_conc64_gpu4 + model_name: openai/gpt-oss-120b + gpus: 4 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con64_isl8192_osl1024 + concurrency: 64 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_8192_1024_conc4_gpu8 + model_name: openai/gpt-oss-120b + gpus: 8 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con4_isl8192_osl1024 + concurrency: 4 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_8192_1024_conc16_gpu8 + model_name: openai/gpt-oss-120b + gpus: 8 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con16_isl8192_osl1024 + concurrency: 16 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_8192_1024_conc64_gpu8 + 
model_name: openai/gpt-oss-120b + gpus: 8 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + NCCL_GRAPH_REGISTER: 0 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + enable_attention_dp: false + kv_cache_config: + dtype: fp8 + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + print_iter_log: true + stream_interval: 20 + num_postprocess_workers: 4 + moe_config: + backend: TRTLLM + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con64_isl8192_osl1024 + concurrency: 64 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true diff --git a/tests/scripts/perf-sanity/config_database_h200_sxm.yaml b/tests/scripts/perf-sanity/config_database_h200_sxm.yaml new file mode 100644 index 00000000000..9d2f8481ce9 --- /dev/null +++ b/tests/scripts/perf-sanity/config_database_h200_sxm.yaml @@ -0,0 +1,1415 @@ +server_configs: +- name: deepseek_ai_DeepSeek_R1_0528_1024_1024_conc4_gpu8 + model_name: deepseek-ai/DeepSeek-R1-0528 + gpus: 8 + match_mode: scenario + cuda_graph_config: + enable_padding: true + max_batch_size: 128 + enable_attention_dp: false + print_iter_log: true + kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.75 + enable_block_reuse: false + stream_interval: 10 + moe_config: + backend: CUTLASS + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 1152 + max_seq_len: 2068 + client_configs: + - name: con4_isl1024_osl1024 + concurrency: 4 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: deepseek_ai_DeepSeek_R1_0528_1024_1024_conc16_gpu8 + model_name: deepseek-ai/DeepSeek-R1-0528 + gpus: 8 + match_mode: scenario + cuda_graph_config: + enable_padding: true + max_batch_size: 128 + enable_attention_dp: false + print_iter_log: true + kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.75 + enable_block_reuse: false + stream_interval: 10 + moe_config: + backend: CUTLASS + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 1152 + max_seq_len: 2068 + client_configs: + - name: con16_isl1024_osl1024 + concurrency: 16 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: deepseek_ai_DeepSeek_R1_0528_1024_1024_conc64_gpu8 + model_name: deepseek-ai/DeepSeek-R1-0528 + gpus: 8 + match_mode: scenario + cuda_graph_config: + enable_padding: true + max_batch_size: 128 + enable_attention_dp: false + print_iter_log: true + kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.75 + enable_block_reuse: false + stream_interval: 10 + moe_config: + backend: CUTLASS + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 1152 + max_seq_len: 2068 + client_configs: + - name: con64_isl1024_osl1024 + concurrency: 64 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: deepseek_ai_DeepSeek_R1_0528_8192_1024_conc4_gpu8 + model_name: deepseek-ai/DeepSeek-R1-0528 + gpus: 8 + match_mode: scenario + cuda_graph_config: + enable_padding: true + max_batch_size: 128 + enable_attention_dp: false + print_iter_log: true + kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.75 + enable_block_reuse: false + stream_interval: 10 + 
moe_config: + backend: CUTLASS + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 8320 + max_seq_len: 9416 + client_configs: + - name: con4_isl8192_osl1024 + concurrency: 4 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: deepseek_ai_DeepSeek_R1_0528_8192_1024_conc16_gpu8 + model_name: deepseek-ai/DeepSeek-R1-0528 + gpus: 8 + match_mode: scenario + cuda_graph_config: + enable_padding: true + max_batch_size: 128 + enable_attention_dp: false + print_iter_log: true + kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.75 + enable_block_reuse: false + stream_interval: 10 + moe_config: + backend: CUTLASS + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 8320 + max_seq_len: 9416 + client_configs: + - name: con16_isl8192_osl1024 + concurrency: 16 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: deepseek_ai_DeepSeek_R1_0528_8192_1024_conc64_gpu8 + model_name: deepseek-ai/DeepSeek-R1-0528 + gpus: 8 + match_mode: scenario + cuda_graph_config: + enable_padding: true + max_batch_size: 128 + enable_attention_dp: true + print_iter_log: true + kv_cache_config: + dtype: fp8 + free_gpu_memory_fraction: 0.75 + enable_block_reuse: false + stream_interval: 10 + moe_config: + backend: CUTLASS + attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 8320 + max_seq_len: 9416 + client_configs: + - name: con64_isl8192_osl1024 + concurrency: 64 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_1024_conc4_gpu1 + model_name: openai/gpt-oss-120b + gpus: 1 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 2068 + client_configs: + - name: con4_isl1024_osl1024 + concurrency: 4 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_1024_conc16_gpu1 + model_name: openai/gpt-oss-120b + gpus: 1 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 2068 + client_configs: + - name: con16_isl1024_osl1024 + concurrency: 16 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_1024_conc64_gpu1 + model_name: openai/gpt-oss-120b + gpus: 1 + match_mode: scenario + env_overrides: + 
TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 2068 + client_configs: + - name: con64_isl1024_osl1024 + concurrency: 64 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_1024_conc4_gpu2 + model_name: openai/gpt-oss-120b + gpus: 2 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 2068 + client_configs: + - name: con4_isl1024_osl1024 + concurrency: 4 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_1024_conc16_gpu2 + model_name: openai/gpt-oss-120b + gpus: 2 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 2068 + client_configs: + - name: con16_isl1024_osl1024 + concurrency: 16 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_1024_conc64_gpu2 + model_name: openai/gpt-oss-120b + gpus: 2 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 2068 + client_configs: + - name: con64_isl1024_osl1024 + concurrency: 64 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_1024_conc4_gpu4 + model_name: openai/gpt-oss-120b + gpus: 4 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 2068 + client_configs: + - name: 
con4_isl1024_osl1024 + concurrency: 4 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_1024_conc16_gpu4 + model_name: openai/gpt-oss-120b + gpus: 4 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 2068 + client_configs: + - name: con16_isl1024_osl1024 + concurrency: 16 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_1024_conc64_gpu4 + model_name: openai/gpt-oss-120b + gpus: 4 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 2068 + client_configs: + - name: con64_isl1024_osl1024 + concurrency: 64 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_1024_conc4_gpu8 + model_name: openai/gpt-oss-120b + gpus: 8 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 2068 + client_configs: + - name: con4_isl1024_osl1024 + concurrency: 4 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_1024_conc16_gpu8 + model_name: openai/gpt-oss-120b + gpus: 8 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 2068 + client_configs: + - name: con16_isl1024_osl1024 + concurrency: 16 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_1024_conc64_gpu8 + model_name: openai/gpt-oss-120b + gpus: 8 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 
+ moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 2068 + client_configs: + - name: con64_isl1024_osl1024 + concurrency: 64 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_8192_conc4_gpu1 + model_name: openai/gpt-oss-120b + gpus: 1 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con4_isl1024_osl8192 + concurrency: 4 + iterations: 10 + isl: 1024 + osl: 8192 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_8192_conc16_gpu1 + model_name: openai/gpt-oss-120b + gpus: 1 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con16_isl1024_osl8192 + concurrency: 16 + iterations: 10 + isl: 1024 + osl: 8192 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_8192_conc64_gpu1 + model_name: openai/gpt-oss-120b + gpus: 1 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con64_isl1024_osl8192 + concurrency: 64 + iterations: 10 + isl: 1024 + osl: 8192 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_8192_conc4_gpu2 + model_name: openai/gpt-oss-120b + gpus: 2 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con4_isl1024_osl8192 + concurrency: 4 + iterations: 10 + isl: 1024 + osl: 8192 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_8192_conc16_gpu2 + model_name: 
openai/gpt-oss-120b + gpus: 2 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con16_isl1024_osl8192 + concurrency: 16 + iterations: 10 + isl: 1024 + osl: 8192 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_8192_conc64_gpu2 + model_name: openai/gpt-oss-120b + gpus: 2 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con64_isl1024_osl8192 + concurrency: 64 + iterations: 10 + isl: 1024 + osl: 8192 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_8192_conc4_gpu4 + model_name: openai/gpt-oss-120b + gpus: 4 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con4_isl1024_osl8192 + concurrency: 4 + iterations: 10 + isl: 1024 + osl: 8192 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_8192_conc16_gpu4 + model_name: openai/gpt-oss-120b + gpus: 4 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con16_isl1024_osl8192 + concurrency: 16 + iterations: 10 + isl: 1024 + osl: 8192 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_8192_conc64_gpu4 + model_name: openai/gpt-oss-120b + gpus: 4 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + trust_remote_code: true + backend: 
pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con64_isl1024_osl8192 + concurrency: 64 + iterations: 10 + isl: 1024 + osl: 8192 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_8192_conc4_gpu8 + model_name: openai/gpt-oss-120b + gpus: 8 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con4_isl1024_osl8192 + concurrency: 4 + iterations: 10 + isl: 1024 + osl: 8192 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_8192_conc16_gpu8 + model_name: openai/gpt-oss-120b + gpus: 8 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con16_isl1024_osl8192 + concurrency: 16 + iterations: 10 + isl: 1024 + osl: 8192 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_1024_8192_conc64_gpu8 + model_name: openai/gpt-oss-120b + gpus: 8 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con64_isl1024_osl8192 + concurrency: 64 + iterations: 10 + isl: 1024 + osl: 8192 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_8192_1024_conc4_gpu1 + model_name: openai/gpt-oss-120b + gpus: 1 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con4_isl8192_osl1024 + concurrency: 4 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_8192_1024_conc16_gpu1 + model_name: openai/gpt-oss-120b + gpus: 1 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + enable_attention_dp: false + 
kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con16_isl8192_osl1024 + concurrency: 16 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_8192_1024_conc64_gpu1 + model_name: openai/gpt-oss-120b + gpus: 1 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 1 + moe_expert_parallel_size: 1 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con64_isl8192_osl1024 + concurrency: 64 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_8192_1024_conc4_gpu2 + model_name: openai/gpt-oss-120b + gpus: 2 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con4_isl8192_osl1024 + concurrency: 4 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_8192_1024_conc16_gpu2 + model_name: openai/gpt-oss-120b + gpus: 2 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con16_isl8192_osl1024 + concurrency: 16 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_8192_1024_conc64_gpu2 + model_name: openai/gpt-oss-120b + gpus: 2 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con64_isl8192_osl1024 + concurrency: 64 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + 
backend: openai + streaming: true +- name: openai_gpt_oss_120b_8192_1024_conc4_gpu4 + model_name: openai/gpt-oss-120b + gpus: 4 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con4_isl8192_osl1024 + concurrency: 4 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_8192_1024_conc16_gpu4 + model_name: openai/gpt-oss-120b + gpus: 4 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con16_isl8192_osl1024 + concurrency: 16 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_8192_1024_conc64_gpu4 + model_name: openai/gpt-oss-120b + gpus: 4 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con64_isl8192_osl1024 + concurrency: 64 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_8192_1024_conc4_gpu8 + model_name: openai/gpt-oss-120b + gpus: 8 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con4_isl8192_osl1024 + concurrency: 4 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_8192_1024_conc16_gpu8 + model_name: openai/gpt-oss-120b + gpus: 8 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 16 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + 
tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con16_isl8192_osl1024 + concurrency: 16 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true +- name: openai_gpt_oss_120b_8192_1024_conc64_gpu8 + model_name: openai/gpt-oss-120b + gpus: 8 + match_mode: scenario + env_overrides: + TRTLLM_ENABLE_PDL: 1 + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + enable_attention_dp: false + kv_cache_config: + dtype: auto + enable_block_reuse: false + free_gpu_memory_fraction: 0.85 + moe_config: + backend: TRITON + num_postprocess_workers: 4 + print_iter_log: true + stream_interval: 20 + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + trust_remote_code: true + backend: pytorch + max_num_tokens: 20000 + max_seq_len: 9236 + client_configs: + - name: con64_isl8192_osl1024 + concurrency: 64 + iterations: 10 + isl: 8192 + osl: 1024 + random_range_ratio: 0.0 + backend: openai + streaming: true diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_mamba_rms_norm.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_mamba_rms_norm.py index 35b293686d2..59952a6c89f 100644 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_mamba_rms_norm.py +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_mamba_rms_norm.py @@ -1,8 +1,10 @@ import pytest import torch -import tensorrt_llm._torch.auto_deploy.custom_ops # noqa: F401 -from tensorrt_llm._torch.auto_deploy.models.patches.nemotron_h import _rms_norm_ref +from tensorrt_llm._torch.auto_deploy.custom_ops.rms_norm import ( + gated_rms_norm_ref, + triton_rmsnorm_gated, +) @pytest.mark.skipif( @@ -19,12 +21,12 @@ def test_custom_op_matches_ref(B, T, H, group, use_gate, dtype): z = torch.randn_like(x) if use_gate else None w = torch.ones(H, dtype=dtype, device=device) - y_ref = _rms_norm_ref( + y_ref = gated_rms_norm_ref( x, w, bias=None, z=z, eps=1e-5, group_size=group, norm_before_gate=False, upcast=True ) # Custom op (currently returns fp32). Cast it back to x.dtype for apples-to-apples with ref. 
- y_op_fp32 = torch.ops.auto_deploy.triton_rmsnorm_gated(x, w, z, 1e-5, group, False) + y_op_fp32 = triton_rmsnorm_gated(x, w, z, 1e-5, group, False) y_op = y_op_fp32.to(x.dtype) assert y_ref.dtype == x.dtype and y_op.dtype == x.dtype diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_hf.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_hf.py index 82f3774511d..d6e63b6433d 100644 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_hf.py +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_hf.py @@ -205,10 +205,12 @@ def test_custom_model_mapping_in_parent_does_not_affect_parent(): class Child(AutoModelForCausalLMFactory): pass + parent_mapping = copy.copy(AutoModelForCausalLMFactory._custom_model_mapping) + custom_model_cls = MagicMock(spec=AutoModelForCausalLM) custom_model_cls.configure_mock(_from_config=MagicMock(side_effect=MyError)) Child.register_custom_model_cls( config_cls_name=FooConfig.__name__, custom_model_cls=custom_model_cls ) - assert AutoModelForCausalLMFactory._custom_model_mapping == {} + assert AutoModelForCausalLMFactory._custom_model_mapping == parent_mapping diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_hybrid_patches.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_hybrid_patches.py index ceabe6c1b98..6ea5c0efa17 100644 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_hybrid_patches.py +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_hybrid_patches.py @@ -1,5 +1,3 @@ -import copy - import pytest import torch from _model_test_utils import get_small_model_config @@ -7,8 +5,6 @@ from tensorrt_llm._torch.auto_deploy.export import apply_export_patches, torch_export_to_gm from tensorrt_llm._torch.auto_deploy.llm_args import AutoDeployConfig -from tensorrt_llm._torch.auto_deploy.models.hf import AutoModelForCausalLMFactory -from tensorrt_llm._torch.auto_deploy.models.modeling_nemotron_h import NemotronHForCausalLM from tensorrt_llm._torch.auto_deploy.utils._graph import move_to_device # NOTE: find example inputs with the same tokenization length to avoid seq concat. @@ -16,37 +12,15 @@ EXAMPLE_INPUT2 = "Tiger is a cat with the following properties:" -@pytest.fixture -def setup_custom_model_cls_registry(request): - # TODO: remove all this when the patches in `bamba.py` and `nemotron_h.py` can be removed. - old_mapping = copy.copy(AutoModelForCausalLMFactory._custom_model_mapping) - AutoModelForCausalLMFactory._custom_model_mapping = {} - - register_custom_model = request.node.callspec.params.get("register_custom_model", False) - if register_custom_model: - AutoModelForCausalLMFactory.register_custom_model_cls( - config_cls_name="NemotronHConfig", - custom_model_cls=NemotronHForCausalLM, - ) - yield - AutoModelForCausalLMFactory._custom_model_mapping = old_mapping - - @pytest.mark.parametrize( - "model_dir,run_verify_generation,register_custom_model", + "model_dir,run_verify_generation", [ - ("ibm-ai-platform/Bamba-9B-v2", True, False), - # This tests the incumbent patching approach. - ("nvidia/NVIDIA-Nemotron-Nano-12B-v2", True, False), - # This tests the new custom model implementation. - ("nvidia/NVIDIA-Nemotron-Nano-12B-v2", True, True), + ("ibm-ai-platform/Bamba-9B-v2", True), ], ) def test_bamba_patches( model_dir: str, run_verify_generation: bool, - register_custom_model: bool, - setup_custom_model_cls_registry, ): # NOTE: set to False if you want to locally test the full model. 
use_small_config: bool = True @@ -124,13 +98,14 @@ def _run_torch_export_to_gm(): move_to_device(gm, "cuda") factory._to_maybe_random(model, "cuda") model.load_state_dict(gm.state_dict()) + gm.load_state_dict(model.state_dict()) else: factory.load_or_random_init(model, device="cuda") gm = _run_torch_export_to_gm() move_to_device(gm, "cuda") if run_verify_generation: - _verify_generation(factory, model, tokenizer) + _verify_generation(model, tokenizer) # let's do a comparison of every state dict item between the model and the gm torch.testing.assert_close(model.state_dict(), gm.state_dict(), rtol=0.0, atol=0.0) @@ -157,7 +132,7 @@ def _run_torch_export_to_gm(): ) -def _verify_generation(factory, model, tokenizer): +def _verify_generation(model, tokenizer): print("====== WITHOUT PATCH ======") _generate(tokenizer, model) with apply_export_patches(patch_list=["bamba"]): diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_modeling_nemotron_h.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_modeling_nemotron_h.py new file mode 100644 index 00000000000..94b22ed14fc --- /dev/null +++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_modeling_nemotron_h.py @@ -0,0 +1,235 @@ +import importlib.util +import sys +import types +from unittest import mock + +import pytest +import torch +from _model_test_utils import get_small_model_config +from torch.export import Dim +from transformers import AutoConfig, AutoModelForCausalLM +from utils.llm_data import llm_models_root + +from tensorrt_llm._torch.auto_deploy.export import torch_export_to_gm +from tensorrt_llm._torch.auto_deploy.llm_args import AutoDeployConfig +from tensorrt_llm._torch.auto_deploy.models.custom.modeling_nemotron_h import NemotronHForCausalLM +from tensorrt_llm._torch.auto_deploy.utils._graph import move_to_device + +_BATCH_AND_SEQUENCE_TEST_CASES = ((2, 6), (1, 8)) + + +@pytest.fixture(scope="function", autouse=True) +def set_seed(): + torch.manual_seed(42) + + +@pytest.fixture(autouse=True) +def stub_mamba_ssm_if_missing(): + """Stub `mamba_ssm` package. + + The `modeling_nemotron_h.py` code in all recent nemotron checkpoints have a hard dependency + on `mamba_ssm.ops.triton.layernorm_gated.rmsnorm_fn`. This fixture stubs it, such that we + at least can get past the import stage of the remote modeling code. + """ + module = "mamba_ssm" + submodule = f"{module}.ops.triton.layernorm_gated" + + if importlib.util.find_spec(module) is not None: + yield + return + + stub_mod = types.ModuleType(submodule) + stub_mod.rmsnorm_fn = None + + with mock.patch.dict(sys.modules, {submodule: stub_mod}): + yield + + +def _load_nemotron_moe_layer(model_name_or_path: str, custom_model_cls=None): + """ + Build a tiny NemotronH model (1 layer, small dims) and return the first NemotronHMOE module. 
+ """ + cfg = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=True) + + cfg.use_cache = False + + cfg.torch_dtype = "bfloat16" + cfg.hidden_size = 32 + cfg.intermediate_size = 64 + cfg.moe_intermediate_size = 64 + cfg.moe_shared_expert_intermediate_size = 64 + cfg.mamba_head_dim = 40 + cfg.mamba_num_heads = 4 + cfg.n_groups = 2 + cfg.num_attention_heads = 4 + cfg.num_hidden_layers = 9 + cfg.num_key_value_heads = 2 + cfg.ssm_state_size = 32 + + if custom_model_cls is None: + model = AutoModelForCausalLM.from_config(cfg, trust_remote_code=True) + else: + model = custom_model_cls._from_config(cfg) + model.eval() + + nemotron_moe = None + for _, mod in model.named_modules(): + if type(mod).__name__ == "NemotronHMOE": + nemotron_moe = mod + break + + if nemotron_moe is None: + raise RuntimeError("NemotronHMOE layer not found. Check your model id or config.") + + _set_gate_weights(nemotron_moe) + + return nemotron_moe + + +def _set_gate_weights(module): + # This helper function is necessary because the `weight` parameter of the `NemotronHTopkRouter` + # is initialized as `torch.empty` in the original model code, which no manner of random seed + # setting will have any effect on. We therefore set it like the below to ensure the + # reproducibility of the tests. + for _, mod in module.named_modules(): + if type(mod).__name__ == "NemotronHTopkRouter": + if hasattr(mod, "weight"): + mod.weight = torch.nn.Parameter(torch.randn_like(mod.weight)) + + +@pytest.mark.parametrize( + "model_name", + [ + llm_models_root() / "NVIDIA-Nemotron-3-Nano-30B-A3B-BF16", + ], +) +@pytest.mark.parametrize("B,S", _BATCH_AND_SEQUENCE_TEST_CASES) +@pytest.mark.parametrize("dtype", [torch.bfloat16]) +@torch.no_grad() +def test_nemotronh_moe_custom_implementation(model_name, B, S, dtype): + device = "cuda" + + module = _load_nemotron_moe_layer(model_name) + module.to(device) + + H = module.config.hidden_size + x = torch.randn(B, S, H, device=device, dtype=dtype) + + ref = module(x) + + new_module = _load_nemotron_moe_layer(model_name, custom_model_cls=NemotronHForCausalLM) + new_module.to(device) + new_module.load_state_dict(module.state_dict()) + + test = new_module(x) + + rtol = 0.05 + atol = 0.05 + + torch.testing.assert_close(test, ref, rtol=rtol, atol=atol) + + +@pytest.mark.parametrize( + "model_dir,model_on_meta_during_export", + [ + ("nvidia/NVIDIA-Nemotron-Nano-12B-v2", True), + ("nvidia/NVIDIA-Nemotron-Nano-12B-v2", False), + ], +) +def test_custom_model_implementation_can_be_exported( + model_dir: str, + model_on_meta_during_export: bool, +): + # NOTE: set to False if you want to locally test the full model. + use_small_config: bool = True + + common_kwargs = { + "world_size": 0, + "runtime": "demollm", + "model_factory": "AutoModelForCausalLM", + "max_seq_len": 512, + "transforms": { + "insert_cached_attention": {"backend": "flashinfer"}, + "compile_model": {"backend": "torch-simple"}, + }, + } + + if use_small_config: + llm_args = get_small_model_config(model_dir, **common_kwargs)["args"] + else: + llm_args = { + "model": model_dir, + **common_kwargs, + "model_kwargs": { + "dtype": "bfloat16", + }, + } + llm_args = AutoDeployConfig(**llm_args) + + factory = llm_args.create_factory() + model = factory.build_model("meta") + tokenizer = factory.init_tokenizer() + + # 1. Export wants min batch size of 2 (to avoid specialization during export). + # 2. 
Can't get `padding` / `truncation` to work without other steps so just use the prompts + # with the same tokenized length in order for the tokenizer not to complain when creating + # the tensor. + message = [ + "Mamba is a snake with the following properties:", + "Tiger is a cat with the following properties:", + ] + inputs = tokenizer(message, return_tensors="pt", return_token_type_ids=False).to("cuda") + + input_ids = inputs["input_ids"] + position_ids = torch.arange(input_ids.shape[1], device=input_ids.device).repeat( + input_ids.shape[0], 1 + ) + dynamic_shapes = ( + {0: Dim("batch_size", min=0, max=8), 1: Dim("seq_len", min=0, max=512)}, + { + 0: Dim("batch_size", min=0, max=8), + 1: Dim("seq_len", min=0, max=512), + }, + ) + + def _run_torch_export_to_gm(): + return torch_export_to_gm( + model, + args=tuple(), + kwargs={"input_ids": input_ids, "position_ids": position_ids}, + dynamic_shapes=dynamic_shapes, + ) + + if model_on_meta_during_export: + gm = _run_torch_export_to_gm() + factory.load_or_random_init(gm, device="cuda") + move_to_device(gm, "cuda") + factory._to_maybe_random(model, "cuda") + # In order to ensure the `_minus_A` (non-persistent buffer) is correct, we need to run the + # model's load state pre/post hooks by loading the state dicts after initialization. + # NOTE: this is done under the hood by `torch_export_to_gm`, so we only need this in this + # `if` clause. + model.load_state_dict(gm.state_dict()) + gm.load_state_dict(model.state_dict()) + else: + factory.load_or_random_init(model, device="cuda") + gm = _run_torch_export_to_gm() + move_to_device(gm, "cuda") + + # let's do a comparison of every state dict item between the model and the gm + torch.testing.assert_close(model.state_dict(), gm.state_dict(), rtol=0.0, atol=0.0) + torch.testing.assert_close( + dict(model.named_buffers()), dict(gm.named_buffers()), rtol=0.0, atol=0.0 + ) + + with torch.inference_mode(): + out_original = model(input_ids=input_ids, position_ids=position_ids) + out_gm = gm(input_ids=input_ids, position_ids=position_ids) + + atol, rtol = 1e-3, 1e-3 + torch.testing.assert_close( + out_gm, + out_original, + rtol=rtol, + atol=atol, + ) diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_nemotron_h_patches.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_nemotron_h_patches.py deleted file mode 100644 index 3ef4e8eb54f..00000000000 --- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_nemotron_h_patches.py +++ /dev/null @@ -1,158 +0,0 @@ -import functools -import types - -import pytest -import torch -from _model_test_utils import _hf_model_dir_or_hub_id -from transformers import AutoConfig - -from tensorrt_llm._torch.auto_deploy.models.modeling_nemotron_h import NemotronHForCausalLM -from tensorrt_llm._torch.auto_deploy.models.patches.nemotron_h import ( - _from_config_original, - _nemotron_h_moe_forward, -) - -_BATCH_AND_SEQUENCE_TEST_CASES = ((2, 6), (1, 8)) - - -@pytest.fixture(scope="function", autouse=True) -def set_seed(): - torch.manual_seed(42) - - -def skip_on_no_hf_access(func): - """Decorator for skipping tests that fail due to HF access issues. - - This allows us to share the same test code for CI (where access may be restricted, especially for private - repositories) and locally. 
- """ - - @functools.wraps(func) - def wrapper(*args, **kwargs): - try: - return func(*args, **kwargs) - except OSError as e: - if "not a valid model identifier" in str(e): - pytest.skip("Test skipped due to (no) HF access.") - raise - - return wrapper - - -def _load_nemotron_moe_layer(model_name_or_path: str, custom_model_cls=None): - """ - Build a tiny NemotronH model (1 layer, small dims) and return the first NemotronHMOE module. - """ - cfg = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=True) - - cfg.use_cache = False - - cfg.torch_dtype = "bfloat16" - cfg.hidden_size = 32 - cfg.intermediate_size = 64 - cfg.moe_intermediate_size = 64 - cfg.moe_shared_expert_intermediate_size = 64 - cfg.mamba_head_dim = 40 - cfg.mamba_num_heads = 4 - cfg.n_groups = 2 - cfg.num_attention_heads = 4 - cfg.num_hidden_layers = 9 - cfg.num_key_value_heads = 2 - cfg.ssm_state_size = 32 - - if custom_model_cls is None: - model = _from_config_original(cfg, trust_remote_code=True) - else: - model = custom_model_cls._from_config(cfg) - model.eval() - - nemotron_moe = None - for _, mod in model.named_modules(): - if type(mod).__name__ == "NemotronHMOE": - nemotron_moe = mod - break - - if nemotron_moe is None: - raise RuntimeError("NemotronHMOE layer not found. Check your model id or config.") - - _set_gate_weights(nemotron_moe) - - return nemotron_moe - - -def _set_gate_weights(module): - # This helper function is necessary because the `weight` parameter of the `NemotronHTopkRouter` - # is initialized as `torch.empty` in the original model code, which no manner of random seed - # setting will have any effect on. We therefore set it like the below to ensure the - # reproducibility of the tests. - for _, mod in module.named_modules(): - if type(mod).__name__ == "NemotronHTopkRouter": - if hasattr(mod, "weight"): - mod.weight = torch.nn.Parameter(torch.randn_like(mod.weight)) - - -@pytest.mark.parametrize( - "model_name", - [ - _hf_model_dir_or_hub_id( - "NVIDIA-Nemotron-Nano-31B-A3-v3", "nvidia/NVIDIA-Nemotron-Nano-31B-A3-v3" - ), - ], -) -@pytest.mark.parametrize("B,S", _BATCH_AND_SEQUENCE_TEST_CASES) -@pytest.mark.parametrize("dtype", [torch.bfloat16]) -@torch.no_grad() -@skip_on_no_hf_access -def test_nemotronh_moe_patch_forward(model_name, B, S, dtype): - device = "cuda" - - module = _load_nemotron_moe_layer(model_name) - module.to(device) - - H = module.config.hidden_size - x = torch.randn(B, S, H, device=device, dtype=dtype) - - ref = module(x) - - module.forward = types.MethodType(_nemotron_h_moe_forward, module) - test = module(x) - - rtol = 0.05 - atol = 0.05 - - torch.testing.assert_close(test, ref, rtol=rtol, atol=atol) - - -@pytest.mark.parametrize( - "model_name", - [ - _hf_model_dir_or_hub_id( - "NVIDIA-Nemotron-Nano-31B-A3-v3", "nvidia/NVIDIA-Nemotron-Nano-31B-A3-v3" - ), - ], -) -@pytest.mark.parametrize("B,S", _BATCH_AND_SEQUENCE_TEST_CASES) -@pytest.mark.parametrize("dtype", [torch.bfloat16]) -@torch.no_grad() -@skip_on_no_hf_access -def test_nemotronh_moe_custom_implementation(model_name, B, S, dtype): - device = "cuda" - - module = _load_nemotron_moe_layer(model_name) - module.to(device) - - H = module.config.hidden_size - x = torch.randn(B, S, H, device=device, dtype=dtype) - - ref = module(x) - - new_module = _load_nemotron_moe_layer(model_name, custom_model_cls=NemotronHForCausalLM) - new_module.to(device) - new_module.load_state_dict(module.state_dict()) - - test = new_module(x) - - rtol = 0.05 - atol = 0.05 - - torch.testing.assert_close(test, ref, rtol=rtol, atol=atol) 
diff --git a/tests/unittest/llmapi/apps/_test_disagg_serving_multi_nodes.py b/tests/unittest/llmapi/apps/_test_disagg_serving_multi_nodes.py index 0c52852b9ec..ff9dd92e0ca 100644 --- a/tests/unittest/llmapi/apps/_test_disagg_serving_multi_nodes.py +++ b/tests/unittest/llmapi/apps/_test_disagg_serving_multi_nodes.py @@ -104,7 +104,7 @@ def gen_tp_pp_size(request): def worker(model_name: str, ctx_tp_pp_size: tuple, gen_tp_pp_size: tuple): extra_config = { "cache_transceiver_config": { - "backend": "UCX" + "backend": "DEFAULT" }, "kv_cache_config": { "free_gpu_memory_fraction": 0.5, diff --git a/tests/unittest/llmapi/apps/_test_openai_mmencoder.py b/tests/unittest/llmapi/apps/_test_openai_mmencoder.py index 1ca1beec2ab..312f9232d40 100644 --- a/tests/unittest/llmapi/apps/_test_openai_mmencoder.py +++ b/tests/unittest/llmapi/apps/_test_openai_mmencoder.py @@ -5,6 +5,7 @@ import pytest import requests import yaml +from utils.llm_data import llm_models_root from ..test_llm import get_model_path from .openai_server import RemoteMMEncoderServer @@ -69,7 +70,8 @@ def async_client(server: RemoteMMEncoderServer): def test_multimodal_content_mm_encoder(client: openai.OpenAI, model_name: str): content_text = "Describe the natural environment in the image." - image_url = "https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/seashore.png" + image_url = str(llm_models_root() / "multimodals" / "test_data" / + "seashore.png") messages = [{ "role": "user", diff --git a/tests/unittest/tools/test_config_database_sync.py b/tests/unittest/tools/test_config_database_sync.py new file mode 100644 index 00000000000..92a42431669 --- /dev/null +++ b/tests/unittest/tools/test_config_database_sync.py @@ -0,0 +1,127 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +import tempfile +import unittest +from pathlib import Path + +# Add scripts directory to path +REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../..")) +SCRIPTS_DIR = os.path.join(REPO_ROOT, "scripts") +sys.path.insert(0, SCRIPTS_DIR) + +from generate_config_database_tests import ( # noqa: E402 + PERF_SANITY_DIR, + TEST_LIST_PATH, + generate_tests, +) +from generate_config_table import generate_rst # noqa: E402 + + +class TestConfigDatabaseSync(unittest.TestCase): + def test_config_table_sync(self): + """Test that the config_table.rst file is synchronized with the lookup.yaml database. + + Ensures that the RST file is up-to-date with the YAML database. 
+ """ + if generate_rst is None: + self.skipTest("generate_config_table not available") + + # Define paths + yaml_path = os.path.join(REPO_ROOT, "examples/configs/database/lookup.yaml") + rst_path = os.path.join(REPO_ROOT, "docs/source/deployment-guide/config_table.rst") + + # Ensure files exist + self.assertTrue(os.path.exists(yaml_path), f"YAML file not found: {yaml_path}") + self.assertTrue(os.path.exists(rst_path), f"RST file not found: {rst_path}") + + # Read existing RST content + with open(rst_path, "r") as f: + existing_content = f.read() + + # Generate new RST content + with tempfile.NamedTemporaryFile(mode="w+", delete=True) as tmp: + generate_rst(yaml_path, output_file=tmp.name) + tmp.seek(0) + generated_content = tmp.read() + + # Compare content + self.assertEqual( + existing_content.strip(), + generated_content.strip(), + "config_table.rst is not synchronized with lookup.yaml. " + "Please run 'python3 scripts/generate_config_table.py' from the repo root to update it.", + ) + + def test_config_database_tests_sync(self): + """Test that config database test files are synchronized with lookup.yaml. + + Ensures that both the test list YAML and per-GPU config files are up-to-date. + """ + self.assertTrue(TEST_LIST_PATH.exists(), f"Test list not found: {TEST_LIST_PATH}") + + with open(TEST_LIST_PATH) as f: + existing_test_list = f.read() + + existing_config_files = {} + for config_path in PERF_SANITY_DIR.glob("config_database_*.yaml"): + with open(config_path) as f: + existing_config_files[config_path.name] = f.read() + + # Generate to temp directory + with tempfile.TemporaryDirectory() as tmp_dir: + tmp_config_dir = Path(tmp_dir) / "configs" + tmp_test_list_path = Path(tmp_dir) / "test_list.yml" + tmp_config_dir.mkdir(parents=True, exist_ok=True) + + generate_tests(tmp_test_list_path, tmp_config_dir) + + with open(tmp_test_list_path) as f: + generated_test_list = f.read() + + self.assertEqual( + existing_test_list.strip(), + generated_test_list.strip(), + f"{TEST_LIST_PATH} is not synchronized with lookup.yaml. " + "Please run 'python3 scripts/generate_config_database_tests.py' from the repo root.", + ) + + generated_config_files = {} + for config_path in tmp_config_dir.glob("config_database_*.yaml"): + with open(config_path) as f: + generated_config_files[config_path.name] = f.read() + + # Check same set of files + self.assertEqual( + set(existing_config_files.keys()), + set(generated_config_files.keys()), + "Mismatch in config database config files. " + "Please run 'python scripts/generate_config_database_tests.py' from the repo root.", + ) + + # Compare each config file + for filename in existing_config_files: + self.assertEqual( + existing_config_files[filename].strip(), + generated_config_files[filename].strip(), + f"{filename} is not synchronized with lookup.yaml. " + "Please run 'python scripts/generate_config_database_tests.py' from the repo root.", + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/unittest/tools/test_generate_config_table.py b/tests/unittest/tools/test_generate_config_table.py deleted file mode 100644 index a2dcf66783f..00000000000 --- a/tests/unittest/tools/test_generate_config_table.py +++ /dev/null @@ -1,66 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import sys -import tempfile -import unittest - -# Add scripts directory to path -REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../..")) -SCRIPTS_DIR = os.path.join(REPO_ROOT, "scripts") -sys.path.insert(0, SCRIPTS_DIR) - -from generate_config_table import generate_rst # noqa: E402 - - -class TestConfigTableSync(unittest.TestCase): - def test_config_table_sync(self): - """Test that the config_table.rst file is synchronized with the lookup.yaml database. - - Ensures that the RST file is up-to-date with the YAML database. - """ - if generate_rst is None: - self.skipTest("generate_config_table not available") - - # Define paths - yaml_path = os.path.join(REPO_ROOT, "examples/configs/database/lookup.yaml") - rst_path = os.path.join(REPO_ROOT, "docs/source/deployment-guide/config_table.rst") - - # Ensure files exist - self.assertTrue(os.path.exists(yaml_path), f"YAML file not found: {yaml_path}") - self.assertTrue(os.path.exists(rst_path), f"RST file not found: {rst_path}") - - # Read existing RST content - with open(rst_path, "r") as f: - existing_content = f.read() - - # Generate new RST content - with tempfile.NamedTemporaryFile(mode="w+", delete=True) as tmp: - generate_rst(yaml_path, output_file=tmp.name) - tmp.seek(0) - generated_content = tmp.read() - - # Compare content - self.assertEqual( - existing_content.strip(), - generated_content.strip(), - "config_table.rst is not synchronized with lookup.yaml. " - "Please run 'python3 scripts/generate_config_table.py' from the repo root to update it.", - ) - - -if __name__ == "__main__": - unittest.main()
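For context, a hedged sketch (not part of this patch) of the generic "regenerate into a temp location and diff against the checked-in file" pattern that the new test_config_database_sync.py applies to config_table.rst and the generated config-database test lists. generate_artifact and the checked-in path are hypothetical stand-ins, not real repository files or generators.

# Illustrative sketch only -- not part of this diff. `generate_artifact` stands in for a
# real generator such as generate_rst() or generate_tests(); the checked-in path is made up.
import tempfile
import unittest
from pathlib import Path


def generate_artifact(output_file: str) -> None:
    # Hypothetical generator: writes the content that should already be checked in.
    Path(output_file).write_text("generated content\n")


class TestArtifactInSync(unittest.TestCase):
    def test_checked_in_file_matches_generator_output(self):
        checked_in = Path("docs/generated_artifact.txt")  # hypothetical checked-in file
        self.assertTrue(checked_in.exists(), f"File not found: {checked_in}")

        # Regenerate into a temporary file and read it back, as the real tests do.
        with tempfile.NamedTemporaryFile(mode="w+", suffix=".txt") as tmp:
            generate_artifact(tmp.name)
            tmp.seek(0)
            regenerated = tmp.read()

        self.assertEqual(
            checked_in.read_text().strip(),
            regenerated.strip(),
            "Checked-in file is stale; re-run the generator script to update it.",
        )


if __name__ == "__main__":
    unittest.main()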