LarryXFly
diff --git a/‎.gitattributes‎
Lines changed: 2 additions & 0 deletions b/‎.gitattributes‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎.gitignore‎
Lines changed: 1 addition & 0 deletions b/‎.gitignore‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎benchmarks/cpp/utils/utils.cpp‎
Lines changed: 11 additions & 4 deletions b/‎benchmarks/cpp/utils/utils.cpp‎
Lines changed: 11 additions & 4 deletions
diff --git a/‎benchmarks/cpp/utils/utils.h‎
Lines changed: 7 additions & 2 deletions b/‎benchmarks/cpp/utils/utils.h‎
Lines changed: 7 additions & 2 deletions
diff --git a/‎cpp/include/tensorrt_llm/common/algorithm.h‎
Lines changed: 4 additions & 3 deletions b/‎cpp/include/tensorrt_llm/common/algorithm.h‎
Lines changed: 4 additions & 3 deletions
diff --git a/‎cpp/include/tensorrt_llm/common/arrayView.h‎
Lines changed: 8 additions & 2 deletions b/‎cpp/include/tensorrt_llm/common/arrayView.h‎
Lines changed: 8 additions & 2 deletions
diff --git a/‎cpp/include/tensorrt_llm/common/assert.h‎
Lines changed: 11 additions & 6 deletions b/‎cpp/include/tensorrt_llm/common/assert.h‎
Lines changed: 11 additions & 6 deletions
diff --git a/‎cpp/include/tensorrt_llm/common/bindingUtils.h‎
Lines changed: 8 additions & 2 deletions b/‎cpp/include/tensorrt_llm/common/bindingUtils.h‎
Lines changed: 8 additions & 2 deletions
diff --git a/‎cpp/include/tensorrt_llm/common/config.h‎
Lines changed: 62 additions & 0 deletions b/‎cpp/include/tensorrt_llm/common/config.h‎
Lines changed: 62 additions & 0 deletions
diff --git a/‎cpp/include/tensorrt_llm/common/cudaFp8Utils.h‎
Lines changed: 6 additions & 3 deletions b/‎cpp/include/tensorrt_llm/common/cudaFp8Utils.h‎
Lines changed: 6 additions & 3 deletions
@@ -12,3 +12,5 @@ tests/integration/test_input_files/*.jpg filter=lfs diff=lfs merge=lfs -text
 docs/source/blogs/media/tech_blog10_baseline_performance_detail.png filter=lfs diff=lfs merge=lfs -text
 docs/source/blogs/media/tech_blog10_full_strategy_performance.png filter=lfs diff=lfs merge=lfs -text
 docs/source/blogs/media/tech_blog10_context_wait_performance.png  filter=lfs diff=lfs merge=lfs -text
+cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/kernelMetaInfo_cubin.cpp filter=lfs diff=lfs merge=lfs -text
+cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/cubin/xqa_kernel_cubin.cpp filter=lfs diff=lfs merge=lfs -text
@@ -74,6 +74,7 @@ llm-test-workspace/
 cpp/include/tensorrt_llm/executor/version.h
 cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmha_v2_cu/
 cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_cubin.h
+cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_cubin.cpp
 .devcontainer/.env
 /examples/layer_wise_benchmarks/profiles/
 
 
@@ -1,6 +1,7 @@
 
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION &
+ * AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -17,13 +18,16 @@
  */
 
 #include "utils.h"
+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/common/logger.h"
 #include <random>
 
 #include <filesystem>
 #include <fstream>
 
-namespace tensorrt_llm::benchmark
+TRTLLM_NAMESPACE_BEGIN
+
+namespace benchmark
 {
 
 std::vector<std::vector<SizeType32>> parseVectorOfVectors(std::string const& input)
@@ -98,7 +102,8 @@ Samples parseWorkloadJson(
     if (samples.size() < maxNumSamples)
     {
         TLLM_LOG_WARNING(
-            "Dataset size %zu is smaller than given max_num_samples %d, max_num_samples will be ignored.\n",
+            "Dataset size %zu is smaller than given max_num_samples "
+            "%d, max_num_samples will be ignored.\n",
             samples.size(), maxNumSamples);
     }
     return samples;
@@ -160,4 +165,6 @@ std::ostream& operator<<(std::ostream& os, RecordBwMetric const& metric)
     return os;
 }
 
-} // namespace tensorrt_llm::benchmark
+} // namespace benchmark
+
+TRTLLM_NAMESPACE_END
@@ -16,6 +16,7 @@
  * limitations under the License.
  */
 
+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/executor/executor.h"
 
 #include <cstdint>
@@ -29,7 +30,9 @@
 
 #pragma once
 
-namespace tensorrt_llm::benchmark
+TRTLLM_NAMESPACE_BEGIN
+
+namespace benchmark
 {
 
 // using namespace tensorrt_llm::batch_manager;
@@ -237,4 +240,6 @@ std::vector<double> generateRandomExponentialValues(int count, float lambda, int
 
 std::vector<double> computeTimeDelays(BenchmarkParams const& benchmarkParams, int numDelays);
 
-} // namespace tensorrt_llm::benchmark
+} // namespace benchmark
+
+TRTLLM_NAMESPACE_END
@@ -16,8 +16,9 @@
 
 #pragma once
 
-namespace tensorrt_llm
-{
+#include "tensorrt_llm/common/config.h"
+
+TRTLLM_NAMESPACE_BEGIN
 
 // Base class for algorithms
 struct Algorithm
@@ -29,4 +30,4 @@ struct Algorithm
     Algorithm& operator=(Algorithm const&) = delete;
 };
 
-} // namespace tensorrt_llm
+TRTLLM_NAMESPACE_END
@@ -17,9 +17,13 @@
 #pragma once
 
 #include "tensorrt_llm/common/assert.h"
+#include "tensorrt_llm/common/config.h"
+
 #include <cstdint>
 
-namespace tensorrt_llm::common
+TRTLLM_NAMESPACE_BEGIN
+
+namespace common
 {
 
 //!
@@ -100,4 +104,6 @@ class ArrayView
     size_type mSize;
 };
 
-} // namespace tensorrt_llm::common
+} // namespace common
+
+TRTLLM_NAMESPACE_END
@@ -16,14 +16,19 @@
 
 #pragma once
 
+#include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/common/tllmException.h"
 
+TRTLLM_NAMESPACE_BEGIN
+
 class DebugConfig
 {
 public:
     static bool isCheckDebugEnabled();
 };
 
+TRTLLM_NAMESPACE_END
+
 #if defined(_WIN32)
 #define TLLM_LIKELY(x) (__assume((x) == 1), (x))
 #define TLLM_UNLIKELY(x) (__assume((x) == 0), (x))
@@ -35,8 +40,8 @@ class DebugConfig
 #define TLLM_CHECK(val)                                                                                                \
     do                                                                                                                 \
     {                                                                                                                  \
-        TLLM_LIKELY(static_cast<bool>(val)) ? ((void) 0)                                                               \
-                                            : tensorrt_llm::common::throwRuntimeError(__FILE__, __LINE__, #val);       \
+        TLLM_LIKELY(static_cast<bool>(val))                                                                            \
+        ? ((void) 0) : tensorrt_llm::common::throwRuntimeError(__FILE__, __LINE__, #val);                              \
     } while (0)
 
 #define TLLM_CHECK_WITH_INFO(val, info, ...)                                                                           \
@@ -51,17 +56,17 @@ class DebugConfig
 #define TLLM_CHECK_DEBUG(val)                                                                                          \
     do                                                                                                                 \
     {                                                                                                                  \
-        if (TLLM_UNLIKELY(DebugConfig::isCheckDebugEnabled()))                                                         \
+        if (TLLM_UNLIKELY(tensorrt_llm::DebugConfig::isCheckDebugEnabled()))                                           \
         {                                                                                                              \
-            TLLM_LIKELY(static_cast<bool>(val)) ? ((void) 0)                                                           \
-                                                : tensorrt_llm::common::throwRuntimeError(__FILE__, __LINE__, #val);   \
+            TLLM_LIKELY(static_cast<bool>(val))                                                                        \
+            ? ((void) 0) : tensorrt_llm::common::throwRuntimeError(__FILE__, __LINE__, #val);                          \
         }                                                                                                              \
     } while (0)
 
 #define TLLM_CHECK_DEBUG_WITH_INFO(val, info, ...)                                                                     \
     do                                                                                                                 \
     {                                                                                                                  \
-        if (TLLM_UNLIKELY(DebugConfig::isCheckDebugEnabled()))                                                         \
+        if (TLLM_UNLIKELY(tensorrt_llm::DebugConfig::isCheckDebugEnabled()))                                           \
         {                                                                                                              \
             TLLM_LIKELY(static_cast<bool>(val))                                                                        \
             ? ((void) 0)                                                                                               \
 
@@ -17,9 +17,13 @@
 #pragma once
 
 #include "c10/util/intrusive_ptr.h"
+#include "tensorrt_llm/common/config.h"
+
 #include <Python.h>
 
-namespace tensorrt_llm::common
+TRTLLM_NAMESPACE_BEGIN
+
+namespace common
 {
 
 // Adapted from pybind11's example implementation:
@@ -69,4 +73,6 @@ c10::intrusive_ptr<T> get_intrusive_ptr(PyObject* py_obj, std::string pybind11_a
     return c10::intrusive_ptr<T>::reclaim_copy(p);
 }
 
-} // namespace tensorrt_llm::common
+} // namespace common
+
+TRTLLM_NAMESPACE_END
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2022-2025, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+#ifndef TRTLLM_CONFIG_H
+#define TRTLLM_CONFIG_H
+
+/**
+ * \def TRTLLM_ABI_NAMESPACE
+ * Name of the inline ABI-version namespace that TRTLLM_ABI_NAMESPACE_BEGIN opens.
+ * Define this macro before this header is first included to select another name.
+ * The default ABI version is _v1.
+ */
+#ifndef TRTLLM_ABI_NAMESPACE
+#define TRTLLM_ABI_NAMESPACE _v1
+#endif
+
+#ifndef TRTLLM_ABI_NAMESPACE_BEGIN
+#define TRTLLM_ABI_NAMESPACE_BEGIN                                                                                     \
+    inline namespace TRTLLM_ABI_NAMESPACE                                                                              \
+    {
+#endif
+
+#ifndef TRTLLM_ABI_NAMESPACE_END
+#define TRTLLM_ABI_NAMESPACE_END }
+#endif
+
+/**
+ * \def TRTLLM_NAMESPACE_BEGIN
+ * This macro opens a `tensorrt_llm::` namespace block and, nested inside it, the
+ * inline ABI namespace via TRTLLM_ABI_NAMESPACE_BEGIN; pair with TRTLLM_NAMESPACE_END.
+ * This macro is defined by TensorRT-LLM and may not be overridden.
+ */
+#define TRTLLM_NAMESPACE_BEGIN                                                                                         \
+    namespace tensorrt_llm                                                                                             \
+    {                                                                                                                  \
+    TRTLLM_ABI_NAMESPACE_BEGIN
+
+/**
+ * \def TRTLLM_NAMESPACE_END
+ * This macro closes the block opened by TRTLLM_NAMESPACE_BEGIN: first the inline
+ * ABI namespace (TRTLLM_ABI_NAMESPACE_END), then the `tensorrt_llm` namespace.
+ * This macro is defined by TensorRT-LLM and may not be overridden.
+ */
+#define TRTLLM_NAMESPACE_END                                                                                           \
+    TRTLLM_ABI_NAMESPACE_END                                                                                           \
+    }  /* end namespace tensorrt_llm */
+
+#endif // TRTLLM_CONFIG_H
@@ -16,6 +16,8 @@
 
 #pragma once
 
+#include "tensorrt_llm/common/config.h"
+
 #ifdef ENABLE_FP8
 #include <cuda_fp8.h>
 #include <cuda_runtime.h>
@@ -29,8 +31,8 @@
 #define USE_QGMMA
 #endif
 
-namespace tensorrt_llm
-{
+TRTLLM_NAMESPACE_BEGIN
+
 namespace common
 {
 
@@ -320,5 +322,6 @@ void invokeComputeScalesAndQuantizeMatrix(T_OUT* output, T_S* quant_ptr, const T
     const int64_t lda, QuantizeMode quantize_mode, cudaStream_t stream);
 
 } // namespace common
-} // namespace tensorrt_llm
+
+TRTLLM_NAMESPACE_END
 #endif // ENABLE_FP8