
Commit 57079ce

[None][chore] Rename TensorRT-LLM to TensorRT LLM for source code. (#7851)
Signed-off-by: nv-guomingz <[email protected]>
1 parent 68b7900 commit 57079ce

File tree

148 files changed: +311 −311 lines


README.md

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@ TensorRT LLM
 * [08/01] Scaling Expert Parallelism in TensorRT LLM (Part 2: Performance Status and Optimization)
 [➡️ link](./docs/source/blogs/tech_blog/blog8_Scaling_Expert_Parallelism_in_TensorRT-LLM_part2.md)
 
-* [07/26] N-Gram Speculative Decoding in TensorRT-LLM
+* [07/26] N-Gram Speculative Decoding in TensorRT LLM
 [➡️ link](./docs/source/blogs/tech_blog/blog7_NGram_performance_Analysis_And_Auto_Enablement.md)
 
 * [06/19] Disaggregated Serving in TensorRT LLM

benchmarks/cpp/bertBenchmark.cpp

Lines changed: 1 addition & 1 deletion
@@ -135,7 +135,7 @@ void benchmarkBert(std::string const& modelName, std::filesystem::path const& da
 
 int main(int argc, char* argv[])
 {
-    cxxopts::Options options("TensorRT-LLM C++ Runtime Benchmark", "TensorRT-LLM C++ Runtime Benchmark for BERT.");
+    cxxopts::Options options("TensorRT LLM C++ Runtime Benchmark", "TensorRT LLM C++ Runtime Benchmark for BERT.");
     options.add_options()("h,help", "Print usage");
     options.add_options()(
         "m,model", "Model name specified for engines.", cxxopts::value<std::string>()->default_value("bert_base"));

benchmarks/cpp/disaggServerBenchmark.cpp

Lines changed: 1 addition & 1 deletion
@@ -1145,7 +1145,7 @@ void benchmark(std::vector<std::filesystem::path> const& contextEngineDirs,
 int main(int argc, char* argv[])
 
 {
-    cxxopts::Options options("TensorRT-LLm DisaggServer Benchmark");
+    cxxopts::Options options("TensorRT LLM DisaggServer Benchmark");
     options.add_options()("h,help", "Print usage");
     options.add_options()("context_engine_dirs", "Directories that store context engines,separator is a ,",
         cxxopts::value<std::vector<std::string>>());
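Note this change also fixes the old "TensorRT-LLm" casing typo. A standalone sketch (not from this commit) of how the comma-separated --context_engine_dirs option is consumed; cxxopts splits vector-typed values on ',' by default:

#include <cxxopts.hpp>
#include <iostream>
#include <string>
#include <vector>

int main(int argc, char* argv[])
{
    cxxopts::Options options("TensorRT LLM DisaggServer Benchmark");
    options.add_options()("context_engine_dirs", "Directories that store context engines,separator is a ,",
        cxxopts::value<std::vector<std::string>>());

    // Example invocation: --context_engine_dirs=/engines/ctx0,/engines/ctx1
    auto result = options.parse(argc, argv);
    for (auto const& dir : result["context_engine_dirs"].as<std::vector<std::string>>())
    {
        std::cout << "context engine dir: " << dir << std::endl;
    }
    return 0;
}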

benchmarks/cpp/gptManagerBenchmark.cpp

Lines changed: 1 addition & 1 deletion
@@ -1055,7 +1055,7 @@ void benchmarkExecutor(std::optional<std::filesystem::path> const& decoderEngine
 int main(int argc, char* argv[])
 {
     cxxopts::Options options(
-        "TensorRT-LLM BatchManager Benchmark", "TensorRT-LLM BatchManager Benchmark for GPT and GPT-like models.");
+        "TensorRT LLM BatchManager Benchmark", "TensorRT LLM BatchManager Benchmark for GPT and GPT-like models.");
     options.add_options()("h,help", "Print usage");
     options.add_options()("engine_dir, decoder_engine_dir", "Directory that store the engines of decoder models.",
         cxxopts::value<std::string>());

cpp/include/tensorrt_llm/deep_gemm/compiler.cuh

Lines changed: 1 addition & 1 deletion
@@ -217,7 +217,7 @@ std::vector<std::filesystem::path> getJitIncludeDirs()
         }
         else
         {
-            TLLM_LOG_WARNING("Failed to find TensorRT-LLM installation, DeepGEMM will be disabled.");
+            TLLM_LOG_WARNING("Failed to find TensorRT LLM installation, DeepGEMM will be disabled.");
         }
     }
     return includeDirs;
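The renamed warning sits in a find-or-disable fallback: the JIT include dirs are only populated when the installation is located, otherwise DeepGEMM is disabled with a warning. A minimal sketch of that pattern, with a stand-in logger and a hypothetical probe path (the real code resolves the installed package location):

#include <filesystem>
#include <iostream>
#include <vector>

// Stand-in for TLLM_LOG_WARNING, which lives in TensorRT LLM's logging utilities.
#define LOG_WARNING(msg) (std::cerr << "[WARNING] " << (msg) << std::endl)

std::vector<std::filesystem::path> getJitIncludeDirs()
{
    std::vector<std::filesystem::path> includeDirs;
    std::filesystem::path root = "/usr/local/tensorrt_llm/include"; // hypothetical location
    if (std::filesystem::exists(root))
    {
        includeDirs.push_back(root);
    }
    else
    {
        LOG_WARNING("Failed to find TensorRT LLM installation, DeepGEMM will be disabled.");
    }
    return includeDirs;
}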

cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp

Lines changed: 1 addition & 1 deletion
@@ -165,7 +165,7 @@ CacheTransceiver::CacheTransceiver(kv_cache_manager::BaseKVCacheManager* cacheMa
     {
         void* ret = dllGetSym(handle, name);
         TLLM_CHECK_WITH_INFO(ret != nullptr,
-            "Unable to load UCX wrapper library symbol, possible cause is that TensorRT-LLM library is not "
+            "Unable to load UCX wrapper library symbol, possible cause is that TensorRT LLM library is not "
             "built with UCX support, please rebuild in UCX-enabled environment.");
         return ret;
     };
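A minimal sketch, assuming a POSIX platform, of the checked symbol-lookup pattern behind dllGetSym (the function name and message below are illustrative, not the project's API):

#include <dlfcn.h>
#include <stdexcept>
#include <string>

void* getSymChecked(void* handle, char const* name)
{
    // dlsym returns nullptr when the symbol is absent; mirror TLLM_CHECK_WITH_INFO
    // by failing loudly with an actionable message.
    void* ret = dlsym(handle, name);
    if (ret == nullptr)
    {
        throw std::runtime_error(std::string("Unable to load symbol '") + name
            + "'; the library may not be built with UCX support.");
    }
    return ret;
}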

cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/fp4_gemm_template.h

Lines changed: 15 additions & 15 deletions
@@ -105,7 +105,7 @@ size_t dispatchNVFP4xNVFP4GemmClusterShapeSm100(T* D, void const* A, void const*
         break;
     default:
         throw std::runtime_error(
-            "[TensorRT-LLM Error][FP4][dispatch_gemm_cluster_shape] Config is invalid for FP4 GEMM.");
+            "[TensorRT LLM Error][FP4][dispatch_gemm_cluster_shape] Config is invalid for FP4 GEMM.");
         break;
     }
 }
@@ -146,15 +146,15 @@ size_t dispatchNVFP4xNVFP4GemmCTAShapeSm100(T* D, void const* A, void const* B,
             occupancy);
         break;
     case tkc::CutlassTileConfigSM100::Undefined:
-        throw std::runtime_error("[TensorRT-LLM Error][FP4][dispatch_gemm_cta_shape] Gemm config undefined.");
+        throw std::runtime_error("[TensorRT LLM Error][FP4][dispatch_gemm_cta_shape] Gemm config undefined.");
         break;
     case tkc::CutlassTileConfigSM100::ChooseWithHeuristic:
         throw std::runtime_error(
-            "[TensorRT-LLM Error][FP4][dispatch_gemm_cta_shape] Gemm config should have already been set by "
+            "[TensorRT LLM Error][FP4][dispatch_gemm_cta_shape] Gemm config should have already been set by "
             "heuristic.");
         break;
     default:
-        throw std::runtime_error("[TensorRT-LLM Error][FP4][dispatch_gemm_cta_shape] Config is invalid for FP4 GEMM.");
+        throw std::runtime_error("[TensorRT LLM Error][FP4][dispatch_gemm_cta_shape] Config is invalid for FP4 GEMM.");
         break;
     }
 }
@@ -177,7 +177,7 @@ size_t dispatchNVFP4xNVFP4GemmClusterShapeSm120(T* D, void const* A, void const*
         break;
     default:
         throw std::runtime_error(
-            "[TensorRT-LLM Error][FP4][dispatch_gemm_cluster_shape] Config is invalid for FP4 GEMM.");
+            "[TensorRT LLM Error][FP4][dispatch_gemm_cluster_shape] Config is invalid for FP4 GEMM.");
         break;
     }
 }
@@ -205,16 +205,16 @@ size_t dispatchNVFP4xNVFP4GemmCTAShapeSm120(T* D, void const* A, void const* B,
             occupancy);
         break;
     case tkc::CutlassTileConfigSM120::Undefined:
-        throw std::runtime_error("[TensorRT-LLM Error][FP4][sm120][dispatch_gemm_cta_shape] Gemm config undefined.");
+        throw std::runtime_error("[TensorRT LLM Error][FP4][sm120][dispatch_gemm_cta_shape] Gemm config undefined.");
         break;
     case tkc::CutlassTileConfigSM120::ChooseWithHeuristic:
         throw std::runtime_error(
-            "[TensorRT-LLM Error][FP4][sm120][dispatch_gemm_cta_shape] Gemm config should have already been set by "
+            "[TensorRT LLM Error][FP4][sm120][dispatch_gemm_cta_shape] Gemm config should have already been set by "
             "heuristic.");
         break;
     default:
         throw std::runtime_error(
-            "[TensorRT-LLM Error][FP4][sm120][dispatch_gemm_cta_shape] Config is invalid for FP4 GEMM.");
+            "[TensorRT LLM Error][FP4][sm120][dispatch_gemm_cta_shape] Config is invalid for FP4 GEMM.");
         break;
     }
 }
@@ -257,7 +257,7 @@ size_t dispatchMXFP8xMXFP4GemmClusterShapeSm100(T* D, void const* A, void const*
         break;
     default:
         throw std::runtime_error(
-            "[TensorRT-LLM Error][FP4][dispatch_gemm_cluster_shape] Config is invalid for FP4 GEMM.");
+            "[TensorRT LLM Error][FP4][dispatch_gemm_cluster_shape] Config is invalid for FP4 GEMM.");
         break;
     }
 }
@@ -293,15 +293,15 @@ size_t dispatchMXFP8xMXFP4GemmCTAShapeSm100(T* D, void const* A, void const* B,
             occupancy);
         break;
     case tkc::CutlassTileConfigSM100::Undefined:
-        throw std::runtime_error("[TensorRT-LLM Error][FP4][dispatch_gemm_cta_shape] Gemm config undefined.");
+        throw std::runtime_error("[TensorRT LLM Error][FP4][dispatch_gemm_cta_shape] Gemm config undefined.");
         break;
     case tkc::CutlassTileConfigSM100::ChooseWithHeuristic:
         throw std::runtime_error(
-            "[TensorRT-LLM Error][FP4][dispatch_gemm_cta_shape] Gemm config should have already been set by "
+            "[TensorRT LLM Error][FP4][dispatch_gemm_cta_shape] Gemm config should have already been set by "
             "heuristic.");
         break;
     default:
-        throw std::runtime_error("[TensorRT-LLM Error][FP4][dispatch_gemm_cta_shape] Config is invalid for FP4 GEMM.");
+        throw std::runtime_error("[TensorRT LLM Error][FP4][dispatch_gemm_cta_shape] Config is invalid for FP4 GEMM.");
         break;
     }
 }
@@ -338,7 +338,7 @@ size_t CutlassFp4GemmRunner<T, fp4GemmType>::dispatchToArch(T* D, void const* A,
     else
     {
         throw std::runtime_error(
-            "[TensorRT-LLM Error][CutlassFp4GemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS FP4 GEMM");
+            "[TensorRT LLM Error][CutlassFp4GemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS FP4 GEMM");
     }
 }
 else if constexpr (fp4GemmType == FP4GemmType::W4A4_NVFP4_NVFP4)
@@ -356,13 +356,13 @@ size_t CutlassFp4GemmRunner<T, fp4GemmType>::dispatchToArch(T* D, void const* A,
     else
     {
         throw std::runtime_error(
-            "[TensorRT-LLM Error][CutlassFp4GemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS FP4 GEMM");
+            "[TensorRT LLM Error][CutlassFp4GemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS FP4 GEMM");
     }
 }
 else
 {
     throw std::runtime_error(
-        "[TensorRT-LLM Error][CutlassFp4GemmRunner][GEMM Dispatch] FP4 Gemm type unsupported for CUTLASS FP4 GEMM");
+        "[TensorRT LLM Error][CutlassFp4GemmRunner][GEMM Dispatch] FP4 Gemm type unsupported for CUTLASS FP4 GEMM");
 }
 
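All of the renamed strings in this file live in one dispatch pattern: a switch over the tile-config enum in which Undefined, ChooseWithHeuristic, and the default arm each throw. A minimal standalone sketch of that shape (the enum, values, and function name below are illustrative, not the project's types):

#include <cstddef>
#include <stdexcept>

enum class TileConfig
{
    Undefined,
    ChooseWithHeuristic,
    Shape128x128,
};

// Illustrative stand-in for the dispatch*GemmCTAShape* family: a valid shape
// dispatches to a shape-specialized launcher, everything else throws.
size_t dispatchByTileConfig(TileConfig config)
{
    switch (config)
    {
    case TileConfig::Shape128x128:
        return 0; // would call the shape-specialized GEMM launcher here
    case TileConfig::Undefined:
        throw std::runtime_error("[TensorRT LLM Error][FP4][dispatch_gemm_cta_shape] Gemm config undefined.");
    case TileConfig::ChooseWithHeuristic:
        throw std::runtime_error(
            "[TensorRT LLM Error][FP4][dispatch_gemm_cta_shape] Gemm config should have already been set by "
            "heuristic.");
    default:
        throw std::runtime_error("[TensorRT LLM Error][FP4][dispatch_gemm_cta_shape] Config is invalid for FP4 GEMM.");
    }
}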

cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/mxfp8_mxfp4_gemm_template_sm100.h

Lines changed: 6 additions & 6 deletions
@@ -93,7 +93,7 @@ size_t genericMXFP8xMXFP4GemmKernelLauncher(void* D, void const* A, void const*
     int* occupancy)
 {
     throw std::runtime_error(
-        "[TensorRT-LLM Error][FP4 gemm Runner] TensorRT-LLM is not compiled with support for this Architecture.");
+        "[TensorRT LLM Error][FP4 gemm Runner] TensorRT LLM is not compiled with support for this Architecture.");
 }
 
 #else
@@ -250,7 +250,7 @@ size_t genericMXFP8xMXFP4GemmKernelLauncher(void* D, void const* A, void const*
     {
         std::string errMsg = "SMEM size exceeds maximum allowed. Required " + std::to_string(smem_size) + ", got "
             + std::to_string(mMaxSmemSize);
-        throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg);
+        throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg);
     }
     /* // Return workspace size */
     if (!A && !B && !D)
@@ -261,28 +261,28 @@ size_t genericMXFP8xMXFP4GemmKernelLauncher(void* D, void const* A, void const*
     {
         std::string errMsg("Requested workspace size insufficient. Required "
             + std::to_string(gemm.get_workspace_size(args)) + ", got " + std::to_string(workspaceBytes));
-        throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg);
+        throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg);
     }
     auto can_implement = gemm.can_implement(args);
     if (can_implement != cutlass::Status::kSuccess)
     {
         std::string errMsg = "MXFP8xMXFP4 Gemm cutlass kernel will fail for params. Error: "
             + std::string(cutlassGetStatusString(can_implement));
-        throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg);
+        throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg);
     }
     auto initStatus = gemm.initialize(args, workspace, stream);
     if (initStatus != cutlass::Status::kSuccess)
     {
         std::string errMsg = "Failed to initialize cutlass MXFP8xMXFP4 gemm. Error: "
             + std::string(cutlassGetStatusString(initStatus));
-        throw std::runtime_error("[TensorRT-LLM Error][MXFP8xMXFP4 gemm Runner] " + errMsg);
+        throw std::runtime_error("[TensorRT LLM Error][MXFP8xMXFP4 gemm Runner] " + errMsg);
     }
     auto runStatus = gemm.run(args, workspace, stream, nullptr, tensorrt_llm::common::getEnvEnablePDL());
     if (runStatus != cutlass::Status::kSuccess)
     {
         std::string errMsg
             = "Failed to run cutlass MXFP8xMXFP4 gemm. Error: " + std::string(cutlassGetStatusString(runStatus));
-        throw std::runtime_error("[TensorRT-LLM Error][MXFP8xMXFP4 gemm Runner] " + errMsg);
+        throw std::runtime_error("[TensorRT LLM Error][MXFP8xMXFP4 gemm Runner] " + errMsg);
     }
     return gemm.get_workspace_size(args);
 }
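The renamed messages above all follow the launcher's status-check chain: each CUTLASS stage (can_implement, initialize, run) reports a Status, and any failure becomes a tagged runtime_error. A minimal sketch of that chain, with a mock Status and a template Gemm standing in for the CUTLASS kernel (the real code uses cutlass::Status and cutlassGetStatusString):

#include <cstddef>
#include <stdexcept>
#include <string>

enum class Status { kSuccess, kErrorInternal };

char const* statusString(Status s) { return s == Status::kSuccess ? "success" : "internal error"; }

template <typename Gemm, typename Args>
std::size_t runGemmChecked(Gemm& gemm, Args const& args)
{
    Status s = gemm.can_implement(args);
    if (s != Status::kSuccess)
    {
        throw std::runtime_error(
            std::string("[TensorRT LLM Error][gemm Runner] kernel will fail for params. Error: ") + statusString(s));
    }
    s = gemm.initialize(args);
    if (s != Status::kSuccess)
    {
        throw std::runtime_error(
            std::string("[TensorRT LLM Error][gemm Runner] failed to initialize. Error: ") + statusString(s));
    }
    s = gemm.run(args);
    if (s != Status::kSuccess)
    {
        throw std::runtime_error(
            std::string("[TensorRT LLM Error][gemm Runner] failed to run. Error: ") + statusString(s));
    }
    return gemm.get_workspace_size(args);
}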

cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/nvfp4_nvfp4_gemm_template_sm100.h

Lines changed: 6 additions & 6 deletions
@@ -107,7 +107,7 @@ size_t genericFp4GemmKernelLauncher(void* D, void const* A, void const* B, void
         int* occupancy) \
     { \
         throw std::runtime_error( \
-            "[TensorRT-LLM Error][FP4 gemm Runner] TensorRT-LLM is not compiled with support for this Architecture."); \
+            "[TensorRT LLM Error][FP4 gemm Runner] TensorRT LLM is not compiled with support for this Architecture."); \
     }
 
 #else
@@ -268,7 +268,7 @@ size_t genericFp4GemmKernelLauncher(void* D, void const* A, void const* B, void
     { \
         std::string errMsg = "SMEM size exceeds maximum allowed. Required " + std::to_string(smem_size) + ", got " \
             + std::to_string(mMaxSmemSize); \
-        throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg); \
+        throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg); \
     } \
     /* // Return workspace size */ \
     if (!A && !B && !D) \
@@ -279,28 +279,28 @@ size_t genericFp4GemmKernelLauncher(void* D, void const* A, void const* B, void
     { \
         std::string errMsg("Requested workspace size insufficient. Required " \
             + std::to_string(gemm.get_workspace_size(args)) + ", got " + std::to_string(workspaceBytes)); \
-        throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg); \
+        throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg); \
     } \
     auto can_implement = gemm.can_implement(args); \
     if (can_implement != cutlass::Status::kSuccess) \
     { \
         std::string errMsg = "FP4 Gemm cutlass kernel will fail for params. Error: " \
             + std::string(cutlassGetStatusString(can_implement)); \
-        throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg); \
+        throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg); \
     } \
     auto initStatus = gemm.initialize(args, workspace, stream); \
     if (initStatus != cutlass::Status::kSuccess) \
     { \
         std::string errMsg \
             = "Failed to initialize cutlass FP4 gemm. Error: " + std::string(cutlassGetStatusString(initStatus)); \
-        throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg); \
+        throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg); \
     } \
     auto runStatus = gemm.run(args, workspace, stream, nullptr, tensorrt_llm::common::getEnvEnablePDL()); \
     if (runStatus != cutlass::Status::kSuccess) \
     { \
         std::string errMsg \
             = "Failed to run cutlass FP4 gemm. Error: " + std::string(cutlassGetStatusString(runStatus)); \
-        throw std::runtime_error("[TensorRT-LLM Error][FP4 gemm Runner] " + errMsg); \
+        throw std::runtime_error("[TensorRT LLM Error][FP4 gemm Runner] " + errMsg); \
     } \
     return gemm.get_workspace_size(args); \
 }
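Unlike the MXFP8xMXFP4 variant, every line in this file's launcher body ends in a backslash because the whole body is a preprocessor macro, stamped out once per instantiation. A toy illustration of that structure (the macro and function names below are made up, not the project's actual macro):

#include <cstddef>
#include <stdexcept>

// Expands to a function definition that always throws, mirroring the
// "not compiled with support for this Architecture" fallback path.
#define DEFINE_UNSUPPORTED_LAUNCHER(NAME) \
    size_t NAME() \
    { \
        throw std::runtime_error( \
            "[TensorRT LLM Error][FP4 gemm Runner] TensorRT LLM is not compiled with " \
            "support for this Architecture."); \
    }

DEFINE_UNSUPPORTED_LAUNCHER(genericFp4GemmKernelLauncherStub)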
