Skip to content

Commit 70582e0

Browse files
[mlir-tensorrt] Integrate internal changes
--- [compiler] Add cuda.get_program_device op Introduce `cuda.get_program_device` as a pure/speculatable way to map a program logical device id (i32) to a CUDA device ordinal (i32). GitOrigin-RevId: 00512cc5a9e9c61023e1d9de734b2383da369bcf --- [compiler] Refactor device management and stream creation utilities This commit introduces a new device management model to support multi-device SPMD and MPMD programs and refactors stream creation to use reusable utility functions. The primary motivation is to enable more flexible device assignment where programs can be assigned to specific CUDA ordinals via logical device IDs, laying the groundwork for better multi-device support. GitOrigin-RevId: 447b72743e64f394671f866fcdfdb0d6f0f3d579 ---[compiler|executor] Refactor plugin call stream handling This change refactors how CUDA streams are handled for plugin calls in the executor dialect. Previously, when no stream was provided to a CallPluginOp, the lowering would create and use a global CUDA stream (stream0). This approach had several issues: 1. It tightly coupled the executor dialect to CUDA-specific stream creation 2. It required maintaining global stream state across compilation 3. It made the stream handling implicit and harder to reason about The new approach uses null streams (nullptr) when no explicit stream is provided. This is the standard CUDA convention where a null stream represents the default stream. The changes include: - Modified `executor.call_plugin` op to accept any type for the stream operand (not just `!executor.ptr<host>`), allowing frontend dialects to pass their own stream representations (e.g. 
`!cuda.stream`) - Updated the assembly format to print the stream type for clarity - Removed `getGlobalCudaStream` helper method from ConvertToExecutorPattern - Changed CallPluginConversionPattern to create a null pointer (inttoptr 0) when no stream is provided instead of creating a global stream - Updated StablehloToPlan conversion to use `cuda::getOrCreateDefaultStream0` to explicitly create CUDA streams when converting TVM FFI custom calls - Added CUDADialect dependency to StablehloToPlan pass and CMakeLists This makes stream handling more explicit and flexible, allowing different frontend dialects to manage their own stream creation while falling back to null streams (CUDA default stream) when appropriate. GitOrigin-RevId: 764238bc58308d5d284f8e32da91c7e5f90fdf0c
1 parent ef65735 commit 70582e0

File tree

71 files changed

+774
-433
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

71 files changed

+774
-433
lines changed
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
//===- PassManagerUtils.h --------------------------------------*- C++ -*-===//
2+
//
3+
// SPDX-FileCopyrightText: Copyright 2026 NVIDIA CORPORATION & AFFILIATES.
4+
// All rights reserved.
5+
// SPDX-License-Identifier: Apache-2.0
6+
//
7+
// Licensed under the Apache License, Version 2.0 (the "License");
8+
// you may not use this file except in compliance with the License.
9+
// You may obtain a copy of the License at
10+
//
11+
// http://www.apache.org/licenses/LICENSE-2.0
12+
//
13+
// Unless required by applicable law or agreed to in writing, software
14+
// distributed under the License is distributed on an "AS IS" BASIS,
15+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16+
// See the License for the specific language governing permissions and
17+
// limitations under the License.
18+
//
19+
//===----------------------------------------------------------------------===//
20+
///
21+
/// Utilities for mlir::OpPassManager.
22+
///
23+
//===----------------------------------------------------------------------===//
24+
#ifndef MLIR_TENSORRT_COMMON_UTILS_PASSMANAGERUTILS
25+
#define MLIR_TENSORRT_COMMON_UTILS_PASSMANAGERUTILS
26+
27+
#include "mlir/Pass/PassManager.h"
28+
29+
namespace mlir {
30+
31+
/// Add nested passes to the given pass manager for the given operation type.
32+
template <typename OpT>
33+
static void
34+
addNestedPasses(OpPassManager &pm,
35+
llvm::function_ref<void(OpPassManager &)> addPasses) {
36+
auto &nestedPM = pm.nest<OpT>();
37+
addPasses(nestedPM);
38+
}
39+
40+
} // namespace mlir
41+
42+
#endif // MLIR_TENSORRT_COMMON_UTILS_PASSMANAGERUTILS

mlir-tensorrt/compiler/include/mlir-tensorrt/Conversion/Passes.td

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,8 @@ def ConvertStablehloToPlanPass : Pass<"convert-stablehlo-to-plan", "::mlir::Modu
4444
"::mlir::cf::ControlFlowDialect",
4545
"::mlir::plan::PlanDialect",
4646
"::mlir::executor::ExecutorDialect",
47-
"::mlir::tensor::TensorDialect"
47+
"::mlir::tensor::TensorDialect",
48+
"::mlir::cuda::CUDADialect"
4849
];
4950
}
5051

mlir-tensorrt/compiler/include/mlir-tensorrt/Dialect/CUDA/IR/CUDAOps.td

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,23 @@ def CUDA_GetActiveDeviceOp : CUDA_Op<"get_active_device", [
9393
let assemblyFormat = "attr-dict";
9494
}
9595

96+
def CUDA_GetProgramDeviceOp : CUDA_Op<"get_program_device", [
97+
Pure, AlwaysSpeculatable]> {
98+
let summary = "returns the CUDA device ordinal associated with a program logical device";
99+
let description = [{
100+
Returns the CUDA device ordinal for the given program "logical device"
101+
identifier.
102+
103+
This operation is intended to support compilation modes where device
104+
selection is modeled explicitly and provided by the runtime when the
105+
program is loaded (e.g. via a constant mapping table). As a result, this
106+
operation is marked as being speculatable and side-effect free.
107+
}];
108+
let arguments = (ins I32:$logicalDevice);
109+
let results = (outs I32:$result);
110+
let assemblyFormat = "attr-dict $logicalDevice `:` type($result)";
111+
}
112+
96113
def CUDA_SetActiveDeviceOp : CUDA_Op<"set_active_device", [
97114
MemoryEffects<[MemWrite]>]> {
98115
let summary = "sets the active CUDA device context";
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
//===- CUDAUtils.h ----------------------------------------------*- C++ -*-===//
2+
//
3+
// SPDX-FileCopyrightText: Copyright 2026 NVIDIA CORPORATION & AFFILIATES.
4+
// All rights reserved.
5+
// SPDX-License-Identifier: Apache-2.0
6+
//
7+
// Licensed under the Apache License, Version 2.0 (the "License");
8+
// you may not use this file except in compliance with the License.
9+
// You may obtain a copy of the License at
10+
//
11+
// http://www.apache.org/licenses/LICENSE-2.0
12+
//
13+
// Unless required by applicable law or agreed to in writing, software
14+
// distributed under the License is distributed on an "AS IS" BASIS,
15+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16+
// See the License for the specific language governing permissions and
17+
// limitations under the License.
18+
//
19+
//===----------------------------------------------------------------------===//
20+
///
21+
/// Utility functions for the CUDA dialect.
22+
///
23+
//===----------------------------------------------------------------------===//
24+
#ifndef MLIR_TENSORRT_DIALECT_CUDA_UTILS_CUDAUTILS_H
25+
#define MLIR_TENSORRT_DIALECT_CUDA_UTILS_CUDAUTILS_H
26+
27+
#include "mlir/IR/Block.h"
28+
namespace mlir {
29+
class Operation;
30+
class Value;
31+
class Location;
32+
class RewriterBase;
33+
class PatternRewriter;
34+
35+
namespace cuda {
36+
37+
/// Create a default stream (stream 0) on device 0. This creates:
38+
/// - A constant 0 for the device index
39+
/// - A cuda.get_program_device operation
40+
/// - A cuda.stream.create operation with index 0
41+
Value createDefaultStream0(RewriterBase &rewriter, Location loc);
42+
43+
/// Go over the operations in Block (containing `anchor`) from the first operation
44+
/// in the Block to the point before `anchor`. If we find a `cuda.stream.create`
45+
/// operation matching the pattern produced by `createDefaultStream0`, return
46+
/// the result of that operation. Otherwise, call createDefaultStream0 to create
47+
/// a new stream at the beginning of the block.
48+
Value getOrCreateDefaultStream0(RewriterBase &rewriter, Operation *anchor);
49+
50+
/// Go over the operations in Block (containing `anchorPoint`) from the first
51+
/// operation in the Block to the point before `anchorPoint`. If we find a
52+
/// `cuda.stream.create` operation matching the pattern produced by
53+
/// `createDefaultStream0`, return the result of that operation. Otherwise, call
54+
/// createDefaultStream0 to create a new stream at the beginning of the block.
55+
Value getOrCreateDefaultStream0(RewriterBase &rewriter, Location loc,
56+
Block::iterator anchorPoint);
57+
58+
} // namespace cuda
59+
} // namespace mlir
60+
61+
#endif // MLIR_TENSORRT_DIALECT_CUDA_UTILS_CUDAUTILS_H

mlir-tensorrt/compiler/include/mlir-tensorrt/Dialect/Plan/Transforms/Passes.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ inline llvm::cl::ValuesClass createInputKindClOptions() {
4343
clEnumValN(InputKind::TensorRT, "tensorrt", "TensorRT IR"),
4444
clEnumValN(InputKind::Linalg, "linalg", "Linalg IR"));
4545
}
46+
4647
} // namespace detail
4748

4849
struct ClusterTargetOption;
@@ -83,7 +84,6 @@ struct PlanClusteringOptions : public mlir::OptionsGroup {
8384
void buildPlanSegmentationPipeline(OpPassManager &pm, int abiVersion,
8485
plan::InputKind inputKind,
8586
bool entrypointUsesAllocCConv,
86-
llvm::StringRef entrypoint,
8787
const plan::PlanClusteringOptions &opts);
8888

8989
struct PlanBufferizationOptions {

mlir-tensorrt/compiler/include/mlir-tensorrt/Dialect/Plan/Transforms/Passes.td

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -327,19 +327,19 @@ def ClusteringPass : Pass<"plan-clustering", "::mlir::ModuleOp"> {
327327
is to achieve a coarse segmentation that specifies how clusters of
328328
operations will be compiled.
329329

330+
The pass processes all functions in the module that:
331+
- Are not declarations or external functions
332+
- Do not already have a `plan.cluster_kind` attribute
333+
- Are not private functions with `plan.decomposition` attribute
334+
330335
The kinds of clusters that can be formed and the specific rules for
331336
clustering are defined by the clustering configuration specified
332337
by the module's `plan.backends` attribute. This is an array of
333338
attributes which all implement the
334339
[CompilerBackendAttrInterface](../IR/PlanInterfaces.td).
335340
}];
336341

337-
let options =
338-
[Option<"entrypoint", "entrypoint", "std::string", "\"\"",
339-
"the name of the entrypoint function; if empty then the "
340-
"clustering runs"
341-
" on all functions">,
342-
InputKindOption];
342+
let options = [InputKindOption];
343343
}
344344

345345
//===----------------------------------------------------------------------===//

mlir-tensorrt/compiler/include/mlir-tensorrt/Dialect/StablehloExt/Transforms/Passes.td

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,11 @@ def RefineShapesPass : Pass<"stablehlo-ext-refine-shapes", "ModuleOp"> {
106106
`stablehlo-refine-shapes` patterns as well as some additional patterns
107107
for handling `tensor.cast` operations.
108108
}];
109+
110+
let options = [
111+
Option<"interprocedural", "interprocedural", "bool", "true",
112+
"whether to try to simplify function types">
113+
];
109114
}
110115

111116
//===----------------------------------------------------------------------===//
@@ -126,7 +131,9 @@ def CanonicalizeShapesPass : Pass<"stablehlo-ext-canonicalize-shapes", "ModuleOp
126131
let options = [
127132
Option<"maxIterations", "max-iterations", "int64_t", "8",
128133
"the maximum number of iterations to run the dynamism simplification and "
129-
"shape refinement if a fixed-point is not reached">
134+
"shape refinement if a fixed-point is not reached">,
135+
Option<"interprocedural", "interprocedural", "bool", "true",
136+
"whether to try to simplify function types">
130137
];
131138
}
132139

mlir-tensorrt/compiler/lib/Compiler/Extensions/KernelGenExtension/KernelGenExtension.cpp

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -214,10 +214,6 @@ void KernelGenExtension::populatePasses(mlir::OpPassManager &pm,
214214
pm.addPass(createConvertKernelToCUDAPass());
215215
return;
216216
}
217-
218-
if (point == ExtensionPoint::ExecutorLowering) {
219-
return;
220-
}
221217
}
222218

223219
//===----------------------------------------------------------------------===//
Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
//===- LinalgInputPipeline.cpp --------------------------------------------===//
22
//
3-
// SPDX-FileCopyrightText: Copyright 2024-2025 NVIDIA CORPORATION & AFFILIATES.
3+
// SPDX-FileCopyrightText: Copyright 2024-2026 NVIDIA CORPORATION & AFFILIATES.
44
// All rights reserved.
55
// SPDX-License-Identifier: Apache-2.0
66
//
@@ -10,11 +10,10 @@
1010
///
1111
//===----------------------------------------------------------------------===//
1212
#include "mlir-tensorrt/Compiler/InputPipelines/LinalgInputPipeline.h"
13+
#include "mlir-tensorrt-common/Utils/PassManagerUtils.h"
1314
#include "mlir-tensorrt/Transforms/Passes.h"
1415
#include "mlir/Dialect/Func/IR/FuncOps.h"
1516
#include "mlir/Dialect/Linalg/Passes.h"
16-
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
17-
#include "mlir/Dialect/MemRef/Transforms/Passes.h"
1817
#include "mlir/Pass/PassManager.h"
1918
#include "mlir/Pass/PassOptions.h"
2019
#include "mlir/Transforms/Passes.h"
@@ -28,12 +27,13 @@ llvm::cl::OptionCategory LinalgInputOptions::category = {
2827

2928
void mtrt::compiler::buildLinalgInputPipeline(OpPassManager &pm,
3029
const LinalgInputOptions &opts) {
31-
OpPassManager &funcPM = pm.nest<func::FuncOp>();
32-
funcPM.addPass(mlir::createLinalgGeneralizeNamedOpsPass());
33-
if (opts.enableLinalgElementwiseFusion)
34-
funcPM.addPass(mtrt::createLinalgElementwiseFusionPass());
35-
funcPM.addPass(mtrt::createLinalgSimplifyExtractSlicePass());
36-
funcPM.addPass(mtrt::createTensorExtPadToInsertSlicePass());
37-
funcPM.addPass(mlir::createCSEPass());
38-
funcPM.addPass(mlir::createCanonicalizerPass());
30+
addNestedPasses<func::FuncOp>(pm, [&opts](OpPassManager &funcPM) {
31+
funcPM.addPass(mlir::createLinalgGeneralizeNamedOpsPass());
32+
if (opts.enableLinalgElementwiseFusion)
33+
funcPM.addPass(mtrt::createLinalgElementwiseFusionPass());
34+
funcPM.addPass(mtrt::createLinalgSimplifyExtractSlicePass());
35+
funcPM.addPass(mtrt::createTensorExtPadToInsertSlicePass());
36+
funcPM.addPass(mlir::createCSEPass());
37+
funcPM.addPass(mlir::createCanonicalizerPass());
38+
});
3939
}

0 commit comments

Comments
 (0)