Fix Attention addLayer; make CMake work with TRT 10.14

yizhuoz004 · yizhuoz004 · commit 225ba18afa97 · 2025-11-12T20:07:39.000-08:00
diff --git a/mlir-tensorrt/CMakePresets.json b/mlir-tensorrt/CMakePresets.json
@@ -100,6 +100,20 @@
         "MLIR_TRT_ENABLE_NCCL": "OFF",
         "MLIR_TRT_DOWNLOAD_TENSORRT_VERSION": "$env{DOWNLOAD_TENSORRT_VERSION}"
       }
+    },
+    {
+      "name": "python-wheel-build",
+      "displayName": "Configuration for building the compiler/runtime Python package wheels",
+      "generator": "Ninja",
+      "binaryDir": "build",
+      "inherits": "ninja-llvm",
+      "cacheVariables": {
+        "CMAKE_BUILD_TYPE": "Release",
+        "LLVM_ENABLE_ASSERTIONS": "OFF",
+        "CMAKE_PLATFORM_NO_VERSIONED_SONAME": "ON",
+        "MLIR_TRT_ENABLE_NCCL": "OFF",
+        "MLIR_TRT_DOWNLOAD_TENSORRT_VERSION": "$env{DOWNLOAD_TENSORRT_VERSION}"
+      }
     }
   ]
 }
diff --git a/mlir-tensorrt/build_tools/cmake/MTRTDependencies.cmake b/mlir-tensorrt/build_tools/cmake/MTRTDependencies.cmake
@@ -57,8 +57,8 @@ macro(configure_tensorrt_python_plugin_header)
     find_file(
       trt_python_plugin_header
       NAMES NvInferPythonPlugin.h plugin.h
-      HINTS ${ARG_INSTALL_DIR} ${ARG_INSTALL_DIR}/python/include/impl
-      PATHS ${ARG_INSTALL_DIR} ${ARG_INSTALL_DIR}/python/include/impl
+      HINTS ${ARG_INSTALL_DIR} ${ARG_INSTALL_DIR}/include/impl
+      PATHS ${ARG_INSTALL_DIR} ${ARG_INSTALL_DIR}/include/impl
       REQUIRED
       NO_CMAKE_PATH NO_DEFAULT_PATH
       NO_CACHE
diff --git a/mlir-tensorrt/build_tools/cmake/TensorRTDownloadURL.cmake b/mlir-tensorrt/build_tools/cmake/TensorRTDownloadURL.cmake
@@ -80,6 +80,10 @@ function(mtrt_get_tensorrt_download_url ARG_VERSION OS_NAME TARGET_ARCH ARG_OUT_
     set(ARG_VERSION "10.12.0.36")
   endif()
 
+  if(ARG_VERSION VERSION_EQUAL "10.14")
+    set(ARG_VERSION "10.14.1.48")
+  endif()
+
   set(downloadable_versions
     "8.6.1.6"
     "9.0.1.4" "9.1.0.4" "9.2.0.5"
@@ -97,6 +101,7 @@ function(mtrt_get_tensorrt_download_url ARG_VERSION OS_NAME TARGET_ARCH ARG_OUT_
     "10.8.0.43"
     "10.9.0.34"
     "10.12.0.36"
+    "10.14.1.48"
   )
 
   if(NOT ARG_VERSION IN_LIST downloadable_versions)
@@ -164,6 +169,8 @@ function(mtrt_get_tensorrt_download_url ARG_VERSION OS_NAME TARGET_ARCH ARG_OUT_
   elseif(ARG_VERSION VERSION_GREATER 10.10
       AND ARG_VERSION VERSION_LESS 10.13)
     set(TRT_CUDA_VERSION 12.9)
+  elseif(ARG_VERSION VERSION_GREATER 10.13)
+    set(TRT_CUDA_VERSION 13.0)
   endif()
 
   # Handle TRT 8 versions.
diff --git a/mlir-tensorrt/compiler/test/python/mlir_tensorrt_compiler/dialects/test_tensorrt.py b/mlir-tensorrt/compiler/test/python/mlir_tensorrt_compiler/dialects/test_tensorrt.py
@@ -49,6 +49,8 @@ def test_attributes():
             tensorrt.TripLimitAttr.get("kWHILE"),
             tensorrt.FillOperationAttr.get("kRANDOM_UNIFORM"),
             tensorrt.ScatterModeAttr.get("kELEMENT"),
+            tensorrt.AttentionNormalizationOpAttr.get("kSOFTMAX"),
+            tensorrt.DataTypeAttr.get("kFLOAT"),
         ]:
             print(attr)
 
@@ -74,3 +76,5 @@ def test_attributes():
 # CHECK-NEXT: #tensorrt.trip_limit<kWHILE>
 # CHECK-NEXT: #tensorrt.fill_operation<kRANDOM_UNIFORM>
 # CHECK-NEXT: #tensorrt.scatter_mode<kELEMENT>
+# CHECK-NEXT: #tensorrt.attention_normalization_op<kSOFTMAX>
+# CHECK-NEXT: #tensorrt.data_type<kFLOAT>
diff --git a/mlir-tensorrt/compiler/tools/CMakeLists.txt b/mlir-tensorrt/compiler/tools/CMakeLists.txt
@@ -21,5 +21,5 @@ set(LLVM_LINK_COMPONENTS
 add_subdirectory(mlir-tensorrt-opt)
 add_subdirectory(mlir-tensorrt-compiler)
 add_subdirectory(mlir-tensorrt-translate)
-add_subdirectory(mlir-tensorrt-lsp-server)
+# add_subdirectory(mlir-tensorrt-lsp-server)
 add_subdirectory(mlir-tensorrt-runner)
diff --git a/mlir-tensorrt/integrations/python/setup_utils.py b/mlir-tensorrt/integrations/python/setup_utils.py
@@ -13,7 +13,7 @@
 import subprocess
 import atexit
 
-TENSORRT_VERSION = os.getenv("MLIR_TRT_DOWNLOAD_TENSORRT_VERSION", "10.12")
+TENSORRT_VERSION = os.getenv("MLIR_TRT_DOWNLOAD_TENSORRT_VERSION", "10.14")
 
 
 def log(*args):
@@ -105,8 +105,8 @@ def run_cmake_build(python_package_name: str, python_wheel_staging_dir: Path):
 
     # Environment variable overrides
     cmake_preset = os.environ.get("MLIR_TRT_CMAKE_PRESET", "python-wheel-build")
-    install_prefix = os.environ.get("MLIR_TRT_INSTALL_DIR", None)
-    build_dir = os.environ.get("MLIR_TRT_BUILD_DIR", None)
+    install_prefix = os.environ.get("MLIR_TRT_INSTALL_DIR", "./install")
+    build_dir = os.environ.get("MLIR_TRT_BUILD_DIR", "./build")
     parallel_jobs = os.environ.get("MLIR_TRT_PARALLEL_JOBS", str(os.cpu_count() or 1))
 
     # Additional CMake options from environment
diff --git a/mlir-tensorrt/tensorrt/include/mlir-tensorrt-dialect-c/TensorRTAttributes.h b/mlir-tensorrt/tensorrt/include/mlir-tensorrt-dialect-c/TensorRTAttributes.h
@@ -188,6 +188,22 @@ DECLARE_ATTR_GETTER_FROM_STRING(ScatterMode)
 DECLARE_IS_ATTR(ScatterMode)
 DECLARE_STRING_GETTER_FROM_ATTR(ScatterMode)
 
+//===----------------------------------------------------------------------===//
+// AttentionNormalizationOp
+//===----------------------------------------------------------------------===//
+
+DECLARE_ATTR_GETTER_FROM_STRING(AttentionNormalizationOp)
+DECLARE_IS_ATTR(AttentionNormalizationOp)
+DECLARE_STRING_GETTER_FROM_ATTR(AttentionNormalizationOp)
+
+//===----------------------------------------------------------------------===//
+// DataType
+//===----------------------------------------------------------------------===//
+
+DECLARE_ATTR_GETTER_FROM_STRING(DataType)
+DECLARE_IS_ATTR(DataType)
+DECLARE_STRING_GETTER_FROM_ATTR(DataType)
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/mlir-tensorrt/tensorrt/include/mlir-tensorrt-dialect/TensorRT/IR/TensorRTOps.td b/mlir-tensorrt/tensorrt/include/mlir-tensorrt-dialect/TensorRT/IR/TensorRTOps.td
@@ -4485,7 +4485,8 @@ def TensorRT_AttentionOp : TensorRT_Op<"attention",
       indicates the position is allowed to attend. For other types, mask values
       are added to BMM1 output.
     - NormalizationQuantizeScale (optional): tensor of type f32, f16, or bf16
-      with rank 0 or 1, used for quantizing the normalization output.
+      with rank 0 (scalar) or 1 (1D tensor), used for quantizing the normalization output.
+      Required when normalization_quantize_to_type is specified.
 
     #### Attributes:
 
@@ -4502,6 +4503,10 @@ def TensorRT_AttentionOp : TensorRT_Op<"attention",
     - If normalization_quantize_to_type is specified:
       * It must be kFP8 or kINT8
       * normalization_quantize_scale input must be provided
+    - If normalization_quantize_scale is provided:
+      * normalization_quantize_to_type must be specified
+      * Element type must be f32, f16, or bf16
+      * Rank must be 0 (scalar) or 1 (1D tensor)
     - Cannot use both mask input and causal=true simultaneously
 
     #### Examples:
@@ -4539,7 +4544,7 @@ def TensorRT_AttentionOp : TensorRT_Op<"attention",
     TensorRT_RankedTensorOf<[F16, BF16, F32]>:$value,
     Optional<TensorRT_Tensor>:$mask,
     Optional<TensorRT_RankedTensorOf<[F16, BF16, F32]>>:$normalization_quantize_scale,
-    OptionalAttr<TensorRT_AttentionNormalizationOpAttr>:$normalization_operation,
+    DefaultValuedAttr<TensorRT_AttentionNormalizationOpAttr, "tensorrt::AttentionNormalizationOp::kSOFTMAX">:$normalization_operation,
     DefaultValuedAttr<BoolAttr, "false">:$causal,
     DefaultValuedAttr<BoolAttr, "false">:$decomposable,
     OptionalAttr<TensorRT_DataTypeAttr>:$normalization_quantize_to_type
@@ -4565,12 +4570,7 @@ def TensorRT_AttentionOp : TensorRT_Op<"attention",
   }] # baseClassDeclaration;
   
   let trtLayerAdd = [{
-    // Get normalization operation, default to kSOFTMAX
-    nvinfer1::AttentionNormalizationOp normOp = $normalization_operation 
-      ? *$normalization_operation 
-      : nvinfer1::AttentionNormalizationOp::kSOFTMAX;
-    
-    nvinfer1::IAttention *layer = $net->addAttention(*$query, *$key, *$value, normOp, $causal);
+    nvinfer1::IAttention *layer = $net->addAttention(*$query, *$key, *$value, *$normalization_operation, $causal);
     if (!layer)
       return failure();
     
@@ -4584,19 +4584,22 @@ def TensorRT_AttentionOp : TensorRT_Op<"attention",
     }
     
     if ($normalization_quantize_to_type) {
-      layer->setNormalizationQuantizeToType(*$normalization_quantize_to_type);
+      auto convertedDataType = ::mlir::tensorrt::convertDataTypeToNvInferEnum(*$normalization_quantize_to_type);
+      if (!convertedDataType)
+        return emitError($op->getLoc()) << "failed to convert DataType to nvinfer enum";
+      layer->setNormalizationQuantizeToType(*convertedDataType);
     }
     
     if (!$e.isStronglyTyped()){
       FailureOr<nvinfer1::DataType> outputTrtType = getNvInferDataType($op.getLoc(),
                                                           $op.getType().getElementType());
       if (failed(outputTrtType))
         return failure();
-      layer->setOutputType(0, *outputTrtType);
     }
     
     $results.push_back(layer->getOutput(0));
-    $e.setMetadata(layer, $op);
+    // TODO: nvinfer1::IAttention does not have setMetadata API in 10.14
+    // layer->setMetadata($op);
   }];
 }
 
diff --git a/mlir-tensorrt/tensorrt/lib/Bindings/Python/DialectTensorRT.cpp b/mlir-tensorrt/tensorrt/lib/Bindings/Python/DialectTensorRT.cpp
@@ -77,4 +77,6 @@ PYBIND11_MODULE(_tensorrt, m) {
   ADD_PYTHON_ATTRIBUTE_ADAPTOR(TripLimit)
   ADD_PYTHON_ATTRIBUTE_ADAPTOR(FillOperation)
   ADD_PYTHON_ATTRIBUTE_ADAPTOR(ScatterMode)
+  ADD_PYTHON_ATTRIBUTE_ADAPTOR(AttentionNormalizationOp)
+  ADD_PYTHON_ATTRIBUTE_ADAPTOR(DataType)
 }
diff --git a/mlir-tensorrt/tensorrt/lib/CAPI/TensorRTAttributes.cpp b/mlir-tensorrt/tensorrt/lib/CAPI/TensorRTAttributes.cpp
@@ -121,3 +121,11 @@ DEFINE_STRING_GETTER_FROM_ATTR(FillOperation)
 DEFINE_ATTR_GETTER_FROM_STRING(ScatterMode)
 DEFINE_IS_ATTR(ScatterMode)
 DEFINE_STRING_GETTER_FROM_ATTR(ScatterMode)
+
+DEFINE_ATTR_GETTER_FROM_STRING(AttentionNormalizationOp)
+DEFINE_IS_ATTR(AttentionNormalizationOp)
+DEFINE_STRING_GETTER_FROM_ATTR(AttentionNormalizationOp)
+
+DEFINE_ATTR_GETTER_FROM_STRING(DataType)
+DEFINE_IS_ATTR(DataType)
+DEFINE_STRING_GETTER_FROM_ATTR(DataType)
diff --git a/mlir-tensorrt/tensorrt/lib/TensorRT/IR/TypeInferenceInterfaceImpls.cpp b/mlir-tensorrt/tensorrt/lib/TensorRT/IR/TypeInferenceInterfaceImpls.cpp
@@ -1633,3 +1633,19 @@ LogicalResult tensorrt::DequantizeOp::inferReturnTypeComponents(
                                     /*elementType=*/nullptr);
   return success();
 }
+
+//===----------------------------------------------------------------------===//
+// AttentionOp
+//===----------------------------------------------------------------------===//
+
+LogicalResult tensorrt::AttentionOp::inferReturnTypeComponents(
+    MLIRContext *ctx, std::optional<Location> loc, ValueShapeRange operands,
+    DictionaryAttr attributes, OpaqueProperties properties, RegionRange regions,
+    SmallVectorImpl<ShapedTypeComponents> &inferredReturnShapes) {
+  // The result reuses the query operand's shape and element type.
+  AttentionOp::Adaptor opAdaptor(operands, attributes, properties, regions);
+  auto queryTy = cast<RankedTensorType>(opAdaptor.getQuery().getType());
+  inferredReturnShapes.emplace_back(queryTy.getShape(),
+                                    queryTy.getElementType());
+  return success();
+}
diff --git a/mlir-tensorrt/tensorrt/lib/TensorRT/IR/Verification.cpp b/mlir-tensorrt/tensorrt/lib/TensorRT/IR/Verification.cpp
@@ -1464,3 +1464,59 @@ static LogicalResult verifyAllowedDataTypes(UnaryOp op) {
 LogicalResult tensorrt::UnaryOp::verify() {
   return verifyAllowedDataTypes(*this);
 }
+
+//===----------------------------------------------------------------------===//
+// AttentionOp
+//===----------------------------------------------------------------------===//
+
+LogicalResult tensorrt::AttentionOp::verify() {
+  // An explicit mask and the built-in causal mask are mutually exclusive.
+  if (getCausal() && getMask())
+    return emitOpError(
+        "cannot use both mask input and causal=true simultaneously");
+
+  // Read both optional quantization inputs once for the checks below.
+  auto scale = getNormalizationQuantizeScale();
+  std::optional<DataType> targetType = getNormalizationQuantizeToType();
+
+  if (targetType) {
+    if (!(*targetType == DataType::kFP8 || *targetType == DataType::kINT8))
+      return emitOpError("normalization_quantize_to_type must be kFP8 or "
+                         "kINT8, but got ")
+             << stringifyDataType(*targetType);
+
+    // A quantization target requires a scale operand.
+    if (!scale)
+      return emitOpError(
+          "normalization_quantize_scale input must be provided when "
+          "normalization_quantize_to_type is specified");
+  }
+
+  // Without a scale operand there is nothing left to validate.
+  if (!scale)
+    return success();
+
+  // A scale operand requires a quantization target.
+  if (!targetType)
+    return emitOpError(
+        "normalization_quantize_to_type must be specified when "
+        "normalization_quantize_scale input is provided");
+
+  // The scale element type is limited to f32, f16, or bf16.
+  Type elemType = scale.getType().getElementType();
+  if (!(elemType.isF32() || elemType.isF16() || elemType.isBF16()))
+    return emitOpError(
+               "normalization_quantize_scale element type must be f32, f16, "
+               "or bf16, but got ")
+           << elemType;
+
+  // The scale must be a scalar (rank 0) or a 1-D tensor (rank 1).
+  int64_t rank = scale.getType().getRank();
+  if (rank != 0 && rank != 1)
+    return emitOpError(
+               "normalization_quantize_scale must be rank 0 or 1, but got "
+               "rank ")
+           << rank;
+
+  return success();
+}

Original file line number	Diff line number	Diff line change
`@@ -100,6 +100,20 @@`
`100`	`100`	`"MLIR_TRT_ENABLE_NCCL": "OFF",`
`101`	`101`	`"MLIR_TRT_DOWNLOAD_TENSORRT_VERSION": "$env{DOWNLOAD_TENSORRT_VERSION}"`
`102`	`102`	`}`
	`103`	`+ },`
	`104`	`+ {`
	`105`	`+ "name": "python-wheel-build",`
	`106`	`+ "displayName": "Configuration for building the compiler/runtime Python package wheels",`
	`107`	`+ "generator": "Ninja",`
	`108`	`+ "binaryDir": "build",`
	`109`	`+ "inherits": "ninja-llvm",`
	`110`	`+ "cacheVariables": {`
	`111`	`+ "CMAKE_BUILD_TYPE": "Release",`
	`112`	`+ "LLVM_ENABLE_ASSERTIONS": "OFF",`
	`113`	`+ "CMAKE_PLATFORM_NO_VERSIONED_SONAME": "ON",`
	`114`	`+ "MLIR_TRT_ENABLE_NCCL": "OFF",`
	`115`	`+ "MLIR_TRT_DOWNLOAD_TENSORRT_VERSION": "$env{DOWNLOAD_TENSORRT_VERSION}"`
	`116`	`+ }`
`103`	`117`	`}`
`104`	`118`	`]`
`105`	`119`	`}`
Original file line number	Diff line number	Diff line change
`@@ -77,4 +77,6 @@ PYBIND11_MODULE(_tensorrt, m) {`
`77`	`77`	`ADD_PYTHON_ATTRIBUTE_ADAPTOR(TripLimit)`
`78`	`78`	`ADD_PYTHON_ATTRIBUTE_ADAPTOR(FillOperation)`
`79`	`79`	`ADD_PYTHON_ATTRIBUTE_ADAPTOR(ScatterMode)`
	`80`	`+ ADD_PYTHON_ATTRIBUTE_ADAPTOR(AttentionNormalizationOp)`
	`81`	`+ ADD_PYTHON_ATTRIBUTE_ADAPTOR(DataType)`
`80`	`82`	`}`