using pass_manager to lower ttsharedir

Realtyxxx · Realtyxxx · commit 0edb8774cc4b · 2025-08-18T20:18:22.000+08:00
diff --git a/.gitignore b/.gitignore
@@ -2,4 +2,5 @@
 .cache
 compile_commands.json
 build/*
-.vscode/*
+.vscode/*
+.clangd/*
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -13,7 +13,40 @@ add_subdirectory(lib)
 add_subdirectory(test)
 add_subdirectory(tools/triton-shared-opt)
 
-if (TRITON_SHARED_BUILD_CPU_BACKEND)
+if(TRITON_SHARED_BUILD_CPU_BACKEND)
     add_triton_plugin(TritonShared ${CMAKE_CURRENT_SOURCE_DIR}/triton_shared.cc LINK_LIBS TritonSharedAnalysis TritonToLinalg TritonTilingExtIR)
-    target_link_libraries(TritonShared PRIVATE Python3::Module pybind11::headers ${Python3_LIBRARIES})
+    # Link the MLIR libraries listed below into the TritonShared plugin.
+    # The MLIR libraries are linked PUBLIC; the Python / pybind11
+    # dependencies are linked PRIVATE.
+    target_link_libraries(TritonShared
+        PUBLIC
+        # Dialect-to-dialect conversions and cast cleanup.
+        MLIRAffineToStandard
+        MLIRReconcileUnrealizedCasts
+        MLIRSCFToControlFlow
+
+        # Per-dialect transform libraries.
+        MLIRBufferizationTransforms
+        MLIRArithTransforms
+        MLIRMemRefTransforms
+        MLIRSCFTransforms
+        MLIRFuncTransforms
+        MLIRVectorTransforms
+        MLIRTensorTransforms
+
+        # Conversions into the LLVM dialect (plus VectorToSCF).
+        MLIRArithToLLVM
+        MLIRIndexToLLVM
+        MLIRMathToLLVM
+        MLIRComplexToLLVM
+        MLIRVectorToLLVM
+        MLIRVectorToLLVMPass
+        MLIRFuncToLLVM
+        MLIRControlFlowToLLVM
+        MLIRMemRefToLLVM
+        MLIRVectorToSCF
+        MLIRUBToLLVM
+        # Currently disabled (not linked):
+        # MLIRMathToLibm
+
+
+        # Python bindings: only needed to build the plugin itself.
+        PRIVATE
+        Python3::Module
+        pybind11::headers
+        ${Python3_LIBRARIES}
+    )
 endif()
diff --git a/backend/compiler.py b/backend/compiler.py
@@ -1,5 +1,5 @@
 from triton.backends.compiler import BaseBackend, GPUTarget
-from triton._C.libtriton import ir, passes
+from triton._C.libtriton import ir, passes, triton_shared
 from dataclasses import dataclass
 from typing import Any, Dict, Tuple
 from types import ModuleType
@@ -78,41 +78,36 @@ def _ttsharedir_to_llir(ttsharedir: str):
         llmlir_path = os.path.join(tmpdir, "ll.mlir")
         llir_path = os.path.join(tmpdir, "ll.ir")
         Path(ttshared_path).write_text(ttsharedir)
-        mlir_opt_path = _get_llvm_bin_path("mlir-opt")
-        # TritonShared-MLIR to LLVM-MLIR
-        subprocess.check_call([mlir_opt_path, ttshared_path,
-            "--convert-linalg-to-affine-loops",
-            # Note: eliminate-empty-tensors fails when there are multiple func.return ops
-            # in a single kernel which are the results of early returns.
-            # See python/examples/test_early_return.py for examples.
-            # We disable this pass for now since performance on CPU isn't the main
-            # focus at the moment.
-            # "--eliminate-empty-tensors",
-            "--empty-tensor-to-alloc-tensor",
-            "--one-shot-bufferize=allow-return-allocs-from-loops=true",
-            "--lower-affine",
-            "--convert-linalg-to-loops",
-            "--expand-strided-metadata",
-            "--convert-scf-to-cf",
-            "--convert-arith-to-llvm",
-            "--convert-math-to-llvm",
-            "--convert-complex-to-llvm",
-            "--convert-vector-to-llvm",
-            "--convert-index-to-llvm",
-            "--memref-expand",
-            "--finalize-memref-to-llvm",
-            "--convert-func-to-llvm",
-            "--convert-cf-to-llvm",
-            # Lowering memrefs creates more affine.apply ops.
-            # Lowering these affine ops again creates further arith ops,
-            # so we have to run these two passes again here.
-            "--lower-affine",
-            "--convert-arith-to-llvm",
-            # Remove all unrealized casts created
-            "--reconcile-unrealized-casts",
-            "--mlir-print-debuginfo",
-            "-o",
-            llmlir_path])
+        context = ir.context()
+        triton_shared.ir.load_dialects(context)
+        mod = ir.parse_mlir_module(ttshared_path, context)
+
+        pm = ir.pass_manager(context)
+        pm.enable_debug()
+        triton_shared.to_llir.add_convert_linalg_to_affine_loops(pm)
+        triton_shared.to_llir.add_empty_tensor_to_alloc_tensor(pm)
+        triton_shared.to_llir.add_one_shot_bufferize_with_options(
+            pm, allow_return_allocs_from_loops=True)
+        triton_shared.to_llir.add_lower_affine(pm)
+        triton_shared.to_llir.add_convert_linalg_to_loops(pm)
+        triton_shared.to_llir.add_expand_strided_metadata(pm)
+        triton_shared.to_llir.add_convert_scf_to_cf(pm)
+        triton_shared.to_llir.add_convert_arith_to_llvm(pm)
+        triton_shared.to_llir.add_convert_math_to_llvm(pm)
+        triton_shared.to_llir.add_convert_complex_to_llvm(pm)
+        triton_shared.to_llir.add_convert_vector_to_llvm(pm)
+        triton_shared.to_llir.add_convert_index_to_llvm(pm)
+        triton_shared.to_llir.add_memref_expand(pm)
+        triton_shared.to_llir.add_finalize_memref_to_llvm(pm)
+        triton_shared.to_llir.add_convert_func_to_llvm(pm)
+
+        triton_shared.to_llir.add_convert_cf_to_llvm(pm)
+        triton_shared.to_llir.add_lower_affine(pm)
+        triton_shared.to_llir.add_convert_arith_to_llvm(pm)
+        triton_shared.to_llir.add_reconcile_unrealized_casts(pm)
+        pm.run(mod)
+
+        Path(llmlir_path).write_text(str(mod))
 
         # LLVM-MLIR to LLVM-IR
         mlir_translate_path = _get_llvm_bin_path("mlir-translate")
diff --git a/triton_shared.cc b/triton_shared.cc
@@ -1,8 +1,179 @@
-﻿#include <pybind11/pybind11.h>
+﻿// pybind11
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <pybind11/stl_bind.h>
 
 namespace py = pybind11;
 
-// The CPU backend with triton_shared doesn't do compilation from within python
-// but rather externally through triton-shared-opt, so we leave this function
-// blank.
-void init_triton_triton_shared(py::module &&m) {}
+// LLVM
+#include "llvm/IR/Constants.h"
+#include "llvm/Support/TargetSelect.h"
+
+// MLIR: Conversion Passes
+#include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h"
+#include "mlir/Conversion/ComplexToLLVM/ComplexToLLVM.h"
+#include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h"
+#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h"
+#include "mlir/Conversion/IndexToLLVM/IndexToLLVM.h"
+#include "mlir/Conversion/MathToLLVM/MathToLLVM.h"
+#include "mlir/Conversion/Passes.h"
+#include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h"
+#include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.h"
+
+// MLIR: Dialects
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
+#include "mlir/Dialect/Bufferization/Transforms/Passes.h"
+#include "mlir/Dialect/ControlFlow/IR/ControlFlow.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.h"
+#include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/Dialect/Linalg/Transforms/BufferizableOpInterfaceImpl.h"
+// #include "mlir/Dialect/Linalg/Transforms/Transforms.h"
+#include "mlir/Dialect/Linalg/Passes.h"
+#include "mlir/Dialect/Math/IR/Math.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/MemRef/Transforms/Passes.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Dialect/Tensor/IR/Tensor.h"
+#include "mlir/Dialect/Tensor/Transforms/Passes.h"
+#include "mlir/Dialect/UB/IR/UBOps.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/Vector/Transforms/Passes.h"
+
+// MLIR: Core IR and Passes
+#include "mlir/IR/DialectRegistry.h"
+#include "mlir/IR/MLIRContext.h"
+#include "mlir/InitAllDialects.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Pass/PassManager.h"
+
+// MLIR: Target and Translation
+// #include "mlir/Target/LLVMIR/Dialect/AMX/AMXToLLVMIRTranslation.h"
+#include "mlir/Target/LLVMIR/Dialect/Builtin/BuiltinToLLVMIRTranslation.h"
+#include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h"
+
+// llvm:Debug
+#include "llvm/Support/Debug.h"  // Required: provides llvm::DebugFlag and setCurrentDebugType.
+
+// MLIR: Top-level Transforms
+#include "mlir/Transforms/Passes.h"
+
+// Triton and other third-party dialects
+#include "third_party/proton/dialect/include/Dialect/Proton/IR/Dialect.h"
+#include "triton/Dialect/Triton/IR/Dialect.h"
+#include "triton/Dialect/TritonGPU/IR/Dialect.h"
+#include "triton/Dialect/TritonInstrument/IR/Dialect.h"
+
+// Binds `name` on the in-scope pybind11 module `m` to a function that takes an
+// mlir::PassManager and appends the pass returned by `builder()` to it.
+#define ADD_PASS_WRAPPER_0(name, builder)                                      \
+  m.def(name, [](mlir::PassManager &passManager) {                             \
+    passManager.addPass(builder());                                            \
+  })
+
+// Same as ADD_PASS_WRAPPER_0, except the bound function takes one additional
+// argument of type `ty0`, which is forwarded to `builder`.
+#define ADD_PASS_WRAPPER_1(name, builder, ty0)                                 \
+  m.def(name, [](mlir::PassManager &passManager, ty0 option) {                 \
+    passManager.addPass(builder(option));                                      \
+  })
+
+// Binds `name` on the in-scope pybind11 module `m` to a function that takes
+// the pass manager plus one argument `arg0` of type `ty0`. On the Python side
+// the argument is exposed under the keyword `arg0` with default value `val0`.
+//
+// The argument supplied by the caller is what gets forwarded to `builder`.
+// Forwarding `val0` here would make the binding silently ignore the value
+// passed from Python and always build the pass with the default.
+//
+// The macro deliberately has no trailing semicolon; like the other
+// ADD_PASS_WRAPPER_* macros, the invocation site provides it.
+#define ADD_PASS_WRAPPER_1_ARG(name, builder, ty0, arg0, val0)                 \
+  m.def(                                                                       \
+      name,                                                                    \
+      [](mlir::PassManager &pm, ty0 arg0) { pm.addPass(builder(arg0)); },      \
+      py::arg("pm"), py::arg(#arg0) = val0)
+
+// Selects which LLVM/MLIR debug type produces debug output.
+//
+// debug_type: the debug type to enable (e.g. "pattern-rewrite"). An empty
+//             string turns the global debug flag off instead of on.
+void enable_mlir_debug(const std::string &debug_type) {
+  // An empty type means "disable", so the global flag follows the argument
+  // rather than being forced on unconditionally.
+  ::llvm::DebugFlag = !debug_type.empty();
+  // In release (NDEBUG) builds of LLVM, setCurrentDebugType is a
+  // function-like macro, so it cannot be written as
+  // `llvm::setCurrentDebugType`. Calling it unqualified under a
+  // using-directive compiles in both debug and release configurations.
+  using namespace llvm;
+  setCurrentDebugType(debug_type.c_str());
+}
+
+// Populates the given submodule with one `add_*` function per lowering pass.
+// Each bound function takes an mlir::PassManager and appends the
+// corresponding pass to it; nothing is executed at registration time.
+void init_to_llvm(py::module &&m) {
+  // NOTE: the Linalg conversion passes may not be available in every MLIR
+  // version.
+  ADD_PASS_WRAPPER_0("add_convert_linalg_to_affine_loops",
+                     mlir::createConvertLinalgToAffineLoopsPass);
+  ADD_PASS_WRAPPER_0("add_convert_linalg_to_loops",
+                     mlir::createConvertLinalgToLoopsPass);
+
+  // Bufferization. The "_with_options" variant is registered with the keyword
+  // argument `allow_return_allocs_from_loops` (default: true); the plain
+  // variant builds the pass with its default options.
+  ADD_PASS_WRAPPER_0("add_empty_tensor_to_alloc_tensor",
+                     mlir::bufferization::createEmptyTensorToAllocTensorPass);
+  ADD_PASS_WRAPPER_1_ARG(
+      "add_one_shot_bufferize_with_options",
+      [](bool allowAllocsFromLoops) {
+        mlir::bufferization::OneShotBufferizePassOptions opts;
+        opts.allowReturnAllocsFromLoops = allowAllocsFromLoops;
+        return mlir::bufferization::createOneShotBufferizePass(opts);
+      },
+      bool, allow_return_allocs_from_loops, true);
+  ADD_PASS_WRAPPER_0("add_one_shot_bufferize",
+                     mlir::bufferization::createOneShotBufferizePass);
+
+  // Affine / SCF / memref lowering helpers.
+  ADD_PASS_WRAPPER_0("add_lower_affine", mlir::createLowerAffinePass);
+  ADD_PASS_WRAPPER_0("add_expand_strided_metadata",
+                     mlir::memref::createExpandStridedMetadataPass);
+  ADD_PASS_WRAPPER_0("add_convert_scf_to_cf",
+                     mlir::createSCFToControlFlowPass);
+  ADD_PASS_WRAPPER_0("add_memref_expand", mlir::memref::createExpandOpsPass);
+
+  // Conversions into the LLVM dialect.
+  ADD_PASS_WRAPPER_0("add_convert_arith_to_llvm",
+                     mlir::createArithToLLVMConversionPass);
+  ADD_PASS_WRAPPER_0("add_convert_math_to_llvm",
+                     mlir::createConvertMathToLLVMPass);
+  ADD_PASS_WRAPPER_0("add_convert_complex_to_llvm",
+                     mlir::createConvertComplexToLLVMPass);
+  ADD_PASS_WRAPPER_0("add_convert_vector_to_llvm",
+                     mlir::createConvertVectorToLLVMPass);
+  ADD_PASS_WRAPPER_0("add_convert_index_to_llvm",
+                     mlir::createConvertIndexToLLVMPass);
+  ADD_PASS_WRAPPER_0("add_finalize_memref_to_llvm",
+                     mlir::createFinalizeMemRefToLLVMConversionPass);
+  ADD_PASS_WRAPPER_0("add_convert_func_to_llvm",
+                     mlir::createConvertFuncToLLVMPass);
+  ADD_PASS_WRAPPER_0("add_convert_cf_to_llvm",
+                     mlir::createConvertControlFlowToLLVMPass);
+
+  // Cleanup of leftover unrealized conversion casts.
+  ADD_PASS_WRAPPER_0("add_reconcile_unrealized_casts",
+                     mlir::createReconcileUnrealizedCastsPass);
+}
+
+// Exposes `load_dialects(context)` on the given submodule. The bound function
+// builds a DialectRegistry containing the dialects and interface
+// registrations listed below, appends it to `context`, and then loads every
+// dialect available in that context.
+void init_triton_shared_ir(py::module &&m) {
+  m.def("load_dialects", [](mlir::MLIRContext &context) {
+    mlir::DialectRegistry registry;
+    // NOTE(review): ::mlir::tptr::TPtrDialect needs the TPtr dialect header,
+    // and no include for it is visible in this file's include list -- confirm
+    // it is reachable transitively.
+    registry.insert<
+        ::mlir::triton::TritonDialect,
+        // ::mlir::triton::gpu::TritonGPUDialect,
+        // ::mlir::triton::instrument::TritonInstrumentDialect,
+        ::mlir::linalg::LinalgDialect,
+        ::mlir::bufferization::BufferizationDialect,
+        ::mlir::tptr::TPtrDialect,
+        ::mlir::math::MathDialect, ::mlir::memref::MemRefDialect,
+        ::mlir::arith::ArithDialect, ::mlir::scf::SCFDialect,
+        ::mlir::vector::VectorDialect, ::mlir::cf::ControlFlowDialect,
+        ::mlir::triton::proton::ProtonDialect, ::mlir::LLVM::LLVMDialect,
+        ::mlir::ub::UBDialect, ::mlir::func::FuncDialect>();
+    // ::mlir::registerAllDialects(registry);
+
+    // LLVM inliner interface and the builtin / LLVM dialect translation
+    // interfaces. Each registration is performed exactly once.
+    ::mlir::LLVM::registerInlinerInterface(registry);
+    ::mlir::registerBuiltinDialectTranslation(registry);
+    ::mlir::registerLLVMDialectTranslation(registry);
+
+    // External BufferizableOpInterface models for the dialects below.
+    ::mlir::arith::registerBufferizableOpInterfaceExternalModels(registry);
+    ::mlir::scf::registerBufferizableOpInterfaceExternalModels(registry);
+    ::mlir::linalg::registerBufferizableOpInterfaceExternalModels(registry);
+    ::mlir::vector::registerBufferizableOpInterfaceExternalModels(registry);
+    ::mlir::tensor::registerBufferizableOpInterfaceExternalModels(registry);
+    // Unsure whether these external models exist for memref / func; left
+    // disabled.
+    // ::mlir::memref::registerBufferizableOpInterfaceExternalModels(registry);
+    // ::mlir::func::registerBufferizableOpInterfaceExternalModels(registry);
+
+    ::mlir::bufferization::func_ext::
+        registerBufferizableOpInterfaceExternalModels(registry);
+    // ::mlir::cf::registerBufferizableOpInterfaceExternalModels(registry);
+
+    context.appendDialectRegistry(registry);
+    context.loadAllAvailableDialects();
+  });
+}
+
+// Registers `enable_mlir_debug(debug_type)` on the given submodule, together
+// with its Python docstring.
+void init_triton_shared_debug(py::module &&m) {
+  const char *const doc =
+      "Enables a specific MLIR/LLVM debug type (e.g., 'pattern-rewrite'). Pass "
+      "an empty string to disable.";
+  m.def("enable_mlir_debug", &enable_mlir_debug, doc, py::arg("debug_type"));
+}
+
+// Entry point for the triton_shared Python bindings: creates the `to_llir`,
+// `ir` and `debug` submodules (in that order) and hands each one to its
+// initialiser.
+void init_triton_triton_shared(py::module &&m) {
+  py::module toLlir = m.def_submodule("to_llir");
+  init_to_llvm(std::move(toLlir));
+
+  py::module irModule = m.def_submodule("ir");
+  init_triton_shared_ir(std::move(irModule));
+
+  py::module debugModule = m.def_submodule("debug");
+  init_triton_shared_debug(std::move(debugModule));
+}