intel
diff --git a/‎.github/workflows/inductor-tests.yml‎
Lines changed: 1 addition & 0 deletions b/‎.github/workflows/inductor-tests.yml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎.github/workflows/third-party-benchmarks.yml‎
Lines changed: 12 additions & 0 deletions b/‎.github/workflows/third-party-benchmarks.yml‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎.github/workflows/triton-benchmarks.yml‎
Lines changed: 3 additions & 0 deletions b/‎.github/workflows/triton-benchmarks.yml‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎.github/workflows/try-latest-pytorch.yml‎
Lines changed: 1 addition & 0 deletions b/‎.github/workflows/try-latest-pytorch.yml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎.gitignore‎
Lines changed: 3 additions & 0 deletions b/‎.gitignore‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎CMakeLists.txt‎
Lines changed: 3 additions & 0 deletions b/‎CMakeLists.txt‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎Makefile‎
Lines changed: 2 additions & 0 deletions b/‎Makefile‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 1 addition & 1 deletion b/‎README.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎benchmarks/third_party/vllm/transform_results.py‎
Lines changed: 1 addition & 1 deletion b/‎benchmarks/third_party/vllm/transform_results.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎bin/RegisterTritonDialects.h‎
Lines changed: 18 additions & 0 deletions b/‎bin/RegisterTritonDialects.h‎
Lines changed: 18 additions & 0 deletions
@@ -52,6 +52,7 @@ env:
     inductor/test_select_algorithm.py
     inductor/test_max_autotune.py
     inductor/test_compile_subprocess.py
+    inductor/test_analysis.py
 
 jobs:
   compute-params:
 
@@ -223,6 +223,18 @@ jobs:
             --max-new-tokens $MAX_NEW_TOKENS \
             --batch-size $BATCH_SIZE
 
+      - name: Run launch microbenchmark tests
+        if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'launch_micro_benchmarks')) && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'launch_micro_benchmarks') }}
+        run: |
+          source scripts/capture-hw-details.sh
+          python python/test/microbenchmark/launch_overhead.py --reports $REPORTS
+
+          python benchmarks/third_party/vllm/transform_results.py $REPORTS/launch_overhead_results.csv $REPORTS/launch_overhead-report.csv \
+            --tag $TAG \
+            --bgroup overhead \
+            --benchmark launch-overhead \
+            --param_cols="input_type"
+
       - name: Upload benchmark reports
         if: ${{ steps.install.outcome == 'success' && !cancelled() }}
         uses: actions/upload-artifact@v5
 
@@ -65,6 +65,9 @@ env:
   VERIFY: ${{ (github.event_name == 'pull_request' || github.event_name == 'schedule' || inputs.verify) && '1' || '0' }}
   TAG: ${{ inputs.tag || (github.event_name == 'pull_request' && format('pr-{0}', github.event.number)) || (github.event_name == 'schedule' && 'ci') || 'test' }}
   N_RUNS: ${{ inputs.n_runs || '1' }}
+  # FIXME: Enable Level Zero v2 loader once it's stable.
+  # https://github.com/intel/intel-xpu-backend-for-triton/issues/5572
+  UR_LOADER_USE_LEVEL_ZERO_V2: "0"
 
 jobs:
   build:
 
@@ -96,6 +96,7 @@ jobs:
         inductor/test_select_algorithm.py
         inductor/test_max_autotune.py
         inductor/test_compile_subprocess.py
+        inductor/test_analysis.py
       runner_label: ${{ inputs.runner_label }}
       python_version: "3.10"
 
 
@@ -51,6 +51,9 @@ pytest.ini
 # Instrumentation
 python/triton/instrumentation
 
+# MLIR Plugin
+python/triton/plugins
+
 # Python caches
 __pycache__/
 *.py[cod]
 
@@ -20,6 +20,8 @@ option(TRITON_BUILD_PYTHON_MODULE "Build Python Triton bindings" OFF)
 option(TRITON_BUILD_PROTON "Build the Triton Proton profiler" ON)
 option(TRITON_BUILD_UT "Build C++ Triton Unit Tests" ON)
 option(TRITON_BUILD_WITH_CCACHE "Build with ccache (if available)" ON)
+option(LLVM_BUILD_SHARED_LIBS
+  "Build all libraries as shared libraries instead of static" OFF)
 set(TRITON_CODEGEN_BACKENDS "" CACHE STRING "Enable different codegen backends")
 
 if(TRITON_BUILD_WITH_CCACHE)
@@ -64,6 +66,7 @@ if(WIN32)
     set(CMAKE_EXE_LINKER_FLAGS_TRITONRELBUILDWITHASSERTS "/debug:fastlink /INCREMENTAL")
     set(CMAKE_MODULE_LINKER_FLAGS_TRITONRELBUILDWITHASSERTS "/debug:fastlink /INCREMENTAL")
     set(CMAKE_SHARED_LINKER_FLAGS_TRITONRELBUILDWITHASSERTS "/debug:fastlink /INCREMENTAL")
+    set(LLVM_BUILD_SHARED_LIBS "0")
   else()
     message(FATAL_ERROR "Unsupported compiler")
   endif()
 
@@ -43,6 +43,8 @@ test-unit: all
 	$(PYTEST) --tb=short -vs python/examples/gluon/01-attention-forward.py
 	TRITON_ALWAYS_COMPILE=1 TRITON_DISABLE_LINE_INFO=0 LLVM_PASS_PLUGIN_PATH=python/triton/instrumentation/libGPUInstrumentationTestLib.so \
 		$(PYTEST) --capture=tee-sys -rfs -vvv python/test/unit/instrumentation/test_gpuhello.py
+	TRITON_PASS_PLUGIN_PATH=python/triton/plugins/libTritonPluginsTestLib.so \
+		$(PYTEST) -vvv python/test/unit/plugins/test_plugin.py
 	$(PYTEST) --tb=short -s -n $(NUM_PROCS) python/test/gluon
 
 .PHONY: test-distributed
 
@@ -272,7 +272,7 @@ def inspect_stages(_self, stages, options, language, capability):
     # inspect or modify add_stages here
 triton.knobs.runtime.add_stages_inspection_hook = inspect_stages
 ```
-
+Examples of how to use this for out-of-tree plugin passes are [here](lib/Plugins/README.md).
 # Changelog
 
 Version 2.0 is out! New features include:
 
@@ -46,7 +46,7 @@ def serialize_params(row):
 
     dfs = []
     for compiler_name in compilers:
-        for value_name in ['TFlops', 'GB/s']:
+        for value_name in ['TFlops', 'GB/s', 'time_us']:
             col = f'{compiler_name}-{value_name}'
             if col not in df.columns:
                 continue
 
@@ -55,6 +55,9 @@
 #include "mlir/Conversion/NVVMToLLVM/NVVMToLLVM.h"
 #include "mlir/Conversion/UBToLLVM/UBToLLVM.h"
 
+#include "triton/Tools/PluginUtils.h"
+#include "triton/Tools/Sys/GetEnv.hpp"
+
 namespace mlir {
 namespace test {
 namespace intel {
@@ -165,6 +168,21 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
   mlir::triton::proton::gpu::registerScheduleBufferStorePass();
   mlir::triton::proton::gpu::registerAddSchedBarriersPass();
 
+  // Plugin passes
+  if (std::string filename =
+          mlir::triton::tools::getStrEnv("TRITON_PASS_PLUGIN_PATH");
+      !filename.empty()) {
+
+    TritonPlugin TP(filename);
+    std::vector<const char *> passNames;
+    if (auto result = TP.getPassHandles(passNames); !result)
+      llvm::report_fatal_error(result.takeError());
+
+    for (const char *passName : passNames)
+      if (auto result = TP.registerPass(passName); !result)
+        llvm::report_fatal_error(result.takeError());
+  }
+
   registry.insert<
       mlir::triton::TritonDialect, mlir::cf::ControlFlowDialect,
       mlir::triton::nvidia_gpu::TritonNvidiaGPUDialect,