diff --git a/CMakeLists.txt b/CMakeLists.txt
index 16431d047..71f1dc6e0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -31,8 +31,8 @@ elseif(FLAGTREE_BACKEND STREQUAL "mthreads")
   set(CMAKE_CXX_COMPILER clang++)
   set(ENV{FLAGTREE_PLUGIN} $ENV{FLAGTREE_BACKEND})
 elseif(FLAGTREE_BACKEND STREQUAL "aipu")
-  set(CMAKE_C_COMPILER clang-16)
-  set(CMAKE_CXX_COMPILER clang++-16)
+  set(CMAKE_C_COMPILER clang-15)
+  set(CMAKE_CXX_COMPILER clang++-15)
   add_definitions(-D__NVIDIA__)
   add_definitions(-D__AMD__)
 elseif(FLAGTREE_BACKEND STREQUAL "tsingmicro")
diff --git a/python/setup_tools/setup_helper.py b/python/setup_tools/setup_helper.py
index 230976cf4..b0935bb53 100644
--- a/python/setup_tools/setup_helper.py
+++ b/python/setup_tools/setup_helper.py
@@ -360,9 +360,9 @@ def check_env(env_val):
 # aipu
 cache.store(
-    file="llvm-a66376b0-ubuntu-x64",
+    file="llvm-a66376b0-ubuntu-arm64",
     condition=("aipu" == flagtree_backend),
-    url="https://oaitriton.blob.core.windows.net/public/llvm-builds/llvm-a66376b0-ubuntu-x64.tar.gz",
+    url="https://oaitriton.blob.core.windows.net/public/llvm-builds/llvm-a66376b0-ubuntu-arm64.tar.gz",
     pre_hock=lambda: check_env('LLVM_SYSPATH'),
     post_hock=set_llvm_env,
 )
diff --git a/python/triton/backends/compiler.py b/python/triton/backends/compiler.py
index 3583429de..cac42a663 100644
--- a/python/triton/backends/compiler.py
+++ b/python/triton/backends/compiler.py
@@ -1,12 +1,218 @@
 import os
 import re
+import hashlib
 import subprocess
-import sysconfig
-from abc import ABCMeta, abstractmethod
+
+from abc import ABCMeta, abstractmethod, abstractclassmethod
 from dataclasses import dataclass
-from typing import Dict, Union
+from typing import Dict, List, Tuple, Union
 from types import ModuleType
 
+# Table that associates strings to AttrsDescriptor (sub)classes.
+# In this way we can dynamically select the correct class
+# constructor.
+_descriptor_table = {}
+
+
+def register_descriptor(cls):
+    """
+    Register a descriptor into the descriptor table
+    """
+    _descriptor_table[cls.__name__] = cls
+    return cls
+
+
+@register_descriptor
+class AttrsDescriptor:
+    """
+    This class handles compile-time properties for specific function parameters.
+
+    Different backends can add more properties to the common ones. The class
+    contains three fields:
+
+    `arg_properties`: a dictionary mapping each property name to the indices of the
+    parameters it applies to, e.g.:
+        {
+        "prop0": (0, 2, 3)
+        "prop1": (0, 4, 5)
+        }
+    Different backends might need different properties on those parameters to enable
+    specific optimizations. The common compile-time properties contained in this class
+    are:
+    - "tt.divisibility", i.e., is the given parameter divisible by 16
+    - "tt.equal_to_1", i.e., is the given parameter an integer constant 1
+
+    `property_values`: a dictionary containing the value of each compile-time property,
+    like:
+        {
+        "prop0": val0
+        "prop1": val1
+        }
+
+    `constant_properties`: a set containing the properties that can be used to determine
+    if a parameter is constant
+    """
+    __slots__ = ('divisibility_16', 'equal_to_1', 'arg_properties', 'property_values', 'constant_properties')
+
+    def __init__(self, params=None, values=None):
+        """
+        Initialize the compile-time properties.
+
+        We can initialize the AttrsDescriptor class by passing the list of `params`
+        of the function and their `values`. The constructor will try to apply the
+        properties to the values and save the matching parameters in the
+        `arg_properties` field. If we don't pass either the `params` or the `values`,
+        the class should be initialized via an alternative method
+        (see `from_dict` or `from_hints`).
+        """
+        # Default initialization
+        self.arg_properties = {}
+        self.property_values = {}
+        self.constant_properties = set()
+
+        self._add_common_properties(params, values)
+        self._add_backend_properties(params, values)
+        self._init_slots()
+
+    def _add_common_properties(self, params, values):
+        """ Add common compile-time properties """
+        self.property_values["tt.divisibility"] = 16
+        self.property_values["tt.equal_to"] = 1
+        self.constant_properties.add("tt.equal_to")
+
+        if (params is None) or (values is None):
+            return
+
+        # Compile properties deduction
+        assert (len(params) == len(values))
+
+        # Divisibility property
+        self.arg_properties["tt.divisibility"] = [
+            param.num for param, arg in zip(params, values) if AttrsDescriptor.is_divisible_by_16(arg)
+            and not param.do_not_specialize and not param.do_not_specialize_on_alignment
+        ]
+
+        # Equal to 1 property
+        self.arg_properties["tt.equal_to"] = [
+            param.num
+            for param, arg in zip(params, values)
+            if AttrsDescriptor.is_equal_to_1(arg) and not param.do_not_specialize
+        ]
+
+    def _add_backend_properties(self, params=None, values=None):
+        """ This method is for different subclasses to implement their own compile-time properties """
+        pass
+
+    def _init_slots(self):
+        """ Initialize the slots of this class """
+        for name, val in self.arg_properties.items():
+            setattr(self, name.removeprefix('tt.') + '_' + str(self.property_values[name]), val)
+
+    def get_fn_attrs(self) -> Dict:
+        """
+        Get the function attributes as a dictionary.
+
+        The returned dictionary will look like:
+            {
+            "arg0": [(prop_name00, val00), (prop_name01, val01), ...],
+            "arg1": [(prop_name10, val10), (prop_name11, val11), ...],
+            }
+        """
+        attrs = {}
+        for prop_name, arg_set in self.arg_properties.items():
+            prop_val = self.property_values[prop_name]
+            for arg in arg_set:
+                attrs[arg] = attrs.get(arg, []) + [(prop_name, prop_val)]
+        return attrs
+
+    def get_constants(self) -> Dict:
+        """ Return a mapping of constant parameters to their values """
+        constants = {}
+        for prop_name in self.constant_properties:
+            for p in self.arg_properties.get(prop_name, []):
+                constants[p] = self.property_values[prop_name]
+        return constants
+
+    def filter_out_constants(self):
+        """ Return the same object, without properties marked as constants """
+        import copy
+        c = copy.deepcopy(self)
+        for prop_name in c.constant_properties:
+            c.arg_properties.pop(prop_name, None)
+            c.property_values.pop(prop_name, None)
+        c.constant_properties = set()
+        return c
+
+    def hash(self):
+        values = [sorted(self.arg_properties.values())]
+        values += [sorted(self.property_values.values())]
+        values += [sorted(self.constant_properties)]
+        key = str(values)
+        return hashlib.sha256(key.encode("utf-8")).hexdigest()
+
+    def to_dict(self):
+        """
+        Store the fields of this class in a serializable dictionary
+        """
+        # We only need to store the `arg_properties` field. To initialize the
+        # other fields we rely on the class type. We store it as a string in
+        # the dictionary so that we can use it to invoke the appropriate
+        # (sub)class constructor in the `from_dict` method.
+        return {"arg_properties": self.arg_properties, "cls": type(self).__name__}
+
+    @staticmethod
+    def from_dict(data):
+        """
+        Create the object from a serializable dictionary
+        """
+        attrs_descriptor = _descriptor_table[data["cls"]]()
+        for prop_name, param_ids in data["arg_properties"].items():
+            attrs_descriptor.arg_properties[prop_name] = param_ids
+        attrs_descriptor._init_slots()
+        return attrs_descriptor
+
+    @classmethod
+    def from_hints(cls, hints: Dict[int, int]):
+        """
+        Create the class from a set of hints that are passed in.
+
+        Instead of deducing the properties from a list of parameters and values,
+        the user can pass in a mapping of `hints={param_index: val}`, and if `val`
+        matches one of the values of the properties (e.g., `prop_val[prop0]`),
+        then we insert `param_index` into the correct list (e.g., into
+        `arg_properties[prop0]`)
+        """
+        attrs_descriptor = cls()
+        for prop_name, prop_val in attrs_descriptor.property_values.items():
+            attrs_descriptor.arg_properties[prop_name] = [i for i, h in hints.items() if h == prop_val]
+        attrs_descriptor._init_slots()
+        return attrs_descriptor
+
+    @staticmethod
+    def is_divisible_by_16(x):
+        """ Return whether the argument is a multiple of 16 """
+        if hasattr(x, "data_ptr"):
+            return x.data_ptr() % 16 == 0
+        elif isinstance(x, int):
+            return x % 16 == 0
+        if x is None:
+            return True
+        return False
+
+    @staticmethod
+    def is_equal_to_1(x):
+        """ Return whether the argument is the integer constant 1 """
+        return isinstance(x, int) and not isinstance(x, bool) and x == 1
+
+    @staticmethod
+    def get_property_key(val, align):
+        if align and AttrsDescriptor.is_divisible_by_16(val):
+            return "D"
+        if AttrsDescriptor.is_equal_to_1(val):
+            return "1"
+        return "N"
+
+    def __repr__(self):
+        return f"AttrsDescriptor.from_dict({self.to_dict()!r})"
+
 
 @dataclass(frozen=True)
 class GPUTarget(object):
@@ -25,23 +231,22 @@ def __init__(self, target: GPUTarget) -> None:
 
     @staticmethod
     def _path_to_binary(binary: str):
-        binary += sysconfig.get_config_var("EXE")
         base_dir = os.path.join(os.path.dirname(__file__), os.pardir)
         paths = [
             os.environ.get(f"TRITON_{binary.upper()}_PATH", ""),
             os.path.join(base_dir, "third_party", "cuda", "bin", binary),
         ]
-        for path in paths:
-            if os.path.exists(path) and os.path.isfile(path):
-                result = subprocess.check_output([path, "--version"], stderr=subprocess.STDOUT)
+        for p in paths:
+            bin = p.split(" ")[0]
+            if os.path.exists(bin) and os.path.isfile(bin):
+                result = subprocess.check_output([bin, "--version"], stderr=subprocess.STDOUT)
                 if result is not None:
                     version = re.search(r".*release (\d+\.\d+).*", result.decode("utf-8"), flags=re.MULTILINE)
                     if version is not None:
-                        return path, version.group(1)
+                        return p, version.group(1)
         raise RuntimeError(f"Cannot find {binary}")
 
-    @classmethod
-    @abstractmethod
+    @abstractclassmethod
     def supports_target(target: GPUTarget):
         raise NotImplementedError
 
@@ -84,21 +289,16 @@ def get_module_map(self) -> Dict[str, ModuleType]:
         """
         raise NotImplementedError
 
-    @staticmethod
-    def parse_attr(desc):
-        assert isinstance(desc, str)
-        ret = []
-        if "D" in desc:
-            ret += [["tt.divisibility", 16]]
-        return ret
+    def get_attrs_descriptor(self, params, args):
+        """
+        Return an attribute descriptor: given a set of parameters and arguments,
+        the descriptor stores a set of compile-time properties that can improve code
+        generation. Different backends might benefit from different properties.
+        """
+        return AttrsDescriptor(params, args)
 
-    @staticmethod
-    def get_arg_specialization(arg, ty, **kwargs):
+    def compute_spec_key(self, arg, align):
         """
-        Return a string unique to each possible specialization of the argument
+        Return the ASCII key for a given argument with a given set of properties
         """
-        if ty == "int" and arg % 16 == 0 and kwargs.get("align", False):
-            return "D"
-        if ty == "tensor" and arg.data_ptr() % 16 == 0 and kwargs.get("align", False):
-            return "D"
-        return ""
+        return AttrsDescriptor.get_property_key(arg, align)
diff --git a/third_party/aipu/backend/codegen.py b/third_party/aipu/backend/codegen.py
index 38688b119..ba2f7175d 100644
--- a/third_party/aipu/backend/codegen.py
+++ b/third_party/aipu/backend/codegen.py
@@ -1,8 +1,8 @@
 import numpy as np
-import tvm
-from tvm import tir, ir
-from tvm.script.parser import tir as T
-from tvm.compass.dsl import BuildManager, script as S
+#import tvm
+#from tvm import tir, ir
+#from tvm.script.parser import tir as T
+#from tvm.compass.dsl import BuildManager, script as S
 
 from mlir import ir as mlir_ir
 from mlir.dialects import func
diff --git a/third_party/aipu/backend/compiler.py b/third_party/aipu/backend/compiler.py
index be9091024..89a3a2d21 100644
--- a/third_party/aipu/backend/compiler.py
+++ b/third_party/aipu/backend/compiler.py
@@ -1,17 +1,41 @@
 import pickle
+import ctypes
+import functools
+import hashlib
+import os
+import re
+import subprocess
+import tempfile
+from pathlib import Path
+from dataclasses import dataclass
+from typing import Any, Dict, Tuple
+from types import ModuleType
+#20250923ph
 from triton.backends.aipu import transform, analysis
-from triton.backends.aipu.codegen import codegenAIPU
+#from triton.backends.aipu.codegen import codegenAIPU
 from triton.backends.compiler import BaseBackend, GPUTarget
 from triton._C.libtriton import ir, aipu, passes
-from triton._C import aipu_interface
+#import triton._C.libaipu_interface as aipu_interface
 from mlir.passmanager import PassManager
 from mlir.ir import Context, Module
-from dataclasses import dataclass
-import functools
-import hashlib
-from typing import Any, Dict, Tuple
-from types import ModuleType
+from triton.backends.aipu.utils import (
+    _check_bishengir_api_change,
+    _check_bishengir_is_regbased,
+    _enable_unpublished_feature,
+    _get_npucompiler_path,
+    _is_ascend_sanitizer_enabled,
+    _is_debug_line_info_disabled,
+    _is_auto_map_parallel_blocks_enabled,
+)
+from triton.backends.aipu.driver import NPUUtils
+
+from triton.backends.compiler import (
+    AttrsDescriptor,
+    BaseBackend,
+    GPUTarget,
+    register_descriptor,
+)
 
 
 def min_dot_size(target: GPUTarget):
@@ -37,35 +61,206 @@ class AIPUOptions:
     reg_inc_consumer: int = -1
 
     allowed_dot_input_precisions: Tuple[str] = ("ieee", )
+    debug: bool = False
+    sanitize_overflow: bool = True
+    llvm_version: int = 15
+    kernel_name: str = "triton_"
+
+    cluster_dims: tuple = (1, 1, 1)
+    num_warps: int = -1
+    num_ctas: int = -1
+    num_stages: int = 2
+    num_buffers_warp_spec: int = 0
+    num_consumer_groups: int = 0
+    reg_dec_producer: int = 0
+    reg_inc_consumer: int = 0
+
+    enable_warp_specialization: bool = False
+    enable_nd2nz_on_vector: bool = False
+    enable_persistent: bool = False
+    optimize_epilogue: bool = False
+    enable_fp_fusion: bool = True
+    allow_fp8e4nv: bool = False
+    allowed_dot_input_precisions: Tuple[str] = ("ieee", "hf32")
+    max_num_imprecise_acc_default: bool = None
+    extern_libs: dict = None
+
+    multibuffer: bool = None
+
+    enable_hivm_auto_cv_balance: bool = None
+    unit_flag: bool = None
+    inject_barrier_all: bool = None
+    limit_auto_multi_buffer_only_for_local_buffer: bool = None
+    limit_auto_multi_buffer_of_local_buffer: str = None
+    set_workspace_multibuffer: int = None
+    tile_mix_vector_loop: int = None
+    tile_mix_cube_loop: int = None
+
+    stream: int = None
 
     def hash(self):
         hash_dict = dict(self.__dict__)
         key = "_".join([f"{name}-{val}" for name, val in sorted(hash_dict.items())])
         return hashlib.sha256(key.encode("utf-8")).hexdigest()
 
 
-class AIPUBackend(BaseBackend):
+class AscendAttrsDescriptor(AttrsDescriptor):
+
+    # For now we collect tensor shapes at runtime, so this backend adds no extra
+    # compile-time properties; the hook is kept (with an empty body) for future use.
+    def _add_backend_properties(self, params=None, values=None):
+        pass
+
+
+def __get_metadata_attr_by_callback(lib, postfix: str, metadata, meta_key: str):
+    func_symbol = metadata["kernel_name"] + postfix
+    if hasattr(lib, func_symbol):
+        callback_func = getattr(lib, func_symbol)
+        callback_func.restype = ctypes.c_int64
+        callback_func.argtypes = []
+        metadata[meta_key] = callback_func()
+
+
+def _parse_linalg_metadata(linalg: str, metadata: dict):
+    MIX_MODE_REGEX = r'mix_mode\s*=\s*"([^"]+)"'
+    KERNEL_NAME_REGEX = r"func\.func\s+@(\w+)"
+    TENSOR_KIND_REGEX = r'%arg(\d+):[^,)]*?\{[^}]*?tt\.tensor_kind\s*=\s*([^:\s}]+)\s*:[^}]*?\}'
+    REMOVE_MIX_MODE_REGEX = r', mix_mode\s*=\s*"[^"]*"'
+
+    metadata["shared"] = 1
+    metadata["mix_mode"] = re.search(MIX_MODE_REGEX, linalg).group(1)
+    metadata["kernel_name"] = re.search(KERNEL_NAME_REGEX, linalg).group(1)
+    metadata["name"] = metadata["kernel_name"] + " " + metadata["mix_mode"]
+    metadata["tensor_kinds"] = [int(kind) for _, kind in re.findall(TENSOR_KIND_REGEX, linalg)]
+    linalg = re.sub(REMOVE_MIX_MODE_REGEX, "", linalg)
+    return linalg, metadata
+
+
+def linalg_to_bin_enable_npu_compile(linalg: str, metadata, opt):
+
+    linalg, metadata = _parse_linalg_metadata(linalg, metadata)
+    with tempfile.TemporaryDirectory() as tmpdir:
+        ttadapter_path = os.path.join(tmpdir, "kernel.ttadapter.mlir")
+        Path(ttadapter_path).write_text(linalg)
+        bin_file = os.path.join(tmpdir, "kernel")
+        if _check_bishengir_api_change():
+            bin_file_with_ext = "kernel.o"
+        else:
+            bin_file_with_ext = "kernel_reloc.o"
+        if _check_bishengir_is_regbased():
+            bishengir_hivm_opt = "--reg-based=true"
+        else:
+            bishengir_hivm_opt = "--enable-hivm-compile=true"
+        bin_path = os.path.join(tmpdir, bin_file_with_ext)
+        callback_path = os.path.join(tmpdir, "libkernel.so")
+        _compile_option_list = []
+        if _enable_unpublished_feature():
+            _compile_option_list += [
+                f"--target={NPUUtils().get_arch()}",
+            ]
+        multibuffer = opt.multibuffer
+        if multibuffer is not None:
+            _compile_option_list += [
+                f"--enable-auto-multi-buffer={multibuffer}",
+            ]
+        if _is_ascend_sanitizer_enabled():
+            _compile_option_list += ["--enable-sanitizer=true"]
+        if not _is_debug_line_info_disabled():
+            _compile_option_list += ["--enable-debug-info=true"]
+
+        enable_hivm_auto_cv_balance = opt.enable_hivm_auto_cv_balance
+        if enable_hivm_auto_cv_balance is not None:
+            _compile_option_list += \
+                [f"--enable-hivm-auto-cv-balance={enable_hivm_auto_cv_balance}"]
+
+        unit_flag = opt.unit_flag
+        if unit_flag is not None:
+            _compile_option_list += \
+                [f"--enable-hivm-unit-flag-sync={unit_flag}"]
+
+        inject_barrier_all = opt.inject_barrier_all
+        if inject_barrier_all is not None:
+            _compile_option_list += \
+                [f"--enable-hivm-inject-barrier-all-sync={inject_barrier_all}"]
+
+        limit_auto_multi_buffer_only_for_local_buffer = opt.limit_auto_multi_buffer_only_for_local_buffer
+        if limit_auto_multi_buffer_only_for_local_buffer is not None:
+            _compile_option_list += \
+                [f"--limit-auto-multi-buffer-only-for-local-buffer={limit_auto_multi_buffer_only_for_local_buffer}"]
+
+        set_workspace_multibuffer = opt.set_workspace_multibuffer
+        if set_workspace_multibuffer is not None:
+            _compile_option_list += \
+                [f"--set-workspace-multibuffer={set_workspace_multibuffer}"]
+
+        tile_mix_vector_loop = opt.tile_mix_vector_loop
+        if tile_mix_vector_loop is not None:
+            _compile_option_list += \
+                [f"--tile-mix-vector-loop={tile_mix_vector_loop}"]
+
+        tile_mix_cube_loop = opt.tile_mix_cube_loop
+        if tile_mix_cube_loop is not None:
+            _compile_option_list += \
+                [f"--tile-mix-cube-loop={tile_mix_cube_loop}"]
+
+        auto_multi_buffer = opt.limit_auto_multi_buffer_of_local_buffer
+        if auto_multi_buffer is not None:
+            _compile_option_list += \
+                [f"--limit-auto-multi-buffer-of-local-buffer={auto_multi_buffer}"]
+
+        if _is_auto_map_parallel_blocks_enabled():
+            _compile_option_list += ["--enable-auto-blockify-loop"]
+        npu_compiler_path = _get_npucompiler_path()
+        if npu_compiler_path.endswith("bishengir-compile"):
+            _compile_option_list += [
+                "--enable-hfusion-compile=true",
+                bishengir_hivm_opt,
+                "--enable-triton-kernel-compile=true",
+            ]
+        cmd_list = ([npu_compiler_path, ttadapter_path] + _compile_option_list + ["-o", bin_file])
+        ret = subprocess.run(cmd_list, capture_output=True, check=True)
+        if Path(callback_path).is_file():
+            lib = ctypes.CDLL(callback_path)
+            __get_metadata_attr_by_callback(lib, "_infer_workspace_shape_function", metadata, "workspace_size")
+            __get_metadata_attr_by_callback(lib, "_infer_sync_block_lock_num_function", metadata, "lock_num")
+            __get_metadata_attr_by_callback(lib, "_infer_sync_block_lock_init_function", metadata, "lock_init_val")
+
+        return Path(bin_path).read_bytes()
+
+
+class AscendBackend(BaseBackend):
 
     @staticmethod
     def supports_target(target: GPUTarget):
-        return target.backend == 'aipu'
+        return target.backend == 'npu'
 
     def __init__(self, target: GPUTarget) -> None:
         super().__init__(target)
         self.capability = target.arch
-        self.binary_ext = "bin"
-        aipu_interface.passes.register_all_passes()
+        self.binary_ext = "npubin"
+        #aipu_interface.passes.register_all_passes()
 
     def parse_options(self, opts) -> Any:
-        return AIPUOptions()
+        return AIPUOptions(**{k: opts[k] for k in AIPUOptions.__dataclass_fields__.keys() if k in opts})
 
     def pack_metadata(self, metadata):
-        return (
-            metadata.num_tecs,
-            metadata.num_cores,
-            metadata.cluster_dims[0],
-            metadata.cluster_dims[1],
-            metadata.cluster_dims[2],
-        )
+        # Collect the metadata needed to launch kernels.
+        # TORCHINDUCTOR_UNIQUE_KERNEL_NAMES=1 makes TorchInductor generate unique
+        # kernel names; that name is passed as kernel_name to the CANN runtime.
+        # kernel_name is specific to the Ascend backend and should not be public.
+        # The CANN runtime limits kernel names to at most 50 characters, and a
+        # trailing '\n' is appended, so the effective limit is 49.
+        KERNEL_NAME_MAX_LEN = 49
+        kernel_name_orig, mix_mode = metadata.name.split()
+        if len(kernel_name_orig) > KERNEL_NAME_MAX_LEN:
+            kernel_name = kernel_name_orig[-KERNEL_NAME_MAX_LEN:]
+        else:
+            kernel_name = kernel_name_orig
+        return {
+            "kernel_name": kernel_name,
+            "hash": metadata.hash,
+            "debug": metadata.debug,
+            "tensor_kinds": metadata.tensor_kinds,
+        }
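To make the truncation rule above concrete, a hedged example (the Inductor-style name is made up); the tail is kept because the uniqueness suffix sits at the end of the name:

# Illustrative only: a 62-character Inductor-style kernel name.
name = "triton_poi_fused_add_mul_sigmoid_backward_threshold_backward_0"
kernel_name = name[-49:] if len(name) > 49 else name
assert len(kernel_name) == 49 and kernel_name.endswith("_0")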
 
     def get_codegen_implementation(self, options):
         codegen_fns = {"min_dot_size": min_dot_size(self.target)}
@@ -78,6 +273,9 @@ def get_module_map(self) -> Dict[str, ModuleType]:
     def load_dialects(self, ctx):
         aipu.load_dialects(ctx)
 
+    def get_arg_specialization(*args, **kwargs):
+        return None
+
     @staticmethod
     def make_ttir(mod, metadata, opt):
         pm = ir.pass_manager(mod.context)
@@ -98,62 +296,32 @@ def make_ttir(mod, metadata, opt):
     def make_linalg(mod, metadata, opt):
         pm = ir.pass_manager(mod.context)
         pm.enable_debug()
-        # Add pass here.
         aipu.passes.convert.add_triton_to_linalg_pipeline(pm)
         pm.run(mod)
         return mod
 
     @staticmethod
-    def make_aipubin(mod, metadata, opt):
-        ctx = Context()
-        ctx.allow_unregistered_dialects = True
-        aipu_interface.dialects.register_all_dialects(ctx._CAPIPtr)
-        pm = PassManager("builtin.module", ctx)
-        mod = Module.parse(aipu.common.generic_print(mod), ctx)
-
-        # Add pass here.
-        transform.linalg_transform(mod, ctx)
-        transform.tensor_transform(mod, ctx)
-
-        pm.add("func.func(linalg-fuse-elementwise-ops)")
-        pm.add("scf-loop-bufferization-preprocessing")
-        pm.add("one-shot-bufferize")
-        pm.add("func.func(convert-bool-arg-to-i8)")
-        pm.add("func.func(convert-linalg-to-affine-loops)")
-        pm.add("func.func(affine-loop-normalize{promote-single-iter=1})")
-        pm.add("func.func(affine-loop-fusion{mode=sibling})")
-        pm.add("func.func(flatten-memref)")
-        pm.add("func.func(canonicalize)")
-        pm.run(mod.operation)
-
-        pm = PassManager("builtin.module", ctx)
-        transform.convert_memref_i1_i8(mod, ctx)
-        transform.remove_empty_linalg_generic(mod, ctx)
-        # vectorize
-        vfactor = analysis.determine_vectorization_factor(mod, metadata["vector_register_bits"])
-        if vfactor > 1:
-            pm.add(f"func.func(affine-super-vectorize{{virtual-vector-size={vfactor}}})")
-        pm.add("func.func(lower-affine)")
-
-        # Optimize pass.
-        pm.add("func.func(canonicalize)")
-        pm.add("func.func(cse)")
-        pm.add("func.func(reconcile-unrealized-casts)")
-        pm.run(mod.operation)
-
-        # Post aipu pass.
-        transform.binding_tid(mod, ctx)
-        transform.canonical_const_dtype(mod, ctx)
-
-        ex = codegenAIPU(mod)
-        metadata["name"] = ex._func_name
-        metadata["shared"] = 1
-        return pickle.dumps(ex)
+    def make_npubin(mod, metadata, opt):
+
+        linalg_str = str(mod)
+        metadata.update({
+            "enable_nd2nz_on_vector": opt.enable_nd2nz_on_vector,
+            "multibuffer": opt.multibuffer,
+            "enable_hivm_auto_cv_balance": opt.enable_hivm_auto_cv_balance,
+            "unit_flag": opt.unit_flag,
+            "inject_barrier_all": opt.inject_barrier_all,
+            "limit_auto_multi_buffer_only_for_local_buffer": opt.limit_auto_multi_buffer_only_for_local_buffer,
+            "limit_auto_multi_buffer_of_local_buffer": opt.limit_auto_multi_buffer_of_local_buffer,
+            "set_workspace_multibuffer": opt.set_workspace_multibuffer,
+            "tile_mix_vector_loop": opt.tile_mix_vector_loop,
+            "tile_mix_cube_loop": opt.tile_mix_cube_loop,
+        })
+        return linalg_to_bin_enable_npu_compile(linalg_str, metadata, opt)
 
     def add_stages(self, stages, options):
         stages["ttir"] = lambda src, metadata: self.make_ttir(src, metadata, options)
         stages["linalg"] = lambda src, metadata: self.make_linalg(src, metadata, options)
-        stages["bin"] = lambda src, metadata: self.make_aipubin(src, metadata, options)
+        stages["npubin"] = (lambda src, metadata: linalg_to_bin_enable_npu_compile(src, metadata, options))
 
     @functools.lru_cache()
     def hash(self):
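The three stages registered by `add_stages` run in order, each feeding the next. A hedged paraphrase of the flow (`backend`, `options`, and `src` are placeholders; the real driver loop lives in Triton's `compile`, not in this patch):

# Paraphrased flow, for orientation only.
stages = {}
backend.add_stages(stages, options)        # {"ttir": ..., "linalg": ..., "npubin": ...}
module, metadata = src, {}
for name in ("ttir", "linalg", "npubin"):  # Triton IR -> linalg -> NPU binary
    module = stages[name](module, metadata)
# "npubin" returns raw kernel bytes; metadata now carries kernel_name,
# mix_mode, tensor_kinds, ... parsed from the linalg module.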
diff --git a/third_party/aipu/backend/driver.py b/third_party/aipu/backend/driver.py
index df4cce6b4..33d91a6c3 100644
--- a/third_party/aipu/backend/driver.py
+++ b/third_party/aipu/backend/driver.py
@@ -1,192 +1,731 @@
-import os
-import pickle
-import torch
-import uuid
-import numpy as np
 from pathlib import Path
-from itertools import chain
-from triton.backends.compiler import GPUTarget
+import tempfile
+import os
+import os.path
+import re
+import subprocess
+import sysconfig
+from typing import Optional
+import functools
+import hashlib
+from triton.runtime.cache import get_cache_manager, get_dump_manager
 from triton.backends.driver import DriverBase
-
-# ------------------------
-# Utils
-# ------------------------
-
-
-def load_binary(name, kernel, shared, device):
-    return None, kernel, 1, 0
+from triton.backends.compiler import GPUTarget
+from triton.backends.aipu.utils import (
+    _build_npu_ext,
+    _check_cxx11_abi,
+    convert_sigtype_to_int,
+    _is_auto_map_parallel_blocks_enabled,
+)
 
 
-class AIPUUtils(object):
+class NPUUtils(object):
 
     def __new__(cls):
-        if not hasattr(cls, "instance"):
-            cls.instance = super(AIPUUtils, cls).__new__(cls)
+        if not hasattr(cls, 'instance'):
+            cls.instance = super(NPUUtils, cls).__new__(cls)
         return cls.instance
 
     def __init__(self):
-        self.load_binary = load_binary
-        properties_dict = {"max_shared_mem": 256 * 1024, "multiprocessor_count": 4, "max_num_regs": 32, "warpSize": 4}
-        self.get_device_properties = lambda device: properties_dict
-
-
-# ------------------------
-# Launcher
-# ------------------------
-
-
-def _reset_output_path(ex):
-    output_dir = f"{os.getcwd()}/compass_dsl_{ex._func_name}_restore_{uuid.uuid4().hex}"
-    ex._output_dir = output_dir
-    ex._gbuilder_dir = f"{ex._output_dir}/gbuilder"
-    ex._op_lib_path = f"{ex._gbuilder_dir}/op_lib/{ex._func_name}.o"
-
-
-def _get_cpu_origin_tensor(tensor):
-    origin_tensor = tensor
-    while (base := origin_tensor._base) is not None:
-        origin_tensor = base
-
-    return origin_tensor.cpu().contiguous()
-
-
-def _get_np_array_from_strided_buffer(tensor, sb):
-    dtype = str(sb.dtype).split(".")[-1]
-    itemsize = sb.element_size()
-    offset = (sb.data_ptr() - sb._base.data_ptr())
-    shape = sb.size()
-    stride = [x * itemsize for x in sb.stride()]
-
-    return np.ndarray(
-        shape,
-        dtype,
-        tensor.numpy(),
-        offset,
-        stride,
-    )
-
-
-class AIPULauncher(object):
+        dirname = os.path.dirname(os.path.realpath(__file__))
+        src = Path(os.path.join(dirname, "npu_utils.cpp")).read_text()
+        key = hashlib.sha256(src.encode("utf-8")).hexdigest()
+        cache = get_cache_manager(key)
+        fname = "npu_utils.so"
+        cache_path = cache.get_file(fname)
+        if cache_path is None:
+            with tempfile.TemporaryDirectory() as tmpdir:
+                src_path = os.path.join(tmpdir, "npu_utils.cpp")
+                with open(src_path, "w") as f:
+                    f.write(src)
+                so = _build_npu_ext("npu_utils", src_path, tmpdir)
+                with open(so, "rb") as f:
+                    cache_path = cache.put(f.read(), fname, binary=True)
+        import importlib.util
+        spec = importlib.util.spec_from_file_location("npu_utils", cache_path)
+        mod = importlib.util.module_from_spec(spec)
+        spec.loader.exec_module(mod)
+        self.npu_utils_mod = mod
+
+    def load_binary(self, name, kernel, shared, device):
+        fnname, mix_mode = name.split()
+        return self.npu_utils_mod.load_kernel_binary(fnname, kernel, shared, device, mix_mode)
+
+    @functools.lru_cache()
+    def get_device_properties(self, device):
+        # "max_shared_mem" is temporarily included only so that the Triton
+        # compiler does not complain; available memory is fetched at runtime.
+        num_aic = self.get_aicore_num()
+        num_aiv = num_aic * 2
+        return {"max_shared_mem": 1, "num_aicore": num_aic, "num_vectorcore": num_aiv}
+
+    @functools.lru_cache()
+    def get_arch(self):
+        return self.npu_utils_mod.get_arch()
+
+    @functools.lru_cache()
+    def get_aicore_num(self):
+        return self.npu_utils_mod.get_aicore_num()
+
+    @functools.lru_cache()
+    def get_aivector_core_num(self):
+        return self.get_device_properties("npu")["num_vectorcore"]
+
+
+class NPULauncher(object):
 
     def __init__(self, src, metadata):
-        self.constants = src.constants
-
-    def lanch_kernel(self, ex, np_args, tail_args, totoal_pid_size):
-        convert_map = {}
-        real_args = []
-        for i, arg in enumerate(np_args):
-            if isinstance(arg, np.ndarray) and arg.dtype == "bool":
-                real_args.append(arg.astype(np.int8))
-                convert_map[i] = np.bool_
-            elif isinstance(arg, np.ndarray) and arg.dtype == "int64":
-                real_args.append(arg.astype(np.int32))
-                convert_map[i] = np.int64
-            else:
-                real_args.append(arg)
-
-        tec_num = 4
-        for i in range((totoal_pid_size + tec_num - 1) // tec_num):
-            tail_args[3] = i
-            ex(*(real_args + tail_args))
-
-        for i, arg in enumerate(real_args):
-            if i in convert_map.keys():
-                np.copyto(np_args[i], arg.astype(convert_map[i]))
-
-    # TODO(aipu-teams): This is just a temporary solution for now, because the real driver interface is not ready yet.
-    # These code will be refactor later.
-    def __call__(self, gridx_size, gridy_size, gridz_size, stream, function, *args):
-        try:
-            from flag_gems.utils.tensor_wrapper import StridedBuffer
-        except ImportError:
-            StridedBuffer = torch.Tensor
-
-        ex = pickle.loads(function)
-        _reset_output_path(ex)
-        np_args = []
-        sb_maps = {}
-        args = [arg for i, arg in enumerate(args[4:]) if i not in chain(*self.constants.keys())]
-
-        for i, arg in enumerate(args):
-            if isinstance(arg, torch.Tensor):
-                np_args.append(_get_cpu_origin_tensor(arg).numpy())
-            elif isinstance(arg, StridedBuffer):
-                tensor = _get_cpu_origin_tensor(arg)
-                np_args.append(_get_np_array_from_strided_buffer(tensor, arg))
-                sb_maps[i] = tensor
-            else:
-                np_args.append(arg)
-
-        tail_args = [gridx_size, gridy_size, gridz_size, 0, 0, 0]
-        total_pid_size = gridx_size * gridy_size * gridz_size
-        self.lanch_kernel(ex, np_args, tail_args, total_pid_size)
-
-        for i, param_info in enumerate(ex._cur_param_infos):
-            if param_info.is_output_tensor:
-                if isinstance(args[i], torch.Tensor):
-                    args[i].copy_(torch.from_numpy(np_args[i]))
-                else:
-                    args[i]._base.copy_(sb_maps[i])
-
-
-class AIPUDriver(DriverBase):
+        debug_mode = metadata.debug
+        workspace_size = int(metadata.workspace_size) \
+            if hasattr(metadata, 'workspace_size') else -1
+        lock_init_value = int(metadata.lock_init_value) \
+            if hasattr(metadata, 'lock_init_value') else 0
+        lock_num = int(metadata.lock_num) \
+            if hasattr(metadata, 'lock_num') else -1
+        constants = src.constants if hasattr(src, "constants") else dict()
+        cst_key = lambda i: src.fn.arg_names.index(i) if isinstance(i, str) else i
+        constants = {cst_key(key): value for key, value in constants.items()}
+        signature = {cst_key(key): value for key, value in src.signature.items()}
+        mix_mode = metadata.mix_mode
+        wrapper_src = generate_npu_wrapper_src(constants, signature, \
+                                               workspace_size, mix_mode, \
+                                               lock_num, lock_init_value)
+        so_launcher_path = make_npu_launcher_stub(wrapper_src, debug_mode)
+        # initialize launcher
+        import importlib.util
+        spec = importlib.util.spec_from_file_location("__triton_launcher", so_launcher_path)
+        mod = importlib.util.module_from_spec(spec)
+        spec.loader.exec_module(mod)
+        self.launch = getattr(mod, "launch")
+
+    def __call__(self, *args, **kwargs):
+        self.launch(*args, **kwargs)
+
+
+class NPUDriver(DriverBase):
 
     def __init__(self):
-        self.utils = AIPUUtils()  # TODO: make static
-        self.launcher_cls = AIPULauncher
+        self.utils = NPUUtils()
+        self.launcher_cls = NPULauncher
+        super().__init__()
 
-        import torch
-        self.get_current_stream = lambda x: x
-        self.get_current_device = torch.aipu.current_device
+    @classmethod
+    def is_active(cls):
 
-        super().__init__()
+        def test_npucompiler():
+            from triton.backends.aipu.utils import _get_bisheng_path
+            npucompiler = _get_bisheng_path()
+            targets = subprocess.check_output([npucompiler, "-print-targets"]).decode().strip().split()
+            return "hiipu64" in targets
+
+        try:
+            return test_npucompiler()
+        except Exception as e_npucompiler:
+            import warnings
+            red = "\x1b[31;20m"
+            reset = "\x1b[0m"
+            warnings.warn(red + str(e_npucompiler) + reset)
+            return False
 
     def get_current_target(self):
-        warp_size = 4
-        return GPUTarget("aipu", "x2", warp_size)
+        backend = "npu"
+        arch = self.utils.get_arch()
+        warp_size = 0
+        return GPUTarget(backend, arch, warp_size)
 
     def get_active_torch_device(self):
         import torch
-        return torch.device("aipu", 0)
+        import torch_npu
+        return torch.npu.current_device()
 
-    def get_device_interface(self):
+    def get_current_device(self):
+        """
+        Get the current device
+        """
         import torch
-        return torch.aipu
+        import torch_npu
+        return torch.npu.current_device()
 
-    @staticmethod
-    def is_active():
+    def set_current_device(self, device):
+        """
+        Set the current device to the given device
+        """
         import torch
-        from torch.utils import cpp_extension
+        import torch_npu
+        return torch.npu.set_device(device)
+
+    def get_current_stream(self, device: Optional[int] = None) -> int:
+        """
+        Get the stream for the current device
+        """
+        # According to torch_npu, the content of a torch.npu.Stream is essentially an rtStream_t
+        # TODO: use the CANN API instead of torch_npu
+        import torch
+        import torch_npu
+        if device is None:
+            device = self.get_current_device()
+        return torch.npu.current_stream(device).npu_stream
 
-        try:
-            torch.aipu.is_available()
-        except AttributeError:
-            current_dir = Path(__file__).resolve().parent
-            extra_ldflags = [f"-L{x.strip()}" for x in os.getenv("LD_LIBRARY_PATH", "").split(":") if x.strip() != ""]
-            extra_ldflags.append("-laipudrv")
-            module = cpp_extension.load(
-                name="aipu", sources=[current_dir / "aipu_torch_dev.cpp"],
-                extra_include_paths=[os.getenv("ZHOUYI_LINUX_DRIVER_HOME") + "/driver/umd/include"],
-                extra_ldflags=extra_ldflags, verbose=True)
-
-            torch.utils.rename_privateuse1_backend("aipu")
-            torch._register_device_module("aipu", module)
-            torch.utils.generate_methods_for_privateuse1_backend(for_storage=True)
-        return torch.aipu.is_available()
-
-    # TODO(aipu-teams): Support bechmarker later.
     def get_benchmarker(self):
-
-        def do_bench(fn, warmup=25, rep=100, grad_to_none=None, quantiles=None, return_mode="mean"):
-            return [float("inf"), float("inf"), float("inf")]
-
+        from triton.testing import do_bench
         return do_bench
 
-    def get_empty_cache_for_benchmark(self):
+    def get_device_interface(self):
         import torch
+        return torch.npu
 
-        # We maintain a buffer of 256 MB that we clear
-        # before each kernel call to make sure that the L2 cache
-        # doesn't contain any input data before the run
-        cache_size = 256 * 1024 * 1024
-        return torch.empty(int(cache_size // 4), dtype=torch.int, device='aipu')
+    def get_empty_cache_for_benchmark(self):
+        import torch
+        cache_size = 192 * 1024 * 1024
+        return torch.empty(cache_size // 4, dtype=torch.int, device='npu')
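The stub builder that follows compiles one launcher per generated wrapper source and caches it; the file name embeds the torch C++ ABI flag so a launcher built against one libstdc++ ABI is never linked into the other. A hedged sketch of the naming scheme (values illustrative):

import sysconfig
use_cxx11_abi = 1                                # from torch._C._GLIBCXX_USE_CXX11_ABI
suffix = sysconfig.get_config_var('EXT_SUFFIX')  # e.g. '.cpython-310-aarch64-linux-gnu.so'
so_name = f"launcher_cxx11abi{use_cxx11_abi}{suffix}"
# stored in a cache directory keyed by sha256 of the generated C++ source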
+
+
+def make_npu_launcher_stub(src, debug=False):
+    """
+    Generate the launcher stub used to launch the kernel
+    """
+    # try to get the cached file
+    so_cache_key = hashlib.sha256(src.encode("utf-8")).hexdigest()
+    so_cache_manager = get_cache_manager(so_cache_key)
+    # Append the cxx11_abi value to the launcher name to avoid
+    # linking to a launcher with the wrong cxx11_abi.
+    use_cxx11_abi = _check_cxx11_abi()
+    name = f"launcher_cxx11abi{use_cxx11_abi}"
+    suffix = sysconfig.get_config_var('EXT_SUFFIX')
+    so_name = f"{name}{suffix}"
+
+    if debug:
+        dump_manager = get_dump_manager(so_cache_key)
+        print(f"Dumping {name}.cxx to {dump_manager.cache_dir}")
+        dump_manager.put(src, f"{name}.cxx", binary=False)
+
+    cache_path = so_cache_manager.get_file(so_name)
+    if cache_path is not None:
+        return cache_path
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        if debug:
+            so_cache_manager.put(src, f"{name}.cxx", binary=False)
+        src_path = os.path.join(tmpdir, f"{name}.cxx")
+        with open(src_path, "w") as f:
+            f.write(src)
+        enable_taskqueue = os.getenv("TRITON_ENABLE_TASKQUEUE", 'true').lower() in ('true', '1')
+        if enable_taskqueue:
+            kernel_launcher_type = "torch"
+        else:
+            kernel_launcher_type = None
+        so = _build_npu_ext(name, src_path, tmpdir, kernel_launcher=kernel_launcher_type)
+        if debug:
+            with open(so, "rb") as f:
+                return dump_manager.put(f.read(), so_name, binary=True)
+        with open(so, "rb") as f:
+            return so_cache_manager.put(f.read(), so_name, binary=True)
+
+
+def extract_device_print_code_from_cann():
+    from triton.backends.aipu.utils import _get_bisheng_path
+    ccec_compiler_bin_folder, _ = os.path.split(os.path.realpath(_get_bisheng_path()))
+    ccec_compiler_folder, _ = os.path.split(ccec_compiler_bin_folder)
+    clang_version = os.listdir(os.path.join(ccec_compiler_folder, "lib/clang/"))[0]
+    ccelib_path = os.path.join(ccec_compiler_folder, f"lib/clang/{clang_version}/include/ccelib")
+
+    def read_header(header_path):
+        with open(os.path.join(ccelib_path, header_path), 'r') as f:
+            code = f.read()
+
+        # remove all #include "..."
+        lines = code.splitlines()
+        purged_lines = []
+        for line in lines:
+            normalized_line = ' '.join(line.split())
+            if not normalized_line.startswith('#include "'):
+                purged_lines.append(line)
+        code = '\n'.join(purged_lines)
+
+        # remove [aicore] functions
+        aicore_positions = []
+        for m in re.finditer(r'\[aicore\]', code):
+            aicore_positions.append(m.start())
+
+        def find_aicore_function_span(src, pos):
+            for i in range(pos - 1, -1, -1):
+                if src[i] == '}':  # this relies on all [aicore] functions coming after the normal functions
+                    left = i + 1
+                    break
+            n = len(src)
+            brace_nest = 0
+            for j in range(pos, n, 1):
+                if src[j] == '{':
+                    brace_nest += 1
+                elif src[j] == '}':
+                    brace_nest -= 1
+                    if brace_nest == 0:
+                        right = j
+                        break
+            return left, right
+
+        new_code = ''
+        segment_start = 0
+        for pos in aicore_positions:
+            left, right = find_aicore_function_span(code, pos)
+            new_code += code[segment_start:left]
+            segment_start = right + 1
+        new_code += code[segment_start:]
+
+        # remove __gm__ and rename macros
+        new_code = new_code.replace('__gm__', ' ')
+        new_code = new_code.replace('__CCELIB_RT_ERROR_NONE', 'RT_ERROR_NONE')
+        new_code = new_code.replace('__CCELIB_RT_MEMORY_HBM', 'RT_MEMORY_HBM')
+        new_code = new_code.replace('__CCELIB_RT_MEMCPY_HOST_TO_DEVICE', 'RT_MEMCPY_HOST_TO_DEVICE')
+        new_code = new_code.replace('__CCELIB_RT_MEMCPY_DEVICE_TO_HOST', 'RT_MEMCPY_DEVICE_TO_HOST')
+        return new_code
+
+    # the following headers must be included in this order
+    return '\n'.join([
+        read_header('common/common_impl.h'),
+        read_header('internal/debug_tunnel/payload.h'),
+        read_header('internal/debug_tunnel/payload_impl.h'),
+        read_header('internal/debug_tunnel/tunnel.h'),
+        read_header('internal/debug_tunnel/tunnel_impl.h')
+    ])
+
+
+# The template below comes from triton-adapter HEAD. It wraps the generated
+# kernel binary into a Python extension module.
+def generate_npu_wrapper_src(constants, signature, workspace_size, mix_mode, lock_num, lock_ini_val):
+    import os
+
+    def _ty_to_cpp(ty):
+        if ty[0] == '*':
+            return "void*"
+        return {
+            "i1": "int32_t",
+            "i8": "int8_t",
+            "i16": "int16_t",
+            "i32": "int32_t",
+            "i64": "int64_t",
+            "u32": "uint32_t",
+            "u64": "uint64_t",
+            "fp16": "float",
+            "bf16": "float",
+            "fp32": "float",
+            "f32": "float",
+            "fp64": "double",
+        }[ty]
+
+    def _extracted_ty(ty):
+        if ty[0] == '*':
+            return "PyObject*"
+        return {
+            'i1': 'int32_t',
+            'i32': 'int32_t',
+            'i64': 'int64_t',
+            'u32': 'uint32_t',
+            'u64': 'uint64_t',
+            'fp16': 'float',
+            'bf16': 'float',
+            'fp32': 'float',
+            'f32': 'float',
+            'fp64': 'double',
+        }[ty]
+
+    def _format_of(ty):
+        return {
+            "PyObject*": "O",
+            "float": "f",
+            "double": "d",
+            "long": "l",
+            "uint32_t": "I",
+            "int32_t": "i",
+            "uint64_t": "K",
+            "int64_t": "L",
+        }[ty]
+
+    arg_decls = ', '.join(f"{_ty_to_cpp(ty)} arg{i}" for i, ty in signature.items())
+    """
+    args:
+        int gridX, gridY, gridZ;
+        rtStream_t stream;
+        const void *function;
+        PyObject* packed_metadata, *launch_metadata;
+        PyObject* launch_enter_hook, *launch_exit_hook;
+        *args_expand
+    """
+    format = "iiiKKOOOO" + ''.join([_format_of(_extracted_ty(ty)) for ty in signature.values()])
+
+    grid_info = {'X': 'i32', 'Y': 'i32', 'Z': 'i32'}
+
+    enable_device_print = os.getenv("TRITON_DEVICE_PRINT", 'false').lower() in ('true', '1')
+    enable_taskqueue = os.getenv("TRITON_ENABLE_TASKQUEUE", 'true').lower() in ('true', '1')
+    enable_auto_map_parallel_blocks = _is_auto_map_parallel_blocks_enabled()
+    npu_utils = NPUUtils()
+    num_physical_blocks = npu_utils.get_aivector_core_num() if mix_mode == "aiv" else npu_utils.get_aicore_num()
+    task_type = "MSPROF_GE_TASK_TYPE_AIV" if mix_mode == "aiv" else "MSPROF_GE_TASK_TYPE_AI_CORE"
+    LINE_CHANGE_CHAR = chr(10)  # i.e. '\n'
+
+    cpp_device_pointer = """
+typedef struct _DevicePtrInfo {
+  void *dev_ptr;
+  bool valid;
+} DevicePtrInfo;
+
+static inline DevicePtrInfo getPointer(PyObject *obj, int idx) {
+  DevicePtrInfo ptr_info;
+  ptr_info.dev_ptr = 0;
+  ptr_info.valid = true;
+  if (PyLong_Check(obj)) {
+    ptr_info.dev_ptr = reinterpret_cast<void *>(PyLong_AsUnsignedLongLong(obj));
+    return ptr_info;
+  }
+  if (obj == Py_None) {
+    // valid nullptr
+    return ptr_info;
+  }
+  PyObject *ptr = PyObject_GetAttrString(obj, "data_ptr");
+  if (ptr) {
+    PyObject *empty_tuple = PyTuple_New(0);
+    PyObject *ret = PyObject_Call(ptr, empty_tuple, NULL);
+    Py_DECREF(empty_tuple);
+    Py_DECREF(ptr);
+    if (!PyLong_Check(ret)) {
+      PyErr_SetString(PyExc_TypeError, "data_ptr method of Pointer object must return 64-bit int");
+      ptr_info.valid = false;
+      return ptr_info;
+    }
+    ptr_info.dev_ptr = reinterpret_cast<void *>(PyLong_AsUnsignedLongLong(ret));
+    if (!ptr_info.dev_ptr)
+      return ptr_info;
+    Py_DECREF(ret);
+    return ptr_info;
+  }
+  PyErr_SetString(PyExc_TypeError, "Pointer argument must be either uint64 or have data_ptr method");
+  return ptr_info;
+}
+"""
+
+    cpp_msprof_extern = """
+extern "C" {
+  typedef int (* callback)(unsigned int type, void* data, unsigned int len);
+  extern int MsprofReportApi(unsigned int agingFlag, const MsprofApi *api);
+  extern unsigned long int MsprofSysCycleTime();
+  extern int MsprofRegisterCallback(unsigned int moduleId, callback handle);
+  static unsigned int __MsprofFlagL0 = 0;
+  static unsigned int __MsprofFlagL1 = 0;
+
+  int ProfCtrlHandle(unsigned int CtrlType, void* CtrlData, unsigned int DataLen) {
+    if ((CtrlData == nullptr) || (DataLen == 0U)) {
+      return 1;
+    }
+
+    if (CtrlType == 1) {
+      MsprofCommandHandle* handle = (MsprofCommandHandle *)(CtrlData);
+      if (handle->type >= 6)  // types >= 6 are not used here
+        return 1;
+      if (handle->type == 1) {  // init - 0, start - 1
+        __MsprofFlagL0 = ((0x00000800ULL & handle->profSwitch) == 0x00000800ULL) ? 1 : 0;
+        __MsprofFlagL1 = ((0x00000002ULL & handle->profSwitch) == 0x00000002ULL) ? 1 : 0;
+      }
+    }
+    return 0;
+  }
+}
+"""
+
+    cpp_msprof_callback = """
+  MsprofRegisterCallback(8, ProfCtrlHandle);  // 8 = CCE, as defined in the msprof header slog.h
+"""
+
+    cpp_msprof_call_before_launch = """
+    unsigned long int beginTime = 0;
+    unsigned long int endTime = 0;
+    unsigned long int opNameHashID = 0;
+    unsigned int threadId = 0;
+    char* _kernelName = const_cast<char *>(name.c_str());
+    size_t length = name.length();
+    if (__MsprofFlagL0 || __MsprofFlagL1)
+    {
+      beginTime = MsprofSysCycleTime();
+    }
+"""
+
+    cpp_msprof_call_after_launch = f"""
+    if (__MsprofFlagL0 || __MsprofFlagL1)
+    {{
+      endTime = MsprofSysCycleTime();
+      opNameHashID = MsprofGetHashId(_kernelName, length);
+      threadId = (unsigned int)(syscall(SYS_gettid));
+      MsprofApi info;
+      info.level = MSPROF_REPORT_NODE_LEVEL;
+      info.magicNumber = 0x5a5a;  // MSPROF_REPORT_DATA_MAGIC_NUM
+      info.type = MSPROF_REPORT_NODE_LAUNCH_TYPE;
+      info.threadId = threadId;
+      info.reserve = 0;
+      info.beginTime = beginTime;
+      info.endTime = endTime;
+      info.itemId = opNameHashID;
+      MsprofReportApi(false, &info);
+    }}
+    if (__MsprofFlagL1)
+    {{
+      MsprofCompactInfo nodeBasicInfo;
+      nodeBasicInfo.level = MSPROF_REPORT_NODE_LEVEL;
+      nodeBasicInfo.magicNumber = 0x5a5a;  // MSPROF_REPORT_DATA_MAGIC_NUM
+      nodeBasicInfo.type = MSPROF_REPORT_NODE_BASIC_INFO_TYPE;
+      nodeBasicInfo.threadId = threadId;
+      nodeBasicInfo.timeStamp = endTime;
+      nodeBasicInfo.data.nodeBasicInfo.opName = opNameHashID;
+      nodeBasicInfo.data.nodeBasicInfo.opType = opNameHashID;
+      nodeBasicInfo.data.nodeBasicInfo.taskType = {task_type};
+      nodeBasicInfo.data.nodeBasicInfo.blockDim = blockNum;
+      MsprofReportCompactInfo(0, static_cast<void *>(&nodeBasicInfo), sizeof(MsprofCompactInfo));
+
+      // Report tensor info
+      int max_tensors_num = tensorShapes.size() < MSPROF_GE_TENSOR_DATA_NUM ? tensorShapes.size() : MSPROF_GE_TENSOR_DATA_NUM;
+      MsprofAdditionalInfo tensorInfo;
+      tensorInfo.level = MSPROF_REPORT_NODE_LEVEL;
+      tensorInfo.type = MSPROF_REPORT_NODE_TENSOR_INFO_TYPE;
+      tensorInfo.threadId = threadId;
+      tensorInfo.timeStamp = endTime;
+      auto profTensorData = reinterpret_cast<MsprofGeProfTensorData *>(tensorInfo.data);
+      profTensorData->opName = opNameHashID;
+      int tensorCount = 0;
+      int dataTypes[MSPROF_GE_TENSOR_DATA_NUM];
+      if (tensorShapes.size() > 0) {{
+        {LINE_CHANGE_CHAR.join(
+            f'dataTypes[{i}] = {convert_sigtype_to_int(ty[1:])};'
+            for i, ty in signature.items()
+            if ty.startswith("*") and i < 5
+        )}
+      }}
+      for (int i = 0; i < tensorShapes.size() && tensorCount < MSPROF_GE_TENSOR_DATA_NUM; i++) {{
+        auto fillTensorData = [&](int index, int tensorType) {{
+          profTensorData->tensorData[index].tensorType = tensorType;
+          profTensorData->tensorData[index].format = 2;  // GeDataFormat: ND = 2
+          profTensorData->tensorData[index].dataType = dataTypes[i];
+          int nDim = tensorShapes[i].size();
+          nDim = nDim < MSPROF_GE_TENSOR_DATA_SHAPE_LEN ? nDim : MSPROF_GE_TENSOR_DATA_SHAPE_LEN;
+          for (int j = 0; j < nDim; j++) {{
+            profTensorData->tensorData[index].shape[j] = tensorShapes[i][j];
+          }}
+          for (int j = nDim; j < MSPROF_GE_TENSOR_DATA_SHAPE_LEN; j++) {{
+            profTensorData->tensorData[index].shape[j] = 0;
+          }}
+        }};
+        int tensorType = (i < tensorKinds.size()) ? tensorKinds[i] : 0;  // default tensor kind is input
+        if (tensorType == TENSOR_KIND_INPUT || tensorType == TENSOR_KIND_INPUT_OUTPUT) {{
+          fillTensorData(tensorCount, MSPROF_GE_TENSOR_TYPE_INPUT);
+          tensorCount++;
+        }}
+        if ((tensorType == TENSOR_KIND_OUTPUT || tensorType == TENSOR_KIND_INPUT_OUTPUT) && tensorCount < MSPROF_GE_TENSOR_DATA_NUM) {{
+          fillTensorData(tensorCount, MSPROF_GE_TENSOR_TYPE_OUTPUT);
+          tensorCount++;
+        }}
+      }}
+      profTensorData->tensorNum = tensorCount;
+      MsprofReportAdditionalInfo(false, static_cast<void *>(&tensorInfo), sizeof(MsprofAdditionalInfo));
+    }}
+"""
+
+    return f"""
+#include <algorithm>
+#include <string>
+#include <vector>
+#include <sys/syscall.h>
+#include <unistd.h>
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+{'#include <torch_npu/csrc/framework/OpCommand.h>' if enable_taskqueue else ''}
+#include "experiment/runtime/runtime/rt.h"
+{extract_device_print_code_from_cann() if enable_device_print else ''}
+
+#define TENSOR_KIND_INPUT 0
+#define TENSOR_KIND_OUTPUT 1
+#define TENSOR_KIND_INPUT_OUTPUT 2
+
+{cpp_msprof_extern}
+
+{cpp_device_pointer}
+
+static void _launch(const char* kernelName, const void* func, rtStream_t stream, int gridX, int gridY, int gridZ, std::vector<std::vector<int64_t>> &tensorShapes, std::vector<int> &tensorKinds, {arg_decls}) {{
+  // only 1D parallelization is supported for NPU
+  // Pointer types become a flattened 1-D memref tuple: base_ptr, data_ptr, offset, shape, stride;
+  // base_ptr, offset, shape and stride are not used, so they are set arbitrarily for now
+  std::string name = "";
+  name.append(kernelName);
+  {'auto launch_call = [=]()' if enable_taskqueue else ''} {{
+    uint32_t blockNum = gridX * gridY * gridZ;
+    {'blockNum = std::min(blockNum, (uint32_t)' + str(num_physical_blocks) + ');' if enable_auto_map_parallel_blocks else ''}
+    {'cce::internal::DebugTunnelData *DTData = cce::internal::DebugTunnel::Open(blockNum);' if enable_device_print else ''}
+    rtError_t ret;
+    void *ffts_addr = NULL;
+    uint32_t ffts_len;
+    ret = rtGetC2cCtrlAddr((uint64_t*)&ffts_addr, &ffts_len);
+    if (ret != RT_ERROR_NONE) {{
+      return {'ret' if enable_taskqueue else ''};
+    }}
+    // stub arguments for the workspace
+    void *syncBlockLock = NULL;
+    void *workspace_addr = NULL;
+    uint16_t ModuleId = 0;
+    {f'''
+    uint64_t syncBlockLockSize = {lock_num} * sizeof(int64_t);
+    ret = rtMalloc(reinterpret_cast<void **>(&syncBlockLock),
+                   syncBlockLockSize, RT_MEMORY_HBM, 0);
+    if (ret != RT_ERROR_NONE) {{
+      return {'ret' if enable_taskqueue else ''};
+    }}
+    std::vector<int64_t> lockInitData({lock_num}, {lock_ini_val});
+    ret = rtMemcpy(syncBlockLock, syncBlockLockSize, reinterpret_cast<void *>(lockInitData.data()),
+                   syncBlockLockSize, RT_MEMCPY_HOST_TO_DEVICE);
+    if (ret != RT_ERROR_NONE) {{
+      return {'ret' if enable_taskqueue else ''};
+    }}
+    ''' if lock_num > 0 else ''}
+    {f'''
+    uint64_t totalWorkSpaceSize = {workspace_size} * blockNum;
+    ret = rtMalloc(reinterpret_cast<void **>(&workspace_addr),
+                   totalWorkSpaceSize, RT_MEMORY_HBM, ModuleId);
+    if (ret != RT_ERROR_NONE) {{
+      return {'ret' if enable_taskqueue else ''};
+    }}
+    ''' if workspace_size > 0 else ''}
+    struct __attribute__((packed)) {{
+      void* ffts_addr __attribute__((aligned(8)));
+      void* syncBlockLock __attribute__((aligned(8)));
+      void* workspace_addr __attribute__((aligned(8)));
+      {' '.join(f'{_ty_to_cpp(ty)} arg{i} __attribute__((aligned({4 if ty[0] != "*" and ty[-2:] != "64" else 8})));' for i, ty in signature.items() if i not in constants)}
+      {' '.join(f'{_ty_to_cpp(ty)} grid{mark} __attribute__((aligned(4)));' for mark, ty in grid_info.items())}
+      {'void* DTData __attribute__((aligned(8)));' if enable_device_print else ''}
+    }} args = {{
+      static_cast<void *>(ffts_addr),
+      static_cast<void *>(syncBlockLock),
+      static_cast<void *>(workspace_addr),
+      {', '.join(f'static_cast<{_ty_to_cpp(ty)}>(arg{i})' for i, ty in signature.items() if i not in constants)},
+      {', '.join(f'static_cast<{_ty_to_cpp(ty)}>(grid{mark})' for mark, ty in grid_info.items())}
+      {', static_cast<void *>(DTData)' if enable_device_print else ''}
+    }};
+    {cpp_msprof_call_before_launch}
+    ret = rtKernelLaunch(func, blockNum, static_cast<void *>(&args), sizeof(args), NULL, stream);
+    {'void *&stream_ref = const_cast<void *&>(stream);' if enable_device_print else ''}
+    {'cce::internal::DebugTunnel::Close(DTData, stream_ref);' if enable_device_print else ''}
+    {cpp_msprof_call_after_launch}
+    {'return ret;' if enable_taskqueue else ''}
+  }};
+  {'at_npu::native::OpCommand cmd; cmd.Name(name.c_str()).SetCustomHandler(launch_call).Run();' if enable_taskqueue else ''}
+  return;
+}}
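For orientation, a hedged sketch of what the generator derives from a concrete signature (the two-argument signature is made up):

signature = {0: '*fp32', 1: 'i32'}  # hypothetical kernel: one pointer, one int32
# PyArg_ParseTuple format: grid (iii), stream/function (KK), four metadata
# objects (OOOO), then 'O' for the pointer and 'i' for the int32:
fmt = "iiiKKOOOO" + "Oi"
# Packed launch struct order: ffts_addr, syncBlockLock, workspace_addr (each
# void*, aligned(8)), then void* arg0 aligned(8), int32_t arg1 aligned(4),
# and finally int32_t gridX, gridY, gridZ, each aligned(4).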
+
+// Extract tensor shape from PyObject
+static std::vector<int64_t> _get_tensor_shape(PyObject *tensor) {{
+  std::vector<int64_t> shape;
+
+  // Early return if the tensor is None or null
+  if (!tensor || tensor == Py_None) {{
+    return shape;
+  }}
+
+  // Call tensor.size()
+  PyObject* size_result = PyObject_CallMethod(tensor, "size", NULL);
+  if (!size_result) {{
+    return shape;
+  }}
+  // Use PySequence_Fast for efficient element access
+  PyObject* seq = PySequence_Fast(size_result, "Expected a sequence from tensor.size()");
+  if (seq) {{
+    Py_ssize_t len = PySequence_Fast_GET_SIZE(seq);
+    PyObject** items = PySequence_Fast_ITEMS(seq);
+    for (Py_ssize_t i = 0; i < len; ++i) {{
+      PyObject* dim = items[i];
+      if (PyLong_Check(dim)) {{
+        shape.push_back(PyLong_AsLong(dim));
+      }}
+    }}
+  }}
+  Py_DECREF(seq);
+  Py_DECREF(size_result);
+  return shape;
+}}
+
+static PyObject* launch(PyObject* self, PyObject* args) {{
+  int gridX, gridY, gridZ;
+  rtStream_t stream;
+  const void *function;
+  PyObject *packedMetadata = NULL;
+  PyObject *launch_metadata = NULL;
+  PyObject *launch_enter_hook = NULL;
+  PyObject *launch_exit_hook = NULL;
+  std::vector<std::vector<int64_t>> tensorShapes;
+  {' '.join([f"{_extracted_ty(ty)} _arg{i}; " for i, ty in signature.items()])}
+  if (!PyArg_ParseTuple(
+        args, \"{format}\",
+        &gridX, &gridY, &gridZ, &stream, &function,
+        &packedMetadata, &launch_metadata,
+        &launch_enter_hook, &launch_exit_hook
+        {', ' + ', '.join(f"&_arg{i}" for i, ty in signature.items()) if len(signature) > 0 else ''}
+      )
+  ) {{
+    return NULL;
+  }}
+  if (__MsprofFlagL1)
+  {{
+    {
+      LINE_CHANGE_CHAR.join(
+          f"{{ auto tmp = _get_tensor_shape(_arg{i}); if (!tmp.empty()) tensorShapes.push_back(tmp); }}"
+          for i, ty in signature.items() if ty[0] == "*"
+      )
+    }
+  }}
+
+  if (launch_enter_hook != Py_None && !PyObject_CallObject(launch_enter_hook, args)) {{
+    return NULL;
+  }}
+
+  // get kernel_name
+  PyObject *kernelNameObj = PyDict_GetItemString(packedMetadata, "kernel_name");
+  const char *kernelName = PyUnicode_AsUTF8(kernelNameObj);
+  // get tensor_kinds
+  std::vector<int> tensorKinds;
+  PyObject *tensorKindList = PyDict_GetItemString(packedMetadata, "tensor_kinds");
+  if (tensorKindList) {{
+    int size = PyObject_Size(tensorKindList);
+    for (int i = 0; i < size; i++) {{
+      PyObject *kind = PySequence_GetItem(tensorKindList, i);
+      tensorKinds.push_back(PyLong_AsLong(kind));
+    }}
+  }}
+
+  // raise exceptions as early as possible
+  {"; ".join([f"DevicePtrInfo ptr_info{i} = getPointer(_arg{i}, {i}); if (!ptr_info{i}.valid) return NULL;" if ty[0] == "*" else "" for i, ty in signature.items()])};
+  _launch(kernelName, function, stream, gridX, gridY, gridZ, tensorShapes, tensorKinds, {', '.join(f"ptr_info{i}.dev_ptr" if ty[0] == "*" else f"_arg{i}" for i, ty in signature.items())});
+  if (PyErr_Occurred()) {{
+    return NULL;
+  }}
+  if (launch_exit_hook != Py_None && !PyObject_CallObject(launch_exit_hook, args)) {{
+    return NULL;
+  }}
+  Py_RETURN_NONE;
+}}
+
+static PyMethodDef ModuleMethods[] = {{
+  {{"launch", launch, METH_VARARGS, "Entry point for all kernels with this signature"}},
+  {{NULL, NULL, 0, NULL}}  // sentinel
+}};
+
+static struct PyModuleDef ModuleDef = {{
+  PyModuleDef_HEAD_INIT,
+  \"__triton_launcher\",
+  NULL,  // documentation
+  -1,    // size
+  ModuleMethods
+}};
+
+PyMODINIT_FUNC PyInit___triton_launcher(void) {{
+  PyObject *m = PyModule_Create(&ModuleDef);
+  if (m == NULL) {{
+    return NULL;
+  }}
+  PyModule_AddFunctions(m, ModuleMethods);
+  {cpp_msprof_callback}
+  return m;
+}}
+"""
diff --git a/third_party/aipu/backend/npu_utils.cpp b/third_party/aipu/backend/npu_utils.cpp
new file mode 100644
index 000000000..74f3d5ca7
--- /dev/null
+++ b/third_party/aipu/backend/npu_utils.cpp
@@ -0,0 +1,134 @@
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+
+#include <memory>
+#include <string>
+#include <tuple>
+#include <unordered_map>
+
+#include "experiment/runtime/runtime/rt.h"
+
+// Use maps to differentiate same-named functions coming from different binaries
+static std::unordered_map<std::string, int> registered_names;
+static std::unordered_map<std::string, std::unique_ptr<size_t>> func_stubs;
+
+static std::tuple<void *, void *>
+registerKernel(const char *name, const void *data, size_t data_size, int shared,
+               int device, const char *kernel_mode_str) {
+  rtError_t rtRet;
+
+  rtDevBinary_t devbin;
+  devbin.data = data;
+  devbin.length = data_size;
+  const std::string kernel_mode{kernel_mode_str};
+  if (kernel_mode == "aiv")
+    devbin.magic = RT_DEV_BINARY_MAGIC_ELF_AIVEC;
+  else
+    devbin.magic = RT_DEV_BINARY_MAGIC_ELF;
+  devbin.version = 0;
+
+  rtRet = rtSetDevice(device);
+  if (rtRet != RT_ERROR_NONE) {
+    printf("rtSetDevice failed, 0x%x\n", rtRet);
+    return {NULL, NULL};
+  }
+
+  void *devbinHandle = NULL;
+  rtRet = rtDevBinaryRegister(&devbin, &devbinHandle);
+  if (rtRet != RT_ERROR_NONE) {
+    printf("rtDevBinaryRegister failed, 0x%x\n", rtRet);
+    return {NULL, NULL};
+  }
+
+  std::string stubName = name;
+  stubName += "_" + std::to_string(registered_names[name]);
+  registered_names[name]++;
+  auto registered = func_stubs.emplace(stubName, std::make_unique<size_t>(0));
+  void *func_stub_handle = registered.first->second.get();
+  rtRet = rtFunctionRegister(devbinHandle, func_stub_handle, stubName.c_str(),
+                             (void *)name, 0);
+  if (rtRet != RT_ERROR_NONE) {
+    printf("rtFunctionRegister failed(stubName = %s), 0x%x\n", stubName.c_str(),
+           rtRet);
+    return {NULL, NULL};
+  }
+
+  return std::make_tuple(devbinHandle, func_stub_handle);
+}
+
+static PyObject *loadKernelBinary(PyObject *self, PyObject *args) {
+  const char *name;        // kernel name
+  const char *data;        // binary pointer
+  Py_ssize_t data_size;    // binary size
+  int shared;              // shared memory (meaningless for now)
+  int device;              // device ID
+  const char *kernel_mode; // kernel mode
+
+  if (!PyArg_ParseTuple(args, "ss#iis", &name, &data, &data_size, &shared,
+                        &device, &kernel_mode)) {
+    return NULL;
+  }
+
+  auto [module_handle, func_handle] =
+      registerKernel(name, data, data_size, shared, device, kernel_mode);
+
+  uint64_t mod = reinterpret_cast<uint64_t>(module_handle);
+  uint64_t func = reinterpret_cast<uint64_t>(func_handle);
+  if (PyErr_Occurred()) {
+    return NULL;
+  }
+
+  return Py_BuildValue("(KKii)", mod, func, 0, 0);
+}
+
+static PyObject *getArch(PyObject *self, PyObject *args) {
+  char name[64] = {'\0'};
+
+  rtError_t rtRet = rtGetSocVersion(name, 64);
+
+  if (rtRet != RT_ERROR_NONE) {
+    printf("rtGetSocVersion failed, 0x%x", rtRet);
+    return NULL;
+  }
+  if (PyErr_Occurred()) {
+    return NULL;
+  }
+  return Py_BuildValue("s", name);
+}
+
+static PyObject *getAiCoreNum(PyObject *self, PyObject *args) {
+  uint32_t aiCoreCnt;
+
+  rtError_t rtRet = rtGetAiCoreCount(&aiCoreCnt);
+
+  if (rtRet != RT_ERROR_NONE) {
+    printf("rtGetAiCoreCount failed, 0x%x", rtRet);
+    return NULL;
+  }
+  if (PyErr_Occurred()) {
+    return NULL;
+  }
+  return Py_BuildValue("I", aiCoreCnt);
+}
+
+static PyMethodDef NpuUtilsMethods[] = {
+    {"load_kernel_binary", loadKernelBinary, METH_VARARGS,
+     "Load an NPU kernel binary into the NPU driver"},
+    {"get_arch", getArch, METH_VARARGS, "Get the SoC version of the NPU"},
+    {"get_aicore_num", getAiCoreNum, METH_VARARGS, "Get the number of AI cores"},
+    {NULL, NULL, 0, NULL}};
+
+static PyModuleDef ModuleDef = {
+    PyModuleDef_HEAD_INIT, "npu_utils",
+    "Utilities for fetching NPU device info and preparing kernel binaries", -1,
+    NpuUtilsMethods};
+
+PyMODINIT_FUNC PyInit_npu_utils(void) {
+  PyObject *m = PyModule_Create(&ModuleDef);
+  if (m == NULL) {
+    return NULL;
+  }
+
+  PyModule_AddFunctions(m, NpuUtilsMethods);
+  return m;
+}
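On the Python side, `NPUUtils.load_binary` (driver.py above) consumes this module. A hedged sketch of the call contract (`npu_utils_mod` and `kernel_bytes` are placeholders):

fnname, mix_mode = "triton_add_kernel aiv".split()  # metadata["name"] format
# Positional args match PyArg_ParseTuple("ss#iis"): name, data, shared, device, mode.
mod_h, func_h, n_regs, n_spills = npu_utils_mod.load_kernel_binary(
    fnname, kernel_bytes, 0, 0, mix_mode)
# mod_h/func_h are the rtDevBinaryRegister / rtFunctionRegister handles as
# uint64; the trailing (0, 0) mirrors the (n_regs, n_spills) slots that other
# backends return.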
diff --git a/third_party/aipu/backend/utils.py b/third_party/aipu/backend/utils.py
new file mode 100644
index 000000000..ec9919102
--- /dev/null
+++ b/third_party/aipu/backend/utils.py
@@ -0,0 +1,172 @@
+# -*- coding: utf-8 -*-
+import functools
+import os
+import shutil
+import subprocess
+import sysconfig
+from pathlib import Path
+import pybind11
+
+
+def _get_npucompiler_path() -> str:
+    npu_compiler_path = shutil.which("bishengir-compile")
+    if npu_compiler_path is None:
+        npu_compiler_root = os.getenv("TRITON_NPU_COMPILER_PATH", "")
+        if not npu_compiler_root:
+            raise EnvironmentError("Couldn't find executable bishengir-compile or TRITON_NPU_COMPILER_PATH.")
+        npu_compiler_path = os.path.join(npu_compiler_root, "npuc")
+    return npu_compiler_path
+
+
+def _get_bisheng_path() -> str:
+    bisheng_path = shutil.which("bisheng")
+    if bisheng_path is None:
+        npu_compiler_root = os.getenv("TRITON_NPU_COMPILER_PATH", "")
+        if not npu_compiler_root:
+            raise EnvironmentError("Couldn't find executable bisheng or TRITON_NPU_COMPILER_PATH")
+        bisheng_path = os.path.join(npu_compiler_root, "ccec")
+    return bisheng_path
+
+
+def _check_bishengir_api_change() -> bool:
+    bishengir_path = _get_npucompiler_path()
+    try:
+        result = subprocess.run(
+            [bishengir_path, "--help"],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True,
+        )
+        if result.returncode == 0 and 'limit-auto-multi-buffer-buffer' in result.stdout:
+            return True
+        else:
+            return False
+    except Exception as e:
+        print(f"ERROR: {e}")
+        return False
+
+
+def _check_bishengir_is_regbased() -> bool:
+    bishengir_path = _get_npucompiler_path()
+    try:
+        result = subprocess.run(
+            [bishengir_path, "--help"],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True,
+        )
+        if result.returncode == 0 and 'reg-based' in result.stdout:
+            return True
+        else:
+            return False
+    except Exception as e:
+        print(f"ERROR: {e}")
+        return False
+
+
+@functools.lru_cache(None)
+def _get_ascend_path() -> Path:
+    path = os.getenv("ASCEND_HOME_PATH", "")
+    if path == "":
+        raise EnvironmentError("ASCEND_HOME_PATH is not set, source /set_env.sh first")
+    return Path(path)
+
+
+def _is_ascend_sanitizer_enabled() -> bool:
+    return os.getenv("TRITON_ENABLE_SANITIZER", "false").lower() in ("true", "1")
+
+
+def _is_debug_line_info_disabled() -> bool:
+    return os.getenv("TRITON_DISABLE_LINE_INFO", "true").lower() in ("true", "1")
+
+
+def _is_auto_map_parallel_blocks_enabled() -> bool:
+    if not _enable_unpublished_feature():
+        return False
+    return os.getenv("TRITON_ALL_BLOCKS_PARALLEL", "false").lower() in ("true", "1")
+
+
+def _enable_unpublished_feature() -> bool:
+    return os.getenv("ENABLE_UNPUBLISHED_FEATURE", "false").lower() in ("true", "1")
+
+
+def _build_npu_ext(obj_name: str, src_path, src_dir, *, kernel_launcher=None) -> str:
+    suffix = sysconfig.get_config_var("EXT_SUFFIX")
+    so_path = os.path.join(src_dir, f"{obj_name}{suffix}")
+
+    cxx = os.environ.get("CC")
+    if cxx is None:
+        clangxx = shutil.which("clang++")
+        gxx = shutil.which("g++")
+        cxx = clangxx if clangxx is not None else gxx
+        if cxx is None:
+            raise RuntimeError("Failed to find C++ compiler")
+    cc_cmd = [cxx, src_path]
+    cc_cmd += ["-w"]
+
+    if hasattr(sysconfig, "get_default_scheme"):
+        scheme = sysconfig.get_default_scheme()
+    else:
+        scheme = sysconfig._get_default_scheme()
+    # 'posix_local' is a Debian-specific scheme; fall back to 'posix_prefix'
+    if scheme == "posix_local":
+        scheme = "posix_prefix"
+    py_include_dir = sysconfig.get_paths(scheme=scheme)["include"]
+    cc_cmd += [f"-I{py_include_dir}"]
+    cc_cmd += [f"-I{os.path.dirname(os.path.realpath(__file__))}"]
+
+    asc_path = _get_ascend_path()
+    cc_cmd += [
+        f"-I{os.path.join(asc_path, 'include')}",
+        f"-I{os.path.join(asc_path, 'include/experiment')}",
+        f"-I{os.path.join(asc_path, 'include/experiment/msprof')}",
+        f"-I{pybind11.get_include()}",
+        f"-L{os.path.join(asc_path, 'lib64')}",
+        "-lruntime",
+        "-lascendcl",
+    ]
+
+    if kernel_launcher == "torch":
+        import torch
+        import torch_npu
+        torch_path = os.path.dirname(os.path.realpath(torch.__file__))
+        torch_npu_path = os.path.dirname(os.path.realpath(torch_npu.__file__))
+        use_cxx11_abi = _check_cxx11_abi()
+        cc_cmd += [
+            f"-I{os.path.join(torch_path, 'include')}",
+            f"-I{os.path.join(torch_npu_path, 'include')}",
+            f"-L{os.path.join(torch_npu_path, 'lib')}",
+            "-ltorch_npu",
+            f"-D_GLIBCXX_USE_CXX11_ABI={use_cxx11_abi}",
+        ]
+
+    cc_cmd += ["-std=c++17", "-shared", "-fPIC", "-o", so_path]
+    # check_call raises CalledProcessError on failure and returns 0 otherwise
+    ret = subprocess.check_call(cc_cmd)
+
+    if ret == 0:
+        return so_path
+    else:
+        raise RuntimeError("Failed to compile " + src_path)
+
+
+def _check_cxx11_abi():
+    import torch
+    return 1 if torch._C._GLIBCXX_USE_CXX11_ABI else 0
+
+
+def convert_sigtype_to_int(sigty: str):
+    MAP_SIGTYPE_TO_INT = {
+        "i1": 12,
+        "i8": 2,
+        "i16": 6,
+        "i32": 3,
+        "i64": 9,
+        "u32": 8,
+        "u64": 10,
+        "fp16": 1,
+        "bf16": 27,
+        "fp32": 0,
+        "fp64": 11,
+    }
+    if sigty not in MAP_SIGTYPE_TO_INT:
+        raise ValueError(f"Unsupported data type: {sigty}")
+    return MAP_SIGTYPE_TO_INT[sigty]
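Finally, a hedged example of how the profiling path uses this table (the signature is made up):

signature = {0: '*fp16', 1: '*fp32', 2: 'i32'}  # hypothetical kernel signature
dtype_codes = [convert_sigtype_to_int(ty[1:])   # strip the leading '*'
               for ty in signature.values() if ty.startswith('*')]
assert dtype_codes == [1, 0]                    # fp16 -> 1, fp32 -> 0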