diff --git a/CMakeLists.txt b/CMakeLists.txt
index 16431d047..71f1dc6e0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -31,8 +31,8 @@ elseif(FLAGTREE_BACKEND STREQUAL "mthreads")
   set(CMAKE_CXX_COMPILER clang++)
   set(ENV{FLAGTREE_PLUGIN} $ENV{FLAGTREE_BACKEND})
 elseif(FLAGTREE_BACKEND STREQUAL "aipu")
-  set(CMAKE_C_COMPILER clang-16)
-  set(CMAKE_CXX_COMPILER clang++-16)
+  set(CMAKE_C_COMPILER clang-15)
+  set(CMAKE_CXX_COMPILER clang++-15)
   add_definitions(-D__NVIDIA__)
   add_definitions(-D__AMD__)
 elseif(FLAGTREE_BACKEND STREQUAL "tsingmicro")
diff --git a/python/setup_tools/setup_helper.py b/python/setup_tools/setup_helper.py
index 230976cf4..b0935bb53 100644
--- a/python/setup_tools/setup_helper.py
+++ b/python/setup_tools/setup_helper.py
@@ -360,9 +360,9 @@ def check_env(env_val):
 # aipu
 cache.store(
-    file="llvm-a66376b0-ubuntu-x64",
+    file="llvm-a66376b0-ubuntu-arm64",
     condition=("aipu" == flagtree_backend),
-    url="https://oaitriton.blob.core.windows.net/public/llvm-builds/llvm-a66376b0-ubuntu-x64.tar.gz",
+    url="https://oaitriton.blob.core.windows.net/public/llvm-builds/llvm-a66376b0-ubuntu-arm64.tar.gz",
     pre_hock=lambda: check_env('LLVM_SYSPATH'),
     post_hock=set_llvm_env,
 )
diff --git a/python/triton/backends/compiler.py b/python/triton/backends/compiler.py
index 3583429de..cac42a663 100644
--- a/python/triton/backends/compiler.py
+++ b/python/triton/backends/compiler.py
@@ -1,12 +1,218 @@
 import os
 import re
+import hashlib
 import subprocess
-import sysconfig
-from abc import ABCMeta, abstractmethod
+
+from abc import ABCMeta, abstractmethod, abstractclassmethod
 from dataclasses import dataclass
-from typing import Dict, Union
+from typing import Dict, List, Tuple, Union
 from types import ModuleType
 
+# Table that associates strings to AttrsDescriptor (sub)classes.
+# In this way we can dynamically select the correct class
+# constructor.
+_descriptor_table = {}
+
+
+def register_descriptor(cls):
+    """
+    Register a descriptor into the descriptor table
+    """
+    _descriptor_table[cls.__name__] = cls
+    return cls
+
+
+@register_descriptor
+class AttrsDescriptor:
+    """
+    This class handles compile-time properties for specific function parameters.
+
+    Different backends can add more properties to the common ones. The class
+    contains three fields:
+
+    `arg_properties`: a dictionary mapping each property name to the indices of the
+    parameters it applies to, e.g.:
+        {
+        "prop0": (0, 2, 3)
+        "prop1": (0, 4, 5)
+        }
+    Different backends might need different properties on those parameters to enable
+    specific optimizations. The common compile-time properties contained in this class
+    are:
+    - "tt.divisibility", i.e., is the given parameter divisible by 16
+    - "tt.equal_to_1", i.e., is the given parameter an integer constant 1
+
+    `property_values`: a dictionary containing the value of each compile-time property,
+    like:
+        {
+        "prop0": val0
+        "prop1": val1
+        }
+
+    `constant_properties`: a set containing the properties that can be used to determine
+    if a parameter is constant
+    """
+    __slots__ = ('divisibility_16', 'equal_to_1', 'arg_properties', 'property_values', 'constant_properties')
+
+    def __init__(self, params=None, values=None):
+        """
+        Initialize the compile-time properties.
+
+        We can initialize the AttrsDescriptor class by passing the list of `params`
+        of the function and their `values`. The constructor will try to apply the
+        properties to the values and save the matching parameters in the
+        `arg_properties` field. If we don't pass either the `params` or the `values`,
+        the class should be initialized via an alternative method
+        (see `from_dict` or `from_hints`).
+        """
+        # Default initialization
+        self.arg_properties = {}
+        self.property_values = {}
+        self.constant_properties = set()
+
+        self._add_common_properties(params, values)
+        self._add_backend_properties(params, values)
+        self._init_slots()
+
+    def _add_common_properties(self, params, values):
+        """ Add common compile-time properties """
+        self.property_values["tt.divisibility"] = 16
+        self.property_values["tt.equal_to"] = 1
+        self.constant_properties.add("tt.equal_to")
+
+        if (params is None) or (values is None):
+            return
+
+        # Compile properties deduction
+        assert (len(params) == len(values))
+
+        # Divisibility property
+        self.arg_properties["tt.divisibility"] = [
+            param.num for param, arg in zip(params, values) if AttrsDescriptor.is_divisible_by_16(arg)
+            and not param.do_not_specialize and not param.do_not_specialize_on_alignment
+        ]
+
+        # Equal to 1 property
+        self.arg_properties["tt.equal_to"] = [
+            param.num
+            for param, arg in zip(params, values)
+            if AttrsDescriptor.is_equal_to_1(arg) and not param.do_not_specialize
+        ]
+
+    def _add_backend_properties(self, params=None, values=None):
+        """ This method is for different subclasses to implement their own compile-time properties """
+        pass
+
+    def _init_slots(self):
+        """ Initialize the slots of this class """
+        for name, val in self.arg_properties.items():
+            setattr(self, name.removeprefix('tt.') + '_' + str(self.property_values[name]), val)
+
+    def get_fn_attrs(self) -> Dict:
+        """
+        Get the function attributes as a dictionary.
+
+        The returned dictionary will look like:
+            {
+            "arg0": [(prop_name00, val00), (prop_name01, val01), ...],
+            "arg1": [(prop_name10, val10), (prop_name11, val11), ...],
+            }
+        """
+        attrs = {}
+        for prop_name, arg_set in self.arg_properties.items():
+            prop_val = self.property_values[prop_name]
+            for arg in arg_set:
+                attrs[arg] = attrs.get(arg, []) + [(prop_name, prop_val)]
+        return attrs
+
+    def get_constants(self) -> Dict:
+        """ Return a mapping of constant parameters to their values """
+        constants = {}
+        for prop_name in self.constant_properties:
+            for p in self.arg_properties.get(prop_name, []):
+                constants[p] = self.property_values[prop_name]
+        return constants
+
+    def filter_out_constants(self):
+        """ Return the same object, without properties marked as constants """
+        import copy
+        c = copy.deepcopy(self)
+        for prop_name in c.constant_properties:
+            c.arg_properties.pop(prop_name, None)
+            c.property_values.pop(prop_name, None)
+        c.constant_properties = set()
+        return c
+
+    def hash(self):
+        values = [sorted(self.arg_properties.values())]
+        values += [sorted(self.property_values.values())]
+        values += [sorted(self.constant_properties)]
+        key = str(values)
+        return hashlib.sha256(key.encode("utf-8")).hexdigest()
+
+    def to_dict(self):
+        """
+        Store the fields of this class in a serializable dictionary
+        """
+        # We only need to store the `arg_properties` field. To initialize the
+        # other fields we rely on the class type. We store it as a string in
+        # the dictionary so that we can use it to invoke the appropriate
+        # (sub)class constructor in the `from_dict` method.
+        return {"arg_properties": self.arg_properties, "cls": type(self).__name__}
+
+    @staticmethod
+    def from_dict(data):
+        """
+        Create the object from a serializable dictionary
+        """
+        attrs_descriptor = _descriptor_table[data["cls"]]()
+        for prop_name, param_ids in data["arg_properties"].items():
+            attrs_descriptor.arg_properties[prop_name] = param_ids
+        attrs_descriptor._init_slots()
+        return attrs_descriptor
+
+    @classmethod
+    def from_hints(cls, hints: Dict[int, int]):
+        """
+        Create the class from a set of hints that are passed in.
+
+        Instead of deducing the properties from a list of parameters and values,
+        the user can pass in a mapping of `hints={param_index: val}`, and if `val`
+        matches one of the values of the properties (e.g., `prop_val[prop0]`),
+        then we insert `param_index` into the correct list (e.g., into
+        `arg_properties[prop0]`)
+        """
+        attrs_descriptor = cls()
+        for prop_name, prop_val in attrs_descriptor.property_values.items():
+            attrs_descriptor.arg_properties[prop_name] = [i for i, h in hints.items() if h == prop_val]
+        attrs_descriptor._init_slots()
+        return attrs_descriptor
+
+    @staticmethod
+    def is_divisible_by_16(x):
+        """ Return whether the argument is a multiple of 16 """
+        if hasattr(x, "data_ptr"):
+            return x.data_ptr() % 16 == 0
+        elif isinstance(x, int):
+            return x % 16 == 0
+        if x is None:
+            return True
+        return False
+
+    @staticmethod
+    def is_equal_to_1(x):
+        """ Return whether the argument is the integer constant 1 """
+        return isinstance(x, int) and not isinstance(x, bool) and x == 1
+
+    @staticmethod
+    def get_property_key(val, align):
+        if align and AttrsDescriptor.is_divisible_by_16(val):
+            return "D"
+        if AttrsDescriptor.is_equal_to_1(val):
+            return "1"
+        return "N"
+
+    def __repr__(self):
+        return f"AttrsDescriptor.from_dict({self.to_dict()!r})"
+
 
 @dataclass(frozen=True)
 class GPUTarget(object):
@@ -25,23 +231,22 @@ def __init__(self, target: GPUTarget) -> None:
 
     @staticmethod
     def _path_to_binary(binary: str):
-        binary += sysconfig.get_config_var("EXE")
         base_dir = os.path.join(os.path.dirname(__file__), os.pardir)
         paths = [
             os.environ.get(f"TRITON_{binary.upper()}_PATH", ""),
             os.path.join(base_dir, "third_party", "cuda", "bin", binary),
         ]
-        for path in paths:
-            if os.path.exists(path) and os.path.isfile(path):
-                result = subprocess.check_output([path, "--version"], stderr=subprocess.STDOUT)
+        for p in paths:
+            bin = p.split(" ")[0]
+            if os.path.exists(bin) and os.path.isfile(bin):
+                result = subprocess.check_output([bin, "--version"], stderr=subprocess.STDOUT)
                 if result is not None:
                     version = re.search(r".*release (\d+\.\d+).*", result.decode("utf-8"), flags=re.MULTILINE)
                     if version is not None:
-                        return path, version.group(1)
+                        return p, version.group(1)
         raise RuntimeError(f"Cannot find {binary}")
 
-    @classmethod
-    @abstractmethod
+    @abstractclassmethod
     def supports_target(target: GPUTarget):
         raise NotImplementedError
 
@@ -84,21 +289,16 @@ def get_module_map(self) -> Dict[str, ModuleType]:
         """
         raise NotImplementedError
 
-    @staticmethod
-    def parse_attr(desc):
-        assert isinstance(desc, str)
-        ret = []
-        if "D" in desc:
-            ret += [["tt.divisibility", 16]]
-        return ret
+    def get_attrs_descriptor(self, params, args):
+        """
+        Return an attribute descriptor: given a set of parameters and arguments,
+        the descriptor stores a set of compile-time properties that can improve code
+        generation. Different backends might benefit from different properties.
+        """
+        return AttrsDescriptor(params, args)
 
-    @staticmethod
-    def get_arg_specialization(arg, ty, **kwargs):
+    def compute_spec_key(self, arg, align):
         """
-        Return a string unique to each possible specialization of the argument
+        Return the ASCII key for a given argument with a given set of properties
         """
-        if ty == "int" and arg % 16 == 0 and kwargs.get("align", False):
-            return "D"
-        if ty == "tensor" and arg.data_ptr() % 16 == 0 and kwargs.get("align", False):
-            return "D"
-        return ""
+        return AttrsDescriptor.get_property_key(arg, align)
diff --git a/third_party/aipu/backend/codegen.py b/third_party/aipu/backend/codegen.py
index 38688b119..ba2f7175d 100644
--- a/third_party/aipu/backend/codegen.py
+++ b/third_party/aipu/backend/codegen.py
@@ -1,8 +1,8 @@
 import numpy as np
-import tvm
-from tvm import tir, ir
-from tvm.script.parser import tir as T
-from tvm.compass.dsl import BuildManager, script as S
+#import tvm
+#from tvm import tir, ir
+#from tvm.script.parser import tir as T
+#from tvm.compass.dsl import BuildManager, script as S
 
 from mlir import ir as mlir_ir
 from mlir.dialects import func
diff --git a/third_party/aipu/backend/compiler.py b/third_party/aipu/backend/compiler.py
index be9091024..89a3a2d21 100644
--- a/third_party/aipu/backend/compiler.py
+++ b/third_party/aipu/backend/compiler.py
@@ -1,17 +1,41 @@
 import pickle
+import ctypes
+import functools
+import hashlib
+import os
+import re
+import subprocess
+import tempfile
+from pathlib import Path
+from dataclasses import dataclass
+from typing import Any, Dict, Tuple
+from types import ModuleType
+#20250923ph
 from triton.backends.aipu import transform, analysis
-from triton.backends.aipu.codegen import codegenAIPU
+#from triton.backends.aipu.codegen import codegenAIPU
 from triton.backends.compiler import BaseBackend, GPUTarget
 from triton._C.libtriton import ir, aipu, passes
-from triton._C import aipu_interface
+#import triton._C.libaipu_interface as aipu_interface
 from mlir.passmanager import PassManager
 from mlir.ir import Context, Module
-from dataclasses import dataclass
-import functools
-import hashlib
-from typing import Any, Dict, Tuple
-from types import ModuleType
+from triton.backends.aipu.utils import (
+    _check_bishengir_api_change,
+    _check_bishengir_is_regbased,
+    _enable_unpublished_feature,
+    _get_npucompiler_path,
+    _is_ascend_sanitizer_enabled,
+    _is_debug_line_info_disabled,
+    _is_auto_map_parallel_blocks_enabled,
+)
+from triton.backends.aipu.driver import NPUUtils
+
+from triton.backends.compiler import (
+    AttrsDescriptor,
+    BaseBackend,
+    GPUTarget,
+    register_descriptor,
+)
 
 
 def min_dot_size(target: GPUTarget):
@@ -37,35 +61,206 @@ class AIPUOptions:
     reg_inc_consumer: int = -1
 
     allowed_dot_input_precisions: Tuple[str] = ("ieee", )
+    debug: bool = False
+    sanitize_overflow: bool = True
+    llvm_version: int = 15
+    kernel_name: str = "triton_"
+
+    cluster_dims: tuple = (1, 1, 1)
+    num_warps: int = -1
+    num_ctas: int = -1
+    num_stages: int = 2
+    num_buffers_warp_spec: int = 0
+    num_consumer_groups: int = 0
+    reg_dec_producer: int = 0
+    reg_inc_consumer: int = 0
+
+    enable_warp_specialization: bool = False
+    enable_nd2nz_on_vector: bool = False
+    enable_persistent: bool = False
+    optimize_epilogue: bool = False
+    enable_fp_fusion: bool = True
+    allow_fp8e4nv: bool = False
+    allowed_dot_input_precisions: Tuple[str] = ("ieee", "hf32")
+    max_num_imprecise_acc_default: bool = None
+    extern_libs: dict = None
+
+    multibuffer: bool = None
+
+    enable_hivm_auto_cv_balance: bool = None
+    unit_flag: bool = None
+    inject_barrier_all: bool = None
+    limit_auto_multi_buffer_only_for_local_buffer: bool = None
+    limit_auto_multi_buffer_of_local_buffer: str = None
+    set_workspace_multibuffer: int = None
+    tile_mix_vector_loop: int = None
+    tile_mix_cube_loop: int = None
+
+    stream: int = None
 
     def hash(self):
         hash_dict = dict(self.__dict__)
         key = "_".join([f"{name}-{val}" for name, val in sorted(hash_dict.items())])
         return hashlib.sha256(key.encode("utf-8")).hexdigest()
 
 
-class AIPUBackend(BaseBackend):
+class AscendAttrsDescriptor(AttrsDescriptor):
+
+    # For now we collect tensor shapes at runtime, so this backend adds no extra
+    # compile-time properties; the hook is kept (with an empty body) for future use.
+    def _add_backend_properties(self, params=None, values=None):
+        pass
+
+
+def __get_metadata_attr_by_callback(lib, postfix: str, metadata, meta_key: str):
+    func_symbol = metadata["kernel_name"] + postfix
+    if hasattr(lib, func_symbol):
+        callback_func = getattr(lib, func_symbol)
+        callback_func.restype = ctypes.c_int64
+        callback_func.argtypes = []
+        metadata[meta_key] = callback_func()
+
+
+def _parse_linalg_metadata(linalg: str, metadata: dict):
+    MIX_MODE_REGEX = r'mix_mode\s*=\s*"([^"]+)"'
+    KERNEL_NAME_REGEX = r"func\.func\s+@(\w+)"
+    TENSOR_KIND_REGEX = r'%arg(\d+):[^,)]*?\{[^}]*?tt\.tensor_kind\s*=\s*([^:\s}]+)\s*:[^}]*?\}'
+    REMOVE_MIX_MODE_REGEX = r', mix_mode\s*=\s*"[^"]*"'
+
+    metadata["shared"] = 1
+    metadata["mix_mode"] = re.search(MIX_MODE_REGEX, linalg).group(1)
+    metadata["kernel_name"] = re.search(KERNEL_NAME_REGEX, linalg).group(1)
+    metadata["name"] = metadata["kernel_name"] + " " + metadata["mix_mode"]
+    metadata["tensor_kinds"] = [int(kind) for _, kind in re.findall(TENSOR_KIND_REGEX, linalg)]
+    linalg = re.sub(REMOVE_MIX_MODE_REGEX, "", linalg)
+    return linalg, metadata
+
+
+def linalg_to_bin_enable_npu_compile(linalg: str, metadata, opt):
+
+    linalg, metadata = _parse_linalg_metadata(linalg, metadata)
+    with tempfile.TemporaryDirectory() as tmpdir:
+        ttadapter_path = os.path.join(tmpdir, "kernel.ttadapter.mlir")
+        Path(ttadapter_path).write_text(linalg)
+        bin_file = os.path.join(tmpdir, "kernel")
+        if _check_bishengir_api_change():
+            bin_file_with_ext = "kernel.o"
+        else:
+            bin_file_with_ext = "kernel_reloc.o"
+        if _check_bishengir_is_regbased():
+            bishengir_hivm_opt = "--reg-based=true"
+        else:
+            bishengir_hivm_opt = "--enable-hivm-compile=true"
+        bin_path = os.path.join(tmpdir, bin_file_with_ext)
+        callback_path = os.path.join(tmpdir, "libkernel.so")
+        _compile_option_list = []
+        if _enable_unpublished_feature():
+            _compile_option_list += [
+                f"--target={NPUUtils().get_arch()}",
+            ]
+        multibuffer = opt.multibuffer
+        if multibuffer is not None:
+            _compile_option_list += [
+                f"--enable-auto-multi-buffer={multibuffer}",
+            ]
+        if _is_ascend_sanitizer_enabled():
+            _compile_option_list += ["--enable-sanitizer=true"]
+        if not _is_debug_line_info_disabled():
+            _compile_option_list += ["--enable-debug-info=true"]
+
+        enable_hivm_auto_cv_balance = opt.enable_hivm_auto_cv_balance
+        if enable_hivm_auto_cv_balance is not None:
+            _compile_option_list += \
+                [f"--enable-hivm-auto-cv-balance={enable_hivm_auto_cv_balance}"]
+
+        unit_flag = opt.unit_flag
+        if unit_flag is not None:
+            _compile_option_list += \
+                [f"--enable-hivm-unit-flag-sync={unit_flag}"]
+
+        inject_barrier_all = opt.inject_barrier_all
+        if inject_barrier_all is not None:
+            _compile_option_list += \
+                [f"--enable-hivm-inject-barrier-all-sync={inject_barrier_all}"]
+
+        limit_auto_multi_buffer_only_for_local_buffer = opt.limit_auto_multi_buffer_only_for_local_buffer
+        if limit_auto_multi_buffer_only_for_local_buffer is not None:
+            _compile_option_list += \
+                [f"--limit-auto-multi-buffer-only-for-local-buffer={limit_auto_multi_buffer_only_for_local_buffer}"]
+
+        set_workspace_multibuffer = opt.set_workspace_multibuffer
+        if set_workspace_multibuffer is not None:
+            _compile_option_list += \
+                [f"--set-workspace-multibuffer={set_workspace_multibuffer}"]
+
+        tile_mix_vector_loop = opt.tile_mix_vector_loop
+        if tile_mix_vector_loop is not None:
+            _compile_option_list += \
+                [f"--tile-mix-vector-loop={tile_mix_vector_loop}"]
+
+        tile_mix_cube_loop = opt.tile_mix_cube_loop
+        if tile_mix_cube_loop is not None:
+            _compile_option_list += \
+                [f"--tile-mix-cube-loop={tile_mix_cube_loop}"]
+
+        auto_multi_buffer = opt.limit_auto_multi_buffer_of_local_buffer
+        if auto_multi_buffer is not None:
+            _compile_option_list += \
+                [f"--limit-auto-multi-buffer-of-local-buffer={auto_multi_buffer}"]
+
+        if _is_auto_map_parallel_blocks_enabled():
+            _compile_option_list += ["--enable-auto-blockify-loop"]
+        npu_compiler_path = _get_npucompiler_path()
+        if npu_compiler_path.endswith("bishengir-compile"):
+            _compile_option_list += [
+                "--enable-hfusion-compile=true",
+                bishengir_hivm_opt,
+                "--enable-triton-kernel-compile=true",
+            ]
+        cmd_list = ([npu_compiler_path, ttadapter_path] + _compile_option_list + ["-o", bin_file])
+        ret = subprocess.run(cmd_list, capture_output=True, check=True)
+        if Path(callback_path).is_file():
+            lib = ctypes.CDLL(callback_path)
+            __get_metadata_attr_by_callback(lib, "_infer_workspace_shape_function", metadata, "workspace_size")
+            __get_metadata_attr_by_callback(lib, "_infer_sync_block_lock_num_function", metadata, "lock_num")
+            __get_metadata_attr_by_callback(lib, "_infer_sync_block_lock_init_function", metadata, "lock_init_val")
+
+        return Path(bin_path).read_bytes()
+
+
+class AscendBackend(BaseBackend):
 
     @staticmethod
     def supports_target(target: GPUTarget):
-        return target.backend == 'aipu'
+        return target.backend == 'npu'
 
     def __init__(self, target: GPUTarget) -> None:
         super().__init__(target)
         self.capability = target.arch
-        self.binary_ext = "bin"
-        aipu_interface.passes.register_all_passes()
+        self.binary_ext = "npubin"
+        #aipu_interface.passes.register_all_passes()
 
     def parse_options(self, opts) -> Any:
-        return AIPUOptions()
+        return AIPUOptions(**{k: opts[k] for k in AIPUOptions.__dataclass_fields__.keys() if k in opts})
 
     def pack_metadata(self, metadata):
-        return (
-            metadata.num_tecs,
-            metadata.num_cores,
-            metadata.cluster_dims[0],
-            metadata.cluster_dims[1],
-            metadata.cluster_dims[2],
-        )
+        # Collect the metadata needed to launch kernels.
+        # TORCHINDUCTOR_UNIQUE_KERNEL_NAMES=1 makes TorchInductor generate unique
+        # kernel names; that name is passed as kernel_name to the CANN runtime.
+        # kernel_name is specific to the Ascend backend and should not be public.
+        # The CANN runtime limits kernel names to at most 50 characters, and a
+        # trailing '\n' is appended, so the effective limit is 49.
+        KERNEL_NAME_MAX_LEN = 49
+        kernel_name_orig, mix_mode = metadata.name.split()
+        if len(kernel_name_orig) > KERNEL_NAME_MAX_LEN:
+            kernel_name = kernel_name_orig[-KERNEL_NAME_MAX_LEN:]
+        else:
+            kernel_name = kernel_name_orig
+        return {
+            "kernel_name": kernel_name,
+            "hash": metadata.hash,
+            "debug": metadata.debug,
+            "tensor_kinds": metadata.tensor_kinds,
+        }
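To make the truncation rule above concrete, a hedged example (the Inductor-style name is made up); the tail is kept because the uniqueness suffix sits at the end of the name:

# Illustrative only: a 62-character Inductor-style kernel name.
name = "triton_poi_fused_add_mul_sigmoid_backward_threshold_backward_0"
kernel_name = name[-49:] if len(name) > 49 else name
assert len(kernel_name) == 49 and kernel_name.endswith("_0")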
 
     def get_codegen_implementation(self, options):
         codegen_fns = {"min_dot_size": min_dot_size(self.target)}
@@ -78,6 +273,9 @@ def get_module_map(self) -> Dict[str, ModuleType]:
     def load_dialects(self, ctx):
         aipu.load_dialects(ctx)
 
+    def get_arg_specialization(*args, **kwargs):
+        return None
+
     @staticmethod
     def make_ttir(mod, metadata, opt):
         pm = ir.pass_manager(mod.context)
@@ -98,62 +296,32 @@ def make_ttir(mod, metadata, opt):
     def make_linalg(mod, metadata, opt):
         pm = ir.pass_manager(mod.context)
         pm.enable_debug()
-        # Add pass here.
         aipu.passes.convert.add_triton_to_linalg_pipeline(pm)
         pm.run(mod)
         return mod
 
     @staticmethod
-    def make_aipubin(mod, metadata, opt):
-        ctx = Context()
-        ctx.allow_unregistered_dialects = True
-        aipu_interface.dialects.register_all_dialects(ctx._CAPIPtr)
-        pm = PassManager("builtin.module", ctx)
-        mod = Module.parse(aipu.common.generic_print(mod), ctx)
-
-        # Add pass here.
-        transform.linalg_transform(mod, ctx)
-        transform.tensor_transform(mod, ctx)
-
-        pm.add("func.func(linalg-fuse-elementwise-ops)")
-        pm.add("scf-loop-bufferization-preprocessing")
-        pm.add("one-shot-bufferize")
-        pm.add("func.func(convert-bool-arg-to-i8)")
-        pm.add("func.func(convert-linalg-to-affine-loops)")
-        pm.add("func.func(affine-loop-normalize{promote-single-iter=1})")
-        pm.add("func.func(affine-loop-fusion{mode=sibling})")
-        pm.add("func.func(flatten-memref)")
-        pm.add("func.func(canonicalize)")
-        pm.run(mod.operation)
-
-        pm = PassManager("builtin.module", ctx)
-        transform.convert_memref_i1_i8(mod, ctx)
-        transform.remove_empty_linalg_generic(mod, ctx)
-        # vectorize
-        vfactor = analysis.determine_vectorization_factor(mod, metadata["vector_register_bits"])
-        if vfactor > 1:
-            pm.add(f"func.func(affine-super-vectorize{{virtual-vector-size={vfactor}}})")
-        pm.add("func.func(lower-affine)")
-
-        # Optimize pass.
-        pm.add("func.func(canonicalize)")
-        pm.add("func.func(cse)")
-        pm.add("func.func(reconcile-unrealized-casts)")
-        pm.run(mod.operation)
-
-        # Post aipu pass.
-        transform.binding_tid(mod, ctx)
-        transform.canonical_const_dtype(mod, ctx)
-
-        ex = codegenAIPU(mod)
-        metadata["name"] = ex._func_name
-        metadata["shared"] = 1
-        return pickle.dumps(ex)
+    def make_npubin(mod, metadata, opt):
+
+        linalg_str = str(mod)
+        metadata.update({
+            "enable_nd2nz_on_vector": opt.enable_nd2nz_on_vector,
+            "multibuffer": opt.multibuffer,
+            "enable_hivm_auto_cv_balance": opt.enable_hivm_auto_cv_balance,
+            "unit_flag": opt.unit_flag,
+            "inject_barrier_all": opt.inject_barrier_all,
+            "limit_auto_multi_buffer_only_for_local_buffer": opt.limit_auto_multi_buffer_only_for_local_buffer,
+            "limit_auto_multi_buffer_of_local_buffer": opt.limit_auto_multi_buffer_of_local_buffer,
+            "set_workspace_multibuffer": opt.set_workspace_multibuffer,
+            "tile_mix_vector_loop": opt.tile_mix_vector_loop,
+            "tile_mix_cube_loop": opt.tile_mix_cube_loop,
+        })
+        return linalg_to_bin_enable_npu_compile(linalg_str, metadata, opt)
 
     def add_stages(self, stages, options):
         stages["ttir"] = lambda src, metadata: self.make_ttir(src, metadata, options)
         stages["linalg"] = lambda src, metadata: self.make_linalg(src, metadata, options)
-        stages["bin"] = lambda src, metadata: self.make_aipubin(src, metadata, options)
+        stages["npubin"] = (lambda src, metadata: linalg_to_bin_enable_npu_compile(src, metadata, options))
 
     @functools.lru_cache()
     def hash(self):
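The three stages registered by `add_stages` run in order, each feeding the next. A hedged paraphrase of the flow (`backend`, `options`, and `src` are placeholders; the real driver loop lives in Triton's `compile`, not in this patch):

# Paraphrased flow, for orientation only.
stages = {}
backend.add_stages(stages, options)        # {"ttir": ..., "linalg": ..., "npubin": ...}
module, metadata = src, {}
for name in ("ttir", "linalg", "npubin"):  # Triton IR -> linalg -> NPU binary
    module = stages[name](module, metadata)
# "npubin" returns raw kernel bytes; metadata now carries kernel_name,
# mix_mode, tensor_kinds, ... parsed from the linalg module.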
diff --git a/third_party/aipu/backend/driver.py b/third_party/aipu/backend/driver.py
index df4cce6b4..33d91a6c3 100644
--- a/third_party/aipu/backend/driver.py
+++ b/third_party/aipu/backend/driver.py
@@ -1,192 +1,731 @@
-import os
-import pickle
-import torch
-import uuid
-import numpy as np
 from pathlib import Path
-from itertools import chain
-from triton.backends.compiler import GPUTarget
+import tempfile
+import os
+import os.path
+import re
+import subprocess
+import sysconfig
+from typing import Optional
+import functools
+import hashlib
+from triton.runtime.cache import get_cache_manager, get_dump_manager
 from triton.backends.driver import DriverBase
-
-# ------------------------
-# Utils
-# ------------------------
-
-
-def load_binary(name, kernel, shared, device):
-    return None, kernel, 1, 0
+from triton.backends.compiler import GPUTarget
+from triton.backends.aipu.utils import (
+    _build_npu_ext,
+    _check_cxx11_abi,
+    convert_sigtype_to_int,
+    _is_auto_map_parallel_blocks_enabled,
+)
 
 
-class AIPUUtils(object):
+class NPUUtils(object):
 
     def __new__(cls):
-        if not hasattr(cls, "instance"):
-            cls.instance = super(AIPUUtils, cls).__new__(cls)
+        if not hasattr(cls, 'instance'):
+            cls.instance = super(NPUUtils, cls).__new__(cls)
         return cls.instance
 
     def __init__(self):
-        self.load_binary = load_binary
-        properties_dict = {"max_shared_mem": 256 * 1024, "multiprocessor_count": 4, "max_num_regs": 32, "warpSize": 4}
-        self.get_device_properties = lambda device: properties_dict
-
-
-# ------------------------
-# Launcher
-# ------------------------
-
-
-def _reset_output_path(ex):
-    output_dir = f"{os.getcwd()}/compass_dsl_{ex._func_name}_restore_{uuid.uuid4().hex}"
-    ex._output_dir = output_dir
-    ex._gbuilder_dir = f"{ex._output_dir}/gbuilder"
-    ex._op_lib_path = f"{ex._gbuilder_dir}/op_lib/{ex._func_name}.o"
-
-
-def _get_cpu_origin_tensor(tensor):
-    origin_tensor = tensor
-    while (base := origin_tensor._base) is not None:
-        origin_tensor = base
-
-    return origin_tensor.cpu().contiguous()
-
-
-def _get_np_array_from_strided_buffer(tensor, sb):
-    dtype = str(sb.dtype).split(".")[-1]
-    itemsize = sb.element_size()
-    offset = (sb.data_ptr() - sb._base.data_ptr())
-    shape = sb.size()
-    stride = [x * itemsize for x in sb.stride()]
-
-    return np.ndarray(
-        shape,
-        dtype,
-        tensor.numpy(),
-        offset,
-        stride,
-    )
-
-
-class AIPULauncher(object):
+        dirname = os.path.dirname(os.path.realpath(__file__))
+        src = Path(os.path.join(dirname, "npu_utils.cpp")).read_text()
+        key = hashlib.sha256(src.encode("utf-8")).hexdigest()
+        cache = get_cache_manager(key)
+        fname = "npu_utils.so"
+        cache_path = cache.get_file(fname)
+        if cache_path is None:
+            with tempfile.TemporaryDirectory() as tmpdir:
+                src_path = os.path.join(tmpdir, "npu_utils.cpp")
+                with open(src_path, "w") as f:
+                    f.write(src)
+                so = _build_npu_ext("npu_utils", src_path, tmpdir)
+                with open(so, "rb") as f:
+                    cache_path = cache.put(f.read(), fname, binary=True)
+        import importlib.util
+        spec = importlib.util.spec_from_file_location("npu_utils", cache_path)
+        mod = importlib.util.module_from_spec(spec)
+        spec.loader.exec_module(mod)
+        self.npu_utils_mod = mod
+
+    def load_binary(self, name, kernel, shared, device):
+        fnname, mix_mode = name.split()
+        return self.npu_utils_mod.load_kernel_binary(fnname, kernel, shared, device, mix_mode)
+
+    @functools.lru_cache()
+    def get_device_properties(self, device):
+        # "max_shared_mem" is temporarily included only so that the Triton
+        # compiler does not complain; available memory is fetched at runtime.
+        num_aic = self.get_aicore_num()
+        num_aiv = num_aic * 2
+        return {"max_shared_mem": 1, "num_aicore": num_aic, "num_vectorcore": num_aiv}
+
+    @functools.lru_cache()
+    def get_arch(self):
+        return self.npu_utils_mod.get_arch()
+
+    @functools.lru_cache()
+    def get_aicore_num(self):
+        return self.npu_utils_mod.get_aicore_num()
+
+    @functools.lru_cache()
+    def get_aivector_core_num(self):
+        return self.get_device_properties("npu")["num_vectorcore"]
+
+
+class NPULauncher(object):
 
     def __init__(self, src, metadata):
-        self.constants = src.constants
-
-    def lanch_kernel(self, ex, np_args, tail_args, totoal_pid_size):
-        convert_map = {}
-        real_args = []
-        for i, arg in enumerate(np_args):
-            if isinstance(arg, np.ndarray) and arg.dtype == "bool":
-                real_args.append(arg.astype(np.int8))
-                convert_map[i] = np.bool_
-            elif isinstance(arg, np.ndarray) and arg.dtype == "int64":
-                real_args.append(arg.astype(np.int32))
-                convert_map[i] = np.int64
-            else:
-                real_args.append(arg)
-
-        tec_num = 4
-        for i in range((totoal_pid_size + tec_num - 1) // tec_num):
-            tail_args[3] = i
-            ex(*(real_args + tail_args))
-
-        for i, arg in enumerate(real_args):
-            if i in convert_map.keys():
-                np.copyto(np_args[i], arg.astype(convert_map[i]))
-
-    # TODO(aipu-teams): This is just a temporary solution for now, because the real driver interface is not ready yet.
-    # These code will be refactor later.
-    def __call__(self, gridx_size, gridy_size, gridz_size, stream, function, *args):
-        try:
-            from flag_gems.utils.tensor_wrapper import StridedBuffer
-        except ImportError:
-            StridedBuffer = torch.Tensor
-
-        ex = pickle.loads(function)
-        _reset_output_path(ex)
-        np_args = []
-        sb_maps = {}
-        args = [arg for i, arg in enumerate(args[4:]) if i not in chain(*self.constants.keys())]
-
-        for i, arg in enumerate(args):
-            if isinstance(arg, torch.Tensor):
-                np_args.append(_get_cpu_origin_tensor(arg).numpy())
-            elif isinstance(arg, StridedBuffer):
-                tensor = _get_cpu_origin_tensor(arg)
-                np_args.append(_get_np_array_from_strided_buffer(tensor, arg))
-                sb_maps[i] = tensor
-            else:
-                np_args.append(arg)
-
-        tail_args = [gridx_size, gridy_size, gridz_size, 0, 0, 0]
-        total_pid_size = gridx_size * gridy_size * gridz_size
-        self.lanch_kernel(ex, np_args, tail_args, total_pid_size)
-
-        for i, param_info in enumerate(ex._cur_param_infos):
-            if param_info.is_output_tensor:
-                if isinstance(args[i], torch.Tensor):
-                    args[i].copy_(torch.from_numpy(np_args[i]))
-                else:
-                    args[i]._base.copy_(sb_maps[i])
-
-
-class AIPUDriver(DriverBase):
+        debug_mode = metadata.debug
+        workspace_size = int(metadata.workspace_size) \
+            if hasattr(metadata, 'workspace_size') else -1
+        lock_init_value = int(metadata.lock_init_value) \
+            if hasattr(metadata, 'lock_init_value') else 0
+        lock_num = int(metadata.lock_num) \
+            if hasattr(metadata, 'lock_num') else -1
+        constants = src.constants if hasattr(src, "constants") else dict()
+        cst_key = lambda i: src.fn.arg_names.index(i) if isinstance(i, str) else i
+        constants = {cst_key(key): value for key, value in constants.items()}
+        signature = {cst_key(key): value for key, value in src.signature.items()}
+        mix_mode = metadata.mix_mode
+        wrapper_src = generate_npu_wrapper_src(constants, signature, \
+                                               workspace_size, mix_mode, \
+                                               lock_num, lock_init_value)
+        so_launcher_path = make_npu_launcher_stub(wrapper_src, debug_mode)
+        # initialize launcher
+        import importlib.util
+        spec = importlib.util.spec_from_file_location("__triton_launcher", so_launcher_path)
+        mod = importlib.util.module_from_spec(spec)
+        spec.loader.exec_module(mod)
+        self.launch = getattr(mod, "launch")
+
+    def __call__(self, *args, **kwargs):
+        self.launch(*args, **kwargs)
+
+
+class NPUDriver(DriverBase):
 
     def __init__(self):
-        self.utils = AIPUUtils()  # TODO: make static
-        self.launcher_cls = AIPULauncher
+        self.utils = NPUUtils()
+        self.launcher_cls = NPULauncher
+        super().__init__()
 
-        import torch
-        self.get_current_stream = lambda x: x
-        self.get_current_device = torch.aipu.current_device
+    @classmethod
+    def is_active(cls):
 
-        super().__init__()
+        def test_npucompiler():
+            from triton.backends.aipu.utils import _get_bisheng_path
+            npucompiler = _get_bisheng_path()
+            targets = subprocess.check_output([npucompiler, "-print-targets"]).decode().strip().split()
+            return "hiipu64" in targets
+
+        try:
+            return test_npucompiler()
+        except Exception as e_npucompiler:
+            import warnings
+            red = "\x1b[31;20m"
+            reset = "\x1b[0m"
+            warnings.warn(red + str(e_npucompiler) + reset)
+            return False
 
     def get_current_target(self):
-        warp_size = 4
-        return GPUTarget("aipu", "x2", warp_size)
+        backend = "npu"
+        arch = self.utils.get_arch()
+        warp_size = 0
+        return GPUTarget(backend, arch, warp_size)
 
     def get_active_torch_device(self):
         import torch
-        return torch.device("aipu", 0)
+        import torch_npu
+        return torch.npu.current_device()
 
-    def get_device_interface(self):
+    def get_current_device(self):
+        """
+        Get the current device
+        """
         import torch
-        return torch.aipu
+        import torch_npu
+        return torch.npu.current_device()
 
-    @staticmethod
-    def is_active():
+    def set_current_device(self, device):
+        """
+        Set the current device to the given device
+        """
         import torch
-        from torch.utils import cpp_extension
+        import torch_npu
+        return torch.npu.set_device(device)
+
+    def get_current_stream(self, device: Optional[int] = None) -> int:
+        """
+        Get the stream for the current device
+        """
+        # According to torch_npu, the content of a torch.npu.Stream is essentially an rtStream_t
+        # TODO: use the CANN API instead of torch_npu
+        import torch
+        import torch_npu
+        if device is None:
+            device = self.get_current_device()
+        return torch.npu.current_stream(device).npu_stream
 
-        try:
-            torch.aipu.is_available()
-        except AttributeError:
-            current_dir = Path(__file__).resolve().parent
-            extra_ldflags = [f"-L{x.strip()}" for x in os.getenv("LD_LIBRARY_PATH", "").split(":") if x.strip() != ""]
-            extra_ldflags.append("-laipudrv")
-            module = cpp_extension.load(
-                name="aipu", sources=[current_dir / "aipu_torch_dev.cpp"],
-                extra_include_paths=[os.getenv("ZHOUYI_LINUX_DRIVER_HOME") + "/driver/umd/include"],
-                extra_ldflags=extra_ldflags, verbose=True)
-
-            torch.utils.rename_privateuse1_backend("aipu")
-            torch._register_device_module("aipu", module)
-            torch.utils.generate_methods_for_privateuse1_backend(for_storage=True)
-        return torch.aipu.is_available()
-
-    # TODO(aipu-teams): Support bechmarker later.
     def get_benchmarker(self):
-
-        def do_bench(fn, warmup=25, rep=100, grad_to_none=None, quantiles=None, return_mode="mean"):
-            return [float("inf"), float("inf"), float("inf")]
-
+        from triton.testing import do_bench
         return do_bench
 
-    def get_empty_cache_for_benchmark(self):
+    def get_device_interface(self):
         import torch
+        return torch.npu
 
-        # We maintain a buffer of 256 MB that we clear
-        # before each kernel call to make sure that the L2 cache
-        # doesn't contain any input data before the run
-        cache_size = 256 * 1024 * 1024
-        return torch.empty(int(cache_size // 4), dtype=torch.int, device='aipu')
+    def get_empty_cache_for_benchmark(self):
+        import torch
+        cache_size = 192 * 1024 * 1024
+        return torch.empty(cache_size // 4, dtype=torch.int, device='npu')
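The stub builder that follows compiles one launcher per generated wrapper source and caches it; the file name embeds the torch C++ ABI flag so a launcher built against one libstdc++ ABI is never linked into the other. A hedged sketch of the naming scheme (values illustrative):

import sysconfig
use_cxx11_abi = 1                                # from torch._C._GLIBCXX_USE_CXX11_ABI
suffix = sysconfig.get_config_var('EXT_SUFFIX')  # e.g. '.cpython-310-aarch64-linux-gnu.so'
so_name = f"launcher_cxx11abi{use_cxx11_abi}{suffix}"
# stored in a cache directory keyed by sha256 of the generated C++ source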
+
+
+def make_npu_launcher_stub(src, debug=False):
+    """
+    Generate the launcher stub used to launch the kernel
+    """
+    # try to get the cached file
+    so_cache_key = hashlib.sha256(src.encode("utf-8")).hexdigest()
+    so_cache_manager = get_cache_manager(so_cache_key)
+    # Append the cxx11_abi value to the launcher name to avoid
+    # linking to a launcher with the wrong cxx11_abi.
+    use_cxx11_abi = _check_cxx11_abi()
+    name = f"launcher_cxx11abi{use_cxx11_abi}"
+    suffix = sysconfig.get_config_var('EXT_SUFFIX')
+    so_name = f"{name}{suffix}"
+
+    if debug:
+        dump_manager = get_dump_manager(so_cache_key)
+        print(f"Dumping {name}.cxx to {dump_manager.cache_dir}")
+        dump_manager.put(src, f"{name}.cxx", binary=False)
+
+    cache_path = so_cache_manager.get_file(so_name)
+    if cache_path is not None:
+        return cache_path
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        if debug:
+            so_cache_manager.put(src, f"{name}.cxx", binary=False)
+        src_path = os.path.join(tmpdir, f"{name}.cxx")
+        with open(src_path, "w") as f:
+            f.write(src)
+        enable_taskqueue = os.getenv("TRITON_ENABLE_TASKQUEUE", 'true').lower() in ('true', '1')
+        if enable_taskqueue:
+            kernel_launcher_type = "torch"
+        else:
+            kernel_launcher_type = None
+        so = _build_npu_ext(name, src_path, tmpdir, kernel_launcher=kernel_launcher_type)
+        if debug:
+            with open(so, "rb") as f:
+                return dump_manager.put(f.read(), so_name, binary=True)
+        with open(so, "rb") as f:
+            return so_cache_manager.put(f.read(), so_name, binary=True)
+
+
+def extract_device_print_code_from_cann():
+    from triton.backends.aipu.utils import _get_bisheng_path
+    ccec_compiler_bin_folder, _ = os.path.split(os.path.realpath(_get_bisheng_path()))
+    ccec_compiler_folder, _ = os.path.split(ccec_compiler_bin_folder)
+    clang_version = os.listdir(os.path.join(ccec_compiler_folder, "lib/clang/"))[0]
+    ccelib_path = os.path.join(ccec_compiler_folder, f"lib/clang/{clang_version}/include/ccelib")
+
+    def read_header(header_path):
+        with open(os.path.join(ccelib_path, header_path), 'r') as f:
+            code = f.read()
+
+        # remove all #include "..."
+        lines = code.splitlines()
+        purged_lines = []
+        for line in lines:
+            normalized_line = ' '.join(line.split())
+            if not normalized_line.startswith('#include "'):
+                purged_lines.append(line)
+        code = '\n'.join(purged_lines)
+
+        # remove [aicore] functions
+        aicore_positions = []
+        for m in re.finditer(r'\[aicore\]', code):
+            aicore_positions.append(m.start())
+
+        def find_aicore_function_span(src, pos):
+            for i in range(pos - 1, -1, -1):
+                if src[i] == '}':  # this relies on all [aicore] functions coming after the normal functions
+                    left = i + 1
+                    break
+            n = len(src)
+            brace_nest = 0
+            for j in range(pos, n, 1):
+                if src[j] == '{':
+                    brace_nest += 1
+                elif src[j] == '}':
+                    brace_nest -= 1
+                    if brace_nest == 0:
+                        right = j
+                        break
+            return left, right
+
+        new_code = ''
+        segment_start = 0
+        for pos in aicore_positions:
+            left, right = find_aicore_function_span(code, pos)
+            new_code += code[segment_start:left]
+            segment_start = right + 1
+        new_code += code[segment_start:]
+
+        # remove __gm__ and rename macros
+        new_code = new_code.replace('__gm__', ' ')
+        new_code = new_code.replace('__CCELIB_RT_ERROR_NONE', 'RT_ERROR_NONE')
+        new_code = new_code.replace('__CCELIB_RT_MEMORY_HBM', 'RT_MEMORY_HBM')
+        new_code = new_code.replace('__CCELIB_RT_MEMCPY_HOST_TO_DEVICE', 'RT_MEMCPY_HOST_TO_DEVICE')
+        new_code = new_code.replace('__CCELIB_RT_MEMCPY_DEVICE_TO_HOST', 'RT_MEMCPY_DEVICE_TO_HOST')
+        return new_code
+
+    # the following headers must be included in this order
+    return '\n'.join([
+        read_header('common/common_impl.h'),
+        read_header('internal/debug_tunnel/payload.h'),
+        read_header('internal/debug_tunnel/payload_impl.h'),
+        read_header('internal/debug_tunnel/tunnel.h'),
+        read_header('internal/debug_tunnel/tunnel_impl.h')
+    ])
+
+
+# The template below comes from triton-adapter HEAD. It wraps the generated
+# kernel binary into a Python extension module.
+def generate_npu_wrapper_src(constants, signature, workspace_size, mix_mode, lock_num, lock_ini_val):
+    import os
+
+    def _ty_to_cpp(ty):
+        if ty[0] == '*':
+            return "void*"
+        return {
+            "i1": "int32_t",
+            "i8": "int8_t",
+            "i16": "int16_t",
+            "i32": "int32_t",
+            "i64": "int64_t",
+            "u32": "uint32_t",
+            "u64": "uint64_t",
+            "fp16": "float",
+            "bf16": "float",
+            "fp32": "float",
+            "f32": "float",
+            "fp64": "double",
+        }[ty]
+
+    def _extracted_ty(ty):
+        if ty[0] == '*':
+            return "PyObject*"
+        return {
+            'i1': 'int32_t',
+            'i32': 'int32_t',
+            'i64': 'int64_t',
+            'u32': 'uint32_t',
+            'u64': 'uint64_t',
+            'fp16': 'float',
+            'bf16': 'float',
+            'fp32': 'float',
+            'f32': 'float',
+            'fp64': 'double',
+        }[ty]
+
+    def _format_of(ty):
+        return {
+            "PyObject*": "O",
+            "float": "f",
+            "double": "d",
+            "long": "l",
+            "uint32_t": "I",
+            "int32_t": "i",
+            "uint64_t": "K",
+            "int64_t": "L",
+        }[ty]
+
+    arg_decls = ', '.join(f"{_ty_to_cpp(ty)} arg{i}" for i, ty in signature.items())
+    """
+    args:
+        int gridX, gridY, gridZ;
+        rtStream_t stream;
+        const void *function;
+        PyObject* packed_metadata, *launch_metadata;
+        PyObject* launch_enter_hook, *launch_exit_hook;
+        *args_expand
+    """
+    format = "iiiKKOOOO" + ''.join([_format_of(_extracted_ty(ty)) for ty in signature.values()])
+
+    grid_info = {'X': 'i32', 'Y': 'i32', 'Z': 'i32'}
+
+    enable_device_print = os.getenv("TRITON_DEVICE_PRINT", 'false').lower() in ('true', '1')
+    enable_taskqueue = os.getenv("TRITON_ENABLE_TASKQUEUE", 'true').lower() in ('true', '1')
+    enable_auto_map_parallel_blocks = _is_auto_map_parallel_blocks_enabled()
+    npu_utils = NPUUtils()
+    num_physical_blocks = npu_utils.get_aivector_core_num() if mix_mode == "aiv" else npu_utils.get_aicore_num()
+    task_type = "MSPROF_GE_TASK_TYPE_AIV" if mix_mode == "aiv" else "MSPROF_GE_TASK_TYPE_AI_CORE"
+    LINE_CHANGE_CHAR = chr(10)  # i.e. '\n'
+
+    cpp_device_pointer = """
+typedef struct _DevicePtrInfo {
+  void *dev_ptr;
+  bool valid;
+} DevicePtrInfo;
+
+static inline DevicePtrInfo getPointer(PyObject *obj, int idx) {
+  DevicePtrInfo ptr_info;
+  ptr_info.dev_ptr = 0;
+  ptr_info.valid = true;
+  if (PyLong_Check(obj)) {
+    ptr_info.dev_ptr = reinterpret_cast<void *>(PyLong_AsUnsignedLongLong(obj));
+    return ptr_info;
+  }
+  if (obj == Py_None) {
+    // valid nullptr
+    return ptr_info;
+  }
+  PyObject *ptr = PyObject_GetAttrString(obj, "data_ptr");
+  if (ptr) {
+    PyObject *empty_tuple = PyTuple_New(0);
+    PyObject *ret = PyObject_Call(ptr, empty_tuple, NULL);
+    Py_DECREF(empty_tuple);
+    Py_DECREF(ptr);
+    if (!PyLong_Check(ret)) {
+      PyErr_SetString(PyExc_TypeError, "data_ptr method of Pointer object must return 64-bit int");
+      ptr_info.valid = false;
+      return ptr_info;
+    }
+    ptr_info.dev_ptr = reinterpret_cast<void *>(PyLong_AsUnsignedLongLong(ret));
+    if (!ptr_info.dev_ptr)
+      return ptr_info;
+    Py_DECREF(ret);
+    return ptr_info;
+  }
+  PyErr_SetString(PyExc_TypeError, "Pointer argument must be either uint64 or have data_ptr method");
+  return ptr_info;
+}
+"""
+
+    cpp_msprof_extern = """
+extern "C" {
+  typedef int (* callback)(unsigned int type, void* data, unsigned int len);
+  extern int MsprofReportApi(unsigned int agingFlag, const MsprofApi *api);
+  extern unsigned long int MsprofSysCycleTime();
+  extern int MsprofRegisterCallback(unsigned int moduleId, callback handle);
+  static unsigned int __MsprofFlagL0 = 0;
+  static unsigned int __MsprofFlagL1 = 0;
+
+  int ProfCtrlHandle(unsigned int CtrlType, void* CtrlData, unsigned int DataLen) {
+    if ((CtrlData == nullptr) || (DataLen == 0U)) {
+      return 1;
+    }
+
+    if (CtrlType == 1) {
+      MsprofCommandHandle* handle = (MsprofCommandHandle *)(CtrlData);
+      if (handle->type >= 6)  // types >= 6 are not used here
+        return 1;
+      if (handle->type == 1) {  // init - 0, start - 1
+        __MsprofFlagL0 = ((0x00000800ULL & handle->profSwitch) == 0x00000800ULL) ? 1 : 0;
+        __MsprofFlagL1 = ((0x00000002ULL & handle->profSwitch) == 0x00000002ULL) ? 1 : 0;
+      }
+    }
+    return 0;
+  }
+}
+"""
+
+    cpp_msprof_callback = """
+  MsprofRegisterCallback(8, ProfCtrlHandle);  // 8 = CCE, as defined in the msprof header slog.h
+"""
+
+    cpp_msprof_call_before_launch = """
+    unsigned long int beginTime = 0;
+    unsigned long int endTime = 0;
+    unsigned long int opNameHashID = 0;
+    unsigned int threadId = 0;
+    char* _kernelName = const_cast<char *>(name.c_str());
+    size_t length = name.length();
+    if (__MsprofFlagL0 || __MsprofFlagL1)
+    {
+      beginTime = MsprofSysCycleTime();
+    }
+"""
+
+    cpp_msprof_call_after_launch = f"""
+    if (__MsprofFlagL0 || __MsprofFlagL1)
+    {{
+      endTime = MsprofSysCycleTime();
+      opNameHashID = MsprofGetHashId(_kernelName, length);
+      threadId = (unsigned int)(syscall(SYS_gettid));
+      MsprofApi info;
+      info.level = MSPROF_REPORT_NODE_LEVEL;
+      info.magicNumber = 0x5a5a;  // MSPROF_REPORT_DATA_MAGIC_NUM
+      info.type = MSPROF_REPORT_NODE_LAUNCH_TYPE;
+      info.threadId = threadId;
+      info.reserve = 0;
+      info.beginTime = beginTime;
+      info.endTime = endTime;
+      info.itemId = opNameHashID;
+      MsprofReportApi(false, &info);
+    }}
+    if (__MsprofFlagL1)
+    {{
+      MsprofCompactInfo nodeBasicInfo;
+      nodeBasicInfo.level = MSPROF_REPORT_NODE_LEVEL;
+      nodeBasicInfo.magicNumber = 0x5a5a;  // MSPROF_REPORT_DATA_MAGIC_NUM
+      nodeBasicInfo.type = MSPROF_REPORT_NODE_BASIC_INFO_TYPE;
+      nodeBasicInfo.threadId = threadId;
+      nodeBasicInfo.timeStamp = endTime;
+      nodeBasicInfo.data.nodeBasicInfo.opName = opNameHashID;
+      nodeBasicInfo.data.nodeBasicInfo.opType = opNameHashID;
+      nodeBasicInfo.data.nodeBasicInfo.taskType = {task_type};
+      nodeBasicInfo.data.nodeBasicInfo.blockDim = blockNum;
+      MsprofReportCompactInfo(0, static_cast<void *>(&nodeBasicInfo), sizeof(MsprofCompactInfo));
+
+      // Report tensor info
+      int max_tensors_num = tensorShapes.size() < MSPROF_GE_TENSOR_DATA_NUM ? tensorShapes.size() : MSPROF_GE_TENSOR_DATA_NUM;
+      MsprofAdditionalInfo tensorInfo;
+      tensorInfo.level = MSPROF_REPORT_NODE_LEVEL;
+      tensorInfo.type = MSPROF_REPORT_NODE_TENSOR_INFO_TYPE;
+      tensorInfo.threadId = threadId;
+      tensorInfo.timeStamp = endTime;
+      auto profTensorData = reinterpret_cast<MsprofGeProfTensorData *>(tensorInfo.data);
+      profTensorData->opName = opNameHashID;
+      int tensorCount = 0;
+      int dataTypes[MSPROF_GE_TENSOR_DATA_NUM];
+      if (tensorShapes.size() > 0) {{
+        {LINE_CHANGE_CHAR.join(
+            f'dataTypes[{i}] = {convert_sigtype_to_int(ty[1:])};'
+            for i, ty in signature.items()
+            if ty.startswith("*") and i < 5
+        )}
+      }}
+      for (int i = 0; i < tensorShapes.size() && tensorCount < MSPROF_GE_TENSOR_DATA_NUM; i++) {{
+        auto fillTensorData = [&](int index, int tensorType) {{
+          profTensorData->tensorData[index].tensorType = tensorType;
+          profTensorData->tensorData[index].format = 2;  // GeDataFormat: ND = 2
+          profTensorData->tensorData[index].dataType = dataTypes[i];
+          int nDim = tensorShapes[i].size();
+          nDim = nDim < MSPROF_GE_TENSOR_DATA_SHAPE_LEN ? nDim : MSPROF_GE_TENSOR_DATA_SHAPE_LEN;
+          for (int j = 0; j < nDim; j++) {{
+            profTensorData->tensorData[index].shape[j] = tensorShapes[i][j];
+          }}
+          for (int j = nDim; j < MSPROF_GE_TENSOR_DATA_SHAPE_LEN; j++) {{
+            profTensorData->tensorData[index].shape[j] = 0;
+          }}
+        }};
+        int tensorType = (i < tensorKinds.size()) ? tensorKinds[i] : 0;  // default tensor kind is input
+        if (tensorType == TENSOR_KIND_INPUT || tensorType == TENSOR_KIND_INPUT_OUTPUT) {{
+          fillTensorData(tensorCount, MSPROF_GE_TENSOR_TYPE_INPUT);
+          tensorCount++;
+        }}
+        if ((tensorType == TENSOR_KIND_OUTPUT || tensorType == TENSOR_KIND_INPUT_OUTPUT) && tensorCount < MSPROF_GE_TENSOR_DATA_NUM) {{
+          fillTensorData(tensorCount, MSPROF_GE_TENSOR_TYPE_OUTPUT);
+          tensorCount++;
+        }}
+      }}
+      profTensorData->tensorNum = tensorCount;
+      MsprofReportAdditionalInfo(false, static_cast<void *>(&tensorInfo), sizeof(MsprofAdditionalInfo));
+    }}
+"""
+
+    return f"""
+#include <algorithm>
+#include <string>
+#include <vector>
+#include <sys/syscall.h>
+#include <unistd.h>
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+{'#include <torch_npu/csrc/framework/OpCommand.h>' if enable_taskqueue else ''}
+#include "experiment/runtime/runtime/rt.h"
+{extract_device_print_code_from_cann() if enable_device_print else ''}
+
+#define TENSOR_KIND_INPUT 0
+#define TENSOR_KIND_OUTPUT 1
+#define TENSOR_KIND_INPUT_OUTPUT 2
+
+{cpp_msprof_extern}
+
+{cpp_device_pointer}
+
+static void _launch(const char* kernelName, const void* func, rtStream_t stream, int gridX, int gridY, int gridZ, std::vector<std::vector<int64_t>> &tensorShapes, std::vector<int> &tensorKinds, {arg_decls}) {{
+  // only 1D parallelization is supported for NPU
+  // Pointer types become a flattened 1-D memref tuple: base_ptr, data_ptr, offset, shape, stride;
+  // base_ptr, offset, shape and stride are not used, so they are set arbitrarily for now
+  std::string name = "";
+  name.append(kernelName);
+  {'auto launch_call = [=]()' if enable_taskqueue else ''} {{
+    uint32_t blockNum = gridX * gridY * gridZ;
+    {'blockNum = std::min(blockNum, (uint32_t)' + str(num_physical_blocks) + ');' if enable_auto_map_parallel_blocks else ''}
+    {'cce::internal::DebugTunnelData *DTData = cce::internal::DebugTunnel::Open(blockNum);' if enable_device_print else ''}
+    rtError_t ret;
+    void *ffts_addr = NULL;
+    uint32_t ffts_len;
+    ret = rtGetC2cCtrlAddr((uint64_t*)&ffts_addr, &ffts_len);
+    if (ret != RT_ERROR_NONE) {{
+      return {'ret' if enable_taskqueue else ''};
+    }}
+    // stub arguments for the workspace
+    void *syncBlockLock = NULL;
+    void *workspace_addr = NULL;
+    uint16_t ModuleId = 0;
+    {f'''
+    uint64_t syncBlockLockSize = {lock_num} * sizeof(int64_t);
+    ret = rtMalloc(reinterpret_cast<void **>(&syncBlockLock),
+                   syncBlockLockSize, RT_MEMORY_HBM, 0);
+    if (ret != RT_ERROR_NONE) {{
+      return {'ret' if enable_taskqueue else ''};
+    }}
+    std::vector<int64_t> lockInitData({lock_num}, {lock_ini_val});
+    ret = rtMemcpy(syncBlockLock, syncBlockLockSize, reinterpret_cast<void *>(lockInitData.data()),
+                   syncBlockLockSize, RT_MEMCPY_HOST_TO_DEVICE);
+    if (ret != RT_ERROR_NONE) {{
+      return {'ret' if enable_taskqueue else ''};
+    }}
+    ''' if lock_num > 0 else ''}
+    {f'''
+    uint64_t totalWorkSpaceSize = {workspace_size} * blockNum;
+    ret = rtMalloc(reinterpret_cast<void **>(&workspace_addr),
+                   totalWorkSpaceSize, RT_MEMORY_HBM, ModuleId);
+    if (ret != RT_ERROR_NONE) {{
+      return {'ret' if enable_taskqueue else ''};
+    }}
+    ''' if workspace_size > 0 else ''}
+    struct __attribute__((packed)) {{
+      void* ffts_addr __attribute__((aligned(8)));
+      void* syncBlockLock __attribute__((aligned(8)));
+      void* workspace_addr __attribute__((aligned(8)));
+      {' '.join(f'{_ty_to_cpp(ty)} arg{i} __attribute__((aligned({4 if ty[0] != "*" and ty[-2:] != "64" else 8})));' for i, ty in signature.items() if i not in constants)}
+      {' '.join(f'{_ty_to_cpp(ty)} grid{mark} __attribute__((aligned(4)));' for mark, ty in grid_info.items())}
+      {'void* DTData __attribute__((aligned(8)));' if enable_device_print else ''}
+    }} args = {{
+      static_cast<void *>(ffts_addr),
+      static_cast<void *>(syncBlockLock),
+      static_cast<void *>(workspace_addr),
+      {', '.join(f'static_cast<{_ty_to_cpp(ty)}>(arg{i})' for i, ty in signature.items() if i not in constants)},
+      {', '.join(f'static_cast<{_ty_to_cpp(ty)}>(grid{mark})' for mark, ty in grid_info.items())}
+      {', static_cast<void *>(DTData)' if enable_device_print else ''}
+    }};
+    {cpp_msprof_call_before_launch}
+    ret = rtKernelLaunch(func, blockNum, static_cast<void *>(&args), sizeof(args), NULL, stream);
+    {'void *&stream_ref = const_cast<void *&>(stream);' if enable_device_print else ''}
+    {'cce::internal::DebugTunnel::Close(DTData, stream_ref);' if enable_device_print else ''}
+    {cpp_msprof_call_after_launch}
+    {'return ret;' if enable_taskqueue else ''}
+  }};
+  {'at_npu::native::OpCommand cmd; cmd.Name(name.c_str()).SetCustomHandler(launch_call).Run();' if enable_taskqueue else ''}
+  return;
+}}
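For orientation, a hedged sketch of what the generator derives from a concrete signature (the two-argument signature is made up):

signature = {0: '*fp32', 1: 'i32'}  # hypothetical kernel: one pointer, one int32
# PyArg_ParseTuple format: grid (iii), stream/function (KK), four metadata
# objects (OOOO), then 'O' for the pointer and 'i' for the int32:
fmt = "iiiKKOOOO" + "Oi"
# Packed launch struct order: ffts_addr, syncBlockLock, workspace_addr (each
# void*, aligned(8)), then void* arg0 aligned(8), int32_t arg1 aligned(4),
# and finally int32_t gridX, gridY, gridZ, each aligned(4).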
+
+// Extract tensor shape from PyObject
+static std::vector<int64_t> _get_tensor_shape(PyObject *tensor) {{
+  std::vector<int64_t> shape;
+
+  // Early return if the tensor is None or null
+  if (!tensor || tensor == Py_None) {{
+    return shape;
+  }}
+
+  // Call tensor.size()
+  PyObject* size_result = PyObject_CallMethod(tensor, "size", NULL);
+  if (!size_result) {{
+    return shape;
+  }}
+  // Use PySequence_Fast for efficient element access
+  PyObject* seq = PySequence_Fast(size_result, "Expected a sequence from tensor.size()");
+  if (seq) {{
+    Py_ssize_t len = PySequence_Fast_GET_SIZE(seq);
+    PyObject** items = PySequence_Fast_ITEMS(seq);
+    for (Py_ssize_t i = 0; i < len; ++i) {{
+      PyObject* dim = items[i];
+      if (PyLong_Check(dim)) {{
+        shape.push_back(PyLong_AsLong(dim));
+      }}
+    }}
+  }}
+  Py_DECREF(seq);
+  Py_DECREF(size_result);
+  return shape;
+}}
+
+static PyObject* launch(PyObject* self, PyObject* args) {{
+  int gridX, gridY, gridZ;
+  rtStream_t stream;
+  const void *function;
+  PyObject *packedMetadata = NULL;
+  PyObject *launch_metadata = NULL;
+  PyObject *launch_enter_hook = NULL;
+  PyObject *launch_exit_hook = NULL;
+  std::vector<std::vector<int64_t>> tensorShapes;
+  {' '.join([f"{_extracted_ty(ty)} _arg{i}; " for i, ty in signature.items()])}
+  if (!PyArg_ParseTuple(
+        args, \"{format}\",
+        &gridX, &gridY, &gridZ, &stream, &function,
+        &packedMetadata, &launch_metadata,
+        &launch_enter_hook, &launch_exit_hook
+        {', ' + ', '.join(f"&_arg{i}" for i, ty in signature.items()) if len(signature) > 0 else ''}
+      )
+  ) {{
+    return NULL;
+  }}
+  if (__MsprofFlagL1)
+  {{
+    {
+      LINE_CHANGE_CHAR.join(
+          f"{{ auto tmp = _get_tensor_shape(_arg{i}); if (!tmp.empty()) tensorShapes.push_back(tmp); }}"
+          for i, ty in signature.items() if ty[0] == "*"
+      )
+    }
+  }}
+
+  if (launch_enter_hook != Py_None && !PyObject_CallObject(launch_enter_hook, args)) {{
+    return NULL;
+  }}
+
+  // get kernel_name
+  PyObject *kernelNameObj = PyDict_GetItemString(packedMetadata, "kernel_name");
+  const char *kernelName = PyUnicode_AsUTF8(kernelNameObj);
+  // get tensor_kinds
+  std::vector<int> tensorKinds;
+  PyObject *tensorKindList = PyDict_GetItemString(packedMetadata, "tensor_kinds");
+  if (tensorKindList) {{
+    int size = PyObject_Size(tensorKindList);
+    for (int i = 0; i < size; i++) {{
+      PyObject *kind = PySequence_GetItem(tensorKindList, i);
+      tensorKinds.push_back(PyLong_AsLong(kind));
+    }}
+  }}
+
+  // raise exceptions as early as possible
+  {"; ".join([f"DevicePtrInfo ptr_info{i} = getPointer(_arg{i}, {i}); if (!ptr_info{i}.valid) return NULL;" if ty[0] == "*" else "" for i, ty in signature.items()])};
+  _launch(kernelName, function, stream, gridX, gridY, gridZ, tensorShapes, tensorKinds, {', '.join(f"ptr_info{i}.dev_ptr" if ty[0] == "*" else f"_arg{i}" for i, ty in signature.items())});
+  if (PyErr_Occurred()) {{
+    return NULL;
+  }}
+  if (launch_exit_hook != Py_None && !PyObject_CallObject(launch_exit_hook, args)) {{
+    return NULL;
+  }}
+  Py_RETURN_NONE;
+}}
+
+static PyMethodDef ModuleMethods[] = {{
+  {{"launch", launch, METH_VARARGS, "Entry point for all kernels with this signature"}},
+  {{NULL, NULL, 0, NULL}}  // sentinel
+}};
+
+static struct PyModuleDef ModuleDef = {{
+  PyModuleDef_HEAD_INIT,
+  \"__triton_launcher\",
+  NULL,  // documentation
+  -1,    // size
+  ModuleMethods
+}};
+
+PyMODINIT_FUNC PyInit___triton_launcher(void) {{
+  PyObject *m = PyModule_Create(&ModuleDef);
+  if (m == NULL) {{
+    return NULL;
+  }}
+  PyModule_AddFunctions(m, ModuleMethods);
+  {cpp_msprof_callback}
+  return m;
+}}
+"""
diff --git a/third_party/aipu/backend/npu_utils.cpp b/third_party/aipu/backend/npu_utils.cpp
new file mode 100644
index 000000000..74f3d5ca7
--- /dev/null
+++ b/third_party/aipu/backend/npu_utils.cpp
@@ -0,0 +1,134 @@
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+
+#include <memory>
+#include <string>
+#include <tuple>
+#include <unordered_map>
+
+#include "experiment/runtime/runtime/rt.h"
+
+// Use maps to differentiate same-named functions coming from different binaries
+static std::unordered_map<std::string, int> registered_names;
+static std::unordered_map<std::string, std::unique_ptr<size_t>> func_stubs;
+
+static std::tuple<void *, void *>
+registerKernel(const char *name, const void *data, size_t data_size, int shared,
+               int device, const char *kernel_mode_str) {
+  rtError_t rtRet;
+
+  rtDevBinary_t devbin;
+  devbin.data = data;
+  devbin.length = data_size;
+  const std::string kernel_mode{kernel_mode_str};
+  if (kernel_mode == "aiv")
+    devbin.magic = RT_DEV_BINARY_MAGIC_ELF_AIVEC;
+  else
+    devbin.magic = RT_DEV_BINARY_MAGIC_ELF;
+  devbin.version = 0;
+
+  rtRet = rtSetDevice(device);
+  if (rtRet != RT_ERROR_NONE) {
+    printf("rtSetDevice failed, 0x%x\n", rtRet);
+    return {NULL, NULL};
+  }
+
+  void *devbinHandle = NULL;
+  rtRet = rtDevBinaryRegister(&devbin, &devbinHandle);
+  if (rtRet != RT_ERROR_NONE) {
+    printf("rtDevBinaryRegister failed, 0x%x\n", rtRet);
+    return {NULL, NULL};
+  }
+
+  std::string stubName = name;
+  stubName += "_" + std::to_string(registered_names[name]);
+  registered_names[name]++;
+  auto registered = func_stubs.emplace(stubName, std::make_unique<size_t>(0));
+  void *func_stub_handle = registered.first->second.get();
+  rtRet = rtFunctionRegister(devbinHandle, func_stub_handle, stubName.c_str(),
+                             (void *)name, 0);
+  if (rtRet != RT_ERROR_NONE) {
+    printf("rtFunctionRegister failed(stubName = %s), 0x%x\n", stubName.c_str(),
+           rtRet);
+    return {NULL, NULL};
+  }
+
+  return std::make_tuple(devbinHandle, func_stub_handle);
+}
+
+static PyObject *loadKernelBinary(PyObject *self, PyObject *args) {
+  const char *name;        // kernel name
+  const char *data;        // binary pointer
+  Py_ssize_t data_size;    // binary size
+  int shared;              // shared memory (meaningless for now)
+  int device;              // device ID
+  const char *kernel_mode; // kernel mode
+
+  if (!PyArg_ParseTuple(args, "ss#iis", &name, &data, &data_size, &shared,
+                        &device, &kernel_mode)) {
+    return NULL;
+  }
+
+  auto [module_handle, func_handle] =
+      registerKernel(name, data, data_size, shared, device, kernel_mode);
+
+  uint64_t mod = reinterpret_cast<uint64_t>(module_handle);
+  uint64_t func = reinterpret_cast<uint64_t>(func_handle);
+  if (PyErr_Occurred()) {
+    return NULL;
+  }
+
+  return Py_BuildValue("(KKii)", mod, func, 0, 0);
+}
+
+static PyObject *getArch(PyObject *self, PyObject *args) {
+  char name[64] = {'\0'};
+
+  rtError_t rtRet = rtGetSocVersion(name, 64);
+
+  if (rtRet != RT_ERROR_NONE) {
+    printf("rtGetSocVersion failed, 0x%x", rtRet);
+    return NULL;
+  }
+  if (PyErr_Occurred()) {
+    return NULL;
+  }
+  return Py_BuildValue("s", name);
+}
+
+static PyObject *getAiCoreNum(PyObject *self, PyObject *args) {
+  uint32_t aiCoreCnt;
+
+  rtError_t rtRet = rtGetAiCoreCount(&aiCoreCnt);
+
+  if (rtRet != RT_ERROR_NONE) {
+    printf("rtGetAiCoreCount failed, 0x%x", rtRet);
+    return NULL;
+  }
+  if (PyErr_Occurred()) {
+    return NULL;
+  }
+  return Py_BuildValue("I", aiCoreCnt);
+}
+
+static PyMethodDef NpuUtilsMethods[] = {
+    {"load_kernel_binary", loadKernelBinary, METH_VARARGS,
+     "Load an NPU kernel binary into the NPU driver"},
+    {"get_arch", getArch, METH_VARARGS, "Get the SoC version of the NPU"},
+    {"get_aicore_num", getAiCoreNum, METH_VARARGS, "Get the number of AI cores"},
+    {NULL, NULL, 0, NULL}};
+
+static PyModuleDef ModuleDef = {
+    PyModuleDef_HEAD_INIT, "npu_utils",
+    "Utilities for fetching NPU device info and preparing kernel binaries", -1,
+    NpuUtilsMethods};
+
+PyMODINIT_FUNC PyInit_npu_utils(void) {
+  PyObject *m = PyModule_Create(&ModuleDef);
+  if (m == NULL) {
+    return NULL;
+  }
+
+  PyModule_AddFunctions(m, NpuUtilsMethods);
+  return m;
+}
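On the Python side, `NPUUtils.load_binary` (driver.py above) consumes this module. A hedged sketch of the call contract (`npu_utils_mod` and `kernel_bytes` are placeholders):

fnname, mix_mode = "triton_add_kernel aiv".split()  # metadata["name"] format
# Positional args match PyArg_ParseTuple("ss#iis"): name, data, shared, device, mode.
mod_h, func_h, n_regs, n_spills = npu_utils_mod.load_kernel_binary(
    fnname, kernel_bytes, 0, 0, mix_mode)
# mod_h/func_h are the rtDevBinaryRegister / rtFunctionRegister handles as
# uint64; the trailing (0, 0) mirrors the (n_regs, n_spills) slots that other
# backends return.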
diff --git a/third_party/aipu/backend/utils.py b/third_party/aipu/backend/utils.py
new file mode 100644
index 000000000..ec9919102
--- /dev/null
+++ b/third_party/aipu/backend/utils.py
@@ -0,0 +1,172 @@
+# -*- coding: utf-8 -*-
+import functools
+import os
+import shutil
+import subprocess
+import sysconfig
+from pathlib import Path
+import pybind11
+
+
+def _get_npucompiler_path() -> str:
+    npu_compiler_path = shutil.which("bishengir-compile")
+    if npu_compiler_path is None:
+        npu_compiler_root = os.getenv("TRITON_NPU_COMPILER_PATH", "")
+        if not npu_compiler_root:
+            raise EnvironmentError("Couldn't find executable bishengir-compile or TRITON_NPU_COMPILER_PATH.")
+        npu_compiler_path = os.path.join(npu_compiler_root, "npuc")
+    return npu_compiler_path
+
+
+def _get_bisheng_path() -> str:
+    bisheng_path = shutil.which("bisheng")
+    if bisheng_path is None:
+        npu_compiler_root = os.getenv("TRITON_NPU_COMPILER_PATH", "")
+        if not npu_compiler_root:
+            raise EnvironmentError("Couldn't find executable bisheng or TRITON_NPU_COMPILER_PATH")
+        bisheng_path = os.path.join(npu_compiler_root, "ccec")
+    return bisheng_path
+
+
+def _check_bishengir_api_change() -> bool:
+    bishengir_path = _get_npucompiler_path()
+    try:
+        result = subprocess.run(
+            [bishengir_path, "--help"],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True,
+        )
+        if result.returncode == 0 and 'limit-auto-multi-buffer-buffer' in result.stdout:
+            return True
+        else:
+            return False
+    except Exception as e:
+        print(f"ERROR: {e}")
+        return False
+
+
+def _check_bishengir_is_regbased() -> bool:
+    bishengir_path = _get_npucompiler_path()
+    try:
+        result = subprocess.run(
+            [bishengir_path, "--help"],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True,
+        )
+        if result.returncode == 0 and 'reg-based' in result.stdout:
+            return True
+        else:
+            return False
+    except Exception as e:
+        print(f"ERROR: {e}")
+        return False
+
+
+@functools.lru_cache(None)
+def _get_ascend_path() -> Path:
+    path = os.getenv("ASCEND_HOME_PATH", "")
+    if path == "":
+        raise EnvironmentError("ASCEND_HOME_PATH is not set, source /set_env.sh first")
+    return Path(path)
+
+
+def _is_ascend_sanitizer_enabled() -> bool:
+    return os.getenv("TRITON_ENABLE_SANITIZER", "false").lower() in ("true", "1")
+
+
+def _is_debug_line_info_disabled() -> bool:
+    return os.getenv("TRITON_DISABLE_LINE_INFO", "true").lower() in ("true", "1")
+
+
+def _is_auto_map_parallel_blocks_enabled() -> bool:
+    if not _enable_unpublished_feature():
+        return False
+    return os.getenv("TRITON_ALL_BLOCKS_PARALLEL", "false").lower() in ("true", "1")
+
+
+def _enable_unpublished_feature() -> bool:
+    return os.getenv("ENABLE_UNPUBLISHED_FEATURE", "false").lower() in ("true", "1")
+
+
+def _build_npu_ext(obj_name: str, src_path, src_dir, *, kernel_launcher=None) -> str:
+    suffix = sysconfig.get_config_var("EXT_SUFFIX")
+    so_path = os.path.join(src_dir, f"{obj_name}{suffix}")
+
+    cxx = os.environ.get("CC")
+    if cxx is None:
+        clangxx = shutil.which("clang++")
+        gxx = shutil.which("g++")
+        cxx = clangxx if clangxx is not None else gxx
+        if cxx is None:
+            raise RuntimeError("Failed to find C++ compiler")
+    cc_cmd = [cxx, src_path]
+    cc_cmd += ["-w"]
+
+    if hasattr(sysconfig, "get_default_scheme"):
+        scheme = sysconfig.get_default_scheme()
+    else:
+        scheme = sysconfig._get_default_scheme()
+    # 'posix_local' is a Debian-specific scheme; fall back to 'posix_prefix'
+    if scheme == "posix_local":
+        scheme = "posix_prefix"
+    py_include_dir = sysconfig.get_paths(scheme=scheme)["include"]
+    cc_cmd += [f"-I{py_include_dir}"]
+    cc_cmd += [f"-I{os.path.dirname(os.path.realpath(__file__))}"]
+
+    asc_path = _get_ascend_path()
+    cc_cmd += [
+        f"-I{os.path.join(asc_path, 'include')}",
+        f"-I{os.path.join(asc_path, 'include/experiment')}",
+        f"-I{os.path.join(asc_path, 'include/experiment/msprof')}",
+        f"-I{pybind11.get_include()}",
+        f"-L{os.path.join(asc_path, 'lib64')}",
+        "-lruntime",
+        "-lascendcl",
+    ]
+
+    if kernel_launcher == "torch":
+        import torch
+        import torch_npu
+        torch_path = os.path.dirname(os.path.realpath(torch.__file__))
+        torch_npu_path = os.path.dirname(os.path.realpath(torch_npu.__file__))
+        use_cxx11_abi = _check_cxx11_abi()
+        cc_cmd += [
+            f"-I{os.path.join(torch_path, 'include')}",
+            f"-I{os.path.join(torch_npu_path, 'include')}",
+            f"-L{os.path.join(torch_npu_path, 'lib')}",
+            "-ltorch_npu",
+            f"-D_GLIBCXX_USE_CXX11_ABI={use_cxx11_abi}",
+        ]
+
+    cc_cmd += ["-std=c++17", "-shared", "-fPIC", "-o", so_path]
+    # check_call raises CalledProcessError on failure and returns 0 otherwise
+    ret = subprocess.check_call(cc_cmd)
+
+    if ret == 0:
+        return so_path
+    else:
+        raise RuntimeError("Failed to compile " + src_path)
+
+
+def _check_cxx11_abi():
+    import torch
+    return 1 if torch._C._GLIBCXX_USE_CXX11_ABI else 0
+
+
+def convert_sigtype_to_int(sigty: str):
+    MAP_SIGTYPE_TO_INT = {
+        "i1": 12,
+        "i8": 2,
+        "i16": 6,
+        "i32": 3,
+        "i64": 9,
+        "u32": 8,
+        "u64": 10,
+        "fp16": 1,
+        "bf16": 27,
+        "fp32": 0,
+        "fp64": 11,
+    }
+    if sigty not in MAP_SIGTYPE_TO_INT:
+        raise ValueError(f"Unsupported data type: {sigty}")
+    return MAP_SIGTYPE_TO_INT[sigty]
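Finally, a hedged example of how the profiling path uses this table (the signature is made up):

signature = {0: '*fp16', 1: '*fp32', 2: 'i32'}  # hypothetical kernel signature
dtype_codes = [convert_sigtype_to_int(ty[1:])   # strip the leading '*'
               for ty in signature.values() if ty.startswith('*')]
assert dtype_codes == [1, 0]                    # fp16 -> 1, fp32 -> 0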