nod-ai
diff --git a/‎core/shark_turbine/runtime/device.py‎
Lines changed: 4 additions & 1 deletion b/‎core/shark_turbine/runtime/device.py‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎core/shark_turbine/runtime/op_reg/base.py‎
Lines changed: 6 additions & 3 deletions b/‎core/shark_turbine/runtime/op_reg/base.py‎
Lines changed: 6 additions & 3 deletions
diff --git a/‎core/shark_turbine/runtime/op_reg/compiler.py‎
Lines changed: 25 additions & 10 deletions b/‎core/shark_turbine/runtime/op_reg/compiler.py‎
Lines changed: 25 additions & 10 deletions
diff --git a/‎core/shark_turbine/runtime/op_reg/eager.py‎
Lines changed: 66 additions & 1 deletion b/‎core/shark_turbine/runtime/op_reg/eager.py‎
Lines changed: 66 additions & 1 deletion
diff --git a/‎core/shark_turbine/runtime/tracing.py‎
Lines changed: 111 additions & 0 deletions b/‎core/shark_turbine/runtime/tracing.py‎
Lines changed: 111 additions & 0 deletions
diff --git a/‎core/shark_turbine/support/__init__.py‎
Lines changed: 2 additions & 0 deletions b/‎core/shark_turbine/support/__init__.py‎
Lines changed: 2 additions & 0 deletions
@@ -298,7 +298,10 @@ def _device_export_torch_tensor_cpu(
 }
 
 DEVICE_TARGET_COMPILE_FLAGS: dict[str, tuple[str, ...]] = {
-    "local-task": ("--iree-hal-target-backends=llvm-cpu",),
+    "local-task": (
+        "--iree-hal-target-backends=llvm-cpu",
+        "--iree-llvmcpu-target-cpu-features=host",
+    ),
 }
 
 # Aliases.
 
@@ -10,7 +10,7 @@
 
 from typing import Any, Callable, Optional, Sequence, Type, Union, cast
 
-from abc import ABC, abstractmethod, abstractproperty
+from abc import ABC, abstractmethod
 import functools
 import logging
 import re
@@ -37,6 +37,8 @@
     func_d,
 )
 
+from ...support.logging import runtime_logger as logger
+
 from ...support.conversions import (
     TORCH_DTYPE_TO_IREE_TYPE_ASM,
 )
@@ -53,7 +55,6 @@
     "def_library",
 ]
 
-logger = logging.getLogger("turbine.runtime.op_reg")
 
 ###############################################################################
 # Op library management
@@ -167,7 +168,8 @@ def __init__(
         fq_name = f"{library.ns}.{name}"
         ALL_CUSTOM_OP_REGS[fq_name] = self
 
-    @abstractproperty
+    @property
+    @abstractmethod
     def signature(self) -> str:
         """PyTorch function signature.
 
@@ -616,6 +618,7 @@ def __init__(
         self.arg_bindings = arg_bindings
         self.ip = ip
         self.module_body = module_body
+        self.context = module_body.owner.context
         self.symbol_table = symbol_table
         self.yielded = False
 
 
@@ -6,7 +6,7 @@
 
 from dataclasses import dataclass
 from timeit import default_timer
-from typing import Any
+from typing import Any, Optional
 
 from iree.compiler.api import (
     Session,
@@ -36,6 +36,8 @@
     Device,
 )
 
+from ..tracing import tracer
+
 from .base import (
     FreeFuncKernelBuilder,
     KernelSelection,
@@ -69,6 +71,10 @@ class KernelCompileConfig:
     # things like unbacked memory mappings, etc.
     keep_alive: Any = None
 
+    # If tracing is enabled, this may contain a sanitized key that can be
+    # used to log additional information against the kernel.
+    tracing_key: Optional[str] = None
+
 
 # TODO: The cache should be more than just a simple dict. Can be persistent
 KERNEL_CACHE: dict[str, tuple[VmContext, VmFunction, KernelCompileConfig]] = {}
@@ -95,7 +101,7 @@ def compile_standalone_kernel(
         ksel.op.generate(ksel, kb)
     kb.module_op.verify()
     module_asm = kb.module_op.get_asm(
-        binary=True, enable_debug_info=True, print_generic_op_form=True
+        binary=True, enable_debug_info=True, print_generic_op_form=False
     )
     generation_time = default_timer() - start
 
@@ -129,14 +135,23 @@ def compile_standalone_kernel(
     vm_context = VmContext(vm_instance, [device.create_hal_module(), vm_module])
     main_function = vm_module.lookup_function("main")
 
-    logger.debug(
-        "Compiled kernel %s: mlir=%d bytes, vmfb=%d bytes (generation: %sms, compilation: %sms)",
-        cache_key,
-        len(module_asm),
-        len(mapped_memory),
-        generation_time * 1000,
-        compilation_time * 1000,
-    )
+    if tracer.enabled:
+        config.tracing_key = tracer.save_jit_kernel_artifacts(
+            cache_key=cache_key, module_asm=module_asm, binary=mapped_memory
+        )
+        tracer.log_structured(
+            tag="COMPILE",
+            msg=f"Compiled kernel {config.tracing_key}, cache_key={cache_key}",
+            columns=[
+                config.tracing_key,
+                main_function.name,
+                len(module_asm),
+                len(mapped_memory),
+                generation_time * 1000,
+                compilation_time * 1000,
+                " ".join(session.get_flags(non_default_only=True)),
+            ],
+        )
     cache_hit = (vm_context, main_function, config)
     KERNEL_CACHE[cache_key] = cache_hit
     return cache_hit
@@ -13,6 +13,8 @@
 
 from iree.runtime import (
     HalBufferView,
+    HalElementType,
+    VmRef,
     VmVariantList,
 )
 
@@ -29,6 +31,8 @@
     lookup_device_from_torch,
 )
 
+from ..tracing import tracer
+
 from .base import (
     AttrArg,
     IntArg,
@@ -37,6 +41,7 @@
 
 from .compiler import (
     compile_standalone_kernel,
+    KernelCompileConfig,
 )
 
 __all__ = [
@@ -153,7 +158,8 @@ def push_tensor(tensor_arg):
     start = default_timer()
     vm_context.invoke(vm_f, arg_list, ret_list)
     invoke_time = default_timer() - start
-    logger.debug("Kernel invocation %s: %sms", config.key, invoke_time * 1000)
+    if tracer.enabled:
+        _log_eager_dispatch(config, arg_list, invoke_time * 1000)
 
     # Unpack results.
     results = []
@@ -179,3 +185,62 @@ def push_tensor(tensor_arg):
         return None
     else:
         return tuple(results)
+
+
+def _log_eager_dispatch(
+    config: KernelCompileConfig, arg_list: VmVariantList, invoke_time_millis: float
+):
+    """Logs one structured `INVOKE_KERNEL` trace record for an eager dispatch.
+
+    The record columns are the kernel's tracing key, the invocation time in
+    milliseconds, and one column per argument. Buffer view arguments are
+    summarized via `_log_format_buffer_view`; every other variant is logged
+    as-is.
+
+    Formatting is best effort: if inspecting the arguments fails, the error is
+    reported through the tracer and the record is still emitted with whatever
+    arguments were formatted up to that point.
+    """
+    args = []
+    try:
+        for i in range(arg_list.size):
+            variant = arg_list.get_variant(i)
+            if isinstance(variant, VmRef) and variant.isinstance(HalBufferView):
+                args.append(_log_format_buffer_view(variant.deref(HalBufferView)))
+            else:
+                args.append(variant)
+    except Exception:
+        # Tracing must never break the dispatch itself, but do not swallow
+        # KeyboardInterrupt/SystemExit the way a bare `except:` would.
+        tracer.exception("Exception while pretty-printing arguments")
+
+    tracer.log_structured(
+        tag="INVOKE_KERNEL",
+        msg="",
+        columns=[config.tracing_key, invoke_time_millis] + args,
+    )
+
+
+def _log_format_buffer_view(bv: HalBufferView) -> str:
+    """Formats a buffer view as `<dims>x<dtype>` (e.g. `2x3xf32`) for logging.
+
+    A buffer view with an empty shape formats as just the dtype, with no
+    leading `x`. Element types missing from `_LOG_HAL_ELEMENT_TYPE_DESC` are
+    rendered as `<{element_type}>`.
+    """
+    # TODO: We should expose this as a method on HalBufferView upstream instead
+    # of half doing it here.
+    dtype_desc = _LOG_HAL_ELEMENT_TYPE_DESC.get(bv.element_type)
+    if dtype_desc is None:
+        dtype_desc = f"<{bv.element_type}>"
+    # Join dims and dtype together so that an empty shape does not produce a
+    # dangling separator.
+    return "x".join([*(str(dim) for dim in bv.shape), dtype_desc])
+
+
+# Short textual descriptions of HAL element types, used by
+# `_log_format_buffer_view` when summarizing buffer views in trace logs.
+# Element types not listed here are logged in a `<...>` fallback form.
+_LOG_HAL_ELEMENT_TYPE_DESC = {
+    HalElementType.BFLOAT_16: "bf16",
+    HalElementType.BOOL_8: "i1",
+    HalElementType.COMPLEX_64: "cf64",
+    HalElementType.COMPLEX_128: "cf128",
+    HalElementType.FLOAT_16: "f16",
+    HalElementType.FLOAT_32: "f32",
+    HalElementType.FLOAT_64: "f64",
+    HalElementType.INT_4: "i4",
+    HalElementType.INT_8: "i8",
+    HalElementType.INT_16: "i16",
+    HalElementType.INT_32: "i32",
+    HalElementType.INT_64: "i64",
+    HalElementType.SINT_4: "si4",
+    HalElementType.SINT_8: "si8",
+    HalElementType.SINT_16: "si16",
+    HalElementType.SINT_32: "si32",
+    HalElementType.SINT_64: "si64",
+    HalElementType.UINT_4: "ui4",
+    HalElementType.UINT_8: "ui8",
+    HalElementType.UINT_16: "ui16",
+    HalElementType.UINT_32: "ui32",
+    HalElementType.UINT_64: "ui64",
+}
@@ -0,0 +1,111 @@
+# Copyright 2024 Advanced Micro Devices, Inc
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+import hashlib
+import os
+from pathlib import Path
+import logging
+
+from ..support.debugging import flags
+from ..support.logging import get_logger, DefaultFormatter
+
+logger = get_logger("turbine.runtime")
+
+
+class RuntimeTracer:
+    """Base interface for fine grained tracing of runtime interactions.
+
+    This base class is the disabled tracer: `enabled` is False and every
+    logging method does nothing. Subclasses override the methods to record
+    real trace output.
+    """
+
+    __slots__ = ["enabled"]
+
+    def __init__(self):
+        self.enabled: bool = False
+
+    def save_jit_kernel_artifacts(
+        self, *, cache_key: str, module_asm: bytes, binary: memoryview
+    ) -> str:
+        """Returns `cache_key` unchanged as the tracing key; saves nothing."""
+        return cache_key
+
+    def info(self, msg, *args, **kwargs):
+        """Discards an info-level message."""
+
+    def error(self, msg, *args, **kwargs):
+        """Discards an error-level message."""
+
+    def exception(self, msg, *args, **kwargs):
+        """Discards an exception report."""
+
+    def log_structured(self, *, tag: str, msg: str, columns: list):
+        """Discards a structured record."""
+
+
+class DirectoryTracer(RuntimeTracer):
+    """Tracer that writes a log file and JIT kernel artifacts to a directory.
+
+    Log records are written to `runtime.log` inside the directory. Kernel
+    artifacts are saved alongside it as `<tracing_key>.mlir` and
+    `<tracing_key>.vmfb`.
+    """
+
+    __slots__ = [
+        "dir",
+        "logger",
+    ]
+
+    def __init__(self, dir: Path):
+        super().__init__()
+        self.dir = dir
+        self.enabled = True
+        # Configure a dedicated logger that writes only to the trace log file
+        # and does not propagate records to parent loggers.
+        trace_logger = self.logger = logging.getLogger("turbine.runtime.tracer")
+        log_file = dir / "runtime.log"
+        trace_logger.setLevel(logging.DEBUG)
+        handler = logging.FileHandler(log_file)
+        handler.setFormatter(DefaultFormatter())
+        trace_logger.addHandler(handler)
+        trace_logger.propagate = False
+        logger.info("Set up turbine runtime tracing to %s", log_file)
+        trace_logger.info("Started process %d", os.getpid())
+
+    def save_jit_kernel_artifacts(
+        self, *, cache_key: str, module_asm: bytes, binary: memoryview
+    ) -> str:
+        """Saves the kernel's MLIR and compiled binary to the trace directory.
+
+        The file names are derived from the SHA-1 hex digest of `cache_key`,
+        which is also returned as the tracing key. I/O errors are logged and
+        do not propagate; the tracing key is returned either way.
+        """
+        tracing_key = hashlib.sha1(
+            cache_key.encode(), usedforsecurity=False
+        ).hexdigest()
+        try:
+            with open(self.dir / f"{tracing_key}.mlir", "wb") as f:
+                f.write(module_asm)
+            with open(self.dir / f"{tracing_key}.vmfb", "wb") as f:
+                f.write(binary)
+        except IOError:
+            self.logger.exception("Error saving artifact for %s", tracing_key)
+        else:
+            # Only report success when both files were actually written.
+            self.logger.info("Saved artifacts for %s", tracing_key)
+        return tracing_key
+
+    def info(self, msg, *args, **kwargs):
+        """Logs an info-level message to the trace log."""
+        self.logger.info(msg, *args, **kwargs)
+
+    def error(self, msg, *args, **kwargs):
+        """Logs an error-level message to the trace log."""
+        self.logger.error(msg, *args, **kwargs)
+
+    def exception(self, msg, *args, **kwargs):
+        """Logs an error with the active exception's traceback."""
+        # Default to attributing the record to the caller of this wrapper,
+        # while still allowing an explicit `stacklevel` to be passed through.
+        kwargs.setdefault("stacklevel", 2)
+        self.logger.exception(msg, *args, **kwargs)
+
+    def log_structured(self, *, tag: str, msg: str, columns: list):
+        """Logs `msg` followed by a `::<tag>` line of tab separated columns."""
+        columns_joined = "\t".join(str(c) for c in columns)
+        self.logger.info("%s\n::%s\t%s", msg, tag, columns_joined)
+
+
+def _setup_default_tracer() -> RuntimeTracer:
+    """Creates the process-wide tracer based on `flags.runtime_trace_dir`.
+
+    When the flag is set, the directory is created if needed and a
+    `DirectoryTracer` for it is returned. When the flag is unset, or setting
+    up the directory raises an IOError (which is logged), the no-op
+    `RuntimeTracer` is returned instead.
+    """
+    configured_dir = flags.runtime_trace_dir
+    if not configured_dir:
+        return RuntimeTracer()
+
+    try:
+        trace_dir = Path(configured_dir)
+        trace_dir.mkdir(parents=True, exist_ok=True)
+        return DirectoryTracer(trace_dir)
+    except IOError:
+        logger.exception("Error configuring runtime tracing to: %s", trace_dir)
+
+    # Fall back to the disabled tracer when setup failed.
+    return RuntimeTracer()
+
+
+tracer: RuntimeTracer = _setup_default_tracer()
@@ -4,4 +4,6 @@
 # See https://llvm.org/LICENSE.txt for license information.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
+# Debugging must be loaded first as other low level things depend on it.
+from .debugging import *
 from .exceptions import *
Original file line number	Diff line number	Diff line change
`@@ -298,7 +298,10 @@ def _device_export_torch_tensor_cpu(`
`298`	`298`	`}`
`299`	`299`
`300`	`300`	`DEVICE_TARGET_COMPILE_FLAGS: dict[str, tuple[str, ...]] = {`
`301`		`- "local-task": ("--iree-hal-target-backends=llvm-cpu",),`
	`301`	`+ "local-task": (`
	`302`	`+ "--iree-hal-target-backends=llvm-cpu",`
	`303`	`+ "--iree-llvmcpu-target-cpu-features=host",`
	`304`	`+ ),`
`302`	`305`	`}`
`303`	`306`
`304`	`307`	`# Aliases.`