JuliaGPU
diff --git a/‎docs/src/api/kernel.md‎
Lines changed: 9 additions & 0 deletions b/‎docs/src/api/kernel.md‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎docs/src/usage/kernel.md‎
Lines changed: 28 additions & 0 deletions b/‎docs/src/usage/kernel.md‎
Lines changed: 28 additions & 0 deletions
diff --git a/‎lib/mtl/MTL.jl‎
Lines changed: 1 addition & 0 deletions b/‎lib/mtl/MTL.jl‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎lib/mtl/command_queue.jl‎
Lines changed: 25 additions & 0 deletions b/‎lib/mtl/command_queue.jl‎
Lines changed: 25 additions & 0 deletions
diff --git a/‎lib/mtl/log_state.jl‎
Lines changed: 40 additions & 0 deletions b/‎lib/mtl/log_state.jl‎
Lines changed: 40 additions & 0 deletions
diff --git a/‎src/Metal.jl‎
Lines changed: 1 addition & 0 deletions b/‎src/Metal.jl‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/compiler/compilation.jl‎
Lines changed: 6 additions & 4 deletions b/‎src/compiler/compilation.jl‎
Lines changed: 6 additions & 4 deletions
diff --git a/‎src/compiler/execution.jl‎
Lines changed: 31 additions & 3 deletions b/‎src/compiler/execution.jl‎
Lines changed: 31 additions & 3 deletions
@@ -53,4 +53,13 @@ MtlThreadGroupArray
 MemoryFlags
 threadgroup_barrier
 simdgroup_barrier
+```
+
+## Printing
+
+```@docs
+@mtlprintf
+@mtlprint
+@mtlprintln
+@mtlshow
 ```
@@ -84,6 +84,34 @@ Additional notes:
 - Kernels must always return nothing
 - Kernels are asynchronous. To synchronize, use the `Metal.@sync` macro.
 
+## Printing
+
+When debugging, it's not uncommon to want to print some values. This is achieved with `@mtlprintf`:
+
+```julia
+function gpu_add2_print!(y, x)
+    index = thread_position_in_grid_1d()
+    @mtlprintf("thread %d", index)
+    @inbounds y[i] += x[i]
+    return nothing
+end
+
+A = Metal.ones(Float32, 8);
+B = Metal.rand(Float32, 8);
+
+@metal threads=length(A) gpu_add2_print!(A, B)
+```
+
+`@mtlprintf` is supported on macOS 15 and later. `@mtlprintf` support most of the format specifiers that `printf`
+supports in C with the following exceptions:
+ - `%n` and `%s` conversion specifiers are not supported
+ - Default argument promotion applies to arguments of half type which promote to the `double` type
+ - The format string must be a string literal
+
+Metal places output from `@mtlprintf` into a log buffer. The system only removes the messages from the log buffer when the command buffer completes. When the log buffer becomes full, the system drops all subsequent messages.
+
+See also: `@mtlprint`, `@mtlprintln` and `@mtlshow`
+
 ## Other Helpful Links
 
 [Metal Shading Language Specification](https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf)
@@ -34,6 +34,7 @@ include("events.jl")
 include("fences.jl")
 include("heap.jl")
 include("buffer.jl")
+include("log_state.jl")
 include("command_queue.jl")
 include("command_buf.jl")
 include("compute_pipeline.jl")
 
@@ -1,3 +1,21 @@
+export MTLCommandQueueDescriptor
+
+@objcwrapper immutable=false MTLCommandQueueDescriptor <: NSObject
+
+@objcproperties MTLCommandQueueDescriptor begin
+    @autoproperty maxCommandBufferCount::NSUInteger
+    @autoproperty logState::id{MTLLogState} setter=setLogState
+end
+
+function MTLCommandQueueDescriptor()
+    handle = @objc [MTLCommandQueueDescriptor alloc]::id{MTLCommandQueueDescriptor}
+    obj = MTLCommandQueueDescriptor(handle)
+    finalizer(release, obj)
+    @objc [obj::id{MTLCommandQueueDescriptor} init]::id{MTLCommandQueueDescriptor}
+    return obj
+end
+
+
 export MTLCommandQueue
 
 # @objcwrapper immutable=false MTLCommandQueue <: NSObject
@@ -8,3 +26,10 @@ function MTLCommandQueue(dev::MTLDevice)
     finalizer(release, obj)
     return obj
 end
+
+function MTLCommandQueue(dev::MTLDevice, descriptor::MTLCommandQueueDescriptor)
+    handle = @objc [dev::id{MTLDevice} newCommandQueueWithDescriptor:descriptor::id{MTLCommandQueueDescriptor}]::id{MTLCommandQueue}
+    obj = MTLCommandQueue(handle)
+    finalizer(release, obj)
+    return obj
+end
@@ -0,0 +1,40 @@
+export MTLLogLevel
+
+@cenum MTLLogLevel::NSInteger begin
+    MTLLogLevelUndefined = 0
+    MTLLogLevelDebug = 1
+    MTLLogLevelInfo = 2
+    MTLLogLevelNotice = 3
+    MTLLogLevelError = 4
+    MTLLogLevelFault = 5
+end
+
+export MTLLogStateDescriptor
+
+@objcwrapper immutable=false MTLLogStateDescriptor <: NSObject
+
+@objcproperties MTLLogStateDescriptor begin
+    @autoproperty level::MTLLogLevel setter=setLevel
+    @autoproperty bufferSize::NSInteger setter=setBufferSize
+end
+
+function MTLLogStateDescriptor()
+    handle = @objc [MTLLogStateDescriptor alloc]::id{MTLLogStateDescriptor}
+    obj = MTLLogStateDescriptor(handle)
+    finalizer(release, obj)
+    @objc [obj::id{MTLLogStateDescriptor} init]::id{MTLLogStateDescriptor}
+    return obj
+end
+
+
+export MTLLogState
+
+@objcwrapper MTLLogState <: NSObject
+
+function MTLLogState(dev::MTLDevice, descriptor::MTLLogStateDescriptor)
+    err = Ref{id{NSError}}(nil)
+    handle = @objc [dev::id{MTLDevice} newLogStateWithDescriptor:descriptor::id{MTLLogStateDescriptor}
+                                       error:err::Ptr{id{NSError}}]::id{MTLLogState}
+    err[] == nil || throw(NSError(err[]))
+    MTLLogState(handle)
+end
@@ -37,6 +37,7 @@ include("device/intrinsics/synchronization.jl")
 include("device/intrinsics/memory.jl")
 include("device/intrinsics/simd.jl")
 include("device/intrinsics/atomics.jl")
+include("device/intrinsics/output.jl")
 include("device/quirks.jl")
 
 # array essentials
 
@@ -104,9 +104,9 @@ function compile(@nospecialize(job::CompilerJob))
 
     @signpost_interval log=log_compiler() "Generate LLVM IR" begin
         # TODO: on 1.9, this actually creates a context. cache those.
-        ir, entry = JuliaContext() do ctx
+        ir, entry, loggingEnabled = JuliaContext() do ctx
             mod, meta = GPUCompiler.compile(:llvm, job)
-            string(mod), LLVM.name(meta.entry)
+            string(mod), LLVM.name(meta.entry), haskey(functions(mod), "air.os_log")
         end
     end
 
@@ -172,7 +172,7 @@ function compile(@nospecialize(job::CompilerJob))
         end
     end
 
-    return (; ir, air, metallib, entry)
+    return (; ir, air, metallib, entry, loggingEnabled)
 end
 
 # link into an executable kernel
@@ -210,5 +210,7 @@ end
         end
     end
 
-    pipeline_state
+    # most of the time, we don't need the function object,
+    # so don't keep it alive unconditionally in GPUCompiler's caches
+    pipeline_state, compiled.loggingEnabled
 end
@@ -161,6 +161,7 @@ mtlconvert(arg, cce=nothing) = adapt(Adaptor(cce), arg)
 struct HostKernel{F,TT}
     f::F
     pipeline::MTLComputePipelineState
+    loggingEnabled::Bool
 end
 
 const mtlfunction_lock = ReentrantLock()
@@ -186,15 +187,15 @@ function mtlfunction(f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
         cache = compiler_cache(dev)
         source = methodinstance(F, tt)
         config = compiler_config(dev; name, kwargs...)::MetalCompilerConfig
-        pipeline = GPUCompiler.cached_compilation(cache, source, config, compile, link)
+        pipeline, loggingEnabled = GPUCompiler.cached_compilation(cache, source, config, compile, link)
 
         # create a callable object that captures the function instance. we don't need to think
         # about world age here, as GPUCompiler already does and will return a different object
         h = hash(pipeline, hash(f, hash(tt)))
         kernel = get(_kernel_instances, h, nothing)
         if kernel === nothing
             # create the kernel state object
-            kernel = HostKernel{F,tt}(f, pipeline)
+            kernel = HostKernel{F,tt}(f, pipeline, loggingEnabled)
             _kernel_instances[h] = kernel
         end
         return kernel::HostKernel{F,tt}
@@ -275,7 +276,34 @@ end
     (threads.width * threads.height * threads.depth) > kernel.pipeline.maxTotalThreadsPerThreadgroup &&
         throw(ArgumentError("Number of threads in group ($(threads.width * threads.height * threads.depth)) should not exceed $(kernel.pipeline.maxTotalThreadsPerThreadgroup)"))
 
-    cmdbuf = MTLCommandBuffer(queue)
+    cmdbuf = if kernel.loggingEnabled
+        if macos_version() < v"15"
+            @error "Logging is only supported on macOS 15 or higher"
+        end
+
+        if MTLCaptureManager().isCapturing
+            @error "Logging is not supported while GPU frame capturing"
+        end
+
+        log_state_descriptor = MTLLogStateDescriptor()
+        log_state_descriptor.level = MTL.MTLLogLevelDebug
+        log_state = MTLLogState(queue.device, log_state_descriptor)
+
+        function log_handler(subSystem, category, logLevel, message)
+            print(String(NSString(message)))
+            return nothing
+        end
+
+        block = @objcblock(log_handler, Nothing, (id{NSString}, id{NSString}, NSInteger, id{NSString}))
+        @objc [log_state::id{MTLLogState} addLogHandler:block::id{NSBlock}]::Nothing
+
+        cmdbuf_descriptor = MTLCommandBufferDescriptor()
+        cmdbuf_descriptor.logState = log_state
+        MTLCommandBuffer(queue, cmdbuf_descriptor)
+    else
+        MTLCommandBuffer(queue)
+    end
+
     cmdbuf.label = "MTLCommandBuffer($(nameof(kernel.f)))"
     cce = MTLComputeCommandEncoder(cmdbuf)
     argument_buffers = try