From 4179f1fd11afef407d49f14d4be2d196a27dd8cf Mon Sep 17 00:00:00 2001 From: atharva Date: Tue, 21 Jan 2025 05:02:45 +0530 Subject: [PATCH 1/6] Add error checking to command buffer completion handler --- src/compiler/execution.jl | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/compiler/execution.jl b/src/compiler/execution.jl index c16683bcf..e0affe383 100644 --- a/src/compiler/execution.jl +++ b/src/compiler/execution.jl @@ -300,8 +300,18 @@ end empty!(roots) foreach(free, argument_buffers) - # TODO: access logs here to check for errors - # https://developer.apple.com/videos/play/wwdc2020/10616/ + # Check for errors + if buf.status == MTL.MTLCommandBufferStatusError + err = buf.error + if err !== nothing + # Get error details + code = err.code # MTLCommandBufferError enum value + description = err.localizedDescription + + # Log the error + @error "GPU kernel execution failed" exception=(err, catch_backtrace()) kernel=nameof(kernel.f) error_code=code description=description + end + end end commit!(cmdbuf) From be41ab802070ebe1e79b481547c7b5590b811e7c Mon Sep 17 00:00:00 2001 From: atharva Date: Tue, 21 Jan 2025 05:10:13 +0530 Subject: [PATCH 2/6] fix fmt --- src/compiler/execution.jl | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/compiler/execution.jl b/src/compiler/execution.jl index e0affe383..886a73963 100644 --- a/src/compiler/execution.jl +++ b/src/compiler/execution.jl @@ -304,12 +304,10 @@ end if buf.status == MTL.MTLCommandBufferStatusError err = buf.error if err !== nothing - # Get error details code = err.code # MTLCommandBufferError enum value description = err.localizedDescription - - # Log the error - @error "GPU kernel execution failed" exception=(err, catch_backtrace()) kernel=nameof(kernel.f) error_code=code description=description + + @error "GPU kernel execution failed" exception = (err, catch_backtrace()) kernel = nameof(kernel.f) error_code = code description = description end 
end end From 96808ca2dbdf64fb1b929100eb06d5069a41c05f Mon Sep 17 00:00:00 2001 From: atharva Date: Tue, 21 Jan 2025 21:39:52 +0530 Subject: [PATCH 3/6] add test and fight with fmt --- test/execution.jl | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/test/execution.jl b/test/execution.jl index d67b4ac42..d8749c991 100644 --- a/test/execution.jl +++ b/test/execution.jl @@ -260,3 +260,12 @@ end @test Array(a)[] == 1 end end + +@testset "error handling" begin + function failing_kernel() + x = unsafe_load(Ptr{Int}(0)) + return nothing + end + + @test_logs (:error, r"GPU kernel execution failed") @metal failing_kernel() +end From 60996c64ba22f31d7b6e2fdaa4645683c0fa9aac Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Mon, 27 Jan 2025 11:55:20 +0100 Subject: [PATCH 4/6] Format and fix test. --- src/compiler/execution.jl | 4 +++- test/execution.jl | 7 ++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/compiler/execution.jl b/src/compiler/execution.jl index 886a73963..908077993 100644 --- a/src/compiler/execution.jl +++ b/src/compiler/execution.jl @@ -301,15 +301,17 @@ end foreach(free, argument_buffers) # Check for errors + Core.println(buf.status) if buf.status == MTL.MTLCommandBufferStatusError err = buf.error if err !== nothing code = err.code # MTLCommandBufferError enum value description = err.localizedDescription - @error "GPU kernel execution failed" exception = (err, catch_backtrace()) kernel = nameof(kernel.f) error_code = code description = description + @error "GPU kernel execution failed" exception = (err, catch_backtrace()) kernel = nameof(kernel.f) error_code = code description = description end end + end commit!(cmdbuf) diff --git a/test/execution.jl b/test/execution.jl index d8749c991..58384d6b9 100644 --- a/test/execution.jl +++ b/test/execution.jl @@ -262,10 +262,11 @@ end end @testset "error handling" begin - function failing_kernel() - x = unsafe_load(Ptr{Int}(0)) + function failing_kernel(arr, ptr) + x = 
unsafe_load(reinterpret(Ptr{Int}, ptr)) + @inbounds arr[1] = x return nothing end - @test_logs (:error, r"GPU kernel execution failed") @metal failing_kernel() + @test_logs (:error, r"GPU kernel execution failed") @metal failing_kernel(mtl([0], 0) end From 804a4b50693cc204525be30e2c29210a7153af1c Mon Sep 17 00:00:00 2001 From: atharva Date: Wed, 29 Jan 2025 18:14:18 +0530 Subject: [PATCH 5/6] runic fmt --- src/compiler/execution.jl | 1 - test/execution.jl | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/compiler/execution.jl b/src/compiler/execution.jl index 908077993..9bf8094da 100644 --- a/src/compiler/execution.jl +++ b/src/compiler/execution.jl @@ -313,7 +313,6 @@ end end end - commit!(cmdbuf) end diff --git a/test/execution.jl b/test/execution.jl index 58384d6b9..4676f1f21 100644 --- a/test/execution.jl +++ b/test/execution.jl @@ -261,6 +261,7 @@ end end end + @testset "error handling" begin function failing_kernel(arr, ptr) x = unsafe_load(reinterpret(Ptr{Int}, ptr)) @@ -268,5 +269,5 @@ end return nothing end - @test_logs (:error, r"GPU kernel execution failed") @metal failing_kernel(mtl([0], 0) + @test_logs (:error, r"GPU kernel execution failed") @metal failing_kernel(mtl([0], 0)) end From 3c9c87a6018b1fcd90788493268eb05a60f6fdce Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Mon, 3 Feb 2025 12:03:01 +0100 Subject: [PATCH 6/6] Simplify. --- src/compiler/execution.jl | 11 +++-------- test/execution.jl | 11 ----------- 2 files changed, 3 insertions(+), 19 deletions(-) diff --git a/src/compiler/execution.jl b/src/compiler/execution.jl index 9bf8094da..58b2c7f29 100644 --- a/src/compiler/execution.jl +++ b/src/compiler/execution.jl @@ -301,15 +301,10 @@ end foreach(free, argument_buffers) # Check for errors - Core.println(buf.status) + # XXX: we cannot do this nicely, e.g. throwing an `error` or reporting with `@error` + # because we're not allowed to switch tasks from this context. 
if buf.status == MTL.MTLCommandBufferStatusError - err = buf.error - if err !== nothing - code = err.code # MTLCommandBufferError enum value - description = err.localizedDescription - - @error "GPU kernel execution failed" exception = (err, catch_backtrace()) kernel = nameof(kernel.f) error_code = code description = description - end + Core.println("ERROR: Failed to submit command buffer: $(buf.error.localizedDescription)") end end diff --git a/test/execution.jl b/test/execution.jl index 4676f1f21..d67b4ac42 100644 --- a/test/execution.jl +++ b/test/execution.jl @@ -260,14 +260,3 @@ end @test Array(a)[] == 1 end end - - -@testset "error handling" begin - function failing_kernel(arr, ptr) - x = unsafe_load(reinterpret(Ptr{Int}, ptr)) - @inbounds arr[1] = x - return nothing - end - - @test_logs (:error, r"GPU kernel execution failed") @metal failing_kernel(mtl([0], 0)) -end