diff --git a/.github/workflows/DocsCleanup.yml b/.github/workflows/DocsCleanup.yml
new file mode 100644
index 00000000..be55aca8
--- /dev/null
+++ b/.github/workflows/DocsCleanup.yml
@@ -0,0 +1,28 @@
+name: Doc Preview Cleanup
+
+on:
+  pull_request:
+    types: [closed]
+
+jobs:
+  doc-preview-cleanup:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout gh-pages branch
+        uses: actions/checkout@v4
+        with:
+          ref: gh-pages
+
+      - name: Delete preview and history
+        run: |
+          git config user.name "oneAPI.jl"
+          git config user.email "oneapi@juliagpu.github.io"
+          git rm -rf "previews/PR$PRNUM"
+          git commit -m "delete preview"
+          git branch gh-pages-new $(echo "delete history" | git commit-tree HEAD^{tree})
+        env:
+          PRNUM: ${{ github.event.number }}
+
+      - name: Push changes
+        run: |
+          git push --force origin gh-pages-new:gh-pages
\ No newline at end of file
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
new file mode 100644
index 00000000..850ecc62
--- /dev/null
+++ b/.github/workflows/docs.yml
@@ -0,0 +1,29 @@
+name: Documentation
+
+on:
+  push:
+    branches:
+      - main
+    tags: '*'
+  pull_request:
+    types: [opened, synchronize, reopened]
+  schedule:
+    - cron: '0 0 * * 0'
+
+jobs:
+  docs:
+    name: Build documentation
+    env:
+      DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }}
+      GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      JULIA_DEBUG: Documenter
+    runs-on: [self-hosted, linux, X64]
+
+    steps:
+      - uses: actions/checkout@v4
+      - uses: julia-actions/setup-julia@latest
+        with:
+          version: 'lts'
+      - uses: julia-actions/cache@v2
+      - uses: julia-actions/julia-buildpkg@latest
+      - run: julia --project=docs/ docs/make.jl
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 736148be..3fc6e9ef 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,4 @@ deps/onemkl_lapack.cpp
 deps/onemkl_lapack.h
 deps/onemkl_sparse.cpp
 deps/onemkl_sparse.h
+docs/build
diff --git a/Project.toml b/Project.toml
index fc7cd1bc..e2dac0e2 100644
--- a/Project.toml
+++ b/Project.toml
@@ -5,7 +5,6 @@ authors = ["Tim Besard ", "Alexis Montoison", "Michel Sch
 
 [deps]
 AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c"
-AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1"
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 CEnum = "fa961155-64e5-5f13-b03f-caf6b980ea82"
 ExprTools = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
@@ -30,6 +29,12 @@ oneAPI_Level_Zero_Headers_jll = "f4bc562b-d309-54f8-9efb-476e56f0410d"
 oneAPI_Level_Zero_Loader_jll = "13eca655-d68d-5b81-8367-6d99d727ab01"
 oneAPI_Support_jll = "b049733a-a71d-5ed3-8eba-7d323ac00b36"
 
+[weakdeps]
+AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1"
+
+[extensions]
+oneAPIAcceleratedKernelsExt = "AcceleratedKernels"
+
 [compat]
 AbstractFFTs = "1.5.0"
 AcceleratedKernels = "0.4.3"
diff --git a/README.md b/README.md
index c22cdb6b..8485889f 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 
 *Julia support for the oneAPI programming toolkit.*
 
-[![][doi-img]][doi-url] [![][buildkite-img]][buildkite-url] [![][codecov-img]][codecov-url]
+[![][doi-img]][doi-url] [![][buildkite-img]][buildkite-url] [![][codecov-img]][codecov-url] [![][docs-stable-img]][docs-stable-url] [![][docs-dev-img]][docs-dev-url]
 
 [doi-img]: https://zenodo.org/badge/252466420.svg
 [doi-url]: https://zenodo.org/badge/latestdoi/252466420
 [codecov-img]: https://codecov.io/gh/JuliaGPU/oneAPI.jl/branch/master/graph/badge.svg
 [codecov-url]: https://codecov.io/gh/JuliaGPU/oneAPI.jl
 
+[docs-stable-img]: https://img.shields.io/badge/docs-stable-blue.svg
+[docs-stable-url]: https://juliagpu.github.io/oneAPI.jl/stable
+
+[docs-dev-img]: https://img.shields.io/badge/docs-dev-blue.svg
+[docs-dev-url]: https://juliagpu.github.io/oneAPI.jl/dev
+
 oneAPI.jl provides support for working with the [oneAPI unified programming
 model](https://software.intel.com/en-us/oneapi). The package is verified to
 work with the (currently) only implementation of this interface [that is part
 of the Intel Compute
diff --git a/docs/Project.toml b/docs/Project.toml
new file mode 100644
index 00000000..90803543
--- /dev/null
+++ b/docs/Project.toml
@@ -0,0 +1,9 @@
+[deps]
+Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
+oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
+
+[sources]
+oneAPI = {path = ".."}
+
+[compat]
+Documenter = "1"
diff --git a/docs/make.jl b/docs/make.jl
new file mode 100644
index 00000000..b5529648
--- /dev/null
+++ b/docs/make.jl
@@ -0,0 +1,54 @@
+using Pkg
+
+# develop the local checkout and instantiate the docs environment
+Pkg.develop(PackageSpec(path=joinpath(dirname(@__FILE__), "..")))
+Pkg.instantiate()
+using Documenter
+using Documenter.Remotes
+using oneAPI
+
+oneAPI.versioninfo()
+
+makedocs(
+    sitename = "oneAPI.jl",
+    format = Documenter.HTML(
+        prettyurls = get(ENV, "CI", nothing) == "true",
+        canonical = "https://juliagpu.github.io/oneAPI.jl/stable/",
+        mathengine = Documenter.KaTeX(),
+    ),
+    modules = [oneAPI],
+    pages = [
+        "Home" => "index.md",
+        "Installation" => "installation.md",
+        "Getting Started" => "getting_started.md",
+        "Usage" => [
+            "Array Programming" => "arrays.md",
+            "Kernel Programming" => "kernels.md",
+            "Memory Management" => "memory.md",
+            "Device Intrinsics" => "device.md",
+            "Performance Guide" => "usage/performance.md",
+        ],
+        "API Reference" => [
+            "Overview" => "api.md",
+            "Context & Device Management" => "api/context.md",
+            "Array Operations" => "api/arrays.md",
+            "Kernel Programming" => "api/kernels.md",
+            "Memory Management" => "api/memory.md",
+            "Compiler & Reflection" => "api/compiler.md",
+            "Level Zero (oneL0)" => "level_zero.md",
+            "oneMKL" => "onemkl.md",
+        ],
+        "Troubleshooting" => "troubleshooting.md",
+    ],
+    checkdocs = :none, # Don't error on missing docstrings
+    warnonly = [:cross_references, :missing_docs], # Only warn, don't error
+)
+
+deploydocs(
+    repo = "github.com/JuliaGPU/oneAPI.jl.git",
+    target = "build",
+    devbranch = "main",
+    devurl = "dev",
+    push_preview = true,
+)
+
diff --git a/docs/src/api.md b/docs/src/api.md
new file mode 100644
index 00000000..68cd2a76
--- /dev/null
+++ b/docs/src/api.md
@@ -0,0 +1,42 @@
+# API Reference
+
+This page provides an overview of the oneAPI.jl API. For detailed documentation, see the specific API reference pages:
+
+- [Context & Device Management](api/context.md) - Managing drivers, devices, and contexts
+- [Array Operations](api/arrays.md) - Working with GPU arrays
+- [Kernel Programming](api/kernels.md) - Writing and launching custom kernels
+- [Memory Management](api/memory.md) - Memory allocation and transfer
+- [Compiler & Reflection](api/compiler.md) - Code generation and introspection
+
+## Core Functions
+
+```@autodocs
+Modules = [oneAPI]
+Pages = ["src/context.jl", "src/utils.jl"]
+Filter = t -> t !== oneAPI.synchronize
+```
+
+## Compiler Functions
+
+```@autodocs
+Modules = [oneAPI]
+Pages = ["src/compiler/execution.jl", "src/compiler/reflection.jl"]
+```
+
+## oneL0 (Level Zero)
+
+Low-level bindings to the Level Zero API. See the [Level Zero page](level_zero.md) for details.
+
+```@autodocs
+Modules = [oneAPI.oneL0]
+Filter = t -> t !== oneAPI.oneL0.synchronize
+```
+
+## oneMKL
+
+Intel oneAPI Math Kernel Library bindings. See the [oneMKL page](onemkl.md) for details.
+
+```@autodocs
+Modules = [oneAPI.oneMKL]
+```
+
diff --git a/docs/src/api/arrays.md b/docs/src/api/arrays.md
new file mode 100644
index 00000000..0554649f
--- /dev/null
+++ b/docs/src/api/arrays.md
@@ -0,0 +1,264 @@
+# Array Operations
+
+This page documents the array types and operations provided by oneAPI.jl.
+
+## Array Types
+
+### Host-Side Arrays
+
+#### `oneArray{T,N,B}`
+
+N-dimensional dense array type for Intel GPU programming using oneAPI and Level Zero.
+
+**Type Parameters:**
+- `T`: Element type (must be stored inline, no isbits-unions)
+- `N`: Number of dimensions
+- `B`: Buffer type, one of:
+  - `oneL0.DeviceBuffer`: GPU device memory (default, not CPU-accessible)
+  - `oneL0.SharedBuffer`: Unified shared memory (CPU and GPU accessible)
+  - `oneL0.HostBuffer`: Pinned host memory (CPU-accessible, GPU-visible)
+
+**Type Aliases:**
+- `oneVector{T}` = `oneArray{T,1}` - 1D array
+- `oneMatrix{T}` = `oneArray{T,2}` - 2D array
+- `oneVecOrMat{T}` = `Union{oneVector{T}, oneMatrix{T}}` - 1D or 2D array
+
+### Device-Side Arrays
+
+#### `oneDeviceArray{T,N,A}`
+
+Device-side array type for use within GPU kernels. This type represents a view of GPU memory
+accessible within kernel code. Unlike `oneArray` which is used on the host, `oneDeviceArray`
+is designed for device-side operations and cannot be directly constructed on the host.
+
+**Type Parameters:**
+- `T`: Element type
+- `N`: Number of dimensions
+- `A`: Address space (typically `AS.CrossWorkgroup` for global memory)
+
+**Type Aliases:**
+- `oneDeviceVector{T}` = `oneDeviceArray{T,1}` - 1D device array
+- `oneDeviceMatrix{T}` = `oneDeviceArray{T,2}` - 2D device array
+
+#### `oneLocalArray(::Type{T}, dims)`
+
+Allocate local (workgroup-shared) memory within a GPU kernel. Local memory is shared among
+all work-items in a workgroup and provides faster access than global memory.
+
+## Memory Type Queries
+
+### `is_device(a::oneArray) -> Bool`
+
+Check if the array is stored in device memory (not directly CPU-accessible).
+
+### `is_shared(a::oneArray) -> Bool`
+
+Check if the array is stored in shared (unified) memory, accessible from both CPU and GPU.
+
+### `is_host(a::oneArray) -> Bool`
+
+Check if the array is stored in pinned host memory, which resides on the CPU but is visible to the GPU.
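+
+For example, these predicates let generic code report or branch on where an array lives (a small sketch using the constructors documented below; `describe` is a hypothetical helper, not part of the oneAPI.jl API):
+
+```julia
+using oneAPI
+
+# Report which kind of buffer backs an array.
+function describe(a::oneArray)
+    kind = is_device(a) ? "device" :
+           is_shared(a) ? "shared" : "host"
+    println("array is backed by $kind memory")
+end
+
+describe(oneArray{Float32}(undef, 4))                       # device memory (default)
+describe(oneArray{Float32,1,oneL0.SharedBuffer}(undef, 4))  # shared memory
+```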
+
+
+## Array Construction
+
+`oneArray` supports multiple construction patterns similar to standard Julia arrays:
+
+```julia
+using oneAPI
+
+# Uninitialized arrays
+a = oneArray{Float32}(undef, 100)
+b = oneArray{Float32,2}(undef, 10, 10)
+
+# Specify memory type
+c = oneArray{Float32,1,oneL0.SharedBuffer}(undef, 100)  # Shared memory
+d = oneArray{Float32,1,oneL0.HostBuffer}(undef, 100)    # Host memory
+
+# From existing arrays
+e = oneArray(rand(Float32, 100))
+f = oneArray([1, 2, 3, 4])
+
+# Using zeros/ones/rand
+g = oneAPI.zeros(Float32, 100)
+h = oneAPI.ones(Float32, 100)
+i = oneAPI.rand(Float32, 100)
+
+# Scoped temporary with eager cleanup
+result = let arr = oneAPI.ones(Float32, 100)
+    s = sum(arr)
+    oneAPI.unsafe_free!(arr)  # optionally release the GPU memory immediately
+    s
+end
+```
+
+## Array Operations
+
+`oneArray` implements the full `AbstractArray` interface and supports:
+
+### Broadcasting
+
+```julia
+a = oneArray(rand(Float32, 100))
+b = oneArray(rand(Float32, 100))
+
+c = a .+ b        # Element-wise addition
+d = a .* 2.0f0    # Scalar multiplication
+e = sin.(a)       # Unary operations
+f = a .+ b .* c   # Fused operations
+```
+
+### Reductions
+
+```julia
+a = oneArray(rand(Float32, 100))
+
+s = sum(a)      # Sum
+p = prod(a)     # Product
+m = maximum(a)  # Maximum
+n = minimum(a)  # Minimum
+μ = mean(a)     # Mean (requires Statistics)
+```
+
+### Mapping
+
+```julia
+a = oneArray(rand(Float32, 100))
+
+b = map(x -> x^2, a)  # Apply function
+c = map(+, a, b)      # Binary operation
+```
+
+### Accumulation
+
+```julia
+a = oneArray([1, 2, 3, 4])
+
+b = cumsum(a)   # Cumulative sum: [1, 3, 6, 10]
+c = cumprod(a)  # Cumulative product: [1, 2, 6, 24]
+```
+
+### Finding Elements
+
+```julia
+a = oneArray([1.0f0, -2.0f0, 3.0f0, -4.0f0])
+
+indices = findall(x -> x > 0, a)  # Indices of positive elements
+```
+
+### Random Number Generation
+
+```julia
+using oneAPI, Random
+
+# Uniform distribution
+a = oneAPI.rand(Float32, 100)
+b = oneAPI.rand(Float32, 10, 10)
+
+# Normal distribution
+c = oneAPI.randn(Float32, 100)
+
+# With seed
+Random.seed!(1234)
+d = oneAPI.rand(Float32, 100)
+```
+
+## Data Transfer
+
+### CPU to GPU
+
+```julia
+# Using constructor
+h_array = rand(Float32, 100)
+d_array = oneArray(h_array)
+
+# Using copyto!
+d_array = oneArray{Float32}(undef, 100)
+copyto!(d_array, h_array)
+```
+
+### GPU to CPU
+
+```julia
+# Using Array constructor
+h_array = Array(d_array)
+
+# Using copyto!
+h_array = Vector{Float32}(undef, 100)
+copyto!(h_array, d_array)
+```
+
+### GPU to GPU
+
+```julia
+d_array1 = oneArray(rand(Float32, 100))
+d_array2 = similar(d_array1)
+copyto!(d_array2, d_array1)
+```
+
+## Memory Types Comparison
+
+| Memory Type | CPU Access | GPU Access | Performance | Use Case |
+|-------------|-----------|------------|-------------|----------|
+| Device (default) | ❌ No | ✅ Fast | Fastest | GPU computations |
+| Shared | ✅ Yes | ✅ Good | Good | CPU-GPU data sharing |
+| Host | ✅ Yes | ✅ Slower | Moderate | Staging, pinned buffers |
+
+```julia
+# Device memory (default, fastest for GPU)
+a = oneArray{Float32}(undef, 100)
+
+# Shared memory (CPU and GPU accessible)
+b = oneArray{Float32,1,oneL0.SharedBuffer}(undef, 100)
+
+# Host memory (CPU memory visible to GPU)
+c = oneArray{Float32,1,oneL0.HostBuffer}(undef, 100)
+
+# Query memory type
+is_device(a)  # true
+is_shared(b)  # true
+is_host(c)    # true
+```
+
+## Views and Slicing
+
+`oneArray` supports array views for efficient sub-array operations without copying:
+
+```julia
+a = oneArray(rand(Float32, 100))
+
+# Create a view
+v = view(a, 1:50)
+v .= 0.0f0  # Modifies first 50 elements of a
+
+# Indexing with a range follows Base semantics and allocates a copy
+s = a[1:50]         # This is a new array, not a view
+w = @views a[1:50]  # Use view/@views to avoid the copy
+```
+
+## Reshaping
+
+```julia
+a = oneArray(rand(Float32, 100))
+
+# Reshape to 2D
+b = reshape(a, 10, 10)
+
+# Flatten
+c = vec(b)  # Returns 1D view
+```
+
+## Advanced: Custom Array Wrappers
+
+For advanced use cases, oneAPI.jl provides type aliases for array wrappers:
+
+- `oneDenseArray`: Dense contiguous arrays
+- `oneStridedArray`: Arrays with arbitrary strides (including views)
+- `oneWrappedArray`: Any array backed by a oneArray
+
+These are useful for writing functions that accept various array types:
+
+```julia
+function my_kernel!(a::oneStridedArray{Float32})
+    # Accepts oneArray and views
+    a .+= 1.0f0
+end
+```
diff --git a/docs/src/api/compiler.md b/docs/src/api/compiler.md
new file mode 100644
index 00000000..ba0081c1
--- /dev/null
+++ b/docs/src/api/compiler.md
@@ -0,0 +1,255 @@
+# Compiler and Reflection
+
+This page documents the compiler interface and code reflection tools for oneAPI.jl.
+
+## Code Reflection
+
+oneAPI.jl provides macros for inspecting code generation at various stages:
+
+- `@device_code_lowered` - Show lowered IR (desugared Julia code)
+- `@device_code_typed` - Show type-inferred IR
+- `@device_code_warntype` - Show type-inferred IR with type stability warnings
+- `@device_code_llvm` - Show LLVM IR
+- `@device_code_spirv` - Show SPIR-V assembly
+- `@device_code` - Show all compilation stages interactively
+
+These macros are re-exported from GPUCompiler.jl. See the [GPUCompiler documentation](https://github.com/JuliaGPU/GPUCompiler.jl) for detailed usage.
+
+### `return_type(f, tt) -> Type`
+
+Return the inferred return type of function `f` when called with argument types `tt` in a GPU kernel context.
+
+**Arguments:**
+- `f`: Function to analyze
+- `tt`: Tuple type of arguments
+
+**Returns:**
+- Type that `f(args...)` would return where `args::tt`
+
+**Example:**
+```julia
+function compute(x::Float32)
+    return x * 2.0f0
+end
+
+rt = oneAPI.return_type(compute, Tuple{Float32})
+@assert rt == Float32
+```
+
+
+## Inspecting Generated Code
+
+Code reflection tools help you understand how your Julia code is compiled to GPU code:
+
+### LLVM IR
+
+View the LLVM intermediate representation:
+
+```julia
+using oneAPI
+
+function kernel(a, b)
+    i = get_global_id()
+    @inbounds a[i] = b[i] + 1.0f0
+    return
+end
+
+a = oneArray(zeros(Float32, 10))
+b = oneArray(rand(Float32, 10))
+
+@device_code_llvm @oneapi groups=1 items=10 kernel(a, b)
+```
+
+### SPIR-V Assembly
+
+View the final SPIR-V assembly that runs on the GPU:
+
+```julia
+@device_code_spirv @oneapi groups=1 items=10 kernel(a, b)
+```
+
+### Type Inference
+
+Check for type instabilities that hurt performance:
+
+```julia
+@device_code_warntype @oneapi groups=1 items=10 kernel(a, b)
+```
+
+### Type-Inferred IR
+
+See the typed intermediate representation:
+
+```julia
+@device_code_typed @oneapi groups=1 items=10 kernel(a, b)
+```
+
+### Interactive Inspection
+
+Use `@device_code` for an interactive menu:
+
+```julia
+@device_code @oneapi groups=1 items=10 kernel(a, b)
+# Opens a menu to select which compilation stage to view
+```
+
+## Return Type Inference
+
+To query the return type of a device function, use `oneAPI.return_type`, documented above.
+
+## Debugging Type Issues
+
+### Common Type Instability Sources
+
+```julia
+# ❌ Type instability: Conditional returns different types
+function bad_kernel(x, flag)
+    if flag
+        return x     # Float32
+    else
+        return 0     # Int
+    end
+end
+
+# ✅ Type stable: Consistent return type
+function good_kernel(x, flag)
+    if flag
+        return x      # Float32
+    else
+        return 0.0f0  # Float32
+    end
+end
+```
+
+### Using @device_code_warntype
+
+```julia
+function mystery_kernel!(output, input)
+    i = get_global_id()
+    @inbounds output[i] = some_complex_function(input[i])
+    return
+end
+
+# Check for type issues
+@device_code_warntype @oneapi groups=1 items=10 mystery_kernel!(a, b)
+
+# Look for red warnings indicating type instability
+```
+
+## Compilation Options
+
+### Kernel vs Device Function
+
+```julia
+# Compile as kernel (default for @oneapi)
+@device_code_llvm @oneapi kernel=true kernel(a, b)
+
+# Compile as device function (callable from other kernels)
+@device_code_llvm @oneapi kernel=false helper_function(x)
+```
+
+### Always Inline
+
+Force inlining of device functions:
+
+```julia
+@oneapi always_inline=true kernel(a, b)
+```
+
+### Custom Kernel Name
+
+Specify a custom name for the kernel:
+
+```julia
+@oneapi name="my_custom_kernel" kernel(a, b)
+```
+
+## Example: Optimizing a Kernel
+
+Here's a workflow for optimizing a kernel using reflection tools:
+
+```julia
+using oneAPI
+
+# Initial version
+function sum_kernel_v1!(result, data)
+    i = get_global_id()
+    if i == 1
+        sum = 0
+        for j in 1:length(data)
+            sum += data[j]
+        end
+        result[1] = sum
+    end
+    return
+end
+
+data = oneArray(rand(Float32, 1000))
+result = oneArray(zeros(Float32, 1))
+
+# Check for type issues
+@device_code_warntype @oneapi groups=1 items=1 sum_kernel_v1!(result, data)
+# Notice: `sum` might be Int instead of Float32!
+ +# Fixed version +function sum_kernel_v2!(result, data) + i = get_global_id() + if i == 1 + sum = 0.0f0 # Explicitly Float32 + for j in 1:length(data) + sum += data[j] + end + result[1] = sum + end + return +end + +# Verify the fix +@device_code_warntype @oneapi groups=1 items=1 sum_kernel_v2!(result, data) +# Should be type-stable now! + +# Check the generated code +@device_code_llvm @oneapi groups=1 items=1 sum_kernel_v2!(result, data) +``` + +## Profiling + +For performance profiling, see the [Performance Guide](@ref). + +## Troubleshooting + +### Compilation Errors + +If you encounter compilation errors: + +1. **Check type stability**: Use `@device_code_warntype` +2. **Inspect LLVM IR**: Use `@device_code_llvm` to see if the issue is in LLVM generation +3. **Simplify the kernel**: Comment out sections to isolate the problematic code +4. **Check argument types**: Ensure arguments are GPU-compatible (isbits types) + +### SPIR-V Issues + +If SPIR-V generation fails: + +1. **Update dependencies**: Ensure SPIRV-LLVM-Translator is up to date +2. **Check device capabilities**: Some operations require specific hardware features +3. **Reduce complexity**: Very complex kernels might hit compiler limits + +### Performance Issues + +If your kernel is slow: + +1. **Profile memory access patterns**: Coalesced access is crucial +2. **Check occupancy**: Are you launching enough work-items? +3. **Minimize barriers**: Synchronization has overhead +4. **Use local memory wisely**: It's faster than global memory but limited in size diff --git a/docs/src/api/context.md b/docs/src/api/context.md new file mode 100644 index 00000000..19ad45a3 --- /dev/null +++ b/docs/src/api/context.md @@ -0,0 +1,123 @@ +# Context and Device Management + +This page documents the API for managing Level Zero drivers, devices, and contexts in oneAPI.jl. + +## Overview + +oneAPI.jl uses task-local state to manage GPU resources. This allows different Julia tasks to +work with different drivers, devices, or contexts without interfering with each other. + +The typical hierarchy is: +- **Driver**: Represents a Level Zero driver (usually one per GPU vendor/installation) +- **Device**: Represents a physical GPU device +- **Context**: Manages resources like memory allocations and command queues + +## Driver Management + +### `driver() -> ZeDriver` + +Get the current Level Zero driver for the calling task. If no driver has been explicitly +set with `driver!`, returns the first available driver. The driver selection is task-local. + +### `driver!(drv::ZeDriver)` + +Set the current Level Zero driver for the calling task. This also clears the current +device selection, as devices are associated with specific drivers. + +### `drivers() -> Vector{ZeDriver}` + +Return a list of all available Level Zero drivers. + +## Device Management + +### `device() -> ZeDevice` + +Get the current Level Zero device for the calling task. If no device has been explicitly +set with `device!`, returns the first available device for the current driver. The device +selection is task-local. + +### `device!(dev::ZeDevice)` / `device!(i::Int)` + +Set the current Level Zero device for the calling task. Can pass either a device object or +a 1-based device index. + +### `devices() -> Vector{ZeDevice}` / `devices(drv::ZeDriver)` + +Return a list of available Level Zero devices. Without arguments, returns devices for +the current driver. + +## Context Management + +### `context() -> ZeContext` + +Get the current Level Zero context for the calling task. 
If no context has been explicitly
+set with `context!`, returns a global context for the current driver. Contexts manage the
+lifetime of resources like memory allocations and command queues.
+
+### `context!(ctx::ZeContext)`
+
+Set the current Level Zero context for the calling task.
+
+## Command Queues
+
+### `global_queue(ctx::ZeContext, dev::ZeDevice) -> ZeCommandQueue`
+
+Get the global command queue for the given context and device. This queue is used as the
+default queue for executing operations. The queue is created with in-order execution flags.
+
+### `synchronize()`
+
+Block the host thread until all operations on the global command queue for the current
+context and device have completed.
+
+
+## Example Workflow
+
+```julia
+using oneAPI
+
+# List available drivers
+drv_list = drivers()
+println("Available drivers: ", length(drv_list))
+
+# Select a specific driver
+driver!(drv_list[1])
+
+# List devices for current driver
+dev_list = devices()
+println("Available devices: ", length(dev_list))
+
+# Select a specific device
+device!(dev_list[1])
+
+# Get the current context (created automatically)
+ctx = context()
+
+# Perform GPU operations...
+a = oneArray(rand(Float32, 100))
+
+# Wait for all operations to complete
+synchronize()
+```
+
+## Multi-Device Programming
+
+You can use different devices in different Julia tasks:
+
+```julia
+using oneAPI
+
+# Task 1: Use first device
+Threads.@spawn begin
+    device!(1)
+    a = oneArray(rand(Float32, 100))
+    # ... operations on device 1 ...
+end
+
+# Task 2: Use second device
+Threads.@spawn begin
+    device!(2)
+    b = oneArray(rand(Float32, 100))
+    # ... operations on device 2 ...
+end
+```
diff --git a/docs/src/api/kernels.md b/docs/src/api/kernels.md
new file mode 100644
index 00000000..5040adf2
--- /dev/null
+++ b/docs/src/api/kernels.md
@@ -0,0 +1,297 @@
+# Kernel Programming
+
+This page documents the kernel programming API for writing custom GPU kernels in oneAPI.jl.
+
+## Kernel Launch
+
+### `@oneapi [kwargs...] kernel(args...)`
+
+High-level interface for launching Julia kernels on Intel GPUs using oneAPI.
+
+This macro compiles a Julia function to SPIR-V, prepares the arguments, and optionally
+launches the kernel on the GPU.
+
+**Keyword Arguments:**
+
+**Macro Keywords (compile-time):**
+- `launch::Bool=true`: Whether to launch the kernel immediately
+
+**Compiler Keywords:**
+- `kernel::Bool=false`: Whether to compile as a kernel or device function
+- `name::Union{String,Nothing}=nothing`: Explicit name for the kernel
+- `always_inline::Bool=false`: Whether to always inline device functions
+
+**Launch Keywords (runtime):**
+- `groups`: Number of workgroups (defaults to 1). Can be an integer or tuple.
+- `items`: Number of work-items per workgroup (defaults to 1). Can be an integer or tuple.
+- `queue::ZeCommandQueue`: Command queue to submit to (defaults to global queue).
+
+### `zefunction(f, tt; kwargs...)`
+
+Compile a Julia function to a Level Zero kernel function. This is the lower-level interface
+used by `@oneapi`. Returns a callable kernel object.
+
+### `kernel_convert(x)`
+
+Convert arguments for kernel execution. This function is called for every argument passed to
+a kernel, allowing customization of argument conversion. By default, it converts `oneArray`
+to `oneDeviceArray`.
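+
+As a quick illustration of that default conversion (a sketch; it assumes `kernel_convert` is callable from host code as described above):
+
+```julia
+using oneAPI
+
+a = oneArray(zeros(Float32, 4))
+
+# At launch time every argument goes through kernel_convert; for a oneArray
+# this should yield its device-side counterpart.
+da = oneAPI.kernel_convert(a)
+@assert da isa oneDeviceArray
+```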
+
+
+## Basic Kernel Example
+
+```julia
+using oneAPI
+
+function vadd_kernel!(a, b, c)
+    i = get_global_id()
+    if i <= length(a)
+        @inbounds c[i] = a[i] + b[i]
+    end
+    return
+end
+
+N = 1024
+a = oneArray(rand(Float32, N))
+b = oneArray(rand(Float32, N))
+c = similar(a)
+
+# Launch with 4 workgroups of 256 work-items each
+@oneapi groups=4 items=256 vadd_kernel!(a, b, c)
+```
+
+## Launch Configuration
+
+### Workgroups and Work-Items
+
+The oneAPI execution model is based on:
+- **Work-items**: Individual threads of execution (analogous to CUDA threads)
+- **Workgroups**: Groups of work-items that can synchronize and share local memory (analogous to CUDA blocks)
+
+```julia
+# 1D configuration
+@oneapi groups=10 items=64 kernel(args...)  # 640 work-items total
+
+# 2D configuration
+@oneapi groups=(10, 10) items=(8, 8) kernel(args...)  # 6400 work-items total
+
+# 3D configuration
+@oneapi groups=(4, 4, 4) items=(4, 4, 4) kernel(args...)  # 4096 work-items total
+```
+
+### Determining Launch Configuration
+
+```julia
+# For simple element-wise operations
+N = length(array)
+items = 256             # Typical workgroup size
+groups = cld(N, items)  # Ceiling division
+
+@oneapi groups=groups items=items kernel(array)
+```
+
+### Compile Without Launch
+
+You can compile a kernel without launching it:
+
+```julia
+# Compile the kernel
+kernel = @oneapi launch=false vadd_kernel!(a, b, c)
+
+# Launch later with different configurations
+kernel(a, b, c; groups=4, items=256)
+kernel(a, b, c; groups=8, items=128)
+```
+
+## Device Intrinsics
+
+Inside GPU kernels, you can use various intrinsics to query execution context and synchronize work-items.
+
+### Thread Indexing
+
+Work-item IDs and dimension arguments are 1-based, following Julia conventions (dim 1 = x, 2 = y, 3 = z):
+
+```julia
+# Global ID (unique across all work-items)
+i = get_global_id()   # Linear index (defaults to dim 1)
+i = get_global_id(1)  # X dimension
+j = get_global_id(2)  # Y dimension
+k = get_global_id(3)  # Z dimension
+
+# Local ID (within workgroup)
+local_i = get_local_id()   # Linear index (defaults to dim 1)
+local_i = get_local_id(1)  # X dimension
+local_j = get_local_id(2)  # Y dimension
+local_k = get_local_id(3)  # Z dimension
+
+# Workgroup ID
+group_i = get_group_id(1)  # X dimension
+group_j = get_group_id(2)  # Y dimension
+group_k = get_group_id(3)  # Z dimension
+
+# Workgroup size
+local_size = get_local_size()    # Work-items per workgroup (dim 1)
+local_size_x = get_local_size(1)
+local_size_y = get_local_size(2)
+
+# Global size
+global_size = get_global_size()  # Work-items in total (dim 1)
+global_size_x = get_global_size(1)
+```
+
+### 2D Matrix Example
+
+```julia
+function matmul_kernel!(C, A, B)
+    # Get 2D indices
+    row = get_global_id(1)
+    col = get_global_id(2)
+
+    if row <= size(C, 1) && col <= size(C, 2)
+        sum = 0.0f0
+        for k in 1:size(A, 2)
+            @inbounds sum += A[row, k] * B[k, col]
+        end
+        @inbounds C[row, col] = sum
+    end
+    return
+end
+
+M, N, K = 256, 256, 256
+A = oneArray(rand(Float32, M, K))
+B = oneArray(rand(Float32, K, N))
+C = oneArray{Float32}(undef, M, N)
+
+# Launch with 2D configuration
+items = (16, 16)  # 16x16 work-items per workgroup
+groups = (cld(M, items[1]), cld(N, items[2]))
+
+@oneapi groups=groups items=items matmul_kernel!(C, A, B)
+```
+
+### Synchronization
+
+```julia
+# Barrier: synchronize all work-items in a workgroup
+barrier()
+
+# Memory fences (ensure memory operations are visible)
+mem_fence()         # Both local and global memory
+local_mem_fence()   # Local memory only
+global_mem_fence()  # Global memory only
+```
+
+### Local Memory
+
+Local memory (workgroup-shared memory) enables cooperation between work-items:
+
+```julia
+function optimized_reduction!(result, input)
+    local_id = get_local_id()
+    local_size = get_local_size()
+
+    # Allocate local memory (shared within workgroup)
+    local_data = oneLocalArray(Float32, 256)
+
+    # Load into local memory
+    @inbounds local_data[local_id] = input[get_global_id()]
+    barrier()
+
+    # Tree reduction in local memory
+    stride = local_size ÷ 2
+    while stride > 0
+        if local_id <= stride
+            @inbounds local_data[local_id] += local_data[local_id + stride]
+        end
+        barrier()
+        stride ÷= 2
+    end
+
+    # First work-item writes result
+    if local_id == 1
+        @inbounds result[get_group_id()] = local_data[1]
+    end
+    return
+end
+```
+
+### Atomic Operations
+
+For thread-safe operations on shared data:
+
+```julia
+# Atomic add
+oneAPI.atomic_add!(ptr, value)
+
+# Atomic exchange
+old_value = oneAPI.atomic_xchg!(ptr, new_value)
+
+# Atomic compare-and-swap
+old_value = oneAPI.atomic_cmpxchg!(ptr, compare, new_value)
+
+# Atomic min/max
+oneAPI.atomic_min!(ptr, value)
+oneAPI.atomic_max!(ptr, value)
+```
+
+Example histogram kernel:
+
+```julia
+function histogram_kernel!(hist, data, bins)
+    i = get_global_id()
+    if i <= length(data)
+        @inbounds val = data[i]
+        bin = clamp(floor(Int, val * bins) + 1, 1, bins)
+        # assumes hist stores Int32 counts
+        oneAPI.atomic_add!(pointer(hist, bin), Int32(1))
+    end
+    return
+end
+```
+
+## Kernel Restrictions
+
+GPU kernels have certain restrictions:
+
+1. **Must return `nothing`**: Kernels cannot return values directly. Use output arrays instead.
+2. **No dynamic memory allocation**: Cannot allocate arrays inside kernels
+3. **No I/O operations**: Cannot print or write to files (use printf-style debugging with care)
+4. **Limited recursion**: Avoid or minimize recursive calls
+5. **Type stability**: Ensure type-stable code for best performance
+
+```julia
+# ❌ Bad: Returns a value
+function bad_kernel(a)
+    return a[1] + 1
+end
+
+# ✅ Good: Returns nothing, uses output parameter
+function good_kernel!(result, a)
+    @inbounds result[1] = a[1] + 1
+    return
+end
+```
+
+## KernelAbstractions.jl
+
+For portable GPU programming across CUDA, AMD, and Intel GPUs, use KernelAbstractions.jl:
+
+```julia
+using KernelAbstractions
+using oneAPI
+
+@kernel function generic_kernel!(a, b)
+    i = @index(Global)
+    @inbounds a[i] = a[i] + b[i]
+end
+
+a = oneArray(rand(Float32, 100))
+b = oneArray(rand(Float32, 100))
+
+backend = get_backend(a)  # oneAPIBackend()
+kernel! = generic_kernel!(backend)
+kernel!(a, b, ndrange=length(a))
+```
+
+See the [KernelAbstractions.jl documentation](https://juliagpu.github.io/KernelAbstractions.jl/stable/) for more details.
+
+## Debugging Kernels
+
+See the [Compiler and Reflection](@ref) page for tools to inspect generated code and debug kernels.
diff --git a/docs/src/api/memory.md b/docs/src/api/memory.md
new file mode 100644
index 00000000..8cf72965
--- /dev/null
+++ b/docs/src/api/memory.md
@@ -0,0 +1,359 @@
+# Memory Management
+
+This page documents memory management in oneAPI.jl.
+
+## Memory Operations
+
+### `Base.unsafe_copyto!(ctx::ZeContext, dev::ZeDevice, dst, src, N)`
+
+Low-level memory copy operation on the GPU. Copies `N` elements from `src` to `dst` using
+the specified context and device. Both `src` and `dst` can be either host pointers (`Ptr`)
+or device pointers (`ZePtr`).
+
+!!! warning
+    This is a low-level function. No bounds checking is performed. For safe array copying,
+    use `copyto!` on `oneArray` objects instead.
+
+### `unsafe_fill!(ctx::ZeContext, dev::ZeDevice, ptr, pattern, N)`
+
+Low-level memory fill operation on the GPU.
Fills `N` elements at `ptr` with the given pattern +using the specified context and device. + +!!! warning + This is a low-level function. For safe array operations, use `fill!` on `oneArray` + objects instead. + + +## Memory Types + +oneAPI supports three types of memory through Unified Shared Memory (USM): + +### Device Memory (Default) + +Fastest GPU access, not directly accessible from CPU. + +```julia +# Create array in device memory (default) +a = oneArray{Float32}(undef, 1000) +@assert is_device(a) + +# Or explicitly specify +b = oneArray{Float32,1,oneL0.DeviceBuffer}(undef, 1000) +``` + +**Advantages:** +- Fastest GPU access +- Best for compute-intensive operations + +**Disadvantages:** +- Cannot directly access from CPU +- Requires explicit copy to/from CPU + +**Use when:** Data stays on GPU for multiple operations + +### Shared Memory + +Accessible from both CPU and GPU with automatic migration. + +```julia +# Create array in shared memory +a = oneArray{Float32,1,oneL0.SharedBuffer}(undef, 1000) +@assert is_shared(a) + +# Can access from CPU +a[1] = 42.0f0 # Automatic migration to CPU +println(a[1]) # Read from CPU + +# Can use in GPU kernels +@oneapi groups=1 items=1000 kernel(a) # Automatic migration to GPU +``` + +**Advantages:** +- Accessible from both CPU and GPU +- Unified virtual addressing +- Automatic migration + +**Disadvantages:** +- Migration overhead +- Slower than device memory for pure GPU work + +**Use when:** Frequent CPU-GPU data exchange needed + +### Host Memory + +CPU memory that's pinned and visible to GPU. + +```julia +# Create array in host memory +a = oneArray{Float32,1,oneL0.HostBuffer}(undef, 1000) +@assert is_host(a) + +# Direct CPU access +a[1] = 42.0f0 + +# Can be used by GPU (but slower than device memory) +@oneapi groups=1 items=1000 kernel(a) +``` + +**Advantages:** +- Direct CPU access +- Pinned memory (faster PCIe transfers) +- Good for staging + +**Disadvantages:** +- Slower GPU access than device memory +- Uses pinned system memory (limited resource) + +**Use when:** Staging data for transfer, or CPU needs to write while GPU reads + +## Memory Type Comparison + +| Feature | Device | Shared | Host | +|---------|--------|--------|------| +| CPU Access | ❌ No | ✅ Yes | ✅ Yes | +| GPU Performance | ⭐⭐⭐ Fastest | ⭐⭐ Good | ⭐ Slower | +| Migration | Manual | Automatic | Manual | +| Use Case | Pure GPU | Mixed CPU/GPU | Staging | + +## Memory Allocation and Deallocation + +### Automatic Management + +Julia's garbage collector automatically manages `oneArray` memory: + +```julia +function allocate_and_compute() + a = oneArray(rand(Float32, 1000)) + b = oneArray(rand(Float32, 1000)) + c = a .+ b + return Array(c) # Only c is copied back + # a and b will be garbage collected +end + +result = allocate_and_compute() +# GPU memory for a and b is freed eventually +``` + +### Manual Garbage Collection + +Force garbage collection to free GPU memory: + +```julia +# Allocate large arrays +a = oneArray(rand(Float32, 10_000_000)) +b = oneArray(rand(Float32, 10_000_000)) + +# Clear references +a = nothing +b = nothing + +# Force GC to reclaim GPU memory +GC.gc() +``` + +### Explicit Freeing + +Immediately free GPU memory (use with caution): + +```julia +a = oneArray(rand(Float32, 1000)) +# ... use a ... + +# Explicitly free (dangerous if still in use!) +unsafe_free!(a) + +# a is now invalid - do not use! +``` + +!!! warning + Only use `unsafe_free!` when you're certain the array is no longer needed, including + by any pending GPU operations. 
Prefer letting the GC handle cleanup.
+
+### Scoped Temporaries
+
+Use a scoped block to free temporaries promptly:
+
+```julia
+result = let temp = oneArray{Float32}(undef, 1000)
+    temp .= 1.0f0
+    s = sum(temp)
+    oneAPI.unsafe_free!(temp)  # free eagerly once no longer needed
+    s
+end
+```
+
+## Memory Pooling
+
+oneAPI.jl uses memory pooling to reduce allocation overhead:
+
+```julia
+using oneAPI
+
+# Allocations are pooled
+for i in 1:100
+    a = oneArray(rand(Float32, 1000))
+    # ... use a ...
+    # Memory is returned to pool, not freed
+end
+```
+
+The pool automatically manages memory reuse, reducing allocation costs.
+
+## Checking Memory Usage
+
+Query GPU memory info:
+
+```julia
+using oneAPI.oneL0
+
+dev = device()
+props = memory_properties(dev)
+
+for prop in props
+    println("Memory size: ", prop.totalSize ÷ (1024^3), " GB")
+end
+```
+
+## Out of Memory Errors
+
+If you encounter out-of-memory errors:
+
+### 1. Reduce Batch Size
+
+```julia
+# Instead of processing all at once
+result = process(oneArray(huge_data))
+
+# Process in smaller batches
+for batch in batches(huge_data, size=1000)
+    result = process(oneArray(batch))
+    # Process result...
+end
+```
+
+### 2. Free Unused Arrays
+
+```julia
+a = oneArray(rand(Float32, 1_000_000))
+b = compute(a)
+
+# If 'a' is no longer needed
+unsafe_free!(a)
+
+# Continue with 'b'
+result = process(b)
+```
+
+### 3. Use Shared or Host Memory
+
+```julia
+# Instead of device memory
+a = oneArray{Float32}(undef, huge_size)
+
+# Use shared memory (can swap to system RAM)
+a = oneArray{Float32,1,oneL0.SharedBuffer}(undef, huge_size)
+```
+
+### 4. Force Garbage Collection
+
+```julia
+# After freeing references
+large_array = nothing
+GC.gc()  # Immediately reclaim GPU memory
+```
+
+### 5. Use Multiple Devices
+
+```julia
+# Distribute work across devices
+for (i, dev) in enumerate(devices())
+    Threads.@spawn begin
+        device!(dev)
+        partition = data_partitions[i]
+        a = oneArray(partition)
+        result = compute(a)
+        # ...
+    end
+end
+```
+
+## Low-Level Memory Operations
+
+For advanced users, oneL0 provides direct memory management:
+
+```julia
+using oneAPI.oneL0
+
+ctx = context()
+dev = device()
+
+# Allocate device memory
+ptr = device_alloc(ctx, dev, 1024, 8)  # 1024 bytes, 8-byte aligned
+
+# Copy data
+data = rand(Float32, 256)
+GC.@preserve data begin
+    unsafe_copyto!(ctx, dev, ptr, pointer(data), 256)
+end
+
+# Free memory
+free(ctx, ptr)
+```
+
+## Memory Advise and Prefetch
+
+Hint to the runtime about memory usage (shared memory only):
+
+```julia
+using oneAPI.oneL0
+
+a = oneArray{Float32,1,oneL0.SharedBuffer}(undef, 1000)
+
+# Advise that this will be read-only on the device
+# (Implementation depends on Level Zero driver support)
+
+# Prefetch to device
+ctx = context()
+dev = device()
+queue = global_queue(ctx, dev)
+
+execute!(queue) do list
+    append_prefetch!(list, pointer(a), sizeof(a))
+end
+```
+
+## Best Practices
+
+1. **Use device memory by default** for best GPU performance
+2. **Use shared memory** when you need CPU access without explicit copies
+3. **Use host memory** for staging data or when CPU writes frequently
+4. **Let GC handle cleanup** unless you have specific memory pressure
+5. **Reuse allocations** within loops when possible
+6. **Profile memory usage** to identify bottlenecks
+7. **Be cautious with `unsafe_free!`** - use only when you're certain it's safe
+
+## Example: Efficient Memory Usage
+
+```julia
+using oneAPI
+
+function efficient_pipeline(data_batches)
+    # Allocate the output buffer once and reuse it across batches
+    result = oneArray{Float32}(undef, 1000)
+    results = Float32[]
+
+    for batch in data_batches
+        # Upload the next batch
+        input = oneArray(batch)
+
+        # Compute in-place when possible
+        @oneapi groups=4 items=250 process_kernel!(result, input)
+
+        # Copy result back
+        append!(results, Array(result))
+
+        # The input buffer becomes garbage after this iteration
+    end
+
+    return results
+end
+```
diff --git a/docs/src/arrays.md b/docs/src/arrays.md
new file mode 100644
index 00000000..5ecad50c
--- /dev/null
+++ b/docs/src/arrays.md
@@ -0,0 +1,64 @@
+# Array Programming
+
+oneAPI.jl provides an array type, `oneArray`, which lives on the GPU. It implements the interface defined by `GPUArrays.jl`, allowing for high-level array operations.
+
+## The `oneArray` Type
+
+The `oneArray{T,N}` type represents an N-dimensional array with elements of type `T` stored on the GPU.
+
+```julia
+using oneAPI
+
+# Allocate an uninitialized array
+a = oneArray{Float32}(undef, 1024)
+
+# Initialize from a CPU array
+b = oneArray([1, 2, 3, 4])
+
+# Initialize with zeros/ones
+z = oneAPI.zeros(Float32, 100)
+o = oneAPI.ones(Float32, 100)
+```
+
+## Array Operations
+
+Since `oneArray` implements the AbstractArray interface, you can use standard Julia array operations.
+
+```julia
+a = oneArray(rand(Float32, 10))
+b = oneArray(rand(Float32, 10))
+
+c = a .+ b       # Element-wise addition
+d = sum(a)       # Reduction
+e = map(sin, a)  # Map
+```
+
+## Data Transfer
+
+To move data between the host (CPU) and the device (GPU), use the constructors or `copyto!`.
+
+```julia
+# CPU to GPU
+d_a = oneArray(h_a)
+
+# GPU to CPU
+h_a = Array(d_a)
+```
+
+## Backend Agnostic Programming
+
+To write code that works on both CPU and GPU (and other backends like CUDA), use the generic array interfaces provided by `GPUArrays.jl`. Avoid hardcoding `oneArray` in your functions; instead, accept `AbstractArray` and let the dispatch system handle the specific implementation.
+
+```julia
+function generic_add!(a::AbstractArray, b::AbstractArray)
+    a .+= b
+    return a
+end
+
+# Works on CPU
+generic_add!(rand(Float32, 10), rand(Float32, 10))
+
+# Works on Intel GPU
+generic_add!(oneArray(rand(Float32, 10)), oneArray(rand(Float32, 10)))
+```
+
diff --git a/docs/src/device.md b/docs/src/device.md
new file mode 100644
index 00000000..acdb4d6e
--- /dev/null
+++ b/docs/src/device.md
@@ -0,0 +1,40 @@
+# Device Intrinsics
+
+When writing custom kernels, you have access to a set of device intrinsics that map to underlying hardware instructions.
+
+## Indexing
+
+These functions allow you to determine the current thread's position in the execution grid. Indices and dimension arguments are 1-based, following Julia conventions (dim 1 = x, 2 = y, 3 = z).
+
+- `get_global_id(dim=1)`: Global index of the work item.
+- `get_local_id(dim=1)`: Local index of the work item within the workgroup.
+- `get_group_id(dim=1)`: Index of the workgroup.
+- `get_global_size(dim=1)`: Global size of the ND-range.
+- `get_local_size(dim=1)`: Size of the workgroup.
+- `get_num_groups(dim=1)`: Number of workgroups.
+
+## Synchronization
+
+- `barrier(flags=0)`: Synchronizes all work items in a workgroup.
+
+## Atomics
+
+Atomic operations are supported for thread-safe updates to memory.
+
+- `atomic_add!(ptr, val)`
+- `atomic_sub!(ptr, val)`
+- `atomic_inc!(ptr)`
+- `atomic_dec!(ptr)`
+- `atomic_min!(ptr, val)`
+- `atomic_max!(ptr, val)`
+- `atomic_and!(ptr, val)`
+- `atomic_or!(ptr, val)`
+- `atomic_xor!(ptr, val)`
+- `atomic_cmpxchg!(ptr, cmp, val)`
+
+32-bit integer atomics (`Int32`, `UInt32`) are generally supported; 64-bit integer and floating-point atomics depend on the capabilities of the device and driver.
+
+## Math Functions
+
+Standard math functions from Julia's `Base` are supported within kernels (e.g., `sin`, `cos`, `exp`, `sqrt`).
+
diff --git a/docs/src/getting_started.md b/docs/src/getting_started.md
new file mode 100644
index 00000000..595e7ca5
--- /dev/null
+++ b/docs/src/getting_started.md
@@ -0,0 +1,58 @@
+# Getting Started
+
+## Basic Usage
+
+The most basic usage involves moving data to the GPU using `oneArray` and performing operations on it.
+
+```julia
+using oneAPI
+
+# Create an array on the CPU
+a = rand(Float32, 1024)
+
+# Move it to the GPU
+d_a = oneArray(a)
+
+# Perform operations on the GPU
+d_b = d_a .+ 1.0f0
+
+# Move the result back to the CPU
+b = Array(d_b)
+```
+
+## Matrix Multiplication
+
+Matrix multiplication is accelerated using the oneMKL library when available.
+
+```julia
+using oneAPI
+
+A = oneArray(rand(Float32, 128, 128))
+B = oneArray(rand(Float32, 128, 128))
+
+# This operation runs on the GPU
+C = A * B
+```
+
+## Writing Kernels
+
+For custom operations, you can write kernels using the `@oneapi` macro.
+
+```julia
+using oneAPI
+
+function my_kernel(a, b)
+    i = get_global_id()
+    @inbounds a[i] += b[i]
+    return
+end
+
+a = oneArray(ones(Float32, 1024))
+b = oneArray(ones(Float32, 1024))
+
+# Launch the kernel with 1024 items
+@oneapi items=1024 my_kernel(a, b)
+```
+
+See the [Kernel Programming](kernels.md) section for more details.
+
diff --git a/docs/src/index.md b/docs/src/index.md
new file mode 100644
index 00000000..988d9e92
--- /dev/null
+++ b/docs/src/index.md
@@ -0,0 +1,29 @@
+# oneAPI.jl
+
+*Julia support for the oneAPI programming toolkit.*
+
+oneAPI.jl provides support for working with the [oneAPI unified programming model](https://software.intel.com/en-us/oneapi). The package is currently verified to work with the implementation provided by the [Intel Compute Runtime](https://github.com/intel/compute-runtime), primarily on Linux.
+
+## Writing Portable Code
+
+While oneAPI.jl provides specific functionality for Intel GPUs, it is highly recommended to write **backend-agnostic code** whenever possible. This allows your code to run on various hardware backends (NVIDIA, AMD, Intel, Apple) without modification.
+
+- **[GPUArrays.jl](https://github.com/JuliaGPU/GPUArrays.jl)**: Use high-level array abstractions that work across different GPU backends.
+- **[KernelAbstractions.jl](https://github.com/JuliaGPU/KernelAbstractions.jl)**: Use this package for writing kernels that can be compiled for CPU, CUDA, ROCm, and oneAPI devices.
+
+Direct use of `oneAPI`-specific macros (like `@oneapi`) and types (like `oneArray`) should be reserved for cases where you need specific optimizations or features not covered by the generic abstractions.
+
+## Features
+
+- **High-level Array Abstractions**: `oneArray` type fully implementing the `GPUArrays.jl` interface.
+- **Kernel Programming**: Execute custom kernels written in Julia on Intel GPUs.
+- **Level Zero Integration**: Low-level access to the Level Zero API via the `oneL0` submodule.
+- **oneMKL Support**: Integration with Intel oneMKL for BLAS, LAPACK, and sparse operations.
+- **SYCL Integration**: Interoperability with SYCL (on Linux). + +## Requirements + +- **Julia**: 1.10 or higher +- **OS**: Linux +- **Hardware**: Intel Gen9 graphics or newer (including Intel Arc A-Series) + diff --git a/docs/src/installation.md b/docs/src/installation.md new file mode 100644 index 00000000..59cf36ae --- /dev/null +++ b/docs/src/installation.md @@ -0,0 +1,127 @@ +# Installation + +## Requirements + +oneAPI.jl requires: +- **Julia**: 1.10 or higher +- **OS**: Linux (recommended) or Windows (experimental via WSL2) +- **Hardware**: Intel Gen9 graphics or newer. For Intel Arc GPUs (A580, A750, A770, etc), **Linux 6.2+** is required. + +## Installing oneAPI.jl + +You can install oneAPI.jl using the Julia package manager: + +```julia +pkg> add oneAPI +``` + +This will automatically download the necessary binary dependencies, including: +- `oneAPI loader` +- `SPIR-V tools` +- `Intel Compute Runtime` (if compatible hardware is found) + +## Verifying Installation + +After installation, you can verify that oneAPI.jl is working correctly and detecting your hardware: + +```julia +julia> using oneAPI +julia> oneAPI.versioninfo() +``` + +The output should list the binary dependencies, toolchain versions, available drivers, and devices. + +## Troubleshooting Drivers + +If no drivers or devices are detected, ensure that you have the correct Intel graphics drivers installed for your system. +- On Linux, check if `libze_intel_gpu.so` or similar libraries are available. +- On Windows (WSL2), ensure you have the latest Intel graphics drivers installed on the host Windows system and that WSL2 is configured to access the GPU. + +You can explicitly select drivers and devices if multiple are available: + +```julia +julia> drivers() +julia> devices() +julia> device!(1) # Select the first available device +``` + +## Using System Libraries (Advanced) + +!!! warning + Using system libraries instead of the provided artifacts is **not recommended** for most users. Only use this approach if you have specialized requirements or custom Intel binaries. + +By default, oneAPI.jl uses pre-built binary artifacts (JLLs) for the Intel Compute Runtime, oneAPI loader, and related libraries. However, you may need to use system-installed libraries in certain situations: + +- Custom or newer Intel graphics drivers +- Specialized hardware configurations +- Development or debugging of the runtime stack +- Systems where the artifacts are incompatible + +### Configuration Script + +oneAPI.jl provides a helper script to discover and configure system libraries. From the Julia REPL: + +```julia +julia> include(joinpath(pkgdir(oneAPI), "res", "local.jl")) +``` + +This script will: +1. Search for Intel libraries on your system: + - Intel Graphics Compiler (IGC): `libigc`, `libiga64`, `libigdfcl`, `libopencl-clang` + - Graphics Memory Management Library: `libigdgmm` + - Intel Compute Runtime (NEO): `libze_intel_gpu`, `libigdrcl` + - oneAPI Level Zero Loader: `libze_loader`, `libze_validation_layer` + +2. Generate preferences in `LocalPreferences.toml` that override the artifact paths + +### Manual Configuration + +You can also manually set preferences to use specific library paths. 
Create or edit `LocalPreferences.toml` in your project or global environment: + +```toml +[NEO_jll] +libze_intel_gpu_path = "/usr/lib/x86_64-linux-gnu/libze_intel_gpu.so.1" +libigdrcl_path = "/usr/lib/x86_64-linux-gnu/intel-opencl/libigdrcl.so" + +[libigc_jll] +libigc_path = "/usr/lib/x86_64-linux-gnu/libigc.so" +libigdfcl_path = "/usr/lib/x86_64-linux-gnu/libigdfcl.so" + +[gmmlib_jll] +libigdgmm_path = "/usr/lib/x86_64-linux-gnu/libigdgmm.so" + +[oneAPI_Level_Zero_Loader_jll] +libze_loader_path = "/usr/lib/x86_64-linux-gnu/libze_loader.so" +``` + +### Reverting to Artifacts + +To revert to the default artifact binaries, simply delete the oneAPI-related entries from `LocalPreferences.toml` (or delete the entire file if it only contains these preferences). + +### Common Locations + +System libraries are typically installed in: + +**Ubuntu/Debian:** +- `/usr/lib/x86_64-linux-gnu/` +- `/usr/lib/x86_64-linux-gnu/intel-opencl/` + +**Fedora/RHEL:** +- `/usr/lib64/` +- `/usr/lib64/intel-opencl/` + +**Custom Intel oneAPI installation:** +- `/opt/intel/oneapi/compiler/latest/linux/lib/` +- `/opt/intel/oneapi/compiler/latest/linux/lib/x64/` + +### Verifying System Library Configuration + +After configuring system libraries, restart Julia and verify the configuration: + +```julia +julia> using oneAPI +julia> oneAPI.versioninfo() +``` + +Check that the reported library paths match your system libraries. If issues arise, examine the `LocalPreferences.toml` file and ensure all paths are correct and the libraries are compatible with each other. + diff --git a/docs/src/kernels.md b/docs/src/kernels.md new file mode 100644 index 00000000..b5e03b12 --- /dev/null +++ b/docs/src/kernels.md @@ -0,0 +1,61 @@ +# Kernel Programming + +For maximum performance or custom operations not covered by high-level array abstractions, you can write custom kernels in Julia that execute on the GPU. + +## The `@oneapi` Macro + +The `@oneapi` macro is used to launch a kernel on the device. It takes configuration arguments like the number of items (threads) and groups (blocks). + +```julia +using oneAPI + +function kernel(a, b) + i = get_global_id() + if i <= length(a) + @inbounds a[i] += b[i] + end + return +end + +a = oneArray(rand(Float32, 100)) +b = oneArray(rand(Float32, 100)) + +# Launch configuration +items = 100 +groups = 1 + +@oneapi items=items groups=groups kernel(a, b) +``` + +## KernelAbstractions.jl + +For portable kernel programming, it is highly recommended to use [KernelAbstractions.jl](https://github.com/JuliaGPU/KernelAbstractions.jl). This allows you to write kernels that work on CPU, CUDA, ROCm, and oneAPI. + +```julia +using KernelAbstractions, oneAPI + +@kernel function my_kernel!(a, b) + i = @index(Global, Linear) + @inbounds a[i] += b[i] +end + +# Get the backend +backend = get_backend(a) + +# Instantiate the kernel +k = my_kernel!(backend) + +# Launch with configuration +k(a, b; ndrange=length(a)) +``` + +## Device Intrinsics + +Inside a kernel, you can use various intrinsics to interact with the hardware: +- `get_global_id()`: Get the global thread ID. +- `get_local_id()`: Get the local thread ID within a workgroup. +- `get_group_id()`: Get the workgroup ID. +- `barrier()`: Synchronize threads within a workgroup. + +These correspond to standard OpenCL/Level Zero intrinsics. 
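+
+As a small illustration of these intrinsics, the following kernel records each work-item's workgroup and local index (a sketch; indices are 1-based, following Julia conventions):
+
+```julia
+using oneAPI
+
+# Record each work-item's workgroup and local index.
+function where_am_i!(groups_out, locals_out)
+    i = get_global_id()
+    @inbounds groups_out[i] = get_group_id()
+    @inbounds locals_out[i] = get_local_id()
+    return
+end
+
+groups_out = oneArray{Int}(undef, 8)
+locals_out = oneArray{Int}(undef, 8)
+@oneapi groups=2 items=4 where_am_i!(groups_out, locals_out)
+Array(groups_out)  # [1, 1, 1, 1, 2, 2, 2, 2]
+Array(locals_out)  # [1, 2, 3, 4, 1, 2, 3, 4]
+```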
+
diff --git a/docs/src/level_zero.md b/docs/src/level_zero.md
new file mode 100644
index 00000000..f093fcce
--- /dev/null
+++ b/docs/src/level_zero.md
@@ -0,0 +1,49 @@
+# Level Zero Interface
+
+The `oneL0` submodule provides low-level access to the Level Zero API, which gives you fine-grained control over the hardware.
+
+## Drivers and Devices
+
+You can enumerate available drivers and devices:
+
+```julia
+using oneAPI.oneL0
+
+# Get available drivers
+drvs = drivers()
+
+# Get devices for a driver
+devs = devices(first(drvs))
+
+# Inspect device properties
+props = compute_properties(first(devs))
+println("Max workgroup size: ", props.maxTotalGroupSize)
+```
+
+## Contexts and Queues
+
+Manage contexts and command queues for executing operations:
+
+```julia
+# Create a context
+ctx = ZeContext(first(drvs))
+
+# Create a command queue
+queue = ZeCommandQueue(ctx, first(devs))
+
+# Execute a command list
+execute!(queue) do list
+    append_barrier!(list)
+end
+```
+
+## Memory Operations
+
+You can perform low-level memory operations using command lists:
+
+```julia
+execute!(queue) do list
+    append_copy!(list, dst_ptr, src_ptr, size)
+end
+```
+
diff --git a/docs/src/memory.md b/docs/src/memory.md
new file mode 100644
index 00000000..f97146e8
--- /dev/null
+++ b/docs/src/memory.md
@@ -0,0 +1,51 @@
+# Memory Management
+
+Efficient memory management is crucial for GPU programming. oneAPI.jl provides tools to manage device memory allocation and data transfer.
+
+## Unified Shared Memory (USM)
+
+oneAPI uses Unified Shared Memory, which allows for pointers that can be accessible from both the host and the device, or specific to one.
+
+- **Device Memory**: Accessible only by the device. Fastest access for kernels.
+- **Host Memory**: Accessible by the host and device.
+- **Shared Memory**: Automatically migrated between host and device.
+
+`oneArray` typically uses device memory for performance.
+
+## Allocation
+
+You can perform low-level memory allocation using the `oneL0` submodule if needed, though `oneArray` handles this automatically.
+
+```julia
+using oneAPI.oneL0
+
+# Allocate device memory (1024 bytes, 8-byte aligned)
+ptr = oneL0.device_alloc(context(), device(), 1024, 8)
+
+# Free memory
+oneL0.free(context(), ptr)
+```
+
+## Garbage Collection
+
+Julia's garbage collector automatically manages `oneArray` objects. However, GPU memory is a limited resource. If you are running into out-of-memory errors, you might need to manually trigger garbage collection or free arrays.
+
+```julia
+a = oneArray(rand(Float32, 1024*1024*100))
+a = nothing
+GC.gc() # Reclaim memory
+```
+
+## Explicit Freeing
+
+For immediate memory release, you can use `unsafe_free!`:
+
+```julia
+using oneAPI
+
+a = oneArray(rand(Float32, 1024))
+oneAPI.unsafe_free!(a)
+```
+
+**Warning**: Only use `unsafe_free!` if you are sure the array is no longer used, including by any pending GPU operations.
+
diff --git a/docs/src/onemkl.md b/docs/src/onemkl.md
new file mode 100644
index 00000000..91f2645c
--- /dev/null
+++ b/docs/src/onemkl.md
@@ -0,0 +1,59 @@
+# oneMKL Integration
+
+oneAPI.jl provides bindings to the Intel oneMKL library, enabling high-performance linear algebra operations on Intel GPUs.
+
+## Dense Linear Algebra (BLAS/LAPACK)
+
+Standard BLAS and LAPACK operations are automatically accelerated when using `oneArray`.
+
+```julia
+using oneAPI, LinearAlgebra
+
+A = oneArray(rand(Float32, 100, 100))
+B = oneArray(rand(Float32, 100, 100))
+
+# Matrix multiplication (GEMM)
+C = A * B
+
+# Linear solve (AX = B)
+X = A \ B
+```
+
+## Sparse Linear Algebra
+
+oneAPI.jl supports sparse matrix operations via oneMKL's sparse BLAS functionality. These integrate with Julia's `SparseArrays` standard library.
+
+```julia
+using oneAPI, oneAPI.oneMKL, SparseArrays, LinearAlgebra
+
+# Create a sparse matrix on CPU
+A = sprand(Float32, 100, 100, 0.1)
+
+# Move to GPU (converts to oneMKL format)
+dA = oneMKL.oneSparseMatrixCSC(A)
+
+# Create a dense vector
+x = oneArray(rand(Float32, 100))
+
+# Sparse matrix-vector multiplication
+y = dA * x
+```
+
+Note that `oneSparseMatrixCSC` is available for Compressed Sparse Column format, which is the standard in Julia.
+
+## FFTs
+
+Fast Fourier Transforms are supported through the `AbstractFFTs.jl` interface, backed by oneMKL DFTs.
+
+```julia
+using oneAPI, FFTW
+
+a = oneArray(rand(ComplexF32, 1024))
+
+# Forward FFT
+b = fft(a)
+
+# Inverse FFT
+c = ifft(b)
+```
+
diff --git a/docs/src/troubleshooting.md b/docs/src/troubleshooting.md
new file mode 100644
index 00000000..9f790f65
--- /dev/null
+++ b/docs/src/troubleshooting.md
@@ -0,0 +1,55 @@
+# Troubleshooting
+
+## Common Issues
+
+### No devices detected
+
+**Symptom**: `oneAPI.devices()` returns an empty list.
+
+**Solution**:
+1. Ensure you are running on Linux (recommended) or WSL2.
+2. Check if the Intel Compute Runtime is installed and accessible.
+3. Verify your user has permissions to access the GPU render device (usually the `render` group).
+4. Run `oneAPI.versioninfo()` to see detailed diagnostic information.
+
+### "Double type is not supported"
+
+**Symptom**: Kernel compilation fails with an error about `Float64` or `Double` support.
+
+**Solution**:
+Some Intel GPUs (especially integrated graphics) lack native hardware support for 64-bit floating point operations.
+- Use `Float32` instead of `Float64`.
+- Check support with:
+  ```julia
+  using oneAPI.oneL0
+  oneL0.module_properties(device()).flags & oneL0.ZE_DEVICE_MODULE_FLAG_FP64 != 0
+  ```
+
+### "Out of memory" errors
+
+**Symptom**: Memory allocation fails.
+
+**Solution**:
+- Trigger garbage collection: `GC.gc()`.
+- Manually free unused arrays: `oneAPI.unsafe_free!(array)`.
+- Check if you are exceeding the device's memory capacity.
+
+## Debugging
+
+### Validation Layer
+
+Enable the Level Zero validation layer to catch API misuse:
+
+```bash
+export ZE_ENABLE_VALIDATION_LAYER=1
+export ZE_ENABLE_PARAMETER_VALIDATION=1
+```
+
+### Debug Mode
+
+Enable debug mode in oneAPI.jl to use debug builds of underlying toolchains (if available):
+
+```julia
+oneAPI.set_debug!(true)
+```
+
diff --git a/docs/src/usage/performance.md b/docs/src/usage/performance.md
new file mode 100644
index 00000000..c7064aee
--- /dev/null
+++ b/docs/src/usage/performance.md
@@ -0,0 +1,485 @@
+# Performance Guide
+
+This guide provides tips and techniques for optimizing oneAPI.jl applications.
+
+## Quick Wins
+
+### 1. Use Device Memory
+
+Device memory is fastest for GPU operations:
+
+```julia
+# ✅ Good: Device memory (default)
+a = oneArray{Float32}(undef, 1000)
+
+# ❌ Slower: Shared memory (unless CPU access is needed)
+a = oneArray{Float32,1,oneL0.SharedBuffer}(undef, 1000)
+```
+
+### 2. Minimize Data Transfers
diff --git a/docs/src/troubleshooting.md b/docs/src/troubleshooting.md
new file mode 100644
index 00000000..9f790f65
--- /dev/null
+++ b/docs/src/troubleshooting.md
@@ -0,0 +1,55 @@
+# Troubleshooting
+
+## Common Issues
+
+### No devices detected
+
+**Symptom**: `oneAPI.devices()` returns an empty list.
+
+**Solution**:
+1. Ensure you are running on Linux (recommended) or WSL2.
+2. Check that the Intel Compute Runtime is installed and accessible.
+3. Verify your user has permission to access the GPU render device (usually via the `render` group).
+4. Run `oneAPI.versioninfo()` for detailed diagnostic information.
+
+### "Double type is not supported"
+
+**Symptom**: Kernel compilation fails with an error about `Float64` or `Double` support.
+
+**Solution**:
+Some Intel GPUs (especially integrated graphics) lack native hardware support for 64-bit floating-point operations.
+- Use `Float32` instead of `Float64`.
+- Check support with:
+  ```julia
+  using oneAPI, oneAPI.oneL0
+  props = oneL0.module_properties(device())
+  # FP64 support is reported in the module `flags` field
+  props.flags & oneL0.ZE_DEVICE_MODULE_FLAG_FP64 != 0
+  ```
+
+### "Out of memory" errors
+
+**Symptom**: Memory allocation fails.
+
+**Solution**:
+- Trigger garbage collection: `GC.gc()`.
+- Manually free unused arrays: `oneAPI.unsafe_free!(array)`.
+- Check whether you are exceeding the device's memory capacity.
+
+## Debugging
+
+### Validation Layer
+
+Enable the Level Zero validation layer to catch API misuse:
+
+```bash
+export ZE_ENABLE_VALIDATION_LAYER=1
+export ZE_ENABLE_PARAMETER_VALIDATION=1
+```
+
+### Debug Mode
+
+Enable debug mode in oneAPI.jl to use debug builds of the underlying toolchains (if available):
+
+```julia
+oneAPI.set_debug!(true)
+```
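+
+### Inspecting Generated Code
+
+When a kernel misbehaves, it can help to inspect what the compiler produced. A sketch, assuming the GPUCompiler-style reflection macros are available (`@device_code_warntype` is also used in the performance guide; `@device_code_llvm` is assumed here):
+
+```julia
+using oneAPI
+
+function add_one!(a)
+    i = get_global_id()
+    @inbounds a[i] += 1.0f0
+    return
+end
+
+a = oneArray(zeros(Float32, 16))
+
+# Check inferred types (catches type instabilities)
+@device_code_warntype @oneapi groups=1 items=16 add_one!(a)
+
+# Dump the generated LLVM IR
+@device_code_llvm @oneapi groups=1 items=16 add_one!(a)
+```
+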
diff --git a/docs/src/usage/performance.md b/docs/src/usage/performance.md
new file mode 100644
index 00000000..c7064aee
--- /dev/null
+++ b/docs/src/usage/performance.md
@@ -0,0 +1,485 @@
+# Performance Guide
+
+This guide provides tips and techniques for optimizing oneAPI.jl applications.
+
+## Quick Wins
+
+### 1. Use Device Memory
+
+Device memory is fastest for GPU operations:
+
+```julia
+# ✅ Good: Device memory (default)
+a = oneArray{Float32}(undef, 1000)
+
+# ❌ Slower: Shared memory (unless CPU access is needed)
+a = oneArray{Float32,1,oneL0.SharedBuffer}(undef, 1000)
+```
+
+### 2. Minimize Data Transfers
+
+Keep data on GPU between operations:
+
+```julia
+# ❌ Bad: Unnecessary transfers
+for i in 1:100
+    cpu_data = Array(gpu_array)  # GPU → CPU
+    cpu_data .+= 1
+    gpu_array = oneArray(cpu_data)  # CPU → GPU
+end
+
+# ✅ Good: Keep data on GPU
+for i in 1:100
+    gpu_array .+= 1  # All on GPU
+end
+```
+
+### 3. Use Fused Operations
+
+Broadcasting automatically fuses operations:
+
+```julia
+# ❌ Slower: Multiple kernel launches
+a = oneArray(rand(Float32, 1000))
+b = sin.(a)
+c = b .+ 1.0f0
+d = c .* 2.0f0
+
+# ✅ Faster: Single fused kernel
+d = 2.0f0 .* (sin.(a) .+ 1.0f0)
+```
+
+### 4. Specify Float32
+
+GPUs are typically optimized for single precision:
+
+```julia
+# ❌ Slower: Float64 (if not needed)
+a = oneArray(rand(Float64, 1000))
+
+# ✅ Faster: Float32
+a = oneArray(rand(Float32, 1000))
+```
+
+## Kernel Optimization
+
+### Launch Configuration
+
+Choose appropriate workgroup sizes:
+
+```julia
+# Typical good workgroup sizes
+items = 256  # Common choice, adjust based on hardware
+items = 128  # Try smaller if using lots of local memory
+items = 512  # Try larger for simple kernels
+
+# Calculate groups
+N = length(array)
+groups = cld(N, items)  # Ceiling division
+
+@oneapi groups=groups items=items kernel(array)
+```
+
+### Memory Access Patterns
+
+Coalesced memory access is crucial for performance:
+
+```julia
+# ✅ Good: Coalesced access (consecutive threads access consecutive memory)
+function good_kernel!(output, input)
+    i = get_global_id()
+    @inbounds output[i] = input[i] + 1.0f0
+    return
+end
+
+# ❌ Bad: Strided access (cache inefficient)
+function bad_kernel!(output, input, stride)
+    i = get_global_id()
+    @inbounds output[i] = input[i * stride] + 1.0f0
+    return
+end
+```
+
+### Use Local Memory
+
+Local memory is faster than global memory for data reuse:
+
+```julia
+function optimized_reduction!(result, input)
+    local_id = get_local_id()
+    local_size = get_local_size()
+    group_id = get_group_id()
+
+    # Allocate local memory
+    local_mem = oneLocalArray(Float32, 256)
+
+    # Load global → local (coalesced)
+    global_id = get_global_id()
+    @inbounds local_mem[local_id] = input[global_id]
+    barrier()
+
+    # Reduce in local memory (much faster)
+    stride = local_size ÷ 2
+    while stride > 0
+        if local_id <= stride
+            @inbounds local_mem[local_id] += local_mem[local_id + stride]
+        end
+        barrier()
+        stride ÷= 2
+    end
+
+    # Write result
+    if local_id == 1
+        @inbounds result[group_id] = local_mem[1]
+    end
+    return
+end
+```
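+
+A sketch of how this kernel might be launched — one partial result per workgroup, combined on the host. The 256-element local array above must match `items`, and the input length is assumed to be a multiple of 256:
+
+```julia
+input = oneArray(rand(Float32, 4096))
+groups = cld(length(input), 256)
+partial = oneArray(zeros(Float32, groups))
+
+@oneapi groups=groups items=256 optimized_reduction!(partial, input)
+total = sum(Array(partial))  # combine the per-group results on the CPU
+```
+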
+### Minimize Barriers
+
+Barriers have overhead:
+
+```julia
+# ❌ Bad: Unnecessary barriers
+function wasteful_kernel!(a)
+    i = get_local_id()
+    a[i] += 1
+    barrier()  # Not needed if no data sharing
+    a[i] *= 2
+    barrier()  # Not needed
+    return
+end
+
+# ✅ Good: Barriers only when needed
+function efficient_kernel!(a)
+    i = get_local_id()
+    n = get_local_size()
+    shared = oneLocalArray(Float32, 256)
+
+    # Load to shared memory
+    shared[i] = a[i]
+    barrier()  # Needed: ensure all loads complete
+
+    # Use shared data (guard the upper edge to stay in bounds)
+    if i < n
+        a[i] = shared[i] + shared[i+1]
+    end
+    return
+end
+```
+
+### Avoid Divergence
+
+Minimize thread divergence (different execution paths):
+
+```julia
+# expensive_computation is a placeholder for some costly device function
+
+# ❌ Bad: High divergence
+function divergent_kernel!(a)
+    i = get_global_id()
+    if i % 32 == 0
+        # Only 1 in 32 threads executes this
+        @inbounds a[i] = expensive_computation(a[i])
+    else
+        @inbounds a[i] += 1.0f0
+    end
+    return
+end
+
+# ✅ Better: Separate into different kernels
+function uniform_kernel!(a)
+    i = get_global_id()
+    @inbounds a[i] += 1.0f0
+    return
+end
+
+function sparse_kernel!(a, indices)
+    i = get_global_id()
+    if i <= length(indices)
+        idx = indices[i]
+        @inbounds a[idx] = expensive_computation(a[idx])
+    end
+    return
+end
+```
+
+## Type Stability
+
+Type instability severely hurts performance:
+
+```julia
+# ❌ Bad: Type unstable
+function unstable_kernel!(output, input, flag)
+    i = get_global_id()
+    if flag
+        value = input[i]  # Float32
+    else
+        value = 0  # Int
+    end
+    output[i] = value * 2  # Type uncertain!
+    return
+end
+
+# ✅ Good: Type stable
+function stable_kernel!(output, input, flag)
+    i = get_global_id()
+    if flag
+        value = input[i]  # Float32
+    else
+        value = 0.0f0  # Float32
+    end
+    output[i] = value * 2.0f0  # All Float32!
+    return
+end
+
+# Check type stability
+@device_code_warntype @oneapi groups=1 items=10 stable_kernel!(output, input, true)
+```
+
+## Algorithmic Optimization
+
+### Use Library Functions
+
+Leverage optimized library implementations:
+
+```julia
+using oneAPI, LinearAlgebra
+
+# ✅ Good: Use oneMKL through LinearAlgebra
+A = oneArray(rand(Float32, 1000, 1000))
+B = oneArray(rand(Float32, 1000, 1000))
+C = A * B  # Uses optimized oneMKL
+
+# ❌ Bad: Write your own matrix multiplication
+# (unless you have a very specific use case)
+```
+
+### Choose the Right Algorithm
+
+Some algorithms parallelize better than others:
+
+```julia
+# ❌ Sequential algorithm
+function sequential_sum(arr)
+    total = 0.0f0
+    for x in arr
+        total += x
+    end
+    return total
+end
+
+# ✅ Parallel reduction
+result = sum(oneArray(data))  # Optimized parallel reduction
+```
+
+## Benchmarking
+
+### Basic Timing
+
+```julia
+using BenchmarkTools, oneAPI
+
+a = oneArray(rand(Float32, 1000))
+b = oneArray(rand(Float32, 1000))
+
+# Warmup
+c = a .+ b
+synchronize()
+
+# Benchmark
+@benchmark begin
+    c = $a .+ $b
+    synchronize()
+end
+```
+
+### Accurate GPU Timing
+
+Always synchronize before timing:
+
+```julia
+using oneAPI
+
+a = oneArray(rand(Float32, 1_000_000))
+
+# ❌ Wrong: Doesn't wait for GPU
+@time a .+= 1  # Only measures kernel launch overhead
+
+# ✅ Correct: Wait for GPU to finish
+@time begin
+    a .+= 1
+    synchronize()
+end
+```
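+
+The same measurement can be written more compactly with the package's `oneAPI.@sync` macro, which runs an expression and synchronizes the GPU afterwards:
+
+```julia
+using oneAPI
+
+a = oneArray(rand(Float32, 1_000_000))
+
+@time oneAPI.@sync a .+= 1
+```
+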
+### Profiling with Time
+
+```julia
+function profile_operation(a, b)
+    # Warmup
+    c = a .+ b
+    synchronize()
+
+    # Time kernel launch
+    t1 = time()
+    c = a .+ b
+    t2 = time()
+    launch_time = t2 - t1
+
+    # Time including synchronization
+    synchronize()
+    t3 = time()
+    total_time = t3 - t1
+
+    println("Launch: ", launch_time * 1000, " ms")
+    println("Total:  ", total_time * 1000, " ms")
+    println("Actual: ", (total_time - launch_time) * 1000, " ms")
+end
+
+a = oneArray(rand(Float32, 10_000_000))
+b = oneArray(rand(Float32, 10_000_000))
+profile_operation(a, b)
+```
+
+## Memory Bandwidth
+
+### Theoretical Peak
+
+Calculate theoretical bandwidth:
+
+```julia
+# Example: Intel Iris Xe Graphics
+# 96 execution units, 1.35 GHz
+# Memory bandwidth: ~68 GB/s
+
+# Your kernel processes N Float32 values
+N = 10_000_000
+a = oneArray(rand(Float32, N))
+b = oneArray(rand(Float32, N))
+bytes_transferred = N * sizeof(Float32) * 3  # a .+= b reads a and b, writes a
+
+# Measure time
+t = @elapsed begin
+    a .+= b
+    synchronize()
+end
+
+bandwidth_achieved = bytes_transferred / t / 1e9  # GB/s
+println("Bandwidth: ", bandwidth_achieved, " GB/s")
+```
+
+### Improving Bandwidth Utilization
+
+```julia
+# ✅ Good: Single pass with fusion
+result = @. a + b * c - d / e  # One pass over the data
+
+# ❌ Bad: Multiple passes
+result = a .+ b
+result = result .* c
+result = result .- d
+result = result ./ e
+# Four separate passes over the data!
+```
+
+## Common Performance Issues
+
+### Issue 1: Too Many Small Kernels
+
+```julia
+# ❌ Bad: Many small kernel launches
+for i in 1:100
+    a .+= 1  # 100 kernel launches!
+end
+
+# ✅ Good: Single kernel or batching
+a .+= 100  # Single operation
+```
+
+### Issue 2: Unnecessary Allocations
+
+```julia
+# ❌ Bad: Allocates a temporary on every call
+c = a .+ b  # Allocates a new array
+
+# ✅ Good: In-place operation
+c = similar(a)  # Allocate once, e.g. outside a loop
+c .= a .+ b     # Reuses the preallocated array
+```
+
+### Issue 3: Wrong Number Type
+
+```julia
+# ❌ Bad: Mixed types
+a = oneArray(rand(Float32, 1000))
+b = a .+ 1.0  # Float64 constant!
+
+# ✅ Good: Matching types
+b = a .+ 1.0f0  # Float32 constant
+```
+
+## Performance Checklist
+
+- [ ] Using device memory (not shared unless necessary)
+- [ ] Minimizing CPU-GPU transfers
+- [ ] Using Float32 (unless Float64 required)
+- [ ] Fusing operations with broadcasting
+- [ ] Type-stable kernels (`@device_code_warntype`)
+- [ ] Appropriate workgroup sizes
+- [ ] Coalesced memory access
+- [ ] Minimal thread divergence
+- [ ] Leveraging local memory for reuse
+- [ ] Using library functions when available
+- [ ] Synchronizing before timing
+- [ ] Avoiding unnecessary allocations
+
+## Hardware-Specific Tuning
+
+Different Intel GPUs have different characteristics:
+
+```julia
+using oneAPI, oneAPI.oneL0
+
+dev = device()
+props = properties(dev)
+compute_props = compute_properties(dev)
+
+println("Device: ", props.name)
+# EU topology is reported in the device properties
+println("EU count: ", props.numEUsPerSubslice *
+                      props.numSubslicesPerSlice *
+                      props.numSlices)
+println("Max workgroup size: ", compute_props.maxTotalGroupSize)
+println("Max local memory: ", compute_props.maxSharedLocalMemory, " bytes")
+
+# Adjust your code based on these properties
+```
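+
+For example, one possible heuristic for a launch configuration derived from these limits (a sketch; 256 is just a common starting point):
+
+```julia
+N = 1_000_000
+items = min(256, Int(compute_props.maxTotalGroupSize))
+groups = cld(N, items)
+```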
+
+## Advanced: Async Operations
+
+For overlapping compute and transfers (advanced users):
+
+```julia
+using oneAPI, oneAPI.oneL0
+
+ctx = context()
+dev = device()
+
+# Create multiple queues for async operations
+queue1 = ZeCommandQueue(ctx, dev)
+queue2 = ZeCommandQueue(ctx, dev)
+
+# Launch kernel on queue1
+execute!(queue1) do list
+    # ... kernel launch ...
+end
+
+# Overlap with a transfer on queue2 (dst, src and size are placeholders)
+execute!(queue2) do list
+    append_copy!(list, dst, src, size)
+end
+
+# Synchronize both
+synchronize(queue1)
+synchronize(queue2)
+```
+
+## Further Resources
+
+- [Intel GPU Architecture](https://www.intel.com/content/www/us/en/developer/articles/technical/intel-gpu-architecture.html)
+- [oneAPI Programming Guide](https://www.intel.com/content/www/us/en/developer/tools/oneapi/programming-guide.html)
+- [Level Zero Specification](https://spec.oneapi.io/level-zero/latest/index.html)
diff --git a/src/accumulate.jl b/ext/oneAPIAcceleratedKernelsExt.jl
similarity index 70%
rename from src/accumulate.jl
rename to ext/oneAPIAcceleratedKernelsExt.jl
index 206b4ea1..6bb96180 100644
--- a/src/accumulate.jl
+++ b/ext/oneAPIAcceleratedKernelsExt.jl
@@ -1,3 +1,10 @@
+module oneAPIAcceleratedKernelsExt
+
+import oneAPI
+import oneAPI: oneArray, oneAPIBackend
+import AcceleratedKernels as AK
+
+# Accumulate operations using AcceleratedKernels
 Base.accumulate!(op, B::oneArray, A::oneArray; init = zero(eltype(A)), kwargs...) =
     AK.accumulate!(op, B, A, oneAPIBackend(); init, kwargs...)
 
@@ -6,3 +13,5 @@ Base.accumulate(op, A::oneArray; init = zero(eltype(A)), kwargs...) =
 Base.cumsum(src::oneArray; kwargs...) = AK.cumsum(src, oneAPIBackend(); kwargs...)
 Base.cumprod(src::oneArray; kwargs...) = AK.cumprod(src, oneAPIBackend(); kwargs...)
+
+end # module
diff --git a/src/array.jl b/src/array.jl
index d576cdb7..d602f8d9 100644
--- a/src/array.jl
+++ b/src/array.jl
@@ -41,6 +41,46 @@ function check_eltype(T)
     end
 end
 
+"""
+    oneArray{T,N,B} <: AbstractGPUArray{T,N}
+
+N-dimensional dense array type for Intel GPU programming using oneAPI and Level Zero.
+
+# Type Parameters
+- `T`: Element type (must be stored inline, no isbits-unions)
+- `N`: Number of dimensions
+- `B`: Buffer type, one of:
+  - `oneL0.DeviceBuffer`: GPU device memory (default, not CPU-accessible)
+  - `oneL0.SharedBuffer`: Unified shared memory (CPU and GPU accessible)
+  - `oneL0.HostBuffer`: Pinned host memory (CPU-accessible, GPU-visible)
+
+# Memory Types
+
+- **Device memory** (default): Fastest GPU access, not directly accessible from the CPU
+- **Shared memory**: Accessible from both CPU and GPU, with unified virtual addressing
+- **Host memory**: CPU memory that's visible to the GPU, useful for staging
+
+Use [`is_device`](@ref), [`is_shared`](@ref), [`is_host`](@ref) to query the memory type.
+
+# Examples
+```julia
+# Create arrays with different memory types
+A = oneArray{Float32,2}(undef, 10, 10)                     # Device memory (default)
+B = oneArray{Float32,2,oneL0.SharedBuffer}(undef, 10, 10)  # Shared memory
+C = oneArray{Float32,2,oneL0.HostBuffer}(undef, 10, 10)    # Host memory
+
+# From an existing array
+D = oneArray(rand(Float32, 10, 10))  # Creates a device memory array
+
+# Copy the result back to the CPU
+E = Array(D)
+```
+
+See also: [`oneVector`](@ref), [`oneMatrix`](@ref), [`is_device`](@ref), [`is_shared`](@ref)
+"""
 mutable struct oneArray{T,N,B} <: AbstractGPUArray{T,N}
   data::DataRef{B}
 
@@ -179,8 +219,37 @@ end
 buftype(x::oneArray) = buftype(typeof(x))
 buftype(::Type{<:oneArray{<:Any,<:Any,B}}) where {B} = @isdefined(B) ? B : Any
 
+"""
+    is_device(a::oneArray) -> Bool
+
+Check if the array is stored in device memory (not directly CPU-accessible).
+
+Device memory provides the fastest GPU access but cannot be directly accessed from the CPU.
+
+See also: [`is_shared`](@ref), [`is_host`](@ref)
+"""
 is_device(a::oneArray) = isa(a.data[], oneL0.DeviceBuffer)
+
+"""
+    is_shared(a::oneArray) -> Bool
+
+Check if the array is stored in shared (unified) memory.
+
+Shared memory is accessible from both CPU and GPU with unified virtual addressing.
+
+See also: [`is_device`](@ref), [`is_host`](@ref)
+"""
 is_shared(a::oneArray) = isa(a.data[], oneL0.SharedBuffer)
+
+"""
+    is_host(a::oneArray) -> Bool
+
+Check if the array is stored in pinned host memory.
+
+Host memory resides on the CPU but is visible to the GPU, useful for staging data.
+
+See also: [`is_device`](@ref), [`is_shared`](@ref)
+"""
 is_host(a::oneArray) = isa(a.data[], oneL0.HostBuffer)
 
 ## derived types
diff --git a/src/compiler/execution.jl b/src/compiler/execution.jl
index 6503a9b0..9ce5bc5c 100644
--- a/src/compiler/execution.jl
+++ b/src/compiler/execution.jl
@@ -7,6 +7,54 @@ const MACRO_KWARGS = [:launch]
 const COMPILER_KWARGS = [:kernel, :name, :always_inline]
 const LAUNCH_KWARGS = [:groups, :items, :queue]
 
+"""
+    @oneapi [kwargs...] kernel(args...)
+
+High-level interface for launching Julia kernels on Intel GPUs using oneAPI.
+
+This macro compiles a Julia function to SPIR-V, prepares the arguments, and optionally
+launches the kernel on the GPU.
+
+# Keyword Arguments
+
+## Macro Keywords (compile-time)
+- `launch::Bool=true`: Whether to launch the kernel immediately. If `false`, returns the
+  compiled kernel object without executing it.
+
+## Compiler Keywords
+- `kernel::Bool=false`: Whether to compile as a kernel (true) or device function (false)
+- `name::Union{String,Nothing}=nothing`: Explicit name for the kernel
+- `always_inline::Bool=false`: Whether to always inline device functions
+
+## Launch Keywords (runtime)
+- `groups`: Number of workgroups (required). Can be an integer or tuple.
+- `items`: Number of work-items per workgroup (required). Can be an integer or tuple.
+- `queue::ZeCommandQueue=global_queue(...)`: Command queue to submit to.
+
+# Examples
+
+```julia
+# Simple vector addition kernel
+function vadd(a, b, c)
+    i = get_global_id()
+    @inbounds c[i] = a[i] + b[i]
+    return
+end
+
+a = oneArray(rand(Float32, 1024))
+b = oneArray(rand(Float32, 1024))
+c = similar(a)
+
+# Launch with 4 workgroups of 256 items each
+@oneapi groups=4 items=256 vadd(a, b, c)
+
+# Compile without launching
+kernel = @oneapi launch=false vadd(a, b, c)
+kernel(a, b, c; groups=4, items=256)  # Launch later
+```
+
+See also: `zefunction`, `kernel_convert`
+"""
 macro oneapi(ex...)
     call = ex[end]
     kwargs = map(ex[1:end-1]) do kwarg
diff --git a/src/context.jl b/src/context.jl
index 3a36ea35..b0f9ff10 100644
--- a/src/context.jl
+++ b/src/context.jl
@@ -8,23 +8,98 @@
 export driver, driver!, device, device!, context, context!, global_queue, synchronize
 
+"""
+    driver() -> ZeDriver
+
+Get the current Level Zero driver for the calling task. If no driver has been explicitly
+set with [`driver!`](@ref), returns the first available driver.
+
+The driver selection is task-local, allowing different Julia tasks to use different drivers.
+
+# Examples
+```julia
+drv = driver()
+println("Using driver: ", drv)
+```
+
+See also: `driver!`, `drivers`
+"""
 function driver()
     get!(task_local_storage(), :ZeDriver) do
         first(drivers())
     end
 end
 
+"""
+    driver!(drv::ZeDriver)
+
+Set the current Level Zero driver for the calling task. This also clears the current
+device selection, as devices are associated with specific drivers.
+
+The driver selection is task-local, allowing different Julia tasks to use different drivers.
+
+# Arguments
+- `drv::ZeDriver`: The driver to use for subsequent operations.
+
+# Examples
+```julia
+drv = drivers()[2]  # Select second available driver
+driver!(drv)
+```
+
+See also: `driver`, `drivers`
+"""
 function driver!(drv::ZeDriver)
     task_local_storage(:ZeDriver, drv)
     delete!(task_local_storage(), :ZeDevice)
 end
 
+"""
+    device() -> ZeDevice
+
+Get the current Level Zero device for the calling task. If no device has been explicitly
+set with [`device!`](@ref), returns the first available device for the current driver.
+
+The device selection is task-local, allowing different Julia tasks to use different devices.
+
+# Examples
+```julia
+dev = device()
+println("Using device: ", dev)
+```
+
+See also: `device!`, `devices`, `driver`
+"""
 function device()
     get!(task_local_storage(), :ZeDevice) do
         first(devices(driver()))
    end
 end
 
+"""
+    device!(dev::ZeDevice)
+    device!(i::Int)
+
+Set the current Level Zero device for the calling task.
+
+The device selection is task-local, allowing different Julia tasks to use different devices.
+
+# Arguments
+- `dev::ZeDevice`: The device to use for subsequent operations.
+- `i::Int`: Device index (1-based) from the list of available devices for the current driver.
+
+# Examples
+```julia
+# Select by device object
+dev = devices()[2]
+device!(dev)
+
+# Select by index
+device!(2)  # Select second device
+```
+
+See also: [`device`](@ref), [`devices`](@ref)
+"""
 function device!(drv::ZeDevice)
     task_local_storage(:ZeDevice, drv)
 end
@@ -32,6 +107,23 @@ device!(i::Int) = device!(devices(driver())[i])
 
 const global_contexts = Dict{ZeDriver,ZeContext}()
 
+"""
+    context() -> ZeContext
+
+Get the current Level Zero context for the calling task. If no context has been explicitly
+set with [`context!`](@ref), returns a global context for the current driver.
+
+Contexts manage the lifetime of resources like memory allocations and command queues.
+The context selection is task-local, but contexts themselves are cached globally per driver.
+
+# Examples
+```julia
+ctx = context()
+println("Using context: ", ctx)
+```
+
+See also: [`context!`](@ref), [`driver`](@ref)
+"""
 function context()
     get!(task_local_storage(), :ZeContext) do
         get!(global_contexts, driver()) do
@@ -40,13 +132,54 @@
     end
 end
 
+"""
+    context!(ctx::ZeContext)
+
+Set the current Level Zero context for the calling task.
+
+The context selection is task-local, allowing different Julia tasks to use different contexts.
+
+# Arguments
+- `ctx::ZeContext`: The context to use for subsequent operations.
+
+# Examples
+```julia
+ctx = ZeContext(driver())
+context!(ctx)
+```
+
+See also: `context`, `ZeContext`
+"""
 function context!(ctx::ZeContext)
     task_local_storage(:ZeContext, ctx)
 end
 
-# the global queue can be used as a default queue to execute operations on,
-# guaranteeing expected semantics when using a device on a Julia task.
+"""
+    global_queue(ctx::ZeContext, dev::ZeDevice) -> ZeCommandQueue
+
+Get the global command queue for the given context and device. This queue is used as the
+default queue for executing operations, guaranteeing expected semantics when using a device
+on a Julia task.
+
+The queue is created with in-order execution flags, meaning commands are executed in the
+order they are submitted. Queues are cached per task and (context, device) pair.
+
+# Arguments
+- `ctx::ZeContext`: The context for the command queue.
+- `dev::ZeDevice`: The device for the command queue.
+
+# Returns
+- `ZeCommandQueue`: A cached command queue with in-order execution.
+
+# Examples
+```julia
+ctx = context()
+dev = device()
+queue = global_queue(ctx, dev)
+```
+
+See also: `context`, `device`, `synchronize`
+"""
 function global_queue(ctx::ZeContext, dev::ZeDevice)
     # NOTE: dev purposefully does not default to context() or device() to stress that
     # objects should track ownership, and not rely on implicit global state.
@@ -55,12 +188,52 @@ function global_queue(ctx::ZeContext, dev::ZeDevice)
     end
 end
 
+"""
+    synchronize()
+
+Block the host thread until all operations on the global command queue for the current
+context and device have completed.
+
+This is useful for timing operations or ensuring that GPU work has finished before
+accessing results on the CPU.
+
+# Examples
+```julia
+x = oneArray(rand(Float32, 1000))
+y = x .+ 1
+synchronize()  # Wait for GPU computation to complete
+println("GPU work completed")
+```
+
+See also: [`global_queue`](@ref), [`context`](@ref), [`device`](@ref)
+"""
 function oneL0.synchronize()
     oneL0.synchronize(global_queue(context(), device()))
 end
 
 # re-export and augment parts of oneL0 to make driver and device selection easier
 export drivers, devices
+
+"""
+    devices() -> Vector{ZeDevice}
+    devices(drv::ZeDriver) -> Vector{ZeDevice}
+
+Return a list of available Level Zero devices. Without arguments, returns devices for
+the current driver. With a driver argument, returns devices for that specific driver.
+
+# Examples
+```julia
+# Get devices for the current driver
+devs = devices()
+println("Found ", length(devs), " devices")
+
+# Get devices for a specific driver
+drv = drivers()[1]
+devs = devices(drv)
+```
+
+See also: `device`, `device!`, `drivers`
+"""
 oneL0.devices() = devices(driver())
diff --git a/src/device/array.jl b/src/device/array.jl
index 260be837..ae339110 100644
--- a/src/device/array.jl
+++ b/src/device/array.jl
@@ -8,6 +8,40 @@ export oneDeviceArray, oneDeviceVector, oneDeviceMatrix, oneLocalArray
 
 # NOTE: we can't support the typical `tuple or series of integer` style construction,
 # because we're currently requiring a trailing pointer argument.
 
+"""
+    oneDeviceArray{T,N,A} <: DenseArray{T,N}
+
+Device-side array type for use within GPU kernels.
+
+This type represents a view of GPU memory accessible within kernel code. Unlike
+[`oneArray`](@ref), which is used on the host, `oneDeviceArray` is designed for
+device-side operations and cannot be directly constructed on the host.
+
+# Type Parameters
+- `T`: Element type
+- `N`: Number of dimensions
+- `A`: Address space (typically `AS.CrossWorkgroup` for global memory)
+
+# Usage
+
+`oneDeviceArray` is typically not constructed directly. Instead, `oneArray` objects
+are automatically converted to `oneDeviceArray` when passed as kernel arguments.
+
+# Examples
+
+```julia
+function kernel(a::oneDeviceArray{Float32,1})
+    i = get_global_id()
+    @inbounds a[i] = a[i] * 2.0f0
+    return
+end
+
+a = oneArray(rand(Float32, 100))
+@oneapi groups=1 items=100 kernel(a)  # a is converted to oneDeviceArray
+```
+
+See also: [`oneArray`](@ref), [`oneLocalArray`](@ref), [`@oneapi`](@ref)
+"""
 struct oneDeviceArray{T,N,A} <: DenseArray{T,N}
     ptr::LLVMPtr{T,A}
     maxsize::Int
 
@@ -257,6 +291,47 @@ end
 
 export oneLocalArray
 
+"""
+    oneLocalArray(::Type{T}, dims)
+
+Allocate local (workgroup-shared) memory within a GPU kernel.
+
+Local memory is shared among all work-items in a workgroup and provides faster access than
+global memory. It's useful for algorithms that require cooperation between work-items,
+such as reductions or matrix multiplication tiling.
+
+# Arguments
+- `T`: Element type
+- `dims`: Dimensions (must be compile-time constants)
+
+# Examples
+
+```julia
+function matmul_kernel(A, B, C)
+    # Allocate 16x16 tiles in local memory
+    tile_A = oneLocalArray(Float32, (16, 16))
+    tile_B = oneLocalArray(Float32, (16, 16))
+
+    # Load data into local memory (dimension indices are 1-based)
+    local_i = get_local_id(1)
+    local_j = get_local_id(2)
+    tile_A[local_i, local_j] = A[...]
+    tile_B[local_i, local_j] = B[...]
+
+    barrier()  # Synchronize workgroup
+
+    # Compute using local memory
+    # ...
+    return
+end
+```
+
+!!! note
+    The dimensions must be known at compile time. Local memory is limited (typically 64KB
+    per workgroup), so large allocations may fail.
+
+See also: [`oneDeviceArray`](@ref), [`barrier`](@ref)
+"""
 @inline function oneLocalArray(::Type{T}, dims) where {T}
     len = prod(dims)
     # NOTE: this relies on const-prop to forward the literal length to the generator.
""" macro sync(ex) quote diff --git a/test/Project.toml b/test/Project.toml index c214ed96..cb603629 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,5 +1,6 @@ [deps] AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c" +AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1" Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" diff --git a/test/setup.jl b/test/setup.jl index 269d5b9c..a3b0f1a4 100644 --- a/test/setup.jl +++ b/test/setup.jl @@ -1,4 +1,4 @@ -using Distributed, Test, oneAPI +using Distributed, Test, oneAPI, AcceleratedKernels oneAPI.functional() || error("oneAPI.jl is not functional on this system")