diff --git a/.github/workflows/DocsCleanup.yml b/.github/workflows/DocsCleanup.yml
new file mode 100644
index 00000000..be55aca8
--- /dev/null
+++ b/.github/workflows/DocsCleanup.yml
@@ -0,0 +1,28 @@
+name: Doc Preview Cleanup
+
+on:
+  pull_request:
+    types: [closed]
+
+jobs:
+  doc-preview-cleanup:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout gh-pages branch
+        uses: actions/checkout@v4
+        with:
+          ref: gh-pages
+
+      - name: Delete preview and history
+        run: |
+          git config user.name "oneAPI.jl"
+          git config user.email "oneapi@juliagpu.github.io"
+          git rm -rf "previews/PR$PRNUM"
+          git commit -m "delete preview"
+          git branch gh-pages-new $(echo "delete history" | git commit-tree HEAD^{tree})
+        env:
+          PRNUM: ${{ github.event.number }}
+
+      - name: Push changes
+        run: |
+          git push --force origin gh-pages-new:gh-pages
\ No newline at end of file
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
new file mode 100644
index 00000000..850ecc62
--- /dev/null
+++ b/.github/workflows/docs.yml
@@ -0,0 +1,29 @@
+name: Documentation
+
+on:
+  push:
+    branches:
+      - main
+    tags: '*'
+  pull_request:
+    types: [opened, synchronize, reopened]
+  schedule:
+    - cron: '0 0 * * 0'
+
+jobs:
+  docs:
+    name: Build documentation
+    env:
+      DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }}
+      GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      JULIA_DEBUG: Documenter
+    runs-on: [self-hosted, linux, X64]
+
+    steps:
+      - uses: actions/checkout@v4
+      - uses: julia-actions/setup-julia@latest
+        with:
+          version: 'lts'
+      - uses: julia-actions/cache@v2
+      - uses: julia-actions/julia-buildpkg@latest
+      - run: julia --project=docs/ docs/make.jl
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 736148be..3fc6e9ef 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,4 @@ deps/onemkl_lapack.cpp
 deps/onemkl_lapack.h
 deps/onemkl_sparse.cpp
 deps/onemkl_sparse.h
+docs/build
diff --git a/Project.toml b/Project.toml
index fc7cd1bc..e2dac0e2 100644
--- a/Project.toml
+++ b/Project.toml
@@ -5,7 +5,6 @@ authors = ["Tim Besard ", "Alexis Montoison", "Michel Sch
 
 [deps]
 AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c"
-AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1"
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 CEnum = "fa961155-64e5-5f13-b03f-caf6b980ea82"
 ExprTools = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
@@ -30,6 +29,12 @@ oneAPI_Level_Zero_Headers_jll = "f4bc562b-d309-54f8-9efb-476e56f0410d"
 oneAPI_Level_Zero_Loader_jll = "13eca655-d68d-5b81-8367-6d99d727ab01"
 oneAPI_Support_jll = "b049733a-a71d-5ed3-8eba-7d323ac00b36"
 
+[weakdeps]
+AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1"
+
+[extensions]
+oneAPIAcceleratedKernelsExt = "AcceleratedKernels"
+
 [compat]
 AbstractFFTs = "1.5.0"
 AcceleratedKernels = "0.4.3"
diff --git a/README.md b/README.md
index c22cdb6b..8485889f 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 
 *Julia support for the oneAPI programming toolkit.*
 
-[![][doi-img]][doi-url] [![][buildkite-img]][buildkite-url] [![][codecov-img]][codecov-url]
+[![][doi-img]][doi-url] [![][buildkite-img]][buildkite-url] [![][codecov-img]][codecov-url] [![][docs-stable-img]][docs-stable-url] [![][docs-dev-img]][docs-dev-url]
 
 [doi-img]: https://zenodo.org/badge/252466420.svg
 [doi-url]: https://zenodo.org/badge/latestdoi/252466420
 [codecov-img]: https://codecov.io/gh/JuliaGPU/oneAPI.jl/branch/master/graph/badge.svg
 [codecov-url]: https://codecov.io/gh/JuliaGPU/oneAPI.jl
 
+[docs-stable-img]: https://img.shields.io/badge/docs-stable-blue.svg
+[docs-stable-url]: https://juliagpu.github.io/oneAPI.jl/stable
+
+[docs-dev-img]: https://img.shields.io/badge/docs-dev-blue.svg
+[docs-dev-url]: https://juliagpu.github.io/oneAPI.jl/dev
+
 oneAPI.jl provides support for working with the [oneAPI unified programming
 model](https://software.intel.com/en-us/oneapi). The package is verified to
 work with the (currently) only implementation of this interface [that is part
 of the Intel Compute
diff --git a/docs/Project.toml b/docs/Project.toml
new file mode 100644
index 00000000..90803543
--- /dev/null
+++ b/docs/Project.toml
@@ -0,0 +1,9 @@
+[deps]
+Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
+oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
+
+[sources]
+oneAPI = {path = ".."}
+
+[compat]
+Documenter = "1"
diff --git a/docs/make.jl b/docs/make.jl
new file mode 100644
index 00000000..b5529648
--- /dev/null
+++ b/docs/make.jl
@@ -0,0 +1,54 @@
+using Pkg
+
+# develop the local checkout and instantiate the docs environment
+Pkg.develop(PackageSpec(path=joinpath(dirname(@__FILE__), "..")))
+Pkg.instantiate()
+using Documenter
+using Documenter.Remotes
+using oneAPI
+
+oneAPI.versioninfo()
+
+makedocs(
+    sitename = "oneAPI.jl",
+    format = Documenter.HTML(
+        prettyurls = get(ENV, "CI", nothing) == "true",
+        canonical = "https://juliagpu.github.io/oneAPI.jl/stable/",
+        mathengine = Documenter.KaTeX(),
+    ),
+    modules = [oneAPI],
+    pages = [
+        "Home" => "index.md",
+        "Installation" => "installation.md",
+        "Getting Started" => "getting_started.md",
+        "Usage" => [
+            "Array Programming" => "arrays.md",
+            "Kernel Programming" => "kernels.md",
+            "Memory Management" => "memory.md",
+            "Device Intrinsics" => "device.md",
+            "Performance Guide" => "usage/performance.md",
+        ],
+        "API Reference" => [
+            "Overview" => "api.md",
+            "Context & Device Management" => "api/context.md",
+            "Array Operations" => "api/arrays.md",
+            "Kernel Programming" => "api/kernels.md",
+            "Memory Management" => "api/memory.md",
+            "Compiler & Reflection" => "api/compiler.md",
+            "Level Zero (oneL0)" => "level_zero.md",
+            "oneMKL" => "onemkl.md",
+        ],
+        "Troubleshooting" => "troubleshooting.md",
+    ],
+    checkdocs = :none, # Don't error on missing docstrings
+    warnonly = [:cross_references, :missing_docs], # Only warn, don't error
+)
+
+deploydocs(
+    repo = "github.com/JuliaGPU/oneAPI.jl.git",
+    target = "build",
+    devbranch = "main",
+    devurl = "dev",
+    push_preview = true,
+)
+
diff --git a/docs/src/api.md b/docs/src/api.md
new file mode 100644
index 00000000..68cd2a76
--- /dev/null
+++ b/docs/src/api.md
@@ -0,0 +1,42 @@
+# API Reference
+
+This page provides an overview of the oneAPI.jl API. For detailed documentation, see the specific API reference pages:
+
+- [Context & Device Management](api/context.md) - Managing drivers, devices, and contexts
+- [Array Operations](api/arrays.md) - Working with GPU arrays
+- [Kernel Programming](api/kernels.md) - Writing and launching custom kernels
+- [Memory Management](api/memory.md) - Memory allocation and transfer
+- [Compiler & Reflection](api/compiler.md) - Code generation and introspection
+
+## Core Functions
+
+```@autodocs
+Modules = [oneAPI]
+Pages = ["src/context.jl", "src/utils.jl"]
+Filter = t -> t !== oneAPI.synchronize
+```
+
+## Compiler Functions
+
+```@autodocs
+Modules = [oneAPI]
+Pages = ["src/compiler/execution.jl", "src/compiler/reflection.jl"]
+```
+
+## oneL0 (Level Zero)
+
+Low-level bindings to the Level Zero API. See the [Level Zero page](level_zero.md) for details.
+
+```@autodocs
+Modules = [oneAPI.oneL0]
+Filter = t -> t !== oneAPI.oneL0.synchronize
+```
+
+## oneMKL
+
+Intel oneAPI Math Kernel Library bindings. See the [oneMKL page](onemkl.md) for details.
+
+```@autodocs
+Modules = [oneAPI.oneMKL]
+```
+
diff --git a/docs/src/api/arrays.md b/docs/src/api/arrays.md
new file mode 100644
index 00000000..0554649f
--- /dev/null
+++ b/docs/src/api/arrays.md
@@ -0,0 +1,264 @@
+# Array Operations
+
+This page documents the array types and operations provided by oneAPI.jl.
+
+## Array Types
+
+### Host-Side Arrays
+
+#### `oneArray{T,N,B}`
+
+N-dimensional dense array type for Intel GPU programming using oneAPI and Level Zero.
+
+**Type Parameters:**
+- `T`: Element type (must be stored inline, no isbits-unions)
+- `N`: Number of dimensions
+- `B`: Buffer type, one of:
+  - `oneL0.DeviceBuffer`: GPU device memory (default, not CPU-accessible)
+  - `oneL0.SharedBuffer`: Unified shared memory (CPU and GPU accessible)
+  - `oneL0.HostBuffer`: Pinned host memory (CPU-accessible, GPU-visible)
+
+**Type Aliases:**
+- `oneVector{T}` = `oneArray{T,1}` - 1D array
+- `oneMatrix{T}` = `oneArray{T,2}` - 2D array
+- `oneVecOrMat{T}` = `Union{oneVector{T}, oneMatrix{T}}` - 1D or 2D array
+
+### Device-Side Arrays
+
+#### `oneDeviceArray{T,N,A}`
+
+Device-side array type for use within GPU kernels. This type represents a view of GPU memory
+accessible within kernel code. Unlike `oneArray` which is used on the host, `oneDeviceArray`
+is designed for device-side operations and cannot be directly constructed on the host.
+
+**Type Parameters:**
+- `T`: Element type
+- `N`: Number of dimensions
+- `A`: Address space (typically `AS.CrossWorkgroup` for global memory)
+
+**Type Aliases:**
+- `oneDeviceVector{T}` = `oneDeviceArray{T,1}` - 1D device array
+- `oneDeviceMatrix{T}` = `oneDeviceArray{T,2}` - 2D device array
+
+#### `oneLocalArray(::Type{T}, dims)`
+
+Allocate local (workgroup-shared) memory within a GPU kernel. Local memory is shared among
+all work-items in a workgroup and provides faster access than global memory.
+
+## Memory Type Queries
+
+### `is_device(a::oneArray) -> Bool`
+
+Check if the array is stored in device memory (not directly CPU-accessible).
+
+### `is_shared(a::oneArray) -> Bool`
+
+Check if the array is stored in shared (unified) memory, accessible from both CPU and GPU.
+
+### `is_host(a::oneArray) -> Bool`
+
+Check if the array is stored in pinned host memory, which resides on the CPU but is visible to the GPU.
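+
+For example, these predicates let generic code report or branch on where an array lives (a small sketch using the constructors documented below; `describe` is a hypothetical helper, not part of the oneAPI.jl API):
+
+```julia
+using oneAPI
+
+# Report which kind of buffer backs an array.
+function describe(a::oneArray)
+    kind = is_device(a) ? "device" :
+           is_shared(a) ? "shared" : "host"
+    println("array is backed by $kind memory")
+end
+
+describe(oneArray{Float32}(undef, 4))                       # device memory (default)
+describe(oneArray{Float32,1,oneL0.SharedBuffer}(undef, 4))  # shared memory
+```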
+
+
+## Array Construction
+
+`oneArray` supports multiple construction patterns similar to standard Julia arrays:
+
+```julia
+using oneAPI
+
+# Uninitialized arrays
+a = oneArray{Float32}(undef, 100)
+b = oneArray{Float32,2}(undef, 10, 10)
+
+# Specify memory type
+c = oneArray{Float32,1,oneL0.SharedBuffer}(undef, 100)  # Shared memory
+d = oneArray{Float32,1,oneL0.HostBuffer}(undef, 100)    # Host memory
+
+# From existing arrays
+e = oneArray(rand(Float32, 100))
+f = oneArray([1, 2, 3, 4])
+
+# Using zeros/ones/rand
+g = oneAPI.zeros(Float32, 100)
+h = oneAPI.ones(Float32, 100)
+i = oneAPI.rand(Float32, 100)
+
+# Scoped temporary with eager cleanup
+result = let arr = oneAPI.ones(Float32, 100)
+    s = sum(arr)
+    oneAPI.unsafe_free!(arr)  # optionally release the GPU memory immediately
+    s
+end
+```
+
+## Array Operations
+
+`oneArray` implements the full `AbstractArray` interface and supports:
+
+### Broadcasting
+
+```julia
+a = oneArray(rand(Float32, 100))
+b = oneArray(rand(Float32, 100))
+
+c = a .+ b        # Element-wise addition
+d = a .* 2.0f0    # Scalar multiplication
+e = sin.(a)       # Unary operations
+f = a .+ b .* c   # Fused operations
+```
+
+### Reductions
+
+```julia
+a = oneArray(rand(Float32, 100))
+
+s = sum(a)      # Sum
+p = prod(a)     # Product
+m = maximum(a)  # Maximum
+n = minimum(a)  # Minimum
+μ = mean(a)     # Mean (requires Statistics)
+```
+
+### Mapping
+
+```julia
+a = oneArray(rand(Float32, 100))
+
+b = map(x -> x^2, a)  # Apply function
+c = map(+, a, b)      # Binary operation
+```
+
+### Accumulation
+
+```julia
+a = oneArray([1, 2, 3, 4])
+
+b = cumsum(a)   # Cumulative sum: [1, 3, 6, 10]
+c = cumprod(a)  # Cumulative product: [1, 2, 6, 24]
+```
+
+### Finding Elements
+
+```julia
+a = oneArray([1.0f0, -2.0f0, 3.0f0, -4.0f0])
+
+indices = findall(x -> x > 0, a)  # Indices of positive elements
+```
+
+### Random Number Generation
+
+```julia
+using oneAPI, Random
+
+# Uniform distribution
+a = oneAPI.rand(Float32, 100)
+b = oneAPI.rand(Float32, 10, 10)
+
+# Normal distribution
+c = oneAPI.randn(Float32, 100)
+
+# With seed
+Random.seed!(1234)
+d = oneAPI.rand(Float32, 100)
+```
+
+## Data Transfer
+
+### CPU to GPU
+
+```julia
+# Using constructor
+h_array = rand(Float32, 100)
+d_array = oneArray(h_array)
+
+# Using copyto!
+d_array = oneArray{Float32}(undef, 100)
+copyto!(d_array, h_array)
+```
+
+### GPU to CPU
+
+```julia
+# Using Array constructor
+h_array = Array(d_array)
+
+# Using copyto!
+h_array = Vector{Float32}(undef, 100)
+copyto!(h_array, d_array)
+```
+
+### GPU to GPU
+
+```julia
+d_array1 = oneArray(rand(Float32, 100))
+d_array2 = similar(d_array1)
+copyto!(d_array2, d_array1)
+```
+
+## Memory Types Comparison
+
+| Memory Type | CPU Access | GPU Access | Performance | Use Case |
+|-------------|-----------|------------|-------------|----------|
+| Device (default) | ❌ No | ✅ Fast | Fastest | GPU computations |
+| Shared | ✅ Yes | ✅ Good | Good | CPU-GPU data sharing |
+| Host | ✅ Yes | ✅ Slower | Moderate | Staging, pinned buffers |
+
+```julia
+# Device memory (default, fastest for GPU)
+a = oneArray{Float32}(undef, 100)
+
+# Shared memory (CPU and GPU accessible)
+b = oneArray{Float32,1,oneL0.SharedBuffer}(undef, 100)
+
+# Host memory (CPU memory visible to GPU)
+c = oneArray{Float32,1,oneL0.HostBuffer}(undef, 100)
+
+# Query memory type
+is_device(a)  # true
+is_shared(b)  # true
+is_host(c)    # true
+```
+
+## Views and Slicing
+
+`oneArray` supports array views for efficient sub-array operations without copying:
+
+```julia
+a = oneArray(rand(Float32, 100))
+
+# Create a view
+v = view(a, 1:50)
+v .= 0.0f0  # Modifies first 50 elements of a
+
+# Indexing with a range follows Base semantics and allocates a copy
+s = a[1:50]         # This is a new array, not a view
+w = @views a[1:50]  # Use view/@views to avoid the copy
+```
+
+## Reshaping
+
+```julia
+a = oneArray(rand(Float32, 100))
+
+# Reshape to 2D
+b = reshape(a, 10, 10)
+
+# Flatten
+c = vec(b)  # Returns 1D view
+```
+
+## Advanced: Custom Array Wrappers
+
+For advanced use cases, oneAPI.jl provides type aliases for array wrappers:
+
+- `oneDenseArray`: Dense contiguous arrays
+- `oneStridedArray`: Arrays with arbitrary strides (including views)
+- `oneWrappedArray`: Any array backed by a oneArray
+
+These are useful for writing functions that accept various array types:
+
+```julia
+function my_kernel!(a::oneStridedArray{Float32})
+    # Accepts oneArray and views
+    a .+= 1.0f0
+end
+```
diff --git a/docs/src/api/compiler.md b/docs/src/api/compiler.md
new file mode 100644
index 00000000..ba0081c1
--- /dev/null
+++ b/docs/src/api/compiler.md
@@ -0,0 +1,255 @@
+# Compiler and Reflection
+
+This page documents the compiler interface and code reflection tools for oneAPI.jl.
+
+## Code Reflection
+
+oneAPI.jl provides macros for inspecting code generation at various stages:
+
+- `@device_code_lowered` - Show lowered IR (desugared Julia code)
+- `@device_code_typed` - Show type-inferred IR
+- `@device_code_warntype` - Show type-inferred IR with type stability warnings
+- `@device_code_llvm` - Show LLVM IR
+- `@device_code_spirv` - Show SPIR-V assembly
+- `@device_code` - Show all compilation stages interactively
+
+These macros are re-exported from GPUCompiler.jl. See the [GPUCompiler documentation](https://github.com/JuliaGPU/GPUCompiler.jl) for detailed usage.
+
+### `return_type(f, tt) -> Type`
+
+Return the inferred return type of function `f` when called with argument types `tt` in a GPU kernel context.
+
+**Arguments:**
+- `f`: Function to analyze
+- `tt`: Tuple type of arguments
+
+**Returns:**
+- Type that `f(args...)` would return where `args::tt`
+
+**Example:**
+```julia
+function compute(x::Float32)
+    return x * 2.0f0
+end
+
+rt = oneAPI.return_type(compute, Tuple{Float32})
+@assert rt == Float32
+```
+
+
+## Inspecting Generated Code
+
+Code reflection tools help you understand how your Julia code is compiled to GPU code:
+
+### LLVM IR
+
+View the LLVM intermediate representation:
+
+```julia
+using oneAPI
+
+function kernel(a, b)
+    i = get_global_id()
+    @inbounds a[i] = b[i] + 1.0f0
+    return
+end
+
+a = oneArray(zeros(Float32, 10))
+b = oneArray(rand(Float32, 10))
+
+@device_code_llvm @oneapi groups=1 items=10 kernel(a, b)
+```
+
+### SPIR-V Assembly
+
+View the final SPIR-V assembly that runs on the GPU:
+
+```julia
+@device_code_spirv @oneapi groups=1 items=10 kernel(a, b)
+```
+
+### Type Inference
+
+Check for type instabilities that hurt performance:
+
+```julia
+@device_code_warntype @oneapi groups=1 items=10 kernel(a, b)
+```
+
+### Type-Inferred IR
+
+See the typed intermediate representation:
+
+```julia
+@device_code_typed @oneapi groups=1 items=10 kernel(a, b)
+```
+
+### Interactive Inspection
+
+Use `@device_code` for an interactive menu:
+
+```julia
+@device_code @oneapi groups=1 items=10 kernel(a, b)
+# Opens a menu to select which compilation stage to view
+```
+
+## Return Type Inference
+
+To query the return type of a device function, use `oneAPI.return_type`, documented above.
+
+## Debugging Type Issues
+
+### Common Type Instability Sources
+
+```julia
+# ❌ Type instability: Conditional returns different types
+function bad_kernel(x, flag)
+    if flag
+        return x     # Float32
+    else
+        return 0     # Int
+    end
+end
+
+# ✅ Type stable: Consistent return type
+function good_kernel(x, flag)
+    if flag
+        return x      # Float32
+    else
+        return 0.0f0  # Float32
+    end
+end
+```
+
+### Using @device_code_warntype
+
+```julia
+function mystery_kernel!(output, input)
+    i = get_global_id()
+    @inbounds output[i] = some_complex_function(input[i])
+    return
+end
+
+# Check for type issues
+@device_code_warntype @oneapi groups=1 items=10 mystery_kernel!(a, b)
+
+# Look for red warnings indicating type instability
+```
+
+## Compilation Options
+
+### Kernel vs Device Function
+
+```julia
+# Compile as kernel (default for @oneapi)
+@device_code_llvm @oneapi kernel=true kernel(a, b)
+
+# Compile as device function (callable from other kernels)
+@device_code_llvm @oneapi kernel=false helper_function(x)
+```
+
+### Always Inline
+
+Force inlining of device functions:
+
+```julia
+@oneapi always_inline=true kernel(a, b)
+```
+
+### Custom Kernel Name
+
+Specify a custom name for the kernel:
+
+```julia
+@oneapi name="my_custom_kernel" kernel(a, b)
+```
+
+## Example: Optimizing a Kernel
+
+Here's a workflow for optimizing a kernel using reflection tools:
+
+```julia
+using oneAPI
+
+# Initial version
+function sum_kernel_v1!(result, data)
+    i = get_global_id()
+    if i == 1
+        sum = 0
+        for j in 1:length(data)
+            sum += data[j]
+        end
+        result[1] = sum
+    end
+    return
+end
+
+data = oneArray(rand(Float32, 1000))
+result = oneArray(zeros(Float32, 1))
+
+# Check for type issues
+@device_code_warntype @oneapi groups=1 items=1 sum_kernel_v1!(result, data)
+# Notice: `sum` might be Int instead of Float32!
+ +# Fixed version +function sum_kernel_v2!(result, data) + i = get_global_id() + if i == 1 + sum = 0.0f0 # Explicitly Float32 + for j in 1:length(data) + sum += data[j] + end + result[1] = sum + end + return +end + +# Verify the fix +@device_code_warntype @oneapi groups=1 items=1 sum_kernel_v2!(result, data) +# Should be type-stable now! + +# Check the generated code +@device_code_llvm @oneapi groups=1 items=1 sum_kernel_v2!(result, data) +``` + +## Profiling + +For performance profiling, see the [Performance Guide](@ref). + +## Troubleshooting + +### Compilation Errors + +If you encounter compilation errors: + +1. **Check type stability**: Use `@device_code_warntype` +2. **Inspect LLVM IR**: Use `@device_code_llvm` to see if the issue is in LLVM generation +3. **Simplify the kernel**: Comment out sections to isolate the problematic code +4. **Check argument types**: Ensure arguments are GPU-compatible (isbits types) + +### SPIR-V Issues + +If SPIR-V generation fails: + +1. **Update dependencies**: Ensure SPIRV-LLVM-Translator is up to date +2. **Check device capabilities**: Some operations require specific hardware features +3. **Reduce complexity**: Very complex kernels might hit compiler limits + +### Performance Issues + +If your kernel is slow: + +1. **Profile memory access patterns**: Coalesced access is crucial +2. **Check occupancy**: Are you launching enough work-items? +3. **Minimize barriers**: Synchronization has overhead +4. **Use local memory wisely**: It's faster than global memory but limited in size diff --git a/docs/src/api/context.md b/docs/src/api/context.md new file mode 100644 index 00000000..19ad45a3 --- /dev/null +++ b/docs/src/api/context.md @@ -0,0 +1,123 @@ +# Context and Device Management + +This page documents the API for managing Level Zero drivers, devices, and contexts in oneAPI.jl. + +## Overview + +oneAPI.jl uses task-local state to manage GPU resources. This allows different Julia tasks to +work with different drivers, devices, or contexts without interfering with each other. + +The typical hierarchy is: +- **Driver**: Represents a Level Zero driver (usually one per GPU vendor/installation) +- **Device**: Represents a physical GPU device +- **Context**: Manages resources like memory allocations and command queues + +## Driver Management + +### `driver() -> ZeDriver` + +Get the current Level Zero driver for the calling task. If no driver has been explicitly +set with `driver!`, returns the first available driver. The driver selection is task-local. + +### `driver!(drv::ZeDriver)` + +Set the current Level Zero driver for the calling task. This also clears the current +device selection, as devices are associated with specific drivers. + +### `drivers() -> Vector{ZeDriver}` + +Return a list of all available Level Zero drivers. + +## Device Management + +### `device() -> ZeDevice` + +Get the current Level Zero device for the calling task. If no device has been explicitly +set with `device!`, returns the first available device for the current driver. The device +selection is task-local. + +### `device!(dev::ZeDevice)` / `device!(i::Int)` + +Set the current Level Zero device for the calling task. Can pass either a device object or +a 1-based device index. + +### `devices() -> Vector{ZeDevice}` / `devices(drv::ZeDriver)` + +Return a list of available Level Zero devices. Without arguments, returns devices for +the current driver. + +## Context Management + +### `context() -> ZeContext` + +Get the current Level Zero context for the calling task. 
If no context has been explicitly
+set with `context!`, returns a global context for the current driver. Contexts manage the
+lifetime of resources like memory allocations and command queues.
+
+### `context!(ctx::ZeContext)`
+
+Set the current Level Zero context for the calling task.
+
+## Command Queues
+
+### `global_queue(ctx::ZeContext, dev::ZeDevice) -> ZeCommandQueue`
+
+Get the global command queue for the given context and device. This queue is used as the
+default queue for executing operations. The queue is created with in-order execution flags.
+
+### `synchronize()`
+
+Block the host thread until all operations on the global command queue for the current
+context and device have completed.
+
+
+## Example Workflow
+
+```julia
+using oneAPI
+
+# List available drivers
+drv_list = drivers()
+println("Available drivers: ", length(drv_list))
+
+# Select a specific driver
+driver!(drv_list[1])
+
+# List devices for current driver
+dev_list = devices()
+println("Available devices: ", length(dev_list))
+
+# Select a specific device
+device!(dev_list[1])
+
+# Get the current context (created automatically)
+ctx = context()
+
+# Perform GPU operations...
+a = oneArray(rand(Float32, 100))
+
+# Wait for all operations to complete
+synchronize()
+```
+
+## Multi-Device Programming
+
+You can use different devices in different Julia tasks:
+
+```julia
+using oneAPI
+
+# Task 1: Use first device
+Threads.@spawn begin
+    device!(1)
+    a = oneArray(rand(Float32, 100))
+    # ... operations on device 1 ...
+end
+
+# Task 2: Use second device
+Threads.@spawn begin
+    device!(2)
+    b = oneArray(rand(Float32, 100))
+    # ... operations on device 2 ...
+end
+```
diff --git a/docs/src/api/kernels.md b/docs/src/api/kernels.md
new file mode 100644
index 00000000..5040adf2
--- /dev/null
+++ b/docs/src/api/kernels.md
@@ -0,0 +1,297 @@
+# Kernel Programming
+
+This page documents the kernel programming API for writing custom GPU kernels in oneAPI.jl.
+
+## Kernel Launch
+
+### `@oneapi [kwargs...] kernel(args...)`
+
+High-level interface for launching Julia kernels on Intel GPUs using oneAPI.
+
+This macro compiles a Julia function to SPIR-V, prepares the arguments, and optionally
+launches the kernel on the GPU.
+
+**Keyword Arguments:**
+
+**Macro Keywords (compile-time):**
+- `launch::Bool=true`: Whether to launch the kernel immediately
+
+**Compiler Keywords:**
+- `kernel::Bool=false`: Whether to compile as a kernel or device function
+- `name::Union{String,Nothing}=nothing`: Explicit name for the kernel
+- `always_inline::Bool=false`: Whether to always inline device functions
+
+**Launch Keywords (runtime):**
+- `groups`: Number of workgroups (defaults to 1). Can be an integer or tuple.
+- `items`: Number of work-items per workgroup (defaults to 1). Can be an integer or tuple.
+- `queue::ZeCommandQueue`: Command queue to submit to (defaults to global queue).
+
+### `zefunction(f, tt; kwargs...)`
+
+Compile a Julia function to a Level Zero kernel function. This is the lower-level interface
+used by `@oneapi`. Returns a callable kernel object.
+
+### `kernel_convert(x)`
+
+Convert arguments for kernel execution. This function is called for every argument passed to
+a kernel, allowing customization of argument conversion. By default, it converts `oneArray`
+to `oneDeviceArray`.
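+
+As a quick illustration of that default conversion (a sketch; it assumes `kernel_convert` is callable from host code as described above):
+
+```julia
+using oneAPI
+
+a = oneArray(zeros(Float32, 4))
+
+# At launch time every argument goes through kernel_convert; for a oneArray
+# this should yield its device-side counterpart.
+da = oneAPI.kernel_convert(a)
+@assert da isa oneDeviceArray
+```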
+
+
+## Basic Kernel Example
+
+```julia
+using oneAPI
+
+function vadd_kernel!(a, b, c)
+    i = get_global_id()
+    if i <= length(a)
+        @inbounds c[i] = a[i] + b[i]
+    end
+    return
+end
+
+N = 1024
+a = oneArray(rand(Float32, N))
+b = oneArray(rand(Float32, N))
+c = similar(a)
+
+# Launch with 4 workgroups of 256 work-items each
+@oneapi groups=4 items=256 vadd_kernel!(a, b, c)
+```
+
+## Launch Configuration
+
+### Workgroups and Work-Items
+
+The oneAPI execution model is based on:
+- **Work-items**: Individual threads of execution (analogous to CUDA threads)
+- **Workgroups**: Groups of work-items that can synchronize and share local memory (analogous to CUDA blocks)
+
+```julia
+# 1D configuration
+@oneapi groups=10 items=64 kernel(args...)  # 640 work-items total
+
+# 2D configuration
+@oneapi groups=(10, 10) items=(8, 8) kernel(args...)  # 6400 work-items total
+
+# 3D configuration
+@oneapi groups=(4, 4, 4) items=(4, 4, 4) kernel(args...)  # 4096 work-items total
+```
+
+### Determining Launch Configuration
+
+```julia
+# For simple element-wise operations
+N = length(array)
+items = 256             # Typical workgroup size
+groups = cld(N, items)  # Ceiling division
+
+@oneapi groups=groups items=items kernel(array)
+```
+
+### Compile Without Launch
+
+You can compile a kernel without launching it:
+
+```julia
+# Compile the kernel
+kernel = @oneapi launch=false vadd_kernel!(a, b, c)
+
+# Launch later with different configurations
+kernel(a, b, c; groups=4, items=256)
+kernel(a, b, c; groups=8, items=128)
+```
+
+## Device Intrinsics
+
+Inside GPU kernels, you can use various intrinsics to query execution context and synchronize work-items.
+
+### Thread Indexing
+
+Work-item IDs and dimension arguments are 1-based, following Julia conventions (dim 1 = x, 2 = y, 3 = z):
+
+```julia
+# Global ID (unique across all work-items)
+i = get_global_id()   # Linear index (defaults to dim 1)
+i = get_global_id(1)  # X dimension
+j = get_global_id(2)  # Y dimension
+k = get_global_id(3)  # Z dimension
+
+# Local ID (within workgroup)
+local_i = get_local_id()   # Linear index (defaults to dim 1)
+local_i = get_local_id(1)  # X dimension
+local_j = get_local_id(2)  # Y dimension
+local_k = get_local_id(3)  # Z dimension
+
+# Workgroup ID
+group_i = get_group_id(1)  # X dimension
+group_j = get_group_id(2)  # Y dimension
+group_k = get_group_id(3)  # Z dimension
+
+# Workgroup size
+local_size = get_local_size()    # Work-items per workgroup (dim 1)
+local_size_x = get_local_size(1)
+local_size_y = get_local_size(2)
+
+# Global size
+global_size = get_global_size()  # Work-items in total (dim 1)
+global_size_x = get_global_size(1)
+```
+
+### 2D Matrix Example
+
+```julia
+function matmul_kernel!(C, A, B)
+    # Get 2D indices
+    row = get_global_id(1)
+    col = get_global_id(2)
+
+    if row <= size(C, 1) && col <= size(C, 2)
+        sum = 0.0f0
+        for k in 1:size(A, 2)
+            @inbounds sum += A[row, k] * B[k, col]
+        end
+        @inbounds C[row, col] = sum
+    end
+    return
+end
+
+M, N, K = 256, 256, 256
+A = oneArray(rand(Float32, M, K))
+B = oneArray(rand(Float32, K, N))
+C = oneArray{Float32}(undef, M, N)
+
+# Launch with 2D configuration
+items = (16, 16)  # 16x16 work-items per workgroup
+groups = (cld(M, items[1]), cld(N, items[2]))
+
+@oneapi groups=groups items=items matmul_kernel!(C, A, B)
+```
+
+### Synchronization
+
+```julia
+# Barrier: synchronize all work-items in a workgroup
+barrier()
+
+# Memory fences (ensure memory operations are visible)
+mem_fence()         # Both local and global memory
+local_mem_fence()   # Local memory only
+global_mem_fence()  # Global memory only
+```
+
+### Local Memory
+
+Local memory (workgroup-shared memory) enables cooperation between work-items:
+
+```julia
+function optimized_reduction!(result, input)
+    local_id = get_local_id()
+    local_size = get_local_size()
+
+    # Allocate local memory (shared within workgroup)
+    local_data = oneLocalArray(Float32, 256)
+
+    # Load into local memory
+    @inbounds local_data[local_id] = input[get_global_id()]
+    barrier()
+
+    # Tree reduction in local memory
+    stride = local_size ÷ 2
+    while stride > 0
+        if local_id <= stride
+            @inbounds local_data[local_id] += local_data[local_id + stride]
+        end
+        barrier()
+        stride ÷= 2
+    end
+
+    # First work-item writes result
+    if local_id == 1
+        @inbounds result[get_group_id()] = local_data[1]
+    end
+    return
+end
+```
+
+### Atomic Operations
+
+For thread-safe operations on shared data:
+
+```julia
+# Atomic add
+oneAPI.atomic_add!(ptr, value)
+
+# Atomic exchange
+old_value = oneAPI.atomic_xchg!(ptr, new_value)
+
+# Atomic compare-and-swap
+old_value = oneAPI.atomic_cmpxchg!(ptr, compare, new_value)
+
+# Atomic min/max
+oneAPI.atomic_min!(ptr, value)
+oneAPI.atomic_max!(ptr, value)
+```
+
+Example histogram kernel:
+
+```julia
+function histogram_kernel!(hist, data, bins)
+    i = get_global_id()
+    if i <= length(data)
+        @inbounds val = data[i]
+        bin = clamp(floor(Int, val * bins) + 1, 1, bins)
+        # assumes hist stores Int32 counts
+        oneAPI.atomic_add!(pointer(hist, bin), Int32(1))
+    end
+    return
+end
+```
+
+## Kernel Restrictions
+
+GPU kernels have certain restrictions:
+
+1. **Must return `nothing`**: Kernels cannot return values directly. Use output arrays instead.
+2. **No dynamic memory allocation**: Cannot allocate arrays inside kernels
+3. **No I/O operations**: Cannot print or write to files (use printf-style debugging with care)
+4. **Limited recursion**: Avoid or minimize recursive calls
+5. **Type stability**: Ensure type-stable code for best performance
+
+```julia
+# ❌ Bad: Returns a value
+function bad_kernel(a)
+    return a[1] + 1
+end
+
+# ✅ Good: Returns nothing, uses output parameter
+function good_kernel!(result, a)
+    @inbounds result[1] = a[1] + 1
+    return
+end
+```
+
+## KernelAbstractions.jl
+
+For portable GPU programming across CUDA, AMD, and Intel GPUs, use KernelAbstractions.jl:
+
+```julia
+using KernelAbstractions
+using oneAPI
+
+@kernel function generic_kernel!(a, b)
+    i = @index(Global)
+    @inbounds a[i] = a[i] + b[i]
+end
+
+a = oneArray(rand(Float32, 100))
+b = oneArray(rand(Float32, 100))
+
+backend = get_backend(a)  # oneAPIBackend()
+kernel! = generic_kernel!(backend)
+kernel!(a, b, ndrange=length(a))
+```
+
+See the [KernelAbstractions.jl documentation](https://juliagpu.github.io/KernelAbstractions.jl/stable/) for more details.
+
+## Debugging Kernels
+
+See the [Compiler and Reflection](@ref) page for tools to inspect generated code and debug kernels.
diff --git a/docs/src/api/memory.md b/docs/src/api/memory.md
new file mode 100644
index 00000000..8cf72965
--- /dev/null
+++ b/docs/src/api/memory.md
@@ -0,0 +1,359 @@
+# Memory Management
+
+This page documents memory management in oneAPI.jl.
+
+## Memory Operations
+
+### `Base.unsafe_copyto!(ctx::ZeContext, dev::ZeDevice, dst, src, N)`
+
+Low-level memory copy operation on the GPU. Copies `N` elements from `src` to `dst` using
+the specified context and device. Both `src` and `dst` can be either host pointers (`Ptr`)
+or device pointers (`ZePtr`).
+
+!!! warning
+    This is a low-level function. No bounds checking is performed. For safe array copying,
+    use `copyto!` on `oneArray` objects instead.
+
+### `unsafe_fill!(ctx::ZeContext, dev::ZeDevice, ptr, pattern, N)`
+
+Low-level memory fill operation on the GPU.
Fills `N` elements at `ptr` with the given pattern +using the specified context and device. + +!!! warning + This is a low-level function. For safe array operations, use `fill!` on `oneArray` + objects instead. + + +## Memory Types + +oneAPI supports three types of memory through Unified Shared Memory (USM): + +### Device Memory (Default) + +Fastest GPU access, not directly accessible from CPU. + +```julia +# Create array in device memory (default) +a = oneArray{Float32}(undef, 1000) +@assert is_device(a) + +# Or explicitly specify +b = oneArray{Float32,1,oneL0.DeviceBuffer}(undef, 1000) +``` + +**Advantages:** +- Fastest GPU access +- Best for compute-intensive operations + +**Disadvantages:** +- Cannot directly access from CPU +- Requires explicit copy to/from CPU + +**Use when:** Data stays on GPU for multiple operations + +### Shared Memory + +Accessible from both CPU and GPU with automatic migration. + +```julia +# Create array in shared memory +a = oneArray{Float32,1,oneL0.SharedBuffer}(undef, 1000) +@assert is_shared(a) + +# Can access from CPU +a[1] = 42.0f0 # Automatic migration to CPU +println(a[1]) # Read from CPU + +# Can use in GPU kernels +@oneapi groups=1 items=1000 kernel(a) # Automatic migration to GPU +``` + +**Advantages:** +- Accessible from both CPU and GPU +- Unified virtual addressing +- Automatic migration + +**Disadvantages:** +- Migration overhead +- Slower than device memory for pure GPU work + +**Use when:** Frequent CPU-GPU data exchange needed + +### Host Memory + +CPU memory that's pinned and visible to GPU. + +```julia +# Create array in host memory +a = oneArray{Float32,1,oneL0.HostBuffer}(undef, 1000) +@assert is_host(a) + +# Direct CPU access +a[1] = 42.0f0 + +# Can be used by GPU (but slower than device memory) +@oneapi groups=1 items=1000 kernel(a) +``` + +**Advantages:** +- Direct CPU access +- Pinned memory (faster PCIe transfers) +- Good for staging + +**Disadvantages:** +- Slower GPU access than device memory +- Uses pinned system memory (limited resource) + +**Use when:** Staging data for transfer, or CPU needs to write while GPU reads + +## Memory Type Comparison + +| Feature | Device | Shared | Host | +|---------|--------|--------|------| +| CPU Access | ❌ No | ✅ Yes | ✅ Yes | +| GPU Performance | ⭐⭐⭐ Fastest | ⭐⭐ Good | ⭐ Slower | +| Migration | Manual | Automatic | Manual | +| Use Case | Pure GPU | Mixed CPU/GPU | Staging | + +## Memory Allocation and Deallocation + +### Automatic Management + +Julia's garbage collector automatically manages `oneArray` memory: + +```julia +function allocate_and_compute() + a = oneArray(rand(Float32, 1000)) + b = oneArray(rand(Float32, 1000)) + c = a .+ b + return Array(c) # Only c is copied back + # a and b will be garbage collected +end + +result = allocate_and_compute() +# GPU memory for a and b is freed eventually +``` + +### Manual Garbage Collection + +Force garbage collection to free GPU memory: + +```julia +# Allocate large arrays +a = oneArray(rand(Float32, 10_000_000)) +b = oneArray(rand(Float32, 10_000_000)) + +# Clear references +a = nothing +b = nothing + +# Force GC to reclaim GPU memory +GC.gc() +``` + +### Explicit Freeing + +Immediately free GPU memory (use with caution): + +```julia +a = oneArray(rand(Float32, 1000)) +# ... use a ... + +# Explicitly free (dangerous if still in use!) +unsafe_free!(a) + +# a is now invalid - do not use! +``` + +!!! warning + Only use `unsafe_free!` when you're certain the array is no longer needed, including + by any pending GPU operations. 
Prefer letting the GC handle cleanup.
+
+### Scoped Temporaries
+
+Use a scoped block to free temporaries promptly:
+
+```julia
+result = let temp = oneArray{Float32}(undef, 1000)
+    temp .= 1.0f0
+    s = sum(temp)
+    oneAPI.unsafe_free!(temp)  # free eagerly once no longer needed
+    s
+end
+```
+
+## Memory Pooling
+
+oneAPI.jl uses memory pooling to reduce allocation overhead:
+
+```julia
+using oneAPI
+
+# Allocations are pooled
+for i in 1:100
+    a = oneArray(rand(Float32, 1000))
+    # ... use a ...
+    # Memory is returned to pool, not freed
+end
+```
+
+The pool automatically manages memory reuse, reducing allocation costs.
+
+## Checking Memory Usage
+
+Query GPU memory info:
+
+```julia
+using oneAPI.oneL0
+
+dev = device()
+props = memory_properties(dev)
+
+for prop in props
+    println("Memory size: ", prop.totalSize ÷ (1024^3), " GB")
+end
+```
+
+## Out of Memory Errors
+
+If you encounter out-of-memory errors:
+
+### 1. Reduce Batch Size
+
+```julia
+# Instead of processing all at once
+result = process(oneArray(huge_data))
+
+# Process in smaller batches
+for batch in batches(huge_data, size=1000)
+    result = process(oneArray(batch))
+    # Process result...
+end
+```
+
+### 2. Free Unused Arrays
+
+```julia
+a = oneArray(rand(Float32, 1_000_000))
+b = compute(a)
+
+# If 'a' is no longer needed
+unsafe_free!(a)
+
+# Continue with 'b'
+result = process(b)
+```
+
+### 3. Use Shared or Host Memory
+
+```julia
+# Instead of device memory
+a = oneArray{Float32}(undef, huge_size)
+
+# Use shared memory (can swap to system RAM)
+a = oneArray{Float32,1,oneL0.SharedBuffer}(undef, huge_size)
+```
+
+### 4. Force Garbage Collection
+
+```julia
+# After freeing references
+large_array = nothing
+GC.gc()  # Immediately reclaim GPU memory
+```
+
+### 5. Use Multiple Devices
+
+```julia
+# Distribute work across devices
+for (i, dev) in enumerate(devices())
+    Threads.@spawn begin
+        device!(dev)
+        partition = data_partitions[i]
+        a = oneArray(partition)
+        result = compute(a)
+        # ...
+    end
+end
+```
+
+## Low-Level Memory Operations
+
+For advanced users, oneL0 provides direct memory management:
+
+```julia
+using oneAPI.oneL0
+
+ctx = context()
+dev = device()
+
+# Allocate device memory
+ptr = device_alloc(ctx, dev, 1024, 8)  # 1024 bytes, 8-byte aligned
+
+# Copy data
+data = rand(Float32, 256)
+GC.@preserve data begin
+    unsafe_copyto!(ctx, dev, ptr, pointer(data), 256)
+end
+
+# Free memory
+free(ctx, ptr)
+```
+
+## Memory Advise and Prefetch
+
+Hint to the runtime about memory usage (shared memory only):
+
+```julia
+using oneAPI.oneL0
+
+a = oneArray{Float32,1,oneL0.SharedBuffer}(undef, 1000)
+
+# Advise that this will be read-only on the device
+# (Implementation depends on Level Zero driver support)
+
+# Prefetch to device
+ctx = context()
+dev = device()
+queue = global_queue(ctx, dev)
+
+execute!(queue) do list
+    append_prefetch!(list, pointer(a), sizeof(a))
+end
+```
+
+## Best Practices
+
+1. **Use device memory by default** for best GPU performance
+2. **Use shared memory** when you need CPU access without explicit copies
+3. **Use host memory** for staging data or when CPU writes frequently
+4. **Let GC handle cleanup** unless you have specific memory pressure
+5. **Reuse allocations** within loops when possible
+6. **Profile memory usage** to identify bottlenecks
+7. **Be cautious with `unsafe_free!`** - use only when you're certain it's safe
+
+## Example: Efficient Memory Usage
+
+```julia
+using oneAPI
+
+function efficient_pipeline(data_batches)
+    # Allocate the output buffer once and reuse it across batches
+    result = oneArray{Float32}(undef, 1000)
+    results = Float32[]
+
+    for batch in data_batches
+        # Upload the next batch
+        input = oneArray(batch)
+
+        # Compute in-place when possible
+        @oneapi groups=4 items=250 process_kernel!(result, input)
+
+        # Copy result back
+        append!(results, Array(result))
+
+        # The input buffer becomes garbage after this iteration
+    end
+
+    return results
+end
+```
diff --git a/docs/src/arrays.md b/docs/src/arrays.md
new file mode 100644
index 00000000..5ecad50c
--- /dev/null
+++ b/docs/src/arrays.md
@@ -0,0 +1,64 @@
+# Array Programming
+
+oneAPI.jl provides an array type, `oneArray`, which lives on the GPU. It implements the interface defined by `GPUArrays.jl`, allowing for high-level array operations.
+
+## The `oneArray` Type
+
+The `oneArray{T,N}` type represents an N-dimensional array with elements of type `T` stored on the GPU.
+
+```julia
+using oneAPI
+
+# Allocate an uninitialized array
+a = oneArray{Float32}(undef, 1024)
+
+# Initialize from a CPU array
+b = oneArray([1, 2, 3, 4])
+
+# Initialize with zeros/ones
+z = oneAPI.zeros(Float32, 100)
+o = oneAPI.ones(Float32, 100)
+```
+
+## Array Operations
+
+Since `oneArray` implements the AbstractArray interface, you can use standard Julia array operations.
+
+```julia
+a = oneArray(rand(Float32, 10))
+b = oneArray(rand(Float32, 10))
+
+c = a .+ b       # Element-wise addition
+d = sum(a)       # Reduction
+e = map(sin, a)  # Map
+```
+
+## Data Transfer
+
+To move data between the host (CPU) and the device (GPU), use the constructors or `copyto!`.
+
+```julia
+# CPU to GPU
+d_a = oneArray(h_a)
+
+# GPU to CPU
+h_a = Array(d_a)
+```
+
+## Backend Agnostic Programming
+
+To write code that works on both CPU and GPU (and other backends like CUDA), use the generic array interfaces provided by `GPUArrays.jl`. Avoid hardcoding `oneArray` in your functions; instead, accept `AbstractArray` and let the dispatch system handle the specific implementation.
+
+```julia
+function generic_add!(a::AbstractArray, b::AbstractArray)
+    a .+= b
+    return a
+end
+
+# Works on CPU
+generic_add!(rand(Float32, 10), rand(Float32, 10))
+
+# Works on Intel GPU
+generic_add!(oneArray(rand(Float32, 10)), oneArray(rand(Float32, 10)))
+```
+
diff --git a/docs/src/device.md b/docs/src/device.md
new file mode 100644
index 00000000..acdb4d6e
--- /dev/null
+++ b/docs/src/device.md
@@ -0,0 +1,40 @@
+# Device Intrinsics
+
+When writing custom kernels, you have access to a set of device intrinsics that map to underlying hardware instructions.
+
+## Indexing
+
+These functions allow you to determine the current thread's position in the execution grid. Indices and dimension arguments are 1-based, following Julia conventions (dim 1 = x, 2 = y, 3 = z).
+
+- `get_global_id(dim=1)`: Global index of the work item.
+- `get_local_id(dim=1)`: Local index of the work item within the workgroup.
+- `get_group_id(dim=1)`: Index of the workgroup.
+- `get_global_size(dim=1)`: Global size of the ND-range.
+- `get_local_size(dim=1)`: Size of the workgroup.
+- `get_num_groups(dim=1)`: Number of workgroups.
+
+## Synchronization
+
+- `barrier(flags=0)`: Synchronizes all work items in a workgroup.
+
+## Atomics
+
+Atomic operations are supported for thread-safe updates to memory.
+
+- `atomic_add!(ptr, val)`
+- `atomic_sub!(ptr, val)`
+- `atomic_inc!(ptr)`
+- `atomic_dec!(ptr)`
+- `atomic_min!(ptr, val)`
+- `atomic_max!(ptr, val)`
+- `atomic_and!(ptr, val)`
+- `atomic_or!(ptr, val)`
+- `atomic_xor!(ptr, val)`
+- `atomic_cmpxchg!(ptr, cmp, val)`
+
+32-bit integer atomics (`Int32`, `UInt32`) are generally supported; 64-bit integer and floating-point atomics depend on the capabilities of the device and driver.
+
+## Math Functions
+
+Standard math functions from Julia's `Base` are supported within kernels (e.g., `sin`, `cos`, `exp`, `sqrt`).
+
diff --git a/docs/src/getting_started.md b/docs/src/getting_started.md
new file mode 100644
index 00000000..595e7ca5
--- /dev/null
+++ b/docs/src/getting_started.md
@@ -0,0 +1,58 @@
+# Getting Started
+
+## Basic Usage
+
+The most basic usage involves moving data to the GPU using `oneArray` and performing operations on it.
+
+```julia
+using oneAPI
+
+# Create an array on the CPU
+a = rand(Float32, 1024)
+
+# Move it to the GPU
+d_a = oneArray(a)
+
+# Perform operations on the GPU
+d_b = d_a .+ 1.0f0
+
+# Move the result back to the CPU
+b = Array(d_b)
+```
+
+## Matrix Multiplication
+
+Matrix multiplication is accelerated using the oneMKL library when available.
+
+```julia
+using oneAPI
+
+A = oneArray(rand(Float32, 128, 128))
+B = oneArray(rand(Float32, 128, 128))
+
+# This operation runs on the GPU
+C = A * B
+```
+
+## Writing Kernels
+
+For custom operations, you can write kernels using the `@oneapi` macro.
+
+```julia
+using oneAPI
+
+function my_kernel(a, b)
+    i = get_global_id()
+    @inbounds a[i] += b[i]
+    return
+end
+
+a = oneArray(ones(Float32, 1024))
+b = oneArray(ones(Float32, 1024))
+
+# Launch the kernel with 1024 items
+@oneapi items=1024 my_kernel(a, b)
+```
+
+See the [Kernel Programming](kernels.md) section for more details.
+
diff --git a/docs/src/index.md b/docs/src/index.md
new file mode 100644
index 00000000..988d9e92
--- /dev/null
+++ b/docs/src/index.md
@@ -0,0 +1,29 @@
+# oneAPI.jl
+
+*Julia support for the oneAPI programming toolkit.*
+
+oneAPI.jl provides support for working with the [oneAPI unified programming model](https://software.intel.com/en-us/oneapi). The package is currently verified to work with the implementation provided by the [Intel Compute Runtime](https://github.com/intel/compute-runtime), primarily on Linux.
+
+## Writing Portable Code
+
+While oneAPI.jl provides specific functionality for Intel GPUs, it is highly recommended to write **backend-agnostic code** whenever possible. This allows your code to run on various hardware backends (NVIDIA, AMD, Intel, Apple) without modification.
+
+- **[GPUArrays.jl](https://github.com/JuliaGPU/GPUArrays.jl)**: Use high-level array abstractions that work across different GPU backends.
+- **[KernelAbstractions.jl](https://github.com/JuliaGPU/KernelAbstractions.jl)**: Use this package for writing kernels that can be compiled for CPU, CUDA, ROCm, and oneAPI devices.
+
+Direct use of `oneAPI`-specific macros (like `@oneapi`) and types (like `oneArray`) should be reserved for cases where you need specific optimizations or features not covered by the generic abstractions.
+
+## Features
+
+- **High-level Array Abstractions**: `oneArray` type fully implementing the `GPUArrays.jl` interface.
+- **Kernel Programming**: Execute custom kernels written in Julia on Intel GPUs.
+- **Level Zero Integration**: Low-level access to the Level Zero API via the `oneL0` submodule.
+- **oneMKL Support**: Integration with Intel oneMKL for BLAS, LAPACK, and sparse operations.
+- **SYCL Integration**: Interoperability with SYCL (on Linux). + +## Requirements + +- **Julia**: 1.10 or higher +- **OS**: Linux +- **Hardware**: Intel Gen9 graphics or newer (including Intel Arc A-Series) + diff --git a/docs/src/installation.md b/docs/src/installation.md new file mode 100644 index 00000000..59cf36ae --- /dev/null +++ b/docs/src/installation.md @@ -0,0 +1,127 @@ +# Installation + +## Requirements + +oneAPI.jl requires: +- **Julia**: 1.10 or higher +- **OS**: Linux (recommended) or Windows (experimental via WSL2) +- **Hardware**: Intel Gen9 graphics or newer. For Intel Arc GPUs (A580, A750, A770, etc), **Linux 6.2+** is required. + +## Installing oneAPI.jl + +You can install oneAPI.jl using the Julia package manager: + +```julia +pkg> add oneAPI +``` + +This will automatically download the necessary binary dependencies, including: +- `oneAPI loader` +- `SPIR-V tools` +- `Intel Compute Runtime` (if compatible hardware is found) + +## Verifying Installation + +After installation, you can verify that oneAPI.jl is working correctly and detecting your hardware: + +```julia +julia> using oneAPI +julia> oneAPI.versioninfo() +``` + +The output should list the binary dependencies, toolchain versions, available drivers, and devices. + +## Troubleshooting Drivers + +If no drivers or devices are detected, ensure that you have the correct Intel graphics drivers installed for your system. +- On Linux, check if `libze_intel_gpu.so` or similar libraries are available. +- On Windows (WSL2), ensure you have the latest Intel graphics drivers installed on the host Windows system and that WSL2 is configured to access the GPU. + +You can explicitly select drivers and devices if multiple are available: + +```julia +julia> drivers() +julia> devices() +julia> device!(1) # Select the first available device +``` + +## Using System Libraries (Advanced) + +!!! warning + Using system libraries instead of the provided artifacts is **not recommended** for most users. Only use this approach if you have specialized requirements or custom Intel binaries. + +By default, oneAPI.jl uses pre-built binary artifacts (JLLs) for the Intel Compute Runtime, oneAPI loader, and related libraries. However, you may need to use system-installed libraries in certain situations: + +- Custom or newer Intel graphics drivers +- Specialized hardware configurations +- Development or debugging of the runtime stack +- Systems where the artifacts are incompatible + +### Configuration Script + +oneAPI.jl provides a helper script to discover and configure system libraries. From the Julia REPL: + +```julia +julia> include(joinpath(pkgdir(oneAPI), "res", "local.jl")) +``` + +This script will: +1. Search for Intel libraries on your system: + - Intel Graphics Compiler (IGC): `libigc`, `libiga64`, `libigdfcl`, `libopencl-clang` + - Graphics Memory Management Library: `libigdgmm` + - Intel Compute Runtime (NEO): `libze_intel_gpu`, `libigdrcl` + - oneAPI Level Zero Loader: `libze_loader`, `libze_validation_layer` + +2. Generate preferences in `LocalPreferences.toml` that override the artifact paths + +### Manual Configuration + +You can also manually set preferences to use specific library paths. 
Create or edit `LocalPreferences.toml` in your project or global environment: + +```toml +[NEO_jll] +libze_intel_gpu_path = "/usr/lib/x86_64-linux-gnu/libze_intel_gpu.so.1" +libigdrcl_path = "/usr/lib/x86_64-linux-gnu/intel-opencl/libigdrcl.so" + +[libigc_jll] +libigc_path = "/usr/lib/x86_64-linux-gnu/libigc.so" +libigdfcl_path = "/usr/lib/x86_64-linux-gnu/libigdfcl.so" + +[gmmlib_jll] +libigdgmm_path = "/usr/lib/x86_64-linux-gnu/libigdgmm.so" + +[oneAPI_Level_Zero_Loader_jll] +libze_loader_path = "/usr/lib/x86_64-linux-gnu/libze_loader.so" +``` + +### Reverting to Artifacts + +To revert to the default artifact binaries, simply delete the oneAPI-related entries from `LocalPreferences.toml` (or delete the entire file if it only contains these preferences). + +### Common Locations + +System libraries are typically installed in: + +**Ubuntu/Debian:** +- `/usr/lib/x86_64-linux-gnu/` +- `/usr/lib/x86_64-linux-gnu/intel-opencl/` + +**Fedora/RHEL:** +- `/usr/lib64/` +- `/usr/lib64/intel-opencl/` + +**Custom Intel oneAPI installation:** +- `/opt/intel/oneapi/compiler/latest/linux/lib/` +- `/opt/intel/oneapi/compiler/latest/linux/lib/x64/` + +### Verifying System Library Configuration + +After configuring system libraries, restart Julia and verify the configuration: + +```julia +julia> using oneAPI +julia> oneAPI.versioninfo() +``` + +Check that the reported library paths match your system libraries. If issues arise, examine the `LocalPreferences.toml` file and ensure all paths are correct and the libraries are compatible with each other. + diff --git a/docs/src/kernels.md b/docs/src/kernels.md new file mode 100644 index 00000000..b5e03b12 --- /dev/null +++ b/docs/src/kernels.md @@ -0,0 +1,61 @@ +# Kernel Programming + +For maximum performance or custom operations not covered by high-level array abstractions, you can write custom kernels in Julia that execute on the GPU. + +## The `@oneapi` Macro + +The `@oneapi` macro is used to launch a kernel on the device. It takes configuration arguments like the number of items (threads) and groups (blocks). + +```julia +using oneAPI + +function kernel(a, b) + i = get_global_id() + if i <= length(a) + @inbounds a[i] += b[i] + end + return +end + +a = oneArray(rand(Float32, 100)) +b = oneArray(rand(Float32, 100)) + +# Launch configuration +items = 100 +groups = 1 + +@oneapi items=items groups=groups kernel(a, b) +``` + +## KernelAbstractions.jl + +For portable kernel programming, it is highly recommended to use [KernelAbstractions.jl](https://github.com/JuliaGPU/KernelAbstractions.jl). This allows you to write kernels that work on CPU, CUDA, ROCm, and oneAPI. + +```julia +using KernelAbstractions, oneAPI + +@kernel function my_kernel!(a, b) + i = @index(Global, Linear) + @inbounds a[i] += b[i] +end + +# Get the backend +backend = get_backend(a) + +# Instantiate the kernel +k = my_kernel!(backend) + +# Launch with configuration +k(a, b; ndrange=length(a)) +``` + +## Device Intrinsics + +Inside a kernel, you can use various intrinsics to interact with the hardware: +- `get_global_id()`: Get the global thread ID. +- `get_local_id()`: Get the local thread ID within a workgroup. +- `get_group_id()`: Get the workgroup ID. +- `barrier()`: Synchronize threads within a workgroup. + +These correspond to standard OpenCL/Level Zero intrinsics. 
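+
+As a small illustration of these intrinsics, the following kernel records each work-item's workgroup and local index (a sketch; indices are 1-based, following Julia conventions):
+
+```julia
+using oneAPI
+
+# Record each work-item's workgroup and local index.
+function where_am_i!(groups_out, locals_out)
+    i = get_global_id()
+    @inbounds groups_out[i] = get_group_id()
+    @inbounds locals_out[i] = get_local_id()
+    return
+end
+
+groups_out = oneArray{Int}(undef, 8)
+locals_out = oneArray{Int}(undef, 8)
+@oneapi groups=2 items=4 where_am_i!(groups_out, locals_out)
+Array(groups_out)  # [1, 1, 1, 1, 2, 2, 2, 2]
+Array(locals_out)  # [1, 2, 3, 4, 1, 2, 3, 4]
+```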
+
diff --git a/docs/src/level_zero.md b/docs/src/level_zero.md
new file mode 100644
index 00000000..f093fcce
--- /dev/null
+++ b/docs/src/level_zero.md
@@ -0,0 +1,49 @@
+# Level Zero Interface
+
+The `oneL0` submodule provides low-level access to the Level Zero API, which gives you fine-grained control over the hardware.
+
+## Drivers and Devices
+
+You can enumerate available drivers and devices:
+
+```julia
+using oneAPI.oneL0
+
+# Get available drivers
+drvs = drivers()
+
+# Get devices for a driver
+devs = devices(first(drvs))
+
+# Inspect device properties
+props = compute_properties(first(devs))
+println("Max workgroup size: ", props.maxTotalGroupSize)
+```
+
+## Contexts and Queues
+
+Manage contexts and command queues for executing operations:
+
+```julia
+# Create a context
+ctx = ZeContext(first(drvs))
+
+# Create a command queue
+queue = ZeCommandQueue(ctx, first(devs))
+
+# Execute a command list
+execute!(queue) do list
+    append_barrier!(list)
+end
+```
+
+## Memory Operations
+
+You can perform low-level memory operations using command lists:
+
+```julia
+execute!(queue) do list
+    append_copy!(list, dst_ptr, src_ptr, size)
+end
+```
+
diff --git a/docs/src/memory.md b/docs/src/memory.md
new file mode 100644
index 00000000..f97146e8
--- /dev/null
+++ b/docs/src/memory.md
@@ -0,0 +1,51 @@
+# Memory Management
+
+Efficient memory management is crucial for GPU programming. oneAPI.jl provides tools to manage device memory allocation and data transfer.
+
+## Unified Shared Memory (USM)
+
+oneAPI uses Unified Shared Memory, which allows for pointers that can be accessible from both the host and the device, or specific to one.
+
+- **Device Memory**: Accessible only by the device. Fastest access for kernels.
+- **Host Memory**: Accessible by the host and device.
+- **Shared Memory**: Automatically migrated between host and device.
+
+`oneArray` typically uses device memory for performance.
+
+## Allocation
+
+You can perform low-level memory allocation using the `oneL0` submodule if needed, though `oneArray` handles this automatically.
+
+```julia
+using oneAPI.oneL0
+
+# Allocate device memory (1024 bytes, 8-byte aligned)
+ptr = oneL0.device_alloc(context(), device(), 1024, 8)
+
+# Free memory
+oneL0.free(context(), ptr)
+```
+
+## Garbage Collection
+
+Julia's garbage collector automatically manages `oneArray` objects. However, GPU memory is a limited resource. If you are running into out-of-memory errors, you might need to manually trigger garbage collection or free arrays.
+
+```julia
+a = oneArray(rand(Float32, 1024*1024*100))
+a = nothing
+GC.gc() # Reclaim memory
+```
+
+## Explicit Freeing
+
+For immediate memory release, you can use `unsafe_free!`:
+
+```julia
+using oneAPI
+
+a = oneArray(rand(Float32, 1024))
+oneAPI.unsafe_free!(a)
+```
+
+**Warning**: Only use `unsafe_free!` if you are sure the array is no longer used, including by any pending GPU operations.
+
diff --git a/docs/src/onemkl.md b/docs/src/onemkl.md
new file mode 100644
index 00000000..91f2645c
--- /dev/null
+++ b/docs/src/onemkl.md
@@ -0,0 +1,59 @@
+# oneMKL Integration
+
+oneAPI.jl provides bindings to the Intel oneMKL library, enabling high-performance linear algebra operations on Intel GPUs.
+
+## Dense Linear Algebra (BLAS/LAPACK)
+
+Standard BLAS and LAPACK operations are automatically accelerated when using `oneArray`.
+
+```julia
+using oneAPI, LinearAlgebra
+
+A = oneArray(rand(Float32, 100, 100))
+B = oneArray(rand(Float32, 100, 100))
+
+# Matrix multiplication (GEMM)
+C = A * B
+
+# Linear solve (AX = B)
+X = A \ B
+```
+
+## Sparse Linear Algebra
+
+oneAPI.jl supports sparse matrix operations via oneMKL's sparse BLAS functionality. These integrate with Julia's `SparseArrays` standard library.
+
+```julia
+using oneAPI, oneAPI.oneMKL, SparseArrays, LinearAlgebra
+
+# Create a sparse matrix on CPU
+A = sprand(Float32, 100, 100, 0.1)
+
+# Move to GPU (converts to oneMKL format)
+dA = oneMKL.oneSparseMatrixCSC(A)
+
+# Create a dense vector
+x = oneArray(rand(Float32, 100))
+
+# Sparse matrix-vector multiplication
+y = dA * x
+```
+
+Note that `oneSparseMatrixCSC` is available for Compressed Sparse Column format, which is the standard in Julia.
+
+## FFTs
+
+Fast Fourier Transforms are supported through the `AbstractFFTs.jl` interface, backed by oneMKL DFTs.
+
+```julia
+using oneAPI, FFTW
+
+a = oneArray(rand(ComplexF32, 1024))
+
+# Forward FFT
+b = fft(a)
+
+# Inverse FFT
+c = ifft(b)
+```
+
diff --git a/docs/src/troubleshooting.md b/docs/src/troubleshooting.md
new file mode 100644
index 00000000..9f790f65
--- /dev/null
+++ b/docs/src/troubleshooting.md
@@ -0,0 +1,55 @@
+# Troubleshooting
+
+## Common Issues
+
+### No devices detected
+
+**Symptom**: `oneAPI.devices()` returns an empty list.
+
+**Solution**:
+1. Ensure you are running on Linux (recommended) or WSL2.
+2. Check if the Intel Compute Runtime is installed and accessible.
+3. Verify your user has permissions to access the GPU render device (usually the `render` group).
+4. Run `oneAPI.versioninfo()` to see detailed diagnostic information.
+
+### "Double type is not supported"
+
+**Symptom**: Kernel compilation fails with an error about `Float64` or `Double` support.
+
+**Solution**:
+Some Intel GPUs (especially integrated graphics) lack native hardware support for 64-bit floating point operations.
+- Use `Float32` instead of `Float64`.
+- Check support with:
+  ```julia
+  using oneAPI.oneL0
+  oneL0.module_properties(device()).flags & oneL0.ZE_DEVICE_MODULE_FLAG_FP64 != 0
+  ```
+
+### "Out of memory" errors
+
+**Symptom**: Memory allocation fails.
+
+**Solution**:
+- Trigger garbage collection: `GC.gc()`.
+- Manually free unused arrays: `oneAPI.unsafe_free!(array)`.
+- Check if you are exceeding the device's memory capacity.
+
+## Debugging
+
+### Validation Layer
+
+Enable the Level Zero validation layer to catch API misuse:
+
+```bash
+export ZE_ENABLE_VALIDATION_LAYER=1
+export ZE_ENABLE_PARAMETER_VALIDATION=1
+```
+
+### Debug Mode
+
+Enable debug mode in oneAPI.jl to use debug builds of underlying toolchains (if available):
+
+```julia
+oneAPI.set_debug!(true)
+```
+
diff --git a/docs/src/usage/performance.md b/docs/src/usage/performance.md
new file mode 100644
index 00000000..c7064aee
--- /dev/null
+++ b/docs/src/usage/performance.md
@@ -0,0 +1,485 @@
+# Performance Guide
+
+This guide provides tips and techniques for optimizing oneAPI.jl applications.
+
+## Quick Wins
+
+### 1. Use Device Memory
+
+Device memory is fastest for GPU operations:
+
+```julia
+# ✅ Good: Device memory (default)
+a = oneArray{Float32}(undef, 1000)
+
+# ❌ Slower: Shared memory (unless CPU access is needed)
+a = oneArray{Float32,1,oneL0.SharedBuffer}(undef, 1000)
+```
+
+### 2. Minimize Data Transfers
diff --git a/docs/src/troubleshooting.md b/docs/src/troubleshooting.md
new file mode 100644
index 00000000..9f790f65
--- /dev/null
+++ b/docs/src/troubleshooting.md
@@ -0,0 +1,55 @@
+# Troubleshooting
+
+## Common Issues
+
+### No devices detected
+
+**Symptom**: `oneAPI.devices()` returns an empty list.
+
+**Solution**:
+1. Ensure you are running on Linux (recommended) or WSL2.
+2. Check that the Intel Compute Runtime is installed and accessible.
+3. Verify your user has permission to access the GPU render device (usually via the `render` group).
+4. Run `oneAPI.versioninfo()` for detailed diagnostic information.
+
+### "Double type is not supported"
+
+**Symptom**: Kernel compilation fails with an error about `Float64` or `Double` support.
+
+**Solution**:
+Some Intel GPUs (especially integrated graphics) lack native hardware support for 64-bit floating-point operations.
+- Use `Float32` instead of `Float64`.
+- Check support with:
+  ```julia
+  using oneAPI, oneAPI.oneL0
+  props = oneL0.module_properties(device())
+  # FP64 support is reported in the module `flags` field
+  props.flags & oneL0.ZE_DEVICE_MODULE_FLAG_FP64 != 0
+  ```
+
+### "Out of memory" errors
+
+**Symptom**: Memory allocation fails.
+
+**Solution**:
+- Trigger garbage collection: `GC.gc()`.
+- Manually free unused arrays: `oneAPI.unsafe_free!(array)`.
+- Check whether you are exceeding the device's memory capacity.
+
+## Debugging
+
+### Validation Layer
+
+Enable the Level Zero validation layer to catch API misuse:
+
+```bash
+export ZE_ENABLE_VALIDATION_LAYER=1
+export ZE_ENABLE_PARAMETER_VALIDATION=1
+```
+
+### Debug Mode
+
+Enable debug mode in oneAPI.jl to use debug builds of the underlying toolchains (if available):
+
+```julia
+oneAPI.set_debug!(true)
+```
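+
+### Inspecting Generated Code
+
+When a kernel misbehaves, it can help to inspect what the compiler produced. A sketch, assuming the GPUCompiler-style reflection macros are available (`@device_code_warntype` is also used in the performance guide; `@device_code_llvm` is assumed here):
+
+```julia
+using oneAPI
+
+function add_one!(a)
+    i = get_global_id()
+    @inbounds a[i] += 1.0f0
+    return
+end
+
+a = oneArray(zeros(Float32, 16))
+
+# Check inferred types (catches type instabilities)
+@device_code_warntype @oneapi groups=1 items=16 add_one!(a)
+
+# Dump the generated LLVM IR
+@device_code_llvm @oneapi groups=1 items=16 add_one!(a)
+```
+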
diff --git a/docs/src/usage/performance.md b/docs/src/usage/performance.md
new file mode 100644
index 00000000..c7064aee
--- /dev/null
+++ b/docs/src/usage/performance.md
@@ -0,0 +1,485 @@
+# Performance Guide
+
+This guide provides tips and techniques for optimizing oneAPI.jl applications.
+
+## Quick Wins
+
+### 1. Use Device Memory
+
+Device memory is fastest for GPU operations:
+
+```julia
+# ✅ Good: Device memory (default)
+a = oneArray{Float32}(undef, 1000)
+
+# ❌ Slower: Shared memory (unless CPU access is needed)
+a = oneArray{Float32,1,oneL0.SharedBuffer}(undef, 1000)
+```
+
+### 2. Minimize Data Transfers
+
+Keep data on GPU between operations:
+
+```julia
+# ❌ Bad: Unnecessary transfers
+for i in 1:100
+    cpu_data = Array(gpu_array)  # GPU → CPU
+    cpu_data .+= 1
+    gpu_array = oneArray(cpu_data)  # CPU → GPU
+end
+
+# ✅ Good: Keep data on GPU
+for i in 1:100
+    gpu_array .+= 1  # All on GPU
+end
+```
+
+### 3. Use Fused Operations
+
+Broadcasting automatically fuses operations:
+
+```julia
+# ❌ Slower: Multiple kernel launches
+a = oneArray(rand(Float32, 1000))
+b = sin.(a)
+c = b .+ 1.0f0
+d = c .* 2.0f0
+
+# ✅ Faster: Single fused kernel
+d = 2.0f0 .* (sin.(a) .+ 1.0f0)
+```
+
+### 4. Specify Float32
+
+GPUs are typically optimized for single precision:
+
+```julia
+# ❌ Slower: Float64 (if not needed)
+a = oneArray(rand(Float64, 1000))
+
+# ✅ Faster: Float32
+a = oneArray(rand(Float32, 1000))
+```
+
+## Kernel Optimization
+
+### Launch Configuration
+
+Choose appropriate workgroup sizes:
+
+```julia
+# Typical good workgroup sizes
+items = 256  # Common choice, adjust based on hardware
+items = 128  # Try smaller if using lots of local memory
+items = 512  # Try larger for simple kernels
+
+# Calculate groups
+N = length(array)
+groups = cld(N, items)  # Ceiling division
+
+@oneapi groups=groups items=items kernel(array)
+```
+
+### Memory Access Patterns
+
+Coalesced memory access is crucial for performance:
+
+```julia
+# ✅ Good: Coalesced access (consecutive threads access consecutive memory)
+function good_kernel!(output, input)
+    i = get_global_id()
+    @inbounds output[i] = input[i] + 1.0f0
+    return
+end
+
+# ❌ Bad: Strided access (cache inefficient)
+function bad_kernel!(output, input, stride)
+    i = get_global_id()
+    @inbounds output[i] = input[i * stride] + 1.0f0
+    return
+end
+```
+
+### Use Local Memory
+
+Local memory is faster than global memory for data reuse:
+
+```julia
+function optimized_reduction!(result, input)
+    local_id = get_local_id()
+    local_size = get_local_size()
+    group_id = get_group_id()
+
+    # Allocate local memory
+    local_mem = oneLocalArray(Float32, 256)
+
+    # Load global → local (coalesced)
+    global_id = get_global_id()
+    @inbounds local_mem[local_id] = input[global_id]
+    barrier()
+
+    # Reduce in local memory (much faster)
+    stride = local_size ÷ 2
+    while stride > 0
+        if local_id <= stride
+            @inbounds local_mem[local_id] += local_mem[local_id + stride]
+        end
+        barrier()
+        stride ÷= 2
+    end
+
+    # Write result
+    if local_id == 1
+        @inbounds result[group_id] = local_mem[1]
+    end
+    return
+end
+```
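+
+A sketch of how this kernel might be launched — one partial result per workgroup, combined on the host. The 256-element local array above must match `items`, and the input length is assumed to be a multiple of 256:
+
+```julia
+input = oneArray(rand(Float32, 4096))
+groups = cld(length(input), 256)
+partial = oneArray(zeros(Float32, groups))
+
+@oneapi groups=groups items=256 optimized_reduction!(partial, input)
+total = sum(Array(partial))  # combine the per-group results on the CPU
+```
+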
+### Minimize Barriers
+
+Barriers have overhead:
+
+```julia
+# ❌ Bad: Unnecessary barriers
+function wasteful_kernel!(a)
+    i = get_local_id()
+    a[i] += 1
+    barrier()  # Not needed if no data sharing
+    a[i] *= 2
+    barrier()  # Not needed
+    return
+end
+
+# ✅ Good: Barriers only when needed
+function efficient_kernel!(a)
+    i = get_local_id()
+    n = get_local_size()
+    shared = oneLocalArray(Float32, 256)
+
+    # Load to shared memory
+    shared[i] = a[i]
+    barrier()  # Needed: ensure all loads complete
+
+    # Use shared data (guard the upper edge to stay in bounds)
+    if i < n
+        a[i] = shared[i] + shared[i+1]
+    end
+    return
+end
+```
+
+### Avoid Divergence
+
+Minimize thread divergence (different execution paths):
+
+```julia
+# expensive_computation is a placeholder for some costly device function
+
+# ❌ Bad: High divergence
+function divergent_kernel!(a)
+    i = get_global_id()
+    if i % 32 == 0
+        # Only 1 in 32 threads executes this
+        @inbounds a[i] = expensive_computation(a[i])
+    else
+        @inbounds a[i] += 1.0f0
+    end
+    return
+end
+
+# ✅ Better: Separate into different kernels
+function uniform_kernel!(a)
+    i = get_global_id()
+    @inbounds a[i] += 1.0f0
+    return
+end
+
+function sparse_kernel!(a, indices)
+    i = get_global_id()
+    if i <= length(indices)
+        idx = indices[i]
+        @inbounds a[idx] = expensive_computation(a[idx])
+    end
+    return
+end
+```
+
+## Type Stability
+
+Type instability severely hurts performance:
+
+```julia
+# ❌ Bad: Type unstable
+function unstable_kernel!(output, input, flag)
+    i = get_global_id()
+    if flag
+        value = input[i]  # Float32
+    else
+        value = 0  # Int
+    end
+    output[i] = value * 2  # Type uncertain!
+    return
+end
+
+# ✅ Good: Type stable
+function stable_kernel!(output, input, flag)
+    i = get_global_id()
+    if flag
+        value = input[i]  # Float32
+    else
+        value = 0.0f0  # Float32
+    end
+    output[i] = value * 2.0f0  # All Float32!
+    return
+end
+
+# Check type stability
+@device_code_warntype @oneapi groups=1 items=10 stable_kernel!(output, input, true)
+```
+
+## Algorithmic Optimization
+
+### Use Library Functions
+
+Leverage optimized library implementations:
+
+```julia
+using oneAPI, LinearAlgebra
+
+# ✅ Good: Use oneMKL through LinearAlgebra
+A = oneArray(rand(Float32, 1000, 1000))
+B = oneArray(rand(Float32, 1000, 1000))
+C = A * B  # Uses optimized oneMKL
+
+# ❌ Bad: Write your own matrix multiplication
+# (unless you have a very specific use case)
+```
+
+### Choose the Right Algorithm
+
+Some algorithms parallelize better than others:
+
+```julia
+# ❌ Sequential algorithm
+function sequential_sum(arr)
+    total = 0.0f0
+    for x in arr
+        total += x
+    end
+    return total
+end
+
+# ✅ Parallel reduction
+result = sum(oneArray(data))  # Optimized parallel reduction
+```
+
+## Benchmarking
+
+### Basic Timing
+
+```julia
+using BenchmarkTools, oneAPI
+
+a = oneArray(rand(Float32, 1000))
+b = oneArray(rand(Float32, 1000))
+
+# Warmup
+c = a .+ b
+synchronize()
+
+# Benchmark
+@benchmark begin
+    c = $a .+ $b
+    synchronize()
+end
+```
+
+### Accurate GPU Timing
+
+Always synchronize before timing:
+
+```julia
+using oneAPI
+
+a = oneArray(rand(Float32, 1_000_000))
+
+# ❌ Wrong: Doesn't wait for GPU
+@time a .+= 1  # Only measures kernel launch overhead
+
+# ✅ Correct: Wait for GPU to finish
+@time begin
+    a .+= 1
+    synchronize()
+end
+```
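+
+The same measurement can be written more compactly with the package's `oneAPI.@sync` macro, which runs an expression and synchronizes the GPU afterwards:
+
+```julia
+using oneAPI
+
+a = oneArray(rand(Float32, 1_000_000))
+
+@time oneAPI.@sync a .+= 1
+```
+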
+### Profiling with Time
+
+```julia
+function profile_operation(a, b)
+    # Warmup
+    c = a .+ b
+    synchronize()
+
+    # Time kernel launch
+    t1 = time()
+    c = a .+ b
+    t2 = time()
+    launch_time = t2 - t1
+
+    # Time including synchronization
+    synchronize()
+    t3 = time()
+    total_time = t3 - t1
+
+    println("Launch: ", launch_time * 1000, " ms")
+    println("Total:  ", total_time * 1000, " ms")
+    println("Actual: ", (total_time - launch_time) * 1000, " ms")
+end
+
+a = oneArray(rand(Float32, 10_000_000))
+b = oneArray(rand(Float32, 10_000_000))
+profile_operation(a, b)
+```
+
+## Memory Bandwidth
+
+### Theoretical Peak
+
+Calculate theoretical bandwidth:
+
+```julia
+# Example: Intel Iris Xe Graphics
+# 96 execution units, 1.35 GHz
+# Memory bandwidth: ~68 GB/s
+
+# Your kernel processes N Float32 values
+N = 10_000_000
+a = oneArray(rand(Float32, N))
+b = oneArray(rand(Float32, N))
+bytes_transferred = N * sizeof(Float32) * 3  # a .+= b reads a and b, writes a
+
+# Measure time
+t = @elapsed begin
+    a .+= b
+    synchronize()
+end
+
+bandwidth_achieved = bytes_transferred / t / 1e9  # GB/s
+println("Bandwidth: ", bandwidth_achieved, " GB/s")
+```
+
+### Improving Bandwidth Utilization
+
+```julia
+# ✅ Good: Single pass with fusion
+result = @. a + b * c - d / e  # One pass over the data
+
+# ❌ Bad: Multiple passes
+result = a .+ b
+result = result .* c
+result = result .- d
+result = result ./ e
+# Four separate passes over the data!
+```
+
+## Common Performance Issues
+
+### Issue 1: Too Many Small Kernels
+
+```julia
+# ❌ Bad: Many small kernel launches
+for i in 1:100
+    a .+= 1  # 100 kernel launches!
+end
+
+# ✅ Good: Single kernel or batching
+a .+= 100  # Single operation
+```
+
+### Issue 2: Unnecessary Allocations
+
+```julia
+# ❌ Bad: Allocates a temporary on every call
+c = a .+ b  # Allocates a new array
+
+# ✅ Good: In-place operation
+c = similar(a)  # Allocate once, e.g. outside a loop
+c .= a .+ b     # Reuses the preallocated array
+```
+
+### Issue 3: Wrong Number Type
+
+```julia
+# ❌ Bad: Mixed types
+a = oneArray(rand(Float32, 1000))
+b = a .+ 1.0  # Float64 constant!
+
+# ✅ Good: Matching types
+b = a .+ 1.0f0  # Float32 constant
+```
+
+## Performance Checklist
+
+- [ ] Using device memory (not shared unless necessary)
+- [ ] Minimizing CPU-GPU transfers
+- [ ] Using Float32 (unless Float64 required)
+- [ ] Fusing operations with broadcasting
+- [ ] Type-stable kernels (`@device_code_warntype`)
+- [ ] Appropriate workgroup sizes
+- [ ] Coalesced memory access
+- [ ] Minimal thread divergence
+- [ ] Leveraging local memory for reuse
+- [ ] Using library functions when available
+- [ ] Synchronizing before timing
+- [ ] Avoiding unnecessary allocations
+
+## Hardware-Specific Tuning
+
+Different Intel GPUs have different characteristics:
+
+```julia
+using oneAPI, oneAPI.oneL0
+
+dev = device()
+props = properties(dev)
+compute_props = compute_properties(dev)
+
+println("Device: ", props.name)
+# EU topology is reported in the device properties
+println("EU count: ", props.numEUsPerSubslice *
+                      props.numSubslicesPerSlice *
+                      props.numSlices)
+println("Max workgroup size: ", compute_props.maxTotalGroupSize)
+println("Max local memory: ", compute_props.maxSharedLocalMemory, " bytes")
+
+# Adjust your code based on these properties
+```
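+
+For example, one possible heuristic for a launch configuration derived from these limits (a sketch; 256 is just a common starting point):
+
+```julia
+N = 1_000_000
+items = min(256, Int(compute_props.maxTotalGroupSize))
+groups = cld(N, items)
+```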
+
+## Advanced: Async Operations
+
+For overlapping compute and transfers (advanced users):
+
+```julia
+using oneAPI, oneAPI.oneL0
+
+ctx = context()
+dev = device()
+
+# Create multiple queues for async operations
+queue1 = ZeCommandQueue(ctx, dev)
+queue2 = ZeCommandQueue(ctx, dev)
+
+# Launch kernel on queue1
+execute!(queue1) do list
+    # ... kernel launch ...
+end
+
+# Overlap with a transfer on queue2 (dst, src and size are placeholders)
+execute!(queue2) do list
+    append_copy!(list, dst, src, size)
+end
+
+# Synchronize both
+synchronize(queue1)
+synchronize(queue2)
+```
+
+## Further Resources
+
+- [Intel GPU Architecture](https://www.intel.com/content/www/us/en/developer/articles/technical/intel-gpu-architecture.html)
+- [oneAPI Programming Guide](https://www.intel.com/content/www/us/en/developer/tools/oneapi/programming-guide.html)
+- [Level Zero Specification](https://spec.oneapi.io/level-zero/latest/index.html)
diff --git a/src/accumulate.jl b/ext/oneAPIAcceleratedKernelsExt.jl
similarity index 70%
rename from src/accumulate.jl
rename to ext/oneAPIAcceleratedKernelsExt.jl
index 206b4ea1..6bb96180 100644
--- a/src/accumulate.jl
+++ b/ext/oneAPIAcceleratedKernelsExt.jl
@@ -1,3 +1,10 @@
+module oneAPIAcceleratedKernelsExt
+
+import oneAPI
+import oneAPI: oneArray, oneAPIBackend
+import AcceleratedKernels as AK
+
+# Accumulate operations using AcceleratedKernels
 Base.accumulate!(op, B::oneArray, A::oneArray; init = zero(eltype(A)), kwargs...) =
     AK.accumulate!(op, B, A, oneAPIBackend(); init, kwargs...)
 
@@ -6,3 +13,5 @@ Base.accumulate(op, A::oneArray; init = zero(eltype(A)), kwargs...) =
 Base.cumsum(src::oneArray; kwargs...) = AK.cumsum(src, oneAPIBackend(); kwargs...)
 Base.cumprod(src::oneArray; kwargs...) = AK.cumprod(src, oneAPIBackend(); kwargs...)
+
+end # module
diff --git a/src/array.jl b/src/array.jl
index d576cdb7..d602f8d9 100644
--- a/src/array.jl
+++ b/src/array.jl
@@ -41,6 +41,46 @@ function check_eltype(T)
     end
 end
 
+"""
+    oneArray{T,N,B} <: AbstractGPUArray{T,N}
+
+N-dimensional dense array type for Intel GPU programming using oneAPI and Level Zero.
+
+# Type Parameters
+- `T`: Element type (must be stored inline, no isbits-unions)
+- `N`: Number of dimensions
+- `B`: Buffer type, one of:
+  - `oneL0.DeviceBuffer`: GPU device memory (default, not CPU-accessible)
+  - `oneL0.SharedBuffer`: Unified shared memory (CPU and GPU accessible)
+  - `oneL0.HostBuffer`: Pinned host memory (CPU-accessible, GPU-visible)
+
+# Memory Types
+
+- **Device memory** (default): Fastest GPU access, not directly accessible from the CPU
+- **Shared memory**: Accessible from both CPU and GPU, with unified virtual addressing
+- **Host memory**: CPU memory that's visible to the GPU, useful for staging
+
+Use [`is_device`](@ref), [`is_shared`](@ref), [`is_host`](@ref) to query the memory type.
+
+# Examples
+```julia
+# Create arrays with different memory types
+A = oneArray{Float32,2}(undef, 10, 10)                     # Device memory (default)
+B = oneArray{Float32,2,oneL0.SharedBuffer}(undef, 10, 10)  # Shared memory
+C = oneArray{Float32,2,oneL0.HostBuffer}(undef, 10, 10)    # Host memory
+
+# From an existing array
+D = oneArray(rand(Float32, 10, 10))  # Creates a device memory array
+
+# Copy the result back to the CPU
+E = Array(D)
+```
+
+See also: [`oneVector`](@ref), [`oneMatrix`](@ref), [`is_device`](@ref), [`is_shared`](@ref)
+"""
 mutable struct oneArray{T,N,B} <: AbstractGPUArray{T,N}
   data::DataRef{B}
 
@@ -179,8 +219,37 @@ end
 buftype(x::oneArray) = buftype(typeof(x))
 buftype(::Type{<:oneArray{<:Any,<:Any,B}}) where {B} = @isdefined(B) ? B : Any
 
+"""
+    is_device(a::oneArray) -> Bool
+
+Check if the array is stored in device memory (not directly CPU-accessible).
+
+Device memory provides the fastest GPU access but cannot be directly accessed from the CPU.
+
+See also: [`is_shared`](@ref), [`is_host`](@ref)
+"""
 is_device(a::oneArray) = isa(a.data[], oneL0.DeviceBuffer)
+
+"""
+    is_shared(a::oneArray) -> Bool
+
+Check if the array is stored in shared (unified) memory.
+
+Shared memory is accessible from both CPU and GPU with unified virtual addressing.
+
+See also: [`is_device`](@ref), [`is_host`](@ref)
+"""
 is_shared(a::oneArray) = isa(a.data[], oneL0.SharedBuffer)
+
+"""
+    is_host(a::oneArray) -> Bool
+
+Check if the array is stored in pinned host memory.
+
+Host memory resides on the CPU but is visible to the GPU, useful for staging data.
+
+See also: [`is_device`](@ref), [`is_shared`](@ref)
+"""
 is_host(a::oneArray) = isa(a.data[], oneL0.HostBuffer)
 
 ## derived types
diff --git a/src/compiler/execution.jl b/src/compiler/execution.jl
index 6503a9b0..9ce5bc5c 100644
--- a/src/compiler/execution.jl
+++ b/src/compiler/execution.jl
@@ -7,6 +7,54 @@ const MACRO_KWARGS = [:launch]
 const COMPILER_KWARGS = [:kernel, :name, :always_inline]
 const LAUNCH_KWARGS = [:groups, :items, :queue]
 
+"""
+    @oneapi [kwargs...] kernel(args...)
+
+High-level interface for launching Julia kernels on Intel GPUs using oneAPI.
+
+This macro compiles a Julia function to SPIR-V, prepares the arguments, and optionally
+launches the kernel on the GPU.
+
+# Keyword Arguments
+
+## Macro Keywords (compile-time)
+- `launch::Bool=true`: Whether to launch the kernel immediately. If `false`, returns the
+  compiled kernel object without executing it.
+
+## Compiler Keywords
+- `kernel::Bool=false`: Whether to compile as a kernel (true) or device function (false)
+- `name::Union{String,Nothing}=nothing`: Explicit name for the kernel
+- `always_inline::Bool=false`: Whether to always inline device functions
+
+## Launch Keywords (runtime)
+- `groups`: Number of workgroups (required). Can be an integer or tuple.
+- `items`: Number of work-items per workgroup (required). Can be an integer or tuple.
+- `queue::ZeCommandQueue=global_queue(...)`: Command queue to submit to.
+
+# Examples
+
+```julia
+# Simple vector addition kernel
+function vadd(a, b, c)
+    i = get_global_id()
+    @inbounds c[i] = a[i] + b[i]
+    return
+end
+
+a = oneArray(rand(Float32, 1024))
+b = oneArray(rand(Float32, 1024))
+c = similar(a)
+
+# Launch with 4 workgroups of 256 items each
+@oneapi groups=4 items=256 vadd(a, b, c)
+
+# Compile without launching
+kernel = @oneapi launch=false vadd(a, b, c)
+kernel(a, b, c; groups=4, items=256)  # Launch later
+```
+
+See also: `zefunction`, `kernel_convert`
+"""
 macro oneapi(ex...)
     call = ex[end]
     kwargs = map(ex[1:end-1]) do kwarg
diff --git a/src/context.jl b/src/context.jl
index 3a36ea35..b0f9ff10 100644
--- a/src/context.jl
+++ b/src/context.jl
@@ -8,23 +8,98 @@
 export driver, driver!, device, device!, context, context!, global_queue, synchronize
 
+"""
+    driver() -> ZeDriver
+
+Get the current Level Zero driver for the calling task. If no driver has been explicitly
+set with [`driver!`](@ref), returns the first available driver.
+
+The driver selection is task-local, allowing different Julia tasks to use different drivers.
+
+# Examples
+```julia
+drv = driver()
+println("Using driver: ", drv)
+```
+
+See also: `driver!`, `drivers`
+"""
 function driver()
     get!(task_local_storage(), :ZeDriver) do
         first(drivers())
     end
 end
 
+"""
+    driver!(drv::ZeDriver)
+
+Set the current Level Zero driver for the calling task. This also clears the current
+device selection, as devices are associated with specific drivers.
+
+The driver selection is task-local, allowing different Julia tasks to use different drivers.
+
+# Arguments
+- `drv::ZeDriver`: The driver to use for subsequent operations.
+
+# Examples
+```julia
+drv = drivers()[2]  # Select second available driver
+driver!(drv)
+```
+
+See also: `driver`, `drivers`
+"""
 function driver!(drv::ZeDriver)
     task_local_storage(:ZeDriver, drv)
     delete!(task_local_storage(), :ZeDevice)
 end
 
+"""
+    device() -> ZeDevice
+
+Get the current Level Zero device for the calling task. If no device has been explicitly
+set with [`device!`](@ref), returns the first available device for the current driver.
+
+The device selection is task-local, allowing different Julia tasks to use different devices.
+
+# Examples
+```julia
+dev = device()
+println("Using device: ", dev)
+```
+
+See also: `device!`, `devices`, `driver`
+"""
 function device()
     get!(task_local_storage(), :ZeDevice) do
         first(devices(driver()))
    end
 end
 
+"""
+    device!(dev::ZeDevice)
+    device!(i::Int)
+
+Set the current Level Zero device for the calling task.
+
+The device selection is task-local, allowing different Julia tasks to use different devices.
+
+# Arguments
+- `dev::ZeDevice`: The device to use for subsequent operations.
+- `i::Int`: Device index (1-based) from the list of available devices for the current driver.
+
+# Examples
+```julia
+# Select by device object
+dev = devices()[2]
+device!(dev)
+
+# Select by index
+device!(2)  # Select second device
+```
+
+See also: [`device`](@ref), [`devices`](@ref)
+"""
 function device!(drv::ZeDevice)
     task_local_storage(:ZeDevice, drv)
 end
@@ -32,6 +107,23 @@ device!(i::Int) = device!(devices(driver())[i])
 
 const global_contexts = Dict{ZeDriver,ZeContext}()
 
+"""
+    context() -> ZeContext
+
+Get the current Level Zero context for the calling task. If no context has been explicitly
+set with [`context!`](@ref), returns a global context for the current driver.
+
+Contexts manage the lifetime of resources like memory allocations and command queues.
+The context selection is task-local, but contexts themselves are cached globally per driver.
+
+# Examples
+```julia
+ctx = context()
+println("Using context: ", ctx)
+```
+
+See also: [`context!`](@ref), [`driver`](@ref)
+"""
 function context()
     get!(task_local_storage(), :ZeContext) do
         get!(global_contexts, driver()) do
@@ -40,13 +132,54 @@
     end
 end
 
+"""
+    context!(ctx::ZeContext)
+
+Set the current Level Zero context for the calling task.
+
+The context selection is task-local, allowing different Julia tasks to use different contexts.
+
+# Arguments
+- `ctx::ZeContext`: The context to use for subsequent operations.
+
+# Examples
+```julia
+ctx = ZeContext(driver())
+context!(ctx)
+```
+
+See also: `context`, `ZeContext`
+"""
 function context!(ctx::ZeContext)
     task_local_storage(:ZeContext, ctx)
 end
 
-# the global queue can be used as a default queue to execute operations on,
-# guaranteeing expected semantics when using a device on a Julia task.
+"""
+    global_queue(ctx::ZeContext, dev::ZeDevice) -> ZeCommandQueue
+
+Get the global command queue for the given context and device. This queue is used as the
+default queue for executing operations, guaranteeing expected semantics when using a device
+on a Julia task.
+
+The queue is created with in-order execution flags, meaning commands are executed in the
+order they are submitted. Queues are cached per task and (context, device) pair.
+
+# Arguments
+- `ctx::ZeContext`: The context for the command queue.
+- `dev::ZeDevice`: The device for the command queue.
+
+# Returns
+- `ZeCommandQueue`: A cached command queue with in-order execution.
+
+# Examples
+```julia
+ctx = context()
+dev = device()
+queue = global_queue(ctx, dev)
+```
+
+See also: `context`, `device`, `synchronize`
+"""
 function global_queue(ctx::ZeContext, dev::ZeDevice)
     # NOTE: dev purposefully does not default to context() or device() to stress that
     # objects should track ownership, and not rely on implicit global state.
@@ -55,12 +188,52 @@ function global_queue(ctx::ZeContext, dev::ZeDevice)
     end
 end
 
+"""
+    synchronize()
+
+Block the host thread until all operations on the global command queue for the current
+context and device have completed.
+
+This is useful for timing operations or ensuring that GPU work has finished before
+accessing results on the CPU.
+
+# Examples
+```julia
+x = oneArray(rand(Float32, 1000))
+y = x .+ 1
+synchronize()  # Wait for GPU computation to complete
+println("GPU work completed")
+```
+
+See also: [`global_queue`](@ref), [`context`](@ref), [`device`](@ref)
+"""
 function oneL0.synchronize()
     oneL0.synchronize(global_queue(context(), device()))
 end
 
 # re-export and augment parts of oneL0 to make driver and device selection easier
 export drivers, devices
+
+"""
+    devices() -> Vector{ZeDevice}
+    devices(drv::ZeDriver) -> Vector{ZeDevice}
+
+Return a list of available Level Zero devices. Without arguments, returns devices for
+the current driver. With a driver argument, returns devices for that specific driver.
+
+# Examples
+```julia
+# Get devices for the current driver
+devs = devices()
+println("Found ", length(devs), " devices")
+
+# Get devices for a specific driver
+drv = drivers()[1]
+devs = devices(drv)
+```
+
+See also: `device`, `device!`, `drivers`
+"""
 oneL0.devices() = devices(driver())
diff --git a/src/device/array.jl b/src/device/array.jl
index 260be837..ae339110 100644
--- a/src/device/array.jl
+++ b/src/device/array.jl
@@ -8,6 +8,40 @@ export oneDeviceArray, oneDeviceVector, oneDeviceMatrix, oneLocalArray
 
 # NOTE: we can't support the typical `tuple or series of integer` style construction,
 # because we're currently requiring a trailing pointer argument.
 
+"""
+    oneDeviceArray{T,N,A} <: DenseArray{T,N}
+
+Device-side array type for use within GPU kernels.
+
+This type represents a view of GPU memory accessible within kernel code. Unlike
+[`oneArray`](@ref), which is used on the host, `oneDeviceArray` is designed for
+device-side operations and cannot be directly constructed on the host.
+
+# Type Parameters
+- `T`: Element type
+- `N`: Number of dimensions
+- `A`: Address space (typically `AS.CrossWorkgroup` for global memory)
+
+# Usage
+
+`oneDeviceArray` is typically not constructed directly. Instead, `oneArray` objects
+are automatically converted to `oneDeviceArray` when passed as kernel arguments.
+
+# Examples
+
+```julia
+function kernel(a::oneDeviceArray{Float32,1})
+    i = get_global_id()
+    @inbounds a[i] = a[i] * 2.0f0
+    return
+end
+
+a = oneArray(rand(Float32, 100))
+@oneapi groups=1 items=100 kernel(a)  # a is converted to oneDeviceArray
+```
+
+See also: [`oneArray`](@ref), [`oneLocalArray`](@ref), [`@oneapi`](@ref)
+"""
 struct oneDeviceArray{T,N,A} <: DenseArray{T,N}
     ptr::LLVMPtr{T,A}
     maxsize::Int
 
@@ -257,6 +291,47 @@ end
 
 export oneLocalArray
 
+"""
+    oneLocalArray(::Type{T}, dims)
+
+Allocate local (workgroup-shared) memory within a GPU kernel.
+
+Local memory is shared among all work-items in a workgroup and provides faster access than
+global memory. It's useful for algorithms that require cooperation between work-items,
+such as reductions or matrix multiplication tiling.
+
+# Arguments
+- `T`: Element type
+- `dims`: Dimensions (must be compile-time constants)
+
+# Examples
+
+```julia
+function matmul_kernel(A, B, C)
+    # Allocate 16x16 tiles in local memory
+    tile_A = oneLocalArray(Float32, (16, 16))
+    tile_B = oneLocalArray(Float32, (16, 16))
+
+    # Load data into local memory (dimension indices are 1-based)
+    local_i = get_local_id(1)
+    local_j = get_local_id(2)
+    tile_A[local_i, local_j] = A[...]
+    tile_B[local_i, local_j] = B[...]
+
+    barrier()  # Synchronize workgroup
+
+    # Compute using local memory
+    # ...
+    return
+end
+```
+
+!!! note
+    The dimensions must be known at compile time. Local memory is limited (typically 64KB
+    per workgroup), so large allocations may fail.
+
+See also: [`oneDeviceArray`](@ref), [`barrier`](@ref)
+"""
 @inline function oneLocalArray(::Type{T}, dims) where {T}
     len = prod(dims)
     # NOTE: this relies on const-prop to forward the literal length to the generator.
""" macro sync(ex) quote diff --git a/test/Project.toml b/test/Project.toml index c214ed96..cb603629 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,5 +1,6 @@ [deps] AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c" +AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1" Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" diff --git a/test/setup.jl b/test/setup.jl index 269d5b9c..a3b0f1a4 100644 --- a/test/setup.jl +++ b/test/setup.jl @@ -1,4 +1,4 @@ -using Distributed, Test, oneAPI +using Distributed, Test, oneAPI, AcceleratedKernels oneAPI.functional() || error("oneAPI.jl is not functional on this system")