
Commit e47b061 (parent 1adfc5f)

added N-dimensional reduce and mapreduce, map, docs, and tests for each. In-place functions now also return the modified argument as in Base. Updated README.

27 files changed: +1409 -63 lines

Project.toml (4 additions, 1 deletion)

@@ -1,7 +1,7 @@
 name = "AcceleratedKernels"
 uuid = "6a4ca0a5-0e36-4168-a932-d9be78d558f1"
 authors = ["Andrei-Leonard Nicusan <leonard@evophase.co.uk> and contributors"]
-version = "0.1.0"
+version = "0.2.0-DEV"

 [deps]
 ArgCheck = "dce04be8-c92d-5529-be00-80e4d2c0e197"
@@ -10,11 +10,14 @@ GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a"
 Polyester = "f517fe37-dbe3-4b94-8317-1923a5111588"
+Unrolled = "9602ed7d-8fef-5bc8-8597-8f21381861e8"

 [compat]
 ArgCheck = "2.1"
 DocStringExtensions = "0.9"
 GPUArraysCore = "0.1"
 KernelAbstractions = "0.9"
+Markdown = "1.11"
 Polyester = "0.7"
+Unrolled = "0.1.5"
 julia = "1.6.7"

README.md (75 additions, 24 deletions)
@@ -17,12 +17,13 @@ Parallel algorithm building blocks for the Julia ecosystem, targeting multithrea
 - [5. API Examples](#5-api-examples)
 - [5.1. Using Different Backends](#51-using-different-backends)
 - [5.2. `foreachindex`](#52-foreachindex)
-- [5.3. `sort` and friends](#53-sort-and-friends)
-- [5.4. `reduce`](#54-reduce)
-- [5.5. `mapreduce`](#55-mapreduce)
-- [5.6. `accumulate`](#56-accumulate)
-- [5.7. `searchsorted` and friends](#57-searchsorted-and-friends)
-- [5.8. `all` / `any`](#58-all--any)
+- [5.3. `map`](#53-map)
+- [5.4. `sort` and friends](#54-sort-and-friends)
+- [5.5. `reduce`](#55-reduce)
+- [5.6. `mapreduce`](#56-mapreduce)
+- [5.7. `accumulate`](#57-accumulate)
+- [5.8. `searchsorted` and friends](#58-searchsorted-and-friends)
+- [5.9. `all` / `any`](#59-all--any)
 - [6. Custom Structs](#6-custom-structs)
 - [7. Testing](#7-testing)
 - [8. Issues and Debugging](#8-issues-and-debugging)
@@ -72,18 +73,19 @@ Below is an overview of the currently-implemented algorithms, along with some co
 | Function Family                               | AcceleratedKernels.jl Functions                  | Other Common Names                                        |
 | --------------------------------------------- | ------------------------------------------------ | --------------------------------------------------------- |
 | [General Looping](#52-foreachindex)           | `foreachindex`                                   | `Kokkos::parallel_for` `RAJA::forall` `thrust::transform` |
-| [Sorting](#53-sort-and-friends)               | `sort` `sort!`                                   | `sort` `sort_team` `stable_sort`                          |
+| [General Looping](#53-map)                    | `map` `map!`                                     | `thrust::transform`                                       |
+| [Sorting](#54-sort-and-friends)               | `sort` `sort!`                                   | `sort` `sort_team` `stable_sort`                          |
 |                                               | `merge_sort` `merge_sort!`                       |                                                           |
 |                                               | `merge_sort_by_key` `merge_sort_by_key!`         | `sort_team_by_key`                                        |
 |                                               | `sortperm` `sortperm!`                           | `sort_permutation` `index_permutation`                    |
 |                                               | `merge_sortperm` `merge_sortperm!`               |                                                           |
 |                                               | `merge_sortperm_lowmem` `merge_sortperm_lowmem!` |                                                           |
-| [Reduction](#54-reduce)                       | `reduce`                                         | `Kokkos:parallel_reduce` `fold` `aggregate`               |
-| [MapReduce](#55-mapreduce)                    | `mapreduce`                                      | `transform_reduce` `fold`                                 |
-| [Accumulation](#56-accumulate)                | `accumulate` `accumulate!`                       | `prefix_sum` `thrust::scan` `cumsum`                      |
-| [Binary Search](#57-searchsorted-and-friends) | `searchsortedfirst` `searchsortedfirst!`         | `std::lower_bound`                                        |
+| [Reduction](#55-reduce)                       | `reduce`                                         | `Kokkos:parallel_reduce` `fold` `aggregate`               |
+| [MapReduce](#56-mapreduce)                    | `mapreduce`                                      | `transform_reduce` `fold`                                 |
+| [Accumulation](#57-accumulate)                | `accumulate` `accumulate!`                       | `prefix_sum` `thrust::scan` `cumsum`                      |
+| [Binary Search](#58-searchsorted-and-friends) | `searchsortedfirst` `searchsortedfirst!`         | `std::lower_bound`                                        |
 |                                               | `searchsortedlast` `searchsortedlast!`           | `thrust::upper_bound`                                     |
-| [Predicates](#58-all--any)                    | `all` `any`                                      |                                                           |
+| [Predicates](#59-all--any)                    | `all` `any`                                      |                                                           |


 ## 5. API Examples
@@ -214,7 +216,40 @@ Similarly, for performance on the CPU the overhead of spawning threads should be
 ```


-### 5.3. `sort` and friends
+### 5.3. `map`
+Parallel mapping of a function over each element of an iterable via `foreachindex`:
+- `map!` (in-place), `map` (out-of-place)
+
+Function signature:
+```julia
+map!(
+    f, dst::AbstractArray, src::AbstractArray;
+
+    # CPU settings
+    scheduler=:threads,
+    max_tasks=Threads.nthreads(),
+    min_elems=1,
+
+    # GPU settings
+    block_size=256,
+)
+```
+
+Example:
+```julia
+import Metal
+import AcceleratedKernels as AK
+
+x = MtlArray(rand(Float32, 100_000))
+y = similar(x)
+AK.map!(y, x) do x_elem
+    T = typeof(x_elem)
+    T(2) * x_elem + T(1)
+end
+```
+
+
+### 5.4. `sort` and friends
 Sorting algorithms with similar interface and default settings as the Julia Base ones, on GPUs:
 - `sort!` (in-place), `sort` (out-of-place)
 - `sortperm!`, `sortperm`
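The `map!` section added above amounts to an elementwise `dst[i] = f(src[i])` loop over indices, which is exactly what `foreachindex` parallelises. The following plain-Python sketch illustrates only that contract; `map_into` is a hypothetical helper name, not the AcceleratedKernels API:

```python
def map_into(f, dst, src):
    """Elementwise map into a preallocated destination, returning the
    modified destination (mirroring the commit's Base-style convention
    that in-place functions return their modified argument)."""
    assert len(dst) == len(src)
    for i in range(len(src)):   # every index is independent, hence parallelisable
        dst[i] = f(src[i])
    return dst

x = [0.5, 1.0, 2.0]
y = [0.0] * len(x)
print(map_into(lambda v: 2 * v + 1, y, x))  # [2.0, 3.0, 5.0]
```

Because each destination slot is written by exactly one index, the loop body can be dispatched to threads or GPU blocks without synchronisation.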
@@ -277,15 +312,23 @@ AK.sort!(v, temp=temp)
 ```


-### 5.4. `reduce`
+### 5.5. `reduce`
 Apply a custom binary operator reduction on all elements in an iterable; can be used to compute minima, sums, counts, etc.
 - **Other names**: `Kokkos:parallel_reduce`, `fold`, `aggregate`.

+**New in AcceleratedKernels 0.2.0: N-dimensional reductions via the `dims` keyword**
+
 Function signature:
 ```julia
-reduce(op, src::AbstractGPUVector; init,
-       block_size::Int=256, temp::Union{Nothing, AbstractGPUVector}=nothing,
-       switch_below::Int=0)
+reduce(
+    op, src::AbstractGPUArray;
+    init,
+    dims::Union{Nothing, Int}=nothing,
+
+    block_size::Int=256,
+    temp::Union{Nothing, AbstractGPUArray}=nothing,
+    switch_below::Int=0,
+)
 ```

 Example computing a sum:
@@ -307,15 +350,23 @@ end
 Yes, the lambda within the `do` block can equally well be executed on both CPU and GPU, no code changes/duplication required.


-### 5.5. `mapreduce`
+### 5.6. `mapreduce`
 Equivalent to `reduce(op, map(f, iterable))`, without saving the intermediate mapped collection; can be used to e.g. split documents into words (map) and count the frequency thereof (reduce).
 - **Other names**: `transform_reduce`, some `fold` implementations include the mapping function too.

+**New in AcceleratedKernels 0.2.0: N-dimensional reductions via the `dims` keyword**
+
 Function signature:
 ```julia
-mapreduce(f, op, src::AbstractGPUVector; init,
-          block_size::Int=256, temp::Union{Nothing, AbstractGPUVector}=nothing,
-          switch_below::Int=0)
+mapreduce(
+    f, op, src::AbstractGPUArray;
+    init,
+    dims::Union{Nothing, Int}=nothing,
+
+    block_size::Int=256,
+    temp::Union{Nothing, AbstractGPUArray}=nothing,
+    switch_below::Int=0,
+)
 ```

 Example computing the minimum of absolute values:
@@ -330,7 +381,7 @@ AK.mapreduce(abs, (x, y) -> x < y ? x : y, v, init=typemax(Int32))
 As for `reduce`, when there are fewer than `switch_below` elements left to reduce, they can be copied back to the host and we switch to a CPU reduction. The `init` initialiser has to be a neutral element for `op`, i.e. same type as returned from `f` (`f` can change the type of the collection, see the "Custom Structs" section below for an example). The temporary array `temp` needs to have at least `(length(src) + 2 * block_size - 1) ÷ (2 * block_size)` elements and have `eltype(src) === typeof(init)`.


-### 5.6. `accumulate`
+### 5.7. `accumulate`
 Compute accumulated running totals along a sequence by applying a binary operator to all elements up to the current one; often used in GPU programming as a first step in finding / extracting subsets of data.
 - `accumulate!` (in-place), `accumulate` (allocating); inclusive or exclusive.
 - **Other names**: prefix sum, `thrust::scan`, cumulative sum; inclusive (or exclusive) if the first element is included in the accumulation (or not).
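The inclusive/exclusive distinction described in the `accumulate` section above can be pinned down with a sequential reference implementation. This Python sketch illustrates only the semantics, not the parallel block-level scan the package implements:

```python
def accumulate(op, v, init, inclusive=True):
    """Sequential reference for a prefix scan: the inclusive variant's
    first output already contains v[0]; the exclusive variant shifts
    results by one position and starts from init."""
    out = []
    acc = init
    for x in v:
        if inclusive:
            acc = op(acc, x)
            out.append(acc)
        else:
            out.append(acc)       # exclusive: emit the total *before* this element
            acc = op(acc, x)
    return out

print(accumulate(lambda a, b: a + b, [1, 2, 3, 4], 0))                   # [1, 3, 6, 10]
print(accumulate(lambda a, b: a + b, [1, 2, 3, 4], 0, inclusive=False))  # [0, 1, 3, 6]
```

The exclusive scan is the variant typically used for the "extract a subset" pattern the section mentions, since each output gives the destination offset of the corresponding element.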
@@ -359,7 +410,7 @@ AK.accumulate!(+, v, init=0)
 The temporaries `temp_v` and `temp_flags` should both have at least `(length(v) + 2 * block_size - 1) ÷ (2 * block_size)` elements; `eltype(v) === eltype(temp_v)`; the elements in `temp_flags` can be any integers, but `Int8` is used by default to reduce memory usage.


-### 5.7. `searchsorted` and friends
+### 5.8. `searchsorted` and friends
 Find the indices where some elements `x` should be inserted into a sorted sequence `v` to maintain the sorted order. Effectively applying the Julia.Base functions in parallel on a GPU using `foreachindex`.
 - `searchsortedfirst!` (in-place), `searchsortedfirst` (allocating): index of first element in `v` >= `x[j]`.
 - `searchsortedlast!`, `searchsortedlast`: index of last element in `v` <= `x[j]`.
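The two contracts above (index of the first element >= `x[j]`, index of the last element <= `x[j]`, both 1-based) map directly onto Python's `bisect` functions; this sketch shows the per-element lookup that `foreachindex` would apply in parallel, and is only an illustration of the semantics:

```python
from bisect import bisect_left, bisect_right

def searchsortedfirst(v, x):
    """1-based index of the first element of sorted v that is >= x
    (len(v) + 1 if every element is smaller), as described above."""
    return bisect_left(v, x) + 1

def searchsortedlast(v, x):
    """1-based index of the last element of sorted v that is <= x
    (0 if every element is larger)."""
    return bisect_right(v, x)

v = [10, 20, 20, 30]
print(searchsortedfirst(v, 20))  # 2
print(searchsortedlast(v, 20))   # 3
print(searchsortedfirst(v, 35))  # 5 (past the end)
print(searchsortedlast(v, 5))    # 0 (before the start)
```

Each lookup touches only `v` read-only, so running one lookup per element of `x` is trivially parallel.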
@@ -413,7 +464,7 @@ AK.searchsortedfirst!(ix, v, x)
 ```


-### 5.8. `all` / `any`
+### 5.9. `all` / `any`
 Apply a predicate to check if all / any elements in a collection return true. Could be implemented as a reduction, but is better optimised with stopping the search once a false / true is found.
 - **Other names**: not often implemented standalone on GPUs, typically included as part of a reduction.

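The early-exit behaviour described in the `all` / `any` section above (stop scanning once one deciding element is found, instead of reducing the whole collection) can be sketched sequentially; `any_pred` / `all_pred` are illustrative names, not the AcceleratedKernels API:

```python
def any_pred(pred, v):
    """Early-exit any: in a GPU version a shared flag would let all
    blocks stop once one thread finds a True; here the loop returns."""
    for x in v:
        if pred(x):
            return True
    return False

def all_pred(pred, v):
    # all is any with the predicate negated
    return not any_pred(lambda x: not pred(x), v)

v = list(range(1_000_000))
print(any_pred(lambda x: x == 3, v))  # True, after inspecting only 4 elements
print(all_pred(lambda x: x >= 0, v))  # True
print(all_pred(lambda x: x < 10, v))  # False, exits at x == 10
```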
docs/Project.toml (1 addition, 0 deletions)

@@ -1,4 +1,5 @@
 [deps]
+AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1"
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"

 [compat]

docs/make.jl (1 addition, 0 deletions)

@@ -22,6 +22,7 @@ makedocs(;
     "Manual" =>[
         "Using Different Backends" => "api/using_backends.md",
         "General Loops" => "api/foreachindex.md",
+        "Map" => "api/map.md",
         "Sorting" => "api/sort.md",
         "Reduce" => "api/reduce.md",
         "MapReduce" => "api/mapreduce.md",

docs/src/api/accumulate.md (1 addition, 1 deletion)

@@ -2,5 +2,5 @@

 ```@example
 import AcceleratedKernels as AK # hide
-AK.DocHelpers.readme_section("### 5.6. `accumulate`") # hide
+AK.DocHelpers.readme_section("### 5.7. `accumulate`") # hide
 ```

docs/src/api/binarysearch.md (1 addition, 1 deletion)

@@ -2,5 +2,5 @@

 ```@example
 import AcceleratedKernels as AK # hide
-AK.DocHelpers.readme_section("### 5.7. `searchsorted` and friends") # hide
+AK.DocHelpers.readme_section("### 5.8. `searchsorted` and friends") # hide
 ```

docs/src/api/map.md (6 additions, 0 deletions)

@@ -0,0 +1,6 @@
+### Map
+
+```@example
+import AcceleratedKernels as AK # hide
+AK.DocHelpers.readme_section("### 5.3. `map`") # hide
+```

docs/src/api/mapreduce.md (7 additions, 1 deletion)

@@ -2,5 +2,11 @@

 ```@example
 import AcceleratedKernels as AK # hide
-AK.DocHelpers.readme_section("### 5.5. `mapreduce`") # hide
+AK.DocHelpers.readme_section("### 5.6. `mapreduce`") # hide
+```
+
+---
+
+```@docs
+AcceleratedKernels.mapreduce
 ```

docs/src/api/predicates.md (1 addition, 1 deletion)

@@ -2,7 +2,7 @@

 ```@example
 import AcceleratedKernels as AK # hide
-AK.DocHelpers.readme_section("### 5.8. `all` / `any`") # hide
+AK.DocHelpers.readme_section("### 5.9. `all` / `any`") # hide
 ```

 **Note on the `cooperative` keyword**: some older platforms crash when multiple threads write to the same memory location in a global array (e.g. old Intel Graphics); if all threads were to write the same value, it is well-defined on others (e.g. CUDA F4.2 says "If a non-atomic instruction executed by a warp writes to the same location in global memory for more than one of the threads of the warp, only one thread performs a write and which thread does it is undefined."). This "cooperative" thread behaviour allows for a faster implementation; if you have a platform - the only one I know is Intel UHD Graphics - that crashes, set `cooperative=false` to use a safer `mapreduce`-based implementation.

docs/src/api/reduce.md (7 additions, 1 deletion)

@@ -2,5 +2,11 @@

 ```@example
 import AcceleratedKernels as AK # hide
-AK.DocHelpers.readme_section("### 5.4. `reduce`") # hide
+AK.DocHelpers.readme_section("### 5.5. `reduce`") # hide
+```
+
+---
+
+```@docs
+AcceleratedKernels.reduce
 ```
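For reference, the N-dimensional `reduce` / `mapreduce` semantics this commit documents in the README, together with the stated minimum `temp` length `(length(src) + 2 * block_size - 1) ÷ (2 * block_size)`, can be sketched in plain Python. This is a semantics illustration only, assuming `dims` follows Julia's axis convention; nothing here is the actual GPU implementation:

```python
from functools import reduce as fold

def mapreduce(f, op, src, init, dims=None):
    """Reference semantics for a matrix stored as a list of rows:
    dims=None folds every mapped element; dims=1 reduces down each
    column; dims=2 reduces along each row."""
    if dims is None:
        return fold(op, (f(x) for row in src for x in row), init)
    if dims == 1:
        return [fold(op, map(f, col), init) for col in zip(*src)]
    if dims == 2:
        return [fold(op, map(f, row), init) for row in src]
    raise ValueError("dims must be None, 1 or 2 for a matrix")

def temp_length(n, block_size=256):
    # documented minimum temp size: ceil(n / (2 * block_size))
    return (n + 2 * block_size - 1) // (2 * block_size)

m = [[1, 2, 3],
     [4, 5, 6]]
add = lambda a, b: a + b
print(mapreduce(abs, add, m, 0))          # 21
print(mapreduce(abs, add, m, 0, dims=1))  # [5, 7, 9]
print(mapreduce(abs, add, m, 0, dims=2))  # [6, 15]
print(temp_length(100_000))               # 196
```

Note how `init` is folded once per output slice, which is why the README requires it to be a neutral element for `op`.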
