Skip to content

Commit 83730c1

Browse files
committed
2 parents bc34b4e + db0dc92 commit 83730c1

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

59 files changed

+501
-11615
lines changed

.vscode/settings.json

Lines changed: 79 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -1,105 +1,115 @@
11
{
2+
"cSpell.words": [
3+
"ashvardanian",
4+
"blas",
5+
"CCCL",
6+
"constexpr",
7+
"cublas",
8+
"CUDA",
9+
"Kahan",
10+
"METALLIB",
11+
"openmp",
12+
"shfl",
13+
"SPIR",
14+
"STREQUAL",
15+
"threadgroup",
16+
"Vardanian",
17+
"wmma"
18+
],
219
"files.associations": {
20+
"*.cl": "cpp",
21+
"*.ipp": "cpp",
22+
"*.metal": "cpp",
23+
"*.tcc": "cpp",
24+
"__debug": "cpp",
25+
"__nullptr": "cpp",
26+
"algorithm": "cpp",
27+
"any": "cpp",
328
"array": "cpp",
429
"atomic": "cpp",
30+
"barrier": "cpp",
531
"bit": "cpp",
6-
"*.tcc": "cpp",
32+
"bitset": "cpp",
733
"cctype": "cpp",
34+
"cfenv": "cpp",
35+
"charconv": "cpp",
36+
"chrono": "cpp",
37+
"cinttypes": "cpp",
838
"clocale": "cpp",
939
"cmath": "cpp",
40+
"codecvt": "cpp",
1041
"compare": "cpp",
42+
"complex": "cpp",
1143
"concepts": "cpp",
44+
"condition_variable": "cpp",
45+
"csetjmp": "cpp",
46+
"csignal": "cpp",
1247
"cstdarg": "cpp",
1348
"cstddef": "cpp",
1449
"cstdint": "cpp",
1550
"cstdio": "cpp",
1651
"cstdlib": "cpp",
52+
"cstring": "cpp",
53+
"ctime": "cpp",
54+
"cuchar": "cpp",
1755
"cwchar": "cpp",
1856
"cwctype": "cpp",
19-
"map": "cpp",
20-
"unordered_map": "cpp",
21-
"vector": "cpp",
57+
"deque": "cpp",
2258
"exception": "cpp",
23-
"algorithm": "cpp",
24-
"functional": "cpp",
25-
"iterator": "cpp",
26-
"memory": "cpp",
27-
"memory_resource": "cpp",
28-
"numeric": "cpp",
29-
"optional": "cpp",
30-
"random": "cpp",
31-
"string": "cpp",
32-
"string_view": "cpp",
33-
"system_error": "cpp",
34-
"tuple": "cpp",
35-
"type_traits": "cpp",
36-
"utility": "cpp",
59+
"expected": "cpp",
60+
"forward_list": "cpp",
3761
"fstream": "cpp",
62+
"functional": "cpp",
63+
"future": "cpp",
64+
"hash_map": "cpp",
65+
"hash_set": "cpp",
3866
"initializer_list": "cpp",
67+
"iomanip": "cpp",
3968
"iosfwd": "cpp",
4069
"iostream": "cpp",
4170
"istream": "cpp",
71+
"iterator": "cpp",
72+
"latch": "cpp",
4273
"limits": "cpp",
74+
"list": "cpp",
75+
"locale": "cpp",
76+
"map": "cpp",
77+
"memory": "cpp",
78+
"memory_resource": "cpp",
79+
"mutex": "cpp",
4380
"new": "cpp",
4481
"numbers": "cpp",
82+
"numeric": "cpp",
83+
"optional": "cpp",
4584
"ostream": "cpp",
85+
"propagate_const": "cpp",
86+
"random": "cpp",
4687
"ranges": "cpp",
47-
"sstream": "cpp",
48-
"stdexcept": "cpp",
49-
"streambuf": "cpp",
50-
"cinttypes": "cpp",
51-
"typeinfo": "cpp",
52-
"deque": "cpp",
88+
"ratio": "cpp",
5389
"regex": "cpp",
54-
"forward_list": "cpp",
55-
"list": "cpp",
56-
"valarray": "cpp",
57-
"cstring": "cpp",
58-
"ctime": "cpp",
59-
"any": "cpp",
60-
"bitset": "cpp",
61-
"chrono": "cpp",
62-
"codecvt": "cpp",
63-
"complex": "cpp",
64-
"condition_variable": "cpp",
90+
"scoped_allocator": "cpp",
91+
"semaphore": "cpp",
6592
"set": "cpp",
66-
"unordered_set": "cpp",
67-
"ratio": "cpp",
68-
"future": "cpp",
69-
"iomanip": "cpp",
70-
"mutex": "cpp",
7193
"shared_mutex": "cpp",
94+
"span": "cpp",
95+
"sstream": "cpp",
96+
"stdexcept": "cpp",
7297
"stop_token": "cpp",
98+
"streambuf": "cpp",
99+
"string": "cpp",
100+
"string_view": "cpp",
101+
"strstream": "cpp",
102+
"system_error": "cpp",
73103
"thread": "cpp",
104+
"tuple": "cpp",
105+
"type_traits": "cpp",
74106
"typeindex": "cpp",
107+
"typeinfo": "cpp",
108+
"unordered_map": "cpp",
109+
"unordered_set": "cpp",
110+
"utility": "cpp",
111+
"valarray": "cpp",
75112
"variant": "cpp",
76-
"csetjmp": "cpp",
77-
"csignal": "cpp",
78-
"strstream": "cpp",
79-
"scoped_allocator": "cpp",
80-
"cfenv": "cpp",
81-
"hash_map": "cpp",
82-
"*.ipp": "cpp",
83-
"__debug": "cpp",
84-
"barrier": "cpp",
85-
"charconv": "cpp",
86-
"propagate_const": "cpp",
87-
"semaphore": "cpp",
88-
"span": "cpp",
89-
"expected": "cpp",
90-
"locale": "cpp",
91-
"__nullptr": "cpp",
92-
"cuchar": "cpp",
93-
"hash_set": "cpp",
94-
"latch": "cpp"
95-
},
96-
"cSpell.words": [
97-
"ashvardanian",
98-
"CCCL",
99-
"CUDA",
100-
"Kahan",
101-
"shfl",
102-
"SPIR",
103-
"STREQUAL"
104-
]
113+
"vector": "cpp"
114+
}
105115
}

CMakeLists.txt

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,50 @@ if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA" OR CMAKE_CUDA_COMPILER_ID STREQUAL "
104104
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr --extended-lambda")
105105
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode arch=compute_86,code=sm_86")
106106

107+
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" AND 0)
108+
set(METAL_SRC ${CMAKE_CURRENT_SOURCE_DIR}/reduce_metal.msl)
109+
set(METALLIB_OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/reduce_metal.metallib)
110+
111+
add_custom_command(
112+
OUTPUT ${METALLIB_OUTPUT}
113+
COMMAND xcrun -sdk macosx metal -c ${METAL_SRC} -o reduce_metal.air
114+
COMMAND xcrun -sdk macosx metallib reduce_metal.air -o ${METALLIB_OUTPUT}
115+
DEPENDS ${METAL_SRC}
116+
COMMENT "Compiling Metal shader: ${METAL_SRC} -> ${METALLIB_OUTPUT}"
117+
VERBATIM
118+
)
119+
120+
# Create a pseudo target to build the metallib
121+
add_custom_target(MetalLibBuild ALL
122+
DEPENDS ${METALLIB_OUTPUT}
123+
)
124+
125+
enable_language(OBJCXX)
126+
set_property(SOURCE reduce_metal.hpp PROPERTY LANGUAGE OBJCXX)
127+
set_property(SOURCE reduce_bench.cpp PROPERTY LANGUAGE OBJCXX)
128+
target_link_libraries(reduce_bench
129+
PRIVATE
130+
"-framework Metal"
131+
"-framework Foundation"
132+
)
133+
set_source_files_properties(
134+
reduce_bench.cpp
135+
PROPERTIES
136+
COMPILE_FLAGS "-x objective-c++ -fobjc-arc"
137+
)
138+
139+
# Make sure reduce_bench depends on MetalLibBuild so the .metallib is built first
140+
add_dependencies(reduce_bench MetalLibBuild)
141+
142+
# Copy the `metallib` to the same folder as reduce_bench
143+
add_custom_command(
144+
TARGET reduce_bench
145+
POST_BUILD
146+
COMMAND ${CMAKE_COMMAND} -E copy_if_different
147+
${METALLIB_OUTPUT}
148+
$<TARGET_FILE_DIR:reduce_bench>
149+
)
150+
107151
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
108152
message("-- Detected Clang Compiler")
109153
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")

README.md

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,14 +14,14 @@ This repository contains several educational examples showcasing the performance
1414
- 🔜 NEON and SVE on Arm.
1515
- OpenMP `reduction` clause.
1616
- Thrust with its `thrust::reduce`.
17-
- CUDA kernels with warp-reductions.
17+
- CUDA kernels with and without warp-reductions.
1818
- OpenCL kernels, eight of them.
1919
- Parallel STL `<algorithm>` in GCC with Intel oneTBB.
2020

2121
Previously, it also compared ArrayFire, Halide, and Vulkan queues for SPIR-V kernels and SYCL.
2222
Examples were collected from the early 2010s until 2019 and later updated in 2022.
2323

24-
- [Lecture Slides](blob/main/presentation/slides.pdf) from 2019.
24+
- [Lecture Slides](https://drive.google.com/file/d/16AicAl99t3ZZFnza04Wnw_Vuem0w8lc7/view?usp=sharing) from 2019.
2525
- [CppRussia Talk](https://youtu.be/AA4RI6o0h1U) in Russia in 2019.
2626
- [JetBrains Talk](https://youtu.be/BUtHOftDm_Y) in Germany & Russia in 2019.
2727

@@ -35,9 +35,9 @@ You are expected to build this on an x86 machine with CUDA drivers installed.
3535
```sh
3636
cmake -B build_release
3737
cmake --build build_release --config Release
38-
build_release/reduce_bench # To run all available benchmarks on default array size
39-
build_release/reduce_bench --benchmark_filter="" # Control Google Benchmark params
40-
PARALLEL_REDUCTIONS_LENGTH=1000 build_release/reduce_bench # Try different array size
38+
build_release/reduce_bench # Run all benchmarks
39+
build_release/reduce_bench --benchmark_filter="cuda" # Only CUDA-related
40+
PARALLEL_REDUCTIONS_LENGTH=1000 build_release/reduce_bench # Set a different input size
4141
```
4242

4343
Need more fine-grained control to run only CUDA-based backends?

deprecated/ArrayFire/Tests.cpp

Lines changed: 0 additions & 91 deletions
This file was deleted.

deprecated/ArrayFire/Tests.hpp

Lines changed: 0 additions & 13 deletions
This file was deleted.

0 commit comments

Comments
 (0)