
Commit 87fc57f

Merge pull request #26 from ashvardanian/gather-scatter
Gather 🔄 Scatter

This PR introduces benchmarks for the rarely-used gather & scatter SIMD instructions, which can accelerate lookups by ~30% on current x86 and Arm machines.
2 parents 2d97782 + 50e4613 commit 87fc57f
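
For context, a minimal sketch of what "gather" means here, written with the 8-wide AVX2 `_mm256_i32gather_ps` intrinsic for illustration; this function is not part of the commit, which benchmarks AVX-512 and SVE variants instead:

#include <immintrin.h> // `_mm256_i32gather_ps`, `_mm256_loadu_si256`, `_mm256_storeu_ps`
#include <cstddef>     // `std::size_t`
#include <cstdint>     // `std::int32_t`

// Gather `size` floats from `data` at positions `indices` into `result`,
// 8 lanes per iteration, with a scalar tail for the remainder.
void gather_avx2_sketch(float const *data, std::int32_t const *indices, float *result, std::size_t size) {
    std::size_t i = 0;
    for (; i + 8 <= size; i += 8) {
        __m256i idx = _mm256_loadu_si256(reinterpret_cast<__m256i const *>(&indices[i]));
        __m256 vals = _mm256_i32gather_ps(data, idx, 4); // scale: 4 bytes per `float`
        _mm256_storeu_ps(&result[i], vals);
    }
    for (; i < size; ++i) result[i] = data[indices[i]];
}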

File tree: 2 files changed (+174, -5 lines)


.vscode/settings.json

Lines changed: 2 additions & 0 deletions
@@ -26,6 +26,7 @@
         "excerise",
         "fconcepts",
         "Fedor",
+        "Fugaku",
         "Giga",
         "Goodput",
         "GOPS",
@@ -46,6 +47,7 @@
         "mimalloc",
         "MSVC",
         "Müller",
+        "Neoverse",
         "Niebler",
         "Niels",
         "nlohmann",

less_slow.cpp

Lines changed: 172 additions & 5 deletions
@@ -1755,7 +1755,7 @@ BENCHMARK_CAPTURE(theoretic_tops, i8u8_amx_avx512, tops_i8u8_amx_avx512_asm_kern
 #include <cassert>  // `assert`
 #include <fstream>  // `std::ifstream`
 #include <iterator> // `std::random_access_iterator_tag`
-#include <memory>   // `std::assume_aligned`
+#include <memory>   // `std::assume_aligned`, `std::unique_ptr`
 #include <string>   // `std::string`, `std::stoull`
 
 /**
@@ -1863,6 +1863,13 @@ class strided_ptr {
     // clang-format on
 };
 
+template <typename type_>
+std::unique_ptr<type_[], decltype(&std::free)> make_aligned_array(std::size_t size, std::size_t alignment) {
+    type_ *raw_ptr = static_cast<type_ *>(std::aligned_alloc(alignment, sizeof(type_) * size));
+    if (!raw_ptr) throw std::bad_alloc();
+    return std::unique_ptr<type_[], decltype(&std::free)>(raw_ptr, &std::free);
+}
+
 #if defined(__aarch64__)
 /**
  * @brief Helper derived from `__aarch64_sync_cache_range` in `libgcc`, used to
@@ -1887,9 +1894,7 @@ static void memory_access(bm::State &state) {
     // memory accesses may suffer from the same issues. For split-loads, pad our
     // buffer with an extra `cache_line_width` bytes of space.
     std::size_t const buffer_size = typical_l2_size + cache_line_width;
-    std::unique_ptr<std::byte, decltype(&std::free)> const buffer(                        //
-        reinterpret_cast<std::byte *>(std::aligned_alloc(cache_line_width, buffer_size)), //
-        &std::free);
+    auto const buffer = make_aligned_array<std::byte>(buffer_size, cache_line_width);
     std::byte *const buffer_ptr = buffer.get();
 
     // Let's initialize a strided range using our `strided_ptr` template, but
@@ -1932,6 +1937,168 @@ BENCHMARK(memory_access_aligned)->MinTime(10);
 
 #pragma endregion // Alignment of Memory Accesses
 
+#pragma region Gather & Scatter Operations for Spread Data
+
+/**
+ * Sometimes, the variables of interest are scattered across memory, and we
+ * need to gather them into a contiguous buffer for processing. This is
+ * common in sparse matrix operations, where only a few elements are non-zero,
+ * but it can apply to any irregular data structure...
+ *
+ * The only question is: is there some smart way to gather these elements?
+ *
+ * Our benchmark is as follows: generate 32-bit unsigned integers from 0 to N,
+ * shuffle them randomly, and use them as gathering indices. For scatter
+ * operations, we will use the same indices to overwrite information in a
+ * separate buffer.
+ *
+ * We will look at the simplest, ideal case, where the offset type and the
+ * data type have identical sizes.
+ */
+using spread_index_t = std::uint32_t;
+using spread_data_t = float;
+
+/**
+ * @brief Perform a scalar gather operation.
+ * @param data The data buffer to gather from.
+ * @param indices The indices used to gather data.
+ * @param result The buffer where gathered data will be stored.
+ * @param size The number of elements to process.
+ */
+void spread_gather_scalar( //
+    spread_data_t const *data, spread_index_t const *indices, spread_data_t *result, std::size_t size) noexcept {
+    for (std::size_t i = 0; i < size; ++i) result[i] = data[indices[i]];
+}
+
+/**
+ * @brief Perform a scalar scatter operation.
+ * @param data The buffer to scatter data into.
+ * @param indices The indices used to scatter data.
+ * @param source The buffer containing data to scatter.
+ * @param size The number of elements to process.
+ */
+void spread_scatter_scalar( //
+    spread_data_t *data, spread_index_t const *indices, spread_data_t const *source, std::size_t size) noexcept {
+    for (std::size_t i = 0; i < size; ++i) data[indices[i]] = source[i];
+}
+
+template <typename kernel_type_>
+static void spread_memory(bm::State &state, kernel_type_ kernel, std::size_t align = sizeof(spread_data_t)) {
+
+    std::size_t const size = static_cast<std::size_t>(state.range(0));
+    auto indices = make_aligned_array<spread_index_t>(size, align);
+    auto first = make_aligned_array<spread_data_t>(size, align);
+    auto second = make_aligned_array<spread_data_t>(size, align);
+
+    std::iota(indices.get(), indices.get() + size, 0);
+    std::random_device random_device;
+    std::mt19937 generator(random_device());
+    std::shuffle(indices.get(), indices.get() + size, generator);
+
+    for (auto _ : state) kernel(first.get(), indices.get(), second.get(), size);
+}
+
+BENCHMARK_CAPTURE(spread_memory, gather_scalar, spread_gather_scalar)->Range(1 << 10, 1 << 20)->MinTime(5);
+BENCHMARK_CAPTURE(spread_memory, scatter_scalar, spread_scatter_scalar)->Range(1 << 10, 1 << 20)->MinTime(5);
+
+#if defined(__AVX512F__)
+void spread_gather_avx512( //
+    spread_data_t const *data, spread_index_t const *indices, spread_data_t *result, std::size_t size) {
+    constexpr std::size_t simd_width_k = sizeof(__m512i) / sizeof(spread_data_t);
+    static_assert( //
+        sizeof(spread_data_t) == sizeof(spread_index_t), "Data and index types must have the same size");
+    std::size_t i = 0;
+    for (; i + simd_width_k <= size; i += simd_width_k)
+        _mm512_storeu_si512(&result[i], _mm512_i32gather_epi32(_mm512_loadu_si512(&indices[i]), data, 4));
+    for (; i < size; ++i) result[i] = data[indices[i]];
+}
+
+void spread_scatter_avx512( //
+    spread_data_t *data, spread_index_t const *indices, spread_data_t const *source, std::size_t size) {
+    constexpr std::size_t simd_width_k = sizeof(__m512i) / sizeof(spread_data_t);
+    static_assert( //
+        sizeof(spread_data_t) == sizeof(spread_index_t), "Data and index types must have the same size");
+    std::size_t i = 0;
+    for (; i + simd_width_k <= size; i += simd_width_k)
+        _mm512_i32scatter_epi32(data, _mm512_loadu_si512(&indices[i]), _mm512_loadu_si512(&source[i]), 4);
+    for (; i < size; ++i) data[indices[i]] = source[i];
+}
+
+BENCHMARK_CAPTURE(spread_memory, gather_avx512, spread_gather_avx512, 64)->Range(1 << 10, 1 << 20)->MinTime(5);
+BENCHMARK_CAPTURE(spread_memory, scatter_avx512, spread_scatter_avx512, 64)->Range(1 << 10, 1 << 20)->MinTime(5);
+
+/**
+ * For consistent timing with AVX-512, we align allocations to the ZMM register
+ * size, which also coincides with the cache line width on x86 CPUs: @b 64 bytes!
+ *
+ * For short arrays under 4K elements, gathers can get up to 50% faster,
+ * dropping from @b 270ns to @b 136ns. On larger sizes, gathers can @b lose
+ * to serial code: on arrays of 65K entries they can be 50% slower!
+ * Scatters are even more questionable!
+ */
+#endif
+
+#if defined(__ARM_FEATURE_SVE) // Arm NEON has no gather/scatter instructions, but SVE does 🥳
+
+/**
+ * Arm Scalable Vector Extension @b (SVE) is one of the weirdest current SIMD
+ * extensions. Unlike AVX2, AVX-512, or even RVV on RISC-V, it doesn't preset
+ * the register width at the ISA level! It's up to the physical implementation
+ * to choose any power of two between 128 and @b 2048 bits.
+ *
+ * In practice, the Fugaku supercomputer likely has the largest SVE
+ * implementation, at 512-bit length. The Arm Neoverse N2 core has 256-bit SVE.
+ * SVE also handles masking differently from AVX-512! Definitely worth reading about!
+ *
+ * @see "ARM's Scalable Vector Extensions: A Critical Look at SVE2 For Integer
+ *      Workloads" by @ zingaburga:
+ *      https://gist.github.com/zingaburga/805669eb891c820bd220418ee3f0d6bd
+ *
+ */
+#include <arm_sve.h>
+
+constexpr std::size_t max_sve_size_k = 2048 / CHAR_BIT;
+
+void spread_gather_sve( //
+    spread_data_t const *data, spread_index_t const *indices, spread_data_t *result, std::size_t size) {
+    for (std::size_t i = 0; i < size; i += svcntw()) {
+        svbool_t pg = svwhilelt_b32(i, size);
+        svuint32_t sv_indices = svld1(pg, &indices[i]);
+        svfloat32_t sv_data = svld1_gather_offset(pg, data, sv_indices);
+        svst1(pg, &result[i], sv_data);
+    }
+}
+
+void spread_scatter_sve( //
+    spread_data_t *data, spread_index_t const *indices, spread_data_t const *source, std::size_t size) {
+    for (std::size_t i = 0; i < size; i += svcntw()) {
+        svbool_t pg = svwhilelt_b32(i, size);
+        svuint32_t sv_indices = svld1(pg, &indices[i]);
+        svfloat32_t sv_data = svld1(pg, &source[i]);
+        svst1_scatter_offset(pg, data, sv_indices, sv_data);
+    }
+}
+
+BENCHMARK_CAPTURE(spread_memory, gather_sve, spread_gather_sve, max_sve_size_k)->Range(1 << 10, 1 << 20)->MinTime(5);
+BENCHMARK_CAPTURE(spread_memory, scatter_sve, spread_scatter_sve, max_sve_size_k)->Range(1 << 10, 1 << 20)->MinTime(5);
+
+/**
+ * @b Finally! This may just be the first place where SVE supersedes NEON
+ * in functionality and may have a bigger improvement over scalar code than
+ * AVX-512 on a similar-level x86 platform!
+ *
+ * If you are very lucky with your input sizes, gathers on small arrays under
+ * 65K entries on AWS Graviton can be up to 4x faster than serial code!
+ * On larger sizes, they again start losing to serial code. This makes
+ * their applicability very limited 😡
+ *
+ * Vectorized scatters are universally slower than serial code on Graviton
+ * for small inputs, but on larger ones, over 1 MB, they start winning by up to 50%!
+ * Great way to get everyone confused 🤬
+ */
+#endif
+
+#pragma endregion // Gather & Scatter Operations for Spread Data
+
 #pragma region Non Uniform Memory Access
 
 /**
@@ -3263,7 +3430,7 @@ inline std::byte *reallocate_from_arena( //
         }
     }
 
-    // If we cant grow in place, do: allocate new + copy + free old
+    // If we can't grow in place, do: allocate new + copy + free old
    std::byte *new_ptr = allocate_from_arena(arena, new_size);
    if (!new_ptr) return nullptr; // Out of memory
 
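Not part of this commit, but a quick way to sanity-check the new gather kernels: compare the scalar and AVX-512 paths on a small shuffled buffer. The standalone `main` below is an illustrative sketch that assumes the `spread_*` definitions from this diff are in scope and that the build targets AVX-512F:

#include <algorithm> // `std::shuffle`
#include <cassert>   // `assert`
#include <cstddef>   // `std::size_t`
#include <numeric>   // `std::iota`
#include <random>    // `std::mt19937`
#include <vector>    // `std::vector`

int main() {
    constexpr std::size_t size = 1024;
    std::vector<spread_data_t> data(size), scalar_result(size), simd_result(size);
    std::vector<spread_index_t> indices(size);
    std::iota(data.begin(), data.end(), 0.0f);
    std::iota(indices.begin(), indices.end(), 0u);
    std::mt19937 generator(42);
    std::shuffle(indices.begin(), indices.end(), generator);

    // Both paths should produce bit-identical results for the same permutation.
    spread_gather_scalar(data.data(), indices.data(), scalar_result.data(), size);
    spread_gather_avx512(data.data(), indices.data(), simd_result.data(), size);
    for (std::size_t i = 0; i < size; ++i) assert(scalar_result[i] == simd_result[i]);
    return 0;
}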