@@ -1755,7 +1755,7 @@ BENCHMARK_CAPTURE(theoretic_tops, i8u8_amx_avx512, tops_i8u8_amx_avx512_asm_kern
 #include <cassert>  // `assert`
 #include <fstream>  // `std::ifstream`
 #include <iterator> // `std::random_access_iterator_tag`
-#include <memory>   // `std::assume_aligned`
+#include <memory>   // `std::assume_aligned`, `std::unique_ptr`
 #include <string>   // `std::string`, `std::stoull`
 
 /**
@@ -1863,6 +1863,13 @@ class strided_ptr {
     // clang-format on
 };
 
+template <typename type_>
+std::unique_ptr<type_[], decltype(&std::free)> make_aligned_array(std::size_t size, std::size_t alignment) {
+    type_ *raw_ptr = static_cast<type_ *>(std::aligned_alloc(alignment, sizeof(type_) * size));
+    if (!raw_ptr) throw std::bad_alloc();
+    return std::unique_ptr<type_[], decltype(&std::free)>(raw_ptr, &std::free);
+}
+
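+/**
+ * Note that `std::aligned_alloc` expects the byte count to be an integral
+ * multiple of the alignment. The sizes used below satisfy that, but a more
+ * defensive variant of the helper above (a sketch, not used here) would
+ * round the request up first:
+ *
+ *      std::size_t bytes = sizeof(type_) * size;
+ *      bytes = (bytes + alignment - 1) / alignment * alignment;
+ */
+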
 #if defined(__aarch64__)
 /**
  * @brief Helper derived from `__aarch64_sync_cache_range` in `libgcc`, used to
@@ -1887,9 +1894,7 @@ static void memory_access(bm::State &state) {
     // memory accesses may suffer from the same issues. For split-loads, pad our
     // buffer with an extra `cache_line_width` bytes of space.
     std::size_t const buffer_size = typical_l2_size + cache_line_width;
-    std::unique_ptr<std::byte, decltype(&std::free)> const buffer(                        //
-        reinterpret_cast<std::byte *>(std::aligned_alloc(cache_line_width, buffer_size)), //
-        &std::free);
+    auto const buffer = make_aligned_array<std::byte>(buffer_size, cache_line_width);
     std::byte *const buffer_ptr = buffer.get();
 
     // Let's initialize a strided range using our `strided_ptr` template, but
@@ -1932,6 +1937,168 @@ BENCHMARK(memory_access_aligned)->MinTime(10);
 
 #pragma endregion // Alignment of Memory Accesses
 
+#pragma region Gather & Scatter Operations for Spread Data
+
+/**
+ * Sometimes the variables of interest are scattered across memory, and we
+ * need to gather them into a contiguous buffer for processing. This is
+ * especially common in sparse matrix operations, where only a few elements
+ * are non-zero, but it can apply to any irregular data structure...
+ *
+ * The only question is: is there some smart way to gather these elements?
+ *
+ * Our benchmark is as follows: generate 32-bit unsigned integers from 0 to N,
+ * randomly shuffle them, and use them as gather indices. For scatter
+ * operations, we will use the same indices to overwrite entries in a
+ * separate buffer.
+ *
+ * We will look at the simplest, ideal case, where the offset type and the
+ * data type have identical sizes.
+ */
+using spread_index_t = std::uint32_t;
+using spread_data_t = float;
+/**
+ * @brief Perform a scalar gather operation.
+ * @param data The data buffer to gather from.
+ * @param indices The indices used to gather data.
+ * @param result The buffer where gathered data will be stored.
+ * @param size The number of elements to process.
+ */
+void spread_gather_scalar( //
+    spread_data_t const *data, spread_index_t const *indices, spread_data_t *result, std::size_t size) noexcept {
+    for (std::size_t i = 0; i < size; ++i) result[i] = data[indices[i]];
+}
+
+/**
+ * @brief Perform a scalar scatter operation.
+ * @param data The buffer to scatter data into.
+ * @param indices The indices used to scatter data.
+ * @param source The buffer containing data to scatter.
+ * @param size The number of elements to process.
+ */
+void spread_scatter_scalar( //
+    spread_data_t *data, spread_index_t const *indices, spread_data_t const *source, std::size_t size) noexcept {
+    for (std::size_t i = 0; i < size; ++i) data[indices[i]] = source[i];
+}
+
+template <typename kernel_type_>
+static void spread_memory(bm::State &state, kernel_type_ kernel, std::size_t align = sizeof(spread_data_t)) {
+
+    std::size_t const size = static_cast<std::size_t>(state.range(0));
+    auto indices = make_aligned_array<spread_index_t>(size, align);
+    auto first = make_aligned_array<spread_data_t>(size, align);
+    auto second = make_aligned_array<spread_data_t>(size, align);
+
+    std::iota(indices.get(), indices.get() + size, 0u);
+    std::random_device random_device;
+    std::mt19937 generator(random_device());
+    std::shuffle(indices.get(), indices.get() + size, generator);
+
+    for (auto _ : state) kernel(first.get(), indices.get(), second.get(), size);
+}
+
+BENCHMARK_CAPTURE(spread_memory, gather_scalar, spread_gather_scalar)->Range(1 << 10, 1 << 20)->MinTime(5);
+BENCHMARK_CAPTURE(spread_memory, scatter_scalar, spread_scatter_scalar)->Range(1 << 10, 1 << 20)->MinTime(5);
+
+#if defined(__AVX512F__)
+void spread_gather_avx512( //
+    spread_data_t const *data, spread_index_t const *indices, spread_data_t *result, std::size_t size) {
+    constexpr std::size_t simd_width_k = sizeof(__m512i) / sizeof(spread_data_t);
+    static_assert( //
+        sizeof(spread_data_t) == sizeof(spread_index_t), "Data and index types must have the same size");
+    std::size_t i = 0;
+    for (; i + simd_width_k <= size; i += simd_width_k)
+        _mm512_storeu_si512(&result[i], _mm512_i32gather_epi32(_mm512_loadu_si512(&indices[i]), data, 4));
+    for (; i < size; ++i) result[i] = data[indices[i]];
+}
+
+void spread_scatter_avx512( //
+    spread_data_t *data, spread_index_t const *indices, spread_data_t const *source, std::size_t size) {
+    constexpr std::size_t simd_width_k = sizeof(__m512i) / sizeof(spread_data_t);
+    static_assert( //
+        sizeof(spread_data_t) == sizeof(spread_index_t), "Data and index types must have the same size");
+    std::size_t i = 0;
+    for (; i + simd_width_k <= size; i += simd_width_k)
+        _mm512_i32scatter_epi32(data, _mm512_loadu_si512(&indices[i]), _mm512_loadu_si512(&source[i]), 4);
+    for (; i < size; ++i) data[indices[i]] = source[i];
+}
+
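+/**
+ * The kernels above fall back to a scalar loop for the tail. As a sketch of
+ * an alternative (our own addition, not part of the benchmark suite), the
+ * tail can be folded into a single masked gather, so every element goes
+ * through the same instruction:
+ */
+void spread_gather_avx512_masked_tail( //
+    spread_data_t const *data, spread_index_t const *indices, spread_data_t *result, std::size_t size) {
+    constexpr std::size_t simd_width_k = sizeof(__m512i) / sizeof(spread_data_t);
+    std::size_t i = 0;
+    for (; i + simd_width_k <= size; i += simd_width_k)
+        _mm512_storeu_si512(&result[i], _mm512_i32gather_epi32(_mm512_loadu_si512(&indices[i]), data, 4));
+    if (i < size) {
+        // One mask bit per remaining lane; masked-off lanes are never accessed.
+        __mmask16 tail_mask = static_cast<__mmask16>((1u << (size - i)) - 1u);
+        __m512i tail_indices = _mm512_maskz_loadu_epi32(tail_mask, &indices[i]);
+        __m512i tail_gathered = _mm512_mask_i32gather_epi32(_mm512_setzero_si512(), tail_mask, tail_indices, data, 4);
+        _mm512_mask_storeu_epi32(&result[i], tail_mask, tail_gathered);
+    }
+}
+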
+BENCHMARK_CAPTURE(spread_memory, gather_avx512, spread_gather_avx512, 64)->Range(1 << 10, 1 << 20)->MinTime(5);
+BENCHMARK_CAPTURE(spread_memory, scatter_avx512, spread_scatter_avx512, 64)->Range(1 << 10, 1 << 20)->MinTime(5);
+
+/**
+ * For consistent timing, for AVX-512 we align allocations to the ZMM register
+ * size, which also coincides with the cache-line width on x86 CPUs: @b 64 bytes!
+ *
+ * For short arrays under 4K elements, gathers can be almost twice as fast,
+ * dropping from @b 270ns to @b 136ns. On larger inputs, gathers can @b lose
+ * to serial code: on arrays of 65K entries they can be 50% slower!
+ * Scatters are even more questionable!
+ */
+#endif
+
+#if defined(__ARM_FEATURE_SVE) // Arm NEON has no gather/scatter instructions, but SVE does 🥳
+
+/**
+ * The Arm Scalable Vector Extension @b (SVE) is one of the weirdest current
+ * SIMD extensions. Unlike AVX2, AVX-512, or even RVV on RISC-V, it doesn't fix
+ * the register width at the ISA level! It's up to the physical implementation
+ * to choose any power of two between 128 and @b 2048 bits.
+ *
+ * In practice, the A64FX CPUs of the Fugaku supercomputer likely have the
+ * widest SVE implementation, at 512 bits. The Arm Neoverse V1 core, used in
+ * AWS Graviton 3, implements 256-bit SVE, while the newer Neoverse N2 sticks
+ * to 128-bit SVE2. SVE also handles masking differently from AVX-512!
+ * Definitely worth reading about!
+ *
+ * @see "ARM's Scalable Vector Extensions: A Critical Look at SVE2 For Integer
+ *      Workloads" by @zingaburga:
+ *      https://gist.github.com/zingaburga/805669eb891c820bd220418ee3f0d6bd
+ */
+#include <arm_sve.h>
+
+constexpr std::size_t max_sve_size_k = 2048 / CHAR_BIT;
+
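+/**
+ * A quick way to see what you are actually running on (a small sketch of
+ * ours, not used by the benchmarks below): `svcntb()` reports the vector
+ * length in bytes at runtime, so the same binary adapts to 128-bit and
+ * 512-bit hardware alike. `svcntw()` similarly reports the number of 32-bit
+ * lanes, and is what the kernels below use to advance their loops.
+ */
+inline std::size_t sve_vector_length_bytes() noexcept { return svcntb(); }
+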
+void spread_gather_sve( //
+    spread_data_t const *data, spread_index_t const *indices, spread_data_t *result, std::size_t size) {
+    for (std::size_t i = 0; i < size; i += svcntw()) {
+        svbool_t pg = svwhilelt_b32(i, size);
+        svuint32_t sv_indices = svld1(pg, &indices[i]);
+        // Use the `_index` form: our offsets are element indices, not byte offsets.
+        svfloat32_t sv_data = svld1_gather_index(pg, data, sv_indices);
+        svst1(pg, &result[i], sv_data);
+    }
+}
+
+void spread_scatter_sve( //
+    spread_data_t *data, spread_index_t const *indices, spread_data_t const *source, std::size_t size) {
+    for (std::size_t i = 0; i < size; i += svcntw()) {
+        svbool_t pg = svwhilelt_b32(i, size);
+        svuint32_t sv_indices = svld1(pg, &indices[i]);
+        svfloat32_t sv_data = svld1(pg, &source[i]);
+        svst1_scatter_index(pg, data, sv_indices, sv_data);
+    }
+}
+
+BENCHMARK_CAPTURE(spread_memory, gather_sve, spread_gather_sve, max_sve_size_k)->Range(1 << 10, 1 << 20)->MinTime(5);
+BENCHMARK_CAPTURE(spread_memory, scatter_sve, spread_scatter_sve, max_sve_size_k)->Range(1 << 10, 1 << 20)->MinTime(5);
+
+/**
+ * @b Finally! This may just be the first place where SVE supersedes NEON
+ * in functionality, and it may bring a bigger improvement over scalar code
+ * than AVX-512 does on a comparable x86 platform!
+ *
+ * If you are very lucky with your input sizes, on small arrays under 65K
+ * elements on AWS Graviton, gathers can be up to 4x faster than serial code!
+ * On larger sizes, they again start losing to serial code, which makes
+ * their applicability very limited 😡
+ *
+ * Vectorized scatters are universally slower than serial code on Graviton
+ * for small inputs, but on larger ones, over 1MB, they start winning by up
+ * to 50%! A great way to get everyone confused 🤬
+ */
+#endif
+
+#pragma endregion // Gather & Scatter Operations for Spread Data
+
 #pragma region Non Uniform Memory Access
 
 /**
@@ -3263,7 +3430,7 @@ inline std::byte *reallocate_from_arena( //
         }
     }
 
-    // If we can’t grow in place, do: allocate new + copy + free old
+    // If we can't grow in place, do: allocate new + copy + free old
     std::byte *new_ptr = allocate_from_arena(arena, new_size);
     if (!new_ptr) return nullptr; // Out of memory
 