Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 61 additions & 0 deletions include/ylt/simd_util/avx2/str_split.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#pragma once
#include <immintrin.h>

#include <string>
#include <string_view>
#include <vector>
namespace ylt {
namespace avx2 {

template <typename T>
concept StringLike =
std::same_as<T, std::string> || std::same_as<T, std::string_view>;
template <StringLike StringLike>
__attribute__((__target__("avx2,bmi"))) inline std::vector<StringLike>
simd_str_split(std::string_view string, const char delim) {
auto* pstr = string.data();
size_t size = string.size();
size_t start = 0;

std::vector<StringLike> output;
size_t aligned32_size = size & 0xFFFFFFFFFFFFFFE0UL;
for (size_t i = 0; i < aligned32_size; i += 32) {
__m256i data =
_mm256_lddqu_si256(reinterpret_cast<const __m256i*>(&pstr[i]));
const __m256i match = _mm256_set1_epi8(delim);
uint32_t mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(data, match));
while (mask != 0) {
auto j = __builtin_ctzl(mask);
output.emplace_back(&pstr[start], i + j - start);
start = i + j + 1;
mask &= mask - 1;
}
}

size_t aligned16_size = size & 0xFFFFFFFFFFFFFFF0UL;
if (aligned32_size < aligned16_size) {
__m128i data = _mm_lddqu_si128(
reinterpret_cast<const __m128i*>(&pstr[aligned32_size]));
const __m128i match = _mm_set1_epi8(delim);
uint32_t mask = _mm_movemask_epi8(_mm_cmpeq_epi8(data, match));
while (mask != 0) {
auto j = __builtin_ctzl(mask);
output.emplace_back(&pstr[start], aligned32_size + j - start);
start = aligned32_size + j + 1;
mask &= mask - 1;
}
}

size_t i = aligned16_size;
do {
while (pstr[i] != delim && i != size) {
++i;
}
output.emplace_back(&pstr[start], i - start);
start = i = i + 1;
} while (i <= size);
return output;
}

} // namespace avx2
} // namespace ylt
76 changes: 76 additions & 0 deletions include/ylt/simd_util/avx512/str_split.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#pragma once
#include <immintrin.h>

#include <string>
#include <string_view>
#include <vector>
namespace ylt {
namespace avx512 {

template <typename T>
concept StringLike =
std::same_as<T, std::string> || std::same_as<T, std::string_view>;
template <StringLike StringLike>
__attribute__((__target__("avx512bw,bmi")))
// auto chose target
inline std::vector<StringLike>
simd_str_split(std::string_view string, const char delim) {
auto* pstr = string.data();
size_t size = string.size();
size_t start = 0;

std::vector<StringLike> output;

size_t aligned64_size = size & 0xFFFFFFFFFFFFFFC0UL;
for (size_t i = 0; i < aligned64_size; i += 64) {
__m512i data = _mm512_loadu_si512(&pstr[i]);
const __m512i match = _mm512_set1_epi8(delim);
uint64_t mask = _mm512_cmpeq_epi8_mask(data, match);
while (mask != 0) {
auto j = __builtin_ctzll(mask);
output.emplace_back(&pstr[start], i + j - start);
start = i + j + 1;
mask &= mask - 1;
}
}

size_t aligned32_size = size & 0xFFFFFFFFFFFFFFE0UL;
if (aligned64_size < aligned32_size) {
__m256i data = _mm256_lddqu_si256(
reinterpret_cast<const __m256i*>(&pstr[aligned64_size]));
const __m256i match = _mm256_set1_epi8(delim);
uint32_t mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(data, match));
while (mask != 0) {
auto j = __builtin_ctzl(mask);
output.emplace_back(&pstr[start], aligned64_size + j - start);
start = aligned64_size + j + 1;
mask &= mask - 1;
}
}

size_t aligned16_size = size & 0xFFFFFFFFFFFFFFF0UL;
if (aligned32_size < aligned16_size) {
__m128i data = _mm_lddqu_si128(
reinterpret_cast<const __m128i*>(&pstr[aligned32_size]));
const __m128i match = _mm_set1_epi8(delim);
uint32_t mask = _mm_movemask_epi8(_mm_cmpeq_epi8(data, match));
while (mask != 0) {
auto j = __builtin_ctzl(mask);
output.emplace_back(&pstr[start], aligned32_size + j - start);
start = aligned32_size + j + 1;
mask &= mask - 1;
}
}

size_t i = aligned16_size;
do {
while (pstr[i] != delim && i != size) {
++i;
}
output.emplace_back(&pstr[start], i - start);
start = i = i + 1;
} while (i <= size);
return output;
}
} // namespace avx512
} // namespace ylt
35 changes: 35 additions & 0 deletions include/ylt/simd_util/common/str_split.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#pragma once

#include <string>
#include <string_view>
#include <vector>

namespace ylt {

namespace common {
template <typename T>
concept StringLike =
std::same_as<T, std::string> || std::same_as<T, std::string_view>;
template <StringLike StringLike>
inline std::vector<StringLike> simd_str_split(std::string_view s,
const char delimiter) {
size_t start = 0;
size_t end = s.find_first_of(delimiter);

std::vector<StringLike> output;

while (end <= StringLike::npos) {
output.emplace_back(s.substr(start, end - start));

if (end == StringLike::npos)
break;

start = end + 1;
end = s.find_first_of(delimiter, start);
}

return output;
}
} // namespace common

} // namespace ylt
92 changes: 92 additions & 0 deletions include/ylt/simd_util/neon/str_split.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
#pragma once

#include <arm_neon.h>

#include <string>
#include <string_view>
#include <vector>

namespace ylt {
namespace neon {

template <typename T>
concept StringLike =
std::same_as<T, std::string> || std::same_as<T, std::string_view>;

template <StringLike StringLike>
inline std::vector<StringLike> simd_str_split(std::string_view string,
const char delim) {
auto* pstr = string.data();
size_t size = string.size();
size_t start = 0;

std::vector<StringLike> output;
// Similar to memchr implementation, the first round of 256-bit detection
size_t aligned32_size = size & 0xFFFFFFFFFFFFFFE0UL;
uint8x16_t match = vmovq_n_u8(delim);
for (size_t i = 0; i < aligned32_size; i += 32) {
uint8x16_t data1 = vld1q_u8(reinterpret_cast<const uint8_t*>(&pstr[i]));
uint8x16_t data2 =
vld1q_u8(reinterpret_cast<const uint8_t*>(&pstr[i + 16]));
uint8x16_t result1 = vceqq_u8(data1, match);
uint8x16_t result2 = vceqq_u8(data2, match);
// Quickly fold the 256-bit detection results to 64-bit. It cannot be
// accurately located, but can be used for quick skipping.
uint64x2_t result64 = vreinterpretq_u64_u8(vorrq_u8(result1, result2));
result64 = vpaddq_u64(result64, result64);
if (result64[0] != 0) {
// Convert the detection result from 0xFF to 0x01, 0x04, 0x10, 0x40
// The final fold will form 32 hit marks on 64 bits, and the alternate
// bits will take effect. For example, if all hits are found, the result
// will be 0x55555555555555555UL
uint8x16_t vmask =
vreinterpretq_u8_u64(vdupq_n_u64(0x4010040140100401UL));
result1 = vandq_u8(result1, vmask);
result2 = vandq_u8(result2, vmask);
result1 = vpaddq_u8(result1, result2);
result1 = vpaddq_u8(result1, result1);
uint64_t mask = vreinterpretq_u64_u8(result1)[0];
while (mask != 0) {
auto j = __builtin_ctzll(mask) >> 1;
output.emplace_back(&pstr[start], i + j - start);
start = i + j + 1;
mask &= mask - 1;
}
}
}
size_t aligned16_size = size & 0xFFFFFFFFFFFFFFF0UL;
if (aligned32_size < aligned16_size) {
uint8x16_t data =
vld1q_u8(reinterpret_cast<const uint8_t*>(&pstr[aligned32_size]));
uint8x16_t result = vceqq_u8(data, match);
uint64x2_t result64 = vreinterpretq_u64_u8(result);
result64 = vpaddq_u64(result64, result64);
if (result64[0] != 0) {
uint8x16_t vmask =
vreinterpretq_u8_u64(vdupq_n_u64(0x4010040140100401UL));
result = vandq_u8(result, vmask);
result = vpaddq_u8(result, result);
result = vpaddq_u8(result, result);
uint32_t mask = vreinterpretq_u32_u8(result)[0];
while (mask != 0) {
auto j = __builtin_ctzl(mask) >> 1;
output.emplace_back(&pstr[start], aligned32_size + j - start);
start = aligned32_size + j + 1;
mask &= mask - 1;
}
}
}

size_t i = aligned16_size;
do {
while (pstr[i] != delim && i != size) {
++i;
}
output.emplace_back(&pstr[start], i - start);
start = i = i + 1;
} while (i <= size);
return output;
}

} // namespace neon
} // namespace ylt
19 changes: 19 additions & 0 deletions include/ylt/simd_util/simd_str_split.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#pragma once

#include "ylt_simd_dispatch.h"

#include INCLUDE_ARCH_FILE(str_split.h)

namespace ylt {
YLT_USING_ARCH_FUNC(simd_str_split);

static inline std::vector<std::string> split_str(std::string_view string,
const char delim) {
return simd_str_split<std::string>(string, delim);
}

static inline std::vector<std::string_view> split_sv(std::string_view string,
const char delim) {
return simd_str_split<std::string_view>(string, delim);
}
} // namespace ylt
49 changes: 49 additions & 0 deletions include/ylt/simd_util/sse/str_split.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#pragma once

#include <immintrin.h>

#include <string>
#include <string_view>
#include <vector>

namespace ylt {
namespace sse {

template <typename T>
concept StringLike =
std::same_as<T, std::string> || std::same_as<T, std::string_view>;
template <StringLike StringLike>
__attribute__((__target__("sse4.2"))) inline std::vector<StringLike>
simd_str_split(std::string_view string, const char delim) {
auto* pstr = string.data();
size_t size = string.size();
size_t start = 0;

std::vector<StringLike> output;
size_t aligned16_size = size & 0xFFFFFFFFFFFFFFF0UL;

for (size_t i = 0; i < aligned16_size; i += 16) {
__m128i data = _mm_lddqu_si128(reinterpret_cast<const __m128i*>(&pstr[i]));
const __m128i match = _mm_set1_epi8(delim);
uint32_t mask = _mm_movemask_epi8(_mm_cmpeq_epi8(data, match));
while (mask != 0) {
auto j = __builtin_ctzl(mask);
output.emplace_back(&pstr[start], i + j - start);
start = i + j + 1;
mask &= mask - 1;
}
}

size_t i = aligned16_size;
do {
while (pstr[i] != delim && i != size) {
++i;
}
output.emplace_back(&pstr[start], i - start);
start = i = i + 1;
} while (i <= size);
return output;
}

} // namespace sse
} // namespace ylt
21 changes: 21 additions & 0 deletions include/ylt/simd_util/ylt_cpu_feature.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#pragma once

#if defined(__APPLE__) || defined(__linux__) || defined(__FreeBSD__) || \
defined(__unix__)
#endif

#if defined(__SSE2__)
#if defined(__SSE4_2__)
#define YLT_HAVE_SSE
#endif
#if defined(__AVX2__)
#define YLT_HAVE_AVX2
#endif
#if defined(__AVX512F__)
#define YLT_HAVE_AVX512
#endif
#else
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#define YLT_HAVE_NEON
#endif
#endif
26 changes: 26 additions & 0 deletions include/ylt/simd_util/ylt_simd_dispatch.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#pragma once

#include "ylt_cpu_feature.h"
#include "ylt_simd_macro.h"

#if defined(YLT_HAVE_AVX512)
#define YLT_USING_ARCH_FUNC(func) using avx512::func
#define INCLUDE_ARCH_FILE(file) YLT_STRINGIFY(avx512/file)
#elif defined(YLT_HAVE_AVX2)
#define YLT_USING_ARCH_FUNC(func) using avx2::func
#define INCLUDE_ARCH_FILE(file) YLT_STRINGIFY(avx2/file)
#elif defined(YLT_HAVE_SSE)
#define YLT_USING_ARCH_FUNC(func) using sse::func
#define INCLUDE_ARCH_FILE(file) YLT_STRINGIFY(sse/file)
#else
#if defined(YLT_HAVE_NEON)
#define YLT_USING_ARCH_FUNC(func) using neon::func
#define INCLUDE_ARCH_FILE(file) YLT_STRINGIFY(neon/file)
#endif
#endif

#if !defined(YLT_HAVE_AVX2) && !defined(YLT_HAVE_AVX512) && \
!defined(YLT_HAVE_SSE) && !defined(YLT_HAVE_NEON)
#define YLT_USING_ARCH_FUNC(func) using common::func
#define INCLUDE_ARCH_FILE(file) YLT_STRINGIFY(common/file)
#endif
Loading
Loading