Skip to content

Commit 130f98e

Browse files
committed
Improve swizzle
1 parent 967b6f4 commit 130f98e

File tree

1 file changed

+34
-16
lines changed

1 file changed

+34
-16
lines changed

cpp/src/arrow/util/bpacking_simd_impl_internal.h

Lines changed: 34 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -39,12 +39,12 @@ namespace arrow::internal {
3939
// - array to batch constant to xsimd
4040
// - Shifts per swizzle can be improved when self.packed_max_byte_spread == 1 and the
4141
// byte can be reused (when val_bit_width divides packed_max_byte_spread).
42-
// - Add unpack_exact to benchmarks
4342
// - Reduce input size on small bit width using a broadcast.
4443
// - For Avx2:
4544
//   - Inspect how swizzles across lanes are handled: is _mm256_shuffle_epi8 not used?
4645
// - Investigate AVX2 with 128 bit register
4746
// - Fix overreading problem
47+
// - Improve swizzle by computing which bigger swappable slots are free
4848

4949
template <typename Arr>
5050
constexpr Arr BuildConstantArray(typename Arr::value_type val) {
@@ -316,29 +316,30 @@ constexpr SwizzleBiLaneGenericPlan<T, N> BuildSwizzleBiLaneGenericPlan(
316316
plan.self_lane[k] = kAsZero;
317317
plan.cross_lane[k] = kAsZero;
318318
} else {
319-
if (is_first_lane_idx) {
320-
if (is_first_lane_mask) {
321-
plan.self_lane[k] = mask[k];
322-
plan.cross_lane[k] = kAsZero;
323-
} else {
324-
plan.self_lane[k] = kAsZero;
325-
plan.cross_lane[k] = mask[k] - kSizeHalf;
326-
}
319+
if (is_first_lane_idx == is_first_lane_mask) {
320+
plan.self_lane[k] = mask[k] % kSizeHalf;
321+
plan.cross_lane[k] = kAsZero;
327322
} else {
328-
if (is_first_lane_mask) {
329-
plan.self_lane[k] = kAsZero;
330-
plan.cross_lane[k] = mask[k]; // Indices given within lane
331-
} else {
332-
plan.self_lane[k] = mask[k] - kSizeHalf; // Indices given within lane
333-
plan.cross_lane[k] = kAsZero;
334-
}
323+
plan.self_lane[k] = kAsZero;
324+
plan.cross_lane[k] = mask[k] % kSizeHalf;
335325
}
336326
}
337327
}
338328

339329
return plan;
340330
}
341331

332+
/// Whether every index of a batch constant lies in the upper half of the index range.
///
/// Each value in `Vals` is compared against `sizeof...(Vals) / 2`, and the result is
/// true only when all of them are greater than or equal to that midpoint. The
/// (unnamed) argument is used solely to deduce `Vals`.
///
/// NOTE(review): there is no upper bound check, so any value past the last valid index
/// (for instance a sentinel value) is also counted as "high" -- confirm this is intended.
template <typename T, typename A, T... Vals>
333+
constexpr bool isOnlyFromHigh(xsimd::batch_constant<T, A, Vals...>) {
334+
return ((Vals >= (sizeof...(Vals) / 2)) && ...);
335+
}
336+
337+
/// Whether every index of a batch constant lies in the lower half of the index range.
///
/// Each value in `Vals` is compared against `sizeof...(Vals) / 2`, and the result is
/// true only when all of them are strictly less than that midpoint. The (unnamed)
/// argument is used solely to deduce `Vals`.
template <typename T, typename A, T... Vals>
338+
constexpr bool isOnlyFromLow(xsimd::batch_constant<T, A, Vals...>) {
339+
return ((Vals < (sizeof...(Vals) / 2)) && ...);
340+
}
341+
342+
/// Merged into xsimd 14.0; with that version, simply use swizzle instead.
342343
template <typename Arch, uint8_t... kIdx>
343344
auto swizzle_bytes(const xsimd::batch<uint8_t, Arch>& batch,
344345
xsimd::batch_constant<uint8_t, Arch, kIdx...> mask) {
@@ -349,6 +350,23 @@ auto swizzle_bytes(const xsimd::batch<uint8_t, Arch>& batch,
349350
static constexpr auto kCrossSwizzleArr = kPlan.cross_lane;
350351
constexpr auto kCrossSwizzle = make_batch_constant<kCrossSwizzleArr, Arch>();
351352

353+
// Index generator passed to xsimd::make_batch_constant: entry `i` is the i-th swizzle
// index (`kIdx...`) reduced modulo half the number of indices.
struct LaneMask {
354+
// `n` is accepted but not used by this generator.
static constexpr uint8_t get(uint8_t i, uint8_t n) {
355+
constexpr auto kMask = std::array{kIdx...};
356+
// NOTE(review): presumably this converts a whole-register byte index into an index
// within one half (lane) -- confirm. Any index carrying a special meaning would be
// reduced by this modulo as well.
return kMask[i] % (kMask.size() / 2);
357+
}
358+
};
359+
360+
constexpr auto kLaneMask = xsimd::make_batch_constant<uint8_t, Arch, LaneMask>();
361+
if constexpr (isOnlyFromLow(mask)) {
362+
auto broadcast = _mm256_permute2x128_si256(batch, batch, 0x00); // [low | low]
363+
return _mm256_shuffle_epi8(broadcast, kLaneMask.as_batch());
364+
}
365+
if constexpr (isOnlyFromHigh(mask)) {
366+
auto broadcast = _mm256_permute2x128_si256(batch, batch, 0x11); // [high | high]
367+
return _mm256_shuffle_epi8(broadcast, kLaneMask.as_batch());
368+
}
369+
352370
auto self = _mm256_shuffle_epi8(batch, kSelfSwizzle.as_batch());
353371
auto swapped = _mm256_permute2x128_si256(batch, batch, 0x01);
354372
auto cross = _mm256_shuffle_epi8(swapped, kCrossSwizzle.as_batch());

0 commit comments

Comments
 (0)