@@ -39,12 +39,12 @@ namespace arrow::internal {
 // - array to batch constant to xsimd
 // - Shifts per swizzle can be improved when self.packed_max_byte_spread == 1 and the
 //   byte can be reused (when val_bit_width divides packed_max_byte_spread).
-// - Add unpack_exact to benchmarks
 // - Reduce input size on small bit width using a broadcast.
 // - For Avx2:
 //   - Inspect how swizzles across lanes are handled: _mm256_shuffle_epi8 not used?
 //   - Investigate AVX2 with 128 bit registers
 //   - Fix overreading problem
+//   - Improve Swizzle by computing which bigger swappable slots are free

 template <typename Arr>
 constexpr Arr BuildConstantArray(typename Arr::value_type val) {
@@ -316,29 +316,30 @@ constexpr SwizzleBiLaneGenericPlan<T, N> BuildSwizzleBiLaneGenericPlan(
       plan.self_lane[k] = kAsZero;
       plan.cross_lane[k] = kAsZero;
     } else {
-      if (is_first_lane_idx) {
-        if (is_first_lane_mask) {
-          plan.self_lane[k] = mask[k];
-          plan.cross_lane[k] = kAsZero;
-        } else {
-          plan.self_lane[k] = kAsZero;
-          plan.cross_lane[k] = mask[k] - kSizeHalf;
-        }
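+      // When position k and its source index mask[k] fall in the same 128-bit
+      // lane, the byte is served by the in-lane (self) shuffle; otherwise it
+      // comes from the lane-swapped (cross) shuffle. In both cases % kSizeHalf
+      // reduces the index to an in-lane offset.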
+      if (is_first_lane_idx == is_first_lane_mask) {
+        plan.self_lane[k] = mask[k] % kSizeHalf;
+        plan.cross_lane[k] = kAsZero;
       } else {
-        if (is_first_lane_mask) {
-          plan.self_lane[k] = kAsZero;
-          plan.cross_lane[k] = mask[k];  // Indices given within lane
-        } else {
-          plan.self_lane[k] = mask[k] - kSizeHalf;  // Indices given within lane
-          plan.cross_lane[k] = kAsZero;
-        }
+        plan.self_lane[k] = kAsZero;
+        plan.cross_lane[k] = mask[k] % kSizeHalf;
       }
     }
   }

   return plan;
 }

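+/// True iff every compile-time index of the mask selects a byte from the high
+/// half of the register (fold expression over all Vals).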
+template <typename T, typename A, T... Vals>
+constexpr bool isOnlyFromHigh(xsimd::batch_constant<T, A, Vals...>) {
+  return ((Vals >= (sizeof...(Vals) / 2)) && ...);
+}
+
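+/// True iff every compile-time index of the mask selects a byte from the low
+/// half of the register.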
+template <typename T, typename A, T... Vals>
+constexpr bool isOnlyFromLow(xsimd::batch_constant<T, A, Vals...>) {
+  return ((Vals < (sizeof...(Vals) / 2)) && ...);
+}
+
+/// Merged in xsimd 14.0; simply use swizzle there.
 template <typename Arch, uint8_t... kIdx>
 auto swizzle_bytes(const xsimd::batch<uint8_t, Arch>& batch,
                    xsimd::batch_constant<uint8_t, Arch, kIdx...> mask) {
@@ -349,6 +350,23 @@ auto swizzle_bytes(const xsimd::batch<uint8_t, Arch>& batch,
   static constexpr auto kCrossSwizzleArr = kPlan.cross_lane;
   constexpr auto kCrossSwizzle = make_batch_constant<kCrossSwizzleArr, Arch>();

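+  // Generator for make_batch_constant: reduces each mask index to an offset
+  // within its 128-bit lane (i is the byte position; n, the batch size, is
+  // unused).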
+  struct LaneMask {
+    static constexpr uint8_t get(uint8_t i, uint8_t n) {
+      constexpr auto kMask = std::array{kIdx...};
+      return kMask[i] % (kMask.size() / 2);
+    }
+  };
+
+  constexpr auto kLaneMask = xsimd::make_batch_constant<uint8_t, Arch, LaneMask>();
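+  // Fast path: when every index reads from a single 128-bit lane, broadcast
+  // that lane to both halves and finish with one in-lane shuffle instead of
+  // the self/cross pair below.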
+  if constexpr (isOnlyFromLow(mask)) {
+    auto broadcast = _mm256_permute2x128_si256(batch, batch, 0x00);  // [low | low]
+    return _mm256_shuffle_epi8(broadcast, kLaneMask.as_batch());
+  }
+  if constexpr (isOnlyFromHigh(mask)) {
+    auto broadcast = _mm256_permute2x128_si256(batch, batch, 0x11);  // [high | high]
+    return _mm256_shuffle_epi8(broadcast, kLaneMask.as_batch());
+  }
+
   auto self = _mm256_shuffle_epi8(batch, kSelfSwizzle.as_batch());
   auto swapped = _mm256_permute2x128_si256(batch, batch, 0x01);
   auto cross = _mm256_shuffle_epi8(swapped, kCrossSwizzle.as_batch());