1313// limitations under the License.
1414
1515
16+ #include < Columns/ColumnUtil.h>
1617#include < Columns/ColumnVector.h>
1718#include < Columns/countBytesInFilter.h>
1819#include < Columns/filterColumn.h>
@@ -130,6 +131,72 @@ ColumnPtr filterAVX2(ColumnPtr & col, IColumn::Filter & filt, ssize_t result_siz
130131 const UInt8 * filt_end = filt_pos + size;
131132 const Int64 * data_pos = &data[0 ];
132133
134+ const UInt8 * filt_end_aligned = filt_pos + (filt_end - filt_pos) / FILTER_SIMD_BYTES * FILTER_SIMD_BYTES;
135+ while (filt_pos < filt_end_aligned)
136+ {
137+ UInt64 mask = ToBits64 (filt_pos);
138+ if likely (0 != mask)
139+ {
140+ if (const UInt8 prefix_to_copy = prefixToCopy (mask); 0xFF != prefix_to_copy)
141+ {
142+ res_data.insert (data_pos, data_pos + prefix_to_copy);
143+ }
144+ else
145+ {
146+ if (const UInt8 suffix_to_copy = suffixToCopy (mask); 0xFF != suffix_to_copy)
147+ {
148+ res_data.insert (data_pos + FILTER_SIMD_BYTES - suffix_to_copy, data_pos + FILTER_SIMD_BYTES);
149+ }
150+ else
151+ {
152+ while (mask)
153+ {
154+ size_t index = std::countr_zero (mask);
155+ res_data.push_back (data_pos[index]);
156+ mask &= mask - 1 ;
157+ }
158+ }
159+ }
160+ }
161+
162+ filt_pos += FILTER_SIMD_BYTES;
163+ data_pos += FILTER_SIMD_BYTES;
164+ }
165+
166+ /// Process the tail.
167+ while (filt_pos < filt_end)
168+ {
169+ if (*filt_pos)
170+ res_data.push_back (*data_pos);
171+ ++filt_pos;
172+ ++data_pos;
173+ }
174+
175+ return res;
176+ }
177+
/// Benchmark variant that filters an Int64 column by delegating the row
/// selection to filterImpl.
///
/// @param col               Column to filter; it is cast to ColumnVector<Int64>.
/// @param filt              Byte filter; must have exactly as many entries as `col` has rows.
/// @param result_size_hint  0 skips pre-allocation; a negative value means "count the
///                          filter via countBytesInFilter and reserve that many"; a positive
///                          value is passed to reserve() as-is.
/// @return A newly created ColumnVector<Int64> holding whatever filterImpl appended.
/// @throws Exception with SIZES_OF_COLUMNS_DOESNT_MATCH when the sizes differ.
178+ ColumnPtr filterCurrent (ColumnPtr & col, IColumn::Filter & filt, ssize_t result_size_hint)
179+ {
// NOTE(review): the typeid_cast result is dereferenced without a null check;
// presumably callers only ever pass an Int64 column — confirm.
180+ const auto & data = typeid_cast<const ColumnVector<Int64> *>(col.get ())->getData ();
181+ size_t size = col->size ();
// The filter and the column must describe the same number of rows.
182+ if (size != filt.size ())
183+ throw Exception (" Size of filter doesn't match size of column." , ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);
184+
185+ auto res = ColumnVector<Int64>::create ();
186+ using Container = ColumnVector<Int64>::Container;
187+ Container & res_data = res->getData ();
188+
// Pre-size the output only when a hint was supplied (non-zero).
189+ if (result_size_hint)
190+ {
// A negative hint is replaced by the value countBytesInFilter returns for the filter.
191+ if (result_size_hint < 0 )
192+ result_size_hint = countBytesInFilter (filt);
193+ res_data.reserve (result_size_hint);
194+ }
195+
// Raw [begin, end) range over the filter bytes plus the matching start of the data.
196+ const UInt8 * filt_pos = &filt[0 ];
197+ const UInt8 * filt_end = filt_pos + size;
198+ const Int64 * data_pos = &data[0 ];
199+
// The selection itself is done by filterImpl, which receives the output container
// by argument; its implementation is not visible in this hunk.
133200 filterImpl (filt_pos, filt_end, data_pos, res_data);
134201
135202 return res;
@@ -139,6 +206,7 @@ enum class FilterVersion
// Selects which filter implementation the columnFilter benchmark runs.
139206{
// Runs filterSSE2 (presumably under `version == FilterVersion::SSE2`; that
// condition lies outside the visible hunk — confirm).
140207 SSE2,
// Runs filterAVX2.
141208 AVX2,
// Handled by the final `else` branch of columnFilter, which runs filterCurrent.
209+ Current,
142210};
143211
144212template <typename ... Args>
@@ -158,45 +226,66 @@ void columnFilter(benchmark::State & state, Args &&... args)
158226 {
159227 for (auto _ : state)
160228 {
161- auto t = filterSSE2 (col, filter, set_n * sizeof (Int64));
229+ auto t = filterSSE2 (col, filter, set_n);
230+ benchmark::DoNotOptimize (t);
231+ }
232+ }
233+ else if (version == FilterVersion::AVX2)
234+ {
235+ for (auto _ : state)
236+ {
237+ auto t = filterAVX2 (col, filter, set_n);
162238 benchmark::DoNotOptimize (t);
163239 }
164240 }
165241 else
166242 {
167243 for (auto _ : state)
168244 {
169- auto t = filterAVX2 (col, filter, set_n * sizeof (Int64) );
245+ auto t = filterCurrent (col, filter, set_n);
170246 benchmark::DoNotOptimize (t);
171247 }
172248 }
173249}
174250
// Benchmark registration matrix: columnFilter is registered once per
// (implementation, ratio) pair — SSE2, AVX2 and Current for each of the
// thirteen ratios 0.00, 0.01, 0.10 ... 0.90, 0.99, 1.00 — always with
// DEFAULT_BLOCK_SIZE as the second captured argument. The numeric suffix of
// each case name is the ratio expressed in percent.
// NOTE(review): the last argument is presumably the fraction of filter entries
// that are set (selectivity); columnFilter's argument handling is not visible
// here — confirm.
175251BENCHMARK_CAPTURE (columnFilter, sse2_00, FilterVersion::SSE2, DEFAULT_BLOCK_SIZE, 0.00 );
176252BENCHMARK_CAPTURE (columnFilter, avx2_00, FilterVersion::AVX2, DEFAULT_BLOCK_SIZE, 0.00 );
253+ BENCHMARK_CAPTURE (columnFilter, cur_00, FilterVersion::Current, DEFAULT_BLOCK_SIZE, 0.00 );
177254BENCHMARK_CAPTURE (columnFilter, sse2_01, FilterVersion::SSE2, DEFAULT_BLOCK_SIZE, 0.01 );
178255BENCHMARK_CAPTURE (columnFilter, avx2_01, FilterVersion::AVX2, DEFAULT_BLOCK_SIZE, 0.01 );
256+ BENCHMARK_CAPTURE (columnFilter, cur_01, FilterVersion::Current, DEFAULT_BLOCK_SIZE, 0.01 );
179257BENCHMARK_CAPTURE (columnFilter, sse2_10, FilterVersion::SSE2, DEFAULT_BLOCK_SIZE, 0.10 );
180258BENCHMARK_CAPTURE (columnFilter, avx2_10, FilterVersion::AVX2, DEFAULT_BLOCK_SIZE, 0.10 );
259+ BENCHMARK_CAPTURE (columnFilter, cur_10, FilterVersion::Current, DEFAULT_BLOCK_SIZE, 0.10 );
181260BENCHMARK_CAPTURE (columnFilter, sse2_20, FilterVersion::SSE2, DEFAULT_BLOCK_SIZE, 0.20 );
182261BENCHMARK_CAPTURE (columnFilter, avx2_20, FilterVersion::AVX2, DEFAULT_BLOCK_SIZE, 0.20 );
262+ BENCHMARK_CAPTURE (columnFilter, cur_20, FilterVersion::Current, DEFAULT_BLOCK_SIZE, 0.20 );
183263BENCHMARK_CAPTURE (columnFilter, sse2_30, FilterVersion::SSE2, DEFAULT_BLOCK_SIZE, 0.30 );
184264BENCHMARK_CAPTURE (columnFilter, avx2_30, FilterVersion::AVX2, DEFAULT_BLOCK_SIZE, 0.30 );
265+ BENCHMARK_CAPTURE (columnFilter, cur_30, FilterVersion::Current, DEFAULT_BLOCK_SIZE, 0.30 );
185266BENCHMARK_CAPTURE (columnFilter, sse2_40, FilterVersion::SSE2, DEFAULT_BLOCK_SIZE, 0.40 );
186267BENCHMARK_CAPTURE (columnFilter, avx2_40, FilterVersion::AVX2, DEFAULT_BLOCK_SIZE, 0.40 );
268+ BENCHMARK_CAPTURE (columnFilter, cur_40, FilterVersion::Current, DEFAULT_BLOCK_SIZE, 0.40 );
187269BENCHMARK_CAPTURE (columnFilter, sse2_50, FilterVersion::SSE2, DEFAULT_BLOCK_SIZE, 0.50 );
188270BENCHMARK_CAPTURE (columnFilter, avx2_50, FilterVersion::AVX2, DEFAULT_BLOCK_SIZE, 0.50 );
271+ BENCHMARK_CAPTURE (columnFilter, cur_50, FilterVersion::Current, DEFAULT_BLOCK_SIZE, 0.50 );
189272BENCHMARK_CAPTURE (columnFilter, sse2_60, FilterVersion::SSE2, DEFAULT_BLOCK_SIZE, 0.60 );
190273BENCHMARK_CAPTURE (columnFilter, avx2_60, FilterVersion::AVX2, DEFAULT_BLOCK_SIZE, 0.60 );
274+ BENCHMARK_CAPTURE (columnFilter, cur_60, FilterVersion::Current, DEFAULT_BLOCK_SIZE, 0.60 );
191275BENCHMARK_CAPTURE (columnFilter, sse2_70, FilterVersion::SSE2, DEFAULT_BLOCK_SIZE, 0.70 );
192276BENCHMARK_CAPTURE (columnFilter, avx2_70, FilterVersion::AVX2, DEFAULT_BLOCK_SIZE, 0.70 );
277+ BENCHMARK_CAPTURE (columnFilter, cur_70, FilterVersion::Current, DEFAULT_BLOCK_SIZE, 0.70 );
193278BENCHMARK_CAPTURE (columnFilter, sse2_80, FilterVersion::SSE2, DEFAULT_BLOCK_SIZE, 0.80 );
194279BENCHMARK_CAPTURE (columnFilter, avx2_80, FilterVersion::AVX2, DEFAULT_BLOCK_SIZE, 0.80 );
280+ BENCHMARK_CAPTURE (columnFilter, cur_80, FilterVersion::Current, DEFAULT_BLOCK_SIZE, 0.80 );
195281BENCHMARK_CAPTURE (columnFilter, sse2_90, FilterVersion::SSE2, DEFAULT_BLOCK_SIZE, 0.90 );
196282BENCHMARK_CAPTURE (columnFilter, avx2_90, FilterVersion::AVX2, DEFAULT_BLOCK_SIZE, 0.90 );
283+ BENCHMARK_CAPTURE (columnFilter, cur_90, FilterVersion::Current, DEFAULT_BLOCK_SIZE, 0.90 );
197284BENCHMARK_CAPTURE (columnFilter, sse2_99, FilterVersion::SSE2, DEFAULT_BLOCK_SIZE, 0.99 );
198285BENCHMARK_CAPTURE (columnFilter, avx2_99, FilterVersion::AVX2, DEFAULT_BLOCK_SIZE, 0.99 );
286+ BENCHMARK_CAPTURE (columnFilter, cur_99, FilterVersion::Current, DEFAULT_BLOCK_SIZE, 0.99 );
199287BENCHMARK_CAPTURE (columnFilter, sse2_100, FilterVersion::SSE2, DEFAULT_BLOCK_SIZE, 1.00 );
200288BENCHMARK_CAPTURE (columnFilter, avx2_100, FilterVersion::AVX2, DEFAULT_BLOCK_SIZE, 1.00 );
289+ BENCHMARK_CAPTURE (columnFilter, cur_100, FilterVersion::Current, DEFAULT_BLOCK_SIZE, 1.00 );
201290
202291} // namespace bench
0 commit comments