Skip to content

Commit fdbdde5

Browse files
ti-chi-botLloyd-PottigerJaySon-Huang
authored
Columns: optimize ColumnString filter when selectivity is high (#9987) (#10036)
ref #9699, close #10029 Optimize the performance of the ColumnString filter when the selectivity of the filter is high. For example, when the filter is `0111111111111111011111111111111101111111111111110111111111111111`, the mask will be `11111111111111110111111111111111101111111111111111011111111111111110`. Since the mask does not match `[0]*[1]+` or `[1]+[0]*`, we previously had to copy each selected row one by one. Now we can copy 15 rows at once. The total elapsed time of TPC-H 50 is reduced from 42.9s to 41.1s. Signed-off-by: Lloyd-Pottiger <[email protected]> Signed-off-by: JaySon-Huang <[email protected]> Co-authored-by: Lloyd-Pottiger <[email protected]> Co-authored-by: Lloyd-Pottiger <[email protected]> Co-authored-by: JaySon-Huang <[email protected]>
1 parent 022e483 commit fdbdde5

File tree

7 files changed

+650
-93
lines changed

7 files changed

+650
-93
lines changed

dbms/src/Columns/filterColumn.cpp

Lines changed: 63 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,17 @@ namespace DB
3030

3131
namespace
3232
{
33+
34+
constexpr std::array<UInt64, 65> MASKS = [] constexpr {
35+
std::array<UInt64, 65> masks = {};
36+
for (int i = 0; i < 64; ++i)
37+
{
38+
masks[i] = ~((1ULL << i) - 1);
39+
}
40+
masks[64] = 0;
41+
return masks;
42+
}();
43+
3344
/// Implementation details of filterArraysImpl function, used as template parameter.
3445
/// Allow to build or not to build offsets array.
3546

@@ -134,45 +145,81 @@ void filterArraysImplGeneric(
134145
while (filt_pos < filt_end_aligned)
135146
{
136147
auto mask = ToBits64(filt_pos);
148+
while (mask)
149+
{
150+
// 100011111000 -> index: 3, length: 5, mask: 100000000000
151+
size_t index = std::countr_zero(mask);
152+
size_t length = std::countr_one(mask >> index);
153+
copy_chunk(offsets_pos + index, length);
154+
mask &= MASKS[index + length];
155+
}
156+
157+
filt_pos += FILTER_SIMD_BYTES;
158+
offsets_pos += FILTER_SIMD_BYTES;
159+
}
160+
161+
while (filt_pos < filt_end)
162+
{
163+
if (*filt_pos)
164+
copy_chunk(offsets_pos, 1);
165+
166+
++filt_pos;
167+
++offsets_pos;
168+
}
169+
}
170+
171+
/// filterImplAligned is used for aligned part of filter.
172+
template <typename T, typename Container>
173+
inline void filterImplAligned(
174+
const UInt8 *& filt_pos,
175+
const UInt8 *& filt_end_aligned,
176+
const T *& data_pos,
177+
Container & res_data)
178+
{
179+
while (filt_pos < filt_end_aligned)
180+
{
181+
UInt64 mask = ToBits64(filt_pos);
137182
if likely (0 != mask)
138183
{
139-
if (const auto prefix_to_copy = prefixToCopy(mask); 0xFF != prefix_to_copy)
184+
if (const UInt8 prefix_to_copy = prefixToCopy(mask); 0xFF != prefix_to_copy)
140185
{
141-
copy_chunk(offsets_pos, prefix_to_copy);
186+
res_data.insert(data_pos, data_pos + prefix_to_copy);
142187
}
143188
else
144189
{
145-
if (const auto suffix_to_copy = suffixToCopy(mask); 0xFF != suffix_to_copy)
190+
if (const UInt8 suffix_to_copy = suffixToCopy(mask); 0xFF != suffix_to_copy)
146191
{
147-
copy_chunk(offsets_pos + FILTER_SIMD_BYTES - suffix_to_copy, suffix_to_copy);
192+
res_data.insert(data_pos + FILTER_SIMD_BYTES - suffix_to_copy, data_pos + FILTER_SIMD_BYTES);
148193
}
149194
else
150195
{
151196
while (mask)
152197
{
153198
size_t index = std::countr_zero(mask);
154-
copy_chunk(offsets_pos + index, 1);
199+
res_data.push_back(data_pos[index]);
155200
mask &= mask - 1;
156201
}
157202
}
158203
}
159204
}
205+
// There is an alternative implementation which is similar to the one in filterArraysImplGeneric.
206+
// But according to the micro benchmark, the below implementation is slower.
207+
// So we choose to still use the above implementation.
208+
// while (mask)
209+
// {
210+
// // 100011111000 -> index: 3, length: 5, mask: 100000000000
211+
// size_t index = std::countr_zero(mask);
212+
// size_t length = std::countr_one(mask >> index);
213+
// res_data.insert(data_pos + index, data_pos + index + length);
214+
// mask &= MASKS[index + length];
215+
// }
160216

161217
filt_pos += FILTER_SIMD_BYTES;
162-
offsets_pos += FILTER_SIMD_BYTES;
163-
}
164-
165-
while (filt_pos < filt_end)
166-
{
167-
if (*filt_pos)
168-
copy_chunk(offsets_pos, 1);
169-
170-
++filt_pos;
171-
++offsets_pos;
218+
data_pos += FILTER_SIMD_BYTES;
172219
}
173220
}
174-
} // namespace
175221

222+
} // namespace
176223

177224
template <typename T>
178225
void filterArraysImpl(
@@ -239,49 +286,6 @@ INSTANTIATE(Float64)
239286

240287
#undef INSTANTIATE
241288

242-
namespace
243-
{
244-
template <typename T, typename Container>
245-
inline void filterImplAligned(
246-
const UInt8 *& filt_pos,
247-
const UInt8 *& filt_end_aligned,
248-
const T *& data_pos,
249-
Container & res_data)
250-
{
251-
while (filt_pos < filt_end_aligned)
252-
{
253-
UInt64 mask = ToBits64(filt_pos);
254-
if likely (0 != mask)
255-
{
256-
if (const UInt8 prefix_to_copy = prefixToCopy(mask); 0xFF != prefix_to_copy)
257-
{
258-
res_data.insert(data_pos, data_pos + prefix_to_copy);
259-
}
260-
else
261-
{
262-
if (const UInt8 suffix_to_copy = suffixToCopy(mask); 0xFF != suffix_to_copy)
263-
{
264-
res_data.insert(data_pos + FILTER_SIMD_BYTES - suffix_to_copy, data_pos + FILTER_SIMD_BYTES);
265-
}
266-
else
267-
{
268-
while (mask)
269-
{
270-
size_t index = std::countr_zero(mask);
271-
res_data.push_back(data_pos[index]);
272-
mask &= mask - 1;
273-
}
274-
}
275-
}
276-
}
277-
278-
filt_pos += FILTER_SIMD_BYTES;
279-
data_pos += FILTER_SIMD_BYTES;
280-
}
281-
}
282-
} // namespace
283-
284-
285289
template <typename T, typename Container>
286290
void filterImpl(const UInt8 * filt_pos, const UInt8 * filt_end, const T * data_pos, Container & res_data)
287291
{

dbms/src/Columns/tests/bench_column_filter.cpp

Lines changed: 91 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
// limitations under the License.
1414

1515

16+
#include <Columns/ColumnUtil.h>
1617
#include <Columns/ColumnVector.h>
1718
#include <Columns/countBytesInFilter.h>
1819
#include <Columns/filterColumn.h>
@@ -130,6 +131,72 @@ ColumnPtr filterAVX2(ColumnPtr & col, IColumn::Filter & filt, ssize_t result_siz
130131
const UInt8 * filt_end = filt_pos + size;
131132
const Int64 * data_pos = &data[0];
132133

134+
const UInt8 * filt_end_aligned = filt_pos + (filt_end - filt_pos) / FILTER_SIMD_BYTES * FILTER_SIMD_BYTES;
135+
while (filt_pos < filt_end_aligned)
136+
{
137+
UInt64 mask = ToBits64(filt_pos);
138+
if likely (0 != mask)
139+
{
140+
if (const UInt8 prefix_to_copy = prefixToCopy(mask); 0xFF != prefix_to_copy)
141+
{
142+
res_data.insert(data_pos, data_pos + prefix_to_copy);
143+
}
144+
else
145+
{
146+
if (const UInt8 suffix_to_copy = suffixToCopy(mask); 0xFF != suffix_to_copy)
147+
{
148+
res_data.insert(data_pos + FILTER_SIMD_BYTES - suffix_to_copy, data_pos + FILTER_SIMD_BYTES);
149+
}
150+
else
151+
{
152+
while (mask)
153+
{
154+
size_t index = std::countr_zero(mask);
155+
res_data.push_back(data_pos[index]);
156+
mask &= mask - 1;
157+
}
158+
}
159+
}
160+
}
161+
162+
filt_pos += FILTER_SIMD_BYTES;
163+
data_pos += FILTER_SIMD_BYTES;
164+
}
165+
166+
/// Process the tail.
167+
while (filt_pos < filt_end)
168+
{
169+
if (*filt_pos)
170+
res_data.push_back(*data_pos);
171+
++filt_pos;
172+
++data_pos;
173+
}
174+
175+
return res;
176+
}
177+
178+
ColumnPtr filterCurrent(ColumnPtr & col, IColumn::Filter & filt, ssize_t result_size_hint)
179+
{
180+
const auto & data = typeid_cast<const ColumnVector<Int64> *>(col.get())->getData();
181+
size_t size = col->size();
182+
if (size != filt.size())
183+
throw Exception("Size of filter doesn't match size of column.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);
184+
185+
auto res = ColumnVector<Int64>::create();
186+
using Container = ColumnVector<Int64>::Container;
187+
Container & res_data = res->getData();
188+
189+
if (result_size_hint)
190+
{
191+
if (result_size_hint < 0)
192+
result_size_hint = countBytesInFilter(filt);
193+
res_data.reserve(result_size_hint);
194+
}
195+
196+
const UInt8 * filt_pos = &filt[0];
197+
const UInt8 * filt_end = filt_pos + size;
198+
const Int64 * data_pos = &data[0];
199+
133200
filterImpl(filt_pos, filt_end, data_pos, res_data);
134201

135202
return res;
@@ -139,6 +206,7 @@ enum class FilterVersion
139206
{
140207
SSE2,
141208
AVX2,
209+
Current,
142210
};
143211

144212
template <typename... Args>
@@ -158,45 +226,66 @@ void columnFilter(benchmark::State & state, Args &&... args)
158226
{
159227
for (auto _ : state)
160228
{
161-
auto t = filterSSE2(col, filter, set_n * sizeof(Int64));
229+
auto t = filterSSE2(col, filter, set_n);
230+
benchmark::DoNotOptimize(t);
231+
}
232+
}
233+
else if (version == FilterVersion::AVX2)
234+
{
235+
for (auto _ : state)
236+
{
237+
auto t = filterAVX2(col, filter, set_n);
162238
benchmark::DoNotOptimize(t);
163239
}
164240
}
165241
else
166242
{
167243
for (auto _ : state)
168244
{
169-
auto t = filterAVX2(col, filter, set_n * sizeof(Int64));
245+
auto t = filterCurrent(col, filter, set_n);
170246
benchmark::DoNotOptimize(t);
171247
}
172248
}
173249
}
174250

175251
// Registers columnFilter once per (filter version, ratio) pair: the SSE2, AVX2 and
// Current (filterImpl-based) versions, each at 0.00, 0.01, 0.10 ... 0.90, 0.99 and 1.00.
// NOTE(review): the last argument is presumably the fraction of rows the generated
// filter selects -- confirm against columnFilter.
BENCHMARK_CAPTURE(columnFilter, sse2_00, FilterVersion::SSE2, DEFAULT_BLOCK_SIZE, 0.00);
176252
BENCHMARK_CAPTURE(columnFilter, avx2_00, FilterVersion::AVX2, DEFAULT_BLOCK_SIZE, 0.00);
253+
BENCHMARK_CAPTURE(columnFilter, cur_00, FilterVersion::Current, DEFAULT_BLOCK_SIZE, 0.00);
177254
BENCHMARK_CAPTURE(columnFilter, sse2_01, FilterVersion::SSE2, DEFAULT_BLOCK_SIZE, 0.01);
178255
BENCHMARK_CAPTURE(columnFilter, avx2_01, FilterVersion::AVX2, DEFAULT_BLOCK_SIZE, 0.01);
256+
BENCHMARK_CAPTURE(columnFilter, cur_01, FilterVersion::Current, DEFAULT_BLOCK_SIZE, 0.01);
179257
BENCHMARK_CAPTURE(columnFilter, sse2_10, FilterVersion::SSE2, DEFAULT_BLOCK_SIZE, 0.10);
180258
BENCHMARK_CAPTURE(columnFilter, avx2_10, FilterVersion::AVX2, DEFAULT_BLOCK_SIZE, 0.10);
259+
BENCHMARK_CAPTURE(columnFilter, cur_10, FilterVersion::Current, DEFAULT_BLOCK_SIZE, 0.10);
181260
BENCHMARK_CAPTURE(columnFilter, sse2_20, FilterVersion::SSE2, DEFAULT_BLOCK_SIZE, 0.20);
182261
BENCHMARK_CAPTURE(columnFilter, avx2_20, FilterVersion::AVX2, DEFAULT_BLOCK_SIZE, 0.20);
262+
BENCHMARK_CAPTURE(columnFilter, cur_20, FilterVersion::Current, DEFAULT_BLOCK_SIZE, 0.20);
183263
BENCHMARK_CAPTURE(columnFilter, sse2_30, FilterVersion::SSE2, DEFAULT_BLOCK_SIZE, 0.30);
184264
BENCHMARK_CAPTURE(columnFilter, avx2_30, FilterVersion::AVX2, DEFAULT_BLOCK_SIZE, 0.30);
265+
BENCHMARK_CAPTURE(columnFilter, cur_30, FilterVersion::Current, DEFAULT_BLOCK_SIZE, 0.30);
185266
BENCHMARK_CAPTURE(columnFilter, sse2_40, FilterVersion::SSE2, DEFAULT_BLOCK_SIZE, 0.40);
186267
BENCHMARK_CAPTURE(columnFilter, avx2_40, FilterVersion::AVX2, DEFAULT_BLOCK_SIZE, 0.40);
268+
BENCHMARK_CAPTURE(columnFilter, cur_40, FilterVersion::Current, DEFAULT_BLOCK_SIZE, 0.40);
187269
BENCHMARK_CAPTURE(columnFilter, sse2_50, FilterVersion::SSE2, DEFAULT_BLOCK_SIZE, 0.50);
188270
BENCHMARK_CAPTURE(columnFilter, avx2_50, FilterVersion::AVX2, DEFAULT_BLOCK_SIZE, 0.50);
271+
BENCHMARK_CAPTURE(columnFilter, cur_50, FilterVersion::Current, DEFAULT_BLOCK_SIZE, 0.50);
189272
BENCHMARK_CAPTURE(columnFilter, sse2_60, FilterVersion::SSE2, DEFAULT_BLOCK_SIZE, 0.60);
190273
BENCHMARK_CAPTURE(columnFilter, avx2_60, FilterVersion::AVX2, DEFAULT_BLOCK_SIZE, 0.60);
274+
BENCHMARK_CAPTURE(columnFilter, cur_60, FilterVersion::Current, DEFAULT_BLOCK_SIZE, 0.60);
191275
BENCHMARK_CAPTURE(columnFilter, sse2_70, FilterVersion::SSE2, DEFAULT_BLOCK_SIZE, 0.70);
192276
BENCHMARK_CAPTURE(columnFilter, avx2_70, FilterVersion::AVX2, DEFAULT_BLOCK_SIZE, 0.70);
277+
BENCHMARK_CAPTURE(columnFilter, cur_70, FilterVersion::Current, DEFAULT_BLOCK_SIZE, 0.70);
193278
BENCHMARK_CAPTURE(columnFilter, sse2_80, FilterVersion::SSE2, DEFAULT_BLOCK_SIZE, 0.80);
194279
BENCHMARK_CAPTURE(columnFilter, avx2_80, FilterVersion::AVX2, DEFAULT_BLOCK_SIZE, 0.80);
280+
BENCHMARK_CAPTURE(columnFilter, cur_80, FilterVersion::Current, DEFAULT_BLOCK_SIZE, 0.80);
195281
BENCHMARK_CAPTURE(columnFilter, sse2_90, FilterVersion::SSE2, DEFAULT_BLOCK_SIZE, 0.90);
196282
BENCHMARK_CAPTURE(columnFilter, avx2_90, FilterVersion::AVX2, DEFAULT_BLOCK_SIZE, 0.90);
283+
BENCHMARK_CAPTURE(columnFilter, cur_90, FilterVersion::Current, DEFAULT_BLOCK_SIZE, 0.90);
197284
BENCHMARK_CAPTURE(columnFilter, sse2_99, FilterVersion::SSE2, DEFAULT_BLOCK_SIZE, 0.99);
198285
BENCHMARK_CAPTURE(columnFilter, avx2_99, FilterVersion::AVX2, DEFAULT_BLOCK_SIZE, 0.99);
286+
BENCHMARK_CAPTURE(columnFilter, cur_99, FilterVersion::Current, DEFAULT_BLOCK_SIZE, 0.99);
199287
BENCHMARK_CAPTURE(columnFilter, sse2_100, FilterVersion::SSE2, DEFAULT_BLOCK_SIZE, 1.00);
200288
BENCHMARK_CAPTURE(columnFilter, avx2_100, FilterVersion::AVX2, DEFAULT_BLOCK_SIZE, 1.00);
289+
BENCHMARK_CAPTURE(columnFilter, cur_100, FilterVersion::Current, DEFAULT_BLOCK_SIZE, 1.00);
201290

202291
} // namespace bench

0 commit comments

Comments
 (0)