Skip to content
This repository was archived by the owner on May 9, 2024. It is now read-only.

Commit 0e89ec1

Browse files
committed
Generate optimized overload of non-vectorized bitmap generator
This function can be selected on systems supporting BMI2, which introduces shift instructions that do not modify the flags register. This simplifies the uop pipeline and improves performance.
1 parent 0543443 commit 0e89ec1

File tree

3 files changed

+24
-18
lines changed

3 files changed

+24
-18
lines changed

omniscidb/ResultSet/BitmapGenerators.cpp

Lines changed: 19 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,15 @@
1010

1111
#include <cstring>
1212

13-
#include <iostream>
14-
1513
#ifndef _WIN32
16-
size_t __attribute__((target("avx512f", "avx512bw"), optimize("no-tree-vectorize")))
14+
15+
#if defined(__clang__)
16+
#define AVX512_TARGET target("avx512bw")
17+
#else
18+
#define AVX512_TARGET target("avx512f", "avx512bw")
19+
#endif
20+
21+
size_t __attribute__((AVX512_TARGET, optimize("no-tree-vectorize")))
1722
gen_null_bitmap_8_impl(uint8_t* dst,
1823
const uint8_t* src,
1924
size_t size,
@@ -40,7 +45,7 @@ gen_null_bitmap_8_impl(uint8_t* dst,
4045
return null_count;
4146
}
4247

43-
size_t __attribute__((target("avx512f", "avx512bw"), optimize("no-tree-vectorize")))
48+
size_t __attribute__((AVX512_TARGET, optimize("no-tree-vectorize")))
4449
gen_null_bitmap_16_impl(uint8_t* dst,
4550
const uint16_t* src,
4651
size_t size,
@@ -67,7 +72,7 @@ gen_null_bitmap_16_impl(uint8_t* dst,
6772
return null_count;
6873
}
6974

70-
size_t __attribute__((target("avx512f", "avx512bw"), optimize("no-tree-vectorize")))
75+
size_t __attribute__((AVX512_TARGET, optimize("no-tree-vectorize")))
7176
gen_null_bitmap_32_impl(uint8_t* dst,
7277
const uint32_t* src,
7378
size_t size,
@@ -93,7 +98,7 @@ gen_null_bitmap_32_impl(uint8_t* dst,
9398
return null_count;
9499
}
95100

96-
size_t __attribute__((target("avx512f", "avx512bw"), optimize("no-tree-vectorize")))
101+
size_t __attribute__((AVX512_TARGET, optimize("no-tree-vectorize")))
97102
gen_null_bitmap_64_impl(uint8_t* dst,
98103
const uint64_t* src,
99104
size_t size,
@@ -120,11 +125,15 @@ gen_null_bitmap_64_impl(uint8_t* dst,
120125
}
121126
#endif
122127

128+
#if defined(_MSC_VER) || defined(__clang__)
129+
#define BITMAP_GEN_DEFAULT_TARGET_ATTRS
130+
#else
131+
#define BITMAP_GEN_DEFAULT_TARGET_ATTRS __attribute__((target_clones("bmi2", "default")))
132+
#endif
133+
123134
template <typename TYPE>
124-
size_t gen_null_bitmap_default(uint8_t* dst,
125-
const TYPE* src,
126-
size_t size,
127-
const TYPE null_val) {
135+
size_t BITMAP_GEN_DEFAULT_TARGET_ATTRS
136+
gen_null_bitmap_default(uint8_t* dst, const TYPE* src, size_t size, const TYPE null_val) {
128137
size_t null_count = 0;
129138
TYPE loaded_data[8];
130139

@@ -156,7 +165,6 @@ size_t BITMAP_GEN_TARGET_ATTRS gen_null_bitmap_8_impl(uint8_t* dst,
156165
const uint8_t* src,
157166
size_t size,
158167
const uint8_t null_val) {
159-
// std::cout << "Using default impl " << std::endl;
160168
return gen_null_bitmap_default<uint8_t>(dst, src, size, null_val);
161169
}
162170

omniscidb/Tests/BitmapGeneratorBenchmark.cpp

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,8 +45,6 @@ static std::unique_ptr<T[]> gen_data(size_t size, const T null_val, float percen
4545
return data_buf;
4646
}
4747

48-
#include <iostream>
49-
5048
static auto alloc_bitmap(size_t input_sz) {
5149
// allocate extra space to ensure we can align
5250
size_t crt_sz = input_sz;
@@ -61,7 +59,6 @@ static auto alloc_bitmap(size_t input_sz) {
6159
return bitmap_buf_owned;
6260
}
6361

64-
// TODO: get null count and verify
6562
static void null_bitmap_8(benchmark::State& state) {
6663
// generate a 4096 element buffer with 25% nulls (+ a few more since the null sentinel
6764
// will match the max size in the buffer)

omniscidb/Tests/CMakeLists.txt

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -58,8 +58,11 @@ endif()
5858

5959
# Tests + Microbenchmarks
6060
add_executable(StringDictionaryBenchmark StringDictionaryBenchmark.cpp)
61-
add_executable(BitmapGeneratorBenchmark BitmapGeneratorBenchmark.cpp)
62-
target_link_libraries(BitmapGeneratorBenchmark ResultSet)
61+
if (NOT WIN32)
62+
# aligned_alloc is missing on Windows
63+
add_executable(BitmapGeneratorBenchmark BitmapGeneratorBenchmark.cpp)
64+
target_link_libraries(BitmapGeneratorBenchmark ResultSet benchmark)
65+
endif()
6366

6467
if(ENABLE_L0)
6568
add_executable(L0MgrExecuteTest L0MgrExecuteTest.cpp)
@@ -136,8 +139,6 @@ else()
136139
target_link_libraries(StringDictionaryBenchmark benchmark gtest StringDictionary Logger Utils $<$<AND:$<CXX_COMPILER_ID:GNU>,$<VERSION_LESS:$<CXX_COMPILER_VERSION>,9.0>>:stdc++fs> ${CMAKE_DL_LIBS} ${Boost_LIBRARIES} ${ZLIB_LIBRARIES})
137140
endif()
138141

139-
target_link_libraries(BitmapGeneratorBenchmark benchmark)
140-
141142
if(ENABLE_CUDA)
142143
target_link_libraries(GpuSharedMemoryTest gtest Logger QueryEngine)
143144
endif()

0 commit comments

Comments
 (0)