Skip to content

Commit e459279

Browse files
authored
[Chore](hash) use google/crc32c instead of rocksdb/crc32c and crc_hash (#58557)
Doris currently uses a crc32c implementation taken from RocksDB, but it performs worse than google/crc32c. Benchmark results: 66663538 rows int: crc32c-rocksdb 684.879ms, crc32c-google 206.360ms; 66663538 rows varchar: crc32c-rocksdb 1sec368ms, crc32c-google 391.290ms. We already have unit tests for rocksdb/crc32c ([be/test/util/crc32c_test.cpp](https://github.com/apache/doris/blob/master/be/test/util/crc32c_test.cpp)), so this change is safe. This pull request updates the codebase to use the more efficient and modern CRC32C hashing algorithm in place of the older CRC32 implementation. The changes include switching hash functions throughout the code, updating the CRC32C utility implementation to use an external library, and adding the required third-party dependency. This improves hash performance and consistency, and prepares the codebase for future compatibility. **Hashing algorithm migration:** * Replaced all usages of `HashUtil::crc_hash` with `HashUtil::crc32c_hash` in `block_bloom_filter.hpp`, `column_dictionary.h`, and `function_string.h` to utilize CRC32C for better performance and reliability. [[1]](diffhunk://#diff-635476edd1321096d1d32eb6453bed4624e8f23d0580750d515aaad9dfe5404eL79-R79) [[2]](diffhunk://#diff-635476edd1321096d1d32eb6453bed4624e8f23d0580750d515aaad9dfe5404eL108-R108) [[3]](diffhunk://#diff-bf8bb38b6a6eae6cccd7ed62ff64b1a77fbd273a614348b096330abea8331b4dL348-R348) [[4]](diffhunk://#diff-9cc694af32a330f9ffd947df039bdfc12be67b2107c9e612d7861b17c5018176L4601-R4601) * Added the new `crc32c_hash` method to `HashUtil` and marked the old `crc_hash` as deprecated, retaining it only for backward compatibility with historical data. 
[[1]](diffhunk://#diff-92d951e58f5e0b824254f5eb0d931b604518e4bfbe666b665cd56ed9435667bbL52-R58) [[2]](diffhunk://#diff-92d951e58f5e0b824254f5eb0d931b604518e4bfbe666b665cd56ed9435667bbR68-R69) [[3]](diffhunk://#diff-92d951e58f5e0b824254f5eb0d931b604518e4bfbe666b665cd56ed9435667bbL120-L124) **CRC32C utility refactor and dependency management:** * Refactored `crc32c.cpp` and `crc32c.h` to use the external `crc32c` library, removing the previous custom implementation and lookup tables. Added new utility functions for CRC32C operations. [[1]](diffhunk://#diff-1a21d70259827997bdfd54da21acd6db2ae0a29465873b53dbf8c7e9c6a7e265L18-R38) [[2]](diffhunk://#diff-72d5c6ec3fe2da095fe1413472778c1d56027242035bdb83c62339ccfcca6ed6L18-R33) * Added the `crc32c` third-party dependency in the build configuration to support the new CRC32C utility. **Build and header updates:** * Updated includes in `hash_util.hpp` to reference the new CRC32C utility.
1 parent 3e785ee commit e459279

File tree

7 files changed

+28
-273
lines changed

7 files changed

+28
-273
lines changed

be/cmake/thirdparty.cmake

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ add_thirdparty(curl)
6868
add_thirdparty(lz4)
6969
add_thirdparty(thrift)
7070
add_thirdparty(thriftnb)
71+
add_thirdparty(crc32c)
7172

7273
add_thirdparty(libevent_core LIBNAME "lib/libevent_core.a")
7374
add_thirdparty(libevent_openssl LIBNAME "lib/libevent_openssl.a")

be/src/exprs/block_bloom_filter.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ class BlockBloomFilter {
7676
// Same as above with convenience of hashing the key.
7777
void insert(const StringRef& key) noexcept {
7878
if (key.data) {
79-
insert(HashUtil::crc_hash(key.data, uint32_t(key.size), _hash_seed));
79+
insert(HashUtil::crc32c_hash(key.data, uint32_t(key.size), _hash_seed));
8080
}
8181
}
8282

@@ -105,7 +105,7 @@ class BlockBloomFilter {
105105
// Same as above with convenience of hashing the key.
106106
bool find(const StringRef& key) const noexcept {
107107
if (key.data) {
108-
return find(HashUtil::crc_hash(key.data, uint32_t(key.size), _hash_seed));
108+
return find(HashUtil::crc32c_hash(key.data, uint32_t(key.size), _hash_seed));
109109
}
110110
return false;
111111
}

be/src/util/crc32c.cpp

Lines changed: 11 additions & 243 deletions
Large diffs are not rendered by default.

be/src/util/crc32c.h

Lines changed: 3 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -15,39 +15,22 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18-
// the following code are modified from RocksDB:
19-
// https://github.com/facebook/rocksdb/blob/master/util/crc32c.h
20-
2118
#pragma once
2219

23-
#include <stddef.h>
24-
#include <stdint.h>
25-
2620
#include <vector>
2721

2822
#include "util/slice.h"
2923

3024
namespace doris {
3125
namespace crc32c {
3226

33-
// Return the crc32c of concat(A, data[0,n-1]) where init_crc is the
34-
// crc32c of some string A. Extend() is often used to maintain the
35-
// crc32c of a stream of data.
36-
extern uint32_t Extend(uint32_t init_crc, const char* data, size_t n);
27+
uint32_t Extend(uint32_t crc, const char* data, size_t n);
3728

3829
// Return the crc32c of data[0,n-1]
39-
inline uint32_t Value(const char* data, size_t n) {
40-
return Extend(0, data, n);
41-
}
30+
uint32_t Value(const char* data, size_t n);
4231

4332
// Return the crc32c of data content in all slices
44-
inline uint32_t Value(const std::vector<Slice>& slices) {
45-
uint32_t crc = 0;
46-
for (auto& slice : slices) {
47-
crc = Extend(crc, slice.get_data(), slice.get_size());
48-
}
49-
return crc;
50-
}
33+
uint32_t Value(const std::vector<Slice>& slices);
5134

5235
} // namespace crc32c
5336
} // namespace doris

be/src/util/hash_util.hpp

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030

3131
#include "common/compiler_util.h" // IWYU pragma: keep
3232
#include "util/cpu_info.h"
33+
#include "util/crc32c.h"
3334
#include "util/hash/city.h"
3435
#include "util/murmur_hash3.h"
3536
#include "util/sse_util.hpp"
@@ -49,7 +50,12 @@ class HashUtil {
4950
return (uint32_t)crc32(hash, (const unsigned char*)(&INT_VALUE), 4);
5051
}
5152

52-
#if defined(__SSE4_2__) || defined(__aarch64__)
53+
// ATTN: crc32c's result is different with zlib_crc32 coz of different polynomial
54+
// crc32c have better performance than zlib_crc32/crc_hash
55+
static uint32_t crc32c_hash(const void* data, uint32_t bytes, uint32_t hash) {
56+
return crc32c::Extend(hash, static_cast<const char*>(data), bytes);
57+
}
58+
5359
// Compute the Crc32 hash for data using SSE4 instructions. The input hash parameter is
5460
// the current hash/seed value.
5561
// This should only be called if SSE is supported.
@@ -59,6 +65,8 @@ class HashUtil {
5965
// NOTE: Any changes made to this function need to be reflected in Codegen::GetHashFn.
6066
// TODO: crc32 hashes with different seeds do not result in different hash functions.
6167
// The resulting hashes are correlated.
68+
// ATTN: prefer do not use this function anymore, use crc32c_hash instead
69+
// This function is retained because it is not certain whether there are compatibility issues with historical data.
6270
static uint32_t crc_hash(const void* data, uint32_t bytes, uint32_t hash) {
6371
if (!CpuInfo::is_supported(CpuInfo::SSE4_2)) {
6472
return zlib_crc_hash(data, bytes, hash);
@@ -117,11 +125,6 @@ class HashUtil {
117125

118126
return converter.u64;
119127
}
120-
#else
121-
static uint32_t crc_hash(const void* data, uint32_t bytes, uint32_t hash) {
122-
return zlib_crc_hash(data, bytes, hash);
123-
}
124-
#endif
125128

126129
// refer to https://github.com/apache/commons-codec/blob/master/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
127130
static const uint32_t MURMUR3_32_SEED = 104729;

be/src/vec/columns/column_dictionary.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -345,7 +345,7 @@ class ColumnDictI32 final : public COWHelper<IColumn, ColumnDictI32> {
345345
if (type == FieldType::OLAP_FIELD_TYPE_CHAR) {
346346
len = strnlen(sv.data, sv.size);
347347
}
348-
uint32_t hash_val = HashUtil::crc_hash(sv.data, static_cast<uint32_t>(len), 0);
348+
uint32_t hash_val = HashUtil::crc32c_hash(sv.data, static_cast<uint32_t>(len), 0);
349349
_hash_values[code] = hash_val;
350350
_compute_hash_value_flags[code] = 1;
351351
return _hash_values[code];

be/src/vec/functions/function_string.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4598,7 +4598,7 @@ class FunctionNgramSearch : public IFunction {
45984598

45994599
uint32_t sub_str_hash(const char* data, int32_t length) const {
46004600
constexpr static uint32_t seed = 0;
4601-
return HashUtil::crc_hash(data, length, seed);
4601+
return HashUtil::crc32c_hash(data, length, seed);
46024602
}
46034603

46044604
template <bool column_const>

0 commit comments

Comments
 (0)