@@ -21,23 +21,32 @@ use databend_common_expression::types::BinaryType;
21
21
use databend_common_expression:: types:: DataType ;
22
22
use databend_common_expression:: types:: GenericType ;
23
23
use databend_common_expression:: types:: NullableType ;
24
+ use databend_common_expression:: types:: NumberDataType ;
24
25
use databend_common_expression:: types:: NumberType ;
25
26
use databend_common_expression:: types:: ReturnType ;
27
+ use databend_common_expression:: types:: StringType ;
26
28
use databend_common_expression:: types:: ValueType ;
29
+ use databend_common_expression:: types:: ALL_NUMERICS_TYPES ;
30
+ use databend_common_expression:: vectorize_with_builder_1_arg;
27
31
use databend_common_expression:: vectorize_with_builder_2_arg;
32
+ use databend_common_expression:: with_number_mapped_type;
28
33
use databend_common_expression:: Column ;
29
34
use databend_common_expression:: FixedLengthEncoding ;
30
35
use databend_common_expression:: Function ;
31
36
use databend_common_expression:: FunctionDomain ;
32
37
use databend_common_expression:: FunctionEval ;
38
+ use databend_common_expression:: FunctionProperty ;
33
39
use databend_common_expression:: FunctionRegistry ;
34
40
use databend_common_expression:: FunctionSignature ;
35
41
use databend_common_expression:: ScalarRef ;
36
42
use databend_common_expression:: Value ;
43
+ use rand:: rngs:: SmallRng ;
44
+ use rand:: Rng ;
45
+ use rand:: SeedableRng ;
37
46
38
47
/// Registers Hilbert curve related functions with the function registry.
39
48
pub fn register ( registry : & mut FunctionRegistry ) {
40
- // Register the hilbert_range_index function that calculates Hilbert indices for multi-dimensional data
49
+ // Register the hilbert_range_index function that calculates Hilbert indices for multidimensional data
41
50
registry. register_function_factory ( "hilbert_range_index" , |_, args_type| {
42
51
let args_num = args_type. len ( ) ;
43
52
// The function supports 2, 3, 4, or 5 dimensions (each dimension requires 2 arguments)
@@ -96,7 +105,7 @@ pub fn register(registry: &mut FunctionRegistry) {
96
105
points. push ( key) ;
97
106
}
98
107
99
- // Convert the multi-dimensional point to a Hilbert index
108
+ // Convert the multidimensional point to a Hilbert index
100
109
// This maps the n-dimensional point to a 1-dimensional value
101
110
let points = points
102
111
. iter ( )
@@ -151,6 +160,88 @@ pub fn register(registry: &mut FunctionRegistry) {
151
160
builder. push ( id) ;
152
161
} ) ,
153
162
) ;
163
+
164
+ // We append random u8 noise, drawn from a SmallRng seeded with OS entropy, at the end of the binary key.
165
+ // This introduces noise to break tie cases in clustering keys that are not uniformly distributed.
166
+ // Although this may slightly affect the accuracy of range_bound estimation,
167
+ // it ensures that Hilbert index + scatter will no longer suffer from data skew.
168
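+ // Moreover, since the noise is added at the tail, distinct keys of equal length (e.g. fixed-width encoded numbers) keep their original relative order under bytewise comparison; a string key that is a strict prefix of another may be reordered.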
169
+ registry. properties . insert (
170
+ "add_noise" . to_string ( ) ,
171
+ FunctionProperty :: default ( ) . non_deterministic ( ) ,
172
+ ) ;
173
+
174
+ registry. register_passthrough_nullable_1_arg :: < StringType , BinaryType , _ , _ > (
175
+ "add_noise" ,
176
+ |_, _| FunctionDomain :: Full ,
177
+ vectorize_with_builder_1_arg :: < StringType , BinaryType > ( |val, builder, _| {
178
+ let mut bytes = val. as_bytes ( ) . to_vec ( ) ;
179
+ let mut rng = SmallRng :: from_entropy ( ) ;
180
+ bytes. push ( rng. gen :: < u8 > ( ) ) ;
181
+ builder. put_slice ( & bytes) ;
182
+ builder. commit_row ( ) ;
183
+ } ) ,
184
+ ) ;
185
+
186
+ for ty in ALL_NUMERICS_TYPES {
187
+ with_number_mapped_type ! ( |NUM_TYPE | match ty {
188
+ NumberDataType :: NUM_TYPE => {
189
+ registry
190
+ . register_passthrough_nullable_1_arg:: <NumberType <NUM_TYPE >, BinaryType , _, _>(
191
+ "add_noise" ,
192
+ |_, _| FunctionDomain :: Full ,
193
+ vectorize_with_builder_1_arg:: <NumberType <NUM_TYPE >, BinaryType >(
194
+ |val, builder, _| {
195
+ let mut encoded = val. encode( ) . to_vec( ) ;
196
+ let mut rng = SmallRng :: from_entropy( ) ;
197
+ encoded. push( rng. gen :: <u8 >( ) ) ;
198
+ builder. put_slice( & encoded) ;
199
+ builder. commit_row( ) ;
200
+ } ,
201
+ ) ,
202
+ ) ;
203
+ }
204
+ } )
205
+ }
206
+
207
+ registry. register_passthrough_nullable_2_arg :: < StringType , NumberType < u64 > , BinaryType , _ , _ > (
208
+ "add_noise" ,
209
+ |_, _, _| FunctionDomain :: Full ,
210
+ vectorize_with_builder_2_arg :: < StringType , NumberType < u64 > , BinaryType > (
211
+ |val, level, builder, _| {
212
+ let mut bytes = val. as_bytes ( ) . to_vec ( ) ;
213
+ let mut rng = SmallRng :: from_entropy ( ) ;
214
+ for _ in 0 ..level {
215
+ bytes. push ( rng. gen :: < u8 > ( ) ) ;
216
+ }
217
+ builder. put_slice ( & bytes) ;
218
+ builder. commit_row ( ) ;
219
+ } ,
220
+ ) ,
221
+ ) ;
222
+
223
+ for ty in ALL_NUMERICS_TYPES {
224
+ with_number_mapped_type ! ( |NUM_TYPE | match ty {
225
+ NumberDataType :: NUM_TYPE => {
226
+ registry
227
+ . register_passthrough_nullable_2_arg:: <NumberType <NUM_TYPE >, NumberType <u64 >, BinaryType , _, _>(
228
+ "add_noise" ,
229
+ |_, _, _| FunctionDomain :: Full ,
230
+ vectorize_with_builder_2_arg:: <NumberType <NUM_TYPE >, NumberType <u64 >, BinaryType >(
231
+ |val, level, builder, _| {
232
+ let mut encoded = val. encode( ) . to_vec( ) ;
233
+ let mut rng = SmallRng :: from_entropy( ) ;
234
+ for _ in 0 ..level {
235
+ encoded. push( rng. gen :: <u8 >( ) ) ;
236
+ }
237
+ builder. put_slice( & encoded) ;
238
+ builder. commit_row( ) ;
239
+ } ,
240
+ ) ,
241
+ ) ;
242
+ }
243
+ } )
244
+ }
154
245
}
155
246
156
247
/// Calculates the partition ID for a value based on range boundaries.
0 commit comments