Skip to content

Commit fea543c

Browse files
authored
[Feature](func) Support function QUANTILE_STATE_TO/FROM_BASE64 (apache#59664)
### What problem does this PR solve?

Issue Number: close #xxx

Related PR: apache#59607

Problem Summary:

### Release note

None

### Check List (For Author)

- Test <!-- At least one of them must be included. -->
    - [x] Regression test
    - [ ] Unit Test
    - [ ] Manual test (add detailed scripts or steps below)
    - [ ] No need to test or manual test. Explain why:
        - [ ] This is a refactor/code format and no logic has been changed.
        - [ ] Previous test can cover this change.
        - [ ] No code files have been changed.
        - [ ] Other reason <!-- Add your reason? -->
- Behavior changed:
    - [ ] No.
    - [ ] Yes. <!-- Explain the behavior change -->
- Does this need documentation?
    - [ ] No.
    - [ ] Yes. <!-- Add document PR link here. eg: apache/doris-website#1214 -->

### Check List (For Reviewer who merge this PR)

- [ ] Confirm the release note
- [ ] Confirm test cases
- [ ] Confirm document
- [ ] Add branch pick label <!-- Add branch pick label that this PR should merge into -->
1 parent e1e58d1 commit fea543c

File tree

9 files changed

+627
-0
lines changed

9 files changed

+627
-0
lines changed

be/src/vec/functions/function_quantile_state.cpp

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
#include "common/compiler_util.h" // IWYU pragma: keep
3030
#include "common/status.h"
3131
#include "util/quantile_state.h"
32+
#include "util/url_coding.h"
3233
#include "vec/aggregate_functions/aggregate_function.h"
3334
#include "vec/columns/column.h"
3435
#include "vec/columns/column_complex.h"
@@ -45,9 +46,11 @@
4546
#include "vec/data_types/data_type_nullable.h"
4647
#include "vec/data_types/data_type_number.h"
4748
#include "vec/data_types/data_type_quantilestate.h" // IWYU pragma: keep
49+
#include "vec/data_types/data_type_string.h"
4850
#include "vec/functions/function.h"
4951
#include "vec/functions/function_const.h"
5052
#include "vec/functions/function_helpers.h"
53+
#include "vec/functions/function_totype.h"
5154
#include "vec/functions/simple_function_factory.h"
5255
#include "vec/utils/util.hpp"
5356

@@ -210,10 +213,134 @@ class FunctionQuantileStatePercent : public IFunction {
210213
}
211214
};
212215

216+
class FunctionQuantileStateFromBase64 : public IFunction {
217+
public:
218+
static constexpr auto name = "quantile_state_from_base64";
219+
String get_name() const override { return name; }
220+
221+
static FunctionPtr create() { return std::make_shared<FunctionQuantileStateFromBase64>(); }
222+
223+
DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
224+
return std::make_shared<DataTypeNullable>(std::make_shared<DataTypeQuantileState>());
225+
}
226+
227+
size_t get_number_of_arguments() const override { return 1; }
228+
229+
bool use_default_implementation_for_nulls() const override { return true; }
230+
231+
Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
232+
uint32_t result, size_t input_rows_count) const override {
233+
auto res_null_map = ColumnUInt8::create(input_rows_count, 0);
234+
auto res_data_column = ColumnQuantileState::create();
235+
auto& null_map = res_null_map->get_data();
236+
auto& res = res_data_column->get_data();
237+
238+
auto& argument_column = block.get_by_position(arguments[0]).column;
239+
const auto& str_column = static_cast<const ColumnString&>(*argument_column);
240+
const ColumnString::Chars& data = str_column.get_chars();
241+
const ColumnString::Offsets& offsets = str_column.get_offsets();
242+
243+
res.reserve(input_rows_count);
244+
245+
std::string decode_buff;
246+
int64_t last_decode_buff_len = 0;
247+
int64_t curr_decode_buff_len = 0;
248+
for (size_t i = 0; i < input_rows_count; ++i) {
249+
const char* src_str = reinterpret_cast<const char*>(&data[offsets[i - 1]]);
250+
int64_t src_size = offsets[i] - offsets[i - 1];
251+
252+
if (src_size == 0 || 0 != src_size % 4) {
253+
res.emplace_back();
254+
null_map[i] = 1;
255+
continue;
256+
}
257+
258+
curr_decode_buff_len = src_size + 3;
259+
if (curr_decode_buff_len > last_decode_buff_len) {
260+
decode_buff.resize(curr_decode_buff_len);
261+
last_decode_buff_len = curr_decode_buff_len;
262+
}
263+
auto outlen = base64_decode(src_str, src_size, decode_buff.data());
264+
if (outlen < 0) {
265+
res.emplace_back();
266+
null_map[i] = 1;
267+
} else {
268+
doris::Slice decoded_slice(decode_buff.data(), outlen);
269+
doris::QuantileState quantile_state;
270+
if (!quantile_state.deserialize(decoded_slice)) {
271+
return Status::RuntimeError(fmt::format(
272+
"quantile_state_from_base64 decode failed: base64: {}", src_str));
273+
} else {
274+
res.emplace_back(std::move(quantile_state));
275+
}
276+
}
277+
}
278+
279+
block.get_by_position(result).column =
280+
ColumnNullable::create(std::move(res_data_column), std::move(res_null_map));
281+
return Status::OK();
282+
}
283+
};
284+
285+
/// Name tag supplying the function name "quantile_state_to_base64"; it is used as
/// the name argument of the FunctionUnaryToType instantiation in this file.
struct NameQuantileStateToBase64 {
    static constexpr const char* name = "quantile_state_to_base64";
};
288+
289+
struct QuantileStateToBase64 {
290+
using ReturnType = DataTypeString;
291+
static constexpr auto PrimitiveTypeImpl = PrimitiveType::TYPE_QUANTILE_STATE;
292+
using Type = DataTypeQuantileState::FieldType;
293+
using ReturnColumnType = ColumnString;
294+
using Chars = ColumnString::Chars;
295+
using Offsets = ColumnString::Offsets;
296+
297+
static Status vector(const std::vector<QuantileState>& data, Chars& chars, Offsets& offsets) {
298+
size_t size = data.size();
299+
offsets.resize(size);
300+
size_t output_char_size = 0;
301+
for (size_t i = 0; i < size; ++i) {
302+
auto& quantile_state_val = const_cast<QuantileState&>(data[i]);
303+
auto ser_size = quantile_state_val.get_serialized_size();
304+
output_char_size += (int)(4.0 * ceil((double)ser_size / 3.0));
305+
}
306+
ColumnString::check_chars_length(output_char_size, size);
307+
chars.resize(output_char_size);
308+
auto* chars_data = chars.data();
309+
310+
size_t cur_ser_size = 0;
311+
size_t last_ser_size = 0;
312+
std::string ser_buff;
313+
size_t encoded_offset = 0;
314+
for (size_t i = 0; i < size; ++i) {
315+
auto& quantile_state_val = const_cast<QuantileState&>(data[i]);
316+
317+
cur_ser_size = quantile_state_val.get_serialized_size();
318+
if (cur_ser_size > last_ser_size) {
319+
last_ser_size = cur_ser_size;
320+
ser_buff.resize(cur_ser_size);
321+
}
322+
size_t real_size =
323+
quantile_state_val.serialize(reinterpret_cast<uint8_t*>(ser_buff.data()));
324+
auto outlen = base64_encode((const unsigned char*)ser_buff.data(), real_size,
325+
chars_data + encoded_offset);
326+
DCHECK(outlen > 0);
327+
328+
encoded_offset += outlen;
329+
offsets[i] = cast_set<uint32_t>(encoded_offset);
330+
}
331+
return Status::OK();
332+
}
333+
};
334+
335+
// The quantile_state_to_base64 function type: FunctionUnaryToType instantiated with the
// QuantileStateToBase64 implementation and the NameQuantileStateToBase64 name tag.
using FunctionQuantileStateToBase64 =
336+
FunctionUnaryToType<QuantileStateToBase64, NameQuantileStateToBase64>;
337+
213338
// Registers the quantile-state functions of this file with `factory`, including the
// two base64 conversion functions (from_base64 / to_base64).
void register_function_quantile_state(SimpleFunctionFactory& factory) {
214339
factory.register_function<FunctionConst<QuantileStateEmpty, false>>();
215340
factory.register_function<FunctionQuantileStatePercent>();
216341
factory.register_function<FunctionToQuantileState>();
342+
factory.register_function<FunctionQuantileStateFromBase64>();
343+
factory.register_function<FunctionQuantileStateToBase64>();
217344
}
218345

219346
} // namespace doris::vectorized
Lines changed: 216 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,216 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
#include <gtest/gtest.h>
18+
19+
#include <string>
20+
21+
#include "function_test_util.h"
22+
#include "runtime/define_primitive_type.h"
23+
#include "util/quantile_state.h"
24+
#include "util/url_coding.h"
25+
#include "vec/core/types.h"
26+
#include "vec/data_types/data_type_quantilestate.h"
27+
#include "vec/data_types/data_type_string.h"
28+
29+
namespace doris::vectorized {
30+
31+
// Feeds quantile states holding 0, 1, 5, 100 and 3000 values to
// quantile_state_to_base64 and compares each result with a reference string built
// by serializing the same state and base64-encoding the bytes.
TEST(function_quantile_state_test, function_quantile_state_to_base64) {
    std::string func_name = "quantile_state_to_base64";
    InputTypeSet input_types = {PrimitiveType::TYPE_QUANTILE_STATE};

    // Adds the values 0, 1, ..., count - 1 to `state`.
    auto fill_sequence = [](QuantileState& state, int count) {
        for (int v = 0; v < count; v++) {
            state.add_value(static_cast<double>(v));
        }
    };

    QuantileState empty_quantile_state;

    QuantileState single_quantile_state;
    single_quantile_state.add_value(1.0);

    QuantileState multi_quantile_state;
    for (int v = 1; v <= 5; v++) {
        multi_quantile_state.add_value(static_cast<double>(v));
    }

    QuantileState explicit_quantile_state;
    fill_sequence(explicit_quantile_state, 100);

    QuantileState tdigest_quantile_state;
    fill_sequence(tdigest_quantile_state, 3000);

    // Scratch buffers reused for every reference encoding.
    uint8_t buf[65536];
    unsigned char encoded_buf[131072];

    // Serializes `state` and returns the base64 text of the serialized bytes.
    auto reference_base64 = [&buf, &encoded_buf](QuantileState& state) {
        size_t serialized_len = state.serialize(buf);
        size_t text_len = base64_encode(buf, serialized_len, encoded_buf);
        return std::string(reinterpret_cast<char*>(encoded_buf), text_len);
    };

    std::string empty_base64 = reference_base64(empty_quantile_state);
    std::string single_base64 = reference_base64(single_quantile_state);
    std::string multi_base64 = reference_base64(multi_quantile_state);
    std::string explicit_base64 = reference_base64(explicit_quantile_state);
    std::string tdigest_base64 = reference_base64(tdigest_quantile_state);

    DataSet data_set = {{{&empty_quantile_state}, empty_base64},
                        {{&single_quantile_state}, single_base64},
                        {{&multi_quantile_state}, multi_base64},
                        {{&explicit_quantile_state}, explicit_base64},
                        {{&tdigest_quantile_state}, tdigest_base64}};

    static_cast<void>(check_function<DataTypeString, true>(func_name, input_types, data_set));
}
105+
106+
// Checks that quantile states holding 0, 1 and 5 values survive
// serialize -> base64_encode -> base64_decode -> deserialize.
//
// NOTE(review): this test drives base64_decode and QuantileState::deserialize
// directly; it does not invoke the registered quantile_state_from_base64 function.
// Consider adding a check_function-based case for the function itself.
TEST(function_quantile_state_test, function_quantile_state_from_base64) {
    // Reference states the decoded values are compared against.
    QuantileState empty_quantile_state;

    QuantileState single_quantile_state;
    single_quantile_state.add_value(1.0);

    QuantileState multi_quantile_state;
    multi_quantile_state.add_value(1.0);
    multi_quantile_state.add_value(2.0);
    multi_quantile_state.add_value(3.0);
    multi_quantile_state.add_value(4.0);
    multi_quantile_state.add_value(5.0);

    uint8_t buf[65536];
    unsigned char encoded_buf[131072];

    // Serializes `state` and returns the base64 text of the serialized bytes.
    auto encode = [&buf, &encoded_buf](QuantileState& state) {
        size_t len = state.serialize(buf);
        size_t encoded_len = base64_encode(buf, len, encoded_buf);
        return std::string(reinterpret_cast<char*>(encoded_buf), encoded_len);
    };

    const std::string empty_base64 = encode(empty_quantile_state);
    const std::string single_base64 = encode(single_quantile_state);
    const std::string multi_base64 = encode(multi_quantile_state);

    {
        char decoded_buf[65536];
        const auto decoded_len =
                base64_decode(empty_base64.c_str(), empty_base64.length(), decoded_buf);
        // Fatal on failure: a non-positive length must never be turned into a Slice size.
        ASSERT_GT(decoded_len, 0);

        QuantileState decoded_empty;
        doris::Slice slice(decoded_buf, static_cast<size_t>(decoded_len));
        EXPECT_TRUE(decoded_empty.deserialize(slice));

        EXPECT_TRUE(std::isnan(empty_quantile_state.get_value_by_percentile(0.5)));
        EXPECT_TRUE(std::isnan(decoded_empty.get_value_by_percentile(0.5)));
    }

    {
        char decoded_buf[65536];
        const auto decoded_len =
                base64_decode(single_base64.c_str(), single_base64.length(), decoded_buf);
        ASSERT_GT(decoded_len, 0);

        QuantileState decoded_single;
        doris::Slice slice(decoded_buf, static_cast<size_t>(decoded_len));
        EXPECT_TRUE(decoded_single.deserialize(slice));

        EXPECT_NEAR(single_quantile_state.get_value_by_percentile(0.5),
                    decoded_single.get_value_by_percentile(0.5), 0.01);
    }

    {
        char decoded_buf[65536];
        const auto decoded_len =
                base64_decode(multi_base64.c_str(), multi_base64.length(), decoded_buf);
        ASSERT_GT(decoded_len, 0);

        QuantileState decoded_multi;
        doris::Slice slice(decoded_buf, static_cast<size_t>(decoded_len));
        EXPECT_TRUE(decoded_multi.deserialize(slice));

        EXPECT_NEAR(multi_quantile_state.get_value_by_percentile(0.5),
                    decoded_multi.get_value_by_percentile(0.5), 0.01);
        EXPECT_NEAR(multi_quantile_state.get_value_by_percentile(0.9),
                    decoded_multi.get_value_by_percentile(0.9), 0.01);
    }
}
188+
189+
// Round-trips a state holding the 50 values 0, 2, ..., 98 through
// serialize -> base64_encode -> base64_decode -> deserialize and compares the
// 0.5 and 0.9 percentiles of the original and the recovered state.
TEST(function_quantile_state_test, function_quantile_state_roundtrip) {
    QuantileState original;
    for (int i = 0; i < 50; i++) {
        original.add_value(static_cast<double>(i * 2));
    }

    uint8_t ser_buf[65536];
    const size_t ser_len = original.serialize(ser_buf);

    unsigned char encoded_buf[131072];
    const size_t encoded_len = base64_encode(ser_buf, ser_len, encoded_buf);
    const std::string base64_str(reinterpret_cast<char*>(encoded_buf), encoded_len);

    char decoded_buf[65536];
    const auto decoded_len = base64_decode(base64_str.c_str(), base64_str.length(), decoded_buf);
    // Fatal on failure: a non-positive length must never be turned into a Slice size.
    ASSERT_GT(decoded_len, 0);

    QuantileState recovered;
    doris::Slice slice(decoded_buf, static_cast<size_t>(decoded_len));
    EXPECT_TRUE(recovered.deserialize(slice));

    EXPECT_NEAR(original.get_value_by_percentile(0.5), recovered.get_value_by_percentile(0.5),
                0.01);
    EXPECT_NEAR(original.get_value_by_percentile(0.9), recovered.get_value_by_percentile(0.9),
                0.01);
}
215+
216+
} // namespace doris::vectorized

0 commit comments

Comments
 (0)