Skip to content

Commit dbd6f53

Browse files
authored
chore: added the enable_binary_to_utf8_lossy option for inserting lossy utf8 data (#18532)
* chore: added the `enable_binary_to_utf8_lossy` option for inserting lossy utf8 data
* chore: codefmt
1 parent 72d5c82 commit dbd6f53

File tree

6 files changed

+35
-3
lines changed

6 files changed

+35
-3
lines changed

src/query/expression/src/function.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,7 @@ pub struct FunctionContext {
169169
pub random_function_seed: bool,
170170
pub week_start: u8,
171171
pub date_format_style: String,
172+
pub enable_binary_to_utf8_lossy: bool,
172173
}
173174

174175
impl Default for FunctionContext {
@@ -186,6 +187,7 @@ impl Default for FunctionContext {
186187
random_function_seed: false,
187188
week_start: 0,
188189
date_format_style: "oracle".to_string(),
190+
enable_binary_to_utf8_lossy: false,
189191
}
190192
}
191193
}

src/query/functions/src/scalars/binary.rs

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
// See the License for the specific language governing permissions and
1313
// limitations under the License.
1414

15+
use std::borrow::Cow;
1516
use std::io::Write;
1617

1718
use databend_common_expression::error_to_null;
@@ -209,11 +210,16 @@ fn eval_binary_to_string(val: Value<BinaryType>, ctx: &mut EvalContext) -> Value
209210
vectorize_binary_to_string(
210211
|col| col.total_bytes_len(),
211212
|val, output, ctx| {
212-
if let Ok(val) = simdutf8::basic::from_utf8(val) {
213-
output.put_str(val);
213+
let val = if ctx.func_ctx.enable_binary_to_utf8_lossy {
214+
String::from_utf8_lossy(val)
215+
} else if let Ok(val) = simdutf8::basic::from_utf8(val) {
216+
Cow::Borrowed(val)
214217
} else {
215218
ctx.set_error(output.len(), "invalid utf8 sequence");
216-
}
219+
output.commit_row();
220+
return;
221+
};
222+
output.put_str(&val);
217223
output.commit_row();
218224
},
219225
)(val, ctx)

src/query/service/src/sessions/query_ctx.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1009,6 +1009,7 @@ impl TableContext for QueryContext {
10091009
let week_start = settings.get_week_start()? as u8;
10101010
let date_format_style = settings.get_date_format_style()?;
10111011
let random_function_seed = settings.get_random_function_seed()?;
1012+
let enable_binary_to_utf8_lossy = settings.get_enable_binary_to_utf8_lossy()?;
10121013

10131014
Ok(FunctionContext {
10141015
now,
@@ -1023,6 +1024,7 @@ impl TableContext for QueryContext {
10231024
random_function_seed,
10241025
week_start,
10251026
date_format_style,
1027+
enable_binary_to_utf8_lossy,
10261028
})
10271029
}
10281030

src/query/settings/src/settings_default.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1424,6 +1424,13 @@ impl DefaultSettings {
14241424
scope: SettingScope::Both,
14251425
range: Some(SettingRange::Numeric(0..=1)),
14261426
}),
1427+
("enable_binary_to_utf8_lossy", DefaultSettingValue {
1428+
value: UserSettingValue::UInt64(0),
1429+
desc: "Enable binary-to-UTF8 lossy conversion, default is 0, 1 for enable",
1430+
mode: SettingMode::Both,
1431+
scope: SettingScope::Both,
1432+
range: Some(SettingRange::Numeric(0..=1)),
1433+
}),
14271434
]);
14281435

14291436
Ok(Arc::new(DefaultSettings {

src/query/settings/src/settings_getter_setter.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1055,4 +1055,8 @@ impl Settings {
10551055
pub fn get_enforce_local(&self) -> Result<bool> {
10561056
Ok(self.try_get_u64("enforce_local")? == 1)
10571057
}
1058+
1059+
pub fn get_enable_binary_to_utf8_lossy(&self) -> Result<bool> {
1060+
Ok(self.try_get_u64("enable_binary_to_utf8_lossy")? == 1)
1061+
}
10581062
}

tests/sqllogictests/suites/query/functions/cast.test

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,17 @@ select to_string('a')
2828
----
2929
a
3030

31+
statement error
32+
select to_string(UNHEX('C328'));
33+
34+
statement ok
35+
set enable_binary_to_utf8_lossy = 1;
36+
37+
query T
38+
select to_string(UNHEX('C328'));
39+
----
40+
�(
41+
3142
query I
3243
select try_cast(3 as int);
3344
----

0 commit comments

Comments
 (0)