Skip to content

Commit 577cd56

Browse files
authored
Merge pull request ClickHouse#84025 from ClickHouse/ncb/fix-client-replxx-crash
fix crash with clickhouse-client with syntax highlighting
2 parents 313e9c5 + 957e19c commit 577cd56

File tree

3 files changed

+73
-1
lines changed

3 files changed

+73
-1
lines changed

src/Client/ClientBaseHelpers.cpp

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -413,7 +413,42 @@ void highlight(const String & query, std::vector<replxx::Replxx::Color> & colors
413413

414414
String highlighted(const String & query, const Context & context)
415415
{
416-
size_t num_code_points = UTF8::countCodePoints(reinterpret_cast<const UInt8 *>(query.data()), query.size());
416+
// Issue: https://github.com/ClickHouse/ClickHouse/issues/83987
417+
/// Previously utf-8 code points were calculated in the following way:
418+
/// size_t num_code_points = UTF8::countCodePoints(reinterpret_cast<const UInt8 *>(query.data()), query.size());
419+
/// But, `UTF8::countCodePoints` and `UTF8::seqLength` seem to handle invalid UTF-8 sequences inconsistently
420+
/// (e.g., hex literals like x'A0'), causing count mismatches and crashes.
421+
/// For a quick fix, since the highlight function uses `UTF8::seqLength` for iteration, use the same logic for
422+
/// counting to ensure consistency when invalid UTF-8 bytes are detected (use UTF8::countCodePoints for all other
423+
/// cases to mainly for performance concerns).
424+
/// TODO: @bharatnc Fix UTF8::countCodePoints to handle invalid UTF-8 sequences consistently with seqLength so that
425+
/// this logic can be removed.
426+
bool has_invalid_utf8 = false;
427+
for (const char c : query)
428+
{
429+
/// Standalone UTF-8 continuation bytes (0x80-0xBF, e.g. 0xA0 from hex literals)
430+
/// cause countCodePoints/seqLength inconsistency.
431+
if (static_cast<unsigned char>(c) >= 0x80 && static_cast<unsigned char>(c) <= 0xBF)
432+
{
433+
has_invalid_utf8 = true;
434+
break;
435+
}
436+
}
437+
size_t num_code_points;
438+
if (has_invalid_utf8)
439+
{
440+
num_code_points = 0;
441+
const char * pos = query.data();
442+
const char * end = pos + query.size();
443+
while (pos < end)
444+
{
445+
pos += UTF8::seqLength(*pos);
446+
++num_code_points;
447+
}
448+
}
449+
else
450+
num_code_points = UTF8::countCodePoints(reinterpret_cast<const UInt8 *>(query.data()), query.size());
451+
417452
std::vector<replxx::Replxx::Color> colors(num_code_points, replxx::Replxx::Color::DEFAULT);
418453
highlight(query, colors, context, 0);
419454

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
#!/usr/bin/expect -f
2+
3+
set basedir [file dirname $argv0]
4+
set basename [file tail $argv0]
5+
if {[info exists env(CLICKHOUSE_TMP)]} {
6+
set CLICKHOUSE_TMP $env(CLICKHOUSE_TMP)
7+
} else {
8+
set CLICKHOUSE_TMP "."
9+
}
10+
exp_internal -f $CLICKHOUSE_TMP/$basename.debuglog 0
11+
set history_file $CLICKHOUSE_TMP/$basename.history
12+
13+
log_user 0
14+
set timeout 60
15+
match_max 100000
16+
17+
expect_after {
18+
# Do not ignore eof from expect
19+
-i $any_spawn_id eof { exp_continue }
20+
# A default timeout action is to do nothing, change it to fail
21+
-i $any_spawn_id timeout { exit 1 }
22+
}
23+
24+
spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \${CLICKHOUSE_CLIENT_EXPECT_OPT/--highlight 0/--highlight 1} --history_file=$history_file"
25+
expect "\n:) "
26+
27+
# Validate that hex character sequence doesn't cause the client to crash.
28+
send "SELECT x'A0' FORMAT TSV;\r"
29+
30+
# expect at least one byte and a newline to pass
31+
expect -re {.+\r\n};
32+
33+
expect ":) "
34+
35+
# Ctrl+D
36+
send -- "\4"
37+
expect eof

tests/queries/0_stateless/03568_client_hex_literal_crash.reference

Whitespace-only changes.

0 commit comments

Comments
 (0)