@@ -413,7 +413,42 @@ void highlight(const String & query, std::vector<replxx::Replxx::Color> & colors
413413
414414String highlighted (const String & query, const Context & context)
415415{
416- size_t num_code_points = UTF8::countCodePoints (reinterpret_cast <const UInt8 *>(query.data ()), query.size ());
416+ // Issue: https://github.com/ClickHouse/ClickHouse/issues/83987
417+ // / Previously utf-8 code points were calculated in the following way:
418+ // / size_t num_code_points = UTF8::countCodePoints(reinterpret_cast<const UInt8 *>(query.data()), query.size());
419+ // / But, `UTF8::countCodePoints` and `UTF8::seqLength` seem to handle invalid UTF-8 sequences inconsistently
420+ // / (e.g., hex literals like x'A0'), causing count mismatches and crashes.
421+ // / For a quick fix, since the highlight function uses `UTF8::seqLength` for iteration, use the same logic for
422+ // / counting to ensure consistency when invalid UTF-8 bytes are detected (use UTF8::countCodePoints for all other
423+ // / cases, mainly for performance reasons).
424+ // / TODO: @bharatnc Fix UTF8::countCodePoints to handle invalid UTF-8 sequences consistently with seqLength so that
425+ // / this logic can be removed.
426+ bool has_invalid_utf8 = false ;
427+ for (const char c : query)
428+ {
429+ // / Standalone UTF-8 continuation bytes (0x80-0xBF, e.g. 0xA0 from hex literals)
430+ // / cause countCodePoints/seqLength inconsistency.
431+ if (static_cast <unsigned char >(c) >= 0x80 && static_cast <unsigned char >(c) <= 0xBF )
432+ {
433+ has_invalid_utf8 = true ;
434+ break ;
435+ }
436+ }
437+ size_t num_code_points;
438+ if (has_invalid_utf8)
439+ {
440+ num_code_points = 0 ;
441+ const char * pos = query.data ();
442+ const char * end = pos + query.size ();
443+ while (pos < end)
444+ {
445+ pos += UTF8::seqLength (*pos);
446+ ++num_code_points;
447+ }
448+ }
449+ else
450+ num_code_points = UTF8::countCodePoints (reinterpret_cast <const UInt8 *>(query.data ()), query.size ());
451+
417452 std::vector<replxx::Replxx::Color> colors (num_code_points, replxx::Replxx::Color::DEFAULT);
418453 highlight (query, colors, context, 0 );
419454
0 commit comments