From 2943d3936e42105468342cd028fe53e203544bfc Mon Sep 17 00:00:00 2001 From: Corentin Jabot Date: Fri, 17 Oct 2025 10:47:21 +0200 Subject: [PATCH 1/2] [Clang] Do not warn on UTF-16 -> UTF-32 conversions. UTF-16 to UTF-16 conversions seems widespread, and lone surrogate have a distinct representation in UTF-32. Lets not warn on this case to make the warning easier to adopt. This follows SG-16 guideline https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2025/p3695r2.html#changes-since-r1 Fixes #163719 --- clang/lib/Sema/SemaChecking.cpp | 9 ++++++++- clang/test/SemaCXX/warn-implicit-unicode-conversions.cpp | 8 ++++---- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 4f409ca0f414d..33ed69e67cd76 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -12309,13 +12309,20 @@ static void DiagnoseMixedUnicodeImplicitConversion(Sema &S, const Type *Source, SourceLocation CC) { assert(Source->isUnicodeCharacterType() && Target->isUnicodeCharacterType() && Source != Target); + + // Lone surrogates have a distinct representation in UTF-32. + // Converting betweem UTF-16 and UTF-32 codepoint seems very widespread, + // so don't warn on such conversion. + if (Source->isChar16Type() && Target->isChar32Type()) + return; + Expr::EvalResult Result; if (E->EvaluateAsInt(Result, S.getASTContext(), Expr::SE_AllowSideEffects, S.isConstantEvaluatedContext())) { llvm::APSInt Value(32); Value = Result.Val.getInt(); bool IsASCII = Value <= 0x7F; - bool IsBMP = Value <= 0xD7FF || (Value >= 0xE000 && Value <= 0xFFFF); + bool IsBMP = Value <= 0xDFFF || (Value >= 0xE000 && Value <= 0xFFFF); bool ConversionPreservesSemantics = IsASCII || (!Source->isChar8Type() && !Target->isChar8Type() && IsBMP); diff --git a/clang/test/SemaCXX/warn-implicit-unicode-conversions.cpp b/clang/test/SemaCXX/warn-implicit-unicode-conversions.cpp index fcff006d0e028..f17f20ca25295 100644 --- a/clang/test/SemaCXX/warn-implicit-unicode-conversions.cpp +++ b/clang/test/SemaCXX/warn-implicit-unicode-conversions.cpp @@ -14,7 +14,7 @@ void test(char8_t u8, char16_t u16, char32_t u32) { c16(u32); // expected-warning {{implicit conversion from 'char32_t' to 'char16_t' may lose precision and change the meaning of the represented code unit}} c32(u8); // expected-warning {{implicit conversion from 'char8_t' to 'char32_t' may change the meaning of the represented code unit}} - c32(u16); // expected-warning {{implicit conversion from 'char16_t' to 'char32_t' may change the meaning of the represented code unit}} + c32(u16); c32(u32); @@ -30,7 +30,7 @@ void test(char8_t u8, char16_t u16, char32_t u32) { c16(char32_t(0x7f)); c16(char32_t(0x80)); c16(char32_t(0xD7FF)); - c16(char32_t(0xD800)); // expected-warning {{implicit conversion from 'char32_t' to 'char16_t' changes the meaning of the code unit '<0xD800>'}} + c16(char32_t(0xD800)); c16(char32_t(0xE000)); c16(char32_t(U'🐉')); // expected-warning {{implicit conversion from 'char32_t' to 'char16_t' changes the meaning of the code point '🐉'}} @@ -44,8 +44,8 @@ void test(char8_t u8, char16_t u16, char32_t u32) { c32(char16_t(0x80)); c32(char16_t(0xD7FF)); - c32(char16_t(0xD800)); // expected-warning {{implicit conversion from 'char16_t' to 'char32_t' changes the meaning of the code unit '<0xD800>'}} - c32(char16_t(0xDFFF)); // expected-warning {{implicit conversion from 'char16_t' to 'char32_t' changes the meaning of the code unit '<0xDFFF>'}} + c32(char16_t(0xD800)); + c32(char16_t(0xDFFF)); c32(char16_t(0xE000)); c32(char16_t(u'☕')); From d0f94e96441a06f66713fd2a62b0f6de05bb6217 Mon Sep 17 00:00:00 2001 From: Corentin Jabot Date: Fri, 17 Oct 2025 14:27:51 +0200 Subject: [PATCH 2/2] fix typos --- clang/lib/Sema/SemaChecking.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 33ed69e67cd76..484f00a2a8e15 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -12311,7 +12311,7 @@ static void DiagnoseMixedUnicodeImplicitConversion(Sema &S, const Type *Source, Source != Target); // Lone surrogates have a distinct representation in UTF-32. - // Converting betweem UTF-16 and UTF-32 codepoint seems very widespread, + // Converting between UTF-16 and UTF-32 codepoints seems very widespread, // so don't warn on such conversion. if (Source->isChar16Type() && Target->isChar32Type()) return;