Skip to content

Commit 4efe182

Browse files
klausler authored and github-actions[bot] committed
Automerge: [flang] Fix UTF-8 minimality checks (#159142)
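UTF-8 encodings are required to be minimal, but the checks for minimality of 3-byte and 4-byte sequences were incorrect: they rejected every sequence whose lead byte was 0xE0 or 0xF0, even when the second byte made the encoding minimal. This commit fixes the checks.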
2 parents a7e81b8 + fdd989d commit 4efe182

File tree

2 files changed

+24
-6
lines changed

2 files changed

+24
-6
lines changed

flang/lib/Parser/characters.cpp

Lines changed: 9 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -158,21 +158,24 @@ DecodedCharacter DecodeRawCharacter<Encoding::UTF_8>(
158158
const char *cp, std::size_t bytes) {
159159
auto p{reinterpret_cast<const std::uint8_t *>(cp)};
160160
char32_t ch{*p};
161-
if (ch <= 0x7f) {
161+
// Valid UTF-8 encodings must be minimal.
162+
if (ch <= 0x7f) { // 1 byte: 7 bits of payload
162163
return {ch, 1};
163-
} else if ((ch & 0xf8) == 0xf0 && bytes >= 4 && ch > 0xf0 &&
164-
((p[1] | p[2] | p[3]) & 0xc0) == 0x80) {
164+
} else if ((ch & 0xf8) == 0xf0 && bytes >= 4 &&
165+
((p[1] | p[2] | p[3]) & 0xc0) == 0x80 && (ch > 0xf0 || p[1] > 0x8f)) {
166+
// 4 bytes: 3+6+6+6=21 bits of payload
165167
ch = ((ch & 7) << 6) | (p[1] & 0x3f);
166168
ch = (ch << 6) | (p[2] & 0x3f);
167169
ch = (ch << 6) | (p[3] & 0x3f);
168170
return {ch, 4};
169-
} else if ((ch & 0xf0) == 0xe0 && bytes >= 3 && ch > 0xe0 &&
170-
((p[1] | p[2]) & 0xc0) == 0x80) {
171+
} else if ((ch & 0xf0) == 0xe0 && bytes >= 3 &&
172+
((p[1] | p[2]) & 0xc0) == 0x80 && (ch > 0xe0 || p[1] > 0x9f)) {
173+
// 3 bytes: 4+6+6=16 bits of payload
171174
ch = ((ch & 0xf) << 6) | (p[1] & 0x3f);
172175
ch = (ch << 6) | (p[2] & 0x3f);
173176
return {ch, 3};
174177
} else if ((ch & 0xe0) == 0xc0 && bytes >= 2 && ch > 0xc0 &&
175-
(p[1] & 0xc0) == 0x80) {
178+
(p[1] & 0xc0) == 0x80) { // 2 bytes: 5+6=11 bits of payload
176179
ch = ((ch & 0x1f) << 6) | (p[1] & 0x3f);
177180
return {ch, 2};
178181
} else {

flang/test/Parser/utf8-01.f90

Lines changed: 15 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,15 @@
1+
!RUN: %flang_fc1 -fdebug-unparse %s 2>&1 | FileCheck %s
2+
3+
character(kind=4), parameter :: c(2) = [character(kind=4) :: &
4+
4_'🍌', 4_'水' ]
5+
print *, '🍌'
6+
print *, 4_'🍌'
7+
print *, '水'
8+
print *, 4_'水'
9+
end
10+
11+
!CHECK: CHARACTER(KIND=4_4), PARAMETER :: c(2_4) = [CHARACTER(KIND=4,LEN=1)::4_"\360\237\215\214",4_"\346\260\264"]
12+
!CHECK: PRINT *, "\360\237\215\214"
13+
!CHECK: PRINT *, 4_"\360\237\215\214"
14+
!CHECK: PRINT *, "\346\260\264"
15+
!CHECK: PRINT *, 4_"\346\260\264"

0 commit comments

Comments
 (0)