Skip to content

Commit 143f3fc

Browse files
authored
[flang] Accept a non-breaking space character in source (#106611)
Accept non-breaking space characters (Latin-1 '\xa0', UTF-8 '\xc2' '\xa0') in source code, converting them into regular spaces in the cooked character stream when not in character literals.
1 parent 500f6cc commit 143f3fc

File tree

2 files changed

+54
-26
lines changed

2 files changed

+54
-26
lines changed

flang/lib/Parser/prescan.cpp

Lines changed: 48 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,23 @@ Prescanner::Prescanner(const Prescanner &that, bool isNestedInIncludeDirective)
4646
compilerDirectiveBloomFilter_{that.compilerDirectiveBloomFilter_},
4747
compilerDirectiveSentinels_{that.compilerDirectiveSentinels_} {}
4848

49+
// Tests whether p points at a space character and returns the number of
// bytes that the space occupies, so that callers can advance past it:
//   1 - an ASCII space, or a Latin-1 NBSP byte ('\xa0');
//   2 - the two-byte UTF-8 encoding of NBSP ('\xc2' followed by '\xa0');
//   0 - p does not point at a space.
// A nonzero result doubles as "true" in conditions.  Tabs are not
// recognized here.  p[1] is examined only after p[0] has matched '\xc2'.
// Returns number of bytes to skip
50+
static inline int IsSpace(const char *p) {
51+
if (*p == ' ') {
52+
return 1;
53+
} else if (*p == '\xa0') { // LATIN-1 NBSP non-breaking space
54+
return 1;
55+
} else if (p[0] == '\xc2' && p[1] == '\xa0') { // UTF-8 NBSP
56+
return 2;
57+
} else {
58+
return 0;
59+
}
60+
}
61+
62+
// Like IsSpace(), but a horizontal tab also counts (as one byte).
// Returns the number of bytes to skip, or 0 when p points at neither a tab
// nor any of the space forms that IsSpace() recognizes.
static inline int IsSpaceOrTab(const char *p) {
63+
// The tab is tested first; every other case is delegated to IsSpace().
return *p == '\t' ? 1 : IsSpace(p);
64+
}
65+
4966
// True when ch is one of '!', '*', 'C', or 'c'.
// NOTE(review): judging by the name, these are the characters that begin a
// fixed-form comment line; the column in which ch must appear is checked by
// callers, not here.
static inline constexpr bool IsFixedFormCommentChar(char ch) {
5067
return ch == '!' || ch == '*' || ch == 'C' || ch == 'c';
5168
}
@@ -126,8 +143,8 @@ void Prescanner::Statement() {
126143
if (inFixedForm_) {
127144
CHECK(IsFixedFormCommentChar(*at_));
128145
} else {
129-
while (*at_ == ' ' || *at_ == '\t') {
130-
++at_, ++column_;
146+
while (int n{IsSpaceOrTab(at_)}) {
147+
at_ += n, ++column_;
131148
}
132149
CHECK(*at_ == '!');
133150
}
@@ -159,10 +176,10 @@ void Prescanner::Statement() {
159176
++sp, ++at_, ++column_) {
160177
EmitChar(tokens, *sp);
161178
}
162-
if (*at_ == ' ' || *at_ == '\t') {
179+
if (IsSpaceOrTab(at_)) {
163180
EmitChar(tokens, ' ');
164-
while (*at_ == ' ' || *at_ == '\t') {
165-
++at_, ++column_;
181+
while (int n{IsSpaceOrTab(at_)}) {
182+
at_ += n, ++column_;
166183
}
167184
}
168185
tokens.CloseToken();
@@ -361,7 +378,7 @@ void Prescanner::LabelField(TokenSequence &token) {
361378
column_ = 7;
362379
break;
363380
}
364-
if (*at_ != ' ' &&
381+
if (int n{IsSpace(at_)}; n == 0 &&
365382
!(*at_ == '0' && column_ == 6)) { // '0' in column 6 becomes space
366383
EmitChar(token, *at_);
367384
++outCol;
@@ -493,7 +510,9 @@ bool Prescanner::MustSkipToEndOfLine() const {
493510

494511
void Prescanner::NextChar() {
495512
CHECK(*at_ != '\n');
496-
++at_, ++column_;
513+
int n{IsSpace(at_)};
514+
at_ += n ? n : 1;
515+
++column_;
497516
while (at_[0] == '\xef' && at_[1] == '\xbb' && at_[2] == '\xbf') {
498517
// UTF-8 byte order mark - treat this file as UTF-8
499518
at_ += 3;
@@ -556,23 +575,23 @@ void Prescanner::SkipCComments() {
556575
}
557576

558577
// Advances the scan position past consecutive blanks by calling NextChar()
// for as long as at_ points at a space or tab, then clears insertASpace_.
// In this hunk the '-' line is the previous test (ASCII space or tab only)
// and the '+' line is its replacement, which uses IsSpaceOrTab() and so also
// accepts the NBSP forms.
void Prescanner::SkipSpaces() {
559-
while (*at_ == ' ' || *at_ == '\t') {
578+
while (IsSpaceOrTab(at_)) {
560579
NextChar();
561580
}
562581
insertASpace_ = false;
563582
}
564583

565584
// Returns a pointer to the first byte at or after p that is not white space.
// In this hunk the '-' lines are the previous loop, which stepped one byte
// per ASCII space or tab; the '+' lines replace it with a loop that advances
// by the byte count reported by IsSpaceOrTab(), so a two-byte UTF-8 NBSP is
// skipped as a unit.
const char *Prescanner::SkipWhiteSpace(const char *p) {
566-
while (*p == ' ' || *p == '\t') {
567-
++p;
585+
// n is the width in bytes of the blank at p; the loop ends when n is 0.
while (int n{IsSpaceOrTab(p)}) {
586+
p += n;
568587
}
569588
return p;
570589
}
571590

572591
const char *Prescanner::SkipWhiteSpaceAndCComments(const char *p) const {
573592
while (true) {
574-
if (*p == ' ' || *p == '\t') {
575-
++p;
593+
if (int n{IsSpaceOrTab(p)}) {
594+
p += n;
576595
} else if (IsCComment(p)) {
577596
if (const char *after{SkipCComment(p)}) {
578597
p = after;
@@ -613,7 +632,7 @@ bool Prescanner::NextToken(TokenSequence &tokens) {
613632
}
614633
SkipCComments();
615634
}
616-
if (*at_ == ' ' || *at_ == '\t') {
635+
if (IsSpaceOrTab(at_)) {
617636
// Compress free-form white space into a single space character.
618637
const auto theSpace{at_};
619638
char previous{at_ <= start_ ? ' ' : at_[-1]};
@@ -976,8 +995,8 @@ bool Prescanner::IsFixedFormCommentLine(const char *start) const {
976995
}
977996
bool anyTabs{false};
978997
while (true) {
979-
if (*p == ' ') {
980-
++p;
998+
if (int n{IsSpace(p)}) {
999+
p += n;
9811000
} else if (*p == '\t') {
9821001
anyTabs = true;
9831002
++p;
@@ -1089,7 +1108,8 @@ void Prescanner::FortranInclude(const char *firstQuote) {
10891108

10901109
const char *Prescanner::IsPreprocessorDirectiveLine(const char *start) const {
10911110
const char *p{start};
1092-
for (; *p == ' '; ++p) {
1111+
while (int n{IsSpace(p)}) {
1112+
p += n;
10931113
}
10941114
if (*p == '#') {
10951115
if (inFixedForm_ && p == start + 5) {
@@ -1178,9 +1198,9 @@ const char *Prescanner::FixedFormContinuationLine(bool mightNeedSpace) {
11781198
return nullptr;
11791199
}
11801200
}
1181-
char col6{nextLine_[5]};
1182-
if (col6 != '\n' && col6 != '\t' && col6 != ' ' && col6 != '0') {
1183-
if (nextLine_[6] != ' ' && mightNeedSpace) {
1201+
const char *col6{nextLine_ + 5};
1202+
if (*col6 != '\n' && *col6 != '0' && !IsSpaceOrTab(col6)) {
1203+
if (mightNeedSpace && !IsSpace(nextLine_ + 6)) {
11841204
insertASpace_ = true;
11851205
}
11861206
return nextLine_ + 6;
@@ -1207,9 +1227,9 @@ const char *Prescanner::FixedFormContinuationLine(bool mightNeedSpace) {
12071227
features_.IsEnabled(LanguageFeature::OldDebugLines))) &&
12081228
nextLine_[1] == ' ' && nextLine_[2] == ' ' && nextLine_[3] == ' ' &&
12091229
nextLine_[4] == ' ') {
1210-
char col6{nextLine_[5]};
1211-
if (col6 != '\n' && col6 != '\t' && col6 != ' ' && col6 != '0') {
1212-
if ((col6 == 'i' || col6 == 'I') && IsIncludeLine(nextLine_)) {
1230+
const char *col6{nextLine_ + 5};
1231+
if (*col6 != '\n' && *col6 != '0' && !IsSpaceOrTab(col6)) {
1232+
if ((*col6 == 'i' || *col6 == 'I') && IsIncludeLine(nextLine_)) {
12131233
// It's an INCLUDE line, not a continuation
12141234
} else {
12151235
return nextLine_ + 6;
@@ -1356,7 +1376,7 @@ Prescanner::IsFixedFormCompilerDirectiveLine(const char *start) const {
13561376
char sentinel[5], *sp{sentinel};
13571377
int column{2};
13581378
for (; column < 6; ++column, ++p) {
1359-
if (*p == ' ' || *p == '\n' || *p == '\t') {
1379+
if (*p == '\n' || IsSpaceOrTab(p)) {
13601380
break;
13611381
}
13621382
if (sp == sentinel + 1 && sentinel[0] == '$' && IsDecimalDigit(*p)) {
@@ -1366,8 +1386,10 @@ Prescanner::IsFixedFormCompilerDirectiveLine(const char *start) const {
13661386
*sp++ = ToLowerCaseLetter(*p);
13671387
}
13681388
if (column == 6) {
1369-
if (*p == ' ' || *p == '\t' || *p == '0') {
1389+
if (*p == '0') {
13701390
++p;
1391+
} else if (int n{IsSpaceOrTab(p)}) {
1392+
p += n;
13711393
} else {
13721394
// This is a Continuation line, not an initial directive line.
13731395
return std::nullopt;
@@ -1442,10 +1464,10 @@ std::optional<std::pair<const char *, const char *>>
14421464
Prescanner::IsCompilerDirectiveSentinel(const char *p) const {
14431465
char sentinel[8];
14441466
for (std::size_t j{0}; j + 1 < sizeof sentinel && *p != '\n'; ++p, ++j) {
1445-
if (*p == ' ' || *p == '\t' || *p == '&') {
1467+
if (int n{*p == '&' ? 1 : IsSpaceOrTab(p)}) {
14461468
if (j > 0) {
14471469
sentinel[j] = '\0';
1448-
p = SkipWhiteSpace(p + 1);
1470+
p = SkipWhiteSpace(p + n);
14491471
if (*p != '!') {
14501472
if (const char *sp{IsCompilerDirectiveSentinel(sentinel, j)}) {
14511473
return std::make_pair(sp, p);
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
! RUN: %flang_fc1 -fsyntax-only %s
2+
! This line contains the Latin-1 NBSP (non-breaking space) character '\xa0'
3+
x= 1.
4+
! This line contains the UTF-8 encoding of NBSP ('\xc2' '\xa0')
5+
x= 1.
6+
end

0 commit comments

Comments
 (0)