Skip to content

Commit e6f42fc

Browse files
omochi authored and nkcsgexi committed
[Parse] split lexUnknown function from lexImpl
1 parent 0a69bd7 commit e6f42fc

File tree

2 files changed

+71
-57
lines changed

2 files changed

+71
-57
lines changed

include/swift/Parse/Lexer.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -531,6 +531,9 @@ class Lexer {
531531
/// end of the marker in diff3 or Perforce style respectively.
532532
bool tryLexConflictMarker(bool EatNewline);
533533

534+
/// Lexes an invalid character; returns true if the caller should form a tok::unknown token, false if it was skipped.
535+
bool lexUnknown();
536+
534537
NulCharacterKind getNulCharacterKind(const char *Ptr) const;
535538
};
536539

lib/Parse/Lexer.cpp

Lines changed: 68 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -1864,6 +1864,72 @@ bool Lexer::tryLexConflictMarker(bool EatNewline) {
18641864
return false;
18651865
}
18661866

1867+
bool Lexer::lexUnknown() {
1868+
const char *Tmp = CurPtr - 1;
1869+
1870+
if (advanceIfValidContinuationOfIdentifier(Tmp, BufferEnd)) {
1871+
// If this is a valid identifier continuation, but not a valid identifier
1872+
// start, attempt to recover by eating more continuation characters.
1873+
diagnose(CurPtr - 1, diag::lex_invalid_identifier_start_character);
1874+
while (advanceIfValidContinuationOfIdentifier(Tmp, BufferEnd))
1875+
;
1876+
CurPtr = Tmp;
1877+
return true;
1878+
}
1879+
1880+
// This character isn't allowed in Swift source.
1881+
uint32_t Codepoint = validateUTF8CharacterAndAdvance(Tmp, BufferEnd);
1882+
if (Codepoint == ~0U) {
1883+
diagnose(CurPtr - 1, diag::lex_invalid_utf8)
1884+
.fixItReplaceChars(getSourceLoc(CurPtr - 1), getSourceLoc(Tmp), " ");
1885+
CurPtr = Tmp;
1886+
return false; // Skip presumed whitespace.
1887+
} else if (Codepoint == 0x0000201D) {
1888+
// If this is an end curly quote, just diagnose it with a fixit hint.
1889+
diagnose(CurPtr - 1, diag::lex_invalid_curly_quote)
1890+
.fixItReplaceChars(getSourceLoc(CurPtr - 1), getSourceLoc(Tmp), "\"");
1891+
CurPtr = Tmp;
1892+
return true;
1893+
} else if (Codepoint == 0x0000201C) {
1894+
auto EndPtr = Tmp;
1895+
// If this is a start curly quote, do a fuzzy match of a string literal
1896+
// to improve recovery.
1897+
if (auto Tmp2 = findEndOfCurlyQuoteStringLiteral(Tmp))
1898+
Tmp = Tmp2;
1899+
1900+
// Note, we intentionally diagnose the end quote before the start quote,
1901+
// so that the IDE suggests fixing the end quote before the start quote.
1902+
// This, in turn, works better with our error recovery because we won't
1903+
// diagnose an end curly quote in the middle of a straight quoted
1904+
// literal.
1905+
diagnose(CurPtr - 1, diag::lex_invalid_curly_quote)
1906+
.fixItReplaceChars(getSourceLoc(CurPtr - 1), getSourceLoc(EndPtr),
1907+
"\"");
1908+
CurPtr = Tmp;
1909+
return true;
1910+
}
1911+
1912+
diagnose(CurPtr - 1, diag::lex_invalid_character)
1913+
.fixItReplaceChars(getSourceLoc(CurPtr - 1), getSourceLoc(Tmp), " ");
1914+
1915+
char ExpectedCodepoint;
1916+
if ((ExpectedCodepoint =
1917+
confusable::tryConvertConfusableCharacterToASCII(Codepoint))) {
1918+
1919+
llvm::SmallString<4> ConfusedChar;
1920+
EncodeToUTF8(Codepoint, ConfusedChar);
1921+
llvm::SmallString<1> ExpectedChar;
1922+
ExpectedChar += ExpectedCodepoint;
1923+
diagnose(CurPtr - 1, diag::lex_confusable_character, ConfusedChar,
1924+
ExpectedChar)
1925+
.fixItReplaceChars(getSourceLoc(CurPtr - 1), getSourceLoc(Tmp),
1926+
ExpectedChar);
1927+
}
1928+
1929+
CurPtr = Tmp;
1930+
return false; // Skip presumed whitespace.
1931+
}
1932+
18671933
Lexer::NulCharacterKind Lexer::getNulCharacterKind(const char *Ptr) const {
18681934
assert(Ptr != nullptr && *Ptr == 0);
18691935
if (Ptr == CodeCompletionPtr) {
@@ -2100,66 +2166,11 @@ void Lexer::lexImpl() {
21002166

21012167
if (advanceIfValidStartOfOperator(Tmp, BufferEnd))
21022168
return lexOperatorIdentifier();
2103-
2104-
if (advanceIfValidContinuationOfIdentifier(Tmp, BufferEnd)) {
2105-
// If this is a valid identifier continuation, but not a valid identifier
2106-
// start, attempt to recover by eating more continuation characters.
2107-
diagnose(CurPtr-1, diag::lex_invalid_identifier_start_character);
2108-
while (advanceIfValidContinuationOfIdentifier(Tmp, BufferEnd));
2109-
CurPtr = Tmp;
2110-
return formToken(tok::unknown, TokStart);
2111-
}
21122169

2113-
// This character isn't allowed in Swift source.
2114-
uint32_t Codepoint = validateUTF8CharacterAndAdvance(Tmp, BufferEnd);
2115-
if (Codepoint == ~0U) {
2116-
diagnose(CurPtr - 1, diag::lex_invalid_utf8)
2117-
.fixItReplaceChars(getSourceLoc(CurPtr - 1), getSourceLoc(Tmp), " ");
2118-
CurPtr = Tmp;
2119-
goto Restart; // Skip presumed whitespace.
2120-
} else if (Codepoint == 0x0000201D) {
2121-
// If this is an end curly quote, just diagnose it with a fixit hint.
2122-
diagnose(CurPtr - 1, diag::lex_invalid_curly_quote)
2123-
.fixItReplaceChars(getSourceLoc(CurPtr - 1), getSourceLoc(Tmp), "\"");
2124-
CurPtr = Tmp;
2125-
return formToken(tok::unknown, TokStart);
2126-
} else if (Codepoint == 0x0000201C) {
2127-
auto EndPtr = Tmp;
2128-
// If this is a start curly quote, do a fuzzy match of a string literal
2129-
// to improve recovery.
2130-
if (auto Tmp2 = findEndOfCurlyQuoteStringLiteral(Tmp))
2131-
Tmp = Tmp2;
2132-
2133-
// Note, we intentionally diagnose the end quote before the start quote,
2134-
// so that the IDE suggests fixing the end quote before the start quote.
2135-
// This, in turn, works better with our error recovery because we won't
2136-
// diagnose an end curly quote in the middle of a straight quoted
2137-
// literal.
2138-
diagnose(CurPtr - 1, diag::lex_invalid_curly_quote)
2139-
.fixItReplaceChars(getSourceLoc(CurPtr - 1), getSourceLoc(EndPtr),
2140-
"\"");
2141-
CurPtr = Tmp;
2170+
bool ShouldTokenize = lexUnknown();
2171+
if (ShouldTokenize) {
21422172
return formToken(tok::unknown, TokStart);
21432173
}
2144-
2145-
diagnose(CurPtr - 1, diag::lex_invalid_character)
2146-
.fixItReplaceChars(getSourceLoc(CurPtr - 1), getSourceLoc(Tmp), " ");
2147-
2148-
char ExpectedCodepoint;
2149-
if ((ExpectedCodepoint =
2150-
confusable::tryConvertConfusableCharacterToASCII(Codepoint))) {
2151-
2152-
llvm::SmallString<4> ConfusedChar;
2153-
EncodeToUTF8(Codepoint, ConfusedChar);
2154-
llvm::SmallString<1> ExpectedChar;
2155-
ExpectedChar += ExpectedCodepoint;
2156-
diagnose(CurPtr - 1, diag::lex_confusable_character, ConfusedChar,
2157-
ExpectedChar)
2158-
.fixItReplaceChars(getSourceLoc(CurPtr - 1), getSourceLoc(Tmp),
2159-
ExpectedChar);
2160-
}
2161-
2162-
CurPtr = Tmp;
21632174
goto Restart; // Skip presumed whitespace.
21642175
}
21652176

0 commit comments

Comments
 (0)