@@ -1864,6 +1864,72 @@ bool Lexer::tryLexConflictMarker(bool EatNewline) {
1864
1864
return false ;
1865
1865
}
1866
1866
1867
+ bool Lexer::lexUnknown () {
1868
+ const char *Tmp = CurPtr - 1 ;
1869
+
1870
+ if (advanceIfValidContinuationOfIdentifier (Tmp, BufferEnd)) {
1871
+ // If this is a valid identifier continuation, but not a valid identifier
1872
+ // start, attempt to recover by eating more continuation characters.
1873
+ diagnose (CurPtr - 1 , diag::lex_invalid_identifier_start_character);
1874
+ while (advanceIfValidContinuationOfIdentifier (Tmp, BufferEnd))
1875
+ ;
1876
+ CurPtr = Tmp;
1877
+ return true ;
1878
+ }
1879
+
1880
+ // This character isn't allowed in Swift source.
1881
+ uint32_t Codepoint = validateUTF8CharacterAndAdvance (Tmp, BufferEnd);
1882
+ if (Codepoint == ~0U ) {
1883
+ diagnose (CurPtr - 1 , diag::lex_invalid_utf8)
1884
+ .fixItReplaceChars (getSourceLoc (CurPtr - 1 ), getSourceLoc (Tmp), " " );
1885
+ CurPtr = Tmp;
1886
+ return false ; // Skip presumed whitespace.
1887
+ } else if (Codepoint == 0x0000201D ) {
1888
+ // If this is an end curly quote, just diagnose it with a fixit hint.
1889
+ diagnose (CurPtr - 1 , diag::lex_invalid_curly_quote)
1890
+ .fixItReplaceChars (getSourceLoc (CurPtr - 1 ), getSourceLoc (Tmp), " \" " );
1891
+ CurPtr = Tmp;
1892
+ return true ;
1893
+ } else if (Codepoint == 0x0000201C ) {
1894
+ auto EndPtr = Tmp;
1895
+ // If this is a start curly quote, do a fuzzy match of a string literal
1896
+ // to improve recovery.
1897
+ if (auto Tmp2 = findEndOfCurlyQuoteStringLiteral (Tmp))
1898
+ Tmp = Tmp2;
1899
+
1900
+ // Note, we intentionally diagnose the end quote before the start quote,
1901
+ // so that the IDE suggests fixing the end quote before the start quote.
1902
+ // This, in turn, works better with our error recovery because we won't
1903
+ // diagnose an end curly quote in the middle of a straight quoted
1904
+ // literal.
1905
+ diagnose (CurPtr - 1 , diag::lex_invalid_curly_quote)
1906
+ .fixItReplaceChars (getSourceLoc (CurPtr - 1 ), getSourceLoc (EndPtr),
1907
+ " \" " );
1908
+ CurPtr = Tmp;
1909
+ return true ;
1910
+ }
1911
+
1912
+ diagnose (CurPtr - 1 , diag::lex_invalid_character)
1913
+ .fixItReplaceChars (getSourceLoc (CurPtr - 1 ), getSourceLoc (Tmp), " " );
1914
+
1915
+ char ExpectedCodepoint;
1916
+ if ((ExpectedCodepoint =
1917
+ confusable::tryConvertConfusableCharacterToASCII (Codepoint))) {
1918
+
1919
+ llvm::SmallString<4 > ConfusedChar;
1920
+ EncodeToUTF8 (Codepoint, ConfusedChar);
1921
+ llvm::SmallString<1 > ExpectedChar;
1922
+ ExpectedChar += ExpectedCodepoint;
1923
+ diagnose (CurPtr - 1 , diag::lex_confusable_character, ConfusedChar,
1924
+ ExpectedChar)
1925
+ .fixItReplaceChars (getSourceLoc (CurPtr - 1 ), getSourceLoc (Tmp),
1926
+ ExpectedChar);
1927
+ }
1928
+
1929
+ CurPtr = Tmp;
1930
+ return false ; // Skip presumed whitespace.
1931
+ }
1932
+
1867
1933
Lexer::NulCharacterKind Lexer::getNulCharacterKind (const char *Ptr) const {
1868
1934
assert (Ptr != nullptr && *Ptr == 0 );
1869
1935
if (Ptr == CodeCompletionPtr) {
@@ -2100,66 +2166,11 @@ void Lexer::lexImpl() {
2100
2166
2101
2167
if (advanceIfValidStartOfOperator (Tmp, BufferEnd))
2102
2168
return lexOperatorIdentifier ();
2103
-
2104
- if (advanceIfValidContinuationOfIdentifier (Tmp, BufferEnd)) {
2105
- // If this is a valid identifier continuation, but not a valid identifier
2106
- // start, attempt to recover by eating more continuation characters.
2107
- diagnose (CurPtr-1 , diag::lex_invalid_identifier_start_character);
2108
- while (advanceIfValidContinuationOfIdentifier (Tmp, BufferEnd));
2109
- CurPtr = Tmp;
2110
- return formToken (tok::unknown, TokStart);
2111
- }
2112
2169
2113
- // This character isn't allowed in Swift source.
2114
- uint32_t Codepoint = validateUTF8CharacterAndAdvance (Tmp, BufferEnd);
2115
- if (Codepoint == ~0U ) {
2116
- diagnose (CurPtr - 1 , diag::lex_invalid_utf8)
2117
- .fixItReplaceChars (getSourceLoc (CurPtr - 1 ), getSourceLoc (Tmp), " " );
2118
- CurPtr = Tmp;
2119
- goto Restart; // Skip presumed whitespace.
2120
- } else if (Codepoint == 0x0000201D ) {
2121
- // If this is an end curly quote, just diagnose it with a fixit hint.
2122
- diagnose (CurPtr - 1 , diag::lex_invalid_curly_quote)
2123
- .fixItReplaceChars (getSourceLoc (CurPtr - 1 ), getSourceLoc (Tmp), " \" " );
2124
- CurPtr = Tmp;
2125
- return formToken (tok::unknown, TokStart);
2126
- } else if (Codepoint == 0x0000201C ) {
2127
- auto EndPtr = Tmp;
2128
- // If this is a start curly quote, do a fuzzy match of a string literal
2129
- // to improve recovery.
2130
- if (auto Tmp2 = findEndOfCurlyQuoteStringLiteral (Tmp))
2131
- Tmp = Tmp2;
2132
-
2133
- // Note, we intentionally diagnose the end quote before the start quote,
2134
- // so that the IDE suggests fixing the end quote before the start quote.
2135
- // This, in turn, works better with our error recovery because we won't
2136
- // diagnose an end curly quote in the middle of a straight quoted
2137
- // literal.
2138
- diagnose (CurPtr - 1 , diag::lex_invalid_curly_quote)
2139
- .fixItReplaceChars (getSourceLoc (CurPtr - 1 ), getSourceLoc (EndPtr),
2140
- " \" " );
2141
- CurPtr = Tmp;
2170
+ bool ShouldTokenize = lexUnknown ();
2171
+ if (ShouldTokenize) {
2142
2172
return formToken (tok::unknown, TokStart);
2143
2173
}
2144
-
2145
- diagnose (CurPtr - 1 , diag::lex_invalid_character)
2146
- .fixItReplaceChars (getSourceLoc (CurPtr - 1 ), getSourceLoc (Tmp), " " );
2147
-
2148
- char ExpectedCodepoint;
2149
- if ((ExpectedCodepoint =
2150
- confusable::tryConvertConfusableCharacterToASCII (Codepoint))) {
2151
-
2152
- llvm::SmallString<4 > ConfusedChar;
2153
- EncodeToUTF8 (Codepoint, ConfusedChar);
2154
- llvm::SmallString<1 > ExpectedChar;
2155
- ExpectedChar += ExpectedCodepoint;
2156
- diagnose (CurPtr - 1 , diag::lex_confusable_character, ConfusedChar,
2157
- ExpectedChar)
2158
- .fixItReplaceChars (getSourceLoc (CurPtr - 1 ), getSourceLoc (Tmp),
2159
- ExpectedChar);
2160
- }
2161
-
2162
- CurPtr = Tmp;
2163
2174
goto Restart; // Skip presumed whitespace.
2164
2175
}
2165
2176
0 commit comments