@@ -91,6 +91,20 @@ constexpr char32_t left_double_quote = U'\u201c';
9191constexpr char32_t right_single_quote = U' \u2019 ' ;
9292constexpr char32_t right_double_quote = U' \u201d ' ;
9393
94+ struct Confusable_Symbol {
95+ char32_t confusable;
96+ Char8 confusable_name[20 ];
97+ Char8 symbol;
98+ Char8 symbol_name[20 ];
99+ Token_Type symbol_token_type;
100+ };
101+
102+ Confusable_Symbol confusable_symbols[] = {
103+ {0x037e , u8" Greek Question Mark" , u8 ' ;' , u8" semicolon" ,
104+ Token_Type::semicolon},
105+ // TODO(strager): Add more.
106+ };
107+
94108bool look_up_in_unicode_table (const std::uint8_t * table, std::size_t table_size,
95109 char32_t code_point) {
96110 constexpr int bits_per_byte = 8 ;
@@ -1817,7 +1831,9 @@ Lexer::Parsed_Identifier Lexer::parse_identifier_slow(
18171831 : this ->is_identifier_character (code_point, kind);
18181832 if (!is_legal_character) {
18191833 if (this ->is_ascii_character (code_point) ||
1820- this ->is_non_ascii_whitespace_character (code_point)) {
1834+ this ->is_non_ascii_whitespace_character (code_point) ||
1835+ // Confusable symbols are handled by parse_non_ascii.
1836+ this ->is_confusable_symbol_character (code_point)) {
18211837 break ;
18221838 } else {
18231839 this ->diag_reporter_ ->report (Diag_Character_Disallowed_In_Identifiers{
@@ -1850,21 +1866,39 @@ QLJS_WARNING_POP
18501866void Lexer::parse_non_ascii () {
18511867 Decode_UTF8_Result character = decode_utf_8 (Padded_String_View (
18521868 this ->input_ , this ->original_input_ .null_terminator ()));
1869+ // FIXME(strager): We probably need to check character.ok.
1870+
18531871 if (character.code_point == left_single_quote ||
18541872 character.code_point == right_single_quote ||
18551873 character.code_point == left_double_quote ||
18561874 character.code_point == right_double_quote) {
18571875 this ->input_ = this ->parse_smart_quote_string_literal (character);
18581876 this ->last_token_ .type = Token_Type::string;
18591877 this ->last_token_ .end = this ->input_ ;
1860- } else {
1861- Parsed_Identifier ident = this ->parse_identifier_slow (
1862- this ->input_ , this ->input_ , Identifier_Kind::javascript);
1863- this ->input_ = ident.after ;
1864- this ->last_token_ .normalized_identifier = ident.normalized ;
1865- this ->last_token_ .end = ident.after ;
1866- this ->last_token_ .type = Token_Type::identifier;
1878+ return ;
18671879 }
1880+
1881+ for (const Confusable_Symbol& confusable : confusable_symbols) {
1882+ if (character.code_point == confusable.confusable ) {
1883+ this ->input_ += character.size ;
1884+ this ->last_token_ .end = this ->input_ ;
1885+ this ->last_token_ .type = confusable.symbol_token_type ;
1886+ this ->diag_reporter_ ->report (Diag_Confusable_Symbol{
1887+ .confusable = this ->last_token_ .span (),
1888+ .confusable_name = confusable.confusable_name ,
1889+ .symbol = confusable.symbol ,
1890+ .symbol_name = confusable.symbol_name ,
1891+ });
1892+ return ;
1893+ }
1894+ }
1895+
1896+ Parsed_Identifier ident = this ->parse_identifier_slow (
1897+ this ->input_ , this ->input_ , Identifier_Kind::javascript);
1898+ this ->input_ = ident.after ;
1899+ this ->last_token_ .normalized_identifier = ident.normalized ;
1900+ this ->last_token_ .end = ident.after ;
1901+ this ->last_token_ .type = Token_Type::identifier;
18681902}
18691903
18701904QLJS_WARNING_PUSH
@@ -2319,6 +2353,15 @@ bool Lexer::is_ascii_character(char32_t code_point) {
23192353 return code_point < 0x80 ;
23202354}
23212355
2356+ bool Lexer::is_confusable_symbol_character (char32_t code_point) {
2357+ for (const Confusable_Symbol& confusable : confusable_symbols) {
2358+ if (code_point == confusable.confusable ) {
2359+ return true ;
2360+ }
2361+ }
2362+ return false ;
2363+ }
2364+
23222365int Lexer::newline_character_size (const Char8* input) {
23232366 if (input[0 ] == u8 ' \n ' || input[0 ] == u8 ' \r ' ) {
23242367 return 1 ;
0 commit comments