From c0ed20e7e03b23c8fc208598a3b677348f195318 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonatan=20K=C5=82osko?= <jonatanklosko@gmail.com> Date: Mon, 4 Nov 2024 23:22:30 +0800 Subject: [PATCH 1/2] Add delimiter meta to remote calls with quoted identifier --- lib/elixir/src/elixir_parser.yrl | 19 ++++++++++----- lib/elixir/src/elixir_tokenizer.erl | 24 +++++++++---------- lib/elixir/test/elixir/kernel/parser_test.exs | 12 +++++++++- lib/elixir/unicode/security.ex | 4 ++-- 4 files changed, 38 insertions(+), 21 deletions(-) diff --git a/lib/elixir/src/elixir_parser.yrl b/lib/elixir/src/elixir_parser.yrl index 8b28b2ff91c..b32fc533add 100644 --- a/lib/elixir/src/elixir_parser.yrl +++ b/lib/elixir/src/elixir_parser.yrl @@ -890,8 +890,15 @@ build_dot_container(Dot, Left, Right, Extra) -> build_dot(Dot, Left, {_, Location, _} = Right) -> Meta = meta_from_token(Dot), - IdentifierLocation = meta_from_location(Location), - {'.', Meta, IdentifierLocation, [Left, extract_identifier(Right)]}. + IdentifierMeta0 = meta_from_location(Location), + IdentifierMeta1 = + case Location of + {_Line, _Column, {_Unencoded, Delimiter}} when Delimiter =/= nil -> + delimiter(<<Delimiter/utf8>>) ++ IdentifierMeta0; + _ -> + IdentifierMeta0 + end, + {'.', Meta, IdentifierMeta1, [Left, extract_identifier(Right)]}. extract_identifier({Kind, _, Identifier}) when Kind == identifier; Kind == bracket_identifier; Kind == paren_identifier; @@ -916,8 +923,8 @@ build_no_parens_do_block(Expr, Args, {BlockMeta, Block}) -> build_no_parens(Expr, Args) -> build_call(Expr, Args). 
-build_identifier({'.', Meta, IdentifierLocation, DotArgs}) -> - {{'.', Meta, DotArgs}, [{no_parens, true} | IdentifierLocation], []}; +build_identifier({'.', Meta, IdentifierMeta, DotArgs}) -> + {{'.', Meta, DotArgs}, [{no_parens, true} | IdentifierMeta], []}; build_identifier({'.', Meta, _} = Dot) -> {Dot, [{no_parens, true} | Meta], []}; @@ -925,8 +932,8 @@ build_identifier({'.', Meta, _} = Dot) -> build_identifier({_, Location, Identifier}) -> {Identifier, meta_from_location(Location), nil}. -build_call({'.', Meta, IdentifierLocation, DotArgs}, Args) -> - {{'.', Meta, DotArgs}, IdentifierLocation, Args}; +build_call({'.', Meta, IdentifierMeta, DotArgs}, Args) -> + {{'.', Meta, DotArgs}, IdentifierMeta, Args}; build_call({'.', Meta, _} = Dot, Args) -> {Dot, Meta, Args}; diff --git a/lib/elixir/src/elixir_tokenizer.erl b/lib/elixir/src/elixir_tokenizer.erl index 45ee82c244e..6dee340926c 100644 --- a/lib/elixir/src/elixir_tokenizer.erl +++ b/lib/elixir/src/elixir_tokenizer.erl @@ -540,7 +540,7 @@ tokenize([$: | String] = Original, Line, Column, Scope, Tokens) -> {_Kind, Unencoded, Atom, Rest, Length, Ascii, _Special} -> NewScope = maybe_warn_for_ambiguous_bang_before_equals(atom, Unencoded, Rest, Line, Column, Scope), TrackedScope = track_ascii(Ascii, NewScope), - Token = {atom, {Line, Column, Unencoded}, Atom}, + Token = {atom, {Line, Column, {Unencoded, nil}}, Atom}, tokenize(Rest, Line, Column + 1 + Length, TrackedScope, [Token | Tokens]); empty when Scope#elixir_tokenizer.cursor_completion == false -> unexpected_token(Original, Line, Column, Scope, Tokens); @@ -651,7 +651,7 @@ tokenize(String, Line, Column, OriginalScope, Tokens) -> case Rest of [$: | T] when ?is_space(hd(T)) -> - Token = {kw_identifier, {Line, Column, Unencoded}, Atom}, + Token = {kw_identifier, {Line, Column, {Unencoded, nil}}, Atom}, tokenize(T, Line, Column + Length + 1, Scope, [Token | Tokens]); [$: | T] when hd(T) =/= $: -> @@ -671,7 +671,7 @@ tokenize(String, Line, Column, 
OriginalScope, Tokens) -> _ when Kind == identifier -> NewScope = maybe_warn_for_ambiguous_bang_before_equals(identifier, Unencoded, Rest, Line, Column, Scope), - Token = check_call_identifier(Line, Column, Unencoded, Atom, Rest), + Token = check_call_identifier(Line, Column, Unencoded, nil, Atom, Rest), tokenize(Rest, Line, Column + Length, NewScope, [Token | Tokens]); _ -> @@ -918,7 +918,7 @@ handle_dot([$., H | T] = Original, Line, Column, DotInfo, Scope, Tokens) when ?i case unsafe_to_atom(UnescapedPart, Line, Column, NewScope) of {ok, Atom} -> - Token = check_call_identifier(Line, Column, Part, Atom, Rest), + Token = check_call_identifier(Line, Column, Part, $", Atom, Rest), TokensSoFar = add_token_with_eol({'.', DotInfo}, Tokens), tokenize(Rest, NewLine, NewColumn, NewScope, [Token | TokensSoFar]); @@ -937,7 +937,7 @@ handle_dot([$. | Rest], Line, Column, DotInfo, Scope, Tokens) -> tokenize(Rest, Line, Column, Scope, TokensSoFar). handle_call_identifier(Rest, Line, Column, DotInfo, Length, UnencodedOp, Scope, Tokens) -> - Token = check_call_identifier(Line, Column, UnencodedOp, list_to_atom(UnencodedOp), Rest), + Token = check_call_identifier(Line, Column, UnencodedOp, nil, list_to_atom(UnencodedOp), Rest), TokensSoFar = add_token_with_eol({'.', DotInfo}, Tokens), tokenize(Rest, Line, Column + Length, Scope, [Token | TokensSoFar]). @@ -1324,18 +1324,18 @@ tokenize_alias(Rest, Line, Column, Unencoded, Atom, Length, Ascii, Special, Scop error(Reason, Unencoded ++ Rest, Scope, Tokens); true -> - AliasesToken = {alias, {Line, Column, Unencoded}, Atom}, + AliasesToken = {alias, {Line, Column, {Unencoded, nil}}, Atom}, tokenize(Rest, Line, Column + Length, Scope, [AliasesToken | Tokens]) end. 
%% Check if it is a call identifier (paren | bracket | do) -check_call_identifier(Line, Column, Unencoded, Atom, [$( | _]) -> - {paren_identifier, {Line, Column, Unencoded}, Atom}; -check_call_identifier(Line, Column, Unencoded, Atom, [$[ | _]) -> - {bracket_identifier, {Line, Column, Unencoded}, Atom}; -check_call_identifier(Line, Column, Unencoded, Atom, _Rest) -> - {identifier, {Line, Column, Unencoded}, Atom}. +check_call_identifier(Line, Column, Unencoded, Delimiter, Atom, [$( | _]) -> + {paren_identifier, {Line, Column, {Unencoded, Delimiter}}, Atom}; +check_call_identifier(Line, Column, Unencoded, Delimiter, Atom, [$[ | _]) -> + {bracket_identifier, {Line, Column, {Unencoded, Delimiter}}, Atom}; +check_call_identifier(Line, Column, Unencoded, Delimiter, Atom, _Rest) -> + {identifier, {Line, Column, {Unencoded, Delimiter}}, Atom}. add_token_with_eol({unary_op, _, _} = Left, T) -> [Left | T]; add_token_with_eol(Left, [{eol, _} | T]) -> [Left | T]; diff --git a/lib/elixir/test/elixir/kernel/parser_test.exs b/lib/elixir/test/elixir/kernel/parser_test.exs index 70a9188289e..b18f8a34982 100644 --- a/lib/elixir/test/elixir/kernel/parser_test.exs +++ b/lib/elixir/test/elixir/kernel/parser_test.exs @@ -128,7 +128,17 @@ defmodule Kernel.ParserTest do end test "handles graphemes inside quoted identifiers" do - assert {{:., _, [{:foo, _, nil}, :"➡️"]}, _, []} = Code.string_to_quoted!(~s|foo."➡️"|) + assert { + {:., _, [{:foo, _, nil}, :"➡️"]}, + [no_parens: true, delimiter: ~S["], line: 1], + [] + } = Code.string_to_quoted!(~S|foo."➡️"|, token_metadata: true) + + assert { + {:., _, [{:foo, _, nil}, :"➡️"]}, + [closing: [line: 1], delimiter: ~S["], line: 1], + [] + } = Code.string_to_quoted!(~S|foo."➡️"()|, token_metadata: true) end end diff --git a/lib/elixir/unicode/security.ex b/lib/elixir/unicode/security.ex index 90886d58565..2b2bb3ed117 100644 --- a/lib/elixir/unicode/security.ex +++ b/lib/elixir/unicode/security.ex @@ -40,7 +40,7 @@ defmodule 
String.Tokenizer.Security do ] defp check_token_for_confusability( - {kind, {_line, _column, [_ | _] = name} = info, _}, + {kind, {_line, _column, {[_ | _] = name, _delimiter}} = info, _}, skeletons ) when kind in @identifiers do @@ -50,7 +50,7 @@ defmodule String.Tokenizer.Security do {_, _, ^name} -> {:ok, skeletons} - {line, _, previous_name} when name != previous_name -> + {line, _, {previous_name, _delimiter}} when name != previous_name -> {:warn, "confusable identifier: '#{name}' looks like '#{previous_name}' on line #{line}, " <> "but they are written using different characters" <> dir_compare(name, previous_name)} From d138487c405d165e28f38f3e5dd1e0ebdeaf1fc6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonatan=20K=C5=82osko?= <jonatanklosko@gmail.com> Date: Tue, 5 Nov 2024 19:34:01 +0800 Subject: [PATCH 2/2] Store delimiter in dot token instead --- lib/elixir/src/elixir_parser.yrl | 4 ++-- lib/elixir/src/elixir_tokenizer.erl | 27 ++++++++++++++------------- lib/elixir/unicode/security.ex | 4 ++-- 3 files changed, 18 insertions(+), 17 deletions(-) diff --git a/lib/elixir/src/elixir_parser.yrl b/lib/elixir/src/elixir_parser.yrl index b32fc533add..8777a44d742 100644 --- a/lib/elixir/src/elixir_parser.yrl +++ b/lib/elixir/src/elixir_parser.yrl @@ -892,8 +892,8 @@ build_dot(Dot, Left, {_, Location, _} = Right) -> Meta = meta_from_token(Dot), IdentifierMeta0 = meta_from_location(Location), IdentifierMeta1 = - case Location of - {_Line, _Column, {_Unencoded, Delimiter}} when Delimiter =/= nil -> + case Dot of + {'.', {_Line, _Column, Delimiter}} when Delimiter =/= nil -> delimiter(<<Delimiter/utf8>>) ++ IdentifierMeta0; _ -> IdentifierMeta0 diff --git a/lib/elixir/src/elixir_tokenizer.erl b/lib/elixir/src/elixir_tokenizer.erl index 6dee340926c..ebe290d35cc 100644 --- a/lib/elixir/src/elixir_tokenizer.erl +++ b/lib/elixir/src/elixir_tokenizer.erl @@ -540,7 +540,7 @@ tokenize([$: | String] = Original, Line, Column, Scope, Tokens) -> {_Kind, Unencoded, Atom, Rest, Length, Ascii, _Special} -> NewScope =
maybe_warn_for_ambiguous_bang_before_equals(atom, Unencoded, Rest, Line, Column, Scope), TrackedScope = track_ascii(Ascii, NewScope), - Token = {atom, {Line, Column, {Unencoded, nil}}, Atom}, + Token = {atom, {Line, Column, Unencoded}, Atom}, tokenize(Rest, Line, Column + 1 + Length, TrackedScope, [Token | Tokens]); empty when Scope#elixir_tokenizer.cursor_completion == false -> unexpected_token(Original, Line, Column, Scope, Tokens); @@ -651,7 +651,7 @@ tokenize(String, Line, Column, OriginalScope, Tokens) -> case Rest of [$: | T] when ?is_space(hd(T)) -> - Token = {kw_identifier, {Line, Column, {Unencoded, nil}}, Atom}, + Token = {kw_identifier, {Line, Column, Unencoded}, Atom}, tokenize(T, Line, Column + Length + 1, Scope, [Token | Tokens]); [$: | T] when hd(T) =/= $: -> @@ -671,7 +671,7 @@ tokenize(String, Line, Column, OriginalScope, Tokens) -> _ when Kind == identifier -> NewScope = maybe_warn_for_ambiguous_bang_before_equals(identifier, Unencoded, Rest, Line, Column, Scope), - Token = check_call_identifier(Line, Column, Unencoded, nil, Atom, Rest), + Token = check_call_identifier(Line, Column, Unencoded, Atom, Rest), tokenize(Rest, Line, Column + Length, NewScope, [Token | Tokens]); _ -> @@ -918,8 +918,9 @@ handle_dot([$., H | T] = Original, Line, Column, DotInfo, Scope, Tokens) when ?i case unsafe_to_atom(UnescapedPart, Line, Column, NewScope) of {ok, Atom} -> - Token = check_call_identifier(Line, Column, Part, $", Atom, Rest), - TokensSoFar = add_token_with_eol({'.', DotInfo}, Tokens), + Token = check_call_identifier(Line, Column, Part, Atom, Rest), + DotInfo1 = setelement(3, DotInfo, $"), + TokensSoFar = add_token_with_eol({'.', DotInfo1}, Tokens), tokenize(Rest, NewLine, NewColumn, NewScope, [Token | TokensSoFar]); {error, Reason} -> @@ -937,7 +938,7 @@ handle_dot([$. | Rest], Line, Column, DotInfo, Scope, Tokens) -> tokenize(Rest, Line, Column, Scope, TokensSoFar). 
handle_call_identifier(Rest, Line, Column, DotInfo, Length, UnencodedOp, Scope, Tokens) -> - Token = check_call_identifier(Line, Column, UnencodedOp, nil, list_to_atom(UnencodedOp), Rest), + Token = check_call_identifier(Line, Column, UnencodedOp, list_to_atom(UnencodedOp), Rest), TokensSoFar = add_token_with_eol({'.', DotInfo}, Tokens), tokenize(Rest, Line, Column + Length, Scope, [Token | TokensSoFar]). @@ -1324,18 +1325,18 @@ tokenize_alias(Rest, Line, Column, Unencoded, Atom, Length, Ascii, Special, Scop error(Reason, Unencoded ++ Rest, Scope, Tokens); true -> - AliasesToken = {alias, {Line, Column, {Unencoded, nil}}, Atom}, + AliasesToken = {alias, {Line, Column, Unencoded}, Atom}, tokenize(Rest, Line, Column + Length, Scope, [AliasesToken | Tokens]) end. %% Check if it is a call identifier (paren | bracket | do) -check_call_identifier(Line, Column, Unencoded, Delimiter, Atom, [$( | _]) -> - {paren_identifier, {Line, Column, {Unencoded, Delimiter}}, Atom}; -check_call_identifier(Line, Column, Unencoded, Delimiter, Atom, [$[ | _]) -> - {bracket_identifier, {Line, Column, {Unencoded, Delimiter}}, Atom}; -check_call_identifier(Line, Column, Unencoded, Delimiter, Atom, _Rest) -> - {identifier, {Line, Column, {Unencoded, Delimiter}}, Atom}. +check_call_identifier(Line, Column, Unencoded, Atom, [$( | _]) -> + {paren_identifier, {Line, Column, Unencoded}, Atom}; +check_call_identifier(Line, Column, Unencoded, Atom, [$[ | _]) -> + {bracket_identifier, {Line, Column, Unencoded}, Atom}; +check_call_identifier(Line, Column, Unencoded, Atom, _Rest) -> + {identifier, {Line, Column, Unencoded}, Atom}. 
add_token_with_eol({unary_op, _, _} = Left, T) -> [Left | T]; add_token_with_eol(Left, [{eol, _} | T]) -> [Left | T]; diff --git a/lib/elixir/unicode/security.ex b/lib/elixir/unicode/security.ex index 2b2bb3ed117..90886d58565 100644 --- a/lib/elixir/unicode/security.ex +++ b/lib/elixir/unicode/security.ex @@ -40,7 +40,7 @@ defmodule String.Tokenizer.Security do ] defp check_token_for_confusability( - {kind, {_line, _column, {[_ | _] = name, _delimiter}} = info, _}, + {kind, {_line, _column, [_ | _] = name} = info, _}, skeletons ) when kind in @identifiers do @@ -50,7 +50,7 @@ defmodule String.Tokenizer.Security do {_, _, ^name} -> {:ok, skeletons} - {line, _, {previous_name, _delimiter}} when name != previous_name -> + {line, _, previous_name} when name != previous_name -> {:warn, "confusable identifier: '#{name}' looks like '#{previous_name}' on line #{line}, " <> "but they are written using different characters" <> dir_compare(name, previous_name)}