Skip to content

Commit c032839

Browse files
sabiwara authored and josevalim committed
Parser honors :static_atoms_encoder for multi-letter sigils (#12706)
* Unify sigil token generation
* Parser honors :static_atoms_encoder for multi-letter sigils
* Remove chars from token, reverse engineer from atom
1 parent 52cf991 commit c032839

File tree

6 files changed

+79
-27
lines changed

6 files changed

+79
-27
lines changed

lib/elixir/lib/code.ex

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1136,7 +1136,10 @@ defmodule Code do
11361136
* syntax keywords (`fn`, `do`, `else`, and so on)
11371137
11381138
* atoms containing interpolation (`:"#{1 + 1} is two"`), as these
1139-
atoms are constructed at runtime.
1139+
atoms are constructed at runtime
1140+
1141+
* atoms used to represent single-letter sigils like `:sigil_X`
1142+
(but multi-letter sigils like `:sigil_XYZ` are encoded).
11401143
11411144
"""
11421145
@spec string_to_quoted(List.Chars.t(), keyword) ::

lib/elixir/src/elixir_errors.erl

Lines changed: 12 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -170,13 +170,22 @@ parse_error(Location, File, <<"syntax error before: ">>, Keyword, Input)
170170

171171
%% Produce a human-readable message for errors before a sigil
172172
parse_error(Location, File, <<"syntax error before: ">>, <<"{sigil,", _Rest/binary>> = Full, Input) ->
173-
{sigil, _, Sigil, [Content | _], _, _, _} = parse_erl_term(Full),
173+
{sigil, _, Atom, [Content | _], _, _, _} = parse_erl_term(Full),
174174
Content2 = case is_binary(Content) of
175175
true -> Content;
176176
false -> <<>>
177177
end,
178-
SigilName = list_to_binary(Sigil),
179-
Message = <<"syntax error before: sigil \~", SigilName/binary, " starting with content '", Content2/binary, "'">>,
178+
179+
% :static_atoms_encoder might encode :sigil_ atoms as arbitrary terms
180+
MaybeSigil = case is_atom(Atom) of
181+
true -> case atom_to_binary(Atom) of
182+
<<"sigil_", Chars/binary>> -> <<"\~", Chars/binary, " ">>;
183+
_ -> <<>>
184+
end;
185+
false -> <<>>
186+
end,
187+
188+
Message = <<"syntax error before: sigil ", MaybeSigil/binary, "starting with content '", Content2/binary, "'">>,
180189
raise_snippet(Location, File, Input, 'Elixir.SyntaxError', Message);
181190

182191
%% Binaries (and interpolation) are wrapped in [<<...>>]

lib/elixir/src/elixir_parser.yrl

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -939,11 +939,11 @@ build_access(Expr, {List, Meta}) ->
939939

940940
%% Interpolation aware
941941

942-
build_sigil({sigil, Location, Sigil, Parts, Modifiers, Indentation, Delimiter}) ->
942+
build_sigil({sigil, Location, Atom, Parts, Modifiers, Indentation, Delimiter}) ->
943943
Meta = meta_from_location(Location),
944944
MetaWithDelimiter = [{delimiter, Delimiter} | Meta],
945945
MetaWithIndentation = meta_with_indentation(Meta, Indentation),
946-
{list_to_atom("sigil_" ++ Sigil),
946+
{Atom,
947947
MetaWithDelimiter,
948948
[{'<<>>', MetaWithIndentation, string_parts(Parts)}, Modifiers]}.
949949

lib/elixir/src/elixir_tokenizer.erl

Lines changed: 21 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -1550,7 +1550,7 @@ tokenize_sigil_name([S | T], NameAcc, Line, Column, Scope, Tokens) when ?is_upca
15501550
tokenize_sigil_name(T, [S | NameAcc], Line, Column + 1, Scope, Tokens);
15511551
% With a lowercase letter and a non-empty NameAcc we return an error.
15521552
tokenize_sigil_name([S | _T] = Original, [_ | _] = NameAcc, _Line, _Column, _Scope, _Tokens) when ?is_downcase(S) ->
1553-
Message = "invalid sigil name, it should be either a one-letter lowercase letter or a" ++
1553+
Message = "invalid sigil name, it should be either a one-letter lowercase letter or a" ++
15541554
" sequence of uppercase letters only, got: ",
15551555
{error, Message, [$~] ++ lists:reverse(NameAcc) ++ Original};
15561556
% We finished the letters, so the name is over.
@@ -1561,12 +1561,8 @@ tokenize_sigil_contents([H, H, H | T] = Original, [S | _] = SigilName, Line, Col
15611561
when ?is_quote(H) ->
15621562
case extract_heredoc_with_interpolation(Line, Column, Scope, ?is_downcase(S), T, H) of
15631563
{ok, NewLine, NewColumn, Parts, Rest, NewScope} ->
1564-
{Final, Modifiers} = collect_modifiers(Rest, []),
15651564
Indentation = NewColumn - 4,
1566-
TokenColumn = Column - 1 - length(SigilName),
1567-
Token = {sigil, {Line, TokenColumn, nil}, SigilName, Parts, Modifiers, Indentation, <<H, H, H>>},
1568-
NewColumnWithModifiers = NewColumn + length(Modifiers),
1569-
tokenize(Final, NewLine, NewColumnWithModifiers, NewScope, [Token | Tokens]);
1565+
add_sigil_token(SigilName, Line, Column, NewLine, NewColumn, Parts, Rest, NewScope, Tokens, Indentation, <<H, H, H>>);
15701566

15711567
{error, Reason} ->
15721568
error(Reason, [$~] ++ SigilName ++ Original, Scope, Tokens)
@@ -1576,12 +1572,8 @@ tokenize_sigil_contents([H | T] = Original, [S | _] = SigilName, Line, Column, S
15761572
when ?is_sigil(H) ->
15771573
case elixir_interpolation:extract(Line, Column + 1, Scope, ?is_downcase(S), T, sigil_terminator(H)) of
15781574
{NewLine, NewColumn, Parts, Rest, NewScope} ->
1579-
{Final, Modifiers} = collect_modifiers(Rest, []),
15801575
Indentation = nil,
1581-
TokenColumn = Column - 1 - length(SigilName),
1582-
Token = {sigil, {Line, TokenColumn, nil}, SigilName, tokens_to_binary(Parts), Modifiers, Indentation, <<H>>},
1583-
NewColumnWithModifiers = NewColumn + length(Modifiers),
1584-
tokenize(Final, NewLine, NewColumnWithModifiers, NewScope, [Token | Tokens]);
1576+
add_sigil_token(SigilName, Line, Column, NewLine, NewColumn, tokens_to_binary(Parts), Rest, NewScope, Tokens, Indentation, <<H>>);
15851577

15861578
{error, Reason} ->
15871579
Sigil = [$~, S, H],
@@ -1601,6 +1593,24 @@ tokenize_sigil_contents([H | _] = Original, SigilName, Line, Column, Scope, Toke
16011593
tokenize_sigil_contents([], _SigilName, Line, Column, Scope, Tokens) ->
16021594
tokenize([], Line, Column, Scope, Tokens).
16031595

1596+
add_sigil_token(SigilName, Line, Column, NewLine, NewColumn, Parts, Rest, Scope, Tokens, Indentation, Delimiter) ->
1597+
TokenColumn = Column - 1 - length(SigilName),
1598+
MaybeEncoded = case SigilName of
1599+
% Single-letter sigils present no risk of atom exhaustion (limited possibilities)
1600+
[_Char] -> {ok, list_to_atom("sigil_" ++ SigilName)};
1601+
_ -> unsafe_to_atom("sigil_" ++ SigilName, Line, TokenColumn, Scope)
1602+
end,
1603+
case MaybeEncoded of
1604+
{ok, Atom} ->
1605+
{Final, Modifiers} = collect_modifiers(Rest, []),
1606+
Token = {sigil, {Line, TokenColumn, nil}, Atom, Parts, Modifiers, Indentation, Delimiter},
1607+
NewColumnWithModifiers = NewColumn + length(Modifiers),
1608+
tokenize(Final, NewLine, NewColumnWithModifiers, Scope, [Token | Tokens]);
1609+
1610+
{error, Reason} ->
1611+
error(Reason, Rest, Scope, Tokens)
1612+
end.
1613+
16041614
%% Fail early on invalid do syntax. For example, after
16051615
%% most keywords, after comma and so on.
16061616
tokenize_keyword_terminator(DoLine, DoColumn, do, [{identifier, {Line, Column, Meta}, Atom} | T]) ->

lib/elixir/test/elixir/kernel/parser_test.exs

Lines changed: 30 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -182,6 +182,9 @@ defmodule Kernel.ParserTest do
182182
assert Code.string_to_quoted(":there_is_no_such_atom", existing_atoms_only: true) ==
183183
{:error,
184184
{[line: 1, column: 1], "unsafe atom does not exist: ", "there_is_no_such_atom"}}
185+
186+
assert Code.string_to_quoted("~UNKNOWN'foo bar'", existing_atoms_only: true) ==
187+
{:error, {[line: 1, column: 1], "unsafe atom does not exist: ", "sigil_UNKNOWN"}}
185188
end
186189

187190
test "encodes atoms" do
@@ -228,6 +231,20 @@ defmodule Kernel.ParserTest do
228231
)
229232
end
230233

234+
test "encodes multi-letter sigils" do
235+
ref = make_ref()
236+
237+
encoder = fn atom, meta ->
238+
assert atom == "sigil_UNKNOWN"
239+
assert meta[:line] == 1
240+
assert meta[:column] == 1
241+
{:ok, ref}
242+
end
243+
244+
assert {:ok, {^ref, [delimiter: "'", line: 1], [{:<<>>, [line: 1], ["abc"]}, []]}} =
245+
Code.string_to_quoted("~UNKNOWN'abc'", static_atoms_encoder: encoder)
246+
end
247+
231248
test "addresses ambiguities" do
232249
encoder = fn string, _meta -> {:ok, {:atom, string}} end
233250

@@ -254,6 +271,16 @@ defmodule Kernel.ParserTest do
254271
Code.string_to_quoted("[do: 1, true: 2, end: 3]", static_atoms_encoder: encoder)
255272
end
256273

274+
test "does not encode one-letter sigils" do
275+
encoder = fn atom, _meta -> raise "shouldn't be invoked for #{atom}" end
276+
277+
assert {:ok, {:sigil_z, [{:delimiter, "'"}, {:line, 1}], [{:<<>>, [line: 1], ["foo"]}, []]}} =
278+
Code.string_to_quoted("~z'foo'", static_atoms_encoder: encoder)
279+
280+
assert {:ok, {:sigil_Z, [{:delimiter, "'"}, {:line, 1}], [{:<<>>, [line: 1], ["foo"]}, []]}} =
281+
Code.string_to_quoted("~Z'foo'", static_atoms_encoder: encoder)
282+
end
283+
257284
test "returns errors on long atoms even when using static_atoms_encoder" do
258285
atom = String.duplicate("a", 256)
259286

@@ -271,6 +298,9 @@ defmodule Kernel.ParserTest do
271298

272299
assert {:error, {[line: 1, column: 1], "Invalid atom name: ", "there_is_no_such_atom"}} =
273300
Code.string_to_quoted(":there_is_no_such_atom", static_atoms_encoder: encoder)
301+
302+
assert {:error, {[line: 1, column: 1], "Invalid atom name: ", "sigil_UNKNOWN"}} =
303+
Code.string_to_quoted("~UNKNOWN'foo bar'", static_atoms_encoder: encoder)
274304
end
275305

276306
test "may return tuples" do

lib/elixir/test/erlang/tokenizer_test.erl

Lines changed: 10 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -261,22 +261,22 @@ vc_merge_conflict_test() ->
261261
tokenize_error("<<<<<<< HEAD\n[1, 2, 3]").
262262

263263
sigil_terminator_test() ->
264-
[{sigil, {1, 1, nil}, "r", [<<"foo">>], "", nil, <<"/">>}] = tokenize("~r/foo/"),
265-
[{sigil, {1, 1, nil}, "r", [<<"foo">>], "", nil, <<"[">>}] = tokenize("~r[foo]"),
266-
[{sigil, {1, 1, nil}, "r", [<<"foo">>], "", nil, <<"\"">>}] = tokenize("~r\"foo\""),
267-
[{sigil, {1, 1, nil}, "r", [<<"foo">>], "", nil, <<"/">>},
264+
[{sigil, {1, 1, nil}, sigil_r, [<<"foo">>], "", nil, <<"/">>}] = tokenize("~r/foo/"),
265+
[{sigil, {1, 1, nil}, sigil_r, [<<"foo">>], "", nil, <<"[">>}] = tokenize("~r[foo]"),
266+
[{sigil, {1, 1, nil}, sigil_r, [<<"foo">>], "", nil, <<"\"">>}] = tokenize("~r\"foo\""),
267+
[{sigil, {1, 1, nil}, sigil_r, [<<"foo">>], "", nil, <<"/">>},
268268
{comp_op, {1, 9, nil}, '=='},
269269
{identifier, {1, 12, _}, bar}] = tokenize("~r/foo/ == bar"),
270-
[{sigil, {1, 1, nil}, "r", [<<"foo">>], "iu", nil, <<"/">>},
270+
[{sigil, {1, 1, nil}, sigil_r, [<<"foo">>], "iu", nil, <<"/">>},
271271
{comp_op, {1, 11, nil}, '=='},
272272
{identifier, {1, 14, _}, bar}] = tokenize("~r/foo/iu == bar"),
273-
[{sigil, {1, 1, nil}, "M", [<<"1 2 3">>], "u8", nil, <<"[">>}] = tokenize("~M[1 2 3]u8").
273+
[{sigil, {1, 1, nil}, sigil_M, [<<"1 2 3">>], "u8", nil, <<"[">>}] = tokenize("~M[1 2 3]u8").
274274

275275
sigil_heredoc_test() ->
276-
[{sigil, {1, 1, nil}, "S", [<<"sigil heredoc\n">>], "", 0, <<"\"\"\"">>}] = tokenize("~S\"\"\"\nsigil heredoc\n\"\"\""),
277-
[{sigil, {1, 1, nil}, "S", [<<"sigil heredoc\n">>], "", 0, <<"'''">>}] = tokenize("~S'''\nsigil heredoc\n'''"),
278-
[{sigil, {1, 1, nil}, "S", [<<"sigil heredoc\n">>], "", 2, <<"\"\"\"">>}] = tokenize("~S\"\"\"\n sigil heredoc\n \"\"\""),
279-
[{sigil, {1, 1, nil}, "s", [<<"sigil heredoc\n">>], "", 2, <<"\"\"\"">>}] = tokenize("~s\"\"\"\n sigil heredoc\n \"\"\"").
276+
[{sigil, {1, 1, nil}, sigil_S, [<<"sigil heredoc\n">>], "", 0, <<"\"\"\"">>}] = tokenize("~S\"\"\"\nsigil heredoc\n\"\"\""),
277+
[{sigil, {1, 1, nil}, sigil_S, [<<"sigil heredoc\n">>], "", 0, <<"'''">>}] = tokenize("~S'''\nsigil heredoc\n'''"),
278+
[{sigil, {1, 1, nil}, sigil_S, [<<"sigil heredoc\n">>], "", 2, <<"\"\"\"">>}] = tokenize("~S\"\"\"\n sigil heredoc\n \"\"\""),
279+
[{sigil, {1, 1, nil}, sigil_s, [<<"sigil heredoc\n">>], "", 2, <<"\"\"\"">>}] = tokenize("~s\"\"\"\n sigil heredoc\n \"\"\"").
280280

281281
invalid_sigil_delimiter_test() ->
282282
{1, 1, "invalid sigil delimiter: ", Message} = tokenize_error("~s\\"),

0 commit comments

Comments
 (0)