Skip to content

Commit 8e9ea26

Browse files
uohzxela authored and whatyouhide committed
Add a preserve_comments option to the tokenizer (#6222)
1 parent 3ac7593 commit 8e9ea26

File tree

3 files changed

+88
-44
lines changed

3 files changed

+88
-44
lines changed

lib/elixir/src/elixir.hrl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,5 +36,6 @@
3636
terminators=[],
3737
check_terminators=true,
3838
existing_atoms_only=false,
39+
preserve_comments=false,
3940
identifier_tokenizer=elixir_tokenizer
4041
}).

lib/elixir/src/elixir_tokenizer.erl

Lines changed: 72 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -119,20 +119,29 @@ tokenize(String, Line, Column, Opts) ->
119119
false -> <<"nofile">>
120120
end,
121121

122-
Existing = case lists:keyfind(existing_atoms_only, 1, Opts) of
123-
{existing_atoms_only, true} -> true;
124-
false -> false
122+
ExistingAtomsOnly = case lists:keyfind(existing_atoms_only, 1, Opts) of
123+
{existing_atoms_only, ExistingAtomsOnlyBool} when
124+
is_boolean(ExistingAtomsOnlyBool) -> ExistingAtomsOnlyBool;
125+
_ -> false
125126
end,
126127

127-
Check = case lists:keyfind(check_terminators, 1, Opts) of
128-
{check_terminators, false} -> false;
129-
false -> true
128+
CheckTerminators = case lists:keyfind(check_terminators, 1, Opts) of
129+
{check_terminators, CheckTerminatorsBool} when
130+
is_boolean(CheckTerminatorsBool) -> CheckTerminatorsBool;
131+
_ -> true
132+
end,
133+
134+
PreserveComments = case lists:keyfind(preserve_comments, 1, Opts) of
135+
{preserve_comments, PreserveCommentsBool} when
136+
is_boolean(PreserveCommentsBool) -> PreserveCommentsBool;
137+
_ -> false
130138
end,
131139

132140
tokenize(String, Line, Column, #elixir_tokenizer{
133141
file=File,
134-
existing_atoms_only=Existing,
135-
check_terminators=Check,
142+
existing_atoms_only=ExistingAtomsOnly,
143+
check_terminators=CheckTerminators,
144+
preserve_comments=PreserveComments,
136145
identifier_tokenizer=elixir_config:get(identifier_tokenizer)
137146
}).
138147

@@ -170,8 +179,14 @@ tokenize([$0, $o, H | T], Line, Column, Scope, Tokens) when ?is_octal(H) ->
170179
% Comments
171180

172181
tokenize([$# | String], Line, Column, Scope, Tokens) ->
173-
Rest = tokenize_comment(String),
174-
tokenize(Rest, Line, Column, Scope, Tokens);
182+
{Rest, Comment, Length} = tokenize_comment(String, [$#], 1),
183+
case Scope#elixir_tokenizer.preserve_comments of
184+
true ->
185+
CommentToken = {comment, {Line, Column, Column + Length}, Comment},
186+
tokenize(Rest, Line, Column + Length, Scope, [CommentToken | Tokens]);
187+
false ->
188+
tokenize(Rest, Line, Column, Scope, Tokens)
189+
end;
175190

176191
% Sigils
177192

@@ -453,8 +468,8 @@ tokenize([$% | T], Line, Column, Scope, Tokens) ->
453468
tokenize(T, Line, Column + 1, Scope, [{'%', {Line, Column, Column + 1}} | Tokens]);
454469

455470
tokenize([$. | T], Line, Column, Scope, Tokens) ->
456-
{Rest, Counter, Offset} = strip_dot_space(T, 0, Column + 1),
457-
handle_dot([$. | Rest], Line + Counter, Offset - 1, Column, Scope, Tokens);
471+
{Rest, Counter, Offset, CommentTokens} = strip_dot_space(T, 0, Column + 1, Line, []),
472+
handle_dot([$. | Rest], Line, Offset - 1, Column, Scope, Tokens, CommentTokens, Counter);
458473

459474
% Identifiers
460475

@@ -498,12 +513,18 @@ strip_horizontal_space([H | T], Counter) when ?is_horizontal_space(H) ->
498513
strip_horizontal_space(T, Counter) ->
499514
{T, Counter}.
500515

501-
strip_dot_space(T, Counter, Column) ->
516+
strip_dot_space(T, Counter, Column, StartLine, Tokens) ->
502517
case strip_horizontal_space(T) of
503-
{"#" ++ Rest, _} -> strip_dot_space(tokenize_comment(Rest), Counter, 1);
504-
{"\r\n" ++ Rest, _} -> strip_dot_space(Rest, Counter + 1, 1);
505-
{"\n" ++ Rest, _} -> strip_dot_space(Rest, Counter + 1, 1);
506-
{Rest, Length} -> {Rest, Counter, Column + Length}
518+
{"#" ++ R, _} ->
519+
{Rest, Comment, Length} = tokenize_comment(R, [$#], 1),
520+
CommentToken = {comment, {StartLine + Counter, Column, Column + Length}, Comment},
521+
strip_dot_space(Rest, Counter, 1, StartLine, [CommentToken | Tokens]);
522+
{"\r\n" ++ Rest, _} ->
523+
strip_dot_space(Rest, Counter + 1, 1, StartLine, Tokens);
524+
{"\n" ++ Rest, _} ->
525+
strip_dot_space(Rest, Counter + 1, 1, StartLine, Tokens);
526+
{Rest, Length} ->
527+
{Rest, Counter, Column + Length, Tokens}
507528
end.
508529

509530
handle_char(7) -> {"\\a", "alert"};
@@ -572,51 +593,59 @@ handle_op(Rest, Line, Column, Kind, Length, Op, Scope, Tokens) ->
572593
add_token_with_nl({Kind, {Line, Column, Column + Length}, Op}, Tokens))
573594
end.
574595

596+
%% Prepends the comment tokens collected while skipping whitespace after a dot
%% onto the token accumulator, but only when the tokenizer scope has
%% preserve_comments enabled; otherwise the accumulator is returned untouched.
handle_comments(CommentTokens, Tokens, Scope) ->
  Preserve = Scope#elixir_tokenizer.preserve_comments,
  case Preserve of
    false -> Tokens;
    true -> CommentTokens ++ Tokens
  end.
601+
575602
% ## Three Token Operators
576-
handle_dot([$., T1, T2, T3 | Rest], Line, Column, DotColumn, Scope, Tokens) when
603+
handle_dot([$., T1, T2, T3 | Rest], Line, Column, DotColumn, Scope, Tokens, CommentTokens, Counter) when
577604
?unary_op3(T1, T2, T3); ?comp_op3(T1, T2, T3); ?and_op3(T1, T2, T3); ?or_op3(T1, T2, T3);
578605
?arrow_op3(T1, T2, T3); ?three_op(T1, T2, T3) ->
579-
handle_call_identifier(Rest, Line, Column + 1, DotColumn, 3, list_to_atom([T1, T2, T3]), Scope, Tokens);
606+
handle_call_identifier(Rest, Line, Column + 1, DotColumn, 3, list_to_atom([T1, T2, T3]), Scope, Tokens, CommentTokens, Counter);
580607

581608
% ## Two Token Operators
582-
handle_dot([$., T1, T2 | Rest], Line, Column, DotColumn, Scope, Tokens) when
609+
handle_dot([$., T1, T2 | Rest], Line, Column, DotColumn, Scope, Tokens, CommentTokens, Counter) when
583610
?comp_op2(T1, T2); ?rel_op2(T1, T2); ?and_op(T1, T2); ?or_op(T1, T2);
584611
?arrow_op(T1, T2); ?in_match_op(T1, T2); ?two_op(T1, T2); ?stab_op(T1, T2);
585612
?type_op(T1, T2) ->
586-
handle_call_identifier(Rest, Line, Column + 1, DotColumn, 2, list_to_atom([T1, T2]), Scope, Tokens);
613+
handle_call_identifier(Rest, Line, Column + 1, DotColumn, 2, list_to_atom([T1, T2]), Scope, Tokens, CommentTokens, Counter);
587614

588615
% ## Single Token Operators
589-
handle_dot([$., T | Rest], Line, Column, DotColumn, Scope, Tokens) when
616+
handle_dot([$., T | Rest], Line, Column, DotColumn, Scope, Tokens, CommentTokens, Counter) when
590617
?at_op(T); ?unary_op(T); ?capture_op(T); ?dual_op(T); ?mult_op(T);
591618
?rel_op(T); ?match_op(T); ?pipe_op(T) ->
592-
handle_call_identifier(Rest, Line, Column + 1, DotColumn, 1, list_to_atom([T]), Scope, Tokens);
619+
handle_call_identifier(Rest, Line, Column + 1, DotColumn, 1, list_to_atom([T]), Scope, Tokens, CommentTokens, Counter);
593620

594621
% ## Exception for .( as it needs to be treated specially in the parser
595-
handle_dot([$., $( | Rest], Line, Column, DotColumn, Scope, Tokens) ->
596-
tokenize([$( | Rest], Line, Column + 2, Scope, add_token_with_nl({dot_call_op, {Line, DotColumn, DotColumn + 1}, '.'}, Tokens));
622+
handle_dot([$., $( | Rest], Line, Column, DotColumn, Scope, Tokens, CommentTokens, Counter) ->
623+
TokensSoFar = add_token_with_nl({dot_call_op, {Line, DotColumn, DotColumn + 1}, '.'}, Tokens),
624+
tokenize([$( | Rest], Line + Counter, Column + 2, Scope, handle_comments(CommentTokens, TokensSoFar, Scope));
597625

598-
handle_dot([$., H | T] = Original, Line, Column, DotColumn, Scope, Tokens) when ?is_quote(H) ->
626+
handle_dot([$., H | T] = Original, Line, Column, DotColumn, Scope, Tokens, CommentTokens, Counter) when ?is_quote(H) ->
599627
case elixir_interpolation:extract(Line, Column + 2, Scope, true, T, H) of
600628
{NewLine, NewColumn, [Part], Rest} when is_binary(Part) ->
601629
case unsafe_to_atom(Part, Line, Scope) of
602630
{ok, Atom} ->
603-
Token = check_call_identifier(Line, Column, max(NewColumn - Column, 0), Atom, Rest),
604-
tokenize(Rest, NewLine, NewColumn, Scope,
605-
[Token | add_token_with_nl({'.', {Line, DotColumn, DotColumn + 1}}, Tokens)]);
631+
Token = check_call_identifier(Line + Counter, Column, max(NewColumn - Column, 0), Atom, Rest),
632+
TokensSoFar = add_token_with_nl({'.', {Line, DotColumn, DotColumn + 1}}, Tokens),
633+
tokenize(Rest, NewLine, NewColumn, Scope, [Token | handle_comments(CommentTokens, TokensSoFar, Scope)]);
606634
{error, Reason} ->
607635
{error, Reason, Original, Tokens}
608636
end;
609637
{error, Reason} ->
610638
interpolation_error(Reason, Original, Tokens, " (for function name starting at line ~B)", [Line])
611639
end;
612640

613-
handle_dot([$. | Rest], Line, Column, DotColumn, Scope, Tokens) ->
614-
tokenize(Rest, Line, Column + 1, Scope, add_token_with_nl({'.', {Line, DotColumn, DotColumn + 1}}, Tokens)).
641+
handle_dot([$. | Rest], Line, Column, DotColumn, Scope, Tokens, CommentTokens, Counter) ->
642+
TokensSoFar = add_token_with_nl({'.', {Line, DotColumn, DotColumn + 1}}, Tokens),
643+
tokenize(Rest, Line + Counter, Column + 1, Scope, handle_comments(CommentTokens, TokensSoFar, Scope)).
615644

616-
handle_call_identifier(Rest, Line, Column, DotColumn, Length, Op, Scope, Tokens) ->
617-
{_, {_, _, NewColumn}, _} = Token = check_call_identifier(Line, Column, Length, Op, Rest),
618-
tokenize(Rest, Line, NewColumn, Scope,
619-
[Token | add_token_with_nl({'.', {Line, DotColumn, DotColumn + 1}}, Tokens)]).
645+
handle_call_identifier(Rest, Line, Column, DotColumn, Length, Op, Scope, Tokens, CommentTokens, Counter) ->
646+
{_, {NewLine, _, NewColumn}, _} = Token = check_call_identifier(Line + Counter, Column, Length, Op, Rest),
647+
TokensSoFar = add_token_with_nl({'.', {Line, DotColumn, DotColumn + 1}}, Tokens),
648+
tokenize(Rest, NewLine, NewColumn, Scope, [Token | handle_comments(CommentTokens, TokensSoFar, Scope)]).
620649

621650
% ## Ambiguous unary/binary operators tokens
622651
handle_space_sensitive_tokens([Sign, NotMarker | T], Line, Column, Scope, [{Identifier, _, _} = H | Tokens]) when
@@ -825,10 +854,14 @@ tokenize_bin(Rest, Acc, Length) ->
825854

826855
%% Comments
827856

828-
tokenize_comment("\r\n" ++ _ = Rest) -> Rest;
829-
tokenize_comment("\n" ++ _ = Rest) -> Rest;
830-
tokenize_comment([_ | Rest]) -> tokenize_comment(Rest);
831-
tokenize_comment([]) -> [].
857+
%% Consumes the rest of a comment up to, but not including, the line break.
%%
%% Acc holds the comment characters seen so far in reverse order (callers seed
%% it with [$#]) and Length counts them. Returns {Rest, Comment, Length}, where
%% Rest is the unconsumed input starting at the newline (or [] at end of input)
%% and Comment is the comment text in source order.
tokenize_comment("\r\n" ++ _ = Rest, Acc, Length) ->
  {Rest, lists:reverse(Acc), Length};
tokenize_comment("\n" ++ _ = Rest, Acc, Length) ->
  {Rest, lists:reverse(Acc), Length};
tokenize_comment([H | Rest], Acc, Length) ->
  tokenize_comment(Rest, [H | Acc], Length + 1);
tokenize_comment([], Acc, Length) ->
  %% A comment on the last line has no trailing newline. The accumulator must
  %% still be reversed here, otherwise the comment text comes out backwards.
  {[], lists:reverse(Acc), Length}.
832865

833866
%% Identifiers
834867

lib/elixir/test/erlang/tokenizer_test.erl

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,10 @@
22
-include_lib("eunit/include/eunit.hrl").
33

44
tokenize(String) ->
5-
{ok, _Line, _Column, Result} = elixir_tokenizer:tokenize(String, 1, []),
5+
tokenize(String, []).
6+
7+
tokenize(String, Opts) ->
8+
{ok, _Line, _Column, Result} = elixir_tokenizer:tokenize(String, 1, Opts),
69
Result.
710

811
tokenize_error(String) ->
@@ -79,7 +82,9 @@ float_test() ->
7982
{1, "invalid float number ", OversizedFloat} = tokenize_error(OversizedFloat).
8083

8184
comments_test() ->
82-
[{number, {1, 1, 2}, 1}, {eol, {1, 3, 4}}, {number, {2, 1, 2}, 2}] = tokenize("1 # Comment\n2").
85+
[{number, {1, 1, 2}, 1}, {eol, {1, 3, 4}}, {number, {2, 1, 2}, 2}] = tokenize("1 # Comment\n2"),
86+
[{number, {1, 1, 2}, 1}, {comment, {1, 3, 12}, "# Comment"},
87+
{eol, {1, 12, 13}}, {number, {2, 1, 2}, 2}] = tokenize("1 # Comment\n2", [{preserve_comments, true}]).
8388

8489
identifier_test() ->
8590
[{identifier, {1, 1, 4}, abc}] = tokenize("abc "),
@@ -118,13 +123,18 @@ newline_test() ->
118123

119124
dot_newline_operator_test() ->
120125
[{identifier, {1, 1, 4}, foo},
121-
{'.', {2, 4, 5}},
126+
{'.', {1, 4, 5}},
122127
{identifier, {2, 1, 2}, '+'},
123128
{number, {2, 2, 3}, 1}] = tokenize("foo.\n+1"),
124129
[{identifier, {1, 1, 4}, foo},
125-
{'.', {2, 4, 5}},
130+
{'.', {1, 4, 5}},
131+
{identifier, {2, 1, 2}, '+'},
132+
{number, {2, 2, 3}, 1}] = tokenize("foo.#bar\n+1"),
133+
[{identifier, {1, 1, 4}, foo},
134+
{'.', {1, 4, 5}},
135+
{comment, {1, 5, 9}, "#bar"},
126136
{identifier, {2, 1, 2}, '+'},
127-
{number, {2, 2, 3}, 1}] = tokenize("foo.#bar\n+1").
137+
{number, {2, 2, 3}, 1}] = tokenize("foo.#bar\n+1", [{preserve_comments, true}]).
128138

129139
aliases_test() ->
130140
[{'aliases', {1, 1, 4}, ['Foo']}] = tokenize("Foo"),

0 commit comments

Comments
 (0)