Skip to content

Commit 723f31c

Browse files
Earlopainmatzbot
authored andcommitted
[ruby/prism] Fix binary encoding for the parser translator
Skipping detecting the encoding is almost always right, just for binary it should actually happen. A symbol containing escapes that are invalid in utf-8 would fail to parse since symbols must be valid in the script encoding. Additionally, the parser gem would raise an exception somewhere during string handling ruby/prism@fa0154d9e4
1 parent 8e56d9e commit 723f31c

File tree

8 files changed

+119
-3
lines changed

8 files changed

+119
-3
lines changed

lib/prism/translation/parser.rb

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ def parse(source_buffer)
5151
source = source_buffer.source
5252

5353
offset_cache = build_offset_cache(source)
54-
result = unwrap(Prism.parse(source, filepath: source_buffer.name, version: convert_for_prism(version), partial_script: true, encoding: false), offset_cache)
54+
result = unwrap(Prism.parse(source, **prism_options), offset_cache)
5555

5656
build_ast(result.value, offset_cache)
5757
ensure
@@ -64,7 +64,7 @@ def parse_with_comments(source_buffer)
6464
source = source_buffer.source
6565

6666
offset_cache = build_offset_cache(source)
67-
result = unwrap(Prism.parse(source, filepath: source_buffer.name, version: convert_for_prism(version), partial_script: true, encoding: false), offset_cache)
67+
result = unwrap(Prism.parse(source, **prism_options), offset_cache)
6868

6969
[
7070
build_ast(result.value, offset_cache),
@@ -83,7 +83,7 @@ def tokenize(source_buffer, recover = false)
8383
offset_cache = build_offset_cache(source)
8484
result =
8585
begin
86-
unwrap(Prism.parse_lex(source, filepath: source_buffer.name, version: convert_for_prism(version), partial_script: true, encoding: false), offset_cache)
86+
unwrap(Prism.parse_lex(source, **prism_options), offset_cache)
8787
rescue ::Parser::SyntaxError
8888
raise if !recover
8989
end
@@ -285,6 +285,20 @@ def build_range(location, offset_cache)
285285
)
286286
end
287287

288+
# Options for how prism should parse/lex the source.
289+
def prism_options
290+
options = {
291+
filepath: @source_buffer.name,
292+
version: convert_for_prism(version),
293+
partial_script: true,
294+
}
295+
# The parser gem always encodes to UTF-8, unless it is binary.
296+
# https://github.com/whitequark/parser/blob/v3.3.6.0/lib/parser/source/buffer.rb#L80-L107
297+
options[:encoding] = false if @source_buffer.source.encoding != Encoding::BINARY
298+
299+
options
300+
end
301+
288302
# Converts the version format handled by Parser to the format handled by Prism.
289303
def convert_for_prism(version)
290304
case version
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# encoding: binary
2+
3+
"\xcd"
4+
5+
:"\xcd"
6+
7+
/#{"\xcd"}/
8+
9+
%W[\xC0]
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# encoding: euc-jp
2+
3+
# \x8E indicates a double-byte character, \x01 is not a valid second byte in euc-jp
4+
"\x8E\x01"
5+
6+
%W["\x8E\x01"]

test/prism/ruby/parser_test.rb

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,18 @@
1717
# First, opt in to every AST feature.
1818
Parser::Builders::Default.modernize
1919

20+
# The parser gem rejects some strings that would most likely lead to errors
21+
# in consumers due to encoding problems. RuboCop however monkey-patches this
22+
# method out in order to accept such code.
23+
# https://github.com/whitequark/parser/blob/v3.3.6.0/lib/parser/builders/default.rb#L2289-L2295
24+
Parser::Builders::Default.prepend(
25+
Module.new {
26+
def string_value(token)
27+
value(token)
28+
end
29+
}
30+
)
31+
2032
# Modify the source map == check so that it doesn't check against the node
2133
# itself so we don't get into a recursive loop.
2234
Parser::Source::Map.prepend(

test/prism/ruby/ruby_parser_test.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ def ==(other)
2626
module Prism
2727
class RubyParserTest < TestCase
2828
todos = [
29+
"encoding_euc_jp.txt",
2930
"newline_terminated.txt",
3031
"regex_char_width.txt",
3132
"seattlerb/bug169.txt",
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
@ ProgramNode (location: (3,0)-(9,8))
2+
├── flags: ∅
3+
├── locals: []
4+
└── statements:
5+
@ StatementsNode (location: (3,0)-(9,8))
6+
├── flags: ∅
7+
└── body: (length: 4)
8+
├── @ StringNode (location: (3,0)-(3,6))
9+
│ ├── flags: newline
10+
│ ├── opening_loc: (3,0)-(3,1) = "\""
11+
│ ├── content_loc: (3,1)-(3,5) = "\\xcd"
12+
│ ├── closing_loc: (3,5)-(3,6) = "\""
13+
│ └── unescaped: "\xCD"
14+
├── @ SymbolNode (location: (5,0)-(5,7))
15+
│ ├── flags: newline, static_literal
16+
│ ├── opening_loc: (5,0)-(5,2) = ":\""
17+
│ ├── value_loc: (5,2)-(5,6) = "\\xcd"
18+
│ ├── closing_loc: (5,6)-(5,7) = "\""
19+
│ └── unescaped: "\xCD"
20+
├── @ InterpolatedRegularExpressionNode (location: (7,0)-(7,11))
21+
│ ├── flags: newline, static_literal
22+
│ ├── opening_loc: (7,0)-(7,1) = "/"
23+
│ ├── parts: (length: 1)
24+
│ │ └── @ EmbeddedStatementsNode (location: (7,1)-(7,10))
25+
│ │ ├── flags: ∅
26+
│ │ ├── opening_loc: (7,1)-(7,3) = "\#{"
27+
│ │ ├── statements:
28+
│ │ │ @ StatementsNode (location: (7,3)-(7,9))
29+
│ │ │ ├── flags: ∅
30+
│ │ │ └── body: (length: 1)
31+
│ │ │ └── @ StringNode (location: (7,3)-(7,9))
32+
│ │ │ ├── flags: static_literal, frozen
33+
│ │ │ ├── opening_loc: (7,3)-(7,4) = "\""
34+
│ │ │ ├── content_loc: (7,4)-(7,8) = "\\xcd"
35+
│ │ │ ├── closing_loc: (7,8)-(7,9) = "\""
36+
│ │ │ └── unescaped: "\xCD"
37+
│ │ └── closing_loc: (7,9)-(7,10) = "}"
38+
│ └── closing_loc: (7,10)-(7,11) = "/"
39+
└── @ ArrayNode (location: (9,0)-(9,8))
40+
├── flags: newline
41+
├── elements: (length: 1)
42+
│ └── @ StringNode (location: (9,3)-(9,7))
43+
│ ├── flags: ∅
44+
│ ├── opening_loc: ∅
45+
│ ├── content_loc: (9,3)-(9,7) = "\\xC0"
46+
│ ├── closing_loc: ∅
47+
│ └── unescaped: "\xC0"
48+
├── opening_loc: (9,0)-(9,3) = "%W["
49+
└── closing_loc: (9,7)-(9,8) = "]"
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
@ ProgramNode (location: (4,0)-(6,14))
2+
├── flags: ∅
3+
├── locals: []
4+
└── statements:
5+
@ StatementsNode (location: (4,0)-(6,14))
6+
├── flags: ∅
7+
└── body: (length: 2)
8+
├── @ StringNode (location: (4,0)-(4,10))
9+
│ ├── flags: newline
10+
│ ├── opening_loc: (4,0)-(4,1) = "\""
11+
│ ├── content_loc: (4,1)-(4,9) = "\\x8E\\x01"
12+
│ ├── closing_loc: (4,9)-(4,10) = "\""
13+
│ └── unescaped: "\x8E\x01"
14+
└── @ ArrayNode (location: (6,0)-(6,14))
15+
├── flags: newline
16+
├── elements: (length: 1)
17+
│ └── @ StringNode (location: (6,3)-(6,13))
18+
│ ├── flags: ∅
19+
│ ├── opening_loc: ∅
20+
│ ├── content_loc: (6,3)-(6,13) = "\"\\x8E\\x01\""
21+
│ ├── closing_loc: ∅
22+
│ └── unescaped: "\"\x8E\x01\""
23+
├── opening_loc: (6,0)-(6,3) = "%W["
24+
└── closing_loc: (6,13)-(6,14) = "]"

test/prism/snippets_test.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
module Prism
66
class SnippetsTest < TestCase
77
except = [
8+
"encoding_binary.txt",
89
"newline_terminated.txt",
910
"seattlerb/begin_rescue_else_ensure_no_bodies.txt",
1011
"seattlerb/case_in.txt",

0 commit comments

Comments
 (0)