Skip to content

Commit 10eff22

Browse files
authored
Merge pull request #4090 from rmosolgo/full-unicode
Support full unicode range in source text
2 parents d85d91a + a70dec1 commit 10eff22

File tree

4 files changed, with 92 additions and 32 deletions.

lib/graphql/language/lexer.rb

Lines changed: 42 additions & 25 deletions
Large diffs are not rendered by default.

lib/graphql/language/lexer.rl

Lines changed: 26 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -39,7 +39,10 @@
3939
BACKSLASH = '\\';
4040
# Could limit to hex here, but “bad unicode escape” on 0XXF is probably a
4141
# more helpful error than “unknown char”
42-
UNICODE_ESCAPE = '\\u' [0-9A-Za-z]{4};
42+
UNICODE_DIGIT = [0-9A-Za-z];
43+
FOUR_DIGIT_UNICODE = UNICODE_DIGIT{4};
44+
N_DIGIT_UNICODE = LCURLY UNICODE_DIGIT{4,} RCURLY;
45+
UNICODE_ESCAPE = '\\u' (FOUR_DIGIT_UNICODE | N_DIGIT_UNICODE);
4346
# https://graphql.github.io/graphql-spec/June2018/#sec-String-Value
4447
STRING_ESCAPE = '\\' [\\/bfnrt];
4548
BLOCK_QUOTE = '"""';
@@ -131,7 +134,25 @@ module GraphQL
131134
# To avoid allocating more strings, this modifies the string passed into it
132135
def self.replace_escaped_characters_in_place(raw_string)
133136
raw_string.gsub!(ESCAPES, ESCAPES_REPLACE)
134-
raw_string.gsub!(UTF_8, &UTF_8_REPLACE)
137+
raw_string.gsub!(UTF_8) do |_matched_str|
138+
codepoint_1 = ($1 || $2).to_i(16)
139+
codepoint_2 = $3
140+
141+
if codepoint_2
142+
codepoint_2 = codepoint_2.to_i(16)
143+
if (codepoint_1 >= 0xD800 && codepoint_1 <= 0xDBFF) && # leading surrogate
144+
(codepoint_2 >= 0xDC00 && codepoint_2 <= 0xDFFF) # trailing surrogate
145+
# A surrogate pair
146+
combined = ((codepoint_1 - 0xD800) * 0x400) + (codepoint_2 - 0xDC00) + 0x10000
147+
[combined].pack('U'.freeze)
148+
else
149+
# Two separate code points
150+
[codepoint_1].pack('U'.freeze) + [codepoint_2].pack('U'.freeze)
151+
end
152+
else
153+
[codepoint_1].pack('U'.freeze)
154+
end
155+
end
135156
nil
136157
end
137158

@@ -203,8 +224,8 @@ module GraphQL
203224
"\\t" => "\t",
204225
}
205226

206-
UTF_8 = /\\u[\dAa-f]{4}/i
207-
UTF_8_REPLACE = ->(m) { [m[-4..-1].to_i(16)].pack('U'.freeze) }
227+
UTF_8 = /\\u(?:([\dAa-f]{4})|\{([\da-f]{4,})\})(?:\\u([\dAa-f]{4}))?/i
228+
208229

209230
VALID_STRING = /\A(?:[^\\]|#{ESCAPES}|#{UTF_8})*\z/o
210231

@@ -219,8 +240,7 @@ module GraphQL
219240
line_incr = value.count("\n")
220241
value = GraphQL::Language::BlockString.trim_whitespace(value)
221242
end
222-
# TODO: replace with `String#match?` when we support only Ruby 2.4+
223-
# (It's faster: https://bugs.ruby-lang.org/issues/8110)
243+
224244
if !value.valid_encoding? || !value.match?(VALID_STRING)
225245
meta[:tokens] << token = GraphQL::Language::Token.new(
226246
:BAD_UNICODE_ESCAPE,

spec/graphql/language/lexer_spec.rb

Lines changed: 17 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -78,6 +78,21 @@
7878

7979
it "unescapes escaped unicode characters" do
8080
assert_equal "\t", subject.tokenize('"\\u0009"').first.to_s
81+
assert_equal "\t", subject.tokenize('"\\u{0009}"').first.to_s
82+
assert_equal "𐘑", subject.tokenize('"\\u{10611}"').first.to_s
83+
assert_equal "💩", subject.tokenize('"\\u{1F4A9}"').first.to_s
84+
assert_equal "💩", subject.tokenize('"\\uD83D\\uDCA9"').first.to_s
85+
end
86+
87+
it "accepts the full range of unicode" do
88+
assert_equal "💩", subject.tokenize('"💩"').first.to_s
89+
assert_equal "⌱", subject.tokenize('"⌱"').first.to_s
90+
assert_equal "🂡\n🂢", subject.tokenize('"""🂡
91+
🂢"""').first.to_s
92+
end
93+
94+
it "doesn't accept unicode outside strings or comments" do
95+
assert_equal :UNKNOWN_CHAR, GraphQL.scan('😘 ').first.name
8196
end
8297

8398
it "rejects bad unicode, even when there's good unicode in the string" do
@@ -92,7 +107,8 @@
92107
it "rejects unicode that's well-formed but results in invalidly-encoded strings" do
93108
# when the string here gets tokenized into an actual `:STRING`, it results in `valid_encoding?` being false for
94109
# the ruby string so application code usually blows up trying to manipulate it
95-
assert_equal :BAD_UNICODE_ESCAPE, subject.tokenize('"\\ud83c\\udf2c"').first.name
110+
assert_equal :BAD_UNICODE_ESCAPE, subject.tokenize('"\\udc00\\udf2c"').first.name
111+
assert_equal :BAD_UNICODE_ESCAPE, subject.tokenize('"\\u{dc00}\\u{df2c}"').first.name
96112
end
97113

98114
it "clears the previous_token between runs" do

spec/graphql/language/parser_spec.rb

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,13 @@
1212
end
1313
end
1414

15+
it "raises an error when unicode is used as names" do
16+
err = assert_raises(GraphQL::ParseError) {
17+
GraphQL.parse('query 😘 { a b }')
18+
}
19+
assert_equal "Parse error on \"\\xF0\" (error) at [1, 7]", err.message
20+
end
21+
1522
describe "anonymous fragment extension" do
1623
let(:document) { GraphQL.parse(query_string) }
1724
let(:query_string) {%|

0 commit comments

Comments
 (0)