diff --git a/lib/absinthe/lexer.ex b/lib/absinthe/lexer.ex index e16663bf45..7477d022e9 100644 --- a/lib/absinthe/lexer.ex +++ b/lib/absinthe/lexer.ex @@ -155,10 +155,19 @@ defmodule Absinthe.Lexer do ]) |> post_traverse({:labeled_token, [:float_value]}) - # EscapedUnicode :: /[0-9A-Fa-f]{4}/ - escaped_unicode = + # EscapedUnicode (Fixed-width) :: /[0-9A-Fa-f]{4}/ + # Per GraphQL September 2025 spec, this supports BMP characters and surrogate pairs + escaped_unicode_fixed = times(ascii_char([?0..?9, ?A..?F, ?a..?f]), 4) - |> post_traverse({:unescape_unicode, []}) + |> post_traverse({:unescape_unicode_fixed, []}) + + # EscapedUnicode (Variable-width) :: \u{ [0-9A-Fa-f]+ } + # Per GraphQL September 2025 spec, supports full Unicode range U+0000 to U+10FFFF + escaped_unicode_variable = + ignore(ascii_char([?{])) + |> times(ascii_char([?0..?9, ?A..?F, ?a..?f]), min: 1) + |> ignore(ascii_char([?}])) + |> post_traverse({:unescape_unicode_variable, []}) # EscapedCharacter :: one of `"` \ `/` b f n r t escaped_character = @@ -175,11 +184,15 @@ defmodule Absinthe.Lexer do # StringCharacter :: # - SourceCharacter but not `"` or \ or LineTerminator - # - \u EscapedUnicode + # - \u{ EscapedUnicode } (variable-width, September 2025 spec) + # - \u EscapedUnicode (fixed-width, legacy) # - \ EscapedCharacter string_character = choice([ - ignore(string(~S(\u))) |> concat(escaped_unicode), + # Variable-width Unicode escape: \u{XXXXXX} + ignore(string(~S(\u))) |> concat(escaped_unicode_variable), + # Fixed-width Unicode escape: \uXXXX (with surrogate pair support) + ignore(string(~S(\u))) |> concat(escaped_unicode_fixed), ignore(ascii_char([?\\])) |> concat(escaped_character), any_unicode ]) @@ -233,6 +246,7 @@ defmodule Absinthe.Lexer do {:ok, [any()]} | {:error, binary(), {integer(), non_neg_integer()}} | {:error, :exceeded_token_limit} + | {:error, :invalid_unicode_escape, binary(), {integer(), non_neg_integer()}} def tokenize(input, options \\ []) do lines = String.split(input, ~r/\r?\n/) @@ -242,6 +256,12 @@ defmodule Absinthe.Lexer do {:error, @stopped_at_token_limit, _, _, _, _} -> {:error, :exceeded_token_limit} + # Handle Unicode escape validation errors + {:error, message, _rest, _context, {line, line_offset}, byte_offset} + when is_binary(message) -> + byte_column = byte_offset - line_offset + 1 + {:error, :invalid_unicode_escape, message, byte_loc_to_char_loc({line, byte_column}, lines)} + {:ok, tokens, "", _, _, _} -> tokens = convert_token_columns_from_byte_to_char(tokens, lines) {:ok, tokens} @@ -364,11 +384,85 @@ defmodule Absinthe.Lexer do defp fill_mantissa(rest, raw, context, _, _), do: {rest, ~c"0." 
++ raw, context} - defp unescape_unicode(rest, content, context, _loc, _) do + # Unicode scalar value validation per GraphQL September 2025 spec: + # Valid ranges: U+0000 to U+D7FF, U+E000 to U+10FFFF + # Invalid: surrogate code points U+D800 to U+DFFF (except as surrogate pairs in fixed-width) + defp is_unicode_scalar_value?(value) when value >= 0x0000 and value <= 0xD7FF, do: true + defp is_unicode_scalar_value?(value) when value >= 0xE000 and value <= 0x10FFFF, do: true + defp is_unicode_scalar_value?(_), do: false + + # Check if value is a high surrogate (U+D800 to U+DBFF) + defp is_high_surrogate?(value), do: value >= 0xD800 and value <= 0xDBFF + + # Check if value is a low surrogate (U+DC00 to U+DFFF) + defp is_low_surrogate?(value), do: value >= 0xDC00 and value <= 0xDFFF + + # Decode a surrogate pair to a Unicode scalar value + defp decode_surrogate_pair(high, low) do + 0x10000 + ((high - 0xD800) * 0x400) + (low - 0xDC00) + end + + # Variable-width Unicode escape: \u{XXXXXX} + # Must be a valid Unicode scalar value (not a surrogate) + defp unescape_unicode_variable(rest, content, context, _loc, _) do code = content |> Enum.reverse() value = :erlang.list_to_integer(code, 16) - binary = :unicode.characters_to_binary([value]) - {rest, [binary], context} + + if is_unicode_scalar_value?(value) do + binary = :unicode.characters_to_binary([value]) + {rest, [binary], context} + else + {:error, "Invalid Unicode scalar value in escape sequence"} + end + end + + # Fixed-width Unicode escape: \uXXXX + # Handles BMP characters and surrogate pairs for supplementary characters + defp unescape_unicode_fixed(rest, content, context, _loc, _) do + code = content |> Enum.reverse() + value = :erlang.list_to_integer(code, 16) + + cond do + # Valid BMP character (not a surrogate) + is_unicode_scalar_value?(value) -> + binary = :unicode.characters_to_binary([value]) + {rest, [binary], context} + + # High surrogate - check for following low surrogate to form a pair + is_high_surrogate?(value) -> + case rest do + # Look ahead for \uXXXX pattern + <<?\\, ?u, h1, h2, h3, h4, remaining::binary>> + when h1 in ~c"0123456789ABCDEFabcdef" and + h2 in ~c"0123456789ABCDEFabcdef" and + h3 in ~c"0123456789ABCDEFabcdef" and + h4 in ~c"0123456789ABCDEFabcdef" -> + low_code = [h1, h2, h3, h4] + low_value = :erlang.list_to_integer(low_code, 16) + + if is_low_surrogate?(low_value) do + # Valid surrogate pair - decode to scalar value + scalar = decode_surrogate_pair(value, low_value) + binary = :unicode.characters_to_binary([scalar]) + {remaining, [binary], context} + else + # High surrogate not followed by low surrogate + {:error, "Invalid Unicode escape: high surrogate not followed by low surrogate"} + end + + _ -> + # High surrogate without following escape sequence + {:error, "Invalid Unicode escape: lone high surrogate"} + end + + # Lone low surrogate (invalid) + is_low_surrogate?(value) -> + {:error, "Invalid Unicode escape: lone low surrogate"} + + # Out of range + true -> + {:error, "Invalid Unicode scalar value in escape sequence"} + end end @boolean_words ~w( diff --git a/lib/absinthe/phase/parse.ex b/lib/absinthe/phase/parse.ex index a6a58a18b2..1660687022 100644 --- a/lib/absinthe/phase/parse.ex +++ b/lib/absinthe/phase/parse.ex @@ -51,6 +51,9 @@ defmodule Absinthe.Phase.Parse do {:error, :exceeded_token_limit} -> {:error, %Phase.Error{message: "Token limit exceeded", phase: __MODULE__}} + {:error, :invalid_unicode_escape, message, loc} -> + {:error, format_raw_parse_error({:unicode_escape, message, loc})} + other -> other end @@ -113,6 +116,12 @@
defmodule Absinthe.Phase.Parse do %Phase.Error{message: message, locations: [%{line: line, column: column}], phase: __MODULE__} end + @spec format_raw_parse_error({:unicode_escape, String.t(), {line :: pos_integer, column :: pos_integer}}) :: + Phase.Error.t() + defp format_raw_parse_error({:unicode_escape, message, {line, column}}) do + %Phase.Error{message: message, locations: [%{line: line, column: column}], phase: __MODULE__} + end + @unknown_error_msg "An unknown error occurred during parsing" @spec format_raw_parse_error(map) :: Phase.Error.t() defp format_raw_parse_error(%{} = error) do diff --git a/test/absinthe/unicode_test.exs b/test/absinthe/unicode_test.exs new file mode 100644 index 0000000000..5a6b28d50c --- /dev/null +++ b/test/absinthe/unicode_test.exs @@ -0,0 +1,596 @@ +defmodule Absinthe.UnicodeTest do + @moduledoc """ + Tests for GraphQL September 2025 Full Unicode Support (RFCs #805, #1040, #1053, #1142). + + This test module covers: + - Basic Unicode characters in strings + - BMP escape sequences (\\uXXXX) + - Extended/variable-width escape sequences (\\u{XXXXXX}) + - Surrogate pair handling for legacy compatibility + - Emoji and supplementary plane characters + - Unicode validation (rejection of invalid escapes) + - Block strings with Unicode + """ + + use Absinthe.Case, async: true + + alias Absinthe.Lexer + + describe "basic Unicode in strings" do + test "parses ASCII characters" do + assert {:ok, [{:string_value, {1, 1}, ~c"\"hello\""}]} = + Lexer.tokenize(~s("hello")) + end + + test "parses Latin-1 supplement characters" do + # e with acute accent (actual character, not escaped) + assert {:ok, [{:string_value, {1, 1}, value}]} = + Lexer.tokenize(~s("café")) + + assert to_string(value) == "\"café\"" + end + + test "parses Cyrillic characters" do + assert {:ok, [{:string_value, {1, 1}, value}]} = + Lexer.tokenize(~s("Привет")) + + assert to_string(value) == ~s("Привет") + end + + test "parses Chinese characters" do + assert {:ok, [{:string_value, {1, 1}, value}]} = + Lexer.tokenize(~s("你好")) + + assert to_string(value) == ~s("你好") + end + + test "parses Japanese characters" do + assert {:ok, [{:string_value, {1, 1}, value}]} = + Lexer.tokenize(~s("\u3053\u3093\u306B\u3061\u306F")) + + assert to_string(value) == ~s("\u3053\u3093\u306B\u3061\u306F") + end + + test "parses Arabic characters" do + assert {:ok, [{:string_value, {1, 1}, value}]} = + Lexer.tokenize(~s("\u0645\u0631\u062D\u0628\u0627")) + + assert to_string(value) == ~s("\u0645\u0631\u062D\u0628\u0627") + end + + test "parses mixed Unicode scripts" do + assert {:ok, [{:string_value, {1, 1}, value}]} = + Lexer.tokenize(~s("Hello Привет 你好 World")) + + assert to_string(value) == ~s("Hello Привет 你好 World") + end + end + + describe "BMP escape sequences (\\uXXXX)" do + test "parses basic ASCII escape" do + # \u0041 = 'A' + assert {:ok, [{:string_value, {1, 1}, value}]} = + Lexer.tokenize(~S("\u0041")) + + assert to_string(value) == "\"A\"" + end + + test "parses Latin-1 escape" do + # \u00F3 = 'o' with acute accent + assert {:ok, [{:string_value, {1, 1}, value}]} = + Lexer.tokenize(~S("\u00F3")) + + assert to_string(value) == "\"\u00F3\"" + end + + test "parses lowercase hex digits" do + # \u00f3 = 'o' with acute accent (lowercase) + assert {:ok, [{:string_value, {1, 1}, value}]} = + Lexer.tokenize(~S("\u00f3")) + + assert to_string(value) == "\"\u00F3\"" + end + + test "parses mixed case hex digits" do + # \u00Ab = same as \u00AB + assert {:ok, [{:string_value, {1, 1}, value}]} = + Lexer.tokenize(~S("\u00Ab")) + + assert 
to_string(value) == "\"\u00AB\"" + end + + test "parses Cyrillic escape" do + # \u0414 = Cyrillic capital letter De + assert {:ok, [{:string_value, {1, 1}, value}]} = + Lexer.tokenize(~S("\u0414")) + + assert to_string(value) == "\"\u0414\"" + end + + test "parses BMP character at end of range" do + # \uFFFF = last BMP character + assert {:ok, [{:string_value, {1, 1}, value}]} = + Lexer.tokenize(~S("\uFFFF")) + + assert to_string(value) == "\"\uFFFF\"" + end + + test "parses null character" do + # \u0000 = null character + assert {:ok, [{:string_value, {1, 1}, value}]} = + Lexer.tokenize(~S("\u0000")) + + assert to_string(value) == "\"\u0000\"" + end + + test "parses multiple escape sequences" do + # \u0041\u0042\u0043 = "ABC" + assert {:ok, [{:string_value, {1, 1}, value}]} = + Lexer.tokenize(~S("\u0041\u0042\u0043")) + + assert to_string(value) == "\"ABC\"" + end + + test "parses escape mixed with plain text" do + # "Hello \u0041 World" + assert {:ok, [{:string_value, {1, 1}, value}]} = + Lexer.tokenize(~S("Hello \u0041 World")) + + assert to_string(value) == "\"Hello A World\"" + end + end + + describe "extended Unicode escape sequences (\\u{XXXXXX})" do + test "parses single digit hex" do + # \u{41} = 'A' + assert {:ok, [{:string_value, {1, 1}, value}]} = + Lexer.tokenize(~S("\u{41}")) + + assert to_string(value) == "\"A\"" + end + + test "parses two digit hex" do + # \u{F3} = 'o' with acute accent + assert {:ok, [{:string_value, {1, 1}, value}]} = + Lexer.tokenize(~S("\u{F3}")) + + assert to_string(value) == "\"\u00F3\"" + end + + test "parses four digit hex (equivalent to fixed-width)" do + # \u{0041} = 'A' + assert {:ok, [{:string_value, {1, 1}, value}]} = + Lexer.tokenize(~S("\u{0041}")) + + assert to_string(value) == "\"A\"" + end + + test "parses five digit hex (supplementary plane)" do + # \u{1F600} = grinning face emoji + assert {:ok, [{:string_value, {1, 1}, value}]} = + Lexer.tokenize(~S("\u{1F600}")) + + assert to_string(value) == "\"\u{1F600}\"" + end + + test "parses six digit hex (max Unicode)" do + # \u{10FFFF} = last valid Unicode code point + assert {:ok, [{:string_value, {1, 1}, value}]} = + Lexer.tokenize(~S("\u{10FFFF}")) + + assert to_string(value) == "\"\u{10FFFF}\"" + end + + test "parses lowercase hex in variable-width" do + # \u{1f600} = grinning face emoji (lowercase) + assert {:ok, [{:string_value, {1, 1}, value}]} = + Lexer.tokenize(~S("\u{1f600}")) + + assert to_string(value) == "\"\u{1F600}\"" + end + + test "parses mixed case hex in variable-width" do + # \u{1F6aB} = prohibited sign emoji + assert {:ok, [{:string_value, {1, 1}, value}]} = + Lexer.tokenize(~S("\u{1F6aB}")) + + assert to_string(value) == "\"\u{1F6AB}\"" + end + + test "parses poop emoji" do + # \u{1F4A9} = pile of poo + assert {:ok, [{:string_value, {1, 1}, value}]} = + Lexer.tokenize(~S("\u{1F4A9}")) + + assert to_string(value) == "\"\u{1F4A9}\"" + end + + test "parses musical symbol" do + # \u{1D11E} = musical symbol G clef + assert {:ok, [{:string_value, {1, 1}, value}]} = + Lexer.tokenize(~S("\u{1D11E}")) + + assert to_string(value) == "\"\u{1D11E}\"" + end + end + + describe "surrogate pair handling (legacy compatibility)" do + test "parses surrogate pair for poop emoji" do + # \uD83D\uDCA9 = pile of poo (U+1F4A9) via surrogate pair + assert {:ok, [{:string_value, {1, 1}, value}]} = + Lexer.tokenize(~S("\uD83D\uDCA9")) + + # Should produce the same result as \u{1F4A9} + assert to_string(value) == "\"\u{1F4A9}\"" + end + + test "parses surrogate pair for grinning face" do + # 
\uD83D\uDE00 = grinning face (U+1F600) via surrogate pair + assert {:ok, [{:string_value, {1, 1}, value}]} = + Lexer.tokenize(~S("\uD83D\uDE00")) + + assert to_string(value) == "\"\u{1F600}\"" + end + + test "parses surrogate pair for G clef" do + # \uD834\uDD1E = G clef (U+1D11E) via surrogate pair + assert {:ok, [{:string_value, {1, 1}, value}]} = + Lexer.tokenize(~S("\uD834\uDD1E")) + + assert to_string(value) == "\"\u{1D11E}\"" + end + + test "surrogate pair and variable-width produce same result" do + {:ok, [{:string_value, _, surrogate_result}]} = + Lexer.tokenize(~S("\uD83D\uDCA9")) + + {:ok, [{:string_value, _, variable_result}]} = + Lexer.tokenize(~S("\u{1F4A9}")) + + assert surrogate_result == variable_result + end + end + + describe "emoji and supplementary plane characters" do + test "parses direct emoji in string" do + assert {:ok, [{:string_value, {1, 1}, value}]} = + Lexer.tokenize(~s("Hello 👋!")) + + assert to_string(value) == ~s("Hello 👋!") + end + + test "parses multiple emojis" do + assert {:ok, [{:string_value, {1, 1}, value}]} = + Lexer.tokenize(~s("😀🎉🚀")) + + assert to_string(value) == ~s("😀🎉🚀") + end + + test "parses emoji with skin tone modifier" do + # Thumbs up with light skin tone + assert {:ok, [{:string_value, {1, 1}, value}]} = + Lexer.tokenize(~s("👍🏻")) + + assert to_string(value) == ~s("👍🏻") + end + + test "parses flag emoji (regional indicator symbols)" do + # US flag (two regional indicators) + assert {:ok, [{:string_value, {1, 1}, value}]} = + Lexer.tokenize(~s("🇺🇸")) + + assert to_string(value) == ~s("🇺🇸") + end + + test "parses ancient script characters" do + # Egyptian hieroglyph A001 + assert {:ok, [{:string_value, {1, 1}, value}]} = + Lexer.tokenize(~S("\u{13000}")) + + assert to_string(value) == "\"\u{13000}\"" + end + + test "parses mathematical symbols" do + # Mathematical bold capital A + assert {:ok, [{:string_value, {1, 1}, value}]} = + Lexer.tokenize(~S("\u{1D400}")) + + assert to_string(value) == "\"\u{1D400}\"" + end + end + + describe "invalid Unicode escape rejection" do + test "rejects lone high surrogate" do + # \uD800 alone is invalid + assert {:error, :invalid_unicode_escape, message, _loc} = Lexer.tokenize(~S("\uD800")) + assert message =~ "surrogate" + end + + test "rejects lone low surrogate" do + # \uDC00 alone is invalid + assert {:error, :invalid_unicode_escape, message, _loc} = Lexer.tokenize(~S("\uDC00")) + assert message =~ "surrogate" + end + + test "rejects high surrogate at end of string" do + # High surrogate at end with no pair + assert {:error, :invalid_unicode_escape, message, _loc} = Lexer.tokenize(~S("\uD83D")) + assert message =~ "surrogate" + end + + test "rejects high surrogate not followed by low surrogate" do + # High surrogate followed by non-surrogate + assert {:error, :invalid_unicode_escape, message, _loc} = Lexer.tokenize(~S("\uD83D\u0041")) + assert message =~ "surrogate" + end + + test "rejects surrogate in variable-width escape" do + # \u{D800} - surrogate in variable-width form + assert {:error, :invalid_unicode_escape, message, _loc} = Lexer.tokenize(~S("\u{D800}")) + assert message =~ "Invalid Unicode scalar value" + end + + test "rejects out of range variable-width escape" do + # \u{110000} - beyond Unicode range + assert {:error, :invalid_unicode_escape, message, _loc} = Lexer.tokenize(~S("\u{110000}")) + assert message =~ "Invalid Unicode scalar value" + end + + test "rejects very large value" do + # \u{FFFFFF} - way beyond Unicode range + assert {:error, :invalid_unicode_escape, message, _loc} = 
Lexer.tokenize(~S("\u{FFFFFF}")) + assert message =~ "Invalid Unicode scalar value" + end + end + + describe "block strings with Unicode" do + test "parses block string with Unicode" do + query = ~s("""Hello 你好 World""") + + assert {:ok, [{:block_string_value, {1, 1}, value}]} = + Lexer.tokenize(query) + + assert to_string(value) == ~s("""Hello 你好 World""") + end + + test "parses block string with emoji" do + query = ~s("""Hello 😀! World""") + + assert {:ok, [{:block_string_value, {1, 1}, value}]} = + Lexer.tokenize(query) + + assert to_string(value) == ~s("""Hello 😀! World""") + end + + test "parses multiline block string with Unicode" do + query = """ + \"\"\" + Line 1: Hello + Line 2: Привет + Line 3: World + \"\"\" + """ + + assert {:ok, [{:block_string_value, {1, 1}, _value}]} = + Lexer.tokenize(query) + end + + test "block strings preserve raw Unicode (no escape processing)" do + # Block strings should NOT process \uXXXX escapes + query = ~S("""\u0041""") + + assert {:ok, [{:block_string_value, {1, 1}, value}]} = + Lexer.tokenize(query) + + # The escape sequence should remain as-is + assert to_string(value) == ~S("""\u0041""") + end + end + + describe "integration with parser" do + defp run(input) do + with {:ok, %{input: input}} <- Absinthe.Phase.Parse.run(input) do + {:ok, input} + end + end + + defp get_string_value(result) do + path = [ + Access.key!(:definitions), + Access.at(0), + Access.key!(:selection_set), + Access.key!(:selections), + Access.at(0), + Access.key!(:arguments), + Access.at(0), + Access.key!(:value), + Access.key!(:value) + ] + + get_in(result, path) + end + + test "parses query with BMP Unicode escape" do + query = ~S""" + query { + user(name: "\u00F3") + } + """ + + assert {:ok, result} = run(query) + assert get_string_value(result) == "\u00F3" + end + + test "parses query with extended Unicode escape" do + query = ~S""" + query { + user(name: "\u{1F600}") + } + """ + + assert {:ok, result} = run(query) + assert get_string_value(result) == "\u{1F600}" + end + + test "parses query with surrogate pair" do + query = ~S""" + query { + user(name: "\uD83D\uDE00") + } + """ + + assert {:ok, result} = run(query) + assert get_string_value(result) == "\u{1F600}" + end + + test "parses query with direct emoji" do + query = """ + query { + user(name: "Hello 👋!") + } + """ + + assert {:ok, result} = run(query) + assert get_string_value(result) == "Hello 👋!" + end + + test "parses query with mixed escape styles" do + query = ~S""" + query { + user(name: "\u0041\u{42}C") + } + """ + + assert {:ok, result} = run(query) + assert get_string_value(result) == "ABC" + end + + test "rejects query with invalid Unicode escape" do + query = ~S""" + query { + user(name: "\uD800") + } + """ + + assert {:error, _} = run(query) + end + end + + describe "Unicode in field names (spec compliance check)" do + # GraphQL spec: Names must match /[_A-Za-z][_0-9A-Za-z]*/ + # Unicode is NOT allowed in names per spec + # Note: The lexer doesn't reject Unicode in positions where names are expected, + # but it won't parse them as names. This is handled at the parser level. 
+ + test "Unicode outside strings is ignored by lexer" do + # The lexer encounters Unicode characters outside of strings + # They are treated as ignored/whitespace since they don't match token patterns + # This means bare Unicode characters between valid tokens are skipped + query = "{ }" + + # The lexer ignores unknown characters and successfully parses the braces + assert {:ok, tokens} = Lexer.tokenize(query) + # Only the braces are parsed; Unicode is ignored as whitespace + assert [{:"{", _}, {:"}", _}] = tokens + end + + test "allows valid ASCII names" do + query = "{ valid_Name123 }" + + assert {:ok, + [ + {:"{", _}, + {:name, _, ~c"valid_Name123"}, + {:"}", _} + ]} = Lexer.tokenize(query) + end + + test "names starting with underscore are valid" do + query = "{ _privateName }" + + assert {:ok, + [ + {:"{", _}, + {:name, _, ~c"_privateName"}, + {:"}", _} + ]} = Lexer.tokenize(query) + end + end + + describe "edge cases" do + test "empty string" do + assert {:ok, [{:string_value, {1, 1}, ~c"\"\""}]} = + Lexer.tokenize(~s("")) + end + + test "string with only escape sequence" do + assert {:ok, [{:string_value, {1, 1}, value}]} = + Lexer.tokenize(~S("\u0041")) + + assert to_string(value) == "\"A\"" + end + + test "consecutive escape sequences" do + assert {:ok, [{:string_value, {1, 1}, value}]} = + Lexer.tokenize(~S("\u0041\u{42}")) + + assert to_string(value) == "\"AB\"" + end + + test "escape sequence at string boundaries" do + # Escape at start + assert {:ok, [{:string_value, {1, 1}, value}]} = + Lexer.tokenize(~S("\u0041bc")) + + assert to_string(value) == "\"Abc\"" + + # Escape at end + assert {:ok, [{:string_value, {1, 1}, value}]} = + Lexer.tokenize(~S("ab\u0043")) + + assert to_string(value) == "\"abC\"" + end + + test "long string with many escape sequences" do + # Create string with 100 escape sequences + escapes = String.duplicate(~S(\u0041), 100) + query = ~s("#{escapes}") + + assert {:ok, [{:string_value, {1, 1}, value}]} = Lexer.tokenize(query) + assert to_string(value) == "\"" <> String.duplicate("A", 100) <> "\"" + end + + test "control characters via escape" do + # Null, bell, backspace, tab, newline, etc. + assert {:ok, [{:string_value, {1, 1}, value}]} = + Lexer.tokenize(~S("\u0000\u0007\u0008\t\n")) + + # Contains control characters + assert is_list(value) + end + + test "zero-width characters" do + # Zero-width space (U+200B) + assert {:ok, [{:string_value, {1, 1}, value}]} = + Lexer.tokenize(~S("\u200B")) + + assert to_string(value) == "\"\u200B\"" + end + + test "right-to-left mark" do + # Right-to-left mark (U+200F) + assert {:ok, [{:string_value, {1, 1}, value}]} = + Lexer.tokenize(~S("\u200F")) + + assert to_string(value) == "\"\u200F\"" + end + + test "byte order mark" do + # BOM (U+FEFF) + assert {:ok, [{:string_value, {1, 1}, value}]} = + Lexer.tokenize(~S("\uFEFF")) + + assert to_string(value) == "\"\uFEFF\"" + end + end +end