Merge pull request #2675 from ruby/lexer-encoding

soutaro · soutaro · commit 79586f26ee19 · 2025-12-17T11:04:58.000+09:00
Better encoding
diff --git a/docs/encoding.md b/docs/encoding.md
@@ -0,0 +1,56 @@
+# RBS File Encoding
+
+## Best Practice
+
+**Use UTF-8** for both file encoding and your system locale.
+
+## Supported Encodings
+
+The RBS parser supports ASCII-compatible encodings (similar to Ruby's script encoding support).
+
+**Examples**: UTF-8, US-ASCII, Shift JIS, EUC-JP, ...
+
+## Unicode Codepoint Symbols
+
+String literal types in RBS can contain Unicode codepoint escape sequences (`\uXXXX`).
+
+When the file encoding is UTF-8, the parser translates these escape sequences into the corresponding Unicode characters:
+
+```rbs
+# In UTF-8 encoded files
+
+type t = "\u0123"  # Translated to the actual Unicode character ģ
+type s = "\u3042"  # Translated to the actual Unicode character あ
+```
+
+When the file encoding is not UTF-8, Unicode escape sequences are interpreted literally as the string `\uXXXX`:
+
+```rbs
+# In non-UTF-8 encoded files
+
+type t = "\u0123"  # Remains as the literal string "\u0123"
+```
+
+## Implementation
+
+The RBS gem currently does not handle file encoding itself. It relies on Ruby's encoding handling, specifically `Encoding.default_external` and `Encoding.default_internal`.
+
+`Encoding.default_external` is the encoding Ruby assumes when it reads external resources like files. The Ruby interpreter sets it based on the locale. `Encoding.default_internal` is the encoding Ruby converts external resources to. The default is `nil` (no conversion).
+
+When your locale is set to use `UTF-8` encoding, `default_external` is `Encoding::UTF_8`. So the RBS file content read from the disk will have UTF-8 encoding.
+
+### Parsing non UTF-8 RBS source text
+
+If you want to work with another encoding, ensure the source string has an ASCII-compatible encoding.
+
+```ruby
+source = '"日本語"'
+RBS::Parser.parse_type(source.encode(Encoding::EUC_JP))  # => Parses successfully
+RBS::Parser.parse_type(source.encode(Encoding::UTF_32))  # => Returns `nil` since UTF-32 is not ASCII-compatible
+```
+
+### Specifying file encoding
+
+Currently, RBS doesn't support specifying file encoding directly.
+
+You can set `Encoding.default_external` to the desired encoding while the gem loads RBS files from disk.
diff --git a/include/rbs/string.h b/include/rbs/string.h
@@ -44,6 +44,4 @@ size_t rbs_string_len(const rbs_string_t self);
  */
 bool rbs_string_equal(const rbs_string_t lhs, const rbs_string_t rhs);
 
-unsigned int rbs_utf8_string_to_codepoint(const rbs_string_t string);
-
 #endif
diff --git a/include/rbs/util/rbs_unescape.h b/include/rbs/util/rbs_unescape.h
@@ -4,6 +4,7 @@
 #include <stddef.h>
 #include "rbs/util/rbs_allocator.h"
 #include "rbs/string.h"
+#include "rbs/util/rbs_encoding.h"
 
 /**
  * Receives `rbs_parser_t` and `range`, which represents a string token or symbol token, and returns a string VALUE.
@@ -18,6 +19,6 @@
  *
  * @returns A new owned string that will be freed when the allocator is freed.
  * */
-rbs_string_t rbs_unquote_string(rbs_allocator_t *, const rbs_string_t input);
+rbs_string_t rbs_unquote_string(rbs_allocator_t *, const rbs_string_t input, const rbs_encoding_t *encoding);
 
 #endif // RBS_RBS_UNESCAPE_H
diff --git a/src/parser.c b/src/parser.c
@@ -317,7 +317,7 @@ static bool parse_function_param(rbs_parser_t *parser, rbs_types_function_param_
             return false;
         }
 
-        rbs_string_t unquoted_str = rbs_unquote_string(ALLOCATOR(), rbs_parser_peek_current_token(parser));
+        rbs_string_t unquoted_str = rbs_unquote_string(ALLOCATOR(), rbs_parser_peek_current_token(parser), parser->rbs_lexer_t->encoding);
         rbs_location_t *symbolLoc = rbs_location_current_token(parser);
         rbs_constant_id_t constant_id = rbs_constant_pool_insert_string(&parser->constant_pool, unquoted_str);
         rbs_ast_symbol_t *name = rbs_ast_symbol_new(ALLOCATOR(), symbolLoc, &parser->constant_pool, constant_id);
@@ -927,7 +927,7 @@ static bool parse_symbol(rbs_parser_t *parser, rbs_location_t *location, rbs_typ
 
         rbs_string_t symbol = rbs_string_new(current_token.start + offset_bytes, current_token.end);
 
-        rbs_string_t unquoted_symbol = rbs_unquote_string(ALLOCATOR(), symbol);
+        rbs_string_t unquoted_symbol = rbs_unquote_string(ALLOCATOR(), symbol, parser->rbs_lexer_t->encoding);
 
         rbs_constant_id_t constant_id = rbs_constant_pool_insert_string(&parser->constant_pool, unquoted_symbol);
 
@@ -1157,7 +1157,7 @@ static bool parse_simple(rbs_parser_t *parser, rbs_node_t **type) {
     case tDQSTRING: {
         rbs_location_t *loc = rbs_location_current_token(parser);
 
-        rbs_string_t unquoted_str = rbs_unquote_string(ALLOCATOR(), rbs_parser_peek_current_token(parser));
+        rbs_string_t unquoted_str = rbs_unquote_string(ALLOCATOR(), rbs_parser_peek_current_token(parser), parser->rbs_lexer_t->encoding);
         rbs_node_t *literal = (rbs_node_t *) rbs_ast_string_new(ALLOCATOR(), loc, unquoted_str);
         *type = (rbs_node_t *) rbs_types_literal_new(ALLOCATOR(), loc, literal);
         return true;
@@ -1605,7 +1605,9 @@ static bool parse_annotation(rbs_parser_t *parser, rbs_ast_annotation_t **annota
         parser->rbs_lexer_t->string.start + rg.start.byte_pos + offset_bytes,
         parser->rbs_lexer_t->string.end
     );
-    unsigned int open_char = rbs_utf8_string_to_codepoint(str);
+
+    // Assumes the input is ASCII compatible
+    unsigned int open_char = str.start[0];
 
     unsigned int close_char;
 
@@ -1718,7 +1720,7 @@ static bool parse_method_name(rbs_parser_t *parser, rbs_range_t *range, rbs_ast_
     }
     case tQIDENT: {
         rbs_string_t string = rbs_parser_peek_current_token(parser);
-        rbs_string_t unquoted_str = rbs_unquote_string(ALLOCATOR(), string);
+        rbs_string_t unquoted_str = rbs_unquote_string(ALLOCATOR(), string, parser->rbs_lexer_t->encoding);
         rbs_constant_id_t constant_id = rbs_constant_pool_insert_string(&parser->constant_pool, unquoted_str);
         rbs_location_t *symbolLoc = rbs_location_current_token(parser);
         *symbol = rbs_ast_symbol_new(ALLOCATOR(), symbolLoc, &parser->constant_pool, constant_id);
@@ -3116,7 +3118,9 @@ static rbs_ast_comment_t *parse_comment_lines(rbs_parser_t *parser, rbs_comment_
             comment_start,
             parser->rbs_lexer_t->string.end
         );
-        unsigned char c = rbs_utf8_string_to_codepoint(str);
+
+        // Assumes the input is ASCII compatible
+        unsigned char c = str.start[0];
 
         if (c == ' ') {
             comment_start += space_bytes;
diff --git a/src/string.c b/src/string.c
@@ -1,59 +1,10 @@
 #include "rbs/string.h"
-#include "rbs/defines.h"
 
 #include <stdlib.h>
 #include <string.h>
 #include <stdio.h>
 #include <ctype.h>
 
-unsigned int rbs_utf8_string_to_codepoint(const rbs_string_t string) {
-    unsigned int codepoint = 0;
-    int remaining_bytes = 0;
-
-    const char *s = string.start;
-    const char *end = string.end;
-
-    if (s >= end) return 0; // End of string
-
-    if (RBS_LIKELY((*s & 0x80) == 0)) {
-        // Single byte character (0xxxxxxx)
-        return *s;
-    } else if ((*s & 0xE0) == 0xC0) {
-        // Two byte character (110xxxxx 10xxxxxx)
-        codepoint = *s & 0x1F;
-        remaining_bytes = 1;
-    } else if ((*s & 0xF0) == 0xE0) {
-        // Three byte character (1110xxxx 10xxxxxx 10xxxxxx)
-        codepoint = *s & 0x0F;
-        remaining_bytes = 2;
-    } else if ((*s & 0xF8) == 0xF0) {
-        // Four byte character (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
-        codepoint = *s & 0x07;
-        remaining_bytes = 3;
-    } else {
-        // Invalid UTF-8 sequence
-        return 0xFFFD; // Unicode replacement character
-    }
-
-    s++;
-    while (remaining_bytes > 0 && s < end) {
-        if ((*s & 0xC0) != 0x80) {
-            // Invalid continuation byte
-            return 0xFFFD;
-        }
-        codepoint = (codepoint << 6) | (*s & 0x3F);
-        s++;
-        remaining_bytes--;
-    }
-
-    if (remaining_bytes > 0) {
-        // Incomplete sequence
-        return 0xFFFD;
-    }
-
-    return codepoint;
-}
-
 rbs_string_t rbs_string_new(const char *start, const char *end) {
     return (rbs_string_t) {
         .start = start,
diff --git a/src/util/rbs_unescape.c b/src/util/rbs_unescape.c
@@ -1,4 +1,5 @@
 #include "rbs/util/rbs_unescape.h"
+#include "rbs/util/rbs_encoding.h"
 #include <string.h>
 #include <stdlib.h>
 #include <ctype.h>
@@ -42,20 +43,44 @@ static int octal_to_int(const char *octal, int length) {
     return result;
 }
 
-int rbs_utf8_codelen(unsigned int c) {
-    if (c <= 0x7F) return 1;
-    if (c <= 0x7FF) return 2;
-    if (c <= 0xFFFF) return 3;
-    if (c <= 0x10FFFF) return 4;
-    return 1; // Invalid Unicode codepoint, treat as 1 byte
+// Fills buf starting at index 'start' with the UTF-8 encoding of 'codepoint'; 'end' is the exclusive upper bound of writable indices.
+// Returns the number of bytes written (1 to 4), or 0 when the output is not changed.
+// The output is not changed when fewer than 4 bytes of room remain (even if the encoding would be shorter) or when codepoint > 0x10FFFF.
+size_t rbs_utf8_fill_codepoint(char *buf, size_t start, size_t end, unsigned int codepoint) {
+    if (start + 4 > end) { // conservatively require room for the longest (4-byte) encoding
+        return 0;
+    }
+
+    if (codepoint <= 0x7F) { // 1 byte: 0xxxxxxx
+        buf[start] = codepoint & 0x7F;
+        return 1;
+    } else if (codepoint <= 0x7FF) { // 2 bytes: 110xxxxx 10xxxxxx
+        buf[start + 0] = 0xC0 | ((codepoint >> 6) & 0x1F);
+        buf[start + 1] = 0x80 | (codepoint & 0x3F);
+        return 2;
+    } else if (codepoint <= 0xFFFF) { // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx (surrogates U+D800..U+DFFF are not rejected here)
+        buf[start + 0] = 0xE0 | ((codepoint >> 12) & 0x0F);
+        buf[start + 1] = 0x80 | ((codepoint >> 6) & 0x3F);
+        buf[start + 2] = 0x80 | (codepoint & 0x3F);
+        return 3;
+    } else if (codepoint <= 0x10FFFF) { // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+        buf[start + 0] = 0xF0 | ((codepoint >> 18) & 0x07);
+        buf[start + 1] = 0x80 | ((codepoint >> 12) & 0x3F);
+        buf[start + 2] = 0x80 | ((codepoint >> 6) & 0x3F);
+        buf[start + 3] = 0x80 | (codepoint & 0x3F);
+        return 4;
+    } else { // beyond the Unicode range: nothing is written
+        return 0;
+    }
 }
 
-rbs_string_t unescape_string(rbs_allocator_t *allocator, const rbs_string_t string, bool is_double_quote) {
+rbs_string_t unescape_string(rbs_allocator_t *allocator, const rbs_string_t string, bool is_double_quote, bool is_unicode) {
     if (!string.start) return RBS_STRING_NULL;
 
     size_t len = string.end - string.start;
     const char *input = string.start;
 
+    // The output cannot be longer than the input even after unescaping.
     char *output = rbs_allocator_alloc_many(allocator, len + 1, char);
     if (!output) return RBS_STRING_NULL;
 
@@ -79,9 +104,21 @@ rbs_string_t unescape_string(rbs_allocator_t *allocator, const rbs_string_t stri
                     i += hex_len + 2;
                 } else if (input[i + 1] == 'u' && i + 5 < len) {
                     // Unicode escape
-                    int value = hex_to_int(input + i + 2, 4);
-                    output[j++] = (char) value;
-                    i += 6;
+
+                    if (is_unicode) {
+                        // The UTF-8 representation is at most 4 bytes, shorter than the input length.
+                        int value = hex_to_int(input + i + 2, 4);
+                        j += rbs_utf8_fill_codepoint(output, j, len + 1, value);
+                        i += 6;
+                    } else {
+                        // Copy the escape sequence as-is
+                        output[j++] = input[i++];
+                        output[j++] = input[i++];
+                        output[j++] = input[i++];
+                        output[j++] = input[i++];
+                        output[j++] = input[i++];
+                        output[j++] = input[i++];
+                    }
                 } else {
                     // Other escapes
                     int found = 0;
@@ -114,18 +151,17 @@ rbs_string_t unescape_string(rbs_allocator_t *allocator, const rbs_string_t stri
     return rbs_string_new(output, output + j);
 }
 
-rbs_string_t rbs_unquote_string(rbs_allocator_t *allocator, rbs_string_t input) {
-    unsigned int first_char = rbs_utf8_string_to_codepoint(input);
-    size_t byte_length = rbs_string_len(input);
+rbs_string_t rbs_unquote_string(rbs_allocator_t *allocator, rbs_string_t input, const rbs_encoding_t *encoding) { // Strips a surrounding quote byte (", ' or `) if present, then unescapes the contents.
+    unsigned int first_char = input.start[0]; // NOTE(review): first byte is read without an emptiness check (the removed helper returned 0 for an empty string) -- confirm callers never pass an empty token
+
+    const char *new_start = input.start;
+    const char *new_end = input.end;
 
-    ptrdiff_t start_offset = 0;
     if (first_char == '"' || first_char == '\'' || first_char == '`') {
-        int bs = rbs_utf8_codelen(first_char);
-        start_offset += bs;
-        byte_length -= 2 * bs;
+        new_start += 1; // drop the opening quote byte
+        new_end -= 1; // drop one byte at the end; presumably the matching closing quote -- it is not checked here
     }
 
-    const char *new_start = input.start + start_offset;
-    rbs_string_t string = rbs_string_new(new_start, new_start + byte_length);
-    return unescape_string(allocator, string, first_char == '"');
+    rbs_string_t string = rbs_string_new(new_start, new_end);
+    return unescape_string(allocator, string, first_char == '"', encoding == RBS_ENCODING_UTF_8_ENTRY); // double-quote mode only for "..." input; \uXXXX is expanded only when the encoding is the UTF-8 entry
 }
diff --git a/test/rbs/type_parsing_test.rb b/test/rbs/type_parsing_test.rb
@@ -883,4 +883,40 @@ def test_escape_sequences
       assert_equal "\x00", type.types[2].literal
     end
   end
+
+  def test_parse__string_octal_escape
+    Parser.parse_type('"\100"').yield_self do |type|
+      assert_equal "\100", type.literal
+    end
+    Parser.parse_type('"\400"').yield_self do |type|
+      assert_equal "\400", type.literal
+    end
+  end
+
+  def test_parse__string_hex_escape
+    Parser.parse_type('"\x10"').yield_self do |type|
+      assert_equal "\x10", type.literal
+    end
+    Parser.parse_type('"\x40"').yield_self do |type|
+      assert_equal "\x40", type.literal
+    end
+  end
+
+  def test_parse__string_unicode_escape
+    Parser.parse_type('"\u005a"').yield_self do |type|
+      assert_equal "Z", type.literal
+    end
+    Parser.parse_type('"[\u30eb]"').yield_self do |type|
+      assert_equal "[ル]", type.literal
+    end
+  end
+
+  def test_parse__string_unicode_escape__non_unicode
+    Parser.parse_type('"\u005a"'.encode(Encoding::ASCII)).yield_self do |type|
+      assert_equal "\\u005a", type.literal
+    end
+    Parser.parse_type('"[\u30eb]"'.encode(Encoding::Shift_JIS)).yield_self do |type|
+      assert_equal "[\\u30eb]", type.literal
+    end
+  end
 end