Skip to content

Commit 79586f2

Browse files
committed
Merge pull request #2675 from ruby/lexer-encoding
Better encoding
1 parent 7841784 commit 79586f2

File tree

7 files changed

+160
-78
lines changed

7 files changed

+160
-78
lines changed

docs/encoding.md

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
# RBS File Encoding
2+
3+
## Best Practice
4+
5+
**Use UTF-8** for both file encoding and your system locale.
6+
7+
## Supported Encodings
8+
9+
The RBS parser supports ASCII-compatible encodings (similar to Ruby's script encoding support).
10+
11+
**Examples**: UTF-8, US-ASCII, Shift JIS, EUC-JP, ...
12+
13+
## Unicode Codepoint Symbols
14+
15+
String literal types in RBS can contain Unicode codepoint escape sequences (`\uXXXX`).
16+
17+
When the file encoding is UTF-8, the parser translates Unicode escape sequences into the characters they denote:
18+
19+
```rbs
20+
# In UTF-8 encoded files
21+
22+
type t = "\u0123" # Translated to the actual Unicode character ģ
23+
type s = "\u3042" # Translated to the actual Unicode character あ
24+
```
25+
26+
When the file encoding is not UTF-8, Unicode escape sequences are interpreted literally as the string `\uXXXX`:
27+
28+
```rbs
29+
# In non-UTF-8 encoded files
30+
31+
type t = "\u0123" # Remains as the literal string "\u0123"
32+
```
33+
34+
## Implementation
35+
36+
The RBS gem currently does no encoding handling of its own. It relies on Ruby's encoding handling, specifically `Encoding.default_external` and `Encoding.default_internal`.
37+
38+
`Encoding.default_external` is the encoding Ruby assumes when it reads external resources such as files. The Ruby interpreter sets it based on the locale. `Encoding.default_internal` is the encoding Ruby converts external resources to after reading them. Its default is `nil` (no conversion).
39+
40+
When your locale is set to use `UTF-8` encoding, `default_external` is `Encoding::UTF_8`, so the RBS file content read from disk will have UTF-8 encoding.
41+
42+
### Parsing non-UTF-8 RBS source text
43+
44+
If you want to work with another encoding, ensure the source string has an ASCII-compatible encoding.
45+
46+
```ruby
47+
source = '"日本語"'
48+
RBS::Parser.parse_type(source.encode(Encoding::EUC_JP)) # => Parses successfully
49+
RBS::Parser.parse_type(source.encode(Encoding::UTF_32)) # => Returns `nil` since UTF-32 is not ASCII compatible
50+
```
51+
52+
### Specifying file encoding
53+
54+
Currently, RBS doesn't support specifying file encoding directly.
55+
56+
As a workaround, you can set `Encoding.default_external` to the desired encoding before the gem loads RBS files from disk.

include/rbs/string.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,4 @@ size_t rbs_string_len(const rbs_string_t self);
4444
*/
4545
bool rbs_string_equal(const rbs_string_t lhs, const rbs_string_t rhs);
4646

47-
unsigned int rbs_utf8_string_to_codepoint(const rbs_string_t string);
48-
4947
#endif

include/rbs/util/rbs_unescape.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#include <stddef.h>
55
#include "rbs/util/rbs_allocator.h"
66
#include "rbs/string.h"
7+
#include "rbs/util/rbs_encoding.h"
78

89
/**
910
* Receives `rbs_parser_t` and `range`, which represents a string token or symbol token, and returns a string VALUE.
@@ -18,6 +19,6 @@
1819
*
1920
* @returns A new owned string that will be freed when the allocator is freed.
2021
* */
21-
rbs_string_t rbs_unquote_string(rbs_allocator_t *, const rbs_string_t input);
22+
rbs_string_t rbs_unquote_string(rbs_allocator_t *, const rbs_string_t input, const rbs_encoding_t *encoding);
2223

2324
#endif // RBS_RBS_UNESCAPE_H

src/parser.c

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -317,7 +317,7 @@ static bool parse_function_param(rbs_parser_t *parser, rbs_types_function_param_
317317
return false;
318318
}
319319

320-
rbs_string_t unquoted_str = rbs_unquote_string(ALLOCATOR(), rbs_parser_peek_current_token(parser));
320+
rbs_string_t unquoted_str = rbs_unquote_string(ALLOCATOR(), rbs_parser_peek_current_token(parser), parser->rbs_lexer_t->encoding);
321321
rbs_location_t *symbolLoc = rbs_location_current_token(parser);
322322
rbs_constant_id_t constant_id = rbs_constant_pool_insert_string(&parser->constant_pool, unquoted_str);
323323
rbs_ast_symbol_t *name = rbs_ast_symbol_new(ALLOCATOR(), symbolLoc, &parser->constant_pool, constant_id);
@@ -927,7 +927,7 @@ static bool parse_symbol(rbs_parser_t *parser, rbs_location_t *location, rbs_typ
927927

928928
rbs_string_t symbol = rbs_string_new(current_token.start + offset_bytes, current_token.end);
929929

930-
rbs_string_t unquoted_symbol = rbs_unquote_string(ALLOCATOR(), symbol);
930+
rbs_string_t unquoted_symbol = rbs_unquote_string(ALLOCATOR(), symbol, parser->rbs_lexer_t->encoding);
931931

932932
rbs_constant_id_t constant_id = rbs_constant_pool_insert_string(&parser->constant_pool, unquoted_symbol);
933933

@@ -1157,7 +1157,7 @@ static bool parse_simple(rbs_parser_t *parser, rbs_node_t **type) {
11571157
case tDQSTRING: {
11581158
rbs_location_t *loc = rbs_location_current_token(parser);
11591159

1160-
rbs_string_t unquoted_str = rbs_unquote_string(ALLOCATOR(), rbs_parser_peek_current_token(parser));
1160+
rbs_string_t unquoted_str = rbs_unquote_string(ALLOCATOR(), rbs_parser_peek_current_token(parser), parser->rbs_lexer_t->encoding);
11611161
rbs_node_t *literal = (rbs_node_t *) rbs_ast_string_new(ALLOCATOR(), loc, unquoted_str);
11621162
*type = (rbs_node_t *) rbs_types_literal_new(ALLOCATOR(), loc, literal);
11631163
return true;
@@ -1605,7 +1605,9 @@ static bool parse_annotation(rbs_parser_t *parser, rbs_ast_annotation_t **annota
16051605
parser->rbs_lexer_t->string.start + rg.start.byte_pos + offset_bytes,
16061606
parser->rbs_lexer_t->string.end
16071607
);
1608-
unsigned int open_char = rbs_utf8_string_to_codepoint(str);
1608+
1609+
// Assumes the input is ASCII compatible
1610+
unsigned int open_char = str.start[0];
16091611

16101612
unsigned int close_char;
16111613

@@ -1718,7 +1720,7 @@ static bool parse_method_name(rbs_parser_t *parser, rbs_range_t *range, rbs_ast_
17181720
}
17191721
case tQIDENT: {
17201722
rbs_string_t string = rbs_parser_peek_current_token(parser);
1721-
rbs_string_t unquoted_str = rbs_unquote_string(ALLOCATOR(), string);
1723+
rbs_string_t unquoted_str = rbs_unquote_string(ALLOCATOR(), string, parser->rbs_lexer_t->encoding);
17221724
rbs_constant_id_t constant_id = rbs_constant_pool_insert_string(&parser->constant_pool, unquoted_str);
17231725
rbs_location_t *symbolLoc = rbs_location_current_token(parser);
17241726
*symbol = rbs_ast_symbol_new(ALLOCATOR(), symbolLoc, &parser->constant_pool, constant_id);
@@ -3116,7 +3118,9 @@ static rbs_ast_comment_t *parse_comment_lines(rbs_parser_t *parser, rbs_comment_
31163118
comment_start,
31173119
parser->rbs_lexer_t->string.end
31183120
);
3119-
unsigned char c = rbs_utf8_string_to_codepoint(str);
3121+
3122+
// Assumes the input is ASCII compatible
3123+
unsigned char c = str.start[0];
31203124

31213125
if (c == ' ') {
31223126
comment_start += space_bytes;

src/string.c

Lines changed: 0 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -1,59 +1,10 @@
11
#include "rbs/string.h"
2-
#include "rbs/defines.h"
32

43
#include <stdlib.h>
54
#include <string.h>
65
#include <stdio.h>
76
#include <ctype.h>
87

9-
unsigned int rbs_utf8_string_to_codepoint(const rbs_string_t string) {
10-
unsigned int codepoint = 0;
11-
int remaining_bytes = 0;
12-
13-
const char *s = string.start;
14-
const char *end = string.end;
15-
16-
if (s >= end) return 0; // End of string
17-
18-
if (RBS_LIKELY((*s & 0x80) == 0)) {
19-
// Single byte character (0xxxxxxx)
20-
return *s;
21-
} else if ((*s & 0xE0) == 0xC0) {
22-
// Two byte character (110xxxxx 10xxxxxx)
23-
codepoint = *s & 0x1F;
24-
remaining_bytes = 1;
25-
} else if ((*s & 0xF0) == 0xE0) {
26-
// Three byte character (1110xxxx 10xxxxxx 10xxxxxx)
27-
codepoint = *s & 0x0F;
28-
remaining_bytes = 2;
29-
} else if ((*s & 0xF8) == 0xF0) {
30-
// Four byte character (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
31-
codepoint = *s & 0x07;
32-
remaining_bytes = 3;
33-
} else {
34-
// Invalid UTF-8 sequence
35-
return 0xFFFD; // Unicode replacement character
36-
}
37-
38-
s++;
39-
while (remaining_bytes > 0 && s < end) {
40-
if ((*s & 0xC0) != 0x80) {
41-
// Invalid continuation byte
42-
return 0xFFFD;
43-
}
44-
codepoint = (codepoint << 6) | (*s & 0x3F);
45-
s++;
46-
remaining_bytes--;
47-
}
48-
49-
if (remaining_bytes > 0) {
50-
// Incomplete sequence
51-
return 0xFFFD;
52-
}
53-
54-
return codepoint;
55-
}
56-
578
rbs_string_t rbs_string_new(const char *start, const char *end) {
589
return (rbs_string_t) {
5910
.start = start,

src/util/rbs_unescape.c

Lines changed: 56 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#include "rbs/util/rbs_unescape.h"
2+
#include "rbs/util/rbs_encoding.h"
23
#include <string.h>
34
#include <stdlib.h>
45
#include <ctype.h>
@@ -42,20 +43,44 @@ static int octal_to_int(const char *octal, int length) {
4243
return result;
4344
}
4445

45-
int rbs_utf8_codelen(unsigned int c) {
46-
if (c <= 0x7F) return 1;
47-
if (c <= 0x7FF) return 2;
48-
if (c <= 0xFFFF) return 3;
49-
if (c <= 0x10FFFF) return 4;
50-
return 1; // Invalid Unicode codepoint, treat as 1 byte
46+
// Fills buf starting at index 'start' with the UTF-8 encoding of 'codepoint'.
47+
// Returns the number of bytes written, or 0 when the output is not changed.
48+
//
49+
size_t rbs_utf8_fill_codepoint(char *buf, size_t start, size_t end, unsigned int codepoint) {
50+
if (start + 4 > end) {
51+
return 0;
52+
}
53+
54+
if (codepoint <= 0x7F) {
55+
buf[start] = codepoint & 0x7F;
56+
return 1;
57+
} else if (codepoint <= 0x7FF) {
58+
buf[start + 0] = 0xC0 | ((codepoint >> 6) & 0x1F);
59+
buf[start + 1] = 0x80 | (codepoint & 0x3F);
60+
return 2;
61+
} else if (codepoint <= 0xFFFF) {
62+
buf[start + 0] = 0xE0 | ((codepoint >> 12) & 0x0F);
63+
buf[start + 1] = 0x80 | ((codepoint >> 6) & 0x3F);
64+
buf[start + 2] = 0x80 | (codepoint & 0x3F);
65+
return 3;
66+
} else if (codepoint <= 0x10FFFF) {
67+
buf[start + 0] = 0xF0 | ((codepoint >> 18) & 0x07);
68+
buf[start + 1] = 0x80 | ((codepoint >> 12) & 0x3F);
69+
buf[start + 2] = 0x80 | ((codepoint >> 6) & 0x3F);
70+
buf[start + 3] = 0x80 | (codepoint & 0x3F);
71+
return 4;
72+
} else {
73+
return 0;
74+
}
5175
}
5276

53-
rbs_string_t unescape_string(rbs_allocator_t *allocator, const rbs_string_t string, bool is_double_quote) {
77+
rbs_string_t unescape_string(rbs_allocator_t *allocator, const rbs_string_t string, bool is_double_quote, bool is_unicode) {
5478
if (!string.start) return RBS_STRING_NULL;
5579

5680
size_t len = string.end - string.start;
5781
const char *input = string.start;
5882

83+
// The output cannot be longer than the input even after unescaping.
5984
char *output = rbs_allocator_alloc_many(allocator, len + 1, char);
6085
if (!output) return RBS_STRING_NULL;
6186

@@ -79,9 +104,21 @@ rbs_string_t unescape_string(rbs_allocator_t *allocator, const rbs_string_t stri
79104
i += hex_len + 2;
80105
} else if (input[i + 1] == 'u' && i + 5 < len) {
81106
// Unicode escape
82-
int value = hex_to_int(input + i + 2, 4);
83-
output[j++] = (char) value;
84-
i += 6;
107+
108+
if (is_unicode) {
109+
// The UTF-8 representation is at most 4 bytes, shorter than the input length.
110+
int value = hex_to_int(input + i + 2, 4);
111+
j += rbs_utf8_fill_codepoint(output, j, len + 1, value);
112+
i += 6;
113+
} else {
114+
// Copy the escape sequence as-is
115+
output[j++] = input[i++];
116+
output[j++] = input[i++];
117+
output[j++] = input[i++];
118+
output[j++] = input[i++];
119+
output[j++] = input[i++];
120+
output[j++] = input[i++];
121+
}
85122
} else {
86123
// Other escapes
87124
int found = 0;
@@ -114,18 +151,17 @@ rbs_string_t unescape_string(rbs_allocator_t *allocator, const rbs_string_t stri
114151
return rbs_string_new(output, output + j);
115152
}
116153

117-
rbs_string_t rbs_unquote_string(rbs_allocator_t *allocator, rbs_string_t input) {
118-
unsigned int first_char = rbs_utf8_string_to_codepoint(input);
119-
size_t byte_length = rbs_string_len(input);
154+
rbs_string_t rbs_unquote_string(rbs_allocator_t *allocator, rbs_string_t input, const rbs_encoding_t *encoding) {
155+
unsigned int first_char = input.start[0];
156+
157+
const char *new_start = input.start;
158+
const char *new_end = input.end;
120159

121-
ptrdiff_t start_offset = 0;
122160
if (first_char == '"' || first_char == '\'' || first_char == '`') {
123-
int bs = rbs_utf8_codelen(first_char);
124-
start_offset += bs;
125-
byte_length -= 2 * bs;
161+
new_start += 1;
162+
new_end -= 1;
126163
}
127164

128-
const char *new_start = input.start + start_offset;
129-
rbs_string_t string = rbs_string_new(new_start, new_start + byte_length);
130-
return unescape_string(allocator, string, first_char == '"');
165+
rbs_string_t string = rbs_string_new(new_start, new_end);
166+
return unescape_string(allocator, string, first_char == '"', encoding == RBS_ENCODING_UTF_8_ENTRY);
131167
}

test/rbs/type_parsing_test.rb

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -883,4 +883,40 @@ def test_escape_sequences
883883
assert_equal "\x00", type.types[2].literal
884884
end
885885
end
886+
887+
def test_parse__string_octal_escape
888+
Parser.parse_type('"\100"').yield_self do |type|
889+
assert_equal "\100", type.literal
890+
end
891+
Parser.parse_type('"\400"').yield_self do |type|
892+
assert_equal "\400", type.literal
893+
end
894+
end
895+
896+
def test_parse__string_hex_escape
897+
Parser.parse_type('"\x10"').yield_self do |type|
898+
assert_equal "\x10", type.literal
899+
end
900+
Parser.parse_type('"\x40"').yield_self do |type|
901+
assert_equal "\x40", type.literal
902+
end
903+
end
904+
905+
def test_parse__string_unicode_escape
906+
Parser.parse_type('"\u005a"').yield_self do |type|
907+
assert_equal "Z", type.literal
908+
end
909+
Parser.parse_type('"[\u30eb]"').yield_self do |type|
910+
assert_equal "[ル]", type.literal
911+
end
912+
end
913+
914+
def test_parse__string_unicode_escape__non_unicode
915+
Parser.parse_type('"\u005a"'.encode(Encoding::ASCII)).yield_self do |type|
916+
assert_equal "\\u005a", type.literal
917+
end
918+
Parser.parse_type('"[\u30eb]"'.encode(Encoding::Shift_JIS)).yield_self do |type|
919+
assert_equal "[\\u30eb]", type.literal
920+
end
921+
end
886922
end

0 commit comments

Comments
 (0)