Skip to content

Commit 11bb41b

Browse files
authored
Merge branch 'master' into sampersand/2025-10-07/fix-typo
2 parents 098cea9 + b6d92e5 commit 11bb41b

File tree

8 files changed

+163
-80
lines changed

8 files changed

+163
-80
lines changed

Gemfile.lock

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ GEM
4141
diff-lcs (1.6.2)
4242
digest (3.2.0)
4343
drb (2.2.3)
44-
erb (5.0.2)
44+
erb (5.0.3)
4545
extconf_compile_commands_json (0.0.7)
4646
ffi (1.17.2-aarch64-linux-gnu)
4747
ffi (1.17.2-arm64-darwin)
@@ -102,9 +102,10 @@ GEM
102102
rb-fsevent (0.11.2)
103103
rb-inotify (0.11.1)
104104
ffi (~> 1.0)
105-
rdoc (6.14.2)
105+
rdoc (6.15.0)
106106
erb
107107
psych (>= 4.0.0)
108+
tsort
108109
regexp_parser (2.11.3)
109110
rspec (3.13.1)
110111
rspec-core (~> 3.13.0)

docs/encoding.md

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
# RBS File Encoding
2+
3+
## Best Practice
4+
5+
**Use UTF-8** for both file encoding and your system locale.
6+
7+
## Supported Encodings
8+
9+
The RBS parser supports ASCII-compatible encodings (similar to Ruby's script encoding support).
10+
11+
**Examples**: UTF-8, US-ASCII, Shift JIS, EUC-JP, ...
12+
13+
## Unicode Codepoint Symbols
14+
15+
String literal types in RBS can contain Unicode codepoint escape sequences (`\uXXXX`).
16+
17+
When the file encoding is UTF-8, the parser translates Unicode codepoint escape sequences into the corresponding characters:
18+
19+
```rbs
20+
# In UTF-8 encoded files
21+
22+
type t = "\u0123" # Translated to the actual Unicode character ģ
23+
type s = "\u3042" # Translated to the actual Unicode character あ
24+
```
25+
26+
When the file encoding is not UTF-8, Unicode escape sequences are interpreted literally as the string `\uXXXX`:
27+
28+
```rbs
29+
# In non-UTF-8 encoded files
30+
31+
type t = "\u0123" # Remains as the literal string "\u0123"
32+
```
33+
34+
## Implementation
35+
36+
The RBS gem currently does not perform any file-encoding handling of its own. It relies on Ruby's encoding handling, specifically `Encoding.default_external` and `Encoding.default_internal`.
37+
38+
`Encoding.default_external` is the encoding Ruby assumes when it reads external resources like files. The Ruby interpreter sets it based on the locale. `Encoding.default_internal` is the encoding Ruby converts the external resources to. The default is `nil` (no conversion).
39+
40+
When your locale is set to use `UTF-8` encoding, `default_external` is `Encoding::UTF_8`. So the RBS file content read from the disk will have UTF-8 encoding.
41+
42+
### Parsing non-UTF-8 RBS source text
43+
44+
If you want to work with another encoding, ensure the source string has an ASCII-compatible encoding.
45+
46+
```ruby
47+
source = '"日本語"'
48+
RBS::Parser.parse_type(source.encode(Encoding::EUC_JP)) # => Parses successfully
49+
RBS::Parser.parse_type(source.encode(Encoding::UTF_32)) # => Returns `nil` since UTF-32 is not ASCII compatible
50+
```
51+
52+
### Specifying file encoding
53+
54+
Currently, RBS doesn't support specifying file encoding directly.
55+
56+
You can set `Encoding.default_external` to the desired encoding while the gem loads RBS files from disk.

include/rbs/string.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,4 @@ size_t rbs_string_len(const rbs_string_t self);
4444
*/
4545
bool rbs_string_equal(const rbs_string_t lhs, const rbs_string_t rhs);
4646

47-
unsigned int rbs_utf8_string_to_codepoint(const rbs_string_t string);
48-
4947
#endif

include/rbs/util/rbs_unescape.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#include <stddef.h>
55
#include "rbs/util/rbs_allocator.h"
66
#include "rbs/string.h"
7+
#include "rbs/util/rbs_encoding.h"
78

89
/**
910
* Receives `rbs_parser_t` and `range`, which represents a string token or symbol token, and returns a string VALUE.
@@ -18,6 +19,6 @@
1819
*
1920
* @returns A new owned string that will be freed when the allocator is freed.
2021
* */
21-
rbs_string_t rbs_unquote_string(rbs_allocator_t *, const rbs_string_t input);
22+
rbs_string_t rbs_unquote_string(rbs_allocator_t *, const rbs_string_t input, const rbs_encoding_t *encoding);
2223

2324
#endif // RBS_RBS_UNESCAPE_H

src/parser.c

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -358,7 +358,7 @@ static bool parse_function_param(rbs_parser_t *parser, rbs_types_function_param_
358358
return false;
359359
}
360360

361-
rbs_string_t unquoted_str = rbs_unquote_string(ALLOCATOR(), rbs_parser_peek_current_token(parser));
361+
rbs_string_t unquoted_str = rbs_unquote_string(ALLOCATOR(), rbs_parser_peek_current_token(parser), parser->rbs_lexer_t->encoding);
362362
rbs_location_t *symbolLoc = rbs_location_current_token(parser);
363363
rbs_constant_id_t constant_id = rbs_constant_pool_insert_string(&parser->constant_pool, unquoted_str);
364364
rbs_ast_symbol_t *name = rbs_ast_symbol_new(ALLOCATOR(), symbolLoc, &parser->constant_pool, constant_id);
@@ -973,7 +973,7 @@ static bool parse_symbol(rbs_parser_t *parser, rbs_location_t *location, rbs_typ
973973

974974
rbs_string_t symbol = rbs_string_new(current_token.start + offset_bytes, current_token.end);
975975

976-
rbs_string_t unquoted_symbol = rbs_unquote_string(ALLOCATOR(), symbol);
976+
rbs_string_t unquoted_symbol = rbs_unquote_string(ALLOCATOR(), symbol, parser->rbs_lexer_t->encoding);
977977

978978
rbs_constant_id_t constant_id = rbs_constant_pool_insert_string(&parser->constant_pool, unquoted_symbol);
979979

@@ -1215,7 +1215,7 @@ static bool parse_simple(rbs_parser_t *parser, rbs_node_t **type, bool void_allo
12151215
case tDQSTRING: {
12161216
rbs_location_t *loc = rbs_location_current_token(parser);
12171217

1218-
rbs_string_t unquoted_str = rbs_unquote_string(ALLOCATOR(), rbs_parser_peek_current_token(parser));
1218+
rbs_string_t unquoted_str = rbs_unquote_string(ALLOCATOR(), rbs_parser_peek_current_token(parser), parser->rbs_lexer_t->encoding);
12191219
rbs_node_t *literal = (rbs_node_t *) rbs_ast_string_new(ALLOCATOR(), loc, unquoted_str);
12201220
*type = (rbs_node_t *) rbs_types_literal_new(ALLOCATOR(), loc, literal);
12211221
return true;
@@ -1704,7 +1704,9 @@ static bool parse_annotation(rbs_parser_t *parser, rbs_ast_annotation_t **annota
17041704
parser->rbs_lexer_t->string.start + rg.start.byte_pos + offset_bytes,
17051705
parser->rbs_lexer_t->string.end
17061706
);
1707-
unsigned int open_char = rbs_utf8_string_to_codepoint(str);
1707+
1708+
// Assumes the input is ASCII compatible
1709+
unsigned int open_char = str.start[0];
17081710

17091711
unsigned int close_char;
17101712

@@ -1817,7 +1819,7 @@ static bool parse_method_name(rbs_parser_t *parser, rbs_range_t *range, rbs_ast_
18171819
}
18181820
case tQIDENT: {
18191821
rbs_string_t string = rbs_parser_peek_current_token(parser);
1820-
rbs_string_t unquoted_str = rbs_unquote_string(ALLOCATOR(), string);
1822+
rbs_string_t unquoted_str = rbs_unquote_string(ALLOCATOR(), string, parser->rbs_lexer_t->encoding);
18211823
rbs_constant_id_t constant_id = rbs_constant_pool_insert_string(&parser->constant_pool, unquoted_str);
18221824
rbs_location_t *symbolLoc = rbs_location_current_token(parser);
18231825
*symbol = rbs_ast_symbol_new(ALLOCATOR(), symbolLoc, &parser->constant_pool, constant_id);
@@ -3218,7 +3220,9 @@ static rbs_ast_comment_t *parse_comment_lines(rbs_parser_t *parser, rbs_comment_
32183220
comment_start,
32193221
parser->rbs_lexer_t->string.end
32203222
);
3221-
unsigned char c = rbs_utf8_string_to_codepoint(str);
3223+
3224+
// Assumes the input is ASCII compatible
3225+
unsigned char c = str.start[0];
32223226

32233227
if (c == ' ') {
32243228
comment_start += space_bytes;

src/string.c

Lines changed: 0 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -1,59 +1,10 @@
11
#include "rbs/string.h"
2-
#include "rbs/defines.h"
32

43
#include <stdlib.h>
54
#include <string.h>
65
#include <stdio.h>
76
#include <ctype.h>
87

9-
unsigned int rbs_utf8_string_to_codepoint(const rbs_string_t string) {
10-
unsigned int codepoint = 0;
11-
int remaining_bytes = 0;
12-
13-
const char *s = string.start;
14-
const char *end = string.end;
15-
16-
if (s >= end) return 0; // End of string
17-
18-
if (RBS_LIKELY((*s & 0x80) == 0)) {
19-
// Single byte character (0xxxxxxx)
20-
return *s;
21-
} else if ((*s & 0xE0) == 0xC0) {
22-
// Two byte character (110xxxxx 10xxxxxx)
23-
codepoint = *s & 0x1F;
24-
remaining_bytes = 1;
25-
} else if ((*s & 0xF0) == 0xE0) {
26-
// Three byte character (1110xxxx 10xxxxxx 10xxxxxx)
27-
codepoint = *s & 0x0F;
28-
remaining_bytes = 2;
29-
} else if ((*s & 0xF8) == 0xF0) {
30-
// Four byte character (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
31-
codepoint = *s & 0x07;
32-
remaining_bytes = 3;
33-
} else {
34-
// Invalid UTF-8 sequence
35-
return 0xFFFD; // Unicode replacement character
36-
}
37-
38-
s++;
39-
while (remaining_bytes > 0 && s < end) {
40-
if ((*s & 0xC0) != 0x80) {
41-
// Invalid continuation byte
42-
return 0xFFFD;
43-
}
44-
codepoint = (codepoint << 6) | (*s & 0x3F);
45-
s++;
46-
remaining_bytes--;
47-
}
48-
49-
if (remaining_bytes > 0) {
50-
// Incomplete sequence
51-
return 0xFFFD;
52-
}
53-
54-
return codepoint;
55-
}
56-
578
rbs_string_t rbs_string_new(const char *start, const char *end) {
589
return (rbs_string_t) {
5910
.start = start,

src/util/rbs_unescape.c

Lines changed: 56 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#include "rbs/util/rbs_unescape.h"
2+
#include "rbs/util/rbs_encoding.h"
23
#include <string.h>
34
#include <stdlib.h>
45
#include <ctype.h>
@@ -42,20 +43,44 @@ static int octal_to_int(const char *octal, int length) {
4243
return result;
4344
}
4445

45-
int rbs_utf8_codelen(unsigned int c) {
46-
if (c <= 0x7F) return 1;
47-
if (c <= 0x7FF) return 2;
48-
if (c <= 0xFFFF) return 3;
49-
if (c <= 0x10FFFF) return 4;
50-
return 1; // Invalid Unicode codepoint, treat as 1 byte
46+
// Writes the UTF-8 encoding of `codepoint` into `buf`, beginning at index `start`.
//
// `end` is the exclusive upper bound of the writable indices of `buf`; no byte at
// index `end` or beyond is ever written.
//
// Returns the number of bytes written (1 to 4), or 0 when the output is not changed:
// either `codepoint` is above U+10FFFF, or its encoding does not fit in [start, end).
//
// Only the bytes actually required by `codepoint` must fit; a one-byte codepoint
// can be written into a buffer with a single byte of room left.
//
// NOTE: surrogate code points (U+D800..U+DFFF) are not rejected; they are encoded
// with the regular three-byte pattern.
size_t rbs_utf8_fill_codepoint(char *buf, size_t start, size_t end, unsigned int codepoint) {
    size_t needed;

    if (codepoint <= 0x7F) {
        needed = 1;
    } else if (codepoint <= 0x7FF) {
        needed = 2;
    } else if (codepoint <= 0xFFFF) {
        needed = 3;
    } else if (codepoint <= 0x10FFFF) {
        needed = 4;
    } else {
        // Outside the Unicode range.
        return 0;
    }

    // Compare against the remaining room instead of computing `start + needed`,
    // so the check cannot wrap around for large `start` values.
    if (start > end || end - start < needed) {
        return 0;
    }

    char *out = buf + start;

    switch (needed) {
    case 1:
        out[0] = (char) (codepoint & 0x7F);
        break;
    case 2:
        out[0] = (char) (0xC0 | ((codepoint >> 6) & 0x1F));
        out[1] = (char) (0x80 | (codepoint & 0x3F));
        break;
    case 3:
        out[0] = (char) (0xE0 | ((codepoint >> 12) & 0x0F));
        out[1] = (char) (0x80 | ((codepoint >> 6) & 0x3F));
        out[2] = (char) (0x80 | (codepoint & 0x3F));
        break;
    default:
        out[0] = (char) (0xF0 | ((codepoint >> 18) & 0x07));
        out[1] = (char) (0x80 | ((codepoint >> 12) & 0x3F));
        out[2] = (char) (0x80 | ((codepoint >> 6) & 0x3F));
        out[3] = (char) (0x80 | (codepoint & 0x3F));
        break;
    }

    return needed;
}
5276

53-
rbs_string_t unescape_string(rbs_allocator_t *allocator, const rbs_string_t string, bool is_double_quote) {
77+
rbs_string_t unescape_string(rbs_allocator_t *allocator, const rbs_string_t string, bool is_double_quote, bool is_unicode) {
5478
if (!string.start) return RBS_STRING_NULL;
5579

5680
size_t len = string.end - string.start;
5781
const char *input = string.start;
5882

83+
// The output cannot be longer than the input even after unescaping.
5984
char *output = rbs_allocator_alloc_many(allocator, len + 1, char);
6085
if (!output) return RBS_STRING_NULL;
6186

@@ -79,9 +104,21 @@ rbs_string_t unescape_string(rbs_allocator_t *allocator, const rbs_string_t stri
79104
i += hex_len + 2;
80105
} else if (input[i + 1] == 'u' && i + 5 < len) {
81106
// Unicode escape
82-
int value = hex_to_int(input + i + 2, 4);
83-
output[j++] = (char) value;
84-
i += 6;
107+
108+
if (is_unicode) {
109+
// The UTF-8 representation is at most 4 bytes, shorter than the input length.
110+
int value = hex_to_int(input + i + 2, 4);
111+
j += rbs_utf8_fill_codepoint(output, j, len + 1, value);
112+
i += 6;
113+
} else {
114+
// Copy the escape sequence as-is
115+
output[j++] = input[i++];
116+
output[j++] = input[i++];
117+
output[j++] = input[i++];
118+
output[j++] = input[i++];
119+
output[j++] = input[i++];
120+
output[j++] = input[i++];
121+
}
85122
} else {
86123
// Other escapes
87124
int found = 0;
@@ -114,18 +151,17 @@ rbs_string_t unescape_string(rbs_allocator_t *allocator, const rbs_string_t stri
114151
return rbs_string_new(output, output + j);
115152
}
116153

117-
rbs_string_t rbs_unquote_string(rbs_allocator_t *allocator, rbs_string_t input) {
118-
unsigned int first_char = rbs_utf8_string_to_codepoint(input);
119-
size_t byte_length = rbs_string_len(input);
154+
rbs_string_t rbs_unquote_string(rbs_allocator_t *allocator, rbs_string_t input, const rbs_encoding_t *encoding) {
155+
unsigned int first_char = input.start[0];
156+
157+
const char *new_start = input.start;
158+
const char *new_end = input.end;
120159

121-
ptrdiff_t start_offset = 0;
122160
if (first_char == '"' || first_char == '\'' || first_char == '`') {
123-
int bs = rbs_utf8_codelen(first_char);
124-
start_offset += bs;
125-
byte_length -= 2 * bs;
161+
new_start += 1;
162+
new_end -= 1;
126163
}
127164

128-
const char *new_start = input.start + start_offset;
129-
rbs_string_t string = rbs_string_new(new_start, new_start + byte_length);
130-
return unescape_string(allocator, string, first_char == '"');
165+
rbs_string_t string = rbs_string_new(new_start, new_end);
166+
return unescape_string(allocator, string, first_char == '"', encoding == RBS_ENCODING_UTF_8_ENTRY);
131167
}

test/rbs/type_parsing_test.rb

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -931,4 +931,40 @@ def test_parse__void__prohibited
931931
Parser.parse_type("[void]")
932932
end
933933
end
934+
935+
def test_parse__string_octal_escape
936+
Parser.parse_type('"\100"').yield_self do |type|
937+
assert_equal "\100", type.literal
938+
end
939+
Parser.parse_type('"\400"').yield_self do |type|
940+
assert_equal "\400", type.literal
941+
end
942+
end
943+
944+
def test_parse__string_hex_escape
945+
Parser.parse_type('"\x10"').yield_self do |type|
946+
assert_equal "\x10", type.literal
947+
end
948+
Parser.parse_type('"\x40"').yield_self do |type|
949+
assert_equal "\x40", type.literal
950+
end
951+
end
952+
953+
def test_parse__string_unicode_escape
954+
Parser.parse_type('"\u005a"').yield_self do |type|
955+
assert_equal "Z", type.literal
956+
end
957+
Parser.parse_type('"[\u30eb]"').yield_self do |type|
958+
assert_equal "[ル]", type.literal
959+
end
960+
end
961+
962+
def test_parse__string_unicode_escape__non_unicode
963+
Parser.parse_type('"\u005a"'.encode(Encoding::ASCII)).yield_self do |type|
964+
assert_equal "\\u005a", type.literal
965+
end
966+
Parser.parse_type('"[\u30eb]"'.encode(Encoding::Shift_JIS)).yield_self do |type|
967+
assert_equal "[\\u30eb]", type.literal
968+
end
969+
end
934970
end

0 commit comments

Comments
 (0)