Skip to content

Commit 275af25

Browse files
authored
Merge pull request #2756 from ruby/lexer-unicode
Parser/lexer backports
2 parents 7841784 + 703e3f4 commit 275af25

File tree

16 files changed

+245
-146
lines changed

16 files changed

+245
-146
lines changed

docs/encoding.md

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
# RBS File Encoding
2+
3+
## Best Practice
4+
5+
**Use UTF-8** for both file encoding and your system locale.
6+
7+
## Supported Encodings
8+
9+
The RBS parser supports ASCII-compatible encodings (similar to Ruby's script encoding support).
10+
11+
**Examples**: UTF-8, US-ASCII, Shift JIS, EUC-JP, ...
12+
13+
## Unicode Codepoint Symbols
14+
15+
String literal types in RBS can contain Unicode codepoint escape sequences (`\uXXXX`).
16+
17+
When the file encoding is UTF-8, the parser translates Unicode codepoint escape sequences into the corresponding characters:
18+
19+
```rbs
20+
# In UTF-8 encoded files
21+
22+
type t = "\u0123" # Translated to the actual Unicode character ģ
23+
type s = "\u3042" # Translated to the actual Unicode character あ
24+
```
25+
26+
When the file encoding is not UTF-8, Unicode escape sequences are interpreted literally as the string `\uXXXX`:
27+
28+
```rbs
29+
# In non-UTF-8 encoded files
30+
31+
type t = "\u0123" # Remains as the literal string "\u0123"
32+
```
33+
34+
## Implementation
35+
36+
The RBS gem currently doesn't do anything special for file encoding. It relies on Ruby's encoding handling, specifically `Encoding.default_external` and `Encoding.default_internal`.
37+
38+
`Encoding.default_external` is the encoding Ruby assumes when it reads external resources like files. The Ruby interpreter sets it based on the locale. `Encoding.default_internal` is the encoding Ruby converts the external resources to. The default is `nil` (no conversion).
39+
40+
When your locale is set to use `UTF-8` encoding, `default_external` is `Encoding::UTF_8`. So the RBS file content read from the disk will have UTF-8 encoding.
41+
42+
### Parsing non-UTF-8 RBS source text
43+
44+
If you want to work with another encoding, ensure the source string has an ASCII-compatible encoding.
45+
46+
```ruby
47+
source = '"日本語"'
48+
RBS::Parser.parse_type(source.encode(Encoding::EUC_JP)) # => Parses successfully
49+
RBS::Parser.parse_type(source.encode(Encoding::UTF_32)) # => Returns `nil` since UTF-32 is not ASCII compatible
50+
```
51+
52+
### Specifying file encoding
53+
54+
Currently, RBS doesn't support specifying file encoding directly.
55+
56+
You can set `Encoding.default_external` to control the encoding Ruby assumes while the gem loads RBS files from disk.

ext/rbs_extension/class_constants.c

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,6 @@
77

88
#include "rbs_extension.h"
99

10-
VALUE RBS_Parser;
11-
1210
VALUE RBS;
1311
VALUE RBS_AST;
1412
VALUE RBS_AST_Declarations;

ext/rbs_extension/legacy_location.c

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ void rbs_loc_legacy_alloc_children(rbs_loc *loc, unsigned short cap) {
3333
check_children_max(cap);
3434

3535
size_t s = RBS_LOC_CHILDREN_SIZE(cap);
36-
loc->children = malloc(s);
36+
loc->children = (rbs_loc_children *) malloc(s);
3737

3838
*loc->children = (rbs_loc_children) {
3939
.len = 0,
@@ -50,7 +50,7 @@ static void check_children_cap(rbs_loc *loc) {
5050
if (loc->children->len == loc->children->cap) {
5151
check_children_max(loc->children->cap + 1);
5252
size_t s = RBS_LOC_CHILDREN_SIZE(++loc->children->cap);
53-
loc->children = realloc(loc->children, s);
53+
loc->children = (rbs_loc_children *) realloc(loc->children, s);
5454
}
5555
}
5656
}
@@ -86,12 +86,12 @@ void rbs_loc_free(rbs_loc *loc) {
8686
}
8787

8888
static void rbs_loc_mark(void *ptr) {
89-
rbs_loc *loc = ptr;
89+
rbs_loc *loc = (rbs_loc *) ptr;
9090
rb_gc_mark(loc->buffer);
9191
}
9292

9393
static size_t rbs_loc_memsize(const void *ptr) {
94-
const rbs_loc *loc = ptr;
94+
const rbs_loc *loc = (const rbs_loc *) ptr;
9595
if (loc->children == NULL) {
9696
return sizeof(rbs_loc);
9797
} else {
@@ -117,7 +117,7 @@ static VALUE location_s_allocate(VALUE klass) {
117117
}
118118

119119
rbs_loc *rbs_check_location(VALUE obj) {
120-
return rb_check_typeddata(obj, &location_type);
120+
return (rbs_loc *) rb_check_typeddata(obj, &location_type);
121121
}
122122

123123
static VALUE location_initialize(VALUE self, VALUE buffer, VALUE start_pos, VALUE end_pos) {

ext/rbs_extension/main.c

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -187,18 +187,10 @@ static VALUE parse_method_type_try(VALUE a) {
187187
}
188188

189189
rbs_method_type_t *method_type = NULL;
190-
rbs_parse_method_type(parser, &method_type);
190+
rbs_parse_method_type(parser, &method_type, RB_TEST(arg->require_eof));
191191

192192
raise_error_if_any(parser, arg->buffer);
193193

194-
if (RB_TEST(arg->require_eof)) {
195-
rbs_parser_advance(parser);
196-
if (parser->current_token.type != pEOF) {
197-
rbs_parser_set_error(parser, parser->current_token, true, "expected a token `%s`", rbs_token_type_str(pEOF));
198-
raise_error(parser->error, arg->buffer);
199-
}
200-
}
201-
202194
rbs_translation_context_t ctx = rbs_translation_context_create(
203195
&parser->constant_pool,
204196
arg->buffer,

include/rbs/parser.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ typedef struct rbs_error_t {
4444
* An RBS parser is a LL(3) parser.
4545
* */
4646
typedef struct {
47-
rbs_lexer_t *rbs_lexer_t;
47+
rbs_lexer_t *lexer;
4848

4949
rbs_token_t current_token;
5050
rbs_token_t next_token; /* The first lookahead token */
@@ -127,7 +127,7 @@ rbs_ast_comment_t *rbs_parser_get_comment(rbs_parser_t *parser, int subject_line
127127
void rbs_parser_set_error(rbs_parser_t *parser, rbs_token_t tok, bool syntax_error, const char *fmt, ...) RBS_ATTRIBUTE_FORMAT(4, 5);
128128

129129
bool rbs_parse_type(rbs_parser_t *parser, rbs_node_t **type);
130-
bool rbs_parse_method_type(rbs_parser_t *parser, rbs_method_type_t **method_type);
130+
bool rbs_parse_method_type(rbs_parser_t *parser, rbs_method_type_t **method_type, bool require_eof);
131131
bool rbs_parse_signature(rbs_parser_t *parser, rbs_signature_t **signature);
132132

133133
bool rbs_parse_type_params(rbs_parser_t *parser, bool module_type_params, rbs_node_list_t **params);

include/rbs/string.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,4 @@ size_t rbs_string_len(const rbs_string_t self);
4444
*/
4545
bool rbs_string_equal(const rbs_string_t lhs, const rbs_string_t rhs);
4646

47-
unsigned int rbs_utf8_string_to_codepoint(const rbs_string_t string);
48-
4947
#endif

include/rbs/util/rbs_unescape.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#include <stddef.h>
55
#include "rbs/util/rbs_allocator.h"
66
#include "rbs/string.h"
7+
#include "rbs/util/rbs_encoding.h"
78

89
/**
910
* Receives `rbs_parser_t` and `range`, which represents a string token or symbol token, and returns a string VALUE.
@@ -18,6 +19,6 @@
1819
*
1920
* @returns A new owned string that will be freed when the allocator is freed.
2021
* */
21-
rbs_string_t rbs_unquote_string(rbs_allocator_t *, const rbs_string_t input);
22+
rbs_string_t rbs_unquote_string(rbs_allocator_t *, const rbs_string_t input, const rbs_encoding_t *encoding);
2223

2324
#endif // RBS_RBS_UNESCAPE_H

src/location.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
void rbs_loc_alloc_children(rbs_allocator_t *allocator, rbs_location_t *loc, size_t capacity) {
99
RBS_ASSERT(capacity <= sizeof(rbs_loc_entry_bitmap) * 8, "Capacity %zu is too large. Max is %zu", capacity, sizeof(rbs_loc_entry_bitmap) * 8);
1010

11-
loc->children = rbs_allocator_malloc_impl(allocator, RBS_LOC_CHILDREN_SIZE(capacity), rbs_alignof(rbs_loc_children));
11+
loc->children = (rbs_loc_children *) rbs_allocator_malloc_impl(allocator, RBS_LOC_CHILDREN_SIZE(capacity), rbs_alignof(rbs_loc_children));
1212

1313
loc->children->len = 0;
1414
loc->children->required_p = 0;

0 commit comments

Comments
 (0)