Skip to content

Commit eceab2f

Browse files
kddnewton authored and matzbot committed
[ruby/prism] Escape error location is incorrect for some regex
When a regular expression has a named capture containing a unicode escape sequence with an invalid surrogate pair, the error was attached to a location in the owned string instead of a location in the shared source. ruby/prism@793a7a6a0a
1 parent bbc10ed commit eceab2f

File tree

1 file changed

+14
-10
lines changed

1 file changed

+14
-10
lines changed

prism/prism.c

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -8613,7 +8613,7 @@ escape_hexadecimal_digit(const uint8_t value) {
86138613
* validated.
86148614
*/
86158615
static inline uint32_t
8616-
escape_unicode(pm_parser_t *parser, const uint8_t *string, size_t length) {
8616+
escape_unicode(pm_parser_t *parser, const uint8_t *string, size_t length, const pm_location_t *error_location) {
86178617
uint32_t value = 0;
86188618
for (size_t index = 0; index < length; index++) {
86198619
if (index != 0) value <<= 4;
@@ -8623,7 +8623,11 @@ escape_unicode(pm_parser_t *parser, const uint8_t *string, size_t length) {
86238623
// Here we're going to verify that the value is actually a valid Unicode
86248624
// codepoint and not a surrogate pair.
86258625
if (value >= 0xD800 && value <= 0xDFFF) {
8626-
pm_parser_err(parser, string, string + length, PM_ERR_ESCAPE_INVALID_UNICODE);
8626+
if (error_location != NULL) {
8627+
pm_parser_err(parser, error_location->start, error_location->end, PM_ERR_ESCAPE_INVALID_UNICODE);
8628+
} else {
8629+
pm_parser_err(parser, string, string + length, PM_ERR_ESCAPE_INVALID_UNICODE);
8630+
}
86278631
return 0xFFFD;
86288632
}
86298633

@@ -8923,7 +8927,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre
89238927
extra_codepoints_start = unicode_start;
89248928
}
89258929

8926-
uint32_t value = escape_unicode(parser, unicode_start, hexadecimal_length);
8930+
uint32_t value = escape_unicode(parser, unicode_start, hexadecimal_length, NULL);
89278931
escape_write_unicode(parser, buffer, flags, unicode_start, parser->current.end, value);
89288932

89298933
parser->current.end += pm_strspn_inline_whitespace(parser->current.end, parser->end - parser->current.end);
@@ -8964,7 +8968,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre
89648968
PM_PARSER_ERR_FORMAT(parser, start, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_SHORT, 2, start);
89658969
}
89668970
} else if (length == 4) {
8967-
uint32_t value = escape_unicode(parser, parser->current.end, 4);
8971+
uint32_t value = escape_unicode(parser, parser->current.end, 4, NULL);
89688972

89698973
if (flags & PM_ESCAPE_FLAG_REGEXP) {
89708974
pm_buffer_append_bytes(regular_expression_buffer, start, (size_t) (parser->current.end + 4 - start));
@@ -20368,7 +20372,7 @@ pm_named_capture_escape_octal(pm_buffer_t *unescaped, const uint8_t *cursor, con
2036820372
}
2036920373

2037020374
static inline const uint8_t *
20371-
pm_named_capture_escape_unicode(pm_parser_t *parser, pm_buffer_t *unescaped, const uint8_t *cursor, const uint8_t *end) {
20375+
pm_named_capture_escape_unicode(pm_parser_t *parser, pm_buffer_t *unescaped, const uint8_t *cursor, const uint8_t *end, const pm_location_t *error_location) {
2037220376
const uint8_t *start = cursor - 1;
2037320377
cursor++;
2037420378

@@ -20379,7 +20383,7 @@ pm_named_capture_escape_unicode(pm_parser_t *parser, pm_buffer_t *unescaped, con
2037920383

2038020384
if (*cursor != '{') {
2038120385
size_t length = pm_strspn_hexadecimal_digit(cursor, MIN(end - cursor, 4));
20382-
uint32_t value = escape_unicode(parser, cursor, length);
20386+
uint32_t value = escape_unicode(parser, cursor, length, error_location);
2038320387

2038420388
if (!pm_buffer_append_unicode_codepoint(unescaped, value)) {
2038520389
pm_buffer_append_string(unescaped, (const char *) start, (size_t) ((cursor + length) - start));
@@ -20402,7 +20406,7 @@ pm_named_capture_escape_unicode(pm_parser_t *parser, pm_buffer_t *unescaped, con
2040220406
if (length == 0) {
2040320407
break;
2040420408
}
20405-
uint32_t value = escape_unicode(parser, cursor, length);
20409+
uint32_t value = escape_unicode(parser, cursor, length, error_location);
2040620410

2040720411
(void) pm_buffer_append_unicode_codepoint(unescaped, value);
2040820412
cursor += length;
@@ -20412,7 +20416,7 @@ pm_named_capture_escape_unicode(pm_parser_t *parser, pm_buffer_t *unescaped, con
2041220416
}
2041320417

2041420418
static void
20415-
pm_named_capture_escape(pm_parser_t *parser, pm_buffer_t *unescaped, const uint8_t *source, const size_t length, const uint8_t *cursor) {
20419+
pm_named_capture_escape(pm_parser_t *parser, pm_buffer_t *unescaped, const uint8_t *source, const size_t length, const uint8_t *cursor, const pm_location_t *error_location) {
2041620420
const uint8_t *end = source + length;
2041720421
pm_buffer_append_string(unescaped, (const char *) source, (size_t) (cursor - source));
2041820422

@@ -20430,7 +20434,7 @@ pm_named_capture_escape(pm_parser_t *parser, pm_buffer_t *unescaped, const uint8
2043020434
cursor = pm_named_capture_escape_octal(unescaped, cursor, end);
2043120435
break;
2043220436
case 'u':
20433-
cursor = pm_named_capture_escape_unicode(parser, unescaped, cursor, end);
20437+
cursor = pm_named_capture_escape_unicode(parser, unescaped, cursor, end, error_location);
2043420438
break;
2043520439
default:
2043620440
pm_buffer_append_byte(unescaped, '\\');
@@ -20473,7 +20477,7 @@ parse_regular_expression_named_capture(const pm_string_t *capture, void *data) {
2047320477
// unescaped, which is what we need.
2047420478
const uint8_t *cursor = pm_memchr(source, '\\', length, parser->encoding_changed, parser->encoding);
2047520479
if (PRISM_UNLIKELY(cursor != NULL)) {
20476-
pm_named_capture_escape(parser, &unescaped, source, length, cursor);
20480+
pm_named_capture_escape(parser, &unescaped, source, length, cursor, callback_data->shared ? NULL : &call->receiver->location);
2047720481
source = (const uint8_t *) pm_buffer_value(&unescaped);
2047820482
length = pm_buffer_length(&unescaped);
2047920483
}

0 commit comments

Comments
 (0)