Skip to content

Commit c446871

Browse files
authored
Optimize lit_is_valid_utf8_string (#4762)
JerryScript-DCO-1.0-Signed-off-by: Gergo Csizi [email protected]
1 parent 4e8d634 commit c446871

File tree

2 files changed

+80
-54
lines changed

2 files changed

+80
-54
lines changed

jerry-core/lit/lit-strings.c

Lines changed: 44 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,15 @@
1717

1818
#include "jrt-libc-includes.h"
1919

20+
#define LIT_UTF8_SURROGATE_MARKER 0xed /**< first byte of a 3 byte utf8 sequence that may encode a surrogate */
21+
#define LIT_UTF8_HIGH_SURROGATE_MIN 0xa0 /**< minimum second byte of a utf8 encoded high surrogate */
22+
#define LIT_UTF8_HIGH_SURROGATE_MAX 0xaf /**< maximum second byte of a utf8 encoded high surrogate */
23+
#define LIT_UTF8_LOW_SURROGATE_MIN 0xb0 /**< minimum second byte of a utf8 encoded low surrogate */
24+
#define LIT_UTF8_LOW_SURROGATE_MAX 0xbf /**< maximum second byte of a utf8 encoded low surrogate */
25+
#define LIT_UTF8_1_BYTE_MAX 0xf4 /**< maximum valid value of the first byte of a utf8 sequence */
26+
#define LIT_UTF8_2_BYTE_MAX 0x8f /**< maximum valid second byte when the first byte is LIT_UTF8_1_BYTE_MAX */
27+
#define LIT_UTF8_VALID_TWO_BYTE_START 0xc2 /**< smallest first byte accepted for a multi-byte utf8 sequence */
28+
2029
/**
2130
* Validate utf-8 string
2231
*
@@ -31,89 +40,70 @@ lit_is_valid_utf8_string (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 string *
3140
lit_utf8_size_t buf_size, /**< string size */
3241
bool is_strict) /**< true if surrogate pairs are not allowed */
3342
{
34-
lit_utf8_size_t idx = 0;
43+
const unsigned char *end = buf_size + utf8_buf_p;
3544

36-
bool is_prev_code_point_high_surrogate = false;
37-
while (idx < buf_size)
45+
const unsigned char *idx = (const unsigned char *) utf8_buf_p;
46+
47+
while (idx < end)
3848
{
39-
lit_utf8_byte_t c = utf8_buf_p[idx++];
40-
if ((c & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER)
49+
const uint8_t first_byte = *idx++;
50+
51+
if (first_byte < LIT_UTF8_EXTRA_BYTE_MARKER)
4152
{
42-
is_prev_code_point_high_surrogate = false;
4353
continue;
4454
}
4555

46-
lit_code_point_t code_point = 0;
47-
lit_code_point_t min_code_point = 0;
48-
lit_utf8_size_t extra_bytes_count;
49-
if ((c & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER)
50-
{
51-
extra_bytes_count = 1;
52-
min_code_point = LIT_UTF8_2_BYTE_CODE_POINT_MIN;
53-
code_point = ((uint32_t) (c & LIT_UTF8_LAST_5_BITS_MASK));
54-
}
55-
else if ((c & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER)
56+
if (first_byte < LIT_UTF8_VALID_TWO_BYTE_START || idx >= end)
5657
{
57-
extra_bytes_count = 2;
58-
min_code_point = LIT_UTF8_3_BYTE_CODE_POINT_MIN;
59-
code_point = ((uint32_t) (c & LIT_UTF8_LAST_4_BITS_MASK));
60-
}
61-
else if ((c & LIT_UTF8_4_BYTE_MASK) == LIT_UTF8_4_BYTE_MARKER)
62-
{
63-
extra_bytes_count = 3;
64-
min_code_point = LIT_UTF8_4_BYTE_CODE_POINT_MIN;
65-
code_point = ((uint32_t) (c & LIT_UTF8_LAST_3_BITS_MASK));
66-
}
67-
else
68-
{
69-
/* utf-8 string could not contain 5- and 6-byte sequences. */
7058
return false;
7159
}
7260

73-
if (idx + extra_bytes_count > buf_size)
61+
const uint8_t second_byte = *idx++;
62+
63+
if ((second_byte & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER)
7464
{
75-
/* utf-8 string breaks in the middle */
7665
return false;
7766
}
7867

79-
for (lit_utf8_size_t offset = 0; offset < extra_bytes_count; ++offset)
68+
if (first_byte < LIT_UTF8_3_BYTE_MARKER)
8069
{
81-
c = utf8_buf_p[idx + offset];
82-
if ((c & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER)
83-
{
84-
/* invalid continuation byte */
85-
return false;
86-
}
87-
code_point <<= LIT_UTF8_BITS_IN_EXTRA_BYTES;
88-
code_point |= (c & LIT_UTF8_LAST_6_BITS_MASK);
70+
continue;
8971
}
9072

91-
if (code_point < min_code_point
92-
|| code_point > LIT_UNICODE_CODE_POINT_MAX)
73+
if (idx >= end || (*idx++ & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER)
9374
{
94-
/* utf-8 string doesn't encode valid unicode code point */
9575
return false;
9676
}
9777

98-
if (is_strict)
78+
if (first_byte < LIT_UTF8_4_BYTE_MARKER)
9979
{
100-
is_prev_code_point_high_surrogate = false;
101-
102-
if (code_point >= LIT_UTF16_HIGH_SURROGATE_MIN
103-
&& code_point <= LIT_UTF16_HIGH_SURROGATE_MAX)
80+
if (first_byte == LIT_UTF8_3_BYTE_MARKER && (second_byte & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_EXTRA_BYTE_MARKER)
10481
{
105-
is_prev_code_point_high_surrogate = true;
82+
return false;
10683
}
107-
else if (code_point >= LIT_UTF16_LOW_SURROGATE_MIN
108-
&& code_point <= LIT_UTF16_LOW_SURROGATE_MAX
109-
&& is_prev_code_point_high_surrogate)
84+
85+
if (is_strict
86+
&& first_byte == LIT_UTF8_SURROGATE_MARKER
87+
&& second_byte >= LIT_UTF8_HIGH_SURROGATE_MIN
88+
&& second_byte <= LIT_UTF8_HIGH_SURROGATE_MAX
89+
&& idx + 3 <= end
90+
&& idx[0] == LIT_UTF8_SURROGATE_MARKER
91+
&& idx[1] >= LIT_UTF8_LOW_SURROGATE_MIN
92+
&& idx[1] <= LIT_UTF8_LOW_SURROGATE_MAX)
11093
{
111-
/* sequence of high and low surrogate is not allowed */
11294
return false;
11395
}
96+
continue;
11497
}
11598

116-
idx += extra_bytes_count;
99+
if (idx >= end
100+
|| first_byte > LIT_UTF8_1_BYTE_MAX
101+
|| (first_byte == LIT_UTF8_4_BYTE_MARKER && second_byte <= LIT_UTF8_EXTRA_BYTE_MARKER)
102+
|| (first_byte == LIT_UTF8_1_BYTE_MAX && second_byte > LIT_UTF8_2_BYTE_MAX)
103+
|| (*idx++ & LIT_UTF8_EXTRA_BYTE_MASK) != LIT_UTF8_EXTRA_BYTE_MARKER)
104+
{
105+
return false;
106+
}
117107
}
118108

119109
return true;

tests/unit-core/test-strings.c

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,42 @@ main (void)
215215
TEST_ASSERT (res_buf[1] == 0x9F);
216216
TEST_ASSERT (res_buf[2] == 0xBF);
217217

218+
/* Ascii string */
219+
lit_utf8_byte_t utf8_string_ascii[] = {'G','o','o','d','b','y','e'};
220+
TEST_ASSERT (lit_is_valid_utf8_string (utf8_string_ascii, sizeof (utf8_string_ascii), true));
221+
222+
/* Control character */
223+
lit_utf8_byte_t utf8_string_control[] = {0x00};
224+
TEST_ASSERT (lit_is_valid_utf8_string (utf8_string_control, sizeof (utf8_string_control), true));
225+
226+
/* 3 byte characters */
227+
lit_utf8_byte_t utf8_string_3byte[] = {0xe4, 0xbd, 0xa0, 0xe5, 0xa5, 0xbd, 0xe4, 0xb8, 0x96, 0xe7, 0x95, 0x8c};
228+
TEST_ASSERT (lit_is_valid_utf8_string (utf8_string_3byte, sizeof (utf8_string_3byte), true));
229+
230+
/* 4 byte characters */
231+
lit_utf8_byte_t utf8_string_4byte[] = {0xf0, 0x90, 0x80, 0x80, 0xf0, 0x9f, 0xa7, 0xbf};
232+
TEST_ASSERT (lit_is_valid_utf8_string (utf8_string_4byte, sizeof (utf8_string_4byte), true));
233+
234+
/* Invalid continuation byte */
235+
lit_utf8_byte_t utf8_string_invalid[] = {0xa0};
236+
TEST_ASSERT (!lit_is_valid_utf8_string (utf8_string_invalid, sizeof (utf8_string_invalid), true));
237+
238+
/* Isolated high surrogate */
239+
lit_utf8_byte_t utf8_string_high[] = {0xed, 0xa0, 0x80};
240+
TEST_ASSERT (lit_is_valid_utf8_string (utf8_string_high, sizeof (utf8_string_high), true));
241+
242+
/* Isolated low surrogate */
243+
lit_utf8_byte_t utf8_string_low[] = {0xed, 0xbf, 0xbf};
244+
TEST_ASSERT (lit_is_valid_utf8_string (utf8_string_low, sizeof (utf8_string_low), true));
245+
246+
/* Correct pair of surrogates in strict mode (must be rejected) */
247+
lit_utf8_byte_t utf8_string_surrogates_strict[] = {0xed, 0xa0, 0x80, 0xed, 0xbf, 0xbf};
248+
TEST_ASSERT (!lit_is_valid_utf8_string (utf8_string_surrogates_strict, sizeof (utf8_string_surrogates_strict), true));
249+
250+
/* Correct pair of surrogates in non-strict mode (must be accepted) */
251+
lit_utf8_byte_t utf8_string_surrogates[] = {0xed, 0xa0, 0x80, 0xed, 0xbf, 0xbf};
252+
TEST_ASSERT (lit_is_valid_utf8_string (utf8_string_surrogates, sizeof (utf8_string_surrogates), false));
253+
218254
ecma_finalize ();
219255
jmem_finalize ();
220256

0 commit comments

Comments
 (0)