17
17
18
18
#include "jrt-libc-includes.h"
19
19
20
+ #define LIT_UTF8_SURROGATE_MARKER 0xed /**< first byte of a utf8 encoded surrogate (3-byte sequence) */
21
+ #define LIT_UTF8_HIGH_SURROGATE_MIN 0xa0 /**< lowest second byte of a utf8 encoded high surrogate */
22
+ #define LIT_UTF8_HIGH_SURROGATE_MAX 0xaf /**< highest second byte of a utf8 encoded high surrogate */
23
+ #define LIT_UTF8_LOW_SURROGATE_MIN 0xb0 /**< lowest second byte of a utf8 encoded low surrogate */
24
+ #define LIT_UTF8_LOW_SURROGATE_MAX 0xbf /**< highest second byte of a utf8 encoded low surrogate */
25
+ #define LIT_UTF8_1_BYTE_MAX 0xf4 /**< highest first byte accepted for a 4-byte sequence (despite the name, not a 1-byte limit) */
26
+ #define LIT_UTF8_2_BYTE_MAX 0x8f /**< highest second byte accepted when the first byte is LIT_UTF8_1_BYTE_MAX */
27
+ #define LIT_UTF8_VALID_TWO_BYTE_START 0xc2 /**< lowest first byte accepted for a multi-byte sequence */
28
+
20
29
/**
21
30
* Validate utf-8 string
22
31
*
@@ -31,89 +40,70 @@ lit_is_valid_utf8_string (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 string *
31
40
lit_utf8_size_t buf_size , /**< string size */
32
41
bool is_strict ) /**< true if surrogate pairs are not allowed */
33
42
{
34
- lit_utf8_size_t idx = 0 ;
43
+ const unsigned char * end = buf_size + utf8_buf_p ;
35
44
36
- bool is_prev_code_point_high_surrogate = false;
37
- while (idx < buf_size )
45
+ const unsigned char * idx = (const unsigned char * ) utf8_buf_p ;
46
+
47
+ while (idx < end )
38
48
{
39
- lit_utf8_byte_t c = utf8_buf_p [idx ++ ];
40
- if ((c & LIT_UTF8_1_BYTE_MASK ) == LIT_UTF8_1_BYTE_MARKER )
49
+ const uint8_t first_byte = * idx ++ ;
50
+
51
+ if (first_byte < LIT_UTF8_EXTRA_BYTE_MARKER )
41
52
{
42
- is_prev_code_point_high_surrogate = false;
43
53
continue ;
44
54
}
45
55
46
- lit_code_point_t code_point = 0 ;
47
- lit_code_point_t min_code_point = 0 ;
48
- lit_utf8_size_t extra_bytes_count ;
49
- if ((c & LIT_UTF8_2_BYTE_MASK ) == LIT_UTF8_2_BYTE_MARKER )
50
- {
51
- extra_bytes_count = 1 ;
52
- min_code_point = LIT_UTF8_2_BYTE_CODE_POINT_MIN ;
53
- code_point = ((uint32_t ) (c & LIT_UTF8_LAST_5_BITS_MASK ));
54
- }
55
- else if ((c & LIT_UTF8_3_BYTE_MASK ) == LIT_UTF8_3_BYTE_MARKER )
56
+ if (first_byte < LIT_UTF8_VALID_TWO_BYTE_START || idx >= end )
56
57
{
57
- extra_bytes_count = 2 ;
58
- min_code_point = LIT_UTF8_3_BYTE_CODE_POINT_MIN ;
59
- code_point = ((uint32_t ) (c & LIT_UTF8_LAST_4_BITS_MASK ));
60
- }
61
- else if ((c & LIT_UTF8_4_BYTE_MASK ) == LIT_UTF8_4_BYTE_MARKER )
62
- {
63
- extra_bytes_count = 3 ;
64
- min_code_point = LIT_UTF8_4_BYTE_CODE_POINT_MIN ;
65
- code_point = ((uint32_t ) (c & LIT_UTF8_LAST_3_BITS_MASK ));
66
- }
67
- else
68
- {
69
- /* utf-8 string could not contain 5- and 6-byte sequences. */
70
58
return false;
71
59
}
72
60
73
- if (idx + extra_bytes_count > buf_size )
61
+ const uint8_t second_byte = * idx ++ ;
62
+
63
+ if ((second_byte & LIT_UTF8_EXTRA_BYTE_MASK ) != LIT_UTF8_EXTRA_BYTE_MARKER )
74
64
{
75
- /* utf-8 string breaks in the middle */
76
65
return false;
77
66
}
78
67
79
- for ( lit_utf8_size_t offset = 0 ; offset < extra_bytes_count ; ++ offset )
68
+ if ( first_byte < LIT_UTF8_3_BYTE_MARKER )
80
69
{
81
- c = utf8_buf_p [idx + offset ];
82
- if ((c & LIT_UTF8_EXTRA_BYTE_MASK ) != LIT_UTF8_EXTRA_BYTE_MARKER )
83
- {
84
- /* invalid continuation byte */
85
- return false;
86
- }
87
- code_point <<= LIT_UTF8_BITS_IN_EXTRA_BYTES ;
88
- code_point |= (c & LIT_UTF8_LAST_6_BITS_MASK );
70
+ continue ;
89
71
}
90
72
91
- if (code_point < min_code_point
92
- || code_point > LIT_UNICODE_CODE_POINT_MAX )
73
+ if (idx >= end || (* idx ++ & LIT_UTF8_EXTRA_BYTE_MASK ) != LIT_UTF8_EXTRA_BYTE_MARKER )
93
74
{
94
- /* utf-8 string doesn't encode valid unicode code point */
95
75
return false;
96
76
}
97
77
98
- if (is_strict )
78
+ if (first_byte < LIT_UTF8_4_BYTE_MARKER )
99
79
{
100
- is_prev_code_point_high_surrogate = false;
101
-
102
- if (code_point >= LIT_UTF16_HIGH_SURROGATE_MIN
103
- && code_point <= LIT_UTF16_HIGH_SURROGATE_MAX )
80
+ if (first_byte == LIT_UTF8_3_BYTE_MARKER && (second_byte & LIT_UTF8_2_BYTE_MASK ) == LIT_UTF8_EXTRA_BYTE_MARKER )
104
81
{
105
- is_prev_code_point_high_surrogate = true ;
82
+ return false ;
106
83
}
107
- else if (code_point >= LIT_UTF16_LOW_SURROGATE_MIN
108
- && code_point <= LIT_UTF16_LOW_SURROGATE_MAX
109
- && is_prev_code_point_high_surrogate )
84
+
85
+ if (is_strict
86
+ && first_byte == LIT_UTF8_SURROGATE_MARKER
87
+ && second_byte >= LIT_UTF8_HIGH_SURROGATE_MIN
88
+ && second_byte <= LIT_UTF8_HIGH_SURROGATE_MAX
89
+ && idx + 3 <= end
90
+ && idx [0 ] == LIT_UTF8_SURROGATE_MARKER
91
+ && idx [1 ] >= LIT_UTF8_LOW_SURROGATE_MIN
92
+ && idx [1 ] <= LIT_UTF8_LOW_SURROGATE_MAX )
110
93
{
111
- /* sequence of high and low surrogate is not allowed */
112
94
return false;
113
95
}
96
+ continue ;
114
97
}
115
98
116
- idx += extra_bytes_count ;
99
+ if (idx >= end
100
+ || first_byte > LIT_UTF8_1_BYTE_MAX
101
+ || (first_byte == LIT_UTF8_4_BYTE_MARKER && second_byte <= LIT_UTF8_EXTRA_BYTE_MARKER )
102
+ || (first_byte == LIT_UTF8_1_BYTE_MAX && second_byte > LIT_UTF8_2_BYTE_MAX )
103
+ || (* idx ++ & LIT_UTF8_EXTRA_BYTE_MASK ) != LIT_UTF8_EXTRA_BYTE_MARKER )
104
+ {
105
+ return false;
106
+ }
117
107
}
118
108
119
109
return true;
0 commit comments