@@ -96,6 +96,73 @@ static void raise_generator_error(VALUE invalid_object, const char *fmt, ...)
9696 raise_generator_error_str (invalid_object , str );
9797}
9898
99+ // 0 - single-byte char that doesn't need to be escaped.
100+ // (x | 8) - char that needs to be escaped; x (the low 3 bits) is its length in bytes.
101+ static const unsigned char CHAR_LENGTH_MASK = 7 ; // keeps the low 3 length bits, dropping the escape flag (8)
102+
103+ static const unsigned char escape_table [256 ] = {
104+ // ASCII control characters (0x00-0x1F): all flagged for escaping (9 = 1 | 8)
105+ 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 ,
106+ 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 ,
107+ // ASCII characters (0x20-0x7F): only '"' and '\\' are flagged
108+ 0 , 0 , 9 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , // '"' (0x22)
109+ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
110+ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
111+ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 9 , 0 , 0 , 0 , // '\\' (0x5C)
112+ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
113+ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
114+ }; // only the first 128 entries are listed; entries 0x80-0xFF are implicitly zero-initialized
115+
116+ static const unsigned char ascii_only_escape_table [256 ] = {
117+ // ASCII control characters (0x00-0x1F): all flagged for escaping (9 = 1 | 8)
118+ 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 ,
119+ 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 ,
120+ // ASCII characters (0x20-0x7F): only '"' and '\\' are flagged
121+ 0 , 0 , 9 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , // '"' (0x22)
122+ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
123+ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
124+ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 9 , 0 , 0 , 0 , // '\\' (0x5C)
125+ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
126+ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
127+ // Continuation bytes (0x80-0xBF)
128+ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
129+ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
130+ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
131+ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
132+ // First byte of a 2-byte code point (0xC0-0xDF)
133+ 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 ,
134+ 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 ,
135+ // First byte of a 3-byte code point (0xE0-0xEF)
136+ 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 ,
137+ // First byte of a 4+ byte code point (0xF0-0xFD); 0xFE and 0xFF hold 9 (1 | 8)
138+ 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 5 , 5 , 5 , 5 , 6 , 6 , 9 , 9 ,
139+ };
140+
141+ static const unsigned char script_safe_escape_table [256 ] = {
142+ // ASCII control characters (0x00-0x1F): all flagged for escaping (9 = 1 | 8)
143+ 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 ,
144+ 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 ,
145+ // ASCII characters (0x20-0x7F): '"', '/' and '\\' are flagged
146+ 0 , 0 , 9 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 9 , // '"' (0x22) and '/' (0x2F)
147+ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
148+ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
149+ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 9 , 0 , 0 , 0 , // '\\' (0x5C)
150+ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
151+ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
152+ // Continuation bytes (0x80-0xBF)
153+ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
154+ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
155+ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
156+ 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
157+ // First byte of a 2-byte code point (0xC0-0xDF)
158+ 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 ,
159+ 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 ,
160+ // First byte of a 3-byte code point (0xE0-0xEF)
161+ 3 , 3 ,11 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , // 0xE2 (11 = 3 | 8) is the start of \u2028 and \u2029
162+ // First byte of a 4+ byte code point (0xF0-0xFD); 0xFE and 0xFF hold 9 (1 | 8)
163+ 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 5 , 5 , 5 , 5 , 6 , 6 , 9 , 9 ,
164+ };
165+
99166/* Converts in_string to a JSON string (without the wrapping '"'
100167 * characters) in FBuffer out_buffer.
101168 *
@@ -106,13 +173,13 @@ static void raise_generator_error(VALUE invalid_object, const char *fmt, ...)
106173 *
107174 * - If out_ascii_only: non-ASCII characters (>0x7F)
108175 *
109- * - If out_script_safe : forwardslash, line separator (U+2028), and
176+ * - If script_safe : forwardslash (/) , line separator (U+2028), and
110177 * paragraph separator (U+2029)
111178 *
112179 * Everything else (should be UTF-8) is just passed through and
113180 * appended to the result.
114181 */
115- static void convert_UTF8_to_JSON (FBuffer * out_buffer , VALUE str , const char escape_table [256 ], bool out_script_safe )
182+ static inline void convert_UTF8_to_JSON (FBuffer * out_buffer , VALUE str , const unsigned char escape_table [256 ])
116183{
117184 const char * hexdig = "0123456789abcdef" ;
118185 char scratch [12 ] = { '\\' , 'u' , 0 , 0 , 0 , 0 , '\\' , 'u' };
@@ -131,7 +198,7 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const char esca
131198
132199 if (RB_UNLIKELY (ch_len )) {
133200 switch (ch_len ) {
134- case 1 : {
201+ case 9 : {
135202 FLUSH_POS (1 );
136203 switch (ch ) {
137204 case '"' : fbuffer_append (out_buffer , "\\\"" , 2 ); break ;
@@ -153,9 +220,9 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const char esca
153220 }
154221 break ;
155222 }
156- case 3 : {
223+ case 11 : {
157224 unsigned char b2 = ptr [pos + 1 ];
158- if (RB_UNLIKELY (out_script_safe && ch == 0xE2 && b2 == 0x80 )) {
225+ if (RB_UNLIKELY (b2 == 0x80 )) {
159226 unsigned char b3 = ptr [pos + 2 ];
160227 if (b3 == 0xA8 ) {
161228 FLUSH_POS (3 );
@@ -167,6 +234,7 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const char esca
167234 break ;
168235 }
169236 }
237+ ch_len = 3 ;
170238 // fallthrough
171239 }
172240 default :
@@ -186,104 +254,7 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const char esca
186254 RB_GC_GUARD (str );
187255}
188256
189- static const char escape_table [256 ] = {
190- // ASCII Control Characters
191- 1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,
192- 1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,
193- // ASCII Characters
194- 0 ,0 ,1 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 , // '"'
195- 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,
196- 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,
197- 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,1 ,0 ,0 ,0 , // '\\'
198- 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,
199- 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,
200- // Continuation byte
201- 1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,
202- 1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,
203- 1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,
204- 1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,
205- // First byte of a 2-byte code point
206- 2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,
207- 2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,
208- // First byte of a 4-byte code point
209- 3 ,3 ,3 ,3 ,3 ,3 ,3 ,3 ,3 ,3 ,3 ,3 ,3 ,3 ,3 ,3 ,
210- //First byte of a 4+byte code point
211- 4 ,4 ,4 ,4 ,4 ,4 ,4 ,4 ,5 ,5 ,5 ,5 ,6 ,6 ,1 ,1 ,
212- };
213-
214- static const char script_safe_escape_table [256 ] = {
215- // ASCII Control Characters
216- 1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,
217- 1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,
218- // ASCII Characters
219- 0 ,0 ,1 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,1 , // '"' and '/'
220- 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,
221- 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,
222- 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,1 ,0 ,0 ,0 , // '\\'
223- 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,
224- 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,
225- // Continuation byte
226- 1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,
227- 1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,
228- 1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,
229- 1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,
230- // First byte of a 2-byte code point
231- 2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,
232- 2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,
233- // First byte of a 4-byte code point
234- 3 ,3 ,3 ,3 ,3 ,3 ,3 ,3 ,3 ,3 ,3 ,3 ,3 ,3 ,3 ,3 ,
235- //First byte of a 4+byte code point
236- 4 ,4 ,4 ,4 ,4 ,4 ,4 ,4 ,5 ,5 ,5 ,5 ,6 ,6 ,1 ,1 ,
237- };
238-
239- static void convert_ASCII_to_JSON (FBuffer * out_buffer , VALUE str , const char escape_table [256 ])
240- {
241- const char * hexdig = "0123456789abcdef" ;
242- char scratch [12 ] = { '\\' , 'u' , 0 , 0 , 0 , 0 , '\\' , 'u' };
243-
244- const char * ptr = RSTRING_PTR (str );
245- unsigned long len = RSTRING_LEN (str );
246-
247- unsigned long beg = 0 , pos ;
248-
249- for (pos = 0 ; pos < len ;) {
250- unsigned char ch = ptr [pos ];
251- /* JSON encoding */
252- if (escape_table [ch ]) {
253- if (pos > beg ) {
254- fbuffer_append (out_buffer , & ptr [beg ], pos - beg );
255- }
256-
257- beg = pos + 1 ;
258- switch (ch ) {
259- case '"' : fbuffer_append (out_buffer , "\\\"" , 2 ); break ;
260- case '\\' : fbuffer_append (out_buffer , "\\\\" , 2 ); break ;
261- case '/' : fbuffer_append (out_buffer , "\\/" , 2 ); break ;
262- case '\b' : fbuffer_append (out_buffer , "\\b" , 2 ); break ;
263- case '\f' : fbuffer_append (out_buffer , "\\f" , 2 ); break ;
264- case '\n' : fbuffer_append (out_buffer , "\\n" , 2 ); break ;
265- case '\r' : fbuffer_append (out_buffer , "\\r" , 2 ); break ;
266- case '\t' : fbuffer_append (out_buffer , "\\t" , 2 ); break ;
267- default :
268- scratch [2 ] = '0' ;
269- scratch [3 ] = '0' ;
270- scratch [4 ] = hexdig [(ch >> 4 ) & 0xf ];
271- scratch [5 ] = hexdig [ch & 0xf ];
272- fbuffer_append (out_buffer , scratch , 6 );
273- }
274- }
275-
276- pos ++ ;
277- }
278-
279- if (beg < len ) {
280- fbuffer_append (out_buffer , & ptr [beg ], len - beg );
281- }
282-
283- RB_GC_GUARD (str );
284- }
285-
286- static void convert_UTF8_to_ASCII_only_JSON (FBuffer * out_buffer , VALUE str , const char escape_table [256 ], bool out_script_safe )
257+ static void convert_UTF8_to_ASCII_only_JSON (FBuffer * out_buffer , VALUE str , const unsigned char escape_table [256 ])
287258{
288259 const char * hexdig = "0123456789abcdef" ;
289260 char scratch [12 ] = { '\\' , 'u' , 0 , 0 , 0 , 0 , '\\' , 'u' };
@@ -301,7 +272,7 @@ static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE str, cons
301272
302273 if (RB_UNLIKELY (ch_len )) {
303274 switch (ch_len ) {
304- case 1 : {
275+ case 9 : {
305276 FLUSH_POS (1 );
306277 switch (ch ) {
307278 case '"' : fbuffer_append (out_buffer , "\\\"" , 2 ); break ;
@@ -325,6 +296,8 @@ static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE str, cons
325296 }
326297 default : {
327298 uint32_t wchar = 0 ;
299+ ch_len = ch_len & CHAR_LENGTH_MASK ;
300+
328301 switch (ch_len ) {
329302 case 2 :
330303 wchar = ptr [pos ] & 0x1F ;
@@ -935,13 +908,11 @@ static void generate_json_string(FBuffer *buffer, struct generate_json_data *dat
935908
936909 switch (rb_enc_str_coderange (obj )) {
937910 case ENC_CODERANGE_7BIT :
938- convert_ASCII_to_JSON (buffer , obj , state -> script_safe ? script_safe_escape_table : escape_table );
939- break ;
940911 case ENC_CODERANGE_VALID :
941912 if (RB_UNLIKELY (state -> ascii_only )) {
942- convert_UTF8_to_ASCII_only_JSON (buffer , obj , state -> script_safe ? script_safe_escape_table : escape_table , state -> script_safe );
913+ convert_UTF8_to_ASCII_only_JSON (buffer , obj , state -> script_safe ? script_safe_escape_table : ascii_only_escape_table );
943914 } else {
944- convert_UTF8_to_JSON (buffer , obj , state -> script_safe ? script_safe_escape_table : escape_table , state -> script_safe );
915+ convert_UTF8_to_JSON (buffer , obj , state -> script_safe ? script_safe_escape_table : escape_table );
945916 }
946917 break ;
947918 default :
0 commit comments