@@ -105,225 +105,101 @@ int swap_int32(int x) {
105105 */
106106char * utf8_unicode_inplace_ex (apr_pool_t * mp , unsigned char * input , long int input_len , int * changed ) {
107107 int unicode_len = 0 , length = 0 ;
108- unsigned int d = 0 , count = 0 ;
108+ unsigned int d = 0 ;
109109 unsigned char c , * utf ;
110110 char * rval , * data ;
111111 unsigned int i , len , j ;
112112 unsigned int bytes_left = input_len ;
113113 unsigned char * unicode = NULL ;
114114
115+ assert (input != NULL );
116+
115117 * changed = 0 ;
116118 /* RFC3629 states that UTF-8 are encoded using sequences of 1 to 4 octets. */
117- /* Max size per character should fit in 4 bytes */
118- len = input_len * 4 + 1 ;
119+ /* Max size per character should fit in 4 bytes (%u01020304) */
120+ len = input_len * 10 + 1 ;
119121 data = rval = apr_palloc (mp , len );
120122 if (rval == NULL ) return NULL ;
121123
122-
123- if (input == NULL ) return NULL ;
124-
125- for (i = 0 ; i < bytes_left ;) {
124+ for (i = 0 ; i < bytes_left ;) {
126125 unicode_len = 0 ; d = 0 ;
127126 utf = (unsigned char * )& input [i ];
128-
129127 c = * utf ;
130128
131- /* If first byte begins with binary 0 it is single byte encoding */
129+ /* If first byte begins with binary 0 it may be single byte encoding */
132130 if ((c & 0x80 ) == 0 ) {
133- /* single byte unicode (7 bit ASCII equivilent) has no validation */
134- count ++ ;
135- if (count <= len ) {
136- if (c == 0 )
137- * data = x2c (& c );
138- else
139- * data ++ = c ;
131+ if (c == 0 ) {
132+ unicode_len = 2 ;
133+ d = utf [1 ];
140134 }
141-
142135 }
143136 /* If first byte begins with binary 110 it is two byte encoding*/
144137 else if ((c & 0xE0 ) == 0xC0 ) {
145138 /* check we have at least two bytes */
146139 if (bytes_left < 2 ) unicode_len = UNICODE_ERROR_CHARACTERS_MISSING ;
147140 /* check second byte starts with binary 10 */
148- else if ((( * ( utf + 1 )) & 0xC0 ) != 0x80 ) unicode_len = UNICODE_ERROR_INVALID_ENCODING ;
141+ else if ((utf [ 1 ] & 0xC0 ) != 0x80 ) unicode_len = UNICODE_ERROR_INVALID_ENCODING ;
149142 else {
150143 unicode_len = 2 ;
151- count += 6 ;
152- if (count <= len ) {
153- /* compute character number */
154- d = ((c & 0x1F ) << 6 ) | (* (utf + 1 ) & 0x3F );
155- * data ++ = '%' ;
156- * data ++ = 'u' ;
157- unicode = apr_psprintf (mp , "%x" , d );
158- length = strlen (unicode );
159-
160- switch (length ) {
161- case 1 :
162- * data ++ = '0' ;
163- * data ++ = '0' ;
164- * data ++ = '0' ;
165- break ;
166- case 2 :
167- * data ++ = '0' ;
168- * data ++ = '0' ;
169- break ;
170- case 3 :
171- * data ++ = '0' ;
172- break ;
173- case 4 :
174- case 5 :
175- break ;
176- }
177-
178- for (j = 0 ; j < length ; j ++ ) {
179- * data ++ = unicode [j ];
180- }
181-
182- * changed = 1 ;
183- }
144+ /* compute character number */
145+ d = ((c & 0x1F ) << 6 ) | (utf [1 ] & 0x3F );
184146 }
185147 }
186148 /* If first byte begins with binary 1110 it is three byte encoding */
187149 else if ((c & 0xF0 ) == 0xE0 ) {
188150 /* check we have at least three bytes */
189151 if (bytes_left < 3 ) unicode_len = UNICODE_ERROR_CHARACTERS_MISSING ;
190152 /* check second byte starts with binary 10 */
191- else if ((( * ( utf + 1 )) & 0xC0 ) != 0x80 ) unicode_len = UNICODE_ERROR_INVALID_ENCODING ;
153+ else if ((utf [ 1 ] & 0xC0 ) != 0x80 ) unicode_len = UNICODE_ERROR_INVALID_ENCODING ;
192154 /* check third byte starts with binary 10 */
193155 else if (((* (utf + 2 )) & 0xC0 ) != 0x80 ) unicode_len = UNICODE_ERROR_INVALID_ENCODING ;
194156 else {
195157 unicode_len = 3 ;
196- count += 6 ;
197- if (count <= len ) {
198- /* compute character number */
199- d = ((c & 0x0F ) << 12 ) | ((* (utf + 1 ) & 0x3F ) << 6 ) | (* (utf + 2 ) & 0x3F );
200- * data ++ = '%' ;
201- * data ++ = 'u' ;
202- unicode = apr_psprintf (mp , "%x" , d );
203- length = strlen (unicode );
204-
205- switch (length ) {
206- case 1 :
207- * data ++ = '0' ;
208- * data ++ = '0' ;
209- * data ++ = '0' ;
210- break ;
211- case 2 :
212- * data ++ = '0' ;
213- * data ++ = '0' ;
214- break ;
215- case 3 :
216- * data ++ = '0' ;
217- break ;
218- case 4 :
219- case 5 :
220- break ;
221- }
222-
223- for (j = 0 ; j < length ; j ++ ) {
224- * data ++ = unicode [j ];
225- }
226-
227- * changed = 1 ;
228-
229- }
158+ /* compute character number */
159+ d = ((c & 0x0F ) << 12 ) | ((utf [1 ] & 0x3F ) << 6 ) | (* (utf + 2 ) & 0x3F );
230160 }
231161 }
232162 /* If first byte begins with binary 11110 it is four byte encoding */
233163 else if ((c & 0xF8 ) == 0xF0 ) {
234164 /* restrict characters to UTF-8 range (U+0000 - U+10FFFF)*/
235- if (c >= 0xF5 ) {
236- * data ++ = c ;
237- }
165+ if (c >= 0xF5 ) unicode_len = UNICODE_ERROR_RESTRICTED_CHARACTER ;
238166 /* check we have at least four bytes */
239- if (bytes_left < 4 ) unicode_len = UNICODE_ERROR_CHARACTERS_MISSING ;
167+ else if (bytes_left < 4 ) unicode_len = UNICODE_ERROR_CHARACTERS_MISSING ;
240168 /* check second byte starts with binary 10 */
241- else if ((( * ( utf + 1 )) & 0xC0 ) != 0x80 ) unicode_len = UNICODE_ERROR_INVALID_ENCODING ;
169+ else if ((utf [ 1 ] & 0xC0 ) != 0x80 ) unicode_len = UNICODE_ERROR_INVALID_ENCODING ;
242170 /* check third byte starts with binary 10 */
243171 else if (((* (utf + 2 )) & 0xC0 ) != 0x80 ) unicode_len = UNICODE_ERROR_INVALID_ENCODING ;
244172 /* check forth byte starts with binary 10 */
245173 else if (((* (utf + 3 )) & 0xC0 ) != 0x80 ) unicode_len = UNICODE_ERROR_INVALID_ENCODING ;
246174 else {
247175 unicode_len = 4 ;
248- count += 7 ;
249- if (count <= len ) {
250- /* compute character number */
251- d = ((c & 0x07 ) << 18 ) | ((* (utf + 1 ) & 0x3F ) << 12 ) | ((* (utf + 2 ) & 0x3F ) << 6 ) | (* (utf + 3 ) & 0x3F );
252- * data ++ = '%' ;
253- * data ++ = 'u' ;
254- unicode = apr_psprintf (mp , "%x" , d );
255- length = strlen (unicode );
256-
257- switch (length ) {
258- case 1 :
259- * data ++ = '0' ;
260- * data ++ = '0' ;
261- * data ++ = '0' ;
262- break ;
263- case 2 :
264- * data ++ = '0' ;
265- * data ++ = '0' ;
266- break ;
267- case 3 :
268- * data ++ = '0' ;
269- break ;
270- case 4 :
271- case 5 :
272- break ;
273- }
274-
275- for (j = 0 ; j < length ; j ++ ) {
276- * data ++ = unicode [j ];
277- }
278-
279- * changed = 1 ;
280-
281- }
176+ /* compute character number */
177+ d = ((c & 0x07 ) << 18 ) | ((utf [1 ] & 0x3F ) << 12 ) | ((* (utf + 2 ) & 0x3F ) << 6 ) | (* (utf + 3 ) & 0x3F );
282178 }
283179 }
284- /* any other first byte is invalid (RFC 3629) */
285- else {
286- count ++ ;
287- if (count <= len )
288- * data ++ = c ;
289- }
290-
291180 /* invalid UTF-8 character number range (RFC 3629) */
292- if ((d >= 0xD800 ) && (d <= 0xDFFF )) {
293- count ++ ;
294- if (count <= len )
295- * data ++ = c ;
296- }
297-
181+ if ((d >= 0xD800 ) && (d <= 0xDFFF )) unicode_len = UNICODE_ERROR_RESTRICTED_CHARACTER ;
298182 /* check for overlong */
299- if ((unicode_len == 4 ) && (d < 0x010000 )) {
300- /* four byte could be represented with less bytes */
301- count ++ ;
302- if (count <= len )
303- * data ++ = c ;
304- }
305- else if ((unicode_len == 3 ) && (d < 0x0800 )) {
306- /* three byte could be represented with less bytes */
307- count ++ ;
308- if (count <= len )
309- * data ++ = c ;
310- }
311- else if ((unicode_len == 2 ) && (d < 0x80 )) {
312- /* two byte could be represented with less bytes */
313- count ++ ;
314- if (count <= len )
315- * data ++ = c ;
316- }
183+ if ((unicode_len == 4 ) && (d < 0x010000 )) unicode_len = UNICODE_ERROR_OVERLONG_CHARACTER ;
184+ /* three byte could be represented with less bytes */
185+ if ((unicode_len == 3 ) && (d < 0x0800 )) unicode_len = UNICODE_ERROR_OVERLONG_CHARACTER ;
186+ /* two byte could be represented with less bytes */
187+ if ((unicode_len == 2 ) && (d < 0x80 )) unicode_len = UNICODE_ERROR_OVERLONG_CHARACTER ;
317188
318- if (unicode_len > 0 ) {
189+ if (unicode_len > 0 ) {
319190 i += unicode_len ;
320- } else {
191+ sprintf (data , "%%u%04x" , d );
192+ data += 6 ;
193+ * changed = 1 ;
194+ }
195+ else {
196+ /* any other first byte is invalid (RFC 3629), so assume it's an ASCII character */
197+ * data ++ = c ;
321198 i ++ ;
322199 }
323200 }
324201
325- * data = '\0' ;
326-
202+ * data = '\0' ;
327203 return rval ;
328204}
329205
0 commit comments