22* PCRE maintainers' helper program: UTF-8 converter *
33****************************************************/
44
5- /* This is a test program for converting character code points to UTF-8 and
5+ /* This is a test program for converting character codepoints to UTF-8 and
66vice versa. Note that this program conforms to the original definition of
77UTF-8, which allows codepoints up to 7fffffff. The more recent definition
8- limits the validity of Unicode UTF-8 codepoints to a maximum of 10ffffff , and
9- forbids the "surrogate" code points . This program now gives warnings for these
10- invalid code points .
8+ limits the validity of Unicode UTF-8 codepoints to a maximum of 10ffff , and
9+ forbids the "surrogate" codepoints . This program now gives warnings for these
10+ invalid codepoints .
1111
12- The arguments are either single code point values written as U+hh.. or 0xhh..
13- for conversion to UTF-8, or sequences of hex values, written without 0x and
14- optionally including spaces (but such arguments must be quoted), for conversion
15- from UTF-8 to codepoints. For example:
12+ The arguments are either single codepoint values written as U+hh.. or 0xhh..
13+ for conversion to UTF-8, or sequences of hex values, written without a prefix
14+ and optionally including spaces (but such arguments must be quoted), for
15+ decoding from UTF-8 code units to Unicode codepoints. For example:
1616
17- ./utf8 0x1234
18- U+00001234 => e1 88 b4
17+ ./utf8 0x1234
18+ U+00001234 => e1 88 b4
1919
20- ./utf8 "e1 88 b4"
21- U+00001234 <= e1 88 b4
20+ ./utf8 "e1 88 b4"
21+ U+00001234 <= e1 88 b4
2222
2323In the second case, a number of UTF-8 characters can be present in one
2424argument. In other words, each such argument is interpreted (after ignoring
2525spaces) as a string of UTF-8 bytes representing a string of characters:
2626
27- ./utf8 "65 e188b4 77"
28- 0x00000065 <= 65
29- 0x00001234 <= e1 88 b4
30- 0x00000077 <= 77
27+ ./utf8 "65 e188b4 77"
28+ 0x00000065 <= 65
29+ 0x00001234 <= e1 88 b4
30+ 0x00000077 <= 77
3131
3232If the option -s is given, the sequence of UTF-8 bytes is written out between
33- angle brackets at the end of the line. On a UTF-8 terminal, this will show the
34- appropriate graphic for the code point.
33+ angle brackets at the end of the line, if valid. On a UTF-8 terminal, this
34+ should show the appropriate graphic for the character or a question mark.
35+
36+ If the option -b is given, a file with the encoded bytes is written for use
37+ with pcre2test in utf8_input format.
3538
3639Errors provoke error messages, but the program carries on with the next
37- argument. The return code is always zero.
40+ argument. The return code is always zero unless there was nothing to process
41+ or an invalid option was provided and the "usage" was printed.
3842
3943Philip Hazel
40- Original creation data : unknown
44+ Original creation date : unknown
4145Code extended and tidied to avoid compiler warnings: 26 March 2020
46+ Support for encoding utf8_input: 31 August 2025
4247*/
4348
4449
4550#include <stdio.h>
4651#include <stdlib.h>
4752#include <ctype.h>
4853#include <string.h>
54+ #include <errno.h>
4955
5056/* The valid ranges for UTF-8 characters are:
5157
@@ -61,10 +67,10 @@ Code extended and tidied to avoid compiler warnings: 26 March 2020
6167static const unsigned int utf8_table1 [] = {
6268 0x0000007f , 0x000007ff , 0x0000ffff , 0x001fffff , 0x03ffffff , 0x7fffffff };
6369
64- static const int utf8_table2 [] = {
70+ static const unsigned char utf8_table2 [] = {
6571 0 , 0xc0 , 0xe0 , 0xf0 , 0xf8 , 0xfc };
6672
67- static const int utf8_table3 [] = {
73+ static const unsigned char utf8_table3 [] = {
6874 0xff , 0x1f , 0x0f , 0x07 , 0x03 , 0x01 };
6975
7076
@@ -80,7 +86,7 @@ static const int utf8_table3[] = {
8086 buffer pointer to buffer for result - at least 6 bytes long
8187
8288Returns: number of bytes placed in the buffer
83- 0 if input code point is too big
89+ 0 if input codepoint is too big
8490*/
8591
8692static size_t
@@ -96,7 +102,7 @@ for (j = i; j > 0; j--)
96102 * buffer -- = 0x80 | (cvalue & 0x3f );
97103 cvalue >>= 6 ;
98104 }
99- * buffer = utf8_table2 [i ] | cvalue ;
105+ * buffer = ( unsigned char )( utf8_table2 [i ] | cvalue ) ;
100106return i + 1 ;
101107}
102108
@@ -126,7 +132,7 @@ Returns: > 0 => the number of bytes consumed
126132*/
127133
128134static int
129- utf82ord (unsigned char * buffer , unsigned char * buffend ,
135+ utf82ord (const unsigned char * buffer , const unsigned char * buffend ,
130136 long unsigned int * vptr , int * lenptr )
131137{
132138unsigned int c = * buffer ++ ;
@@ -162,7 +168,7 @@ switch (i)
162168
163169/* i now has a value in the range 1-5 */
164170
165- s = 6 * i ;
171+ s = 6 * i ;
166172d = (c & utf8_table3 [i ]) << s ;
167173
168174for (j = 0 ; j < i ; j ++ )
@@ -201,7 +207,26 @@ if (j != i)
201207return i + 1 ;
202208}
203209
210+ /**********************************************
211+ * Usage *
212+ **********************************************/
204213
214+ static void
215+ usage (const char * argv0 )
216+ {
217+ printf ("%s [option ..] argument ..\n\n" , argv0 );
218+ puts ("Encode/decode Unicode codepoints with UTF-8 code units\n" );
219+ puts ("The arguments are either single codepoint values written as U+hh.." );
220+ puts ("or 0xhh.. for conversion to UTF-8, or sequences of hex values," );
221+ puts ("written without a prefix and optionally including spaces (but such" );
222+ puts ("arguments must be quoted), for encoding from UTF-8 code units to" );
223+ puts ("Unicode codepoints." );
224+ puts ("For details on usage and examples read the comments in source code.\n" );
225+ puts ("Options:\n" );
226+ puts (" -h|--help\tthis help" );
227+ puts (" -s\t\tprint character" );
228+ puts (" -b[=<file>]\twrite encoded data to file (default: testinput11)\n" );
229+ }
205230
206231/*************************************************
207232* Main Program *
@@ -213,52 +238,109 @@ main(int argc, char **argv)
213238int i = 1 ;
214239int show = 0 ;
215240unsigned char buffer [64 ];
241+ const char * argv0 = "utf8" ;
242+ FILE * f = NULL ;
243+
244+ for (int c = argc ; c -- > 1 ; i ++ )
245+ {
246+ const char * x = argv [i ];
247+
248+ if (* x ++ != '-' ) break ;
249+ if (* x == '-' && * ++ x == 0 )
250+ {
251+ i ++ ;
252+ break ;
253+ }
254+ switch (* x ++ )
255+ {
256+ case 's' : show = 1 ; break ;
257+ case 'b' :
258+ {
259+ const char * output = "testinput11" ;
260+ if (* x ++ == '=' && * x != 0 ) output = x ;
261+ f = fopen (output , "wb" );
262+ }
263+ break ;
264+ default :
265+ {
266+ const char last_option = x [-1 ];
267+ argv0 = argv [0 ];
268+ usage (argv0 );
269+ return (last_option != 'h' );
270+ }
271+ }
272+ }
216273
217- if (argc > 1 && strcmp ( argv [ 1 ], "-s" ) == 0 )
274+ if (i >= argc )
218275 {
219- show = 1 ;
220- i = 2 ;
276+ usage ( argv0 ) ;
277+ return 1 ;
221278 }
222279
223280for (; i < argc ; i ++ )
224281 {
225- char * x = argv [i ];
226- char * endptr ;
227- if (strncmp (x , "0x" , 2 ) == 0 || strncmp (x , "U+" , 2 ) == 0 )
282+ const char * x = argv [i ];
283+
284+ if (strlen (x ) >= 3 &&
285+ (strncmp (x , "0x" , 2 ) == 0 || strncmp (x , "U+" , 2 ) == 0 ) &&
286+ isxdigit (x [2 ]))
228287 {
229- size_t rc , j ;
230- unsigned long int d = strtoul (x + 2 , & endptr , 16 );
231- if (* endptr != 0 )
288+ size_t rc ;
289+ unsigned long d ;
290+ char * endptr ;
291+ int utf8_input = 0 ;
292+
293+ errno = 0 ;
294+ d = strtoul (x + 2 , & endptr , 16 );
295+ if (errno != 0 || * endptr != 0 )
232296 {
233297 printf ("** Invalid hex number %s\n" , x );
234298 continue ; /* With next argument */
235299 }
300+ if (d > 0xffffffff )
301+ {
302+ puts ("** Code points must fit an uint32_t" );
303+ continue ;
304+ }
305+ else if (f != NULL && d > 0x7fffffff )
306+ {
307+ buffer [0 ] = 0xff ;
308+ fwrite (buffer , 1 , 1 , f );
309+ utf8_input = 1 ;
310+ d &= 0x7fffffff ;
311+ }
312+
236313 rc = ord2utf8 (d , buffer );
237314 printf ("U+%08lx => " , d );
238315 if (rc == 0 )
239- printf ("** Code point greater than 0x7fffffff cannot be encoded" );
316+ fputs ("** -b needed for codepoints greater than 0x7fffffff" , stdout );
240317 else
241318 {
319+ size_t j ;
320+
242321 for (j = 0 ; j < rc ; j ++ ) printf ("%02x " , buffer [j ]);
243- if (show )
322+ if (f != NULL ) fwrite (buffer , rc , 1 , f );
323+ if (utf8_input )
324+ fputs ("** Not valid UTF-8, top bit set" , stdout );
325+ else if (d > 0x10ffff )
326+ fputs ("** Invalid Unicode (greater than U+10ffff)" , stdout );
327+ else if (0xd800 <= d && d <= 0xdfff )
328+ fputs ("** Invalid Unicode (UTF-16 surrogate)" , stdout );
329+ else if (show )
244330 {
245- printf ( ">" );
331+ putchar ( '>' );
246332 for (j = 0 ; j < rc ; j ++ ) printf ("%c" , buffer [j ]);
247- printf ( "< " );
333+ putchar ( '<' );
248334 }
249- if (d >= 0xd800 && d <= 0xdfff )
250- printf ("** Invalid Unicode (surrogate)" );
251- else if (d > 0x10ffff )
252- printf ("** Invalid Unicode (greater than U+10ffff)" );
253335 }
254- printf ( "\n" );
336+ putchar ( '\n' );
255337 }
256338 else
257339 {
258340 unsigned char * bptr ;
259- unsigned char * buffend ;
341+ const unsigned char * buffend ;
342+ unsigned char y = 0 ;
260343 int len = 0 ;
261- int y = 0 ;
262344 int z = 0 ;
263345
264346 for (;;)
@@ -271,7 +353,8 @@ for (; i < argc; i++)
271353 len = -1 ;
272354 break ;
273355 }
274- y = y * 16 + (tolower (* x ) - ((isdigit (* x ))? '0' : 'W' ));
356+ y = y * 16 +
357+ (unsigned char )(tolower (* x ) - ((isdigit (* x ))? '0' : 'a' - 10 ));
275358 x ++ ;
276359 if (z )
277360 {
@@ -297,50 +380,53 @@ for (; i < argc; i++)
297380 {
298381 printf ("U+%08lx <= " , d );
299382 for (j = 0 ; j < rc ; j ++ ) printf ("%02x " , bptr [j ]);
300- if (show )
383+ if (d <= 0x10ffff && ( d < 0xd800 || 0xdfff < d ) && show )
301384 {
302- printf ( ">" );
385+ putchar ( '>' );
303386 for (j = 0 ; j < rc ; j ++ ) printf ("%c" , bptr [j ]);
304- printf ( "<" );
387+ putchar ( '<' );
305388 }
306- printf ( "\n" );
389+ putchar ( '\n' );
307390 bptr += rc ;
308391 }
309392 else if (rc == -4 )
310393 {
311394 printf ("U+%08lx <= " , d );
312395 for (j = 0 ; j < offset ; j ++ ) printf ("%02x " , bptr [j ]);
313- printf ("** Overlong UTF-8 sequence\n " );
396+ puts ("** Overlong UTF-8 sequence" );
314397 bptr += offset ;
315398 }
316399 else
317400 {
318401 switch (rc )
319402 {
320- case 0 : printf ("** First byte missing 0x40 bit" );
403+ case 0 : fputs ("** First byte missing 0x40 bit" , stdout );
321404 break ;
322405
323- case -1 : printf ("** First byte has too many high-order bits" );
406+ case -1 : fputs ("** First byte has too many high-order bits" , stdout );
324407 break ;
325408
326- case -2 : printf ("** Incomplete UTF-8 sequence at end of string" );
409+ case -2 : fputs ("** Incomplete UTF-8 sequence at end of string" ,
410+ stdout );
327411 break ;
328412
329- case -3 : printf ("** Incomplete UTF-8 sequence" );
413+ case -3 : fputs ("** Incomplete UTF-8 sequence" , stdout );
330414 break ;
331415
332416 default : printf ("** Unexpected return %d from utf82ord()" , rc );
333417 break ;
334418 }
335419 printf (" at offset %d in string " , offset );
336420 while (bptr < buffend ) printf ("%02x " , * bptr ++ );
337- printf ( "\n" );
421+ putchar ( '\n' );
338422 break ;
339423 }
340424 }
341425 }
342426 }
343427
428+ if (f != NULL ) fclose (f );
429+
344430return 0 ;
345431}
346432
0 commit comments