Skip to content

Commit fecd1a4

Browse files
authored
maint: teach utf8 how to encode utf8_input (#786)
While at it, add a usage() function, tidy up some code, and make sure that pcre2test only does the decoding when needed.
1 parent 498bef0 commit fecd1a4

File tree

3 files changed

+152
-59
lines changed

3 files changed

+152
-59
lines changed

maint/.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
ucptest
22
utf8
3+
utf8.*
34

45
pcre2_ucp.h
56
pcre2_ucptables_inc.h
67
pcre2_ucd.c
78

89
testinput
10+
testinput11
911
testoutput
1012

1113
!build-interface

maint/utf8.c

Lines changed: 143 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -2,50 +2,56 @@
22
* PCRE maintainers' helper program: UTF-8 converter *
33
****************************************************/
44

5-
/* This is a test program for converting character code points to UTF-8 and
5+
/* This is a test program for converting character codepoints to UTF-8 and
66
vice versa. Note that this program conforms to the original definition of
77
UTF-8, which allows codepoints up to 7fffffff. The more recent definition
8-
limits the validity of Unicode UTF-8 codepoints to a maximum of 10ffffff, and
9-
forbids the "surrogate" code points. This program now gives warnings for these
10-
invalid code points.
8+
limits the validity of Unicode UTF-8 codepoints to a maximum of 10ffff, and
9+
forbids the "surrogate" codepoints. This program now gives warnings for these
10+
invalid codepoints.
1111
12-
The arguments are either single code point values written as U+hh.. or 0xhh..
13-
for conversion to UTF-8, or sequences of hex values, written without 0x and
14-
optionally including spaces (but such arguments must be quoted), for conversion
15-
from UTF-8 to codepoints. For example:
12+
The arguments are either single codepoint values written as U+hh.. or 0xhh..
13+
for conversion to UTF-8, or sequences of hex values, written without a prefix
14+
and optionally including spaces (but such arguments must be quoted), for
15+
decoding from UTF-8 code units to Unicode codepoints. For example:
1616
17-
./utf8 0x1234
18-
U+00001234 => e1 88 b4
17+
./utf8 0x1234
18+
U+00001234 => e1 88 b4
1919
20-
./utf8 "e1 88 b4"
21-
U+00001234 <= e1 88 b4
20+
./utf8 "e1 88 b4"
21+
U+00001234 <= e1 88 b4
2222
2323
In the second case, a number of UTF-8 characters can be present in one
2424
argument. In other words, each such argument is interpreted (after ignoring
2525
spaces) as a string of UTF-8 bytes representing a string of characters:
2626
27-
./utf8 "65 e188b4 77"
28-
0x00000065 <= 65
29-
0x00001234 <= e1 88 b4
30-
0x00000077 <= 77
27+
./utf8 "65 e188b4 77"
28+
0x00000065 <= 65
29+
0x00001234 <= e1 88 b4
30+
0x00000077 <= 77
3131
3232
If the option -s is given, the sequence of UTF-8 bytes is written out between
33-
angle brackets at the end of the line. On a UTF-8 terminal, this will show the
34-
appropriate graphic for the code point.
33+
angle brackets at the end of the line, if valid. On a UTF-8 terminal, this
34+
should show the appropriate graphic for the character or a question mark.
35+
36+
If the option -b is given, a file with the encoded bytes is written for use
37+
with pcre2test in utf8_input format.
3538
3639
Errors provoke error messages, but the program carries on with the next
37-
argument. The return code is always zero.
40+
argument. The return code is always zero unless there was nothing to process
41+
or an invalid option was provided and the "usage" was printed.
3842
3943
Philip Hazel
40-
Original creation data: unknown
44+
Original creation date: unknown
4145
Code extended and tidied to avoid compiler warnings: 26 March 2020
46+
Support for encoding utf8_input: 31 August 2025
4247
*/
4348

4449

4550
#include <stdio.h>
4651
#include <stdlib.h>
4752
#include <ctype.h>
4853
#include <string.h>
54+
#include <errno.h>
4955

5056
/* The valid ranges for UTF-8 characters are:
5157
@@ -61,10 +67,10 @@ Code extended and tidied to avoid compiler warnings: 26 March 2020
6167
static const unsigned int utf8_table1[] = {
6268
0x0000007f, 0x000007ff, 0x0000ffff, 0x001fffff, 0x03ffffff, 0x7fffffff};
6369

64-
static const int utf8_table2[] = {
70+
static const unsigned char utf8_table2[] = {
6571
0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
6672

67-
static const int utf8_table3[] = {
73+
static const unsigned char utf8_table3[] = {
6874
0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
6975

7076

@@ -80,7 +86,7 @@ static const int utf8_table3[] = {
8086
buffer pointer to buffer for result - at least 6 bytes long
8187
8288
Returns: number of bytes placed in the buffer
83-
0 if input code point is too big
89+
0 if input codepoint is too big
8490
*/
8591

8692
static size_t
@@ -96,7 +102,7 @@ for (j = i; j > 0; j--)
96102
*buffer-- = 0x80 | (cvalue & 0x3f);
97103
cvalue >>= 6;
98104
}
99-
*buffer = utf8_table2[i] | cvalue;
105+
*buffer = (unsigned char)(utf8_table2[i] | cvalue);
100106
return i + 1;
101107
}
102108

@@ -126,7 +132,7 @@ Returns: > 0 => the number of bytes consumed
126132
*/
127133

128134
static int
129-
utf82ord(unsigned char *buffer, unsigned char *buffend,
135+
utf82ord(const unsigned char *buffer, const unsigned char *buffend,
130136
long unsigned int *vptr, int *lenptr)
131137
{
132138
unsigned int c = *buffer++;
@@ -162,7 +168,7 @@ switch (i)
162168

163169
/* i now has a value in the range 1-5 */
164170

165-
s = 6*i;
171+
s = 6 * i;
166172
d = (c & utf8_table3[i]) << s;
167173

168174
for (j = 0; j < i; j++)
@@ -201,7 +207,26 @@ if (j != i)
201207
return i + 1;
202208
}
203209

210+
/**********************************************
211+
* Usage *
212+
**********************************************/
204213

214+
static void
215+
usage(const char *argv0)
216+
{
217+
printf("%s [option ..] argument ..\n\n", argv0);
218+
puts("Encode/decode Unicode codepoints with UTF-8 code units\n");
219+
puts("The arguments are either single codepoint values written as U+hh..");
220+
puts("or 0xhh.. for conversion to UTF-8, or sequences of hex values,");
221+
puts("written without a prefix and optionally including spaces (but such");
222+
puts("arguments must be quoted), for encoding from UTF-8 code units to");
223+
puts("Unicode codepoints.");
224+
puts("For details on usage and examples read the comments in source code.\n");
225+
puts("Options:\n");
226+
puts(" -h|--help\tthis help");
227+
puts(" -s\t\tprint character");
228+
puts(" -b[=<file>]\twrite encoded data to file (default: testinput11)\n");
229+
}
205230

206231
/*************************************************
207232
* Main Program *
@@ -213,52 +238,109 @@ main(int argc, char **argv)
213238
int i = 1;
214239
int show = 0;
215240
unsigned char buffer[64];
241+
const char *argv0 = "utf8";
242+
FILE *f = NULL;
243+
244+
for (int c = argc; c-- > 1; i++)
245+
{
246+
const char *x = argv[i];
247+
248+
if (*x++ != '-') break;
249+
if (*x == '-' && *++x == 0)
250+
{
251+
i++;
252+
break;
253+
}
254+
switch (*x++)
255+
{
256+
case 's': show = 1; break;
257+
case 'b':
258+
{
259+
const char *output = "testinput11";
260+
if (*x++ == '=' && *x != 0) output = x;
261+
f = fopen(output, "wb");
262+
}
263+
break;
264+
default:
265+
{
266+
const char last_option = x[-1];
267+
argv0 = argv[0];
268+
usage(argv0);
269+
return (last_option != 'h');
270+
}
271+
}
272+
}
216273

217-
if (argc > 1 && strcmp(argv[1], "-s") == 0)
274+
if (i >= argc)
218275
{
219-
show = 1;
220-
i = 2;
276+
usage(argv0);
277+
return 1;
221278
}
222279

223280
for (; i < argc; i++)
224281
{
225-
char *x = argv[i];
226-
char *endptr;
227-
if (strncmp(x, "0x", 2) == 0 || strncmp(x, "U+", 2) == 0)
282+
const char *x = argv[i];
283+
284+
if (strlen(x) >= 3 &&
285+
(strncmp(x, "0x", 2) == 0 || strncmp(x, "U+", 2) == 0) &&
286+
isxdigit(x[2]))
228287
{
229-
size_t rc, j;
230-
unsigned long int d = strtoul(x+2, &endptr, 16);
231-
if (*endptr != 0)
288+
size_t rc;
289+
unsigned long d;
290+
char *endptr;
291+
int utf8_input = 0;
292+
293+
errno = 0;
294+
d = strtoul(x + 2, &endptr, 16);
295+
if (errno != 0 || *endptr != 0)
232296
{
233297
printf("** Invalid hex number %s\n", x);
234298
continue; /* With next argument */
235299
}
300+
if (d > 0xffffffff)
301+
{
302+
puts("** Code points must fit an uint32_t");
303+
continue;
304+
}
305+
else if (f != NULL && d > 0x7fffffff)
306+
{
307+
buffer[0] = 0xff;
308+
fwrite(buffer, 1, 1, f);
309+
utf8_input = 1;
310+
d &= 0x7fffffff;
311+
}
312+
236313
rc = ord2utf8(d, buffer);
237314
printf("U+%08lx => ", d);
238315
if (rc == 0)
239-
printf("** Code point greater than 0x7fffffff cannot be encoded");
316+
fputs("** -b needed for codepoints greater than 0x7fffffff", stdout);
240317
else
241318
{
319+
size_t j;
320+
242321
for (j = 0; j < rc; j++) printf("%02x ", buffer[j]);
243-
if (show)
322+
if (f != NULL) fwrite(buffer, rc, 1, f);
323+
if (utf8_input)
324+
fputs("** Not valid UTF-8, top bit set", stdout);
325+
else if (d > 0x10ffff)
326+
fputs("** Invalid Unicode (greater than U+10ffff)", stdout);
327+
else if (0xd800 <= d && d <= 0xdfff)
328+
fputs("** Invalid Unicode (UTF-16 surrogate)", stdout);
329+
else if (show)
244330
{
245-
printf(">");
331+
putchar('>');
246332
for (j = 0; j < rc; j++) printf("%c", buffer[j]);
247-
printf("< ");
333+
putchar('<');
248334
}
249-
if (d >= 0xd800 && d <= 0xdfff)
250-
printf("** Invalid Unicode (surrogate)");
251-
else if (d > 0x10ffff)
252-
printf("** Invalid Unicode (greater than U+10ffff)");
253335
}
254-
printf("\n");
336+
putchar('\n');
255337
}
256338
else
257339
{
258340
unsigned char *bptr;
259-
unsigned char *buffend;
341+
const unsigned char *buffend;
342+
unsigned char y = 0;
260343
int len = 0;
261-
int y = 0;
262344
int z = 0;
263345

264346
for (;;)
@@ -271,7 +353,8 @@ for (; i < argc; i++)
271353
len = -1;
272354
break;
273355
}
274-
y = y * 16 + (tolower(*x) - ((isdigit(*x))? '0' : 'W'));
356+
y = y * 16 +
357+
(unsigned char)(tolower(*x) - ((isdigit(*x))? '0' : 'a' - 10));
275358
x++;
276359
if (z)
277360
{
@@ -297,50 +380,53 @@ for (; i < argc; i++)
297380
{
298381
printf("U+%08lx <= ", d);
299382
for (j = 0; j < rc; j++) printf("%02x ", bptr[j]);
300-
if (show)
383+
if (d <= 0x10ffff && (d < 0xd800 || 0xdfff < d) && show)
301384
{
302-
printf(">");
385+
putchar('>');
303386
for (j = 0; j < rc; j++) printf("%c", bptr[j]);
304-
printf("<");
387+
putchar('<');
305388
}
306-
printf("\n");
389+
putchar('\n');
307390
bptr += rc;
308391
}
309392
else if (rc == -4)
310393
{
311394
printf("U+%08lx <= ", d);
312395
for (j = 0; j < offset; j++) printf("%02x ", bptr[j]);
313-
printf("** Overlong UTF-8 sequence\n");
396+
puts("** Overlong UTF-8 sequence");
314397
bptr += offset;
315398
}
316399
else
317400
{
318401
switch (rc)
319402
{
320-
case 0: printf("** First byte missing 0x40 bit");
403+
case 0: fputs("** First byte missing 0x40 bit", stdout);
321404
break;
322405

323-
case -1: printf("** First byte has too many high-order bits");
406+
case -1: fputs("** First byte has too many high-order bits", stdout);
324407
break;
325408

326-
case -2: printf("** Incomplete UTF-8 sequence at end of string");
409+
case -2: fputs("** Incomplete UTF-8 sequence at end of string",
410+
stdout);
327411
break;
328412

329-
case -3: printf("** Incomplete UTF-8 sequence");
413+
case -3: fputs("** Incomplete UTF-8 sequence", stdout);
330414
break;
331415

332416
default: printf("** Unexpected return %d from utf82ord()", rc);
333417
break;
334418
}
335419
printf(" at offset %d in string ", offset);
336420
while (bptr < buffend) printf("%02x ", *bptr++);
337-
printf("\n");
421+
putchar('\n');
338422
break;
339423
}
340424
}
341425
}
342426
}
343427

428+
if (f != NULL) fclose(f);
429+
344430
return 0;
345431
}
346432

0 commit comments

Comments
 (0)