maint: teach utf8 how to encode utf8_input (#786)

carenas · web-flow · commit fecd1a464349 · 2025-09-11T14:08:57.000+01:00
While at it, add a usage() function, tidy up some code, and
make sure that pcre2test only does the decoding when needed.
diff --git a/maint/.gitignore b/maint/.gitignore
@@ -1,11 +1,13 @@
 ucptest
 utf8
+utf8.*
 
 pcre2_ucp.h
 pcre2_ucptables_inc.h
 pcre2_ucd.c
 
 testinput
+testinput11
 testoutput
 
 !build-interface
diff --git a/maint/utf8.c b/maint/utf8.c
@@ -2,50 +2,56 @@
 * PCRE maintainers' helper program: UTF-8 converter *
 ****************************************************/
 
-/* This is a test program for converting character code points to UTF-8 and
+/* This is a test program for converting character codepoints to UTF-8 and
 vice versa. Note that this program conforms to the original definition of
 UTF-8, which allows codepoints up to 7fffffff. The more recent definition
-limits the validity of Unicode UTF-8 codepoints to a maximum of 10ffffff, and
-forbids the "surrogate" code points. This program now gives warnings for these
-invalid code points.
+limits the validity of Unicode UTF-8 codepoints to a maximum of 10ffff, and
+forbids the "surrogate" codepoints. This program now gives warnings for these
+invalid codepoints.
 
-The arguments are either single code point values written as U+hh.. or 0xhh..
-for conversion to UTF-8, or sequences of hex values, written without 0x and
-optionally including spaces (but such arguments must be quoted), for conversion
-from UTF-8 to codepoints. For example:
+The arguments are either single codepoint values written as U+hh.. or 0xhh..
+for conversion to UTF-8, or sequences of hex values, written without a prefix
+and optionally including spaces (but such arguments must be quoted), for
+decoding from UTF-8 code units to Unicode codepoints. For example:
 
-./utf8 0x1234
-U+00001234 => e1 88 b4
+  ./utf8 0x1234
+  U+00001234 => e1 88 b4
 
-./utf8 "e1 88 b4"
-U+00001234 <= e1 88 b4
+  ./utf8 "e1 88 b4"
+  U+00001234 <= e1 88 b4
 
 In the second case, a number of UTF-8 characters can be present in one
 argument. In other words, each such argument is interpreted (after ignoring
 spaces) as a string of UTF-8 bytes representing a string of characters:
 
-./utf8 "65 e188b4 77"
-0x00000065 <= 65
-0x00001234 <= e1 88 b4
-0x00000077 <= 77
+  ./utf8 "65 e188b4 77"
+  0x00000065 <= 65
+  0x00001234 <= e1 88 b4
+  0x00000077 <= 77
 
 If the option -s is given, the sequence of UTF-bytes is written out between
-angle brackets at the end of the line. On a UTF-8 terminal, this will show the
-appropriate graphic for the code point.
+angle brackets at the end of the line, if valid. On a UTF-8 terminal, this
+should show the appropriate graphic for the character or a question mark.
+
+If the option -b is given, a file with the encoded bytes is written for use
+with pcre2test in utf8_input format.
 
 Errors provoke error messages, but the program carries on with the next
-argument. The return code is always zero.
+argument. The return code is always zero unless there was nothing to process
+or an invalid option was provided and the "usage" was printed.
 
 Philip Hazel
-Original creation data: unknown
+Original creation date: unknown
 Code extended and tidied to avoid compiler warnings: 26 March 2020
+Support for encoding utf8_input: 31 August 2025
 */
 
 
 #include <stdio.h>
 #include <stdlib.h>
 #include <ctype.h>
 #include <string.h>
+#include <errno.h>
 
 /* The valid ranges for UTF-8 characters are:
 
@@ -61,10 +67,10 @@ Code extended and tidied to avoid compiler warnings: 26 March 2020
 static const unsigned int utf8_table1[] = {
   0x0000007f, 0x000007ff, 0x0000ffff, 0x001fffff, 0x03ffffff, 0x7fffffff};
 
-static const int utf8_table2[] = {
+static const unsigned char utf8_table2[] = {
   0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
 
-static const int utf8_table3[] = {
+static const unsigned char utf8_table3[] = {
   0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
 
 
@@ -80,7 +86,7 @@ static const int utf8_table3[] = {
   buffer     pointer to buffer for result - at least 6 bytes long
 
 Returns:     number of bytes placed in the buffer
-             0 if input code point is too big
+             0 if input codepoint is too big
 */
 
 static size_t
@@ -96,7 +102,7 @@ for (j = i; j > 0; j--)
  *buffer-- = 0x80 | (cvalue & 0x3f);
  cvalue >>= 6;
  }
-*buffer = utf8_table2[i] | cvalue;
+*buffer = (unsigned char)(utf8_table2[i] | cvalue);
 return i + 1;
 }
 
@@ -126,7 +132,7 @@ Returns:   > 0 => the number of bytes consumed
 */
 
 static int
-utf82ord(unsigned char *buffer, unsigned char *buffend,
+utf82ord(const unsigned char *buffer, const unsigned char *buffend,
   long unsigned int *vptr, int *lenptr)
 {
 unsigned int c = *buffer++;
@@ -162,7 +168,7 @@ switch (i)
 
 /* i now has a value in the range 1-5 */
 
-s = 6*i;
+s = 6 * i;
 d = (c & utf8_table3[i]) << s;
 
 for (j = 0; j < i; j++)
@@ -201,7 +207,26 @@ if (j != i)
 return i + 1;
 }
 
+/**********************************************
+*                    Usage                    *
+**********************************************/
 
+static void  /* Print the command-line help text on stdout; returns nothing. */
+usage(const char *argv0)  /* argv0: program name shown in the synopsis line */
+{
+printf("%s [option ..] argument ..\n\n", argv0);
+puts("Encode/decode Unicode codepoints with UTF-8 code units\n");
+puts("The arguments are either single codepoint values written as U+hh..");
+puts("or 0xhh.. for conversion to UTF-8, or sequences of hex values,");
+puts("written without a prefix and optionally including spaces (but such");
+puts("arguments must be quoted), for decoding from UTF-8 code units to");
+puts("Unicode codepoints.");
+puts("For details on usage and examples read the comments in source code.\n");
+puts("Options:\n");  /* puts() appends a newline, so "\n" adds a blank line */
+puts("  -h|--help\tthis help");
+puts("  -s\t\tprint character");
+puts("  -b[=<file>]\twrite encoded data to file (default: testinput11)\n");
+}
 
 /*************************************************
 *                 Main Program                   *
@@ -213,52 +238,109 @@ main(int argc, char **argv)
 int i = 1;
 int show = 0;
 unsigned char buffer[64];
+const char *argv0 = "utf8";
+FILE *f = NULL;
+
+for (int c = argc; c-- > 1; i++)
+  {
+  const char *x = argv[i];
+
+  if (*x++ != '-') break;
+  if (*x == '-' && *++x == 0)
+    {
+    i++;
+    break;
+    }
+  switch (*x++)
+    {
+    case 's': show = 1; break;
+    case 'b':
+      {
+      const char *output = "testinput11";
+      if (*x++ == '=' && *x != 0) output = x;
+      f = fopen(output, "wb");
+      }
+    break;
+    default:
+      {
+      const char last_option = x[-1];
+      argv0 = argv[0];
+      usage(argv0);
+      return (last_option != 'h');
+      }
+    }
+  }
 
-if (argc > 1 && strcmp(argv[1], "-s") == 0)
+if (i >= argc)
   {
-  show = 1;
-  i = 2;
+  usage(argv0);
+  return 1;
   }
 
 for (; i < argc; i++)
   {
-  char *x = argv[i];
-  char *endptr;
-  if (strncmp(x, "0x", 2) == 0 || strncmp(x, "U+", 2) == 0)
+  const char *x = argv[i];
+
+  if (strlen(x) >= 3 &&
+      (strncmp(x, "0x", 2) == 0 || strncmp(x, "U+", 2) == 0) &&
+      isxdigit(x[2]))
     {
-    size_t rc, j;
-    unsigned long int d = strtoul(x+2, &endptr, 16);
-    if (*endptr != 0)
+    size_t rc;
+    unsigned long d;
+    char *endptr;
+    int utf8_input = 0;
+
+    errno = 0;
+    d = strtoul(x + 2, &endptr, 16);
+    if (errno != 0 || *endptr != 0)
       {
       printf("** Invalid hex number %s\n", x);
       continue;   /* With next argument */
       }
+    if (d > 0xffffffff)
+      {
+      puts("** Code points must fit an uint32_t");
+      continue;
+      }
+    else if (f != NULL && d > 0x7fffffff)
+      {
+      buffer[0] = 0xff;
+      fwrite(buffer, 1, 1, f);
+      utf8_input = 1;
+      d &= 0x7fffffff;
+      }
+
     rc = ord2utf8(d, buffer);
     printf("U+%08lx => ", d);
     if (rc == 0)
-      printf("** Code point greater than 0x7fffffff cannot be encoded");
+      fputs("** -b needed for codepoints greater than 0x7fffffff", stdout);
     else
       {
+      size_t j;
+
       for (j = 0; j < rc; j++) printf("%02x ", buffer[j]);
-      if (show)
+      if (f != NULL) fwrite(buffer, rc, 1, f);
+      if (utf8_input)
+        fputs("** Not valid UTF-8, top bit set", stdout);
+      else if (d > 0x10ffff)
+        fputs("** Invalid Unicode (greater than U+10ffff)", stdout);
+      else if (0xd800 <= d && d <= 0xdfff)
+        fputs("** Invalid Unicode (UTF-16 surrogate)", stdout);
+      else if (show)
         {
-        printf(">");
+        putchar('>');
         for (j = 0; j < rc; j++) printf("%c", buffer[j]);
-        printf("< ");
+        putchar('<');
         }
-      if (d >= 0xd800 && d <= 0xdfff)
-        printf("** Invalid Unicode (surrogate)");
-      else if (d > 0x10ffff)
-        printf("** Invalid Unicode (greater than U+10ffff)");
       }
-    printf("\n");
+    putchar('\n');
     }
   else
     {
     unsigned char *bptr;
-    unsigned char *buffend;
+    const unsigned char *buffend;
+    unsigned char y = 0;
     int len = 0;
-    int y = 0;
     int z = 0;
 
     for (;;)
@@ -271,7 +353,8 @@ for (; i < argc; i++)
         len = -1;
         break;
         }
-      y = y * 16 + (tolower(*x) - ((isdigit(*x))? '0' : 'W'));
+      y = y * 16 +
+          (unsigned char)(tolower(*x) - ((isdigit(*x))? '0' : 'a' - 10));
       x++;
       if (z)
         {
@@ -297,50 +380,53 @@ for (; i < argc; i++)
         {
         printf("U+%08lx <= ", d);
         for (j = 0; j < rc; j++) printf("%02x ", bptr[j]);
-        if (show)
+        if (d <= 0x10ffff && (d < 0xd800 || 0xdfff < d) && show)
           {
-          printf(">");
+          putchar('>');
           for (j = 0; j < rc; j++) printf("%c", bptr[j]);
-          printf("<");
+          putchar('<');
           }
-        printf("\n");
+        putchar('\n');
         bptr += rc;
         }
       else if (rc == -4)
         {
         printf("U+%08lx <= ", d);
         for (j = 0; j < offset; j++) printf("%02x ", bptr[j]);
-        printf("** Overlong UTF-8 sequence\n");
+        puts("** Overlong UTF-8 sequence");
         bptr += offset;
         }
       else
         {
         switch (rc)
           {
-          case 0:  printf("** First byte missing 0x40 bit");
+          case 0:  fputs("** First byte missing 0x40 bit", stdout);
           break;
 
-          case -1: printf("** First byte has too many high-order bits");
+          case -1: fputs("** First byte has too many high-order bits", stdout);
           break;
 
-          case -2: printf("** Incomplete UTF-8 sequence at end of string");
+          case -2: fputs("** Incomplete UTF-8 sequence at end of string",
+                         stdout);
           break;
 
-          case -3: printf("** Incomplete UTF-8 sequence");
+          case -3: fputs("** Incomplete UTF-8 sequence", stdout);
           break;
 
           default: printf("** Unexpected return %d from utf82ord()", rc);
           break;
           }
         printf(" at offset %d in string ", offset);
         while (bptr < buffend) printf("%02x ", *bptr++);
-        printf("\n");
+        putchar('\n');
         break;
         }
       }
     }
   }
 
+if (f != NULL) fclose(f);
+
 return 0;
 }
 
diff --git a/src/pcre2test.c b/src/pcre2test.c