Introduce multibyte string scanner functions

Explorer09 · Explorer09 · commit d02a2dc208aa · 2026-01-09T16:23:47.000+08:00
* String_makePrintable()
* EncodePrintableString()
* String_lineBreakWidth()
* String_mbswidth()

Signed-off-by: Kang-Che Sung <explorer09@gmail.com>
diff --git a/XUtils.c b/XUtils.c
@@ -10,6 +10,7 @@ in the source distribution for its full text.
 #include "XUtils.h"
 
 #include <assert.h>
+#include <ctype.h> // IWYU pragma: keep
 #include <errno.h>
 #include <fcntl.h>
 #include <limits.h>
@@ -236,6 +237,296 @@ size_t strnlen(const char* str, size_t maxLen) {
 }
 #endif
 
+#ifdef HAVE_LIBNCURSESW
+static void String_encodeWChar(WCharEncoderState* ps, wchar_t wc) {
+   assert(!ps->buf || ps->pos < ps->size);
+
+   // Encode "wc" into scratch space first and copy it to ps->buf only after
+   // the length is known to fit, so a wrong ps->size aborts via fail() without
+   // writing past the buffer. With ps->buf == NULL only the length is counted.
+   char tempBuf[MB_LEN_MAX];
+   size_t len = wcrtomb(tempBuf, wc, &ps->mbState);
+   assert(len > 0);
+   if (len == (size_t)-1) {
+      assert(len != (size_t)-1);
+      fail();
+   }
+   if (ps->buf) {
+      if (len > ps->size - ps->pos) {
+         assert(len <= ps->size - ps->pos);
+         fail();
+      }
+      memcpy((char*)ps->buf + ps->pos, tempBuf, len);
+   }
+   ps->pos += len;
+}
+#else
+static void String_encodeWChar(WCharEncoderState* ps, int c) {
+   // Single-byte build: every character occupies exactly one output byte.
+   assert(!ps->buf || ps->pos < ps->size);
+
+   // A NULL buffer means the caller only wants the byte count in ps->pos.
+   if (ps->buf) {
+      ((char*)ps->buf)[ps->pos] = (char)c;
+   }
+   ps->pos++;
+}
+#endif
+
+void EncodePrintableString(WCharEncoderState* ps, const char* src, size_t maxLen, EncodeWChar encodeWChar) {
+   assert(src || maxLen == 0);
+   // Pass the characters of "src" (at most maxLen bytes, stopping at a NUL) to encodeWChar(), then a final 0.
+   size_t pos = 0;
+   bool wasReplaced = false; // true when the previous character was replaced
+
+#ifdef HAVE_LIBNCURSESW
+   const wchar_t replacementChar = CRT_utf8 ? L'\xFFFD' : L'?';
+   wchar_t ch;
+
+   mbstate_t decState; // decoder state after the last successfully decoded character
+   memset(&decState, 0, sizeof(decState));
+#else
+   const char replacementChar = '?';
+   char ch;
+#endif
+
+   do {
+      size_t len = 0;
+      bool shouldReplace = false;
+      ch = 0;
+
+      if (pos < maxLen) {
+         // Read the next character from the byte sequence
+#ifdef HAVE_LIBNCURSESW
+         mbstate_t newState;
+         memcpy(&newState, &decState, sizeof(newState));
+         len = mbrtowc(&ch, &src[pos], maxLen - pos, &newState);
+
+         assert(len != 0 || ch == 0);
+         switch (len) {
+         case (size_t)-2: // incomplete sequence: the input ends in the middle of a character
+            errno = EILSEQ;
+            shouldReplace = true;
+            len = maxLen - pos; // consume all remaining bytes
+            break;
+
+         case (size_t)-1: // invalid sequence
+            shouldReplace = true;
+            len = 1; // skip one byte and decode again from the next one
+            break;
+
+         default:
+            memcpy(&decState, &newState, sizeof(decState)); // keep the new state only on success
+         }
+#else
+         len = 1;
+         ch = src[pos];
+#endif
+      }
+
+      pos += len;
+
+      // Filter unprintable characters
+      if (!shouldReplace && ch != 0) {
+#ifdef HAVE_LIBNCURSESW
+         shouldReplace = !iswprint(ch);
+#else
+         shouldReplace = !isprint((unsigned char)ch);
+#endif
+      }
+
+      if (shouldReplace) {
+         ch = replacementChar;
+         if (wasReplaced) { // emit only one replacement for a run of bad characters
+            continue; // goes to the loop condition; ch is non-zero here
+         }
+      }
+      wasReplaced = shouldReplace;
+
+      encodeWChar(ps, ch);
+   } while (ch != 0);
+}
+
+char* String_makePrintable(const char* str, size_t maxLen) {
+   // Pass 1: run the encoder without a buffer to learn the output size.
+   WCharEncoderState encState;
+   memset(&encState, 0, sizeof(encState));
+   EncodePrintableString(&encState, str, maxLen, String_encodeWChar);
+   const size_t needed = encState.pos;
+   assert(needed > 0);
+
+   // Pass 2: encode again, this time into an exactly sized allocation.
+   char* result = xMalloc(needed);
+   memset(&encState, 0, sizeof(encState));
+   encState.buf = result;
+   encState.size = needed;
+   EncodePrintableString(&encState, str, maxLen, String_encodeWChar);
+   assert(encState.pos == needed);
+   return result;
+}
+
+bool String_decodeNextWChar(MBStringDecoderState* ps) {
+   // Decodes one character from ps->str into ps->ch and advances ps->str and
+   // ps->maxLen past it. Returns false when no character could be decoded.
+   if (!ps->str || ps->maxLen == 0) {
+      return false;
+   }
+
+   // After an invalid sequence the "mbState" object for mbrtowc() is
+   // undefined, so do not continue until the caller resets the state.
+#ifdef HAVE_LIBNCURSESW
+   bool isStateDefined = ps->ch != WEOF;
+#else
+   bool isStateDefined = ps->ch != EOF;
+#endif
+   if (!isStateDefined) {
+      return false;
+   }
+#ifdef HAVE_LIBNCURSESW
+   wchar_t wc;
+   size_t len = mbrtowc(&wc, ps->str, ps->maxLen, &ps->mbState);
+   switch (len) {
+   case (size_t)-1:
+      // Invalid sequence
+      ps->ch = WEOF;
+      return false;
+
+   case (size_t)-2:
+      // Incomplete sequence
+      ps->str += ps->maxLen;
+      ps->maxLen = 0;
+      return false;
+
+   case 0:
+      assert(wc == 0);
+
+      ps->str = NULL; // end of string: later calls return false
+      ps->maxLen = 0;
+      ps->ch = wc;
+      return true;
+
+   default:
+      ps->str += len;
+      ps->maxLen -= len;
+      ps->ch = wc;
+   }
+   return true;
+#else
+   ps->ch = (unsigned char)*ps->str; // not sign-extended, so a 0xFF byte cannot be mistaken for EOF
+   if (ps->ch == 0) {
+      ps->str = NULL;
+      ps->maxLen = 0;
+   } else {
+      ps->str++;
+      ps->maxLen--;
+   }
+   return true;
+#endif
+}
+
+int String_lineBreakWidth(const char** str, size_t maxLen, int maxWidth, char separator) {
+   assert(*str || maxLen == 0);
+   // Returns the number of terminal columns taken by the part of *str that
+   // fits in maxWidth, and advances *str past that part. A negative maxWidth
+   // means no limit. (maxWidth == 0) is allowed because a Unicode string may
+   // occupy 0 terminal columns.
+   if (maxWidth < 0)
+      maxWidth = INT_MAX;
+
+   MBStringDecoderState state;
+   memset(&state, 0, sizeof(state));
+   state.str = *str;
+   state.maxLen = maxLen;
+
+   int totalWidth = 0; // columns accepted so far
+   int breakWidth = 0; // value of totalWidth at breakPos
+
+   const char* breakPos = NULL; // last break position found, if any
+   bool inSpaces = true; // true at the start and within a run of spaces
+
+   while (String_decodeNextWChar(&state)) {
+      if (state.ch == 0)
+         break;
+
+      if (state.ch == ' ' && separator == ' ' && !inSpaces) {
+         breakWidth = totalWidth;
+         breakPos = *str; // *str has not moved past this space yet: break before the run of spaces
+         inSpaces = true;
+      }
+
+#ifdef HAVE_LIBNCURSESW
+      int w = wcwidth((wchar_t)state.ch);
+      if (w < 0) {
+         // This function should not be used with string containing unprintable
+         // characters. Tolerate them on release build, however.
+         assert(w >= 0);
+         break;
+      }
+#else
+      assert(isprint(state.ch));
+      int w = 1;
+#endif
+
+      if (w > maxWidth - totalWidth) {
+         // This character cannot fit the line with the given maxWidth.
+         if (breakPos) {
+            // Rewind the scanning state to the last found separator.
+            totalWidth = breakWidth;
+            *str = breakPos;
+         }
+         break;
+      }
+
+#ifdef HAVE_LIBNCURSESW
+      // If the character takes zero columns, include the character in the
+      // substring if the working encoding is UTF-8, and ignore it otherwise.
+      // In Unicode, combining characters are always placed after the base
+      // character, but some legacy 8-bit encodings instead place combining
+      // characters before the base character.
+      if (w <= 0 && !CRT_utf8) {
+         continue;
+      }
+#endif
+
+      totalWidth += w;
+
+      // Accept the character. (*str - start), where "start" is the value *str
+      // had on entry, is the byte length of the substring within the limit.
+      *str = state.str;
+
+      if (state.ch != ' ')
+         inSpaces = false;
+
+#ifdef HAVE_LIBNCURSESW
+      wint_t sepCast = (wint_t)separator;
+#else
+      int sepCast = (int)separator;
+#endif
+      if (state.ch == sepCast && separator != ' ') { // a non-space separator stays on the line: break right after it
+         breakWidth = totalWidth;
+         breakPos = *str;
+      }
+   }
+
+   return totalWidth;
+}
+
+int String_mbswidth(const char** str, size_t maxLen, int maxWidth) {
+#ifdef HAVE_LIBNCURSESW
+   return String_lineBreakWidth(str, maxLen, maxWidth, '\0');
+#else
+   // Without wide-character support every byte counts as one column, so the
+   // width is the string length capped by both maxLen and maxWidth.
+   assert(*str || maxLen == 0);
+
+   const size_t widthCap = (maxWidth < 0) ? (size_t)INT_MAX : (size_t)maxWidth;
+   const size_t limit = MINIMUM(widthCap, maxLen);
+   const size_t consumed = strnlen(*str, limit);
+   *str += consumed;
+   return (int)consumed;
+#endif
+}
+
 int xAsprintf(char** strp, const char* fmt, ...) {
    *strp = NULL;
 
diff --git a/XUtils.h b/XUtils.h
@@ -22,7 +22,32 @@ in the source distribution for its full text.
 #include <string.h> // IWYU pragma: keep
 
 #include "Macros.h"
+#include "ProvideCurses.h"
+
+
+typedef struct WCharEncoderState_ { // output cursor passed to an EncodeWChar callback
+   size_t pos; // as used by String_encodeWChar(): bytes produced so far, i.e. offset of the next write into buf
+   size_t size; // as used by String_encodeWChar(): capacity of buf in bytes
+   void* buf; // output buffer; String_encodeWChar() only counts bytes in pos when this is NULL
+   mbstate_t mbState; // conversion state that String_encodeWChar() hands to wcrtomb()
+} WCharEncoderState;
+
+typedef struct MBStringDecoderState_ { // input cursor for String_decodeNextWChar()
+   const char* str; // next byte to decode; set to NULL once the terminating NUL has been decoded
+   size_t maxLen; // number of bytes still available at str
+#ifdef HAVE_LIBNCURSESW
+   wint_t ch; // last decoded character; WEOF after an invalid sequence, which blocks further decoding
+   mbstate_t mbState; // conversion state handed to mbrtowc()
+#else
+   int ch; // last decoded character; a value of EOF blocks further decoding
+#endif
+} MBStringDecoderState;
 
+#ifdef HAVE_LIBNCURSESW
+typedef ATTR_NONNULL void (*EncodeWChar)(WCharEncoderState* ps, wchar_t wc); // callback receiving each output character of EncodePrintableString(), ending with 0
+#else
+typedef ATTR_NONNULL void (*EncodeWChar)(WCharEncoderState* ps, int c); // single-byte variant of the same callback
+#endif
 
 ATTR_NORETURN
 void fail(void);
@@ -105,6 +130,21 @@ size_t String_safeStrncpy(char* restrict dest, const char* restrict src, size_t
 size_t strnlen(const char* str, size_t maxLen);
 #endif
 
+ATTR_NONNULL_N(1, 4) ATTR_ACCESS2_W(1) ATTR_ACCESS3_R(2, 3) // Calls encodeWChar(ps, ...) for each character of src (unprintable ones replaced), then once with 0.
+void EncodePrintableString(WCharEncoderState* ps, const char* src, size_t maxLen, EncodeWChar encodeWChar);
+
+ATTR_RETNONNULL ATTR_MALLOC ATTR_ACCESS3_R(1, 2) // Returns an xMalloc()ed, NUL-terminated printable copy of str; presumably freed by the caller.
+char* String_makePrintable(const char* str, size_t maxLen);
+
+ATTR_NONNULL // Decodes the next character into ps->ch and advances ps->str; returns false if none could be decoded.
+bool String_decodeNextWChar(MBStringDecoderState* ps);
+
+ATTR_NONNULL ATTR_ACCESS2_RW(1) // Returns the columns used by the part of *str fitting in maxWidth and advances *str past it.
+int String_lineBreakWidth(const char** str, size_t maxLen, int maxWidth, char separator);
+
+ATTR_NONNULL ATTR_ACCESS2_RW(1) // Like String_lineBreakWidth() without a separator; counts one column per byte without ncursesw.
+int String_mbswidth(const char** str, size_t maxLen, int maxWidth);
+
 ATTR_FORMAT(printf, 2, 3) ATTR_NONNULL_N(1, 2)
 int xAsprintf(char** strp, const char* fmt, ...);