Update to UnicodeData 17.0.0 (#292)

eschnett · web-flow · commit e459078a4786 · 2025-06-18T13:46:52.000-04:00
* Update to UnicodeData 17.0.0

* Tests: Handle grapheme strings containing NUL

* Tests: Remove left-over assert statement

* Correct build errors

* Correct build errors

* Update internal version numbers

* Update more version numbers

* Remove unwanted file
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -85,7 +85,7 @@ endif()
 if(UTF8PROC_ENABLE_TESTING)
   enable_testing()
   file(MAKE_DIRECTORY data)
-  set(UNICODE_VERSION 16.0.0)
+  set(UNICODE_VERSION 17.0.0)
   file(DOWNLOAD https://www.unicode.org/Public/${UNICODE_VERSION}/ucd/NormalizationTest.txt ${CMAKE_BINARY_DIR}/data/NormalizationTest.txt SHOW_PROGRESS)
   file(DOWNLOAD https://www.unicode.org/Public/${UNICODE_VERSION}/ucd/auxiliary/GraphemeBreakTest.txt ${CMAKE_BINARY_DIR}/data/GraphemeBreakTest.txt SHOW_PROGRESS)
   add_executable(case test/tests.h test/tests.c utf8proc.h test/case.c)
diff --git a/README.md b/README.md
@@ -69,7 +69,7 @@ The C library is found in this directory after successful compilation
 and is named `libutf8proc.a` (for the static library) and
 `libutf8proc.so` (for the dynamic library).
 
-The Unicode version supported is 16.0.0.
+The Unicode version supported is 17.0.0.
 
 For Unicode normalizations, the following options are used:
 
diff --git a/data/Makefile b/data/Makefile
@@ -21,7 +21,7 @@ utf8proc_data.c.new: data_generator.jl $(RAWDATA)
 	$(JULIA) --project=. data_generator.jl > $@
 
 # Unicode data version (must also update utf8proc_unicode_version function)
-UNICODE_VERSION=16.0.0
+UNICODE_VERSION=17.0.0
 
 UnicodeData.txt:
 	$(CURL) $(CURLFLAGS) -o $@ https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/UnicodeData.txt
diff --git a/test/graphemetest.c b/test/graphemetest.c
@@ -23,8 +23,9 @@ void checkline(const char *_buf, bool verbose) {
             bi += 1;
         }
         else { /* hex-encoded codepoint */
-            size_t len = encode((unsigned char*) (src + si), buf + bi) - 1;
-            while (src[si]) ++si; /* advance to NUL termination */
+            size_t dest_len;
+            size_t len = encode((unsigned char*) (src + si), &dest_len, buf + bi) - 1;
+            si += dest_len; /* advance to NUL termination */
             bi += len;
         }
     }
diff --git a/test/iscase.c b/test/iscase.c
@@ -6,12 +6,13 @@ int read_range(FILE *f, utf8proc_int32_t *start, utf8proc_int32_t *end)
      size_t len = simple_getline(buf, f);
      size_t pos = skipspaces(buf, 0);
      unsigned char s[16];
+     size_t s_len;
      if (pos == len || buf[pos] == '#') return 0;
-     pos += encode(s, buf + pos) - 1;
+     pos += encode(s, &s_len, buf + pos) - 1;
      check(s[0], "invalid line %s in data", buf);
      utf8proc_iterate((utf8proc_uint8_t*) s, -1, start);
      if (buf[pos] == '.' && buf[pos+1] == '.') {
-          encode(s, buf + pos + 2);
+          encode(s, &s_len, buf + pos + 2);
           check(s[0], "invalid line %s in data", buf);
           utf8proc_iterate((utf8proc_uint8_t*) s, -1, end);
      }
diff --git a/test/normtest.c b/test/normtest.c
@@ -27,11 +27,12 @@ int main(int argc, char **argv)
 
           if (buf[0] == '#') continue;
 
-          offset = encode(source, buf);
-          offset += encode(NFC, buf + offset);
-          offset += encode(NFD, buf + offset);
-          offset += encode(NFKC, buf + offset);
-          offset += encode(NFKD, buf + offset);
+          size_t len;
+          offset = encode(source, &len, buf);
+          offset += encode(NFC, &len, buf + offset);
+          offset += encode(NFD, &len, buf + offset);
+          offset += encode(NFKC, &len, buf + offset);
+          offset += encode(NFKD, &len, buf + offset);
 
           CHECK_NORM(NFC, NFC, source);
           CHECK_NORM(NFC, NFC, NFC);
diff --git a/test/tests.c b/test/tests.c
@@ -27,7 +27,7 @@ size_t skipspaces(const unsigned char *buf, size_t i)
    separated by whitespace, and terminated by any character not in
    [0-9a-fA-F] or whitespace, then stores the corresponding utf8 string
    in dest, returning the number of bytes read from buf */
-size_t encode(unsigned char *dest, const unsigned char *buf)
+size_t encode(unsigned char *dest, size_t *dest_len, const unsigned char *buf)
 {
      size_t i = 0, j;
      utf8proc_ssize_t d = 0;
@@ -38,6 +38,7 @@ size_t encode(unsigned char *dest, const unsigned char *buf)
                ; /* find end of hex input */
           if (j == i) { /* no codepoint found */
                dest[d] = 0; /* NUL-terminate destination string */
+               *dest_len = (size_t)d;
                return i + 1;
           }
           check(sscanf((char *) (buf + i), "%x", (unsigned int *)&c) == 1, "invalid hex input %s", buf+i);
diff --git a/test/tests.h b/test/tests.h
@@ -23,5 +23,5 @@ extern size_t lineno;
 
 void check(int cond, const char *format, ...);
 size_t skipspaces(const unsigned char *buf, size_t i);
-size_t encode(unsigned char *dest, const unsigned char *buf);
+size_t encode(unsigned char *dest, size_t *dest_len, const unsigned char *buf);
 size_t simple_getline(unsigned char buf[8192], FILE *f);
diff --git a/utf8proc.c b/utf8proc.c
@@ -101,7 +101,7 @@ UTF8PROC_DLLEXPORT const char *utf8proc_version(void) {
 }
 
 UTF8PROC_DLLEXPORT const char *utf8proc_unicode_version(void) {
-  return "16.0.0";
+  return "17.0.0";
 }
 
 UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode) {
diff --git a/utf8proc_data.c b/utf8proc_data.c

Original file line number	Diff line number	Diff line change
`@@ -23,8 +23,9 @@ void checkline(const char *_buf, bool verbose) {`
`23`	`23`	`bi += 1;`
`24`	`24`	`}`
`25`	`25`	`else { /* hex-encoded codepoint */`
`26`		`- size_t len = encode((unsigned char*) (src + si), buf + bi) - 1;`
`27`		`- while (src[si]) ++si; /* advance to NUL termination */`
	`26`	`+ size_t dest_len;`
	`27`	`+ size_t len = encode((unsigned char*) (src + si), &dest_len, buf + bi) - 1;`
	`28`	`+ si += dest_len; /* advance to NUL termination */`
`28`	`29`	`bi += len;`
`29`	`30`	`}`
`30`	`31`	`}`
Original file line number	Diff line number	Diff line change
`@@ -101,7 +101,7 @@ UTF8PROC_DLLEXPORT const char *utf8proc_version(void) {`
`101`	`101`	`}`
`102`	`102`
`103`	`103`	`UTF8PROC_DLLEXPORT const char *utf8proc_unicode_version(void) {`
`104`		`- return "16.0.0";`
	`104`	`+ return "17.0.0";`
`105`	`105`	`}`
`106`	`106`
`107`	`107`	`UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode) {`