Skip to content

Commit e459078

Browse files
authored
Update to UnicodeData 17.0.0 (#292)
* Update to UnicodeData 17.0.0
* Tests: Handle grapheme strings containing NUL
* Tests: Remove left-over assert statement
* Correct build errors
* Correct build errors
* Update internal version numbers
* Update more version numbers
* Remove unwanted file
1 parent 24e2a19 commit e459078

File tree

10 files changed

+7170
-7131
lines changed

10 files changed

+7170
-7131
lines changed

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ endif()
8585
if(UTF8PROC_ENABLE_TESTING)
8686
enable_testing()
8787
file(MAKE_DIRECTORY data)
88-
set(UNICODE_VERSION 16.0.0)
88+
set(UNICODE_VERSION 17.0.0)
8989
file(DOWNLOAD https://www.unicode.org/Public/${UNICODE_VERSION}/ucd/NormalizationTest.txt ${CMAKE_BINARY_DIR}/data/NormalizationTest.txt SHOW_PROGRESS)
9090
file(DOWNLOAD https://www.unicode.org/Public/${UNICODE_VERSION}/ucd/auxiliary/GraphemeBreakTest.txt ${CMAKE_BINARY_DIR}/data/GraphemeBreakTest.txt SHOW_PROGRESS)
9191
add_executable(case test/tests.h test/tests.c utf8proc.h test/case.c)

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ The C library is found in this directory after successful compilation
6969
and is named `libutf8proc.a` (for the static library) and
7070
`libutf8proc.so` (for the dynamic library).
7171

72-
The Unicode version supported is 16.0.0.
72+
The Unicode version supported is 17.0.0.
7373

7474
For Unicode normalizations, the following options are used:
7575

data/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ utf8proc_data.c.new: data_generator.jl $(RAWDATA)
2121
$(JULIA) --project=. data_generator.jl > $@
2222

2323
# Unicode data version (must also update utf8proc_unicode_version function)
24-
UNICODE_VERSION=16.0.0
24+
UNICODE_VERSION=17.0.0
2525

2626
UnicodeData.txt:
2727
$(CURL) $(CURLFLAGS) -o $@ https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/UnicodeData.txt

test/graphemetest.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,9 @@ void checkline(const char *_buf, bool verbose) {
2323
bi += 1;
2424
}
2525
else { /* hex-encoded codepoint */
26-
size_t len = encode((unsigned char*) (src + si), buf + bi) - 1;
27-
while (src[si]) ++si; /* advance to NUL termination */
26+
size_t dest_len;
27+
size_t len = encode((unsigned char*) (src + si), &dest_len, buf + bi) - 1;
28+
si += dest_len; /* advance to NUL termination */
2829
bi += len;
2930
}
3031
}

test/iscase.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,13 @@ int read_range(FILE *f, utf8proc_int32_t *start, utf8proc_int32_t *end)
66
size_t len = simple_getline(buf, f);
77
size_t pos = skipspaces(buf, 0);
88
unsigned char s[16];
9+
size_t s_len;
910
if (pos == len || buf[pos] == '#') return 0;
10-
pos += encode(s, buf + pos) - 1;
11+
pos += encode(s, &s_len, buf + pos) - 1;
1112
check(s[0], "invalid line %s in data", buf);
1213
utf8proc_iterate((utf8proc_uint8_t*) s, -1, start);
1314
if (buf[pos] == '.' && buf[pos+1] == '.') {
14-
encode(s, buf + pos + 2);
15+
encode(s, &s_len, buf + pos + 2);
1516
check(s[0], "invalid line %s in data", buf);
1617
utf8proc_iterate((utf8proc_uint8_t*) s, -1, end);
1718
}

test/normtest.c

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -27,11 +27,12 @@ int main(int argc, char **argv)
2727

2828
if (buf[0] == '#') continue;
2929

30-
offset = encode(source, buf);
31-
offset += encode(NFC, buf + offset);
32-
offset += encode(NFD, buf + offset);
33-
offset += encode(NFKC, buf + offset);
34-
offset += encode(NFKD, buf + offset);
30+
size_t len;
31+
offset = encode(source, &len, buf);
32+
offset += encode(NFC, &len, buf + offset);
33+
offset += encode(NFD, &len, buf + offset);
34+
offset += encode(NFKC, &len, buf + offset);
35+
offset += encode(NFKD, &len, buf + offset);
3536

3637
CHECK_NORM(NFC, NFC, source);
3738
CHECK_NORM(NFC, NFC, NFC);

test/tests.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ size_t skipspaces(const unsigned char *buf, size_t i)
2727
separated by whitespace, and terminated by any character not in
2828
[0-9a-fA-F] or whitespace, then stores the corresponding utf8 string
2929
in dest, returning the number of bytes read from buf */
30-
size_t encode(unsigned char *dest, const unsigned char *buf)
30+
size_t encode(unsigned char *dest, size_t *dest_len, const unsigned char *buf)
3131
{
3232
size_t i = 0, j;
3333
utf8proc_ssize_t d = 0;
@@ -38,6 +38,7 @@ size_t encode(unsigned char *dest, const unsigned char *buf)
3838
; /* find end of hex input */
3939
if (j == i) { /* no codepoint found */
4040
dest[d] = 0; /* NUL-terminate destination string */
41+
*dest_len = (size_t)d;
4142
return i + 1;
4243
}
4344
check(sscanf((char *) (buf + i), "%x", (unsigned int *)&c) == 1, "invalid hex input %s", buf+i);

test/tests.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,5 +23,5 @@ extern size_t lineno;
2323

2424
void check(int cond, const char *format, ...);
2525
size_t skipspaces(const unsigned char *buf, size_t i);
26-
size_t encode(unsigned char *dest, const unsigned char *buf);
26+
size_t encode(unsigned char *dest, size_t *dest_len, const unsigned char *buf);
2727
size_t simple_getline(unsigned char buf[8192], FILE *f);

utf8proc.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ UTF8PROC_DLLEXPORT const char *utf8proc_version(void) {
101101
}
102102

103103
UTF8PROC_DLLEXPORT const char *utf8proc_unicode_version(void) {
104-
return "16.0.0";
104+
return "17.0.0";
105105
}
106106

107107
UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode) {

0 commit comments

Comments
 (0)