Skip to content

Commit 46a442b

Browse files
authored
Unicode 15.1 support (#253)
* Unicode 15.1 support * always update state * fix GB9c logic * print indic_conjunct_break in printproperty * fix grapheme test * update utf8proc_decompose_char docs * more GB9c tests
1 parent 1cb28a6 commit 46a442b

File tree

9 files changed

+11000
-10896
lines changed

9 files changed

+11000
-10896
lines changed

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ endif()
6868
if(UTF8PROC_ENABLE_TESTING)
6969
enable_testing()
7070
file(MAKE_DIRECTORY data)
71-
set(UNICODE_VERSION 15.0.0)
71+
set(UNICODE_VERSION 15.1.0)
7272
file(DOWNLOAD https://www.unicode.org/Public/${UNICODE_VERSION}/ucd/NormalizationTest.txt ${CMAKE_BINARY_DIR}/data/NormalizationTest.txt SHOW_PROGRESS)
7373
file(DOWNLOAD https://www.unicode.org/Public/${UNICODE_VERSION}/ucd/auxiliary/GraphemeBreakTest.txt ${CMAKE_BINARY_DIR}/data/GraphemeBreakTest.txt SHOW_PROGRESS)
7474
add_executable(case test/tests.h test/tests.c utf8proc.h test/case.c)

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ The C library is found in this directory after successful compilation
5959
and is named `libutf8proc.a` (for the static library) and
6060
`libutf8proc.so` (for the dynamic library).
6161

62-
The Unicode version supported is 15.0.0.
62+
The Unicode version supported is 15.1.0.
6363

6464
For Unicode normalizations, the following options are used:
6565

data/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ CharWidths.txt: charwidths.jl EastAsianWidth.txt
2222
$(JULIA) charwidths.jl > $@
2323

2424
# Unicode data version (must also update utf8proc_unicode_version function)
25-
UNICODE_VERSION=15.0.0
25+
UNICODE_VERSION=15.1.0
2626

2727
UnicodeData.txt:
2828
$(CURL) $(CURLFLAGS) -o $@ https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/UnicodeData.txt

data/data_generator.rb

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,32 @@
9797
end
9898
end
9999

100+
$icb_linker_list = File.read("DerivedCoreProperties.txt", :encoding => 'utf-8')[/# Indic_Conjunct_Break=Linker.*?# Total code points:/m]
101+
$icb = Hash.new("UTF8PROC_INDIC_CONJUNCT_BREAK_NONE")
102+
$icb_linker_list.each_line do |entry|
103+
if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)/
104+
$1.hex.upto($2.hex) { |e2| $icb[e2] = "UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER" }
105+
elsif entry =~ /^[0-9A-F]+/
106+
$icb[$&.hex] = "UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER"
107+
end
108+
end
109+
$icb_consonant_list = File.read("DerivedCoreProperties.txt", :encoding => 'utf-8')[/# Indic_Conjunct_Break=Consonant.*?# Total code points:/m]
110+
$icb_consonant_list.each_line do |entry|
111+
if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)/
112+
$1.hex.upto($2.hex) { |e2| $icb[e2] = "UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT" }
113+
elsif entry =~ /^[0-9A-F]+/
114+
$icb[$&.hex] = "UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT"
115+
end
116+
end
117+
$icb_extend_list = File.read("DerivedCoreProperties.txt", :encoding => 'utf-8')[/# Indic_Conjunct_Break=Extend.*?# Total code points:/m]
118+
$icb_extend_list.each_line do |entry|
119+
if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)/
120+
$1.hex.upto($2.hex) { |e2| $icb[e2] = "UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND" }
121+
elsif entry =~ /^[0-9A-F]+/
122+
$icb[$&.hex] = "UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND"
123+
end
124+
end
125+
100126
$grapheme_boundclass_list = File.read("GraphemeBreakProperty.txt", :encoding => 'utf-8')
101127
$grapheme_boundclass = Hash.new("UTF8PROC_BOUNDCLASS_OTHER")
102128
$grapheme_boundclass_list.each_line do |entry|
@@ -174,7 +200,7 @@ def cpary2c(array)
174200
return "UINT16_MAX" if array.nil? || array.length == 0
175201
lencode = array.length - 1 #no sequence has len 0, so we encode len 1 as 0, len 2 as 1, ...
176202
array = cpary2utf16encoded(array)
177-
if lencode >= 3 #we have only 2 bits for the length
203+
if lencode >= 3 #we have only 2 bits for the length
178204
array = [lencode] + array
179205
lencode = 3
180206
end
@@ -249,7 +275,8 @@ def c_entry(comb_indicies)
249275
"#{$ignorable.include?(code)}, " <<
250276
"#{%W[Zl Zp Cc Cf].include?(category) and not [0x200C, 0x200D].include?(category)}, " <<
251277
"#{$charwidth[code]}, 0, " <<
252-
"#{$grapheme_boundclass[code]}},\n"
278+
"#{$grapheme_boundclass[code]}, " <<
279+
"#{$icb[code]}},\n"
253280
end
254281
end
255282

@@ -415,7 +442,7 @@ def c_entry(comb_indicies)
415442
$stdout << "};\n\n"
416443

417444
$stdout << "static const utf8proc_property_t utf8proc_properties[] = {\n"
418-
$stdout << " {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false,false,false,false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER},\n"
445+
$stdout << " {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false,false,false,false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER, UTF8PROC_INDIC_CONJUNCT_BREAK_NONE},\n"
419446
properties.each { |line|
420447
$stdout << line
421448
}

test/graphemetest.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,13 @@ int main(int argc, char **argv)
119119
checkline("/ 1f926 1f3fc 200d 2642 fe0f /", true); /* facepalm + pale skin + zwj + male sign + FE0F */
120120
checkline("/ 1f468 1f3fb 200d 1f91d 200d 1f468 1f3fd /", true); /* man face + pale skin + zwj + hand holding + zwj + man face + dark skin */
121121

122+
/* more GB9c tests */
123+
checkline("/ 0915 0300 094d 0300 0924 / 0915 /", true);
124+
checkline("/ 0915 0300 094d 0300 094d 0924 / 0915 /", true);
125+
checkline("/ 0915 0300 0300 / 0924 / 0915 /", true);
126+
checkline("/ 0915 0300 094d 0300 / 0078 /", true);
127+
checkline("/ 0300 094d 0300 / 0924 / 0915 /", true);
128+
122129
check(utf8proc_grapheme_break(0x03b1, 0x03b2), "failed 03b1 / 03b2 test");
123130
check(!utf8proc_grapheme_break(0x03b1, 0x0302), "failed 03b1 0302 test");
124131

test/printproperty.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ int main(int argc, char **argv)
3939
" ignorable = %d\n"
4040
" control_boundary = %d\n"
4141
" boundclass = %d\n"
42+
" indic_conjunct_break = %d\n"
4243
" charwidth = %d\n",
4344
argv[i], (char*) cstr,
4445
utf8proc_category_string(c),
@@ -55,6 +56,7 @@ int main(int argc, char **argv)
5556
p->ignorable,
5657
p->control_boundary,
5758
p->boundclass,
59+
p->indic_conjunct_break,
5860
utf8proc_charwidth(c));
5961
free(map);
6062
}

utf8proc.c

Lines changed: 42 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ UTF8PROC_DLLEXPORT const char *utf8proc_version(void) {
101101
}
102102

103103
UTF8PROC_DLLEXPORT const char *utf8proc_unicode_version(void) {
104-
return "15.0.0";
104+
return "15.1.0";
105105
}
106106

107107
UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode) {
@@ -288,35 +288,54 @@ static utf8proc_bool grapheme_break_simple(int lbc, int tbc) {
288288
true; // GB999
289289
}
290290

291-
static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t *state)
291+
static utf8proc_bool grapheme_break_extended(int lbc, int tbc, int licb, int ticb, utf8proc_int32_t *state)
292292
{
293293
if (state) {
294-
int lbc_override;
295-
if (*state == UTF8PROC_BOUNDCLASS_START)
296-
*state = lbc_override = lbc;
297-
else
298-
lbc_override = *state;
299-
utf8proc_bool break_permitted = grapheme_break_simple(lbc_override, tbc);
294+
int state_bc, state_icb; /* boundclass and indic_conjunct_break state */
295+
if (*state == 0) { /* state initialization */
296+
state_bc = lbc;
297+
state_icb = licb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT ? licb : UTF8PROC_INDIC_CONJUNCT_BREAK_NONE;
298+
}
299+
else { /* lbc and licb are already encoded in *state */
300+
state_bc = *state & 0xff; // 1st byte of state is bound class
301+
state_icb = *state >> 8; // 2nd byte of state is indic conjunct break
302+
}
303+
304+
utf8proc_bool break_permitted = grapheme_break_simple(state_bc, tbc) &&
305+
!(state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER
306+
&& ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT); // GB9c
307+
308+
// Special support for GB9c. Don't break between two consonants
309+
// separated by 1+ linker characters and 0+ extend characters in any order.
310+
// After a consonant, we enter LINKER state after at least one linker.
311+
if (ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT
312+
|| state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT
313+
|| state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND)
314+
state_icb = ticb;
315+
else if (state_icb == UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER)
316+
state_icb = ticb == UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND ?
317+
UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER : ticb;
300318

301319
// Special support for GB 12/13 made possible by GB999. After two RI
302320
// class codepoints we want to force a break. Do this by resetting the
303321
// second RI's bound class to UTF8PROC_BOUNDCLASS_OTHER, to force a break
304322
// after that character according to GB999 (unless of course such a break is
305323
// forbidden by a different rule such as GB9).
306-
if (*state == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR)
307-
*state = UTF8PROC_BOUNDCLASS_OTHER;
324+
if (state_bc == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR)
325+
state_bc = UTF8PROC_BOUNDCLASS_OTHER;
308326
// Special support for GB11 (emoji extend* zwj / emoji)
309-
else if (*state == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) {
327+
else if (state_bc == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) {
310328
if (tbc == UTF8PROC_BOUNDCLASS_EXTEND) // fold EXTEND codepoints into emoji
311-
*state = UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC;
329+
state_bc = UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC;
312330
else if (tbc == UTF8PROC_BOUNDCLASS_ZWJ)
313-
*state = UTF8PROC_BOUNDCLASS_E_ZWG; // state to record emoji+zwg combo
331+
state_bc = UTF8PROC_BOUNDCLASS_E_ZWG; // state to record emoji+zwg combo
314332
else
315-
*state = tbc;
333+
state_bc = tbc;
316334
}
317335
else
318-
*state = tbc;
336+
state_bc = tbc;
319337

338+
*state = state_bc + (state_icb << 8);
320339
return break_permitted;
321340
}
322341
else
@@ -326,8 +345,12 @@ static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t
326345
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break_stateful(
327346
utf8proc_int32_t c1, utf8proc_int32_t c2, utf8proc_int32_t *state) {
328347

329-
return grapheme_break_extended(utf8proc_get_property(c1)->boundclass,
330-
utf8proc_get_property(c2)->boundclass,
348+
const utf8proc_property_t *p1 = utf8proc_get_property(c1);
349+
const utf8proc_property_t *p2 = utf8proc_get_property(c2);
350+
return grapheme_break_extended(p1->boundclass,
351+
p2->boundclass,
352+
p1->indic_conjunct_break,
353+
p2->indic_conjunct_break,
331354
state);
332355
}
333356

@@ -498,8 +521,8 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc,
498521
}
499522
if (options & UTF8PROC_CHARBOUND) {
500523
utf8proc_bool boundary;
501-
int tbc = property->boundclass;
502-
boundary = grapheme_break_extended(*last_boundclass, tbc, last_boundclass);
524+
boundary = grapheme_break_extended(0, property->boundclass, 0, property->indic_conjunct_break,
525+
last_boundclass);
503526
if (boundary) {
504527
if (bufsize >= 1) dst[0] = -1; /* sentinel value for grapheme break */
505528
if (bufsize >= 2) dst[1] = uc;

utf8proc.h

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -273,7 +273,8 @@ typedef struct utf8proc_property_struct {
273273
* Boundclass.
274274
* @see utf8proc_boundclass_t.
275275
*/
276-
unsigned boundclass:8;
276+
unsigned boundclass:6;
277+
unsigned indic_conjunct_break:2;
277278
} utf8proc_property_t;
278279

279280
/** Unicode categories. */
@@ -388,6 +389,14 @@ typedef enum {
388389
UTF8PROC_BOUNDCLASS_E_ZWG = 20, /* UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC + ZWJ */
389390
} utf8proc_boundclass_t;
390391

392+
/** Indic_Conjunct_Break property. (TR44) */
393+
typedef enum {
394+
UTF8PROC_INDIC_CONJUNCT_BREAK_NONE = 0,
395+
UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER = 1,
396+
UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT = 2,
397+
UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND = 3,
398+
} utf8proc_indic_conjunct_break_t;
399+
391400
/**
392401
* Function pointer type passed to @ref utf8proc_map_custom and
393402
* @ref utf8proc_decompose_custom, which is used to specify a user-defined
@@ -481,8 +490,9 @@ UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int
481490
* - @ref UTF8PROC_STRIPNA - remove unassigned codepoints
482491
* @param last_boundclass
483492
* Pointer to an integer variable containing
484-
* the previous codepoint's boundary class if the @ref UTF8PROC_CHARBOUND
485-
* option is used. Otherwise, this parameter is ignored.
493+
* the previous codepoint's (boundclass + (indic_conjunct_break << 8)) if the @ref UTF8PROC_CHARBOUND
494+
* option is used. If the string is being processed in order, this can be initialized to 0 for
495+
* the beginning of the string, and is thereafter updated automatically. If @ref UTF8PROC_CHARBOUND is not used, this parameter is ignored.
486496
*
487497
* @return
488498
* In case of success, the number of codepoints written is returned; in case

0 commit comments

Comments
 (0)