Skip to content

Commit 3de4596

Browse files
authored
properties: add "ambiguous_width" property for ambiguous East Asian Width (#270)
Some characters have their width defined as "Ambiguous" in UAX#11. These are typically rendered as single-width by modern monospace fonts, and utf8proc correctly returns charwidth==1 for these. However, some applications might need to support older CJK fonts where characters that were two-byte in legacy encodings were rendered as double-width. An example of this is the 'ambiwidth' option of vim and neovim, which supports rendering in terminals using such wideness rules. Add an 'ambiguous_width' property to utf8proc_property_t for such characters.
1 parent 5568eff commit 3de4596

File tree

5 files changed

+12923
-12862
lines changed

5 files changed

+12923
-12862
lines changed

data/data_generator.jl

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,7 @@ function read_east_asian_widths(filename)
190190
for (rng,widthcode) in read_hex_ranges(filename)
191191
w = widthcode == "W" || widthcode == "F" ? 2 : # wide or full
192192
widthcode == "Na"|| widthcode == "H" ? 1 : # narrow or half-width
193+
widthcode == "A" ? -1 : # ambiguous width
193194
nothing
194195
if !isnothing(w)
195196
set_all!(ea_widths, rng, w)
@@ -221,7 +222,7 @@ let ea_widths = read_east_asian_widths("EastAsianWidth.txt")
221222
# Widths from UAX #11: East Asian Width
222223
eaw = get(ea_widths, code, nothing)
223224
if !isnothing(eaw)
224-
width = eaw
225+
width = eaw < 0 ? 1 : eaw
225226
end
226227

227228
# A few exceptional cases, found by manual comparison to other wcwidth
@@ -242,6 +243,9 @@ let ea_widths = read_east_asian_widths("EastAsianWidth.txt")
242243

243244
return width
244245
end
246+
global function is_ambiguous_width(code)
247+
return get(ea_widths, code, 0) < 0
248+
end
245249
end
246250

247251
#-------------------------------------------------------------------------------
@@ -394,6 +398,7 @@ function char_table_properties!(sequences, char)
394398
control_boundary = char.category in ("Zl", "Zp", "Cc", "Cf") &&
395399
!(char.code in (0x200C, 0x200D)),
396400
charwidth = derive_char_width(code, char.category),
401+
ambiguous_width = is_ambiguous_width(code),
397402
boundclass = get_grapheme_boundclass(code),
398403
indic_conjunct_break = get_indic_conjunct_break(code),
399404
)
@@ -479,7 +484,7 @@ function print_c_data_tables(io, sequences, prop_page_indices, prop_pages, dedup
479484

480485
print(io, """
481486
static const utf8proc_property_t utf8proc_properties[] = {
482-
{0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false,false,false,false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER, UTF8PROC_INDIC_CONJUNCT_BREAK_NONE},
487+
{0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false,false,false,false, 1, 0, 0, UTF8PROC_BOUNDCLASS_OTHER, UTF8PROC_INDIC_CONJUNCT_BREAK_NONE},
483488
""")
484489
for prop in deduplicated_props
485490
print(io, " {",
@@ -498,6 +503,7 @@ function print_c_data_tables(io, sequences, prop_page_indices, prop_pages, dedup
498503
prop.ignorable, ", ",
499504
prop.control_boundary, ", ",
500505
prop.charwidth, ", ",
506+
prop.ambiguous_width, ", ",
501507
"0, ", # bitfield padding
502508
c_enum_name("BOUNDCLASS", prop.boundclass), ", ",
503509
c_enum_name("INDIC_CONJUNCT_BREAK", prop.indic_conjunct_break),

test/charwidth.c

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ int main(int argc, char **argv)
2525
for (c = 0; c <= 0x110000; ++c) {
2626
int cat = utf8proc_get_property(c)->category;
2727
int w = utf8proc_charwidth(c);
28+
int ambiguous = utf8proc_charwidth_ambiguous(c);
2829
if ((cat == UTF8PROC_CATEGORY_MN || cat == UTF8PROC_CATEGORY_ME) && w > 0) {
2930
fprintf(stderr, "nonzero width %d for combining char %x\n", w, c);
3031
error += 1;
@@ -42,6 +43,10 @@ int main(int argc, char **argv)
4243
isprint(c) ? "printable" : "non-printable", c);
4344
error += 1;
4445
}
46+
if (c <= 127 && utf8proc_charwidth_ambiguous(c)) {
47+
fprintf(stderr, "ambiwith set for ASCII %x\n", c);
48+
error += 1;
49+
}
4550
if (!my_isprint(c) && w > 0) {
4651
fprintf(stderr, "non-printing %x had width %d\n", c, w);
4752
error += 1;
@@ -50,11 +55,20 @@ int main(int argc, char **argv)
5055
fprintf(stderr, "unexpected width %d for unassigned char %x\n", w, c);
5156
error += 1;
5257
}
58+
if (ambiguous && w >= 2) {
59+
fprintf(stderr, "char %x is both doublewidth and ambiguous\n", c);
60+
error += 1;
61+
}
5362
}
5463
check(!error, "utf8proc_charwidth FAILED %d tests.", error);
5564

5665
check(utf8proc_charwidth(0x00ad) == 1, "incorrect width for U+00AD (soft hyphen)");
66+
check(utf8proc_charwidth_ambiguous(0x00ad) , "incorrect ambiguous width for U+00AD (soft hyphen)");
5767
check(utf8proc_charwidth(0xe000) == 1, "incorrect width for U+e000 (PUA)");
68+
check(utf8proc_charwidth_ambiguous(0xe000), "incorrect ambiguous width for U+e000 (PUA)");
69+
70+
check(utf8proc_charwidth_ambiguous(0x00A1), "incorrect ambiguous width for U+00A1 (inverted exclamation mark)");
71+
check(!utf8proc_charwidth_ambiguous(0x00A2), "incorrect ambiguous width for U+00A2 (cent sign)");
5872

5973
/* print some other information by comparing with system wcwidth */
6074
printf("Mismatches with system wcwidth (not necessarily errors):\n");

utf8proc.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -432,6 +432,10 @@ UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t c) {
432432
return utf8proc_get_property(c)->charwidth;
433433
}
434434

435+
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_charwidth_ambiguous(utf8proc_int32_t c) {
436+
return utf8proc_get_property(c)->ambiguous_width;
437+
}
438+
435439
UTF8PROC_DLLEXPORT utf8proc_category_t utf8proc_category(utf8proc_int32_t c) {
436440
return (utf8proc_category_t) utf8proc_get_property(c)->category;
437441
}

utf8proc.h

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -268,7 +268,9 @@ typedef struct utf8proc_property_struct {
268268
unsigned control_boundary:1;
269269
/** The width of the codepoint. */
270270
unsigned charwidth:2;
271-
unsigned pad:2;
271+
/** East Asian width class A */
272+
unsigned ambiguous_width:1;
273+
unsigned pad:1;
272274
/**
273275
* Boundclass.
274276
* @see utf8proc_boundclass_t.
@@ -667,6 +669,14 @@ UTF8PROC_DLLEXPORT int utf8proc_isupper(utf8proc_int32_t c);
667669
* (analogous to `isprint` or `iscntrl`), use utf8proc_category(). */
668670
UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t codepoint);
669671

672+
/**
673+
* Given a codepoint, return whether it has East Asian width class A (Ambiguous)
674+
*
675+
* Codepoints with this property are considered to have charwidth 1 (if they are printable)
676+
* but some East Asian fonts render them as double width.
677+
*/
678+
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_charwidth_ambiguous(utf8proc_int32_t codepoint);
679+
670680
/**
671681
* Return the Unicode category for the codepoint (one of the
672682
* @ref utf8proc_category_t constants.)

0 commit comments

Comments
 (0)