Skip to content

Commit 958d087

Browse files
committed
CLDR-18080 add Hant-Latn transform, add Hans-Latn alias for Hani-Latn
1 parent 71fe179 commit 958d087

File tree

5 files changed

+229
-3
lines changed

5 files changed

+229
-3
lines changed
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
# compounds
2+
藏文 zàng wén
3+
重庆 chóng qìng
4+
沈阳 shěn yáng
5+
秘鲁 bì lǔ
6+
# some chars that transform the same for Hans, Hant (incl first chars of 3 compounds)
7+
㶼 āi
8+
锿 āi
9+
𤸖 āi
10+
藏 cáng
11+
秘 mì
12+
䃺 mó
13+
麽 mó
14+
𰈶 mó
15+
重 zhòng
16+
㑅 zuò
17+
飵 zuò
18+
𫗢 zuò
19+
# some chars that transform differently for Hans, Hant
20+
# extension A
21+
㪅 gèng
22+
㴔 jí
23+
䏲 tī
24+
# CJK unified
25+
万 wàn
26+
卜 bo
27+
叚 xiá
28+
沈 shěn
29+
沓 dá
30+
舍 shě
31+
著 zhù
32+
髪 fà
33+
髮 fà
34+
麃 páo
35+
# supplementary
36+
𩷕 láng
37+
𪟝 jì
38+
𲆦 xī
39+
𲆰 xī
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
# compounds
2+
藏文 zàng wén
3+
重庆 chóng qìng
4+
沈阳 shěn yáng
5+
秘鲁 bì lǔ
6+
# some chars that transform the same for Hans, Hant (incl first chars of 3 compounds)
7+
㶼 āi
8+
锿 āi
9+
𤸖 āi
10+
藏 cáng
11+
秘 mì
12+
䃺 mó
13+
麽 mó
14+
𰈶 mó
15+
重 zhòng
16+
㑅 zuò
17+
飵 zuò
18+
𫗢 zuò
19+
# some chars that transform differently for Hans, Hant
20+
# extension A
21+
㪅 gēng
22+
㴔 xī
23+
䏲 dié
24+
# CJK unified
25+
万 mò
26+
卜 bǔ
27+
叚 jiǎ
28+
沈 chén
29+
沓 tà
30+
舍 shè
31+
著 zhe
32+
髪 fǎ
33+
髮 fǎ
34+
麃 biāo
35+
# supplementary
36+
𩷕 liáng
37+
𪟝 jī
38+
𲆦 xì
39+
𲆰 xí

common/transforms/Han-Latin.xml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
<?xml version='1.0' encoding='UTF-8' ?>
22
<!DOCTYPE supplementalData SYSTEM '../../common/dtd/ldmlSupplemental.dtd'>
33
<!--
4-
Copyright © 1991-2015 Unicode, Inc.
4+
Copyright © 1991-2025 Unicode, Inc.
55
CLDR data files are interpreted according to the LDML specification (http://unicode.org/reports/tr35/)
66
For terms of use, see http://www.unicode.org/copyright.html
77
-->
88
<supplementalData>
99
<version number='$Revision$'/>
1010
<transforms>
11-
<transform source="Hani" target="Latn" direction="forward" alias="Han-Latin und-Latn-t-und-hani">
11+
<transform source="Hani" target="Latn" direction="forward" alias="Han-Latin Hans-Latn und-Latn-t-und-hani und-Latn-t-und-hans">
1212
<tRule>
1313
# Warning: does not do round-trip mapping!!
1414
# Convert CJK characters
@@ -17,7 +17,7 @@ For terms of use, see http://www.unicode.org/copyright.html
1717
# Note that Han-Spacedhan() has already been applied, so there should be spaces between Han characters.
1818
藏 } \u0020? 文 →zàng;# 藏 is zàng (not cáng) if followed by 文 wén: 藏文 language Zàngwén = Tibetan
1919
重 } \u0020? 庆 →chóng;# 重 is chóng (not zhòng) if followed by 庆 qìng: 重庆 city Chóngqìng
20-
沈 } \u0020? 阳 →shěn;# 沈 is shěn (not chén) if followed by 阳 yáng: 沈阳 city Shěnyáng
20+
# "沈 } \u0020? 阳 →shěn" is obsolete for Hans: the kMandarin entry for 沈 changed from "chén" to "shěn chén" in Unicode 14, so 沈 already transforms to shěn by default.
2121
秘 } \u0020? 鲁 →bì;# 秘 is bì (not mì) if followed by 鲁 lǔ: 秘鲁 country Bìlǔ = Peru
2222
# START AUTOGENERATED Han-Latin.xml ( Unihan kMandarin)
2323
[吖錒锕阿𠼞𥥩𨉚𱚱]→ā;

common/transforms/Hant-Latin.xml

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
<?xml version='1.0' encoding='UTF-8' ?>
2+
<!DOCTYPE supplementalData SYSTEM '../../common/dtd/ldmlSupplemental.dtd'>
3+
<!--
4+
Copyright © 2025 Unicode, Inc.
5+
CLDR data files are interpreted according to the LDML specification (http://unicode.org/reports/tr35/)
6+
For terms of use, see http://www.unicode.org/copyright.html
7+
-->
8+
<supplementalData>
9+
<version number='$Revision$'/>
10+
<transforms>
11+
<transform source="Hant" target="Latn" direction="forward" alias="und-Latn-t-und-hant">
12+
<tRule>
13+
# Warning: does not do round-trip mapping!!
14+
# Convert compounds; these are added individually, not derived from Unihan kMandarin.
15+
# Here Han-Spacedhan() has not yet been applied.
16+
# The following was moved from Han-Latin.xml (Hani-Latn, alias Hans-Latn); in a Hant/Taiwan context, the simplified-form city name 沈阳 should still transform to shěnyáng.
17+
沈 } 阳 →shěn;# 沈 is shěn (not chén) if followed by 阳 yáng: 沈阳 city Shěnyáng
18+
# START From Unicode 17, the following should be autogenerated:
19+
[棓]→bàng; # U+68D3
20+
[繃]→bēng; # U+7E43
21+
[俾]→bì; # U+4FFE
22+
[萹]→biǎn; # U+8439
23+
[摽脿蔈麃]→biāo; # U+647D,813F,8508,9E83
24+
[啵]→bō; # U+5575
25+
[柏薄]→bó; # U+67CF,8584
26+
[卜]→bǔ; # U+535C
27+
[差]→chā; # U+5DEE
28+
[沈]→chén; # U+6C88
29+
[牚]→chēng; # U+725A
30+
[埫]→chǒng; # U+57EB
31+
[槭]→cù; # U+69ED
32+
[噠]→dá; # U+5660
33+
[蹬]→dèng; # U+8E6C
34+
[地]→dì; # U+5730
35+
[嗲]→diē; # U+55F2
36+
[䏲跌]→dié; # U+43F2,8DCC
37+
[町]→dīng; # U+753A
38+
[斗]→dǒu; # U+6597
39+
[都]→dū; # U+90FD
40+
[碡]→dú; # U+78A1
41+
[柁]→duò; # U+67C1
42+
[嗯]→en; # U+55EF
43+
[髪髮]→fǎ; # U+9AEA,9AEE
44+
[蕃]→fān; # U+8543
45+
[帆]→fán; # U+5E06
46+
[氾]→fàn; # U+6C3E
47+
[彷]→fǎng; # U+5F77
48+
[坋]→fèn; # U+574B
49+
[諷讽]→fèng; # U+8AF7,8BBD
50+
[乾]→gān; # U+4E7E
51+
[㪅]→gēng; # U+3A85
52+
[蓇]→gǔ; # U+84C7
53+
[聒]→guā; # U+8052
54+
[氿]→guǐ; # U+6C3F
55+
[炔]→guì; # U+7094
56+
[欻]→hū; # U+6B3B
57+
[砉]→huò; # U+7809
58+
[𪟝]→jī; # U+2A7DD
59+
[蓻]→jí; # U+84FB
60+
[袷]→jiá; # U+88B7
61+
[叚]→jiǎ; # U+53DA
62+
[菹]→jū; # U+83F9
63+
[剋]→kè; # U+524B
64+
[框]→kuāng; # U+6846
65+
[适]→kuò; # U+9002
66+
[肋]→lè; # U+808B
67+
[釐]→lí; # U+91D0
68+
[峛]→lǐ; # U+5CDB
69+
[𩷕]→liáng; # U+29DD5
70+
[瞭]→liǎo; # U+77AD
71+
[蹣]→mán; # U+8E63
72+
[眄]→miǎn; # U+7704
73+
[碈]→mín; # U+7888
74+
[万]→mò; # U+4E07
75+
[伲]→nǐ; # U+4F32
76+
[耙]→pá; # U+8019
77+
[芘]→pí; # U+8298
78+
[諞]→pián; # U+8ADE
79+
[剽]→piào; # U+527D
80+
[剖頗]→pǒ; # U+5256,9817
81+
[醱]→pò; # U+91B1
82+
[呇]→qǐ; # U+5447
83+
[癿]→qié; # U+767F
84+
[芎]→qiōng; # U+828E
85+
[杣]→shān; # U+6763
86+
[杓]→sháo; # U+6753
87+
[舍]→shè; # U+820D
88+
[誰]→shéi; # U+8AB0
89+
[識识]→shì; # U+8B58,8BC6
90+
[楯]→shǔn; # U+696F
91+
[洓]→suǒ; # U+6D13
92+
[沓]→tà; # U+6C93
93+
[堤隄]→tí; # U+5824,9684
94+
[萎]→wēi; # U+840E
95+
[硊]→wěi; # U+784A
96+
[筽]→wú; # U+7B7D
97+
[嘸]→wǔ; # U+5638
98+
[㴔]→xī; # U+3D14
99+
[𲆰]→xí; # U+321B0
100+
[𲆦]→xì; # U+321A6
101+
[呷]→xiá; # U+5477
102+
[硍]→xiàn; # U+784D
103+
[崾]→yǎo; # U+5D3E
104+
[畬]→yú; # U+756C
105+
[薁]→yù; # U+8581
106+
[嶦]→zhān; # U+5DA6
107+
[著]→zhe; # U+8457
108+
[徵]→zhēng; # U+5FB5
109+
[苧]→zhù; # U+82E7
110+
# END From Unicode 17, the above should be autogenerated.
111+
# Then run the normal Hani-Latn transform for the rest
112+
::Hani-Latn();
113+
</tRule>
114+
</transform>
115+
</transforms>
116+
</supplementalData>

tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestTransforms.java

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -795,6 +795,38 @@ public void TestHiraKata() { // for CLDR-13127 and ...
795795
assertEquals("Hira-Kata", hiraKata.transform("゛゜ わ゙ ゟ"), "゛゜ ヷ ヨリ");
796796
}
797797

798+
public void TestHani() {
799+
register();
800+
Transliterator haniLatn = getTransliterator("und-Latn-t-und-hani");
801+
Transliterator hansLatn1 = getTransliterator("und-Latn-t-und-hans");
802+
Transliterator hansLatn2 = getTransliterator("Hans-Latn");
803+
Transliterator hantLatn1 = getTransliterator("und-Latn-t-und-hant");
804+
Transliterator hantLatn2 = getTransliterator("Hant-Latn");
805+
806+
// Text that should transform the same way for Hans, Hant
807+
String commonSource = "藏文 重庆 沈阳 秘鲁 㶼 锿 𤸖 藏 秘 䃺 麽 𰈶 重 㑅 飵 𫗢";
808+
String commonExpect =
809+
"zàng wén chóng qìng shěn yáng bì lǔ āi āi āi cáng mì mó mó mó zhòng zuò zuò zuò";
810+
811+
// Text that should transform differently for Hans, Hant
812+
String variantSource = "㪅 㴔 䏲 万 卜 叚 沈 沓 舍 著 髪 髮 麃 𩷕 𪟝 𲆦 𲆰";
813+
String variantExpectHans = "gèng jí tī wàn bo xiá shěn dá shě zhù fà fà páo láng jì xī xī";
814+
String variantExpectHant =
815+
"gēng xī dié mò bǔ jiǎ chén tà shè zhe fǎ fǎ biāo liáng jī xì xí";
816+
817+
assertEquals("common haniLatn ", commonExpect, haniLatn.transform(commonSource));
818+
assertEquals("common hansLatn1", commonExpect, hansLatn1.transform(commonSource));
819+
assertEquals("common hansLatn2", commonExpect, hansLatn2.transform(commonSource));
820+
assertEquals("common hantLatn1", commonExpect, hantLatn1.transform(commonSource));
821+
assertEquals("common hantLatn2", commonExpect, hantLatn2.transform(commonSource));
822+
823+
assertEquals("variant haniLatn ", variantExpectHans, haniLatn.transform(variantSource));
824+
assertEquals("variant hansLatn1", variantExpectHans, hansLatn1.transform(variantSource));
825+
assertEquals("variant hansLatn2", variantExpectHans, hansLatn2.transform(variantSource));
826+
assertEquals("variant hantLatn1", variantExpectHant, hantLatn1.transform(variantSource));
827+
assertEquals("variant hantLatn2", variantExpectHant, hantLatn2.transform(variantSource));
828+
}
829+
798830
public void TestZawgyiToUnicode10899() {
799831
// Some tests for the transformation of Zawgyi font encoding to Unicode Burmese.
800832
Transliterator z2u = getTransliterator("my-t-my-s0-zawgyi");

0 commit comments

Comments
 (0)