CLDR-18080 add Hant-Latn transform, add Hans-Latn alias for Hani-Latn

pedberg-icu · pedberg-icu · commit 958d087fe41b · 2025-01-22T13:08:18.000-08:00
diff --git a/common/testData/transforms/und-Latn-t-und-hans.txt b/common/testData/transforms/und-Latn-t-und-hans.txt
@@ -0,0 +1,39 @@
+# compounds
+藏文	zàng wén
+重庆	chóng qìng
+沈阳	shěn yáng
+秘鲁	bì lǔ
+# some chars that transform the same for Hans, Hant (incl first chars of 3 compounds)
+㶼	āi
+锿	āi
+𤸖	āi
+藏	cáng
+秘	mì
+䃺	mó
+麽	mó
+𰈶	mó
+重	zhòng
+㑅	zuò
+飵	zuò
+𫗢	zuò
+# some chars that transform differently for Hans, Hant
+# extension A
+㪅	gèng
+㴔	jí
+䏲	tī
+# CJK unified
+万	wàn
+卜	bo
+叚	xiá
+沈	shěn
+沓	dá
+舍	shě
+著	zhù
+髪	fà
+髮	fà
+麃	páo
+# supplementary
+𩷕	láng
+𪟝	jì
+𲆦	xī
+𲆰	xī
diff --git a/common/testData/transforms/und-Latn-t-und-hant.txt b/common/testData/transforms/und-Latn-t-und-hant.txt
@@ -0,0 +1,39 @@
+# compounds
+藏文	zàng wén
+重庆	chóng qìng
+沈阳	shěn yáng
+秘鲁	bì lǔ
+# some chars that transform the same for Hans, Hant (incl first chars of 3 compounds)
+㶼	āi
+锿	āi
+𤸖	āi
+藏	cáng
+秘	mì
+䃺	mó
+麽	mó
+𰈶	mó
+重	zhòng
+㑅	zuò
+飵	zuò
+𫗢	zuò
+# some chars that transform differently for Hans, Hant
+# extension A
+㪅	gēng
+㴔	xī
+䏲	dié
+# CJK unified
+万	mò
+卜	bǔ
+叚	jiǎ
+沈	chén
+沓	tà
+舍	shè
+著	zhe
+髪	fǎ
+髮	fǎ
+麃	biāo
+# supplementary
+𩷕	liáng
+𪟝	jī
+𲆦	xì
+𲆰	xí
diff --git a/common/transforms/Han-Latin.xml b/common/transforms/Han-Latin.xml
@@ -1,14 +1,14 @@
 <?xml version='1.0' encoding='UTF-8' ?>
 <!DOCTYPE supplementalData SYSTEM '../../common/dtd/ldmlSupplemental.dtd'>
 <!--
-Copyright © 1991-2015 Unicode, Inc.
+Copyright © 1991-2025 Unicode, Inc.
 CLDR data files are interpreted according to the LDML specification (http://unicode.org/reports/tr35/)
 For terms of use, see http://www.unicode.org/copyright.html
 -->
 <supplementalData>
 	<version number='$Revision$'/>
 	<transforms>
-		<transform source="Hani" target="Latn" direction="forward" alias="Han-Latin und-Latn-t-und-hani">
+		<transform source="Hani" target="Latn" direction="forward" alias="Han-Latin Hans-Latn und-Latn-t-und-hani und-Latn-t-und-hans">
 			<tRule>
 # Warning: does not do round-trip mapping!!
 # Convert CJK characters
@@ -17,7 +17,7 @@ For terms of use, see http://www.unicode.org/copyright.html
 # Note that Han-Spacedhan() has already been applied, so there should be spaces between Han characters.
 藏 } \u0020? 文 →zàng;# 藏 is zàng (not cáng) if followed by 文 wén: 藏文 language Zàngwén = Tibetan
 重 } \u0020? 庆 →chóng;# 重 is chóng (not zhòng) if followed by 庆 qìng: 重庆 city Chóngqìng
-沈 } \u0020? 阳 →shěn;# 沈 is shěn (not chén) if followed by 阳 yáng: 沈阳 city Shěnyáng
+# "沈 } \u0020? 阳 →shěn" is obsolete for Hans: the kMandarin entry for 沈 changed from "chén" to "shěn chén" in Unicode 14, so 沈 now maps to shěn without a special case
 秘 } \u0020? 鲁 →bì;# 秘 is bì (not mì) if followed by 鲁 lǔ: 秘鲁 country Bìlǔ = Peru
 # START AUTOGENERATED Han-Latin.xml ( Unihan kMandarin)
 [吖錒锕阿𠼞𥥩𨉚𱚱]→ā;
diff --git a/common/transforms/Hant-Latin.xml b/common/transforms/Hant-Latin.xml
@@ -0,0 +1,116 @@
+<?xml version='1.0' encoding='UTF-8' ?>
+<!DOCTYPE supplementalData SYSTEM '../../common/dtd/ldmlSupplemental.dtd'>
+<!--
+Copyright © 2025 Unicode, Inc.
+CLDR data files are interpreted according to the LDML specification (http://unicode.org/reports/tr35/)
+For terms of use, see http://www.unicode.org/copyright.html
+-->
+<supplementalData>
+	<version number='$Revision$'/>
+	<transforms>
+		<transform source="Hant" target="Latn" direction="forward" alias="und-Latn-t-und-hant">
+			<tRule>
+# Warning: does not do round-trip mapping!!
+# Convert compounds; these are added individually, not derived from Unihan kMandarin.
+# Here Han-Spacedhan() has not yet been applied.
+# The following was moved from Hani-Latn (Han-Latin.xml); in a Hant/Taiwan context, the simplified-form city name 沈阳 should still transform to "shěn yáng".
+沈 } 阳 →shěn;# 沈 is shěn (not chén) if followed by 阳 yáng: 沈阳 city Shěnyáng
+# START From Unicode 17, the following should be autogenerated:
+[棓]→bàng;  # U+68D3
+[繃]→bēng;  # U+7E43
+[俾]→bì;    # U+4FFE
+[萹]→biǎn;  # U+8439
+[摽脿蔈麃]→biāo; # U+647D,813F,8508,9E83
+[啵]→bō;    # U+5575
+[柏薄]→bó;   # U+67CF,8584
+[卜]→bǔ;    # U+535C
+[差]→chā;   # U+5DEE
+[沈]→chén;  # U+6C88
+[牚]→chēng; # U+725A
+[埫]→chǒng; # U+57EB
+[槭]→cù;    # U+69ED
+[噠]→dá;    # U+5660
+[蹬]→dèng;  # U+8E6C
+[地]→dì;    # U+5730
+[嗲]→diē;   # U+55F2
+[䏲跌]→dié;  # U+43F2,8DCC
+[町]→dīng;  # U+753A
+[斗]→dǒu;   # U+6597
+[都]→dū;    # U+90FD
+[碡]→dú;    # U+78A1
+[柁]→duò;   # U+67C1
+[嗯]→en;    # U+55EF
+[髪髮]→fǎ;   # U+9AEA,9AEE
+[蕃]→fān;   # U+8543
+[帆]→fán;   # U+5E06
+[氾]→fàn;   # U+6C3E
+[彷]→fǎng;  # U+5F77
+[坋]→fèn;   # U+574B
+[諷讽]→fèng; # U+8AF7,8BBD
+[乾]→gān;   # U+4E7E
+[㪅]→gēng;  # U+3A85
+[蓇]→gǔ;    # U+84C7
+[聒]→guā;   # U+8052
+[氿]→guǐ;   # U+6C3F
+[炔]→guì;   # U+7094
+[欻]→hū;    # U+6B3B
+[砉]→huò;   # U+7809
+[𪟝]→jī;    # U+2A7DD
+[蓻]→jí;    # U+84FB
+[袷]→jiá;   # U+88B7
+[叚]→jiǎ;   # U+53DA
+[菹]→jū;    # U+83F9
+[剋]→kè;    # U+524B
+[框]→kuāng; # U+6846
+[适]→kuò;   # U+9002
+[肋]→lè;    # U+808B
+[釐]→lí;    # U+91D0
+[峛]→lǐ;    # U+5CDB
+[𩷕]→liáng; # U+29DD5
+[瞭]→liǎo;  # U+77AD
+[蹣]→mán;   # U+8E63
+[眄]→miǎn;  # U+7704
+[碈]→mín;   # U+7888
+[万]→mò;    # U+4E07
+[伲]→nǐ;    # U+4F32
+[耙]→pá;    # U+8019
+[芘]→pí;    # U+8298
+[諞]→pián;  # U+8ADE
+[剽]→piào;  # U+527D
+[剖頗]→pǒ;   # U+5256,9817
+[醱]→pò;    # U+91B1
+[呇]→qǐ;    # U+5447
+[癿]→qié;   # U+767F
+[芎]→qiōng; # U+828E
+[杣]→shān;  # U+6763
+[杓]→sháo;  # U+6753
+[舍]→shè;   # U+820D
+[誰]→shéi;  # U+8AB0
+[識识]→shì; # U+8B58,8BC6
+[楯]→shǔn;  # U+696F
+[洓]→suǒ;   # U+6D13
+[沓]→tà;    # U+6C93
+[堤隄]→tí;   # U+5824,9684
+[萎]→wēi;   # U+840E
+[硊]→wěi;   # U+784A
+[筽]→wú;    # U+7B7D
+[嘸]→wǔ;    # U+5638
+[㴔]→xī;    # U+3D14
+[𲆰]→xí;    # U+321B0
+[𲆦]→xì;    # U+321A6
+[呷]→xiá;   # U+5477
+[硍]→xiàn;   # U+784D
+[崾]→yǎo;    # U+5D3E
+[畬]→yú;   # U+756C
+[薁]→yù;   # U+8581
+[嶦]→zhān;   # U+5DA6
+[著]→zhe;   # U+8457
+[徵]→zhēng;   # U+5FB5
+[苧]→zhù;   # U+82E7
+# END From Unicode 17, the above should be autogenerated.
+# Then run the normal Hani-Latn transform for the rest
+::Hani-Latn();
+			</tRule>
+		</transform>
+	</transforms>
+</supplementalData>
diff --git a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestTransforms.java b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestTransforms.java
@@ -795,6 +795,38 @@ public void TestHiraKata() { // for CLDR-13127 and ...
         assertEquals("Hira-Kata", hiraKata.transform("゛゜ わ゙ ゟ"), "゛゜ ヷ ヨリ");
     }
 
+    public void TestHani() { // CLDR-18080: Hani/Hans/Hant to Latn, by BCP47 ID and short ID
+        register(); // NOTE(review): presumably registers the CLDR transforms; confirm
+        Transliterator haniLatn = getTransliterator("und-Latn-t-und-hani");
+        Transliterator hansLatn1 = getTransliterator("und-Latn-t-und-hans");
+        Transliterator hansLatn2 = getTransliterator("Hans-Latn"); // alias on Hani-Latn
+        Transliterator hantLatn1 = getTransliterator("und-Latn-t-und-hant");
+        Transliterator hantLatn2 = getTransliterator("Hant-Latn");
+
+        // Text expected to romanize identically for Hani, Hans, Hant: 4 compounds, then singles
+        String commonSource = "藏文 重庆 沈阳 秘鲁 㶼 锿 𤸖 藏 秘 䃺 麽 𰈶 重 㑅 飵 𫗢";
+        String commonExpect =
+                "zàng wén chóng qìng shěn yáng bì lǔ āi āi āi cáng mì mó mó mó zhòng zuò zuò zuò";
+
+        // Single characters whose expected reading differs between Hans and Hant
+        String variantSource = "㪅 㴔 䏲 万 卜 叚 沈 沓 舍 著 髪 髮 麃 𩷕 𪟝 𲆦 𲆰";
+        String variantExpectHans = "gèng jí tī wàn bo xiá shěn dá shě zhù fà fà páo láng jì xī xī";
+        String variantExpectHant =
+                "gēng xī dié mò bǔ jiǎ chén tà shè zhe fǎ fǎ biāo liáng jī xì xí";
+        // All five transliterators must produce the same output for the common text.
+        assertEquals("common haniLatn ", commonExpect, haniLatn.transform(commonSource));
+        assertEquals("common hansLatn1", commonExpect, hansLatn1.transform(commonSource));
+        assertEquals("common hansLatn2", commonExpect, hansLatn2.transform(commonSource));
+        assertEquals("common hantLatn1", commonExpect, hantLatn1.transform(commonSource));
+        assertEquals("common hantLatn2", commonExpect, hantLatn2.transform(commonSource));
+        // Hani is checked against the Hans expectation; only the two Hant IDs use the Hant one.
+        assertEquals("variant haniLatn ", variantExpectHans, haniLatn.transform(variantSource));
+        assertEquals("variant hansLatn1", variantExpectHans, hansLatn1.transform(variantSource));
+        assertEquals("variant hansLatn2", variantExpectHans, hansLatn2.transform(variantSource));
+        assertEquals("variant hantLatn1", variantExpectHant, hantLatn1.transform(variantSource));
+        assertEquals("variant hantLatn2", variantExpectHant, hantLatn2.transform(variantSource));
+    }
+
     public void TestZawgyiToUnicode10899() {
         // Some tests for the transformation of Zawgyi font encoding to Unicode Burmese.
         Transliterator z2u = getTransliterator("my-t-my-s0-zawgyi");