Skip to content

Commit fbb3d6e

Browse files
committed
Merged main
2 parents b963a8a + 637341b commit fbb3d6e

39 files changed

+692
-189
lines changed

docs/pipeline.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,7 @@ git checkout la-vache/main unicodetools/data/ucd/dev/extracted/*;
155155
git checkout la-vache/main unicodetools/data/ucd/dev/auxiliary/*;
156156
rm .\Generated\* -recurse -force;
157157
mvn compile exec:java '-Dexec.mainClass="org.unicode.text.UCD.Main"' '-Dexec.args="build MakeUnicodeFiles"' -am -pl unicodetools "-DCLDR_DIR=..\cldr\" "-DUNICODETOOLS_GEN_DIR=Generated" "-DUNICODETOOLS_REPO_DIR=.";
158-
cp .\Generated\UCD\17.0.0\* .\unicodetools\data\ucd\dev -recurse -force;
158+
cp .\Generated\UCD\18.0.0\* .\unicodetools\data\ucd\dev -recurse -force;
159159
rm unicodetools\data\ucd\dev\zzz-unchanged-*;
160160
rm unicodetools\data\ucd\dev\*\zzz-unchanged-*;
161161
rm .\unicodetools\data\ucd\dev\extra\*;
@@ -247,4 +247,4 @@ eggrobin (Windows, in-source).
247247
```powershell
248248
mvn test -am -pl unicodetools "-DCLDR_DIR=$(gl|split-path -parent)\cldr\" "-DUNICODETOOLS_GEN_DIR=$(gl|split-path -parent)\unicodetools\Generated\" "-DUNICODETOOLS_REPO_DIR=$(gl|split-path -parent)\unicodetools\" "-DUVERSION=18.0.0" "-Dtest=TestTestUnicodeInvariants#testAdditionComparisons" -DfailIfNoTests=false -DtrimStackTrace=false
249249
```
250-
Results are in Generated\UnicodeTestResults-addition-comparisons-[RMG issue number].html.
250+
Results are in Generated\UnicodeTestResults-addition-comparisons-[RMG issue number].html.

unicodetools/data/security/dev/confusables.txt

Lines changed: 70 additions & 20 deletions
Large diffs are not rendered by default.

unicodetools/data/security/dev/confusablesSummary.txt

Lines changed: 117 additions & 43 deletions
Large diffs are not rendered by default.

unicodetools/data/security/dev/data/confusablesSummaryIdentifier.txt

Lines changed: 72 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# confusablesSummaryIdentifier.txt
2-
# Date: 2025-10-11, 02:30:37 GMT
2+
# Date: 2025-10-25, 07:52:31 GMT
33
# © 2025 Unicode®, Inc.
44
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
55
# For terms of use and license, see https://www.unicode.org/terms_of_use.html
@@ -459,9 +459,17 @@
459459
(‎ φ ‎) 03C6 GREEK SMALL LETTER PHI
460460
← (‎ ф ‎) 0444 CYRILLIC SMALL LETTER EF # →ɸ→
461461

462-
# ́ َ
462+
# ́ َ ަ
463463
(‎ ́ ‎) 0301 COMBINING ACUTE ACCENT
464464
← (‎ َ ‎) 064E ARABIC FATHA
465+
← (‎ ަ ‎) 07A6 THAANA ABAFILI
466+
467+
# ́́ ަަ ̋ ً ާ
468+
(‎ ́́ ‎) 0301 0301 COMBINING ACUTE ACCENT, COMBINING ACUTE ACCENT
469+
← (‎ ަަ ‎) 07A6 07A6 THAANA ABAFILI, THAANA ABAFILI
470+
← (‎ ̋ ‎) 030B COMBINING DOUBLE ACUTE ACCENT
471+
← (‎ ً ‎) 064B ARABIC FATHATAN # →̋→
472+
← (‎ ާ ‎) 07A7 THAANA AABAAFILI # →ަަ→
465473

466474
# ̆ ̌ ॅ
467475
(‎ ̆ ‎) 0306 COMBINING BREVE
@@ -481,29 +489,32 @@
481489
← (‎ ં ‎) 0A82 GUJARATI SIGN ANUSVARA
482490
← (‎ ் ‎) 0BCD TAMIL SIGN VIRAMA
483491

484-
# ̊ ْ ํ ໍ ံ ំ
492+
# ̊ ْ ް ํ ໍ ံ ំ
485493
(‎ ̊ ‎) 030A COMBINING RING ABOVE
486494
← (‎ ْ ‎) 0652 ARABIC SUKUN
495+
← (‎ ް ‎) 07B0 THAANA SUKUN
487496
← (‎ ํ ‎) 0E4D THAI CHARACTER NIKHAHIT
488497
← (‎ ໍ ‎) 0ECD LAO NIGGAHITA
498+
← (‎ ཾ ‎) 0F7E TIBETAN SIGN RJES SU NGA RO
489499
← (‎ ံ ‎) 1036 MYANMAR SIGN ANUSVARA
490500
← (‎ ំ ‎) 17C6 KHMER SIGN NIKAHIT
491501

492-
# ̋ ً
493-
(‎ ̋ ‎) 030B COMBINING DOUBLE ACUTE ACCENT
494-
← (‎ ً ‎) 064B ARABIC FATHATAN
502+
# ِ ި
503+
(‎ ِ ‎) 0650 ARABIC KASRA
504+
← (‎ ި ‎) 07A8 THAANA IBIFILI # →̗→
495505

496506
# ̦ ̧
497507
(‎ ̦ ‎) 0326 COMBINING COMMA BELOW
498508
← (‎ ̧ ‎) 0327 COMBINING CEDILLA # →̡→
499509

500-
# ̣ ़ ় ਼ ઼ ଼
510+
# ̣ ़ ় ਼ ઼ ଼
501511
(‎ ̣ ‎) 0323 COMBINING DOT BELOW
502512
← (‎ ़ ‎) 093C DEVANAGARI SIGN NUKTA
503513
← (‎ ় ‎) 09BC BENGALI SIGN NUKTA
504514
← (‎ ਼ ‎) 0A3C GURMUKHI SIGN NUKTA
505515
← (‎ ઼ ‎) 0ABC GUJARATI SIGN NUKTA
506516
← (‎ ଼ ‎) 0B3C ORIYA SIGN NUKTA
517+
← (‎ ฺ ‎) 0E3A THAI CHARACTER PHINTHU
507518

508519
# ॖ ੁ
509520
(‎ ॖ ‎) 0956 DEVANAGARI VOWEL SIGN UE
@@ -513,6 +524,10 @@
513524
(‎ ॗ ‎) 0957 DEVANAGARI VOWEL SIGN UUE
514525
← (‎ ੂ ‎) 0A42 GURMUKHI VOWEL SIGN UU
515526

527+
# ުު ޫ
528+
(‎ ުު ‎) 07AA 07AA THAANA UBUFILI, THAANA UBUFILI
529+
← (‎ ޫ ‎) 07AB THAANA OOBOOFILI
530+
516531
# Γ Г
517532
(‎ Γ ‎) 0393 GREEK CAPITAL LETTER GAMMA
518533
← (‎ Г ‎) 0413 CYRILLIC CAPITAL LETTER GHE
@@ -684,6 +699,11 @@
684699
(‎ پ̆ ‎) 067E 0306 ARABIC LETTER PEH, COMBINING BREVE
685700
← (‎ ࢾ ‎) 08BE ARABIC LETTER PEH WITH SMALL V # →‎پٚ‎→
686701

702+
# ِِ ިި ީ
703+
(‎ ِِ ‎) 0650 0650 ARABIC KASRA, ARABIC KASRA
704+
← (‎ ިި ‎) 07A8 07A8 THAANA IBIFILI, THAANA IBIFILI
705+
← (‎ ީ ‎) 07A9 THAANA EEBEEFILI # →ިި→
706+
687707
# ٢ ۲
688708
(‎ ٢ ‎) 0662 ARABIC-INDIC DIGIT TWO
689709
← (‎ ۲ ‎) 06F2 EXTENDED ARABIC-INDIC DIGIT TWO
@@ -725,6 +745,14 @@
725745
(‎ ݧ ‎) 0767 ARABIC LETTER NOON WITH TWO DOTS BELOW
726746
← (‎ ࢩ ‎) 08A9 ARABIC LETTER YEH WITH TWO DOTS BELOW AND DOT ABOVE # →‎ݔ‎→
727747

748+
# ެު ޮ
749+
(‎ ެު ‎) 07AC 07AA THAANA EBEFILI, THAANA UBUFILI
750+
← (‎ ޮ ‎) 07AE THAANA OBOFILI
751+
752+
# ެެ ޭ
753+
(‎ ެެ ‎) 07AC 07AC THAANA EBEFILI, THAANA EBEFILI
754+
← (‎ ޭ ‎) 07AD THAANA EYBEYFILI
755+
728756
# अ̆ अॅ ॲ
729757
(‎ अ̆ ‎) 0905 0306 DEVANAGARI LETTER A, COMBINING BREVE
730758
← (‎ अॅ ‎) 0905 0945 DEVANAGARI LETTER A, DEVANAGARI VOWEL SIGN CANDRA E
@@ -888,13 +916,15 @@
888916
(‎ ८ ‎) 096E DEVANAGARI DIGIT EIGHT
889917
← (‎ ૮ ‎) 0AEE GUJARATI DIGIT EIGHT
890918

891-
# ঃ ః ಃ ഃ ඃ း
919+
# ঃ ః ಃ ഃ ඃ း ះ 𑌃
892920
(‎ ঃ ‎) 0983 BENGALI SIGN VISARGA
893921
← (‎ ః ‎) 0C03 TELUGU SIGN VISARGA # →ਃ→
894922
← (‎ ಃ ‎) 0C83 KANNADA SIGN VISARGA # →ః→→ਃ→
895923
← (‎ ഃ ‎) 0D03 MALAYALAM SIGN VISARGA # →ಃ→→ః→→ਃ→
896924
← (‎ ඃ ‎) 0D83 SINHALA SIGN VISARGAYA # →ഃ→→ಃ→→ః→→ਃ→
897925
← (‎ း ‎) 1038 MYANMAR SIGN VISARGA # →ඃ→→ഃ→→ಃ→→ః→→ਃ→
926+
← (‎ ះ ‎) 17C7 KHMER SIGN REAHMUK
927+
← (‎ 𑌃 ‎) 11303 GRANTHA SIGN VISARGA
898928

899929
# অা আ
900930
(‎ অা ‎) 0985 09BE BENGALI LETTER A, BENGALI VOWEL SIGN AA
@@ -1255,6 +1285,14 @@
12551285
(‎ භ ‎) 0DB7 SINHALA LETTER MAHAAPRAANA BAYANNA
12561286
← (‎ හ ‎) 0DC4 SINHALA LETTER HAYANNA
12571287

1288+
# ෘෘ ෲ
1289+
(‎ ෘෘ ‎) 0DD8 0DD8 SINHALA VOWEL SIGN GAETTA-PILLA, SINHALA VOWEL SIGN GAETTA-PILLA
1290+
← (‎ ෲ ‎) 0DF2 SINHALA VOWEL SIGN DIGA GAETTA-PILLA
1291+
1292+
# ෙෙ ෛ
1293+
(‎ ෙෙ ‎) 0DD9 0DD9 SINHALA VOWEL SIGN KOMBUVA, SINHALA VOWEL SIGN KOMBUVA
1294+
← (‎ ෛ ‎) 0DDB SINHALA VOWEL SIGN KOMBU DEKA
1295+
12581296
# ข ฃ
12591297
(‎ ข ‎) 0E02 THAI CHARACTER KHO KHAI
12601298
← (‎ ฃ ‎) 0E03 THAI CHARACTER KHO KHUAT
@@ -1360,6 +1398,10 @@
13601398
(‎ ๋ ‎) 0E4B THAI CHARACTER MAI CHATTAWA
13611399
← (‎ ໋ ‎) 0ECB LAO TONE MAI CATAWA
13621400

1401+
# ເເ ແ
1402+
(‎ ເເ ‎) 0EC0 0EC0 LAO VOWEL SIGN E, LAO VOWEL SIGN E
1403+
← (‎ ແ ‎) 0EC1 LAO VOWEL SIGN EI
1404+
13631405
# ེེ ཻ
13641406
(‎ ེེ ‎) 0F7A 0F7A TIBETAN VOWEL SIGN E, TIBETAN VOWEL SIGN E
13651407
← (‎ ཻ ‎) 0F7B TIBETAN VOWEL SIGN EE
@@ -1386,6 +1428,10 @@
13861428
← (‎ ဩော် ‎) 1029 1031 102C 103A MYANMAR LETTER O, MYANMAR VOWEL SIGN E, MYANMAR VOWEL SIGN AA, MYANMAR SIGN ASAT
13871429
← (‎ ဪ ‎) 102A MYANMAR LETTER AU # →ဩော်→
13881430

1431+
# ၵာ ဢ
1432+
(‎ ဢ ‎) 1022 MYANMAR LETTER SHAN A
1433+
← (‎ ၵာ ‎) 1075 102C MYANMAR LETTER SHAN KA, MYANMAR VOWEL SIGN AA
1434+
13891435
# ၽှ ၾ
13901436
(‎ ၽှ ‎) 107D 103E MYANMAR LETTER SHAN PHA, MYANMAR CONSONANT SIGN MEDIAL HA
13911437
← (‎ ၾ ‎) 107E MYANMAR LETTER SHAN FA
@@ -1394,9 +1440,26 @@
13941440
(‎ ដ ‎) 178A KHMER LETTER DA
13951441
← (‎ ត ‎) 178F KHMER LETTER TA
13961442

1443+
# ទ្ប ឡ
1444+
(‎ ទ្ប ‎) 1791 17D2 1794 KHMER LETTER TO, KHMER SIGN COENG, KHMER LETTER BA
1445+
← (‎ ឡ ‎) 17A1 KHMER LETTER LA
1446+
1447+
# ព្ធ ឰ
1448+
(‎ ព្ធ ‎) 1796 17D2 1792 KHMER LETTER PO, KHMER SIGN COENG, KHMER LETTER THO
1449+
← (‎ ឰ ‎) 17B0 KHMER INDEPENDENT VOWEL QAI
1450+
1451+
# េี េី ើ
1452+
(‎ ើ ‎) 17BE KHMER VOWEL SIGN OE
1453+
← (‎ េี ‎) 17C1 0E35 KHMER VOWEL SIGN E, THAI CHARACTER SARA II # →េី→
1454+
← (‎ េី ‎) 17C1 17B8 KHMER VOWEL SIGN E, KHMER VOWEL SIGN II
1455+
1456+
# េា ោ
1457+
(‎ េា ‎) 17C1 17B6 KHMER VOWEL SIGN E, KHMER VOWEL SIGN AA
1458+
← (‎ ោ ‎) 17C4 KHMER VOWEL SIGN OO
1459+
13971460
# へ ヘ
13981461
(‎ へ ‎) 3078 HIRAGANA LETTER HE
13991462
← (‎ ヘ ‎) 30D8 KATAKANA LETTER HE
14001463

1401-
# total : 500
1464+
# total : 524
14021465

unicodetools/data/security/dev/data/source/confusables-source.txt

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5759,3 +5759,55 @@ A7F1 ; 02E2 # ( ꟱ → ˢ ) MODIFIER LETTER CAPITAL S → MODIFIER LETTER SMAL
57595759

57605760
# Confusables data for U+00A1 INVERTED EXCLAMATION MARK (PAG ref #453)
57615761
00A1 ; 0069
5762+
5763+
# Confusables data for dandas and double dandas (PAG ref #468)
5764+
0964 ; 007C # DEVANAGARI DANDA
5765+
104A ; 007C # MYANMAR SIGN LITTLE SECTION
5766+
A8CE ; 007C # SAURASHTRA DANDA
5767+
11047 ; 007C # BRAHMI DANDA
5768+
110C0 ; 007C # KAITHI DANDA
5769+
11141 ; 007C # CHAKMA DANDA
5770+
111C5 ; 007C # SHARADA DANDA
5771+
1144B ; 007C # NEWA DANDA
5772+
11641 ; 007C # MODI DANDA
5773+
11C41 ; 007C # BHAIKSUKI DANDA
5774+
AA5D ; 007C # CHAM PUNCTUATION DANDA
5775+
113D4 ; 007C # TULU-TIGALARI DANDA
5776+
115C5 ; 007C # SIDDHAM SEPARATOR BAR
5777+
16D63 ; 007C # KIRAT RAI VOWEL SIGN AA
5778+
0965 ; 007C 007C # DEVANAGARI DOUBLE DANDA
5779+
104B ; 007C 007C # MYANMAR SIGN SECTION
5780+
A8CF ; 007C 007C # SAURASHTRA DOUBLE DANDA
5781+
11048 ; 007C 007C # BRAHMI DOUBLE DANDA
5782+
110C1 ; 007C 007C # KAITHI DOUBLE DANDA
5783+
11142 ; 007C 007C # CHAKMA DOUBLE DANDA
5784+
111C6 ; 007C 007C # SHARADA DOUBLE DANDA
5785+
1144C ; 007C 007C # NEWA DOUBLE DANDA
5786+
11642 ; 007C 007C # MODI DOUBLE DANDA
5787+
11C42 ; 007C 007C # BHAIKSUKI DOUBLE DANDA
5788+
113D5 ; 007C 007C # TULU-TIGALARI DOUBLE DANDA
5789+
5790+
# High-priority confusables data (PAG ref #458)
5791+
030B ; 0301 0301
5792+
07A6 ; 0301
5793+
07A7 ; 07A6 07A6
5794+
07A8 ; 0317
5795+
07A9 ; 07A8 07A8
5796+
07AA ; 0350
5797+
07AB ; 07AA 07AA
5798+
07AC ; 1DFE
5799+
07AD ; 07AC 07AC
5800+
07AE ; 07AC 07AA
5801+
07B0 ; 030A
5802+
0DDB ; 0DD9 0DD9
5803+
0DF2 ; 0DD8 0DD8
5804+
0E3A ; 0323
5805+
0EC1 ; 0EC0 0EC0
5806+
0F7E ; 030A
5807+
1022 ; 1075 102C
5808+
17A1 ; 1791 17D2 1794
5809+
17B0 ; 1796 17D2 1792
5810+
17BE ; 17C1 17B8
5811+
17C4 ; 17C1 17B6
5812+
17C7 ; 0983
5813+
11303 ; 0983

0 commit comments

Comments
 (0)