Skip to content

Commit 72db91b

Browse files
authored
Regenerate ArabicShaping.txt (#1242)
* Almost there * Good enough
1 parent 0de54c8 commit 72db91b

File tree

3 files changed

+201
-3
lines changed

3 files changed

+201
-3
lines changed

unicodetools/data/ucd/dev/ArabicShaping.txt

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
1-
# ArabicShaping-17.0.0.txt
2-
# Date: 2025-08-14
1+
# ArabicShaping-18.0.0.txt
2+
# Date: 2025-11-15, 02:15:28 GMT
33
# © 2025 Unicode®, Inc.
44
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
55
# For terms of use and license, see https://www.unicode.org/terms_of_use.html
66
#
7+
# Unicode Character Database
8+
# For documentation, see https://www.unicode.org/reports/tr44/
9+
#
710
# This file is a normative contributory data file in the
811
# Unicode Character Database.
912
#

unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
import org.unicode.props.UcdPropertyValues.Bidi_Class_Values;
4747
import org.unicode.props.UcdPropertyValues.Block_Values;
4848
import org.unicode.props.UcdPropertyValues.East_Asian_Width_Values;
49+
import org.unicode.props.UcdPropertyValues.Joining_Group_Values;
4950
import org.unicode.props.UcdPropertyValues.Line_Break_Values;
5051
import org.unicode.props.UnicodeProperty;
5152
import org.unicode.text.UCD.MakeUnicodeFiles.Format.PrintStyle;
@@ -609,6 +610,9 @@ public static void generateFile(String filename) throws IOException {
609610
case "UnicodeData":
610611
generateUnicodeData(filename);
611612
break;
613+
case "ArabicShaping":
614+
generateArabicShaping(filename);
615+
break;
612616
default:
613617
generatePropertyFile(filename);
614618
break;
@@ -718,6 +722,83 @@ private static void generateScriptNfkc(String filename) throws IOException {
718722
udf.close();
719723
}
720724

725+
private static void generateArabicShaping(String filename) throws IOException {
726+
final UnicodeDataFile udf =
727+
UnicodeDataFile.openAndWriteHeader("UCD/" + Default.ucdVersion() + '/', filename);
728+
final PrintWriter pw = udf.out;
729+
Format.theFormat.printFileComments(pw, filename);
730+
pw.println();
731+
pw.println("# Unicode; Schematic Name; Joining Type; Joining Group");
732+
final var iup = IndexUnicodeProperties.make();
733+
final var schematicName = iup.getProperty(UcdProperty.Arabic_Shaping_Schematic_Name);
734+
// Other_Joining_Type is the Joining_Type value actually listed in ArabicShaping; characters
735+
// not listed have the value Deduce_From_General_Category.
736+
final var otherJoiningType = iup.getProperty(UcdProperty.Other_Joining_Type);
737+
final var joiningGroup = iup.getProperty(UcdProperty.Joining_Group);
738+
final var block = iup.getProperty(UcdProperty.Block);
739+
740+
final var scope = otherJoiningType.getSet("Deduce_From_General_Category").complement();
741+
String lastBlock = null;
742+
for (int codePoint : scope.codePoints()) {
743+
String blk =
744+
block.getValue(codePoint)
745+
.replace('_', ' ')
746+
.replaceAll("\\b(Extended) ([A-Z])$", "$1-$2");
747+
if (!blk.equals(lastBlock)) {
748+
String section;
749+
String[] comments = {};
750+
switch (Block_Values.forName(blk)) {
751+
case General_Punctuation:
752+
section = "Other";
753+
break;
754+
case Phags_Pa:
755+
section = "Phags-Pa Characters";
756+
break;
757+
case NKo:
758+
section = "N'Ko Characters";
759+
break;
760+
case Kaithi:
761+
section = "Kaithi Number Signs";
762+
// TODO(egg): These comments should probably live in MakeUnicodeFiles.txt,
763+
// but since they are a one-off it is not yet worth defining a syntax for
764+
// that.
765+
comments =
766+
new String[] {
767+
"These are prepended concatenation marks, comparable",
768+
"to the number signs in the Arabic script.",
769+
"Listed here for consistency in property values."
770+
};
771+
break;
772+
default:
773+
section = blk + " Characters";
774+
}
775+
pw.println();
776+
pw.println("# " + section);
777+
for (String line : comments) {
778+
pw.println("# " + line);
779+
}
780+
pw.println();
781+
lastBlock = blk;
782+
}
783+
String jg = joiningGroup.getValue(codePoint);
784+
if (Joining_Group_Values.forName(jg) != Joining_Group_Values.No_Joining_Group) {
785+
jg = jg.toUpperCase().replace('_', ' ');
786+
}
787+
pw.println(
788+
Utility.hex(codePoint)
789+
+ "; "
790+
+ schematicName.getValue(codePoint)
791+
+ "; "
792+
+ otherJoiningType.getFirstValueAlias(
793+
otherJoiningType.getValue(codePoint))
794+
+ "; "
795+
+ jg);
796+
}
797+
pw.println();
798+
pw.println("# EOF");
799+
udf.close();
800+
}
801+
721802
private static void generateUnicodeData(String filename) throws IOException {
722803
final UnicodeDataFile udf =
723804
UnicodeDataFile.openAndWriteHeader("UCD/" + Default.ucdVersion() + '/', filename);

unicodetools/src/main/resources/org/unicode/text/UCD/MakeUnicodeFiles.txt

Lines changed: 115 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1312,6 +1312,121 @@ Value: Brahmi_Joining_Number
13121312
File: UnicodeData
13131313
Property: SPECIAL
13141314

1315+
File: ArabicShaping
1316+
#
1317+
# This file is a normative contributory data file in the
1318+
# Unicode Character Database.
1319+
#
1320+
# This file defines the Joining_Type and Joining_Group property
1321+
# values for Arabic, Syriac, N'Ko, Mandaic, and Manichaean positional
1322+
# shaping, repeating in machine readable form the information
1323+
# exemplified in various tables of The Unicode Standard core specification.
1324+
#
1325+
# This file also defines Joining_Type values for Mongolian, Phags-pa,
1326+
# Psalter Pahlavi, Sogdian, Old Uyghur, Chorasmian, and Adlam positional
1327+
# shaping, and Joining_Type and Joining_Group values for Hanifi Rohingya
1328+
# positional shaping, which are not listed in tables in the core
1329+
# specification.
1330+
#
1331+
# Script Section Table(s)
1332+
#
1333+
# Arabic 9.2 9-3, 9-4, 9-5, 9-7, 9-8, 9-9, 9-10, 9-11, 9-13
1334+
# Syriac 9.3 9-15, 9-16, 9-17, 9-18, 9-19
1335+
# Mandaic 9.5 9-22, 9-23
1336+
# Manichaean 10.5 10-4, 10-5, 10-6, 10-7
1337+
# Psalter Pahlavi 10.6 --
1338+
# Chorasmian 10.8 --
1339+
# Mongolian 13.5 --
1340+
# Phags-pa 14.4 14-7
1341+
# Sogdian 14.10 --
1342+
# Old Uyghur 14.11 --
1343+
# Hanifi Rohingya 16.14 --
1344+
# N'Ko 19.4 19-5
1345+
# Adlam 19.9 --
1346+
#
1347+
# Each line contains four fields, separated by a semicolon.
1348+
#
1349+
# Field 0: the code point of a character, in hexadecimal form.
1350+
#
1351+
# Field 1: gives a short schematic name for that character.
1352+
# The schematic name is descriptive of the shape, based as
1353+
# consistently as possible on a name for the skeleton and
1354+
# then the diacritic marks applied to the skeleton, if any.
1355+
# Note that this schematic name is considered a comment,
1356+
# and does not constitute a formal property value.
1357+
#
1358+
# Field 2: defines the joining type (property name: Joining_Type)
1359+
# R Right_Joining
1360+
# L Left_Joining
1361+
# D Dual_Joining
1362+
# C Join_Causing
1363+
# U Non_Joining
1364+
# T Transparent
1365+
#
1366+
# See Section 9.2, Arabic for more information on these joining types.
1367+
# Note that for cursive joining scripts which are typically rendered
1368+
# top-to-bottom, rather than right-to-left, Joining_Type=L conventionally
1369+
# refers to bottom joining, and Joining_Type=R conventionally refers
1370+
# to top joining. See Section 14.4, Phags-pa for more information on the
1371+
# interpretation of joining types in vertical layout.
1372+
#
1373+
# Field 3: defines the joining group (property name: Joining_Group)
1374+
#
1375+
# The values of the joining group are based schematically on character
1376+
# names. Where a schematic character name consists of two or more parts
1377+
# separated by spaces, the formal Joining_Group property value, as specified in
1378+
# PropertyValueAliases.txt, consists of the same name parts joined by
1379+
# underscores. Hence, the entry:
1380+
#
1381+
# 0629; TEH MARBUTA; R; TEH MARBUTA
1382+
#
1383+
# corresponds to [Joining_Group = Teh_Marbuta].
1384+
#
1385+
# Note: The property value now designated [Joining_Group = Teh_Marbuta_Goal]
1386+
# used to apply to both of the following characters
1387+
# in earlier versions of the standard:
1388+
#
1389+
# U+06C2 ARABIC LETTER HEH GOAL WITH HAMZA ABOVE
1390+
# U+06C3 ARABIC LETTER TEH MARBUTA GOAL
1391+
#
1392+
# However, it currently applies only to U+06C3, and *not* to U+06C2.
1393+
# To avoid destabilizing existing Joining_Group property aliases, the
1394+
# prior Joining_Group value for U+06C3 (Hamza_On_Heh_Goal) has been
1395+
# retained as a property value alias, despite the fact that it
1396+
# no longer applies to its namesake character, U+06C2.
1397+
# See PropertyValueAliases.txt.
1398+
#
1399+
# When other cursive scripts are added to the Unicode Standard in the
1400+
# future, the joining group value of all its letters will default to
1401+
# jg=No_Joining_Group in this data file. Other, more specific
1402+
# joining group values will be defined only if an explicit proposal
1403+
# to define those values exactly has been approved by the UTC. This
1404+
# is the convention exemplified by the N'Ko, Mandaic, Mongolian,
1405+
# Phags-pa, Psalter Pahlavi, Sogdian, Old Uyghur, Chorasmian, and Adlam scripts.
1406+
# Only the Arabic, Manichaean, and Syriac scripts currently have
1407+
# explicit joining group values defined for all characters, including
1408+
# those which have only a single character in a particular Joining_Group
1409+
# class. Hanifi Rohingya has explicit Joining_Group values assigned only for
1410+
# the few characters which share a particular Joining_Group class, but
1411+
# assigns jg=No_Joining_Group to all the singletons.
1412+
#
1413+
# Note: Code points that are not explicitly listed in this file are
1414+
# either of Joining_Type T or U:
1415+
#
1416+
# - Those that are not explicitly listed and that are of General_Category Mn, Me, or Cf
1417+
# are Joining_Type=T.
1418+
# - All others not explicitly listed are Joining_Type=U.
1419+
#
1420+
# For an explicit listing of all characters of Joining_Type=T, see
1421+
# the derived property file DerivedJoiningType.txt.
1422+
# For an implementation that needs to parse for the values of
1423+
# Joining_Type, it is recommended to use DerivedJoiningType.txt
1424+
# instead of ArabicShaping.txt, to avoid the separate required step of
1425+
# calculating the set for Joining_Type=T based on General_Category values.
1426+
#
1427+
# #############################################################
1428+
Property: SPECIAL
1429+
13151430
File: SpecialCasing
13161431
Property: SPECIAL
13171432

@@ -1361,7 +1476,6 @@ HackName:
13611476
FinalComments
13621477
Note that PropertyAliases sorts by the long name, while PropertyValueAliases
13631478
sorts by the short name
1364-
ArabicShaping
13651479
BidiMirroring
13661480
CompositionExclusions
13671481
StandardizedVariants

0 commit comments

Comments
 (0)