Skip to content

Commit 9a88f80

Browse files
committed
script charts: each character on exactly one
1 parent 3d69f72 commit 9a88f80

File tree

1 file changed

+51
-55
lines changed

1 file changed

+51
-55
lines changed

unicodetools/src/main/java/org/unicode/text/UCA/WriteCharts.java

Lines changed: 51 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919
import java.text.SimpleDateFormat;
2020
import java.util.ArrayList;
2121
import java.util.Arrays;
22-
import java.util.BitSet;
2322
import java.util.HashMap;
2423
import java.util.HashSet;
2524
import java.util.Iterator;
@@ -540,13 +539,9 @@ public static void caseChart() throws IOException {
540539
public static void scriptChart() throws IOException {
541540
HACK_KANA = false;
542541

543-
final Set set = new TreeSet();
544-
final BitSet toReturn = new BitSet();
542+
final Set<Pair> set = new TreeSet<>();
545543

546544
for (int i = 0; i <= 0x10FFFF; ++i) {
547-
if (i == 0x0342) {
548-
System.out.println("?");
549-
}
550545
if (!Default.ucd().isRepresented(i)) {
551546
continue;
552547
}
@@ -555,24 +550,22 @@ public static void scriptChart() throws IOException {
555550
continue;
556551
}
557552

558-
final String code = UTF16.valueOf(i);
559-
553+
// TODO: Consider building a Map from script to set-of-code points.
554+
// Or maybe one Map by script, and one Map by General_Category.
560555
final String decomp = Default.nfkd().normalize(i);
561-
getBestScript(i, decomp.equals(code) ? null : decomp, toReturn);
562-
for (int script = toReturn.nextSetBit(0);
563-
script >= 0;
564-
script = toReturn.nextSetBit(script + 1)) {
565-
set.add(
566-
new Pair(
567-
script == COMMON_SCRIPT ? cat + CAT_OFFSET : script,
568-
new Pair(decomp, i)));
569-
}
556+
int script = getBestScript(i, decomp);
557+
// By adding the decomp string into the inner Pair, the chart is sorted by
558+
// decomp, then by code point.
559+
// TODO: Consider sorting each per-script chart in collation order.
560+
set.add(new Pair(script >= 0 ? script : cat + CAT_OFFSET, new Pair(decomp, i)));
561+
// TODO: Consider sorting the scripts in the index in collation order.
562+
// Currently it is in the order of our numeric internal script IDs,
563+
// which is meaningless.
564+
// (Putting the non-script General_Category groups at the end is probably fine.)
570565
}
571566

572567
PrintWriter output = null;
573568

574-
final Iterator it = set.iterator();
575-
576569
int oldScript = -127;
577570

578571
final int counter = 0;
@@ -608,16 +601,13 @@ public static void scriptChart() throws IOException {
608601

609602
int columnCount = 0;
610603

611-
while (it.hasNext()) {
604+
for (Pair p : set) {
612605
Utility.dot(counter);
613606

614-
final Pair p = (Pair) it.next();
615607
final int script = ((Integer) p.first).intValue();
616608
final int cp = ((Integer) ((Pair) p.second).second).intValue();
617609

618-
if (script != oldScript
619-
// && (script != COMMON_SCRIPT && script != INHERITED_SCRIPT)
620-
) {
610+
if (script != oldScript) {
621611
closeFile(output);
622612
output = null;
623613
oldScript = script;
@@ -909,46 +899,52 @@ static short getBestScript(String s) {
909899
// static final UnicodeMap<String> SCRIPT_EXTENSIONS =
910900
// INDEX_UNICODE_PROPS.load(UcdProperty.Script_Extensions);
911901

912-
static BitSet getBestScript(int original, String transformed, BitSet toReturn) {
913-
toReturn.clear();
914-
addScript(original, toReturn);
915-
if (transformed != null) {
916-
int cp;
917-
for (int i = 0; i < transformed.length(); i += UTF16.getCharCount(cp)) {
918-
cp = UTF16.charAt(transformed, i);
919-
addScript(cp, toReturn);
902+
/**
903+
* Returns the best explicit Script value for cp or else for decomp. If there is no such
904+
* explicit script, returns a negative value. Never returns Common or Inherited.
905+
*/
906+
private static int getBestScript(int cp, String decomp) {
907+
int sc = getExplicitScript(cp);
908+
if (sc < 0 && !equals(decomp, cp)) {
909+
for (int i = 0; sc < 0 && i < decomp.length(); i += Character.charCount(cp)) {
910+
cp = decomp.codePointAt(i);
911+
sc = getExplicitScript(cp);
920912
}
921913
}
922-
if (toReturn.isEmpty()) {
923-
toReturn.set(COMMON_SCRIPT);
924-
}
925-
return toReturn;
914+
return sc;
915+
}
916+
917+
private static boolean equals(CharSequence s, int cp) {
918+
int first;
919+
return s.length() != 0
920+
&& (first = Character.codePointAt(s, 0)) == cp
921+
&& Character.charCount(first) == s.length();
926922
}
927923

928924
static ToolUnicodePropertySource properties =
929925
ToolUnicodePropertySource.make(Default.ucdVersion());
930926
static UnicodeProperty SCRIPT_EXTENSIONS = properties.getProperty("script extensions");
931927

932-
private static void addScript(int cp, BitSet toReturn) {
933-
final short script2 = Default.ucd().getScript(cp);
934-
if (script2 == COMMON_SCRIPT || script2 == INHERITED_SCRIPT) {
935-
final String scriptString = SCRIPT_EXTENSIONS.getValue(cp);
936-
if (scriptString == null) {
937-
return;
938-
}
939-
if (scriptString.equals("Zinh") || scriptString.equals("Zyyy")) {
940-
return;
941-
}
942-
if (scriptString.contains(" ")) {
943-
for (final String part : scriptString.split(" ")) {
944-
toReturn.set(findScriptCode(part));
945-
}
946-
} else {
947-
toReturn.set(findScriptCode(scriptString));
948-
}
949-
return;
928+
/**
929+
* Returns cp's explicit Script if it has one. Otherwise returns the script in its
930+
* Script_Extensions, if there is exactly one and that one is explicit. Otherwise returns a
931+
* negative value. Never returns Common or Inherited.
932+
*/
933+
private static int getExplicitScript(int cp) {
934+
int sc = Default.ucd().getScript(cp);
935+
if (isExplicitScript(sc)) {
936+
return sc;
950937
}
951-
toReturn.set(script2);
938+
// See if there is exactly one explicit script in the Script_Extensions.
939+
String scx = SCRIPT_EXTENSIONS.getValue(cp);
940+
if (scx != null && !scx.contains(" ") && !scx.equals("Zinh") && !scx.equals("Zyyy")) {
941+
sc = findScriptCode(scx);
942+
}
943+
return isExplicitScript(sc) ? sc : -1;
944+
}
945+
946+
private static boolean isExplicitScript(int sc) {
947+
return !(sc == COMMON_SCRIPT || sc == INHERITED_SCRIPT);
952948
}
953949

954950
private static int findScriptCode(String part) {

0 commit comments

Comments
 (0)