1919import java .text .SimpleDateFormat ;
2020import java .util .ArrayList ;
2121import java .util .Arrays ;
22- import java .util .BitSet ;
2322import java .util .HashMap ;
2423import java .util .HashSet ;
2524import java .util .Iterator ;
@@ -540,13 +539,9 @@ public static void caseChart() throws IOException {
540539 public static void scriptChart () throws IOException {
541540 HACK_KANA = false ;
542541
543- final Set set = new TreeSet ();
544- final BitSet toReturn = new BitSet ();
542+ final Set <Pair > set = new TreeSet <>();
545543
546544 for (int i = 0 ; i <= 0x10FFFF ; ++i ) {
547- if (i == 0x0342 ) {
548- System .out .println ("?" );
549- }
550545 if (!Default .ucd ().isRepresented (i )) {
551546 continue ;
552547 }
@@ -555,24 +550,22 @@ public static void scriptChart() throws IOException {
555550 continue ;
556551 }
557552
558- final String code = UTF16 . valueOf ( i );
559-
553+ // TODO: Consider building a Map from script to set-of-code points.
554+ // Or maybe one Map by script, and one Map by General_Category.
560555 final String decomp = Default .nfkd ().normalize (i );
561- getBestScript (i , decomp . equals ( code ) ? null : decomp , toReturn );
562- for ( int script = toReturn . nextSetBit ( 0 );
563- script >= 0 ;
564- script = toReturn . nextSetBit ( script + 1 )) {
565- set . add (
566- new Pair (
567- script == COMMON_SCRIPT ? cat + CAT_OFFSET : script ,
568- new Pair ( decomp , i )));
569- }
556+ int script = getBestScript (i , decomp );
557+ // By adding the decomp string into the inner Pair, the chart is sorted by
558+ // decomp, then by code point.
559+ // TODO: Consider sorting each per-script chart in collation order.
560+ set . add ( new Pair ( script >= 0 ? script : cat + CAT_OFFSET , new Pair ( decomp , i )));
561+ // TODO: Consider sorting the scripts in the index in collation order.
562+ // Currently it is in the order of our numeric internal script IDs,
563+ // which is meaningless.
564+ // (Putting the non-script General_Category groups at the end is probably fine.)
570565 }
571566
572567 PrintWriter output = null ;
573568
574- final Iterator it = set .iterator ();
575-
576569 int oldScript = -127 ;
577570
578571 final int counter = 0 ;
@@ -608,16 +601,13 @@ public static void scriptChart() throws IOException {
608601
609602 int columnCount = 0 ;
610603
611- while ( it . hasNext () ) {
604+ for ( Pair p : set ) {
612605 Utility .dot (counter );
613606
614- final Pair p = (Pair ) it .next ();
615607 final int script = ((Integer ) p .first ).intValue ();
616608 final int cp = ((Integer ) ((Pair ) p .second ).second ).intValue ();
617609
618- if (script != oldScript
619- // && (script != COMMON_SCRIPT && script != INHERITED_SCRIPT)
620- ) {
610+ if (script != oldScript ) {
621611 closeFile (output );
622612 output = null ;
623613 oldScript = script ;
@@ -909,46 +899,52 @@ static short getBestScript(String s) {
909899 // static final UnicodeMap<String> SCRIPT_EXTENSIONS =
910900 // INDEX_UNICODE_PROPS.load(UcdProperty.Script_Extensions);
911901
912- static BitSet getBestScript (int original , String transformed , BitSet toReturn ) {
913- toReturn .clear ();
914- addScript (original , toReturn );
915- if (transformed != null ) {
916- int cp ;
917- for (int i = 0 ; i < transformed .length (); i += UTF16 .getCharCount (cp )) {
918- cp = UTF16 .charAt (transformed , i );
919- addScript (cp , toReturn );
902+ /**
903+ * Returns the best explicit Script value for cp or else for decomp. If there is no such
904+ * explicit script, returns a negative value. Never returns Common or Inherited.
905+ */
906+ private static int getBestScript (int cp , String decomp ) {
907+ int sc = getExplicitScript (cp );
908+ if (sc < 0 && !equals (decomp , cp )) {
909+ for (int i = 0 ; sc < 0 && i < decomp .length (); i += Character .charCount (cp )) {
910+ cp = decomp .codePointAt (i );
911+ sc = getExplicitScript (cp );
920912 }
921913 }
922- if (toReturn .isEmpty ()) {
923- toReturn .set (COMMON_SCRIPT );
924- }
925- return toReturn ;
914+ return sc ;
915+ }
916+
917+ private static boolean equals (CharSequence s , int cp ) {
918+ int first ;
919+ return s .length () != 0
920+ && (first = Character .codePointAt (s , 0 )) == cp
921+ && Character .charCount (first ) == s .length ();
926922 }
927923
// Property source created for the UCD version reported by Default.ucdVersion().
// It is initialized before SCRIPT_EXTENSIONS, which is looked up from it.
928924 static ToolUnicodePropertySource properties =
929925 ToolUnicodePropertySource .make (Default .ucdVersion ());
// The property fetched under the name "script extensions"; its per-code-point value is
// read as a string by the script lookup code in this file.
930926 static UnicodeProperty SCRIPT_EXTENSIONS = properties .getProperty ("script extensions" );
931927
932- private static void addScript (int cp , BitSet toReturn ) {
933- final short script2 = Default .ucd ().getScript (cp );
934- if (script2 == COMMON_SCRIPT || script2 == INHERITED_SCRIPT ) {
935- final String scriptString = SCRIPT_EXTENSIONS .getValue (cp );
936- if (scriptString == null ) {
937- return ;
938- }
939- if (scriptString .equals ("Zinh" ) || scriptString .equals ("Zyyy" )) {
940- return ;
941- }
942- if (scriptString .contains (" " )) {
943- for (final String part : scriptString .split (" " )) {
944- toReturn .set (findScriptCode (part ));
945- }
946- } else {
947- toReturn .set (findScriptCode (scriptString ));
948- }
949- return ;
928+ /**
929+ * Returns cp's explicit Script if it has one. Otherwise returns the script in its
930+ * Script_Extensions, if there is exactly one and that one is explicit. Otherwise returns a
931+ * negative value. Never returns Common or Inherited.
932+ */
933+ private static int getExplicitScript (int cp ) {
934+ int sc = Default .ucd ().getScript (cp );
935+ if (isExplicitScript (sc )) {
936+ return sc ;
950937 }
951- toReturn .set (script2 );
938+ // See if there is exactly one explicit script in the Script_Extensions.
939+ String scx = SCRIPT_EXTENSIONS .getValue (cp );
940+ if (scx != null && !scx .contains (" " ) && !scx .equals ("Zinh" ) && !scx .equals ("Zyyy" )) {
941+ sc = findScriptCode (scx );
942+ }
943+ return isExplicitScript (sc ) ? sc : -1 ;
944+ }
945+
946+ private static boolean isExplicitScript (int sc ) {
947+ return !(sc == COMMON_SCRIPT || sc == INHERITED_SCRIPT );
952948 }
953949
954950 private static int findScriptCode (String part ) {
0 commit comments