Skip to content

Commit c96e180

Browse files
committed
write FractionalUCA with blanked weights
1 parent b00e298 commit c96e180

File tree

4 files changed

+200
-138
lines changed

4 files changed

+200
-138
lines changed

docs/uca/index.md

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -115,10 +115,19 @@ Judgment call. See Cherokee, Deseret, Osage, Vithkuqi for examples.
115115

116116
After running the tool, diff the main mapping file and look for bad changes
117117
(for example, more bytes per weight for common characters).
118+
119+
A good way to check changes to both
120+
the sort order and the number of bytes in collation elements
121+
is to "blank" most of the non-zero weights in FractionalUCA.txt.
122+
There is a Python script for that (see below).
123+
Starting in May 2025, the output of the Java tool directly includes
124+
FractionalUCA_blanked.txt, which is nearly identical to the output of running the Python script
125+
over FractionalUCA.txt.
126+
118127
```
119-
~/unitools/mine/src$ sed -r -f ~/cldr/uni/src/tools/scripts/uca/blankweights.sed ~/cldr/uni/src/common/uca/FractionalUCA.txt > ../frac-14.0.txt
120-
~/unitools/mine/src$ sed -r -f ~/cldr/uni/src/tools/scripts/uca/blankweights.sed ../Generated/UCA/15.0.0/CollationAuxiliary/FractionalUCA.txt > ../frac-15.0.txt
121-
~/unitools/mine/src$ meld ../frac-14.0.txt ../frac-15.0.txt
128+
~/unitools/mine/src$ sed -r -f ~/cldr/uni/src/tools/scripts/uca/blankweights.sed ~/cldr/uni/src/common/uca/FractionalUCA.txt > ../frac-16.0.txt
129+
~/unitools/mine/src$ sed -r -f ~/cldr/uni/src/tools/scripts/uca/blankweights.sed ../Generated/UCA/17.0.0/CollationAuxiliary/FractionalUCA.txt > ../frac-17.0.txt
130+
~/unitools/mine/src$ meld ../frac-16.0.txt ../frac-17.0.txt
122131
```
123132

124133
CLDR root data files are checked into $CLDR_SRC/common/uca/

unicodetools/src/main/java/org/unicode/text/UCA/CEList.java

Lines changed: 29 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -302,18 +302,30 @@ public static byte remap(int ch, byte type, int t) {
302302
return (byte) t;
303303
}
304304

305+
enum ToString {
306+
DEFAULT,
307+
NO_SPACES,
308+
BLANKED
309+
}
310+
305311
@Override
306312
public String toString() {
313+
return toString(ToString.DEFAULT);
314+
}
315+
316+
String toString(ToString option) {
307317
if (isEmpty()) {
308318
return toString(0);
309319
}
310320

311321
final StringBuilder result = new StringBuilder();
312322
for (int i = startOffset; i < endOffset; ++i) {
313-
if (i != startOffset) {
323+
if (i != startOffset && option == ToString.DEFAULT) {
314324
result.append(' ');
315325
}
316-
result.append(toString(contents[i]));
326+
int ce = contents[i];
327+
String s = option == ToString.BLANKED ? toBlankedString(ce) : toString(ce);
328+
result.append(s);
317329
}
318330
return result.toString();
319331
}
@@ -355,45 +367,21 @@ public static String toString(int ce) {
355367
+ Utility.hex(getSecondary(ce))
356368
+ "."
357369
+ Utility.hex(getTertiary(ce))
358-
+ "]"
359-
// + "(" + NAME3[getTertiary(ce)] + ")"
360-
;
361-
}
362-
363-
static final String[] NAME3 = {
364-
"IGNORE", // 0
365-
"BLK", // Unused?
366-
"MIN",
367-
"WIDE",
368-
"COMPAT",
369-
"FONT",
370-
"CIRCLE",
371-
"RES-2",
372-
"CAP",
373-
"WIDECAP",
374-
"COMPATCAP",
375-
"FONTCAP",
376-
"CIRCLECAP",
377-
"HIRA-SMALL",
378-
"HIRA",
379-
"SMALL",
380-
"SMALL-NARROW",
381-
"KATA",
382-
"NARROW",
383-
"CIRCLE-KATA",
384-
"SUP-MNN",
385-
"SUB-MNS",
386-
"VERT", // Missing??
387-
"AINI",
388-
"AMED",
389-
"AFIN",
390-
"AISO",
391-
"NOBREAK", // Missing?
392-
"SQUARED",
393-
"SQUAREDCAP",
394-
"FRACTION",
395-
"MAX"
396-
};
370+
+ "]";
371+
}
372+
373+
static String toBlankedString(int ce) {
374+
char p = getPrimary(ce);
375+
char s = getSecondary(ce);
376+
// Tertiary allkeys weights are fixed; do not blank them.
377+
return "["
378+
+ (p == 0 ? "0000" : "pppp")
379+
+ "."
380+
+ (s == 0 ? "0000" : "ssss")
381+
+ "."
382+
+ Utility.hex(getTertiary(ce))
383+
+ "]";
384+
}
397385

398386
public boolean containsZero() {
399387
for (int i = startOffset; i < endOffset; ++i) {

unicodetools/src/main/java/org/unicode/text/UCA/Fractional.java

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -238,26 +238,39 @@ public static WeightIterator assignTertiaryWeightsForTertiaryCEs(int numWeights)
238238
FIRST_IGNORABLE_TER_ASSIGNED, FIRST_IGNORABLE_TER_RESERVED, 0x40, numWeights, 20);
239239
}
240240

241-
public static void hexBytes(long x, StringBuffer result) {
241+
static StringBuilder hexBytes(long x, StringBuilder result) {
242242
final int oldLength = result.length();
243-
// byte lastb = 1;
244243
for (int shift = 24; shift >= 0; shift -= 8) {
245244
final byte b = (byte) (x >>> shift);
246245
if (b != 0) {
247246
if (result.length() != oldLength) {
248247
result.append(" ");
249248
}
250249
result.append(Utility.hex(b));
251-
// if (lastb == 0) System.err.println(" bad zero byte: " + result);
252250
}
253-
// lastb = b;
254251
}
252+
return result;
255253
}
256254

257-
public static String hexBytes(long x) {
258-
final StringBuffer temp = new StringBuffer();
259-
hexBytes(x, temp);
260-
return temp.toString();
255+
static StringBuilder blankedHexBytes(long x, String blanked, StringBuilder result) {
256+
if (result == null) {
257+
result = new StringBuilder();
258+
}
259+
final int oldLength = result.length();
260+
for (int shift = 24; shift >= 0; shift -= 8) {
261+
final byte b = (byte) (x >>> shift);
262+
if (b != 0) {
263+
if (result.length() != oldLength) {
264+
result.append(" ");
265+
}
266+
result.append(blanked);
267+
}
268+
}
269+
return result;
270+
}
271+
272+
static String hexBytes(long x) {
273+
return hexBytes(x, new StringBuilder()).toString();
261274
}
262275

263276
/* package */ static short getFixedScript(int ch) {

0 commit comments

Comments
 (0)