unicode-org · macchiati · Jan 1, 2026 · Jan 6, 2026 · Jan 6, 2026
diff --git a/unicodetools/src/main/java/org/unicode/tools/CheckMissingNFKC.java b/unicodetools/src/main/java/org/unicode/tools/CheckMissingNFKC.java
@@ -0,0 +1,111 @@
+package org.unicode.tools;
+
+import com.google.common.base.Objects;
+import com.ibm.icu.impl.UnicodeMap;
+import com.ibm.icu.text.Normalizer2;
+import com.ibm.icu.text.UnicodeSet;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.LinkedHashMap;
+import java.util.Map;
+import java.util.Map.Entry;
+import org.unicode.cldr.util.Pair;
+import org.unicode.text.utility.Settings.UnicodeTools;
+import org.unicode.text.utility.Utility;
+
+/**
+ * Command-line check that compares the mappings listed in {@code nfkc-extended.txt} with ICU's
+ * NFKC_Casefold normalizer and prints every checked code point where the two disagree.
+ */
+public class CheckMissingNFKC {
+    /**
+     * Reads the mapping file, compares each code point in {@code [[\P{C}]-\p{cf}]} against
+     * NFKC_Casefold, and prints the differences as tab-separated hex columns.
+     *
+     * @param args ignored
+     * @throws IOException if the mapping file cannot be read
+     */
+    public static void main(String[] args) throws IOException {
+        Normalizer2 nfkc_cf = Normalizer2.getNFKCCasefoldInstance();
+
+        UnicodeMap<String> n4m = new UnicodeMap<>();
+        System.out.println(UnicodeTools.UNICODETOOLS_RSRC_DIR);
+        // "/unicodetools/src/main/resources/org/unicode/tools/nfkc-extended.txt"
+        Path filePath =
+                Paths.get(
+                        UnicodeTools.UNICODETOOLS_RSRC_DIR, "org/unicode/tools/nfkc-extended.txt");
+
+        // Unfortunately the internal tools in ICU aren't accessible, so parse it ourselves
+        // https://unicode-org.github.io/icu/userguide/transforms/normalization/#data-file-syntax
+        // readAllLines opens and closes the file itself, so no stream is left unclosed.
+        for (String line : Files.readAllLines(filePath)) {
+            addMapping(line, n4m);
+        }
+
+        Map<String, Pair<String, String>> diff = new LinkedHashMap<>();
+        UnicodeSet toCheck = new UnicodeSet("[[\\P{C}]-\\p{cf}]");
+        System.out.println("Checking: " + toCheck.size() + " \t" + toCheck);
+        for (int cp : toCheck.codePoints()) {
+            String string = Character.toString(cp); // wish there were a code point interface
+            String nfkcCfString = nfkc_cf.normalize(string);
+            String n4mString = n4m.get(cp);
+            if (n4mString == null) {
+                // No entry in the file: compare as if the code point maps to itself.
+                n4mString = string;
+            }
+            if (Objects.equal(nfkcCfString, n4mString)) {
+                continue;
+            }
+            diff.put(string, Pair.of(n4mString, nfkcCfString));
+        }
+        System.out.println("Differences:\t" + diff.size());
+        System.out.println("Source" + "\t" + "N4M" + "\t" + "nfkc_cf");
+
+        for (Entry<String, Pair<String, String>> entry : diff.entrySet()) {
+            System.out.println(
+                    Utility.hex(entry.getKey())
+                            + "\t"
+                            + Utility.hex(entry.getValue().getFirst())
+                            + "\t"
+                            + Utility.hex(entry.getValue().getSecond()));
+        }
+    }
+
+    /**
+     * Parses one line of the mapping file and records any {@code source>target} mapping in
+     * {@code n4m}. Blank lines and lines starting with {@code *} are ignored.
+     *
+     * @param line one line of the data file
+     * @param n4m map that receives the parsed mapping
+     * @throws IllegalArgumentException if the line has no {@code >}, {@code =} or {@code :}
+     */
+    private static void addMapping(String line, UnicodeMap<String> n4m) {
+        if (line.isBlank() || line.startsWith("*")) {
+            return;
+        }
+        int greaterPos = line.indexOf('>');
+        if (greaterPos < 0) {
+            // NOTE(review): lines using '=' or ':' are recognized but skipped. The original inline
+            // comment claimed '=' is treated like '>', but such lines were never added to the map;
+            // that behavior is preserved here -- confirm which one is intended.
+            if (line.indexOf('=') < 0 && line.indexOf(':') < 0) {
+                throw new IllegalArgumentException("line: " + line);
+            }
+            return;
+        }
+        String rawSource = line.substring(0, greaterPos);
+        String target = Utility.fromHex(line.substring(greaterPos + 1));
+
+        int rangePos = rawSource.indexOf("..");
+        if (rangePos < 0) {
+            n4m.put(Utility.fromHex(rawSource), target);
+            return;
+        }
+        // "start..end" source: apply the same target to every code point in the range.
+        int sourceStart = Utility.fromHex(rawSource.substring(0, rangePos)).codePointAt(0);
+        int sourceEnd = Utility.fromHex(rawSource.substring(rangePos + 2)).codePointAt(0);
+        n4m.putAll(sourceStart, sourceEnd, target);
+    }
+}
diff --git a/unicodetools/src/main/java/org/unicode/tools/FindBlankGlyphs.java b/unicodetools/src/main/java/org/unicode/tools/FindBlankGlyphs.java
@@ -0,0 +1,183 @@
+package org.unicode.tools;
+
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.lang.UProperty;
+import com.ibm.icu.lang.UProperty.NameChoice;
+import com.ibm.icu.text.UnicodeSet;
+import com.ibm.icu.util.Output;
+import java.awt.Font;
+import java.awt.geom.Rectangle2D;
+import java.awt.image.BufferedImage;
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.io.UncheckedIOException;
+import javax.imageio.ImageIO;
+import org.unicode.text.utility.Settings;
+import org.unicode.text.utility.Utility;
+import org.unicode.utilities.GlyphRenderer;
+
+/**
+ * Command-line tool that renders code points with a fixed font, collects those flagged by
+ * {@code GlyphRenderer.isImageSingleColor}, and writes a PNG per such code point plus an HTML list.
+ */
+public class FindBlankGlyphs {
+    /** Directory that receives the generated PNG files and {@code list.html}. */
+    public static final String DATA_DIR =
+            Settings.UnicodeTools.UNICODETOOLS_REPO_DIR + "/unicodetools/data/temp/";
+
+    /**
+     * Renders every non-excluded code point, reports the ones flagged by
+     * {@code GlyphRenderer.isImageSingleColor}, and writes the images and HTML list to
+     * {@link #DATA_DIR}.
+     *
+     * @param args ignored
+     * @throws UncheckedIOException if the output directory or any output file cannot be written
+     */
+    public static void main(String[] args) {
+        // Configuration
+        String fontName = "Noto Sans";
+        int fontSize = 144;
+        Font font = new Font(fontName, Font.PLAIN, fontSize);
+
+        // A self-contained, balanced set pattern so it can be printed and reused on its own.
+        String specialCases =
+                "[\\N{HANGUL CHOSEONG FILLER}"
+                        + "\\N{HANGUL JUNGSEONG FILLER}"
+                        + "\\N{HANGUL FILLER}"
+                        + "\\N{HALFWIDTH HANGUL FILLER}"
+                        + "\\N{COMBINING GRAPHEME JOINER}"
+                        + "\\N{KHMER VOWEL INHERENT AQ}"
+                        + "\\N{KHMER VOWEL INHERENT AA}"
+                        + "\\N{BRAILLE PATTERN BLANK}"
+                        + "\\p{variation_selector}"
+                        + "]";
+        UnicodeSet exclusions =
+                new UnicodeSet(
+                                "["
+                                        + "\\p{C}"
+                                        + "\\p{Z}"
+                                        + "\\p{rgi_emoji}"
+                                        // + "\\p{RGI_Emoji_Qualification=Minimally_Qualified}"
+                                        // ICU doesn't support this yet!
+                                        + "[\\p{emoji}-\\p{emoji_component}]"
+                                        + "\\p{whitespace}"
+                                        + "\\p{deprecated}"
+
+                                        // special cases
+                                        + specialCases
+                                        + "]")
+                        .freeze();
+
+        UnicodeSet showAnyway = new UnicodeSet("[]"); //  \\u034F
+        UnicodeSet chars =
+                new UnicodeSet(0, 0x10FFFF).removeAll(exclusions).addAll(showAnyway).freeze();
+        UnicodeSet show = new UnicodeSet();
+        Output<Rectangle2D> visualBounds = new Output<>();
+
+        // Create the output directory up front so the image and HTML writes cannot fail on it.
+        File outputDir = new File(DATA_DIR);
+        if (!outputDir.isDirectory() && !outputDir.mkdirs()) {
+            throw new UncheckedIOException(
+                    new IOException("Cannot create output directory: " + outputDir));
+        }
+
+        int count = 0;
+        for (int cp : chars.codePoints()) {
+            if ((count % 10000) == 0) {
+                System.out.println(count + "\t" + Utility.hex(cp));
+            }
+            ++count;
+            String character = Character.toString(cp);
+
+            BufferedImage image =
+                    GlyphRenderer.createGlyphBitmap(
+                            font, character, visualBounds, fontSize * 2, fontSize * 3 / 2);
+            if (!GlyphRenderer.isImageSingleColor(image, 0) && !showAnyway.contains(cp)) {
+                continue;
+            }
+            show.add(cp);
+            System.out.println(
+                    Utility.hex(cp)
+                            + "\t"
+                            + UCharacter.getExtendedName(cp)
+                            + "\t"
+                            + GlyphRenderer.getPropValueName(
+                                    UProperty.GENERAL_CATEGORY, NameChoice.SHORT, cp)
+                            + "\t"
+                            + visualBounds);
+
+            // Save each image to file; the name must match the <img src> written to list.html.
+            File file = new File(outputDir, "glyph_" + Utility.hex(cp) + ".png");
+            try {
+                ImageIO.write(image, "png", file);
+            } catch (IOException e) {
+                throw new UncheckedIOException(e);
+            }
+        }
+        writeHtmlList(show);
+        System.out.println("Checked: " + count);
+        System.out.println(specialCases);
+    }
+
+    /**
+     * Writes {@code list.html} into {@link #DATA_DIR}: one table row per code point with its glyph
+     * image, the character itself, its hex value, name, and General_Category.
+     *
+     * @param show the code points to list
+     * @throws UncheckedIOException if the file cannot be created
+     */
+    private static void writeHtmlList(UnicodeSet show) {
+        File file = new File(DATA_DIR, "list.html");
+        // Write UTF-8 explicitly, and declare it in the page, so the literal characters in the
+        // table do not depend on the platform default charset.
+        try (PrintWriter writer = new PrintWriter(file, "UTF-8")) {
+            writer.println(
+                    "<html><head><meta charset='utf-8'></head><body><style>"
+                            + "table, th, td {\n"
+                            + "  border: 1px solid black; /* Sets a 1px solid black border on all elements */\n"
+                            + "}\n"
+                            + "\n"
+                            + "table {\n"
+                            + "  border-collapse: collapse; /* Merges adjacent borders into a single line */\n"
+                            + "}\n"
+                            + "\n"
+                            + "th, td {\n"
+                            + "  padding: 4px; /* Optional: Adds space between border and content */\n"
+                            + "}\n"
+                            + "body {\n"
+                            + "    font-family: 'Noto Sans', 'Noto Sans Symbols', sans-serif;\n"
+                            + "}\n"
+                            + ""
+                            + "</style><table>");
+            for (int cp : show.codePoints()) {
+                String hex = Utility.hex(cp);
+                String name = UCharacter.getExtendedName(cp);
+                writer.println(
+                        "<tr>"
+                                + "<td>"
+                                + "<img src='"
+                                + "glyph_"
+                                + hex
+                                + ".png"
+                                + "' alt='"
+                                + name
+                                + "' width='auto' height='32'>"
+                                + "</td><td style='font-size: 24px; text-align: center'>"
+                                + Character.toString(cp)
+                                + "</td><td>"
+                                + hex
+                                + "</td><td>"
+                                + name
+                                + "</td><td>"
+                                + GlyphRenderer.getPropValueName(
+                                        UProperty.GENERAL_CATEGORY, NameChoice.SHORT, cp)
+                                + "</td>"
+                                + "</tr>");
+            }
+            writer.println("</table></body></html>");
+        } catch (IOException e) {
+            throw new UncheckedIOException(e);
+        }
+    }
+}