diff --git a/unicodetools/src/main/java/org/unicode/tools/CheckMissingNFKC.java b/unicodetools/src/main/java/org/unicode/tools/CheckMissingNFKC.java
new file mode 100644
index 000000000..84fa76981
--- /dev/null
+++ b/unicodetools/src/main/java/org/unicode/tools/CheckMissingNFKC.java
@@ -0,0 +1,115 @@
+package org.unicode.tools;
+
+import com.google.common.base.Objects;
+import com.ibm.icu.impl.UnicodeMap;
+import com.ibm.icu.text.Normalizer2;
+import com.ibm.icu.text.UnicodeSet;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.LinkedHashMap;
+import java.util.Map;
+import java.util.Map.Entry;
+import org.unicode.cldr.util.Pair;
+import org.unicode.text.utility.Settings.UnicodeTools;
+import org.unicode.text.utility.Utility;
+
+/**
+ * Compares the mappings listed in {@code nfkc-extended.txt} with ICU's NFKC_Casefold normalizer
+ * and prints every checked code point for which the two give different results.
+ */
+public class CheckMissingNFKC {
+    /**
+     * Loads the mapping file, checks every code point in {@code [[\P{C}]-\p{cf}]}, and prints a
+     * tab-separated table (source, file mapping, NFKC_Casefold result) of the differences.
+     *
+     * @param args ignored
+     * @throws IOException if the mapping file cannot be read
+     */
+    public static void main(String[] args) throws IOException {
+        Normalizer2 nfkc_cf = Normalizer2.getNFKCCasefoldInstance();
+
+        System.out.println(UnicodeTools.UNICODETOOLS_RSRC_DIR);
+        // "/unicodetools/src/main/resources/org/unicode/tools/nfkc-extended.txt"
+        Path filePath =
+                Paths.get(
+                        UnicodeTools.UNICODETOOLS_RSRC_DIR, "org/unicode/tools/nfkc-extended.txt");
+        UnicodeMap<String> n4m = loadMappings(filePath);
+
+        Map<String, Pair<String, String>> diff = new LinkedHashMap<>();
+        UnicodeSet toCheck = new UnicodeSet("[[\\P{C}]-\\p{cf}]");
+        System.out.println("Checking: " + toCheck.size() + " \t" + toCheck);
+        for (int cp : toCheck.codePoints()) {
+            String string = Character.toString(cp); // wish there were a code point interface
+            String nfkcCfString = nfkc_cf.normalize(string);
+            String n4mString = n4m.get(cp);
+            if (n4mString == null) {
+                n4mString = string; // no entry in the file: treat as mapping to itself
+            }
+            if (Objects.equal(nfkcCfString, n4mString)) {
+                continue;
+            }
+            diff.put(string, Pair.of(n4mString, nfkcCfString));
+        }
+        System.out.println("Differences:\t" + diff.size());
+        System.out.println("Source" + "\t" + "N4M" + "\t" + "nfkc_cf");
+
+        for (Entry<String, Pair<String, String>> entry : diff.entrySet()) {
+            System.out.println(
+                    Utility.hex(entry.getKey())
+                            + "\t"
+                            + Utility.hex(entry.getValue().getFirst())
+                            + "\t"
+                            + Utility.hex(entry.getValue().getSecond()));
+        }
+    }
+
+    /**
+     * Reads a normalization data file into a map from source code point to mapped string.
+     *
+     * <p>Unfortunately the internal tools in ICU aren't accessible, so parse it ourselves; see
+     * https://unicode-org.github.io/icu/userguide/transforms/normalization/#data-file-syntax
+     *
+     * @param filePath the data file to read
+     * @return the {@code >} and {@code =} mappings found in the file
+     * @throws IOException if the file cannot be read
+     * @throws IllegalArgumentException if a line has none of {@code >}, {@code =} or {@code :}
+     */
+    private static UnicodeMap<String> loadMappings(Path filePath) throws IOException {
+        UnicodeMap<String> n4m = new UnicodeMap<>();
+        // readAllLines closes the file itself, so no stream is left open.
+        for (String rawLine : Files.readAllLines(filePath)) {
+            // Drop a "# comment" (whole-line or trailing) before looking for a separator.
+            int hashPos = rawLine.indexOf('#');
+            String line = (hashPos < 0 ? rawLine : rawLine.substring(0, hashPos)).strip();
+            if (line.isEmpty() || line.startsWith("*")) {
+                continue;
+            }
+            int separatorPos = line.indexOf('>');
+            if (separatorPos < 0) {
+                separatorPos = line.indexOf('='); // for our purposes, = is the same as >
+            }
+            if (separatorPos < 0) {
+                if (line.indexOf(':') < 0) {
+                    throw new IllegalArgumentException("line: " + rawLine);
+                }
+                continue; // lines with only ':' define no mapping
+            }
+            String rawSource = line.substring(0, separatorPos);
+            String target = Utility.fromHex(line.substring(separatorPos + 1));
+
+            int rangePos = rawSource.indexOf("..");
+            if (rangePos < 0) {
+                n4m.put(Utility.fromHex(rawSource), target);
+            } else {
+                int sourceStart =
+                        Utility.fromHex(rawSource.substring(0, rangePos)).codePointAt(0);
+                int sourceEnd =
+                        Utility.fromHex(rawSource.substring(rangePos + 2)).codePointAt(0);
+                n4m.putAll(sourceStart, sourceEnd, target);
+            }
+        }
+        return n4m;
+    }
+}
diff --git a/unicodetools/src/main/java/org/unicode/tools/FindBlankGlyphs.java b/unicodetools/src/main/java/org/unicode/tools/FindBlankGlyphs.java
new file mode 100644
index 000000000..3ea4aed7f
--- /dev/null
+++ b/unicodetools/src/main/java/org/unicode/tools/FindBlankGlyphs.java
@@ -0,0 +1,147 @@
+package org.unicode.tools;
+
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.lang.UProperty;
+import com.ibm.icu.lang.UProperty.NameChoice;
+import com.ibm.icu.text.UnicodeSet;
+import com.ibm.icu.util.Output;
+import java.awt.Font;
+import java.awt.geom.Rectangle2D;
+import java.awt.image.BufferedImage;
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.io.UncheckedIOException;
+import java.nio.charset.StandardCharsets;
+import javax.imageio.ImageIO;
+import org.unicode.text.utility.Settings;
+import org.unicode.text.utility.Utility;
+import org.unicode.utilities.GlyphRenderer;
+
+/**
+ * Renders every candidate code point in Noto Sans and reports those whose bitmap is completely
+ * transparent: prints them, saves each bitmap as a PNG, and writes an HTML list of them.
+ */
+public class FindBlankGlyphs {
+    /** Directory that receives the glyph PNG files and {@code list.html}. */
+    public static final String DATA_DIR =
+            Settings.UnicodeTools.UNICODETOOLS_REPO_DIR + "/unicodetools/data/temp/";
+
+    /**
+     * Scans all code points outside the exclusion set and writes the report files.
+     *
+     * @param args ignored
+     * @throws UncheckedIOException if an image or the HTML file cannot be written
+     */
+    public static void main(String[] args) {
+        // Configuration
+        String fontName = "Noto Sans";
+        int fontSize = 144;
+        Font font = new Font(fontName, Font.PLAIN, fontSize);
+
+        // The extra trailing "]" closes the "[" that the exclusions pattern opens below.
+        String specialCases =
+                "[\\N{HANGUL CHOSEONG FILLER}"
+                        + "\\N{HANGUL JUNGSEONG FILLER}"
+                        + "\\N{HANGUL FILLER}"
+                        + "\\N{HALFWIDTH HANGUL FILLER}"
+                        + "\\N{COMBINING GRAPHEME JOINER}"
+                        + "\\N{KHMER VOWEL INHERENT AQ}"
+                        + "\\N{KHMER VOWEL INHERENT AA}"
+                        + "\\N{BRAILLE PATTERN BLANK}"
+                        + "\\p{variation_selector}]"
+                        + "]";
+        UnicodeSet exclusions =
+                new UnicodeSet(
+                                "["
+                                        + "\\p{C}"
+                                        + "\\p{Z}"
+                                        + "\\p{rgi_emoji}"
+                                        // + "\\p{RGI_Emoji_Qualification=Minimally_Qualified}"
+                                        // ICU doesn't support this yet!
+                                        + "[\\p{emoji}-\\p{emoji_component}]"
+                                        + "\\p{whitespace}"
+                                        + "\\p{deprecated}"
+                                        // special cases
+                                        + specialCases)
+                        .freeze();
+
+        UnicodeSet showAnyway = new UnicodeSet("[]"); // \\u034F
+        UnicodeSet chars =
+                new UnicodeSet(0, 0x10FFFF).removeAll(exclusions).addAll(showAnyway).freeze();
+        UnicodeSet show = new UnicodeSet();
+        Output<Rectangle2D> visualBounds = new Output<>();
+
+        // Make sure the output directory exists before anything is written into it.
+        new File(DATA_DIR).mkdirs();
+
+        int count = 0;
+        for (int cp : chars.codePoints()) {
+            if ((count % 10000) == 0) {
+                System.out.println(count + "\t" + Utility.hex(cp));
+            }
+            ++count;
+            String character = Character.toString(cp);
+
+            BufferedImage image =
+                    GlyphRenderer.createGlyphBitmap(
+                            font, character, visualBounds, fontSize * 2, fontSize * 3 / 2);
+            // Skip glyphs that drew at least one pixel, unless listed in showAnyway.
+            if (!GlyphRenderer.isImageSingleColor(image, 0) && !showAnyway.contains(cp)) {
+                continue;
+            }
+            show.add(cp);
+            System.out.println(
+                    Utility.hex(cp)
+                            + "\t"
+                            + UCharacter.getExtendedName(cp)
+                            + "\t"
+                            + GlyphRenderer.getPropValueName(
+                                    UProperty.GENERAL_CATEGORY, NameChoice.SHORT, cp)
+                            + "\t"
+                            + visualBounds);
+
+            // Save each image to file
+            File file = new File(DATA_DIR, "glyph_" + Utility.hex(character) + ".png");
+            try {
+                ImageIO.write(image, "png", file);
+            } catch (IOException e) {
+                throw new UncheckedIOException(e);
+            }
+        }
+
+        // write HTML file
+        // NOTE(review): the markup literals were missing from this copy of the patch; the table
+        // below is a reconstruction -- confirm against the original commit.
+        File file = new File(DATA_DIR, "list.html");
+        // UTF-8 explicitly: the page contains the characters themselves.
+        try (PrintWriter writer = new PrintWriter(file, StandardCharsets.UTF_8)) {
+            writer.println(
+                    "<html><head><meta charset='UTF-8'></head><body><table border='1'>");
+            for (int cp : show.codePoints()) {
+                String character = Character.toString(cp);
+                String name = UCharacter.getExtendedName(cp);
+                writer.println(
+                        "<tr><td><img src='glyph_"
+                                + Utility.hex(character)
+                                + ".png' alt='"
+                                + name
+                                + "'></td><td>"
+                                + character
+                                + "</td><td>"
+                                + Utility.hex(cp)
+                                + "</td><td>"
+                                + name
+                                + "</td><td>"
+                                + GlyphRenderer.getPropValueName(
+                                        UProperty.GENERAL_CATEGORY, NameChoice.SHORT, cp)
+                                + "</td></tr>");
+            }
+            writer.println("</table></body></html>");
+        } catch (IOException e) {
+            throw new UncheckedIOException(e);
+        }
+        System.out.println("Checked: " + count);
+        System.out.println(specialCases);
+    }
+}
diff --git a/unicodetools/src/main/java/org/unicode/utilities/FastSymbolMatcher.java b/unicodetools/src/main/java/org/unicode/utilities/FastSymbolMatcher.java
new file mode 100644
index 000000000..0862c8b72
--- /dev/null
+++ b/unicodetools/src/main/java/org/unicode/utilities/FastSymbolMatcher.java
@@ -0,0 +1,248 @@
+package org.unicode.utilities;
+
+import com.ibm.icu.text.UnicodeSet;
+import com.ibm.icu.util.Output;
+import java.awt.Font;
+import java.awt.Graphics2D;
+import java.awt.Image;
+import java.awt.Point;
+import java.awt.Rectangle;
+import java.awt.geom.Rectangle2D;
+import java.awt.image.BufferedImage;
+import java.util.ArrayList;
+import java.util.List;
+import org.unicode.text.utility.Utility;
+
+/**
+ * Renders a set of characters and prints a table of pairwise similarity scores, skipping pairs
+ * that a cheap aspect-ratio and ink-density test already rules out.
+ */
+public class FastSymbolMatcher {
+
+    // --- Tuning Parameters ---
+    private static final double MAX_ASPECT_RATIO_DIFF =
+            0.5; // Fail if width/height is too different
+    private static final double MAX_DENSITY_DIFF = 0.25; // Fail if ink thickness is too different
+
+    /**
+     * Renders each character of the given set in Noto Sans and prints the comparison table.
+     *
+     * @param args {@code args[0]} is the UnicodeSet pattern of the characters to compare
+     * @throws IllegalArgumentException if no pattern is given
+     */
+    public static void main(String[] args) {
+        if (args.length < 1) {
+            throw new IllegalArgumentException("Usage: FastSymbolMatcher <UnicodeSet pattern>");
+        }
+        UnicodeSet chars = new UnicodeSet(args[0]);
+        System.out.println(chars.size() + "\t" + chars);
+
+        // PRE-PROCESSING (The "N" Step)
+        // Convert heavy images into lightweight Profiles ONCE.
+        List<SymbolProfile> profiles = new ArrayList<>();
+        Output<Rectangle2D> visualBounds = new Output<>();
+        String fontName = "Noto Sans";
+        int fontSize = 144;
+        Font font = new Font(fontName, Font.PLAIN, fontSize);
+
+        int count = 0;
+
+        for (int cp : chars.codePoints()) {
+            if ((count % 10000) == 0) {
+                System.out.println(count + "\t" + Utility.hex(cp));
+            }
+            ++count;
+            String character = Character.toString(cp);
+
+            BufferedImage image =
+                    GlyphRenderer.createGlyphBitmap(
+                            font, character, visualBounds, fontSize * 2, fontSize * 3 / 2);
+            profiles.add(new SymbolProfile(cp, image));
+        }
+        compare(profiles);
+    }
+
+    /**
+     * Prints a tab-separated table that compares every profile with each profile before it.
+     *
+     * <p>The first row lists all characters. Every later row starts with one character, followed
+     * by one cell per earlier character: the score as a percentage, or nothing when the quick
+     * test rejected the pair.
+     *
+     * @param profiles the pre-computed profiles, in output order
+     */
+    public static void compare(List<SymbolProfile> profiles) {
+        System.out.println("Generating comparisons");
+        // COMPARISON (The "N^2" Step)
+        // Compare every symbol against every earlier symbol
+        for (SymbolProfile profile : profiles) {
+            System.out.print("\t" + Character.toString(profile.cp));
+        }
+        System.out.println();
+        for (int i = 0; i < profiles.size(); i++) {
+            SymbolProfile p1 = profiles.get(i);
+            System.out.print(Character.toString(p1.cp));
+            for (int j = 0; j < i; j++) {
+                SymbolProfile p2 = profiles.get(j);
+
+                // --- THE QUICK TEST ---
+                if (shouldFailFast(p1, p2)) {
+                    // Skip expensive logic completely
+                    // System.out.println("Quick Mismatch: " + GlyphRenderer.charInfo(p2.cp));
+                    System.out.print("\t");
+                    continue;
+                }
+
+                // --- EXPENSIVE TEST ---
+                // Only runs if the symbols are roughly similar shapes
+                double score = detailedCompare(p1, p2);
+                System.out.print("\t" + (int) (score * 100));
+            }
+            System.out.println();
+        }
+    }
+
+    /**
+     * The "Gatekeeper" method. Returns TRUE if the symbols are so different we shouldn't bother
+     * comparing pixels.
+     */
+    private static boolean shouldFailFast(SymbolProfile p1, SymbolProfile p2) {
+        // Test 1: Aspect Ratio (Is one tall and thin, and the other short and wide?)
+        // e.g., prevents comparing "l" with "w"
+        if (Math.abs(p1.aspectRatio - p2.aspectRatio) > MAX_ASPECT_RATIO_DIFF) {
+            return true;
+        }
+
+        // Test 2: Ink Density (Is one heavy/filled and the other light/empty?)
+        // e.g., prevents comparing "." with "M"
+        return Math.abs(p1.inkDensity - p2.inkDensity) > MAX_DENSITY_DIFF;
+    }
+
+    /**
+     * The detailed comparison on the cached data: 60% thumbnail similarity, 20% bounding-box
+     * area similarity, 20% closeness of the bounding-box centers.
+     */
+    private static double detailedCompare(SymbolProfile p1, SymbolProfile p2) {
+        double visualSim = getVisualSimilarity(p1.thumbnail, p2.thumbnail);
+
+        // Size Penalty (using cached bounds)
+        double areaA = (double) p1.bounds.width * p1.bounds.height;
+        double areaB = (double) p2.bounds.width * p2.bounds.height;
+        double sizeSim = 1.0 - (Math.abs(areaA - areaB) / Math.max(areaA, areaB));
+
+        // Position Penalty (using cached centers)
+        double dist = p1.center.distance(p2.center);
+        double maxDist = Math.sqrt(Math.pow(1000, 2) + Math.pow(1000, 2)); // Mock max canvas size
+        double posSim = Math.max(0, 1.0 - (dist / maxDist));
+
+        return (visualSim * 0.6) + (sizeSim * 0.2) + (posSim * 0.2);
+    }
+
+    /**
+     * Returns 1.0 for identical alpha channels down to 0.0 for fully opposite ones. Both images
+     * must have the same size (the cached thumbnails always do).
+     */
+    private static double getVisualSimilarity(BufferedImage imgA, BufferedImage imgB) {
+        long diff = 0;
+        int w = imgA.getWidth();
+        int h = imgA.getHeight();
+        for (int y = 0; y < h; y++) {
+            for (int x = 0; x < w; x++) {
+                // Compare alpha, like getBounds and calculateDensity do: with black ink on a
+                // transparent canvas every color channel is 0, so it cannot tell glyphs apart.
+                int alphaA = imgA.getRGB(x, y) >>> 24;
+                int alphaB = imgB.getRGB(x, y) >>> 24;
+                diff += Math.abs(alphaA - alphaB);
+            }
+        }
+        return 1.0 - (diff / (w * h * 255.0));
+    }
+
+    // --- INNER CLASS FOR PRE-CALCULATED DATA ---
+    /** Measurements of one rendered symbol, taken once so that comparisons stay cheap. */
+    static class SymbolProfile {
+        int cp;
+        Rectangle bounds;
+        Point center;
+        double aspectRatio;
+        double inkDensity;
+        BufferedImage thumbnail; // Small 32x32 cached version
+
+        /**
+         * @param cp the code point that was rendered
+         * @param raw the rendered bitmap; a pixel counts as ink when its alpha is non-zero
+         */
+        public SymbolProfile(int cp, BufferedImage raw) {
+            this.cp = cp;
+            // 1. Expensive Scan (Done ONLY ONCE per image)
+            this.bounds = getBounds(raw);
+
+            if (this.bounds == null) {
+                // Handle empty image
+                this.bounds = new Rectangle(0, 0, 1, 1);
+                // detailedCompare dereferences center, so an empty image needs one as well.
+                this.center = new Point(0, 0);
+                this.aspectRatio = 0;
+                this.inkDensity = 0;
+                this.thumbnail = new BufferedImage(32, 32, BufferedImage.TYPE_INT_ARGB);
+                return;
+            }
+
+            this.center = new Point((int) bounds.getCenterX(), (int) bounds.getCenterY());
+
+            // 2. Pre-calculate Heuristics
+            this.aspectRatio = (double) bounds.width / bounds.height;
+
+            // 3. Create Cached Thumbnail
+            BufferedImage cropped =
+                    raw.getSubimage(bounds.x, bounds.y, bounds.width, bounds.height);
+            this.thumbnail = resize(cropped, 32, 32);
+
+            // 4. Calculate Density (on the small thumbnail to save time)
+            this.inkDensity = calculateDensity(this.thumbnail);
+        }
+
+        /** Returns the fraction of pixels with non-zero alpha. */
+        private static double calculateDensity(BufferedImage img) {
+            long totalPixels = (long) img.getWidth() * img.getHeight();
+            long filledPixels = 0;
+            for (int y = 0; y < img.getHeight(); y++) {
+                for (int x = 0; x < img.getWidth(); x++) {
+                    int alpha = (img.getRGB(x, y) >> 24) & 0xff;
+                    if (alpha > 0) filledPixels++; // Assuming transparent background
+                    // If white background, check brightness < 200
+                }
+            }
+            return (double) filledPixels / totalPixels;
+        }
+
+        /** Returns the smallest rectangle around all pixels with non-zero alpha, or null. */
+        private static Rectangle getBounds(BufferedImage img) {
+            int minX = img.getWidth(), minY = img.getHeight(), maxX = -1, maxY = -1;
+            boolean found = false;
+            for (int y = 0; y < img.getHeight(); y++) {
+                for (int x = 0; x < img.getWidth(); x++) {
+                    int alpha = (img.getRGB(x, y) >> 24) & 0xff;
+                    if (alpha != 0) { // Assuming transparent background
+                        if (x < minX) minX = x;
+                        if (x > maxX) maxX = x;
+                        if (y < minY) minY = y;
+                        if (y > maxY) maxY = y;
+                        found = true;
+                    }
+                }
+            }
+            return found ? new Rectangle(minX, minY, maxX - minX + 1, maxY - minY + 1) : null;
+        }
+
+        /** Scales {@code img} to {@code w} x {@code h} with smooth interpolation. */
+        private static BufferedImage resize(BufferedImage img, int w, int h) {
+            Image tmp = img.getScaledInstance(w, h, Image.SCALE_SMOOTH);
+            BufferedImage dimg = new BufferedImage(w, h, BufferedImage.TYPE_INT_ARGB);
+            Graphics2D g2d = dimg.createGraphics();
+            g2d.drawImage(tmp, 0, 0, null);
+            g2d.dispose();
+            return dimg;
+        }
+    }
+}
diff --git a/unicodetools/src/main/java/org/unicode/utilities/GlyphRenderer.java b/unicodetools/src/main/java/org/unicode/utilities/GlyphRenderer.java
new file mode 100644
index 000000000..fba053017
--- /dev/null
+++ b/unicodetools/src/main/java/org/unicode/utilities/GlyphRenderer.java
@@ -0,0 +1,185 @@
+package org.unicode.utilities;
+
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.lang.UProperty;
+import com.ibm.icu.lang.UProperty.NameChoice;
+import com.ibm.icu.util.Output;
+import java.awt.Color;
+import java.awt.Font;
+import java.awt.FontMetrics;
+import java.awt.Graphics2D;
+import java.awt.RenderingHints;
+import java.awt.font.FontRenderContext;
+import java.awt.font.GlyphVector;
+import java.awt.geom.Rectangle2D;
+import java.awt.image.BufferedImage;
+import org.unicode.text.utility.Utility;
+
+/** Static helpers for drawing text into a bitmap and for inspecting such bitmaps. */
+public class GlyphRenderer {
+    private static final boolean SHOW_METRICS = false;
+
+    /**
+     * Draws {@code text} in black on a transparent canvas, centered horizontally by its advance
+     * width and vertically by the font's maximum ascent plus descent.
+     *
+     * @param font the font to draw with
+     * @param text the text to draw
+     * @param boundsOutput receives the visual bounds reported by the glyph vector (relative to
+     *     the text origin, not to the canvas); may be null if the caller does not need them
+     * @param width canvas width in pixels
+     * @param height canvas height in pixels
+     * @return a new {@code TYPE_INT_ARGB} image containing the drawn text
+     */
+    public static BufferedImage createGlyphBitmap(
+            Font font, String text, Output<Rectangle2D> boundsOutput, int width, int height) {
+        BufferedImage image = new BufferedImage(width, height, BufferedImage.TYPE_INT_ARGB);
+        Graphics2D g2d = image.createGraphics();
+        try {
+            // Setup Font and Rendering
+            g2d.setFont(font);
+            g2d.setRenderingHint(
+                    RenderingHints.KEY_ANTIALIASING, RenderingHints.VALUE_ANTIALIAS_ON);
+            g2d.setRenderingHint(
+                    RenderingHints.KEY_TEXT_ANTIALIASING, RenderingHints.VALUE_TEXT_ANTIALIAS_ON);
+
+            // Measure the Glyph
+            FontRenderContext frc = g2d.getFontRenderContext();
+
+            // createGlyphVector allows us to get the precise visual bounds of the character
+            GlyphVector gv = font.createGlyphVector(frc, text);
+            Rectangle2D bounds = gv.getVisualBounds();
+            if (boundsOutput != null) {
+                boundsOutput.value = bounds;
+            }
+
+            // Get Global Font Metrics
+            FontMetrics fm = g2d.getFontMetrics();
+
+            // --- HORIZONTAL CENTERING ---
+            // Calculate the width of the specific character
+            int textWidth = fm.stringWidth(text);
+            // Calculate X to center it in the canvas
+            int x = (width - textWidth) / 2;
+
+            // --- VERTICAL BASELINE ALIGNMENT ---
+            // We want the "body" of the font to be centered vertically.
+            // The body height is Ascent + Descent.
+            int fontBodyHeight = fm.getMaxAscent() + fm.getMaxDescent();
+
+            // Calculate how much empty space is left vertically
+            int emptySpaceY = height - fontBodyHeight;
+
+            // Split the empty space (top margin)
+            int topMargin = emptySpaceY / 2;
+
+            // The baseline is drawn at: Top Margin + Ascent
+            int y = topMargin + fm.getMaxAscent();
+
+            g2d.setColor(Color.BLACK);
+            g2d.drawString(text, x, y);
+
+            // (Optional) draw marks to show metrics (ascent, descent, advance width)
+            if (SHOW_METRICS) {
+                g2d.setColor(Color.BLUE);
+                int ya = y - fm.getAscent();
+                int x2 = x + textWidth;
+                int yd = y + fm.getDescent();
+
+                // draw horizontals
+                for (int yp : new int[] {ya, y, yd}) {
+                    g2d.drawLine(x, yp, x + 5, yp);
+                    g2d.drawLine(x2 - 5, yp, x2, yp);
+                }
+                // draw verticals
+                g2d.drawLine(x, ya, x, ya + 5);
+                g2d.drawLine(x2, ya, x2, ya + 5);
+                g2d.drawLine(x, y - 5, x, y + 5);
+                g2d.drawLine(x2, y - 5, x2, y + 5);
+                g2d.drawLine(x, yd, x, yd - 5);
+                g2d.drawLine(x2, yd, x2, yd - 5);
+            }
+        } finally {
+            // Release the graphics context even if rendering throws.
+            g2d.dispose();
+        }
+        return image;
+    }
+
+    /**
+     * Compares two BufferedImages pixel by pixel. Returns true if they are identical in dimensions
+     * and content.
+     */
+    public static boolean compareImages(BufferedImage imgA, BufferedImage imgB) {
+        // 1. Check dimensions
+        if (imgA.getWidth() != imgB.getWidth() || imgA.getHeight() != imgB.getHeight()) {
+            return false;
+        }
+
+        // 2. Check every pixel
+        int width = imgA.getWidth();
+        int height = imgA.getHeight();
+
+        for (int y = 0; y < height; y++) {
+            for (int x = 0; x < width; x++) {
+                if (imgA.getRGB(x, y) != imgB.getRGB(x, y)) {
+                    return false;
+                }
+            }
+        }
+
+        return true;
+    }
+
+    /** Checks if the image is entirely white (opaque white). */
+    public static boolean isAllWhite(BufferedImage img) {
+        return isImageSingleColor(img, Color.WHITE.getRGB());
+    }
+
+    /** Checks if the image is entirely transparent (every pixel is 0x00000000). */
+    public static boolean isAllTransparent(BufferedImage img) {
+        // 0x00000000 is the integer value for fully transparent
+        return isImageSingleColor(img, 0);
+    }
+
+    /** Helper: Checks if every pixel in the image matches a specific color. */
+    public static boolean isImageSingleColor(BufferedImage img, int colorCode) {
+        int width = img.getWidth();
+        int height = img.getHeight();
+
+        for (int y = 0; y < height; y++) {
+            for (int x = 0; x < width; x++) {
+                if (img.getRGB(x, y) != colorCode) {
+                    return false; // Found a pixel that isn't the target color
+                }
+            }
+        }
+        return true;
+    }
+
+    /**
+     * Returns the name of the value that {@code cp} has for the given ICU integer property.
+     *
+     * @param uPropertyNumber a {@code UProperty} constant
+     * @param nameChoice a {@code UProperty.NameChoice} constant
+     * @param cp the code point to look up
+     */
+    public static String getPropValueName(int uPropertyNumber, int nameChoice, int cp) {
+        return UCharacter.getPropertyValueName(
+                uPropertyNumber, UCharacter.getIntPropertyValue(cp, uPropertyNumber), nameChoice);
+    }
+
+    /**
+     * Returns the character, its hex code point, its extended name and its short
+     * General_Category, separated by tabs.
+     */
+    public static String charInfo(int cp) {
+        return Character.toString(cp)
+                + "\t"
+                + Utility.hex(cp)
+                + "\t"
+                + UCharacter.getExtendedName(cp)
+                + "\t"
+                + GlyphRenderer.getPropValueName(UProperty.GENERAL_CATEGORY, NameChoice.SHORT, cp);
+    }
+}