Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 95 additions & 0 deletions unicodetools/src/main/java/org/unicode/tools/CheckMissingNFKC.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
package org.unicode.tools;

import com.google.common.base.Objects;
import com.ibm.icu.impl.UnicodeMap;
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.UnicodeSet;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Map.Entry;
import org.unicode.cldr.util.Pair;
import org.unicode.text.utility.Settings.UnicodeTools;
import org.unicode.text.utility.Utility;

/**
 * Command-line check that compares the mappings listed in {@code nfkc-extended.txt} against ICU's
 * NFKC_Casefold normalizer and prints every checked code point for which the two disagree.
 */
public class CheckMissingNFKC {
    /**
     * Loads the mapping file, compares it with NFKC_Casefold for every code point in {@code
     * [[\P{C}]-\p{cf}]}, and prints a tab-separated table of the differences to standard output.
     *
     * @param args ignored
     * @throws IOException if the mapping file cannot be read
     * @throws IllegalArgumentException if a line of the mapping file has no recognized separator
     */
    public static void main(String[] args) throws IOException {
        Normalizer2 nfkcCasefold = Normalizer2.getNFKCCasefoldInstance();

        UnicodeMap<String> n4m = new UnicodeMap<>();
        System.out.println(UnicodeTools.UNICODETOOLS_RSRC_DIR);
        // "/unicodetools/src/main/resources/org/unicode/tools/nfkc-extended.txt"
        Path filePath =
                Paths.get(
                        UnicodeTools.UNICODETOOLS_RSRC_DIR, "org/unicode/tools/nfkc-extended.txt");

        // Unfortunately the internal tools in ICU aren't accessible, so parse it ourselves
        // https://unicode-org.github.io/icu/userguide/transforms/normalization/#data-file-syntax
        //
        // readAllLines opens, reads and closes the file itself; a bare Files.lines(...) stream
        // would keep the file handle open until it is explicitly closed.
        for (String line : Files.readAllLines(filePath)) {
            addMapping(n4m, line);
        }

        Map<String, Pair<String, String>> diff = new LinkedHashMap<>();
        UnicodeSet toCheck = new UnicodeSet("[[\\P{C}]-\\p{cf}]");
        System.out.println("Checking: " + toCheck.size() + " \t" + toCheck);
        for (int cp : toCheck.codePoints()) {
            String string = Character.toString(cp); // wish there were a code point interface
            String nfkcCfString = nfkcCasefold.normalize(string);
            String n4mString = n4m.get(cp);
            if (n4mString == null) {
                // No entry in the file: the code point is taken to map to itself.
                n4mString = string;
            }
            if (Objects.equal(nfkcCfString, n4mString)) {
                continue;
            }
            diff.put(string, Pair.of(n4mString, nfkcCfString));
        }
        System.out.println("Differences:\t" + diff.size());
        System.out.println("Source" + "\t" + "N4M" + "\t" + "nfkc_cf");

        for (Entry<String, Pair<String, String>> entry : diff.entrySet()) {
            System.out.println(
                    Utility.hex(entry.getKey())
                            + "\t"
                            + Utility.hex(entry.getValue().getFirst())
                            + "\t"
                            + Utility.hex(entry.getValue().getSecond()));
        }
    }

    /**
     * Parses one line of the normalization data file and records its mapping, if any, in {@code
     * n4m}.
     *
     * <p>Blank lines, lines starting with {@code *}, and lines that only contain a {@code :}
     * separator add nothing. Text from a {@code #} onward is treated as a comment, following the
     * data-file syntax linked from {@link #main}.
     *
     * @param n4m map that receives the source-to-target mapping
     * @param rawLine one line of the data file
     * @throws IllegalArgumentException if the line contains none of {@code >}, {@code =}, {@code :}
     */
    private static void addMapping(UnicodeMap<String> n4m, String rawLine) {
        int commentPos = rawLine.indexOf('#');
        String line = commentPos < 0 ? rawLine : rawLine.substring(0, commentPos);
        if (line.isBlank() || line.startsWith("*")) {
            return;
        }

        int separatorPos = line.indexOf('>');
        if (separatorPos < 0) {
            // For our purposes, = is the same as >. Previously every line without '>' was
            // skipped, so '=' mappings never reached the map.
            separatorPos = line.indexOf('=');
        }
        if (separatorPos < 0) {
            if (line.indexOf(':') < 0) {
                throw new IllegalArgumentException("line: " + line);
            }
            // A ':' line carries no mapping (presumably a ccc assignment in the linked
            // syntax), so there is nothing to record.
            return;
        }

        String rawSource = line.substring(0, separatorPos);
        String target = Utility.fromHex(line.substring(separatorPos + 1));

        int rangePos = rawSource.indexOf("..");
        if (rangePos < 0) {
            n4m.put(Utility.fromHex(rawSource), target);
        } else {
            // start..end: every code point in the inclusive range gets the same target.
            int sourceStart = Utility.fromHex(rawSource.substring(0, rangePos)).codePointAt(0);
            int sourceEnd = Utility.fromHex(rawSource.substring(rangePos + 2)).codePointAt(0);
            n4m.putAll(sourceStart, sourceEnd, target);
        }
    }
}
147 changes: 147 additions & 0 deletions unicodetools/src/main/java/org/unicode/tools/FindBlankGlyphs.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
package org.unicode.tools;

import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UProperty.NameChoice;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.Output;
import java.awt.Font;
import java.awt.geom.Rectangle2D;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.UncheckedIOException;
import javax.imageio.ImageIO;
import org.unicode.text.utility.Settings;
import org.unicode.text.utility.Utility;
import org.unicode.utilities.GlyphRenderer;

/**
 * Command-line tool that renders code points with a fixed font and reports those whose rendered
 * bitmap {@code GlyphRenderer.isImageSingleColor} considers a single color. For each reported code
 * point it saves a PNG and, at the end, writes an HTML table listing all of them.
 */
public class FindBlankGlyphs {
    /** Directory that receives the glyph PNG files and {@code list.html}. */
    public static final String DATA_DIR =
            Settings.UnicodeTools.UNICODETOOLS_REPO_DIR + "/unicodetools/data/temp/";

    /**
     * Renders every non-excluded code point, prints and saves the ones that come out as a
     * single-color image, then writes the HTML overview.
     *
     * @param args ignored
     * @throws UncheckedIOException if the output directory or any output file cannot be written
     */
    public static void main(String[] args) {
        // Configuration
        String fontName = "Noto Sans";
        int fontSize = 144;
        Font font = new Font(fontName, Font.PLAIN, fontSize);

        // A self-contained, balanced set pattern. It used to carry a stray trailing ']' that
        // doubled as the closing bracket of the exclusions pattern below.
        String specialCases =
                "[\\N{HANGUL CHOSEONG FILLER}"
                        + "\\N{HANGUL JUNGSEONG FILLER}"
                        + "\\N{HANGUL FILLER}"
                        + "\\N{HALFWIDTH HANGUL FILLER}"
                        + "\\N{COMBINING GRAPHEME JOINER}"
                        + "\\N{KHMER VOWEL INHERENT AQ}"
                        + "\\N{KHMER VOWEL INHERENT AA}"
                        + "\\N{BRAILLE PATTERN BLANK}"
                        + "\\p{variation_selector}]";
        UnicodeSet exclusions =
                new UnicodeSet(
                                "["
                                        + "\\p{C}"
                                        + "\\p{Z}"
                                        + "\\p{rgi_emoji}"
                                        // + "\\p{RGI_Emoji_Qualification=Minimally_Qualified}"
                                        // ICU doesn't support this yet!
                                        + "[\\p{emoji}-\\p{emoji_component}]"
                                        + "\\p{whitespace}"
                                        + "\\p{deprecated}"

                                        // special cases
                                        + specialCases
                                        + "]")
                        .freeze();

        UnicodeSet showAnyway = new UnicodeSet("[]"); // \\u034F
        UnicodeSet chars =
                new UnicodeSet(0, 0x10FFFF).removeAll(exclusions).addAll(showAnyway).freeze();
        UnicodeSet show = new UnicodeSet();
        Output<Rectangle2D> visualBounds = new Output<>();

        // Make sure the output directory exists before the first image is written.
        File dataDir = new File(DATA_DIR);
        if (!dataDir.isDirectory() && !dataDir.mkdirs()) {
            throw new UncheckedIOException(
                    new IOException("Cannot create directory " + dataDir));
        }

        int count = 0;
        for (int cp : chars.codePoints()) {
            // Progress indicator every 10000 code points.
            if ((count % 10000) == 0) {
                System.out.println(count + "\t" + Utility.hex(cp));
            }
            ++count;
            String character = Character.toString(cp);

            BufferedImage image =
                    GlyphRenderer.createGlyphBitmap(
                            font, character, visualBounds, fontSize * 2, fontSize * 3 / 2);
            // Only single-color images (or explicitly requested code points) are reported.
            if (!GlyphRenderer.isImageSingleColor(image, 0) && !showAnyway.contains(cp)) {
                continue;
            }
            show.add(cp);
            System.out.println(
                    Utility.hex(cp)
                            + "\t"
                            + UCharacter.getExtendedName(cp)
                            + "\t"
                            + GlyphRenderer.getPropValueName(
                                    UProperty.GENERAL_CATEGORY, NameChoice.SHORT, cp)
                            + "\t"
                            + visualBounds);

            // Save each image to file. The name is built from Utility.hex(cp), the same call the
            // HTML uses for its <img src>, so the links always match the files.
            File imageFile = new File(dataDir, "glyph_" + Utility.hex(cp) + ".png");
            try {
                ImageIO.write(image, "png", imageFile);
            } catch (IOException e) {
                throw new UncheckedIOException(e);
            }
        }

        writeHtmlList(show);
        System.out.println("Checked: " + count);
        System.out.println(specialCases);
    }

    /**
     * Writes {@code list.html} into {@link #DATA_DIR}: one table row per code point with its saved
     * image, the character itself, its hex value, its extended name and its general category.
     *
     * @param show the code points to list
     * @throws UncheckedIOException if the file cannot be created or written
     */
    private static void writeHtmlList(UnicodeSet show) {
        File file = new File(DATA_DIR, "list.html");
        // The page contains arbitrary Unicode characters, so write it as UTF-8 explicitly rather
        // than in the platform default charset, and declare that charset in the page.
        try (PrintWriter writer = new PrintWriter(file, "UTF-8")) {
            writer.println(
                    "<html><head><meta charset='UTF-8'></head><body><style>"
                            + "table, th, td {\n"
                            + "  border: 1px solid black; /* Sets a 1px solid black border on all elements */\n"
                            + "}\n"
                            + "\n"
                            + "table {\n"
                            + "  border-collapse: collapse; /* Merges adjacent borders into a single line */\n"
                            + "}\n"
                            + "\n"
                            + "th, td {\n"
                            + "  padding: 4px; /* Optional: Adds space between border and content */\n"
                            + "}\n"
                            + "body {\n"
                            + "  font-family: 'Noto Sans', 'Noto Sans Symbols', sans-serif;\n"
                            + "}\n"
                            + "</style><table>");
            for (int cp : show.codePoints()) {
                String hex = Utility.hex(cp);
                String name = UCharacter.getExtendedName(cp);
                String generalCategory =
                        GlyphRenderer.getPropValueName(
                                UProperty.GENERAL_CATEGORY, NameChoice.SHORT, cp);
                writer.println(
                        "<tr>"
                                + "<td>"
                                + "<img src='"
                                + "glyph_"
                                + hex
                                + ".png"
                                + "' alt='"
                                + name
                                + "' width='auto' height='32'>"
                                + "</td><td style='font-size: 24px; text-align: center'>"
                                + Character.toString(cp)
                                + "</td><td>"
                                + hex
                                + "</td><td>"
                                + name
                                + "</td><td>"
                                + generalCategory
                                + "</td>"
                                + "</tr>"); // was "<tr>", which left every row unclosed
            }
            writer.println("</table></body></html>");
            // PrintWriter never throws on write failures; ask it explicitly.
            if (writer.checkError()) {
                throw new IOException("Failed writing " + file);
            }
        } catch (IOException e) {
            throw new UncheckedIOException(e);
        }
    }
}
Loading
Loading