Skip to content

Commit f43561a

Browse files
committed
updates
1 parent 847f250 commit f43561a

File tree

4 files changed

+327
-9
lines changed

4 files changed

+327
-9
lines changed
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
package org.unicode.tools;
2+
3+
import java.io.IOException;
4+
import java.nio.file.Files;
5+
import java.nio.file.Path;
6+
import java.nio.file.Paths;
7+
import java.util.LinkedHashMap;
8+
import java.util.Map;
9+
import java.util.Map.Entry;
10+
11+
import org.unicode.cldr.util.Pair;
12+
import org.unicode.text.utility.Settings.UnicodeTools;
13+
import org.unicode.text.utility.Utility;
14+
15+
import com.google.common.base.Objects;
16+
import com.ibm.icu.impl.UnicodeMap;
17+
import com.ibm.icu.text.Normalizer2;
18+
import com.ibm.icu.text.UnicodeSet;
19+
20+
public class CheckMissingNFKC {
21+
public static void main(String[] args) throws IOException {
22+
Normalizer2 nfkc_cf = Normalizer2.getNFKCCasefoldInstance();
23+
24+
UnicodeMap<String> n4m = new UnicodeMap<>();
25+
System.out.println(UnicodeTools.UNICODETOOLS_RSRC_DIR);
26+
// "/unicodetools/src/main/resources/org/unicode/tools/nfkc-extended.txt"
27+
Path filePath = Paths.get(UnicodeTools.UNICODETOOLS_RSRC_DIR,"org/unicode/tools/nfkc-extended.txt");
28+
29+
// Unfortunately the internal tools in ICU aren't accessible, so parse it ourselves
30+
// https://unicode-org.github.io/icu/userguide/transforms/normalization/#data-file-syntax
31+
32+
Files.lines(filePath).forEach(line -> {
33+
if (line.isBlank() || line.startsWith("*")) {
34+
return;
35+
}
36+
int greaterPos = line.indexOf('>');
37+
if (greaterPos < 0) {
38+
greaterPos = line.indexOf('='); // for our purposed, = is the same as >
39+
if (greaterPos < 0) {
40+
int colonPos = line.indexOf(':');
41+
if (colonPos < 0) {
42+
throw new IllegalArgumentException("line: " + line);
43+
}
44+
}
45+
return;
46+
}
47+
String rawSource = line.substring(0,greaterPos);
48+
String target = Utility.fromHex(line.substring(greaterPos+1));
49+
50+
int rangePos = rawSource.indexOf("..");
51+
if (rangePos < 0) {
52+
String source = Utility.fromHex(rawSource);
53+
n4m.put(source, target);
54+
} else {
55+
int sourceStart = Utility.fromHex(rawSource.substring(0,rangePos)).codePointAt(0);
56+
int sourceEnd = Utility.fromHex(rawSource.substring(rangePos+2)).codePointAt(0);
57+
n4m.putAll(sourceStart, sourceEnd, target);
58+
}
59+
});
60+
61+
Map<String, Pair<String, String>> diff = new LinkedHashMap<>();
62+
UnicodeSet toCheck = new UnicodeSet("[[\\P{C}]-\\p{cf}]");
63+
System.out.println("Checking: " + toCheck.size() + " \t" + toCheck);
64+
for (int cp : toCheck.codePoints()) {
65+
String string = Character.toString(cp); // wish there were a code point interface
66+
String nfc_cfString = nfkc_cf.normalize(string);
67+
String n4mString = n4m.get(cp);
68+
if (n4mString == null) {
69+
n4mString = string;
70+
}
71+
if (Objects.equal(nfc_cfString, n4mString)) {
72+
continue;
73+
}
74+
diff.put(string, Pair.of(n4mString, nfc_cfString));
75+
}
76+
System.out.println("Differences:\t" + diff.size());
77+
System.out.println("Source" + "\t" + "N4M" + "\t" + "nfkc_cf");
78+
79+
for (Entry<String, Pair<String, String>> entry : diff.entrySet()) {
80+
System.out.println(Utility.hex(entry.getKey()) + "\t" + Utility.hex(entry.getValue().getFirst()) + "\t" + Utility.hex(entry.getValue().getSecond()));
81+
}
82+
}
83+
}

unicodetools/src/main/java/org/unicode/tools/FindBlankGlyphs.java

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ public static void main(String[] args) {
7979
+ "\t"
8080
+ UCharacter.getExtendedName(cp)
8181
+ "\t"
82-
+ getPropValueName(cp)
82+
+ GlyphRenderer.getPropValueName(UProperty.GENERAL_CATEGORY, NameChoice.SHORT, cp)
8383
+ "\t"
8484
+ visualBounds);
8585

@@ -130,7 +130,7 @@ public static void main(String[] args) {
130130
+ "</td><td>"
131131
+ UCharacter.getExtendedName(cp)
132132
+ "</td><td>"
133-
+ getPropValueName(cp)
133+
+ GlyphRenderer.getPropValueName(UProperty.GENERAL_CATEGORY, NameChoice.SHORT, cp)
134134
+ "</td>"
135135
+ "<tr>");
136136
}
@@ -141,11 +141,4 @@ public static void main(String[] args) {
141141
System.out.println("Checked: " + count);
142142
System.out.println(specialCases);
143143
}
144-
145-
private static String getPropValueName(int cp) {
146-
return UCharacter.getPropertyValueName(
147-
UProperty.GENERAL_CATEGORY,
148-
UCharacter.getIntPropertyValue(cp, UProperty.GENERAL_CATEGORY),
149-
NameChoice.SHORT);
150-
}
151144
}
Lines changed: 220 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,220 @@
1+
package org.unicode.utilities;
2+
import java.awt.Font;
3+
import java.awt.Graphics2D;
4+
import java.awt.Image;
5+
import java.awt.Point;
6+
import java.awt.Rectangle;
7+
import java.awt.geom.Rectangle2D;
8+
import java.awt.image.BufferedImage;
9+
import java.util.ArrayList;
10+
import java.util.List;
11+
12+
import org.unicode.text.utility.Utility;
13+
14+
import com.ibm.icu.text.UnicodeSet;
15+
import com.ibm.icu.util.Output;
16+
17+
public class FastSymbolMatcher {
18+
19+
// --- Tuning Parameters ---
20+
private static final double MAX_ASPECT_RATIO_DIFF = 0.5; // Fail if width/height is too different
21+
private static final double MAX_DENSITY_DIFF = 0.25; // Fail if ink thickness is too different
22+
23+
public static void main(String[] args) {
24+
UnicodeSet chars = new UnicodeSet(args[0]);
25+
System.out.println(chars.size() + "\t" + chars);
26+
27+
List<SymbolProfile> profiles = new ArrayList<>();
28+
Output<Rectangle2D> visualBounds = new Output<>();
29+
String fontName = "Noto Sans";
30+
int fontSize = 144;
31+
Font font = new Font(fontName, Font.PLAIN, fontSize);
32+
33+
int count = 0;
34+
35+
for (int cp : chars.codePoints()) {
36+
if ((count % 10000) == 0) {
37+
System.out.println(count + "\t" + Utility.hex(cp));
38+
}
39+
++count;
40+
String character = Character.toString(cp);
41+
42+
BufferedImage image =
43+
GlyphRenderer.createGlyphBitmap(
44+
font, character, visualBounds, fontSize * 2, fontSize * 3 / 2);
45+
profiles.add(new SymbolProfile(cp, image));
46+
}
47+
compare(profiles);
48+
}
49+
50+
public static void compare(List<SymbolProfile> profiles) {
51+
52+
// 2. PRE-PROCESSING (The "N" Step)
53+
// Convert heavy images into lightweight Profiles ONCE.
54+
// This is O(N) complexity.
55+
56+
System.out.println("Generating comparisons");
57+
// 3. COMPARISON (The "N^2" Step)
58+
// Compare every symbol against every other symbol
59+
for (int i = 0; i < profiles.size(); i++) {
60+
System.out.print("\t" + Character.toString(profiles.get(i).cp));
61+
}
62+
System.out.println();
63+
for (int i = 0; i < profiles.size(); i++) {
64+
SymbolProfile p1 = profiles.get(i);
65+
System.out.print(Character.toString(p1.cp));
66+
for (int j = 0; j < i; j++) {
67+
SymbolProfile p2 = profiles.get(j);
68+
69+
// --- THE QUICK TEST ---
70+
if (shouldFailFast(p1, p2)) {
71+
// Skip expensive logic completely
72+
//System.out.println("Quick Mismatch: " + GlyphRenderer.charInfo(p2.cp));
73+
System.out.print("\t");
74+
continue;
75+
}
76+
77+
// --- EXPENSIVE TEST ---
78+
// Only runs if the symbols are roughly similar shapes
79+
double score = detailedCompare(p1, p2);
80+
System.out.print("\t" + (int)(score * 100));
81+
}
82+
System.out.println();
83+
}
84+
}
85+
86+
/**
87+
* The "Gatekeeper" method.
88+
* Returns TRUE if the symbols are so different we shouldn't bother comparing pixels.
89+
*/
90+
private static boolean shouldFailFast(SymbolProfile p1, SymbolProfile p2) {
91+
// Test 1: Aspect Ratio (Is one tall and thin, and the other short and wide?)
92+
// e.g., prevents comparing "l" with "w"
93+
if (Math.abs(p1.aspectRatio - p2.aspectRatio) > MAX_ASPECT_RATIO_DIFF) {
94+
return true;
95+
}
96+
97+
// Test 2: Ink Density (Is one heavy/filled and the other light/empty?)
98+
// e.g., prevents comparing "." with "M"
99+
if (Math.abs(p1.inkDensity - p2.inkDensity) > MAX_DENSITY_DIFF) {
100+
return true;
101+
}
102+
103+
return false;
104+
}
105+
106+
/**
107+
* The detailed pixel comparison (same logic as before, but using cached thumbnails).
108+
*/
109+
private static double detailedCompare(SymbolProfile p1, SymbolProfile p2) {
110+
// 1. Calculate Weights
111+
double visualSim = getVisualSimilarity(p1.thumbnail, p2.thumbnail);
112+
113+
// Size Penalty (using cached bounds)
114+
double areaA = p1.bounds.width * p1.bounds.height;
115+
double areaB = p2.bounds.width * p2.bounds.height;
116+
double sizeSim = 1.0 - (Math.abs(areaA - areaB) / Math.max(areaA, areaB));
117+
118+
// Position Penalty (using cached centers)
119+
double dist = p1.center.distance(p2.center);
120+
double maxDist = Math.sqrt(Math.pow(1000, 2) + Math.pow(1000, 2)); // Mock max canvas size
121+
double posSim = 1.0 - (dist / maxDist);
122+
if (posSim < 0) posSim = 0;
123+
124+
return (visualSim * 0.6) + (sizeSim * 0.2) + (posSim * 0.2);
125+
}
126+
127+
private static double getVisualSimilarity(BufferedImage imgA, BufferedImage imgB) {
128+
long diff = 0;
129+
int w = imgA.getWidth();
130+
int h = imgA.getHeight();
131+
for (int y = 0; y < h; y++) {
132+
for (int x = 0; x < w; x++) {
133+
// Quick grayscale diff
134+
int rgbA = imgA.getRGB(x, y) & 0xFF; // Blue channel proxy for gray
135+
int rgbB = imgB.getRGB(x, y) & 0xFF;
136+
diff += Math.abs(rgbA - rgbB);
137+
}
138+
}
139+
return 1.0 - (diff / (w * h * 255.0));
140+
}
141+
142+
// --- INNER CLASS FOR PRE-CALCULATED DATA ---
143+
static class SymbolProfile {
144+
int cp;
145+
Rectangle bounds;
146+
Point center;
147+
double aspectRatio;
148+
double inkDensity;
149+
BufferedImage thumbnail; // Small 32x32 cached version
150+
151+
public SymbolProfile(int cp, BufferedImage raw) {
152+
this.cp = cp;
153+
// 1. Expensive Scan (Done ONLY ONCE per image)
154+
this.bounds = getBounds(raw);
155+
156+
if (this.bounds == null) {
157+
// Handle empty image
158+
this.bounds = new Rectangle(0,0,1,1);
159+
this.aspectRatio = 0;
160+
this.inkDensity = 0;
161+
this.thumbnail = new BufferedImage(32, 32, BufferedImage.TYPE_INT_ARGB);
162+
return;
163+
}
164+
165+
this.center = new Point((int)bounds.getCenterX(), (int)bounds.getCenterY());
166+
167+
// 2. Pre-calculate Heuristics
168+
this.aspectRatio = (double) bounds.width / bounds.height;
169+
170+
// 3. Create Cached Thumbnail
171+
BufferedImage cropped = raw.getSubimage(bounds.x, bounds.y, bounds.width, bounds.height);
172+
this.thumbnail = resize(cropped, 32, 32);
173+
174+
// 4. Calculate Density (on the small thumbnail to save time)
175+
this.inkDensity = calculateDensity(this.thumbnail);
176+
}
177+
178+
private double calculateDensity(BufferedImage img) {
179+
long totalPixels = img.getWidth() * img.getHeight();
180+
long filledPixels = 0;
181+
for(int y=0; y<img.getHeight(); y++) {
182+
for(int x=0; x<img.getWidth(); x++) {
183+
int alpha = (img.getRGB(x, y) >> 24) & 0xff;
184+
if(alpha > 0) filledPixels++; // Assuming transparent background
185+
// If white background, check brightness < 200
186+
}
187+
}
188+
return (double) filledPixels / totalPixels;
189+
}
190+
191+
private Rectangle getBounds(BufferedImage img) {
192+
int minX = img.getWidth(), minY = img.getHeight(), maxX = -1, maxY = -1;
193+
boolean found = false;
194+
for (int y = 0; y < img.getHeight(); y++) {
195+
int xSum = 0;
196+
for (int x = 0; x < img.getWidth(); x++) {
197+
int alpha = (img.getRGB(x, y) >> 24) & 0xff;
198+
xSum += alpha;
199+
if (alpha != 0) { // Assuming transparent background
200+
if (x < minX) minX = x;
201+
if (x > maxX) maxX = x;
202+
if (y < minY) minY = y;
203+
if (y > maxY) maxY = y;
204+
found = true;
205+
}
206+
}
207+
}
208+
return found ? new Rectangle(minX, minY, maxX - minX + 1, maxY - minY + 1) : null;
209+
}
210+
211+
private BufferedImage resize(BufferedImage img, int w, int h) {
212+
Image tmp = img.getScaledInstance(w, h, Image.SCALE_SMOOTH);
213+
BufferedImage dimg = new BufferedImage(w, h, BufferedImage.TYPE_INT_ARGB);
214+
Graphics2D g2d = dimg.createGraphics();
215+
g2d.drawImage(tmp, 0, 0, null);
216+
g2d.dispose();
217+
return dimg;
218+
}
219+
}
220+
}

unicodetools/src/main/java/org/unicode/utilities/GlyphRenderer.java

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
package org.unicode.utilities;
22

3+
import com.ibm.icu.lang.UCharacter;
4+
import com.ibm.icu.lang.UProperty;
5+
import com.ibm.icu.lang.UProperty.NameChoice;
36
import com.ibm.icu.util.Output;
47
import java.awt.Color;
58
import java.awt.Font;
@@ -11,6 +14,8 @@
1114
import java.awt.geom.Rectangle2D;
1215
import java.awt.image.BufferedImage;
1316

17+
import org.unicode.text.utility.Utility;
18+
1419
public class GlyphRenderer {
1520
private static final boolean SHOW_METRICS = false;
1621

@@ -136,4 +141,21 @@ public static boolean isImageSingleColor(BufferedImage img, int colorCode) {
136141
}
137142
return true;
138143
}
144+
145+
public static String getPropValueName(int uPropertyNumber, int nameChoice, int cp) {
146+
return UCharacter.getPropertyValueName(
147+
uPropertyNumber,
148+
UCharacter.getIntPropertyValue(cp, uPropertyNumber),
149+
nameChoice);
150+
}
151+
152+
public static String charInfo(int cp) {
153+
return Character.toString(cp)
154+
+ "\t"
155+
+ Utility.hex(cp)
156+
+ "\t"
157+
+ UCharacter.getExtendedName(cp)
158+
+ "\t"
159+
+ GlyphRenderer.getPropValueName(UProperty.GENERAL_CATEGORY, NameChoice.SHORT, cp);
160+
}
139161
}

0 commit comments

Comments
 (0)