Skip to content

Commit 8729e52

Browse files
committed
Cleaned up the LinkDetectionTest
1 parent 586c079 commit 8729e52

File tree

7 files changed

+244
-50
lines changed

7 files changed

+244
-50
lines changed

unicodetools/data/linkification/dev/LinkDetectionTest.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# LinkDetectionTest.txt
2-
# Date: 2025-12-05, 22:41:42 GMT
2+
# Date: 2025-12-06, 18:24:23 GMT
33
# © 2025 Unicode®, Inc.
44
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
55
# For terms of use and license, see https://www.unicode.org/terms_of_use.html
Lines changed: 20 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# LinkFormattingTest.txt
2-
# Date: 2025-12-03 23:00:53 GMT
2+
# Date: 2025-12-06, 18:24:23 GMT
33
# © 2025 Unicode®, Inc.
44
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
55
# For terms of use and license, see https://www.unicode.org/terms_of_use.html
@@ -16,43 +16,44 @@
1616
# Empty lines, and lines starting with # are ignored.
1717
# Otherwise # is treated like any other character.
1818
#
19-
# The Path, Query, and Fragment may contain escapes when characters would otherwise be
20-
# syntax characters in that part. For example, a literal / within a path would be \/.
19+
# The Path, Query, and Fragment may contain backslash escapes when characters would otherwise be
20+
# internal syntax characters in that part. For example, a literal / within a path segment would be \/.
2121
# ================================================
22-
Format:
22+
23+
24+
# Selected test cases
2325

2426
# Path only
25-
https://example.com; α; ; ; https://example.com/α
27+
https:// ; example.com ; α ; ; ; https://example.com/α
2628

2729
# Query only
28-
https://example.com; ; α; ; https://example.com?α
30+
https:// ; example.com ; ; α ; ; https://example.com?α
2931

3032
# Fragment only
31-
https://example.com; ; ; α; https://example.com#α
33+
https:// ; example.com ; ; ; α ; https://example.com#α
3234

3335
# All parts
34-
https://example.com; αβγ/δεζ; θ=ικλ&μ=γξο; πρς; https://example.com/αβγ/δεζ?θ=ικλ&μ=γξο#πρς
36+
https:// ; example.com ; αβγ/δεζ ; θ=ικλ&μ=γξο ; πρς ; https://example.com/αβγ/δεζ?θ=ικλ&μ=γξο#πρς
3537

3638
# Escape ? in Path
37-
https://example.com; α?μπ; ; ; https://example.com/α%3Fμπ
39+
https:// ; example.com ; α?μπ ; ; ; https://example.com/α%3Fμπ
3840

3941
# Escape # in Path/Query
40-
https://example.com; α#β; γ=δ#ε; ; https://example.com/α%23β?γ=δ%23ε
42+
https:// ; example.com ; α#β ; γ=δ#ε ; ; https://example.com/α%23β?γ=δ%23ε
4143

4244
# Escape hard (' ')
43-
https://example.com; αβ γ/δεζ; θ=ικ λ&=γξο; πρ σ; https://example.com/αβ%20γ/δεζ?θ=ικ%20λ&=γξο#πρ%20σ
45+
https:// ; example.com ; αβ γ/δεζ ; θ=ικ λ&=γξο ; πρ σ ; https://example.com/αβ%20γ/δεζ?θ=ικ%20λ&=γξο#πρ%20σ
4446

4547
# Escape soft ('.') unless followed by include
46-
https://example.com; αβγ./δεζ.; θ=ικ.λ&=γξο.; πρς.; https://example.com/αβγ./δεζ.?θ=ικ.λ&=γξο.#πρς%2E
48+
https:// ; example.com ; αβγ./δεζ. ; θ=ικ.λ&=γξο. ; πρς. ; https://example.com/αβγ./δεζ.?θ=ικ.λ&=γξο.#πρς%2E
4749

4850
# Escape unmatched brackets
49-
https://example.com; α(β)); γ(δ)); ε(ζ)); https://example.com/α(β)%29?γ(δ)%29#ε(ζ)%29
51+
https:// ; example.com ; α(β)) ; γ(δ)) ; ε(ζ)) ; https://example.com/α(β)%29?γ(δ)%29#ε(ζ)%29
5052

51-
# Query with escapes
52-
https://example.com; ; α\=\&=\=\&=%; ; https://example.com?α%3D%26%=%3D%26%
53+
# Path with escapes (\/ is a literal /, % is escaped)
54+
https:// ; example.com ; α/β\/γ% ; ; ; https://example.com/α/β%2Fγ%25
5355

54-
# Path with escapes
55-
https://example.com; α/β\/γ; ; ; https://example.com/α/β%2Fγ
56+
# Query with escapes (\= is a literal =, so the left side is α=β and the right side is γ%δ)
57+
https:// ; example.com ; ; α\=β=γ%δ ; ; https://example.com?α%3Dβ=γ%25δ
5658

57-
# Query with escapes
58-
https://example.com; ; α\=\&%=\=\&=%; ; https://example.com?α%3D%26%=%3D%26%
59+
https:// ; example.com ; ; α\&β=γ&δ=ε ; ; https://example.com?α%26β=γ&δ=ε

unicodetools/data/linkification/dev/LinkPairedOpener.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# LinkPairedOpener.txt
2-
# Date: 2025-12-05, 22:41:42 GMT
2+
# Date: 2025-12-06, 18:24:23 GMT
33
# © 2025 Unicode®, Inc.
44
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
55
# For terms of use and license, see https://www.unicode.org/terms_of_use.html

unicodetools/data/linkification/dev/LinkTermination.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# LinkTermination.txt
2-
# Date: 2025-12-05, 22:41:42 GMT
2+
# Date: 2025-12-06, 18:24:23 GMT
33
# © 2025 Unicode®, Inc.
44
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
55
# For terms of use and license, see https://www.unicode.org/terms_of_use.html

unicodetools/src/main/java/org/unicode/tools/GenerateLinkData.java

Lines changed: 160 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,14 @@
11
package org.unicode.tools;
22

3+
import com.google.common.base.Joiner;
34
import com.google.common.base.Splitter;
5+
import com.google.common.collect.ImmutableSortedMap;
46
import com.google.common.collect.Multimap;
57
import com.google.common.collect.TreeMultimap;
68
import com.ibm.icu.text.NumberFormat;
79
import com.ibm.icu.text.SimpleFormatter;
810
import com.ibm.icu.text.UnicodeSet;
11+
import com.ibm.icu.util.OutputInt;
912
import java.io.IOException;
1013
import java.io.PrintWriter;
1114
import java.io.UncheckedIOException;
@@ -14,6 +17,7 @@
1417
import java.time.Instant;
1518
import java.time.ZoneId;
1619
import java.time.format.DateTimeFormatter;
20+
import java.util.EnumMap;
1721
import java.util.List;
1822
import java.util.NavigableMap;
1923
import java.util.function.Consumer;
@@ -41,7 +45,10 @@
4145
*/
4246
class GenerateLinkData {
4347

44-
private static final Splitter SPLIT_TABS = Splitter.on('\t').omitEmptyStrings();
48+
private static final Joiner JOIN_SEMI_SP = Joiner.on(" ;\t");
49+
private static final Splitter SPLIT_TABS = Splitter.on('\t').omitEmptyStrings().trimResults();
50+
private static final Splitter SPLIT_SEMI = Splitter.on(';').trimResults();
51+
4552
private static final String HEADER_BASE =
4653
"# {0}.txt\n"
4754
+ "# Date: {1} \n"
@@ -54,7 +61,8 @@ class GenerateLinkData {
5461

5562
public static void main(String[] args) throws IOException {
5663
generatePropertyData();
57-
generateTestData();
64+
generateDetectionTestData();
65+
generateFormattingTestData();
5866
}
5967

6068
static final Instant now = Instant.now();
@@ -87,7 +95,7 @@ static void writePropHeader(
8795
filename, dt.format(now), dty.format(now), propertyName, missingValue));
8896
}
8997

90-
static final SimpleFormatter HEADER_TEST =
98+
static final SimpleFormatter HEADER_DETECT_TEST =
9199
SimpleFormatter.compile(
92100
HEADER_BASE
93101
+ "# Format: each line contains zero or more marked links, such as ⸠abc.com⸡\n"
@@ -99,8 +107,26 @@ static void writePropHeader(
99107
+ "# Otherwise # is treated like any other character.\n"
100108
+ "# ================================================\n");
101109

102-
static void writeTestHeader(PrintWriter out, String filename, String testName) {
103-
out.println(HEADER_TEST.format(filename, dt.format(now), dty.format(now), testName));
110+
static final SimpleFormatter HEADER_FORMAT_TEST =
111+
SimpleFormatter.compile(
112+
HEADER_BASE
113+
+ "# Format: Each line has the following fields:\n"
114+
+ "# Scheme/host\n"
115+
+ "# Path\n"
116+
+ "# Query\n"
117+
+ "# Fragment\n"
118+
+ "# Result — with minimal escaping\n"
119+
+ "#\n"
120+
+ "# Empty lines, and lines starting with # are ignored.\n"
121+
+ "# Otherwise # is treated like any other character.\n"
122+
+ "#\n"
123+
+ "# The Path, Query, and Fragment may contain backslash escapes when characters would otherwise be \n"
124+
+ "# internal syntax characters in that part. For example, a literal / within a path segments would be \\/.\n"
125+
+ "# ================================================\n");
126+
127+
static void writeTestHeader(
128+
PrintWriter out, SimpleFormatter simpleFormatter, String filename, String testName) {
129+
out.println(simpleFormatter.format(filename, dt.format(now), dty.format(now), testName));
104130
}
105131

106132
/** Generate property data for the UTS */
@@ -138,21 +164,14 @@ static void generatePropertyData() {
138164
}
139165
}
140166

141-
static void generateTestData() {
167+
static void generateDetectionTestData() {
142168

143-
BagFormatter bf = new BagFormatter(LinkUtilities.IUP).setLineSeparator("\n");
144-
145-
// LinkTermination.txt
146-
147-
bf.setValueSource(LinkTermination.PROPERTY);
148-
bf.setLabelSource(LinkUtilities.IUP.getProperty(UcdProperty.Age));
149-
150-
// "/unicodetools/src/main/resources/org/unicode/tools/test_links_lt.txt"
169+
OutputInt errorCount = new OutputInt();
151170

152171
try (final PrintWriter out =
153172
FileUtilities.openUTF8Writer(
154173
LinkUtilities.DATA_DIR_DEV, "LinkDetectionTest.txt"); ) {
155-
writeTestHeader(out, "LinkDetectionTest", "LinkDetectionTest");
174+
writeTestHeader(out, HEADER_DETECT_TEST, "LinkDetectionTest", "LinkDetectionTest");
156175

157176
out.println("# Test cases contributed by ICANN\n");
158177

@@ -165,6 +184,7 @@ static void generateTestData() {
165184
List<String> parts = SPLIT_TABS.splitToList(line);
166185
if (parts.size() != 2) {
167186
System.out.println("* Malformed? " + line);
187+
++errorCount.value;
168188
return;
169189
}
170190
String base = parts.get(0);
@@ -199,17 +219,9 @@ static void generateTestData() {
199219
} catch (IOException e) {
200220
throw new UncheckedIOException(e);
201221
}
202-
203-
// // LinkPairedOpener.txt
204-
// bf.setValueSource(LinkUtilities.LINK_PAIRED_OPENER);
205-
// try (final PrintWriter out =
206-
// FileUtilities.openUTF8Writer(LinkUtilities.DATA_DIR,
207-
// "LinkFormattingTest.txt"); ) {
208-
// writeHeader(out, "LinkPairedOpener", "LinkPairedOpener", "undefined");
209-
// bf.showSetNames(out, LinkTermination.Close.base);
210-
// } catch (IOException e) {
211-
// throw new UncheckedIOException(e);
212-
// }
222+
if (errorCount.value != 0) {
223+
throw new IllegalArgumentException("Failures in writing test file: " + errorCount);
224+
}
213225
}
214226

215227
private static String addBraces(String base) {
@@ -233,6 +245,128 @@ private static String addBraces(String base) {
233245
return result.toString();
234246
}
235247

248+
static void generateFormattingTestData() {
249+
250+
OutputInt errorCount = new OutputInt();
251+
252+
// // LinkPairedOpener.txt
253+
// bf.setValueSource(LinkUtilities.LINK_PAIRED_OPENER);
254+
// try (final PrintWriter out =
255+
// FileUtilities.openUTF8Writer(LinkUtilities.DATA_DIR,
256+
// "LinkFormattingTest.txt"); ) {
257+
// writeHeader(out, "LinkPairedOpener", "LinkPairedOpener", "undefined");
258+
// bf.showSetNames(out, LinkTermination.Close.base);
259+
// } catch (IOException e) {
260+
// throw new UncheckedIOException(e);
261+
// }
262+
263+
try (final PrintWriter out =
264+
FileUtilities.openUTF8Writer(
265+
LinkUtilities.DATA_DIR_DEV, "LinkFormattingTest.txt"); ) {
266+
writeTestHeader(out, HEADER_FORMAT_TEST, "LinkFormattingTest", "LinkFormattingTest");
267+
268+
out.println("\n# Selected test cases\n");
269+
270+
Files.lines(Path.of(LinkUtilities.RESOURCE_DIR, "linkFormattingSource.txt"))
271+
.forEach(
272+
line -> {
273+
if (line.startsWith("#") || line.isBlank()) {
274+
out.println(line);
275+
return;
276+
}
277+
List<String> parts = SPLIT_SEMI.splitToList(line);
278+
if (parts.size() < 5 || parts.size() > 6) {
279+
System.out.println("* Malformed? " + line);
280+
++errorCount.value;
281+
return;
282+
}
283+
EnumMap<Part, String> partMap = new EnumMap<>(Part.class);
284+
partMap.put(Part.PROTOCOL, parts.get(0));
285+
partMap.put(Part.HOST, parts.get(1));
286+
partMap.put(Part.PATH, parts.get(2));
287+
partMap.put(Part.QUERY, parts.get(3));
288+
partMap.put(Part.FRAGMENT, parts.get(4));
289+
ImmutableSortedMap<Part, String> temp =
290+
ImmutableSortedMap.copyOf(partMap);
291+
292+
String actual = LinkUtilities.minimalEscape(temp, false, null);
293+
294+
String expected = parts.size() < 6 ? null : parts.get(5);
295+
if (expected != null && !actual.equals(expected)) {
296+
System.out.println(
297+
"* mismatch "
298+
+ temp
299+
+ "\nexpected:\t"
300+
+ expected
301+
+ "\nactual: \t"
302+
+ actual);
303+
++errorCount.value;
304+
return;
305+
}
306+
out.println(
307+
JOIN_SEMI_SP.join(
308+
temp.get(Part.PROTOCOL),
309+
temp.get(Part.HOST),
310+
temp.get(Part.PATH),
311+
temp.get(Part.QUERY),
312+
temp.get(Part.FRAGMENT),
313+
actual));
314+
});
315+
316+
// out.println("\n# Wikipedia test cases\n");
317+
//
318+
// Files.lines(Path.of(LinkUtilities.RESOURCE_DIR, "testUrls.txt"))
319+
// .forEach(
320+
// line -> {
321+
// if (line.startsWith("#") || line.isBlank()) {
322+
// out.println(line);
323+
// return;
324+
// }
325+
// // Pick up escaped URL
326+
// // Escape it
327+
// String escapedLine = line;
328+
// // Divide into parts
329+
// NavigableMap<Part, String> parts =
330+
// Part.getParts(escapedLine, false);
331+
//
332+
// out.println(
333+
// showFormatted(
334+
// parts.get(Part.PROTOCOL) +
335+
// parts.get(Part.HOST),
336+
// parts.get(Part.PATH),
337+
// parts.get(Part.QUERY),
338+
// parts.get(Part.FRAGMENT),
339+
// escapedLine));
340+
// });
341+
} catch (IOException e) {
342+
throw new UncheckedIOException(e);
343+
}
344+
if (errorCount.value != 0) {
345+
throw new IllegalArgumentException("Failures in writing test file: " + errorCount);
346+
}
347+
}
348+
349+
private static String showFormatted(
350+
String schemeAndHost, String path, String query, String fragment, String actual) {
351+
return JOIN_SEMI_SP.join(
352+
schemeAndHost,
353+
escapeBSlashed(path),
354+
escapeBSlashed(query),
355+
escapeBSlashed(fragment),
356+
actual);
357+
}
358+
359+
private static String computeFormat(
360+
String schemeAndHost, String path, String query, String fragment) {
361+
// TODO Auto-generated method stub
362+
return null;
363+
}
364+
365+
private static String escapeBSlashed(String path) {
366+
// TODO Auto-generated method stub
367+
return null;
368+
}
369+
236370
/**
237371
* Use the query https://w.wiki/CKpG to create a file. Put it in RESOURCE_DIR as
238372
* wikipedia1000raw.tsv, then run this class. It generates two new files, <br>

0 commit comments

Comments
 (0)