Skip to content

Commit 66fb559

Browse files
committed
Add test case for :~:
1 parent aa8087c commit 66fb559

File tree

4 files changed

+50
-25
lines changed

4 files changed

+50
-25
lines changed

unicodetools/data/linkification/dev/LinkDetectionTest.txt

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# LinkDetectionTest.txt
2-
# Date: 2025-12-15, 20:01:53 GMT
2+
# Date: 2025-12-15, 21:53:25 GMT
33
# © 2025 Unicode®, Inc.
44
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
55
# For terms of use and license, see https://www.unicode.org/terms_of_use.html
@@ -62,21 +62,23 @@ See ⸠example.com/αβ⸡) on…
6262
# Include matched bracket
6363
See ⸠example.com/α(β)⸡ on…
6464

65-
See ⸠example.com/αβγ/δρς?α=θ&β=κ#λμν⸡ on…
65+
See ⸠example.com/αβγ/δρς?α=θ&β=κ#λμν:~:text=φχψ⸡ on…
6666

6767
# Continue past matching brackets within same part and not across interior syntax
68-
See ⸠example.com/α(β)γ/δρς?α=(θ)&(β)=κ#λμν⸡ on…
69-
See ⸠example.com/α[(β)]γ/δρς?α=[(θ)]&[(β)]=κ#λμν⸡ on…
68+
See ⸠example.com/α(β)γ/δρς?α=(θ)&(β)=κ#λμν:~:text=(φχψ)⸡ on…
69+
See ⸠example.com/α[(β)]γ/δρς?α=[(θ)]&[(β)]=κ#λμν:~:text=[(φχψ)]⸡ on…
7070

7171
# Don't match mismatched brackets
72-
See ⸠example.com/α[(β⸡])γ/δρς?α=[(θ)]&[(β)]=κ#λμν on…
73-
See ⸠example.com/α[(β)]γ/δρς?α=[(θ⸡])&[(β)]=κ#λμν on…
74-
See ⸠example.com/α[(β)]γ/δρς?α=[(θ)]&[(β⸡])=κ#λμν on…
72+
See ⸠example.com/α[(β⸡])γ/δρς?α=[(θ)]&[(β)]=κ#λμν:~:text=φχψ on…
73+
See ⸠example.com/α[(β)]γ/δρς?α=[(θ⸡])&[(β)]=κ#λμν:~:text=φχψ on…
74+
See ⸠example.com/α[(β)]γ/δρς?α=[(θ)]&[(β⸡])=κ#λμν:~:text=φχψ on…
75+
See ⸠example.com/α[(β)]γ/δρς?α=[(θ)]&[(β)]=κ#λμν:~:text=[(φχψ⸡]) on…
7576

7677
# Don't match across interior syntax
77-
See ⸠example.com/αβ(γ/δ⸡)ρς?α=θ&β=κ#λμν on…
78-
See ⸠example.com/αβγ/δρς?α(=⸡)θ&β=κ#λμν on…
79-
See ⸠example.com/αβγ/δρς?α=θ(&⸡)β=κ#λμν on…
78+
See ⸠example.com/αβ(γ/δ⸡)ρς?α=θ&β=κ#λμν:~:text=φχψ on…
79+
See ⸠example.com/αβγ/δρς?α(=⸡)θ&β=κ#λμν:~:text=φχψ on…
80+
See ⸠example.com/αβγ/δρς?α=θ(&⸡)β=κ#λμν:~:text=φχψ on…
81+
See ⸠example.com/αβγ/δρς?α=θ&β=κ#λμ(ν:~:text=φχ⸡)ψ on…
8082

8183
⸠https://ja.wikipedia.org/wiki/フィンセント・ファン・ゴッホ⸡
8284
⸠https://ja.wikipedia.org/wiki/%E3%83%95%E3%82%A3%E3%83%B3%E3%82%BB%E3%83%B3%E3%83%88%E3%83%BB%E3%83%95%E3%82%A1%E3%83%B3%E3%83%BB%E3%82%B4%E3%83%83%E3%83%9B⸡

unicodetools/src/main/java/org/unicode/tools/GenerateLinkData.java

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -294,8 +294,9 @@ static void generateDetectionTestData() {
294294
+ expected
295295
+ "\nactual: \t"
296296
+ actual);
297-
LinkUtilities.addBracesAroundDetectedLink(base); // for debugging
298-
return;
297+
// for debugging
298+
LinkUtilities.addBracesAroundDetectedLink(base);
299+
return;
299300
}
300301

301302
out.println((ADDTEST ? (line + "\t") : "") + actual);
@@ -326,7 +327,8 @@ static void generateDetectionTestData() {
326327
+ expected
327328
+ "\nactual: \t"
328329
+ actual);
329-
LinkUtilities.addBracesAroundDetectedLink(base); // for debugging
330+
// for debugging
331+
LinkUtilities.addBracesAroundDetectedLink(base);
330332
return;
331333
}
332334
out.println(actual);

unicodetools/src/main/java/org/unicode/utilities/LinkUtilities.java

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -247,7 +247,7 @@ public enum Part {
247247
HOST('\u0000', "[/?#]", "[]", "[]"),
248248
PATH('/', "[?#]", "[/]", "[]"),
249249
QUERY('?', "[#]", "[=\\&]", "[+]"),
250-
FRAGMENT('#', "[]", "[]", "[]");
250+
FRAGMENT('#', "[]", "[]", "[\\{:~\\}]");
251251
final int initiator;
252252
final UnicodeSet terminators;
253253
final UnicodeSet clearStack;
@@ -325,6 +325,8 @@ public static NavigableMap<Part, String> getParts(String source, boolean unescap
325325
public String unescape(String substring) {
326326
return LinkUtilities.unescape(substring, extraQuoted);
327327
}
328+
329+
static final int[] FRAGMENT_DIRECTIVE = ":~:".codePoints().toArray();
328330
}
329331

330332
private static final UnicodeSet idnMapped =
@@ -458,6 +460,7 @@ public static int parsePathQueryFragment(String source, int codePointOffset) {
458460
int[] codePoints = source.codePoints().toArray();
459461
int lastSafe = codePointOffset;
460462
Part part = null;
463+
461464
Stack<Integer> openingStack = new Stack<>();
462465
LinkTermination lt = LinkTermination.Soft;
463466
for (int i = codePointOffset; i < codePoints.length; ++i) {
@@ -481,8 +484,11 @@ public static int parsePathQueryFragment(String source, int codePointOffset) {
481484
}
482485
lastSafe = i + 1;
483486
continue;
484-
} else if (part.clearStack.contains(cp)) { // TODO, enhance for strings
485-
openingStack.clear();
487+
} else if (part.clearStack.contains(cp)) {
488+
openingStack.clear();
489+
} else if (part == Part.FRAGMENT && matches(codePoints, i, Part.FRAGMENT_DIRECTIVE)) {
490+
// ":~:" is the only multi-code-point sequence that clears the stack; clearStack is tested one code point at a time, so it is hard-coded here
491+
openingStack.clear();
486492
}
487493
switch (lt) {
488494
case Include:
@@ -513,6 +519,19 @@ public static int parsePathQueryFragment(String source, int codePointOffset) {
513519
return lt == LinkTermination.Soft ? lastSafe : codePoints.length;
514520
}
515521

522+
/**
 * Reports whether the sequence {@code fragmentDirective} occurs in {@code codePoints} starting
 * exactly at index {@code cpIndex}.
 *
 * @param codePoints the array of code points being scanned
 * @param cpIndex the index in {@code codePoints} at which the comparison begins
 * @param fragmentDirective the code point sequence to compare against
 * @return {@code true} if every element of {@code fragmentDirective} equals the element of
 *     {@code codePoints} at the corresponding offset from {@code cpIndex}; {@code false} on
 *     the first mismatch, or if the sequence would run past the end of {@code codePoints}
 */
523+
private static boolean matches(int[] codePoints, int cpIndex, int[] fragmentDirective) {
524+
// Too few code points remain for a complete match; this also keeps the loop below in bounds.
if (cpIndex + fragmentDirective.length > codePoints.length) {
525+
return false;
526+
}
527+
for (int i = 0; i < fragmentDirective.length; ++i) {
528+
if (codePoints[i + cpIndex] != fragmentDirective[i]) {
529+
return false;
530+
}
531+
}
532+
return true;
533+
}
534+
516535
/**
517536
* Minimally escape. Presumes that the parts use \ for interior quoting.<br>
518537
*

unicodetools/src/main/resources/org/unicode/tools/LinkDetectionTestSource.txt

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -38,21 +38,23 @@ See example.com/αβ) on… See ⸠example.com/αβ⸡) on…
3838
# Include matched bracket
3939
See example.com/α(β) on… See ⸠example.com/α(β)⸡ on…
4040

41-
See example.com/αβγ/δρς?α=θ&β=κ#λμν on… See ⸠example.com/αβγ/δρς?α=θ&β=κ#λμν⸡ on…
41+
See example.com/αβγ/δρς?α=θ&β=κ#λμν:~:text=φχψ on… See ⸠example.com/αβγ/δρς?α=θ&β=κ#λμν:~:text=φχψ⸡ on…
4242

4343
# Continue past matching brackets within same part and not across interior syntax
44-
See example.com/α(β)γ/δρς?α=(θ)&(β)=κ#λμν on… See ⸠example.com/α(β)γ/δρς?α=(θ)&(β)=κ#λμν⸡ on…
45-
See example.com/α[(β)]γ/δρς?α=[(θ)]&[(β)]=κ#λμν on… See ⸠example.com/α[(β)]γ/δρς?α=[(θ)]&[(β)]=κ#λμν⸡ on…
44+
See example.com/α(β)γ/δρς?α=(θ)&(β)=κ#λμν:~:text=(φχψ) on… See ⸠example.com/α(β)γ/δρς?α=(θ)&(β)=κ#λμν:~:text=(φχψ)⸡ on…
45+
See example.com/α[(β)]γ/δρς?α=[(θ)]&[(β)]=κ#λμν:~:text=[(φχψ)] on… See ⸠example.com/α[(β)]γ/δρς?α=[(θ)]&[(β)]=κ#λμν:~:text=[(φχψ)]⸡ on…
4646

4747
# Don't match mismatched brackets
48-
See example.com/α[(β])γ/δρς?α=[(θ)]&[(β)]=κ#λμν on… See ⸠example.com/α[(β⸡])γ/δρς?α=[(θ)]&[(β)]=κ#λμν on…
49-
See example.com/α[(β)]γ/δρς?α=[(θ])&[(β)]=κ#λμν on… See ⸠example.com/α[(β)]γ/δρς?α=[(θ⸡])&[(β)]=κ#λμν on…
50-
See example.com/α[(β)]γ/δρς?α=[(θ)]&[(β])=κ#λμν on… See ⸠example.com/α[(β)]γ/δρς?α=[(θ)]&[(β⸡])=κ#λμν on…
48+
See example.com/α[(β])γ/δρς?α=[(θ)]&[(β)]=κ#λμν:~:text=φχψ on… See ⸠example.com/α[(β⸡])γ/δρς?α=[(θ)]&[(β)]=κ#λμν:~:text=φχψ on…
49+
See example.com/α[(β)]γ/δρς?α=[(θ])&[(β)]=κ#λμν:~:text=φχψ on… See ⸠example.com/α[(β)]γ/δρς?α=[(θ⸡])&[(β)]=κ#λμν:~:text=φχψ on…
50+
See example.com/α[(β)]γ/δρς?α=[(θ)]&[(β])=κ#λμν:~:text=φχψ on… See ⸠example.com/α[(β)]γ/δρς?α=[(θ)]&[(β⸡])=κ#λμν:~:text=φχψ on…
51+
See example.com/α[(β)]γ/δρς?α=[(θ)]&[(β)]=κ#λμν:~:text=[(φχψ]) on… See ⸠example.com/α[(β)]γ/δρς?α=[(θ)]&[(β)]=κ#λμν:~:text=[(φχψ⸡]) on…
5152

5253
# Don't match across interior syntax
53-
See example.com/αβ(γ/δ)ρς?α=θ&β=κ#λμν on… See ⸠example.com/αβ(γ/δ⸡)ρς?α=θ&β=κ#λμν on…
54-
See example.com/αβγ/δρς?α(=)θ&β=κ#λμν on… See ⸠example.com/αβγ/δρς?α(=⸡)θ&β=κ#λμν on…
55-
See example.com/αβγ/δρς?α=θ(&)β=κ#λμν on… See ⸠example.com/αβγ/δρς?α=θ(&⸡)β=κ#λμν on…
54+
See example.com/αβ(γ/δ)ρς?α=θ&β=κ#λμν:~:text=φχψ on… See ⸠example.com/αβ(γ/δ⸡)ρς?α=θ&β=κ#λμν:~:text=φχψ on…
55+
See example.com/αβγ/δρς?α(=)θ&β=κ#λμν:~:text=φχψ on… See ⸠example.com/αβγ/δρς?α(=⸡)θ&β=κ#λμν:~:text=φχψ on…
56+
See example.com/αβγ/δρς?α=θ(&)β=κ#λμν:~:text=φχψ on… See ⸠example.com/αβγ/δρς?α=θ(&⸡)β=κ#λμν:~:text=φχψ on…
57+
See example.com/αβγ/δρς?α=θ&β=κ#λμ(ν:~:text=φχ)ψ on… See ⸠example.com/αβγ/δρς?α=θ&β=κ#λμ(ν:~:text=φχ⸡)ψ on…
5658

5759
https://ja.wikipedia.org/wiki/フィンセント・ファン・ゴッホ ⸠https://ja.wikipedia.org/wiki/フィンセント・ファン・ゴッホ⸡
5860
https://ja.wikipedia.org/wiki/%E3%83%95%E3%82%A3%E3%83%B3%E3%82%BB%E3%83%B3%E3%83%88%E3%83%BB%E3%83%95%E3%82%A1%E3%83%B3%E3%83%BB%E3%82%B4%E3%83%83%E3%83%9B ⸠https://ja.wikipedia.org/wiki/%E3%83%95%E3%82%A3%E3%83%B3%E3%82%BB%E3%83%B3%E3%83%88%E3%83%BB%E3%83%95%E3%82%A1%E3%83%B3%E3%83%BB%E3%82%B4%E3%83%83%E3%83%9B⸡

0 commit comments

Comments
 (0)