Skip to content

Commit b0b5d16

Browse files
committed
Cleanup tests
1 parent d8796a4 commit b0b5d16

File tree

4 files changed

+79
-20
lines changed

4 files changed

+79
-20
lines changed

unicodetools/data/linkification/dev/LinkDetectionTest.txt

Lines changed: 29 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# LinkDetectionTest.txt
2-
# Date: 2025-12-14, 06:39:23 GMT
2+
# Date: 2025-12-15, 01:34:55 GMT
33
# © 2025 Unicode®, Inc.
44
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
55
# For terms of use and license, see https://www.unicode.org/terms_of_use.html
@@ -76,9 +76,34 @@ See ⸠example.com/αβ(γ/δ)ρς?θικ#λμν⸡ on…
7676

7777
See ⸠αβγ.δεζ@example.com⸡ on…
7878
See ⸠mailto:αβγ.δεζ@example.com⸡ on…
79-
See mailto:αβγ.δεζ.@⸠example.com⸡ on…
80-
See mailto:αβγ..δεζ@⸠example.com⸡ on…
81-
See mailto:.αβγ.δεζ@⸠example.com⸡ on…
79+
80+
# If a local-part is invalid, skip the domain name
81+
See mailto:αβγ.δεζ.@example.com on…
82+
See mailto:αβγ..δεζ@example.com on…
83+
See mailto:.αβγ.δεζ@example.com on…
84+
85+
#Stop backing up when a space is hit
86+
87+
88+
#Include the medial dot
89+
90+
91+
#Handle non-ASCII
92+
See ⸠アルベルト.アルベルト@example.com⸡
93+
94+
#No valid domain name
95+
See @example.😎
96+
97+
# No local-part
98+
See ⸠@example.com⸡
99+
100+
# No valid local-part
101+
102+
103+
104+
105+
#Quoted local-parts (not in the base algorithm).
106+
See "john\ ⸠[email protected]⸡"
82107

83108
# Note that [email protected] is ambiguous: it could be an email address, or it could be a URL with a userinfo
84109
# The latter, however, is extremely rare, so always favor the email address.

unicodetools/src/main/java/org/unicode/utilities/LinkUtilities.java

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1021,17 +1021,20 @@ public boolean next() {
10211021
domainStart - 1);
10221022
// fail in illegal cases: .joe.jones, joe.jones. joe..jones
10231023
String localPart = source.substring(mailToStart, domainStart - 1);
1024-
if (!localPart.startsWith(".")
1025-
&& !localPart.endsWith(".")
1026-
&& !localPart.contains("..")) {
1027-
// check for mailto: beforehand
1028-
linkStart = backupIfAfter("mailto:", mailToStart);
1029-
linkEnd =
1030-
emailEnd; // do this so we don't include items after the domain
1031-
// name.
1032-
hardStart = linkEnd; // prepare for next next()
1033-
return true;
1024+
if (localPart.startsWith(".")
1025+
|| localPart.endsWith(".")
1026+
|| localPart.contains("..")) {
1027+
// prepare for next next() by skipping rest of domain link
1028+
hardStart = linkEnd;
1029+
continue; // scan again, skipping the URL after
10341030
}
1031+
// check for mailto: beforehand
1032+
linkStart = backupIfAfter("mailto:", mailToStart);
1033+
// prepare for next next() by resuming after the rest of the domain link
1034+
hardStart = linkEnd;
1035+
// we don't want to include anything after the domain
1036+
linkEnd = emailEnd;
1037+
return true;
10351038
}
10361039
}
10371040

unicodetools/src/main/resources/org/unicode/tools/LinkDetectionTestSource.txt

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,9 +52,34 @@ abc.XN--11B4C3D ⸠abc.XN--11B4C3D⸡
5252

5353
See αβγ.δεζ@example.com on… See ⸠αβγ.δεζ@example.com⸡ on…
5454
See mailto:αβγ.δεζ@example.com on… See ⸠mailto:αβγ.δεζ@example.com⸡ on…
55-
See mailto:αβγ.δεζ.@example.com on… See mailto:αβγ.δεζ.@⸠example.com⸡ on…
56-
See mailto:αβγ..δεζ@example.com on… See mailto:αβγ..δεζ@⸠example.com⸡ on…
57-
See mailto:.αβγ.δεζ@example.com on… See mailto:.αβγ.δεζ@⸠example.com⸡ on…
55+
56+
# If a local-part is invalid, skip the domain name
57+
See mailto:αβγ.δεζ.@example.com on… See mailto:αβγ.δεζ.@example.com on…
58+
See mailto:αβγ..δεζ@example.com on… See mailto:αβγ..δεζ@example.com on…
59+
See mailto:.αβγ.δεζ@example.com on… See mailto:.αβγ.δεζ@example.com on…
60+
61+
#Stop backing up when a space is hit
62+
63+
64+
#Include the medial dot
65+
66+
67+
#Handle non-ASCII
68+
See アルベルト.アルベルト@example.com
69+
70+
#No valid domain name
71+
See @example.😎
72+
73+
# No local-part
74+
See @example.com
75+
76+
# No valid local-part
77+
78+
79+
80+
81+
#Quoted local-parts (not in the base algorithm).
82+
See "john\ [email protected]"
5883

5984
# Note that [email protected] is ambiguous: it could be an email address, or it could be a URL with a userinfo
6085
# The latter, however, is extremely rare, so always favor the email address.

unicodetools/src/test/java/org/unicode/unittest/LinkUtilitiesTest.java

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -163,9 +163,15 @@ public void testMinimumEscaping() {
163163
"Escape soft ('.') unless followed by include"
164164
},
165165
{"a(v))", "g(d))", "e(z))", "/a(v)%29?g(d)%29#e(z)%29", "Escape unmatched brackets"},
166-
{"", "a%3D%26%=%3D%26%", "", "?a%3D%26%=%3D%26%", "Query with escapes"},
167-
{"a/v%2Fg", "", "", "/a/v%2Fg", "Path with escapes"},
168-
{"", "a%3D%26%=%3D%26%", "", "?a%3D%26%=%3D%26%", "Query with escapes"},
166+
{
167+
"",
168+
"a%3D%26%=%3D%26%",
169+
"",
170+
"?a%253D%2526%=%253D%2526%",
171+
"Query with escapes. %xx needs to go to %25xx if xx is hex"
172+
},
173+
{"a/v%2F%g", "", "", "/a/v%252F%g", "Path with escapes"},
174+
{"", "a%3D%26%=%3D%26%", "", "?a%253D%2526%=%253D%2526%", "Query with escapes"},
169175
};
170176
List<List<String>> testLines = new ArrayList<>();
171177
int line = 0;

0 commit comments

Comments
 (0)