Skip to content

Commit ae062ef

Browse files
committed
Fixes for Markus
1 parent 7388355 commit ae062ef

File tree

7 files changed

+1310
-1307
lines changed

7 files changed

+1310
-1307
lines changed

unicodetools/data/linkification/dev/LinkBracket.txt

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# LinkBracket.txt
2-
# Date: 2025-12-14, 06:39:23 GMT
2+
# Date: 2025-12-16, 17:57:01 GMT
33
# © 2025 Unicode®, Inc.
44
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
55
# For terms of use and license, see https://www.unicode.org/terms_of_use.html
@@ -15,20 +15,20 @@
1515
# Field 1: code point
1616
# For more information, see https://www.unicode.org/reports/tr58/#property-data.
1717
#
18-
# For the purpose of link detection and formatting operations, the property Link_Bracket is defined as
19-
# a string property whose value is either a single code point or is undefined.
18+
# For the purpose of regular expressions, the property Link_Bracket is defined as
19+
# a string property whose value is either a single code point or is <none>.
2020
#
2121
# The short name of the property is the same as its long name.
2222
#
2323
# All code points not explicitly listed for Link_Bracket
24-
# have the value undefined.
24+
# have the value <none>.
2525
#
26-
# @missing: 0000..10FFFF; undefined
26+
# @missing: 0000..10FFFF; <none>
2727
#
2828
# ================================================
2929

3030
0029 ; 0028 #1.1 () ⇒ () RIGHT PARENTHESIS
31-
003E ; 003C #1.1 (&gt;&lt;) GREATER-THAN SIGN
31+
003E ; 003C #1.1 (><) GREATER-THAN SIGN
3232
005D ; 005B #1.1 (] ⇒ [) RIGHT SQUARE BRACKET
3333
007D ; 007B #1.1 (} ⇒ {) RIGHT CURLY BRACKET
3434
0F3B ; 0F3A #2.0 (༻ ⇒ ༺) TIBETAN MARK GUG RTAGS GYAS

unicodetools/data/linkification/dev/LinkDetectionTest.txt

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# LinkDetectionTest.txt
2-
# Date: 2025-12-16, 01:20:14 GMT
2+
# Date: 2025-12-16, 17:57:01 GMT
33
# © 2025 Unicode®, Inc.
44
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
55
# For terms of use and license, see https://www.unicode.org/terms_of_use.html
@@ -11,7 +11,8 @@
1111
# Format:
1212
# Each line contains zero or more marked links, such as ⸠abc.com⸡
1313
#
14-
# Operation:# For each line.
14+
# Operation:
15+
# For each line.
1516
# • Create a copy of the line, with the characters ⸠ and ⸡ removed.
1617
# • Run link detection on the line, inserting ⸠ and ⸡ around each detected link.
1718
# • Report a failure if the result is not identical to the original line.
@@ -35,7 +36,7 @@ See http://.foo.example.com/αβγ on…
3536
See http://foo..example.com/αβγ on…
3637
See http://-foo.example-.com. on…
3738

38-
# Legal but unusual. Because we might be at the end of a sentence, we don't include the . unless followed by [/$#]
39+
# Legal but unusual. Because we might be at the end of a sentence, we don't include the . unless followed by a Path, Query, or Fragment
3940
See ⸠http://foo.example.com⸡. on…
4041
See ⸠http://foo.example.com./αβγ⸡ on…
4142

@@ -119,7 +120,7 @@ See [email protected]
119120
120121

121122
#Quoted local-parts (not in the base algorithm).
122-
See "john\ [email protected]"
123+
See "john\ doe"⸠@example.com⸡
123124

124125
# Note that [email protected] is ambiguous: it could be an email address, or it could be a URL with a userinfo
125126
# The latter, however, is extremely rare, so always favor the email address.

unicodetools/data/linkification/dev/LinkEmail.txt

Lines changed: 1269 additions & 1272 deletions
Large diffs are not rendered by default.

unicodetools/data/linkification/dev/LinkFormattingTest.txt

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# LinkFormattingTest.txt
2-
# Date: 2025-12-15, 06:34:31 GMT
2+
# Date: 2025-12-16, 17:57:01 GMT
33
# © 2025 Unicode®, Inc.
44
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
55
# For terms of use and license, see https://www.unicode.org/terms_of_use.html
@@ -16,8 +16,8 @@
1616
# Field 4: Result — with minimal escaping
1717
#
1818
# Empty lines, and lines starting with # are ignored.
19-
# Spaces around the semicolons are ignored.
2019
# Otherwise # is treated like any other character.
20+
# Spaces around the semicolons are ignored.
2121
#
2222
# The Path, Query, and Fragment will contain backslash escapes when characters would otherwise be
2323
# internal syntax characters in *that* part.
@@ -111,7 +111,6 @@ https:// ; azb.wikipedia.org ; /wiki/واشینقتن،_دی.سی. ; ; ; https
111111
https:// ; mad.wikipedia.org ; /wiki/Tasè’ ; ; ; https://mad.wikipedia.org/wiki/Tasè%E2%80%99
112112
https:// ; wuu.wikipedia.org ; /wiki/聖保羅(巴西) ; ; ; https://wuu.wikipedia.org/wiki/聖保羅(巴西)
113113
https:// ; vep.wikipedia.org ; /wiki/Brüssel' ; ; ; https://vep.wikipedia.org/wiki/Brüssel%27
114-
https:// ; tw.wikipedia.org ; /wiki/Wiase_Nyinaa_Wɛbsaet_(_World_Wide_Web;_WWW_) ; ; ; https://tw.wikipedia.org/wiki/Wiase_Nyinaa_Wɛbsaet_(_World_Wide_Web;_WWW_)
115114
https:// ; ja.wikibooks.org ; /wiki/植物学/植物とはどのような生き物か? ; ; ; https://ja.wikibooks.org/wiki/植物学/植物とはどのような生き物か%EF%BC%9F
116115
https:// ; bn.wikibooks.org ; /wiki/উইকিশৈশব:দেশসমূহ_(অ-হ)/ইসরায়েল ; ; ; https://bn.wikibooks.org/wiki/উইকিশৈশব:দেশসমূহ_(অ-হ)/ইসরায়েল
117116
https:// ; haw.wikipedia.org ; /wiki/Puke_noi‘i_kū‘ikena ; ; ; https://haw.wikipedia.org/wiki/Puke_noi‘i_kū‘ikena

unicodetools/data/linkification/dev/LinkTerm.txt

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# LinkTerm.txt
2-
# Date: 2025-12-14, 06:39:23 GMT
2+
# Date: 2025-12-16, 17:57:01 GMT
33
# © 2025 Unicode®, Inc.
44
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
55
# For terms of use and license, see https://www.unicode.org/terms_of_use.html
@@ -15,8 +15,8 @@
1515
# Field 1: a Link_Term value
1616
# For more information, see https://www.unicode.org/reports/tr58/#property-data.
1717
#
18-
# For the purpose of detection and formatting operations, the property Link_Term is defined as
19-
# mapping each code point to a set of enumerated values.
18+
# For the purpose of regular expressions, the property Link_Term is defined as
19+
# an enumerated property of code points.
2020
# The short name of the property is the same as its long name.
2121
# The possible values are: Include, Hard, Soft, Close, Open
2222
#
@@ -29,7 +29,7 @@
2929
#
3030
# ================================================
3131

32-
0021..0022 ; Soft # 1.1 [2] (!..&quot;) EXCLAMATION MARK..QUOTATION MARK
32+
0021..0022 ; Soft # 1.1 [2] (!..") EXCLAMATION MARK..QUOTATION MARK
3333
0027 ; Soft # 1.1 (') APOSTROPHE
3434
002C ; Soft # 1.1 (,) COMMA
3535
002E ; Soft # 1.1 (.) FULL STOP
@@ -170,7 +170,7 @@ FF64 ; Soft # 1.1 (、) HALFWIDTH IDEOGRAPHIC COMMA
170170
# Total code points: 330
171171

172172
0029 ; Close # 1.1 ()) RIGHT PARENTHESIS
173-
003E ; Close # 1.1 (&gt;) GREATER-THAN SIGN
173+
003E ; Close # 1.1 (>) GREATER-THAN SIGN
174174
005D ; Close # 1.1 (]) RIGHT SQUARE BRACKET
175175
007D ; Close # 1.1 (}) RIGHT CURLY BRACKET
176176
0F3B ; Close # 2.0 (༻) TIBETAN MARK GUG RTAGS GYAS
@@ -237,7 +237,7 @@ FF63 ; Close # 1.1 (」) HALFWIDTH RIGHT CORNER BRACKET
237237
# Total code points: 64
238238

239239
0028 ; Open # 1.1 (() LEFT PARENTHESIS
240-
003C ; Open # 1.1 (&lt;) LESS-THAN SIGN
240+
003C ; Open # 1.1 (<) LESS-THAN SIGN
241241
005B ; Open # 1.1 ([) LEFT SQUARE BRACKET
242242
007B ; Open # 1.1 ({) LEFT CURLY BRACKET
243243
0F3A ; Open # 2.0 (༺) TIBETAN MARK GUG RTAGS GYON
@@ -303,7 +303,7 @@ FF62 ; Open # 1.1 (「) HALFWIDTH LEFT CORNER BRACKET
303303

304304
# Total code points: 64
305305

306-
0023..0026 ; Include # 1.1 [4] (#..&amp;) NUMBER SIGN..AMPERSAND
306+
0023..0026 ; Include # 1.1 [4] (#..&) NUMBER SIGN..AMPERSAND
307307
002A..002B ; Include # 1.1 [2] (*..+) ASTERISK..PLUS SIGN
308308
002D ; Include # 1.1 (-) HYPHEN-MINUS
309309
002F..0039 ; Include # 1.1 [11] (/..9) SOLIDUS..DIGIT NINE

unicodetools/src/main/java/org/unicode/tools/GenerateLinkData.java

Lines changed: 19 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import com.google.common.collect.TreeMultimap;
88
import com.ibm.icu.text.NumberFormat;
99
import com.ibm.icu.text.SimpleFormatter;
10+
import com.ibm.icu.text.Transliterator;
1011
import com.ibm.icu.text.UnicodeSet;
1112
import com.ibm.icu.util.OutputInt;
1213
import java.io.IOException;
@@ -25,6 +26,7 @@
2526
import org.unicode.cldr.draft.FileUtilities;
2627
import org.unicode.cldr.util.Counter;
2728
import org.unicode.cldr.util.Rational.MutableLong;
29+
import org.unicode.cldr.util.props.UnicodeLabel;
2830
import org.unicode.cldr.util.TransliteratorUtilities;
2931
import org.unicode.props.BagFormatter;
3032
import org.unicode.props.UcdProperty;
@@ -47,7 +49,9 @@
4749
*/
4850
class GenerateLinkData {
4951

50-
private static final boolean ADDTEST = false; // set to true to generate LinkDetectionTestSource
52+
private static final Transliterator FIX_ODD = Transliterator.createFromRules("any-html", ":: [[:C:][:Z:][:whitespace:][:Default_Ignorable_Code_Point:]] hex/unicode ; ", Transliterator.FORWARD);
53+
54+
private static final boolean ADDTEST = false; // set to true to generate LinkDetectionTestSource
5155

5256
private static final Joiner JOIN_SEMI_SP = Joiner.on(" ;\t");
5357
private static final Splitter SPLIT_TABS = Splitter.on('\t').omitEmptyStrings().trimResults();
@@ -89,8 +93,8 @@ public static void main(String[] args) throws IOException {
8993
+ "# Field 1: a {3} value\n"
9094
+ "# For more information, see https://www.unicode.org/reports/tr58/#property-data. \n"
9195
+ "#\n"
92-
+ "# For the purpose of detection and formatting operations, the property {3} is defined as\n"
93-
+ "# mapping each code point to a set of enumerated values.\n"
96+
+ "# For the purpose of regular expressions, the property {3} is defined as\n"
97+
+ "# an enumerated property of code points.\n"
9498
+ "# The short name of the property is the same as its long name.\n"
9599
+ "# The possible values are: Include, Hard, Soft, Close, Open\n"
96100
+ "#\n"
@@ -113,7 +117,7 @@ public static void main(String[] args) throws IOException {
113117
+ "# Field 1: code point\n"
114118
+ "# For more information, see https://www.unicode.org/reports/tr58/#property-data. \n"
115119
+ "#\n"
116-
+ "# For the purpose of link detection and formatting operations, the property {3} is defined as\n"
120+
+ "# For the purpose of regular expressions, the property {3} is defined as\n"
117121
+ "# a string property whose value is either a single code point or is {4}.\n"
118122
+ "#\n"
119123
+ "# The short name of the property is the same as its long name.\n"
@@ -132,18 +136,17 @@ public static void main(String[] args) throws IOException {
132136
+ "# Format\n"
133137
+ "#\n"
134138
+ "# Field 0: code point range\n"
135-
+ "# Field 1: binary value\n"
136139
+ "# For more information, see https://www.unicode.org/reports/tr58/#property-data. \n"
137140
+ "#\n"
138-
+ "# For the purpose of link detection and formatting operations, the property {3} is defined as\n"
141+
+ "# For the purpose of regular expressions, the property {3} is defined as\n"
139142
+ "# a binary property.\n"
140143
+ "#\n"
141144
+ "# The short name of the property is the same as its long name.\n"
142145
+ "#\n"
143146
+ "# All code points not explicitly listed for {3}\n"
144147
+ "# have the value {4}.\n"
145-
+ "#\n"
146-
+ "# @missing: 0000..10FFFF; {4}\n"
148+
// + "#\n"
149+
// + "# @missing: 0000..10FFFF; {4}\n"
147150
+ "#\n"
148151
+ "# ================================================\n");
149152

@@ -153,7 +156,7 @@ public static void main(String[] args) throws IOException {
153156
+ "# Format:\n"
154157
+ "# Each line contains zero or more marked links, such as ⸠abc.com⸡\n"
155158
+ "#\n"
156-
+ "# Operation:"
159+
+ "# Operation:\n"
157160
+ "# For each line.\n"
158161
+ "# • Create a copy of the line, with the characters ⸠ and ⸡ removed.\n"
159162
+ "# • Run link detection on the line, inserting ⸠ and ⸡ around each detected link.\n"
@@ -173,8 +176,8 @@ public static void main(String[] args) throws IOException {
173176
+ "# Field 4: Result — with minimal escaping\n"
174177
+ "#\n"
175178
+ "# Empty lines, and lines starting with # are ignored.\n"
176-
+ "# Spaces around the semicolons are ignored.\n"
177179
+ "# Otherwise # is treated like any other character.\n"
180+
+ "# Spaces around the semicolons are ignored.\n"
178181
+ "#\n"
179182
+ "# The Path, Query, and Fragment will contain backslash escapes when characters would otherwise be \n"
180183
+ "# internal syntax characters in *that* part. \n"
@@ -207,7 +210,7 @@ static void generatePropertyData() {
207210
System.out.println("TLDs=\t" + Joiner.on(' ').join(LinkUtilities.TLDS));
208211

209212
BagFormatter bf = new BagFormatter(LinkUtilities.IUP).setLineSeparator("\n");
210-
bf.setShowLiteral(TransliteratorUtilities.toHTMLControl);
213+
bf.setShowLiteral(FIX_ODD);
211214

212215
// LinkTerm.txt
213216

@@ -229,7 +232,7 @@ static void generatePropertyData() {
229232
}
230233

231234
// LinkEmail.txt
232-
bf.setValueSource(LinkUtilities.LinkEmail);
235+
bf.setValueSource(UnicodeLabel.NULL);
233236
try (final PrintWriter out =
234237
FileUtilities.openUTF8Writer(LinkUtilities.DATA_DIR_DEV, "LinkEmail.txt"); ) {
235238
writePropHeader(
@@ -251,7 +254,7 @@ static void generatePropertyData() {
251254
bf.setShowDehexedValue(true);
252255
try (final PrintWriter out =
253256
FileUtilities.openUTF8Writer(LinkUtilities.DATA_DIR_DEV, "LinkBracket.txt"); ) {
254-
writePropHeader(out, HEADER_PROP_STRING, "LinkBracket", "Link_Bracket", "undefined");
257+
writePropHeader(out, HEADER_PROP_STRING, "LinkBracket", "Link_Bracket", "<none>");
255258
bf.showSetNames(out, LinkTermination.Close.base);
256259
} catch (IOException e) {
257260
throw new UncheckedIOException(e);
@@ -418,6 +421,9 @@ static void generateFormattingTestData() {
418421
if (wikiStart < 0) {
419422
return;
420423
}
424+
if (line.contains(";")) { // skip any url with a ; in it
425+
return;
426+
}
421427
int lastCodePoint = line.codePointBefore(line.length());
422428
String rest =
423429
line.substring(

unicodetools/src/main/resources/org/unicode/tools/LinkDetectionTestSource.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ See http://.foo.example.com/αβγ on… See http://.foo.example.com/αβγ on
1111
See http://foo..example.com/αβγ on… See http://foo..example.com/αβγ on…
1212
See http://-foo.example-.com. on… See http://-foo.example-.com. on…
1313

14-
# Legal but unusual. Because we might be at the end of a sentence, we don't include the . unless followed by [/$#]
14+
# Legal but unusual. Because we might be at the end of a sentence, we don't include the . unless followed by a Path, Query, or Fragment
1515
See http://foo.example.com. on… See ⸠http://foo.example.com⸡. on…
1616
See http://foo.example.com./αβγ on… See ⸠http://foo.example.com./αβγ⸡ on…
1717

@@ -95,7 +95,7 @@ See [email protected]
9595
9696

9797
#Quoted local-parts (not in the base algorithm).
98-
See "john\ doe@example.com"
98+
See "john\ doe"@example.com
9999

100100
# Note that [email protected] is ambiguous: it could be an email address, or it could be a URL with a userinfo
101101
# The latter, however, is extremely rare, so always favor the email address.

0 commit comments

Comments
 (0)