77import com .google .common .collect .TreeMultimap ;
88import com .ibm .icu .text .NumberFormat ;
99import com .ibm .icu .text .SimpleFormatter ;
10+ import com .ibm .icu .text .Transliterator ;
1011import com .ibm .icu .text .UnicodeSet ;
1112import com .ibm .icu .util .OutputInt ;
1213import java .io .IOException ;
2526import org .unicode .cldr .draft .FileUtilities ;
2627import org .unicode .cldr .util .Counter ;
2728import org .unicode .cldr .util .Rational .MutableLong ;
29+ import org .unicode .cldr .util .props .UnicodeLabel ;
2830import org .unicode .cldr .util .TransliteratorUtilities ;
2931import org .unicode .props .BagFormatter ;
3032import org .unicode .props .UcdProperty ;
4749 */
4850class GenerateLinkData {
4951
50- private static final boolean ADDTEST = false ; // set to true to generate LinkDetectionTestSource
52+ private static final Transliterator FIX_ODD = Transliterator .createFromRules ("any-html" , ":: [[:C:][:Z:][:whitespace:][:Default_Ignorable_Code_Point:]] hex/unicode ; " , Transliterator .FORWARD );
53+
54+ private static final boolean ADDTEST = false ; // set to true to generate LinkDetectionTestSource
5155
5256 private static final Joiner JOIN_SEMI_SP = Joiner .on (" ;\t " );
5357 private static final Splitter SPLIT_TABS = Splitter .on ('\t' ).omitEmptyStrings ().trimResults ();
@@ -89,8 +93,8 @@ public static void main(String[] args) throws IOException {
8993 + "# Field 1: a {3} value\n "
9094 + "# For more information, see https://www.unicode.org/reports/tr58/#property-data. \n "
9195 + "#\n "
92- + "# For the purpose of detection and formatting operations , the property {3} is defined as\n "
93- + "# mapping each code point to a set of enumerated values .\n "
96+ + "# For the purpose of regular expressions , the property {3} is defined as\n "
97+ + "# an enumerated property of code points .\n "
9498 + "# The short name of the property is the same as its long name.\n "
9599 + "# The possible values are: Include, Hard, Soft, Close, Open\n "
96100 + "#\n "
@@ -113,7 +117,7 @@ public static void main(String[] args) throws IOException {
113117 + "# Field 1: code point\n "
114118 + "# For more information, see https://www.unicode.org/reports/tr58/#property-data. \n "
115119 + "#\n "
116- + "# For the purpose of link detection and formatting operations , the property {3} is defined as\n "
120+ + "# For the purpose of regular expressions , the property {3} is defined as\n "
117121 + "# a string property whose value is either a single code point or is {4}.\n "
118122 + "#\n "
119123 + "# The short name of the property is the same as its long name.\n "
@@ -132,18 +136,17 @@ public static void main(String[] args) throws IOException {
132136 + "# Format\n "
133137 + "#\n "
134138 + "# Field 0: code point range\n "
135- + "# Field 1: binary value\n "
136139 + "# For more information, see https://www.unicode.org/reports/tr58/#property-data. \n "
137140 + "#\n "
138- + "# For the purpose of link detection and formatting operations , the property {3} is defined as\n "
141+ + "# For the purpose of regular expressions , the property {3} is defined as\n "
139142 + "# a binary property.\n "
140143 + "#\n "
141144 + "# The short name of the property is the same as its long name.\n "
142145 + "#\n "
143146 + "# All code points not explicitly listed for {3}\n "
144147 + "# have the value {4}.\n "
145- + "#\n "
146- + "# @missing: 0000..10FFFF; {4}\n "
148+ // + "#\n"
149+ // + "# @missing: 0000..10FFFF; {4}\n"
147150 + "#\n "
148151 + "# ================================================\n " );
149152
@@ -153,7 +156,7 @@ public static void main(String[] args) throws IOException {
153156 + "# Format:\n "
154157 + "# Each line contains zero or more marked links, such as ⸠abc.com⸡\n "
155158 + "#\n "
156- + "# Operation:"
159+ + "# Operation:\n "
157160 + "# For each line.\n "
158161 + "# • Create a copy of the line, with the characters ⸠ and ⸡ removed.\n "
159162 + "# • Run link detection on the line, inserting ⸠ and ⸡ around each detected link.\n "
@@ -173,8 +176,8 @@ public static void main(String[] args) throws IOException {
173176 + "# Field 4: Result — with minimal escaping\n "
174177 + "#\n "
175178 + "# Empty lines, and lines starting with # are ignored.\n "
176- + "# Spaces around the semicolons are ignored.\n "
177179 + "# Otherwise # is treated like any other character.\n "
180+ + "# Spaces around the semicolons are ignored.\n "
178181 + "#\n "
179182 + "# The Path, Query, and Fragment will contain backslash escapes when characters would otherwise be \n "
180183 + "# internal syntax characters in *that* part. \n "
@@ -207,7 +210,7 @@ static void generatePropertyData() {
207210 System .out .println ("TLDs=\t " + Joiner .on (' ' ).join (LinkUtilities .TLDS ));
208211
209212 BagFormatter bf = new BagFormatter (LinkUtilities .IUP ).setLineSeparator ("\n " );
210- bf .setShowLiteral (TransliteratorUtilities . toHTMLControl );
213+ bf .setShowLiteral (FIX_ODD );
211214
212215 // LinkTerm.txt
213216
@@ -229,7 +232,7 @@ static void generatePropertyData() {
229232 }
230233
231234 // LinkEmail.txt
232- bf .setValueSource (LinkUtilities . LinkEmail );
235+ bf .setValueSource (UnicodeLabel . NULL );
233236 try (final PrintWriter out =
234237 FileUtilities .openUTF8Writer (LinkUtilities .DATA_DIR_DEV , "LinkEmail.txt" ); ) {
235238 writePropHeader (
@@ -251,7 +254,7 @@ static void generatePropertyData() {
251254 bf .setShowDehexedValue (true );
252255 try (final PrintWriter out =
253256 FileUtilities .openUTF8Writer (LinkUtilities .DATA_DIR_DEV , "LinkBracket.txt" ); ) {
254- writePropHeader (out , HEADER_PROP_STRING , "LinkBracket" , "Link_Bracket" , "undefined " );
257+ writePropHeader (out , HEADER_PROP_STRING , "LinkBracket" , "Link_Bracket" , "<none> " );
255258 bf .showSetNames (out , LinkTermination .Close .base );
256259 } catch (IOException e ) {
257260 throw new UncheckedIOException (e );
@@ -418,6 +421,9 @@ static void generateFormattingTestData() {
418421 if (wikiStart < 0 ) {
419422 return ;
420423 }
424+ if (line .contains (";" )) { // skip any url with a ; in it
425+ return ;
426+ }
421427 int lastCodePoint = line .codePointBefore (line .length ());
422428 String rest =
423429 line .substring (
0 commit comments