11package org .unicode .tools ;
22
3+ import com .google .common .base .Joiner ;
34import com .google .common .base .Splitter ;
5+ import com .google .common .collect .ImmutableSortedMap ;
46import com .google .common .collect .Multimap ;
57import com .google .common .collect .TreeMultimap ;
68import com .ibm .icu .text .NumberFormat ;
79import com .ibm .icu .text .SimpleFormatter ;
810import com .ibm .icu .text .UnicodeSet ;
11+ import com .ibm .icu .util .OutputInt ;
912import java .io .IOException ;
1013import java .io .PrintWriter ;
1114import java .io .UncheckedIOException ;
1417import java .time .Instant ;
1518import java .time .ZoneId ;
1619import java .time .format .DateTimeFormatter ;
20+ import java .util .EnumMap ;
1721import java .util .List ;
1822import java .util .NavigableMap ;
1923import java .util .function .Consumer ;
4145 */
4246class GenerateLinkData {
4347
44- private static final Splitter SPLIT_TABS = Splitter .on ('\t' ).omitEmptyStrings ();
48+ private static final Joiner JOIN_SEMI_SP = Joiner .on (" ;\t " );
49+ private static final Splitter SPLIT_TABS = Splitter .on ('\t' ).omitEmptyStrings ().trimResults ();
50+ private static final Splitter SPLIT_SEMI = Splitter .on (';' ).trimResults ();
51+
4552 private static final String HEADER_BASE =
4653 "# {0}.txt\n "
4754 + "# Date: {1} \n "
@@ -54,7 +61,8 @@ class GenerateLinkData {
5461
5562 public static void main (String [] args ) throws IOException {
5663 generatePropertyData ();
57- generateTestData ();
64+ generateDetectionTestData ();
65+ generateFormattingTestData ();
5866 }
5967
6068 static final Instant now = Instant .now ();
@@ -87,7 +95,7 @@ static void writePropHeader(
8795 filename , dt .format (now ), dty .format (now ), propertyName , missingValue ));
8896 }
8997
90- static final SimpleFormatter HEADER_TEST =
98+ static final SimpleFormatter HEADER_DETECT_TEST =
9199 SimpleFormatter .compile (
92100 HEADER_BASE
93101 + "# Format: each line contains zero or more marked links, such as ⸠abc.com⸡\n "
@@ -99,8 +107,26 @@ static void writePropHeader(
99107 + "# Otherwise # is treated like any other character.\n "
100108 + "# ================================================\n " );
101109
102- static void writeTestHeader (PrintWriter out , String filename , String testName ) {
103- out .println (HEADER_TEST .format (filename , dt .format (now ), dty .format (now ), testName ));
110+ static final SimpleFormatter HEADER_FORMAT_TEST =
111+ SimpleFormatter .compile (
112+ HEADER_BASE
113+ + "# Format: Each line has the following fields:\n "
114+ + "# Scheme/host\n "
115+ + "# Path\n "
116+ + "# Query\n "
117+ + "# Fragment\n "
118+ + "# Result — with minimal escaping\n "
119+ + "#\n "
120+ + "# Empty lines, and lines starting with # are ignored.\n "
121+ + "# Otherwise # is treated like any other character.\n "
122+ + "#\n "
123+ + "# The Path, Query, and Fragment may contain backslash escapes when characters would otherwise be \n "
124+ + "# internal syntax characters in that part. For example, a literal / within a path segments would be \\ /.\n "
125+ + "# ================================================\n " );
126+
127+ static void writeTestHeader (
128+ PrintWriter out , SimpleFormatter simpleFormatter , String filename , String testName ) {
129+ out .println (simpleFormatter .format (filename , dt .format (now ), dty .format (now ), testName ));
104130 }
105131
106132 /** Generate property data for the UTS */
@@ -138,21 +164,14 @@ static void generatePropertyData() {
138164 }
139165 }
140166
141- static void generateTestData () {
167+ static void generateDetectionTestData () {
142168
143- BagFormatter bf = new BagFormatter (LinkUtilities .IUP ).setLineSeparator ("\n " );
144-
145- // LinkTermination.txt
146-
147- bf .setValueSource (LinkTermination .PROPERTY );
148- bf .setLabelSource (LinkUtilities .IUP .getProperty (UcdProperty .Age ));
149-
150- // "/unicodetools/src/main/resources/org/unicode/tools/test_links_lt.txt"
169+ OutputInt errorCount = new OutputInt ();
151170
152171 try (final PrintWriter out =
153172 FileUtilities .openUTF8Writer (
154173 LinkUtilities .DATA_DIR_DEV , "LinkDetectionTest.txt" ); ) {
155- writeTestHeader (out , "LinkDetectionTest" , "LinkDetectionTest" );
174+ writeTestHeader (out , HEADER_DETECT_TEST , "LinkDetectionTest" , "LinkDetectionTest" );
156175
157176 out .println ("# Test cases contributed by ICANN\n " );
158177
@@ -165,6 +184,7 @@ static void generateTestData() {
165184 List <String > parts = SPLIT_TABS .splitToList (line );
166185 if (parts .size () != 2 ) {
167186 System .out .println ("* Malformed? " + line );
187+ ++errorCount .value ;
168188 return ;
169189 }
170190 String base = parts .get (0 );
@@ -199,17 +219,9 @@ static void generateTestData() {
199219 } catch (IOException e ) {
200220 throw new UncheckedIOException (e );
201221 }
202-
203- // // LinkPairedOpener.txt
204- // bf.setValueSource(LinkUtilities.LINK_PAIRED_OPENER);
205- // try (final PrintWriter out =
206- // FileUtilities.openUTF8Writer(LinkUtilities.DATA_DIR,
207- // "LinkFormattingTest.txt"); ) {
208- // writeHeader(out, "LinkPairedOpener", "LinkPairedOpener", "undefined");
209- // bf.showSetNames(out, LinkTermination.Close.base);
210- // } catch (IOException e) {
211- // throw new UncheckedIOException(e);
212- // }
222+ if (errorCount .value != 0 ) {
223+ throw new IllegalArgumentException ("Failures in writing test file: " + errorCount );
224+ }
213225 }
214226
215227 private static String addBraces (String base ) {
@@ -233,6 +245,128 @@ private static String addBraces(String base) {
233245 return result .toString ();
234246 }
235247
248+ static void generateFormattingTestData () {
249+
250+ OutputInt errorCount = new OutputInt ();
251+
252+ // // LinkPairedOpener.txt
253+ // bf.setValueSource(LinkUtilities.LINK_PAIRED_OPENER);
254+ // try (final PrintWriter out =
255+ // FileUtilities.openUTF8Writer(LinkUtilities.DATA_DIR,
256+ // "LinkFormattingTest.txt"); ) {
257+ // writeHeader(out, "LinkPairedOpener", "LinkPairedOpener", "undefined");
258+ // bf.showSetNames(out, LinkTermination.Close.base);
259+ // } catch (IOException e) {
260+ // throw new UncheckedIOException(e);
261+ // }
262+
263+ try (final PrintWriter out =
264+ FileUtilities .openUTF8Writer (
265+ LinkUtilities .DATA_DIR_DEV , "LinkFormattingTest.txt" ); ) {
266+ writeTestHeader (out , HEADER_FORMAT_TEST , "LinkFormattingTest" , "LinkFormattingTest" );
267+
268+ out .println ("\n # Selected test cases\n " );
269+
270+ Files .lines (Path .of (LinkUtilities .RESOURCE_DIR , "linkFormattingSource.txt" ))
271+ .forEach (
272+ line -> {
273+ if (line .startsWith ("#" ) || line .isBlank ()) {
274+ out .println (line );
275+ return ;
276+ }
277+ List <String > parts = SPLIT_SEMI .splitToList (line );
278+ if (parts .size () < 5 || parts .size () > 6 ) {
279+ System .out .println ("* Malformed? " + line );
280+ ++errorCount .value ;
281+ return ;
282+ }
283+ EnumMap <Part , String > partMap = new EnumMap <>(Part .class );
284+ partMap .put (Part .PROTOCOL , parts .get (0 ));
285+ partMap .put (Part .HOST , parts .get (1 ));
286+ partMap .put (Part .PATH , parts .get (2 ));
287+ partMap .put (Part .QUERY , parts .get (3 ));
288+ partMap .put (Part .FRAGMENT , parts .get (4 ));
289+ ImmutableSortedMap <Part , String > temp =
290+ ImmutableSortedMap .copyOf (partMap );
291+
292+ String actual = LinkUtilities .minimalEscape (temp , false , null );
293+
294+ String expected = parts .size () < 6 ? null : parts .get (5 );
295+ if (expected != null && !actual .equals (expected )) {
296+ System .out .println (
297+ "* mismatch "
298+ + temp
299+ + "\n expected:\t "
300+ + expected
301+ + "\n actual: \t "
302+ + actual );
303+ ++errorCount .value ;
304+ return ;
305+ }
306+ out .println (
307+ JOIN_SEMI_SP .join (
308+ temp .get (Part .PROTOCOL ),
309+ temp .get (Part .HOST ),
310+ temp .get (Part .PATH ),
311+ temp .get (Part .QUERY ),
312+ temp .get (Part .FRAGMENT ),
313+ actual ));
314+ });
315+
316+ // out.println("\n# Wikipedia test cases\n");
317+ //
318+ // Files.lines(Path.of(LinkUtilities.RESOURCE_DIR, "testUrls.txt"))
319+ // .forEach(
320+ // line -> {
321+ // if (line.startsWith("#") || line.isBlank()) {
322+ // out.println(line);
323+ // return;
324+ // }
325+ // // Pick up escaped URL
326+ // // Escape it
327+ // String escapedLine = line;
328+ // // Divide into parts
329+ // NavigableMap<Part, String> parts =
330+ // Part.getParts(escapedLine, false);
331+ //
332+ // out.println(
333+ // showFormatted(
334+ // parts.get(Part.PROTOCOL) +
335+ // parts.get(Part.HOST),
336+ // parts.get(Part.PATH),
337+ // parts.get(Part.QUERY),
338+ // parts.get(Part.FRAGMENT),
339+ // escapedLine));
340+ // });
341+ } catch (IOException e ) {
342+ throw new UncheckedIOException (e );
343+ }
344+ if (errorCount .value != 0 ) {
345+ throw new IllegalArgumentException ("Failures in writing test file: " + errorCount );
346+ }
347+ }
348+
349+ private static String showFormatted (
350+ String schemeAndHost , String path , String query , String fragment , String actual ) {
351+ return JOIN_SEMI_SP .join (
352+ schemeAndHost ,
353+ escapeBSlashed (path ),
354+ escapeBSlashed (query ),
355+ escapeBSlashed (fragment ),
356+ actual );
357+ }
358+
359+ private static String computeFormat (
360+ String schemeAndHost , String path , String query , String fragment ) {
361+ // TODO Auto-generated method stub
362+ return null ;
363+ }
364+
365+ private static String escapeBSlashed (String path ) {
366+ // TODO Auto-generated method stub
367+ return null ;
368+ }
369+
236370 /**
237371 * Use the query https://w.wiki/CKpG to create a file. Put it in RESOURCE_DIR as
238372 * wikipedia1000raw.tsv, then run this class. It generates two new files, <br>
0 commit comments