Generate Java monkeys (#1006)

eggrobin · web-flow · commit 27875cb66966 · 2025-01-28T01:50:20.000+01:00
* meow

* Greedy context before in LB20a

* Regenerate UCD
diff --git a/unicodetools/data/ucd/dev/auxiliary/LineBreakTest.html b/unicodetools/data/ucd/dev/auxiliary/LineBreakTest.html
@@ -7,7 +7,7 @@
 <body bgcolor='#FFFFFF'>
 <h2>Line_Break Chart</h2>
 <p><b>Unicode Version:</b> 17.0.0</p>
-<p><b>Date:</b> 2024-11-28, 01:27:49 GMT</p>
+<p><b>Date:</b> 2025-01-28, 00:21:01 GMT</p>
 <p>This page illustrates the application of the Line_Break specification. The material here is informative, not normative.</p> <p>The first chart shows where breaks would appear between different sample characters or strings. The sample characters are chosen mechanically to represent the different properties used by the specification.</p><p>Each cell shows the break-status for the position between the character(s) in its row header and the character(s) in its column header. The symbol × indicates a prohibited break, even with intervening spaces; the ÷ symbol indicates a (direct) break; the symbol ∻ indicates a break only in the presence of an intervening space (an indirect break).The cells with × or ∻  are also shaded to make it easier to scan the table. For example, in the cell at the intersection of the row headed by “CR” and the column headed by “LF”, there is a × symbol, indicating that there is no break between CR and LF.</p>
 <p></p><p>In the row and column headers of the <a href='#table'>Table</a>, in the <a href='#rules'>Rules</a>, when hovering over characters in the <a href='#samples'>Samples</a>, and in the comments in the associated list of test cases <a href='LineBreakTest.txt'>LineBreakTest.txt</a>:</p>
 <ol><li>The following sets are used:<ul>
@@ -226,7 +226,7 @@ <h3><a href='#rules' name='rules'>Rules</a></h3>
 <tr><th style='text-align:right'><a href='#r13.03' name='r13.03'>13.03</a></th><td style='text-align:right'></td><td>×</td><td> CP</td></tr>
 <tr><th style='text-align:right'><a href='#r13.04' name='r13.04'>13.04</a></th><td style='text-align:right'></td><td>×</td><td> SY</td></tr>
 <tr><th style='text-align:right'><a href='#r14.0' name='r14.0'>14.0</a></th><td style='text-align:right'>OP SP* </td><td>×</td><td></td></tr>
-<tr><th style='text-align:right'><a href='#r15.11' name='r15.11'>15.11</a></th><td style='text-align:right'>( sot | BK | CR | LF | NL | OP | QU | GL | SP | ZW ) QU_Pi SP* </td><td>×</td><td></td></tr>
+<tr><th style='text-align:right'><a href='#r15.11' name='r15.11'>15.11</a></th><td style='text-align:right'>( BK | CR | LF | NL | OP | QU | GL | SP | ZW | sot ) QU_Pi SP* </td><td>×</td><td></td></tr>
 <tr><th style='text-align:right'><a href='#r15.21' name='r15.21'>15.21</a></th><td style='text-align:right'></td><td>×</td><td> QU_Pf ( SP | GL | WJ | CL | QU | CP | EX | IS | SY | BK | CR | LF | NL | ZW | eot )</td></tr>
 <tr><th style='text-align:right'><a href='#r15.3' name='r15.3'>15.3</a></th><td style='text-align:right'>SP </td><td>÷</td><td> IS NU</td></tr>
 <tr><th style='text-align:right'><a href='#r15.4' name='r15.4'>15.4</a></th><td style='text-align:right'></td><td>×</td><td> IS</td></tr>
@@ -238,10 +238,10 @@ <h3><a href='#rules' name='rules'>Rules</a></h3>
 <tr><th style='text-align:right'><a href='#r19.1' name='r19.1'>19.1</a></th><td style='text-align:right'>[^EastAsian] </td><td>×</td><td> QU</td></tr>
 <tr><th style='text-align:right'><a href='#r19.11' name='r19.11'>19.11</a></th><td style='text-align:right'></td><td>×</td><td> QU ( [^EastAsian] | eot )</td></tr>
 <tr><th style='text-align:right'><a href='#r19.12' name='r19.12'>19.12</a></th><td style='text-align:right'>QU </td><td>×</td><td> [^EastAsian]</td></tr>
-<tr><th style='text-align:right'><a href='#r19.13' name='r19.13'>19.13</a></th><td style='text-align:right'>( sot | [^EastAsian] ) QU </td><td>×</td><td></td></tr>
+<tr><th style='text-align:right'><a href='#r19.13' name='r19.13'>19.13</a></th><td style='text-align:right'>( [^EastAsian] | sot ) QU </td><td>×</td><td></td></tr>
 <tr><th style='text-align:right'><a href='#r20.01' name='r20.01'>20.01</a></th><td style='text-align:right'></td><td>÷</td><td> CB</td></tr>
 <tr><th style='text-align:right'><a href='#r20.02' name='r20.02'>20.02</a></th><td style='text-align:right'>CB </td><td>÷</td><td></td></tr>
-<tr><th style='text-align:right'><a href='#r20.1' name='r20.1'>20.1</a></th><td style='text-align:right'>( sot | BK | CR | LF | NL | SP | ZW | CB | GL ) ( HY | Hyphen ) </td><td>×</td><td> AL</td></tr>
+<tr><th style='text-align:right'><a href='#r20.1' name='r20.1'>20.1</a></th><td style='text-align:right'>( BK | CR | LF | NL | SP | ZW | CB | GL | sot ) ( HY | Hyphen ) </td><td>×</td><td> AL</td></tr>
 <tr><th style='text-align:right'><a href='#r21.01' name='r21.01'>21.01</a></th><td style='text-align:right'></td><td>×</td><td> BA</td></tr>
 <tr><th style='text-align:right'><a href='#r21.02' name='r21.02'>21.02</a></th><td style='text-align:right'></td><td>×</td><td> HY</td></tr>
 <tr><th style='text-align:right'><a href='#r21.03' name='r21.03'>21.03</a></th><td style='text-align:right'></td><td>×</td><td> NS</td></tr>
diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/GenerateBreakTest.java b/unicodetools/src/main/java/org/unicode/text/UCD/GenerateBreakTest.java
@@ -480,6 +480,7 @@ value, new ParsePosition(0), IUP.getXSymbolTable()))) {
 
         generateTest(false, path, outFilename, propertyName);
         generateCppOldMonkeys(extraPath, outFilename);
+        generateJavaOldMonkeys(extraPath, outFilename);
     }
 
     private void generateCppOldMonkeys(String path, String outFilename) throws IOException {
@@ -512,6 +513,50 @@ private void generateCppOldMonkeys(String path, String outFilename) throws IOExc
         fc.close();
     }
 
+    /**
+     * Writes the {@code .java} companion of a break test file: a block of generated Java that is
+     * meant to be pasted into ICU4J's RBBITestMonkey.java, listing the segmenter's partition sets
+     * and rules.
+     *
+     * @param path directory passed on to {@code UnicodeDataFile.openAndWriteHeader}
+     * @param outFilename base name of the output file, to which {@code ".java"} is appended; its
+     *     first four characters select the monkey class named in the emitted instructions
+     * @throws IOException propagated from the underlying {@code UnicodeDataFile} operations
+     */
+    private void generateJavaOldMonkeys(String path, String outFilename) throws IOException {
+        final UnicodeDataFile fc = UnicodeDataFile.openAndWriteHeader(path, outFilename + ".java");
+        final PrintWriter out = fc.out;
+        out.println();
+        out.println("####### Instructions ###################################");
+        out.println("# Copy the following lines into RBBITestMonkey.java in #");
+        // Only the first four characters of the file name are kept, so the grapheme prefix to map
+        // is "Grap"; the five-character "Graph" can never occur in a four-character string.
+        final String monkeyName = outFilename.substring(0, 4).replace("Grap", "Char");
+        out.println(
+                "# ICU4J, in the constructor of RBBIMeowMonkey, replacing #"
+                        .replace("Meow", monkeyName));
+        out.println("# the existing block of generated code.                #");
+        out.println("########################################################");
+        out.println();
+        out.println("            // --- NOLI ME TANGERE ---");
+        out.println("            // Generated by GenerateBreakTest.java in the Unicode tools.");
+        // Backslashes and double quotes are escaped so each value is a valid Java string literal.
+        for (Segmenter.Builder.NamedRefinedSet part : segmenter.getPartitionDefinition()) {
+            out.println(
+                    "            partition.add(new NamedSet(\""
+                            + part.getName().replace("\\", "\\\\").replace("\"", "\\\"")
+                            + "\", new UnicodeSet(\""
+                            + part.getDefinition().replace("\\", "\\\\").replace("\"", "\\\"")
+                            + "\")));");
+        }
+        out.println();
+        for (Segmenter.SegmentationRule rule : segmenter.getRules()) {
+            out.println("            rules.add(" + rule.toJavaOldMonkeyString() + ");");
+        }
+        out.println("            // --- End of generated code. ---");
+        fc.close();
+    }
+
     private void generateTest(
             boolean shortVersion, String path, String outFilename, String propertyName)
             throws IOException {
diff --git a/unicodetools/src/main/java/org/unicode/tools/Segmenter.java b/unicodetools/src/main/java/org/unicode/tools/Segmenter.java
@@ -283,6 +283,8 @@ public String toString() {
         }
 
         public abstract String toCppOldMonkeyString();
+
+        public abstract String toJavaOldMonkeyString(); // Java source of a rule constructor call.
     }
 
     /** A « treat as » rule. */
@@ -390,6 +392,17 @@ public String toCppOldMonkeyString() {
                     + replacement
                     + ")\")";
         }
+
+        @Override
+        public String toJavaOldMonkeyString() {
+            // Yields: new RemapRule("<name>", "<pattern>", "<replacement>"), with backslashes and
+            // double quotes escaped so that each argument is a valid Java string literal.
+            final String[] literals = {name, patternDefinition, replacement};
+            for (int i = 0; i < literals.length; ++i) {
+                literals[i] = literals[i].replace("\\", "\\\\").replace("\"", "\\\"");
+            }
+            return "new RemapRule(\"" + String.join("\", \"", literals) + "\")";
+        }
     }
 
     /** A rule that determines the status of an offset. */
@@ -487,6 +500,19 @@ public String toCppOldMonkeyString() {
                     + ")\")";
         }
 
+        @Override
+        public String toJavaOldMonkeyString() {
+            // Yields: new RegexRule("<name>", "<before>", Resolution.<BREAKS>, "<after>"); the
+            // three string arguments get their backslashes and double quotes escaped.
+            final StringBuilder call = new StringBuilder("new RegexRule(\"");
+            call.append(name.replace("\\", "\\\\").replace("\"", "\\\""));
+            call.append("\", \"");
+            call.append(beforeDefinition.replace("\\", "\\\\").replace("\"", "\\\""));
+            call.append("\", Resolution.").append(breaks.name()).append(", \"");
+            call.append(afterDefinition.replace("\\", "\\\\").replace("\"", "\\\""));
+            return call.append("\")").toString();
+        }
+
         // ============== Internals ================
         // We cannot use a single regex of the form "(?<= before) after" because
         // (RI RI)* RI × RI would require unbounded lookbehind.
diff --git a/unicodetools/src/main/resources/org/unicode/tools/SegmenterDefault.txt b/unicodetools/src/main/resources/org/unicode/tools/SegmenterDefault.txt
@@ -183,7 +183,7 @@ $NS=[$NSorig $CJ]
 # LB 15a Do not break after an unresolved initial punctuation that lies at the start of the line,
 # after a space, after opening punctuation, or after an unresolved quotation mark, even after
 # spaces.
-15.11) ( $sot | $BK | $CR | $LF | $NL | $OP | $QU | $GL | $SP | $ZW ) $QU_Pi $SP* ×
+15.11) ( $BK | $CR | $LF | $NL | $OP | $QU | $GL | $SP | $ZW | $sot ) $QU_Pi $SP* ×
 # LB 15b Do not break before an unresolved final punctuation that lies at the end of the line, before
 # a space, before a prohibited break, or before an unresolved quotation mark, even before spaces.
 15.21) × $QU_Pf ( $SP | $GL | $WJ | $CL | $QU | $CP | $EX | $IS | $SY | $BK | $CR | $LF | $NL | $ZW | $eot )
@@ -204,12 +204,12 @@ $NS=[$NSorig $CJ]
 19.10) [^$EastAsian] × $QU
 19.11) × $QU ( [^$EastAsian] | $eot )
 19.12) $QU × [^$EastAsian]
-19.13) ( $sot | [^$EastAsian] ) $QU ×
+19.13) ( [^$EastAsian] | $sot ) $QU ×
 # LB 20  Break before and after unresolved CB.
 20.01)  ÷ $CB
 20.02) $CB ÷
 # LB 20a Do not break after a hyphen that follows break opportunity, a space, or the start of text.
-20.10) ( $sot | $BK | $CR | $LF | $NL | $SP | $ZW | $CB | $GL ) ( $HY | $Hyphen ) × $AL
+20.10) ( $BK | $CR | $LF | $NL | $SP | $ZW | $CB | $GL | $sot ) ( $HY | $Hyphen ) × $AL
 # LB 21  Do not break before hyphen-minus, other hyphens, fixed-width spaces, small kana and other non-starters, or after acute accents.
 21.01) × $BA
 21.02) × $HY