1717import com .ibm .icu .text .UTF16 ;
1818import com .ibm .icu .text .UnicodeSet ;
1919import com .ibm .icu .text .UnicodeSet .SpanCondition ;
20- import com .ibm .icu .text .UnicodeSet .XSymbolTable ;
2120import com .ibm .icu .text .UnicodeSetIterator ;
2221import com .ibm .icu .util .ULocale ;
2322import java .text .ParsePosition ;
3635import java .util .stream .Collectors ;
3736import org .unicode .cldr .draft .FileUtilities ;
3837import org .unicode .cldr .util .TransliteratorUtilities ;
38+ import org .unicode .props .IndexUnicodeProperties ;
3939import org .unicode .props .UnicodeProperty ;
40+ import org .unicode .tools .Segmenter .Builder .NamedRefinedSet ;
4041import org .unicode .tools .Segmenter .SegmentationRule .Breaks ;
4142
4243/** Ordered list of rules, with variables resolved before building. Use Builder to make. */
@@ -68,6 +69,7 @@ public enum Target {
6869 public final Target target ;
6970
7071 private UnicodeMap <String > samples = new UnicodeMap <String >();
72+ private List <NamedRefinedSet > partitionDefinition = new ArrayList <>();
7173
7274 private Segmenter (Target target ) {
7375 this .target = target ;
@@ -279,13 +281,16 @@ public abstract Breaks applyAt(
279281 public String toString () {
280282 return toString (false );
281283 }
284+
/**
 * Renders this rule as the text of a C++ expression that constructs the equivalent rule
 * object. The implementations in this file emit {@code std::make_unique<...>(...)} calls
 * whose arguments are written as C++ string/character literals.
 *
 * @return the C++ source text for this rule
 */
285+ public abstract String toCppOldMonkeyString ();
282286 }
283287
284288 /** A « treat as » rule. */
285289 public static class RemapRule extends SegmentationRule {
286290
/**
 * Creates a remap rule.
 *
 * @param leftHandSide regular expression to match; it is passed through
 *     {@code Builder.expandUnicodeSets} before being compiled, while the text as given is
 *     kept for {@code toCppOldMonkeyString}
 * @param replacement replacement text stored for the rule
 * @param line the rule's source line, stored as its name
 */
public RemapRule(String leftHandSide, String replacement, String line) {
    // Keep the definition exactly as written so it can be exported later.
    patternDefinition = leftHandSide;
    // Compile once, and only from the expanded pattern; compiling the raw text first
    // would be a dead store that is overwritten immediately.
    pattern = Pattern.compile(Builder.expandUnicodeSets(leftHandSide), REGEX_FLAGS);
    this.replacement = replacement;
    name = line;
}
@@ -352,6 +357,7 @@ public void apply(
352357 remap .accept (result );
353358 }
354359
360+ private String patternDefinition ;
355361 private Pattern pattern ;
356362 private String replacement ;
357363 private String name ;
@@ -373,6 +379,17 @@ public Breaks applyAt(
373379 protected String toString (boolean showResolved ) {
374380 return name ;
375381 }
382+
383+ @ Override
384+ public String toCppOldMonkeyString () {
385+ return "std::make_unique<RemapRule>(uR\" ("
386+ + name
387+ + ")\" , uR\" ("
388+ + patternDefinition .replaceAll ("&" , "&&" ).replaceAll ("-" , "--" )
389+ + ")\" , uR\" ("
390+ + replacement
391+ + ")\" )" ;
392+ }
376393 }
377394
378395 /** A rule that determines the status of an offset. */
@@ -384,6 +401,10 @@ public static class RegexRule extends SegmentationRule {
384401 * @param line
385402 */
386403 public RegexRule (String before , Breaks result , String after , String line ) {
404+ beforeDefinition = before ;
405+ afterDefinition = after ;
406+ before = Builder .expandUnicodeSets (before );
407+ after = Builder .expandUnicodeSets (after );
387408 breaks = result ;
388409 before = ".*(" + before + ")" ;
389410 String parsing = null ;
@@ -453,12 +474,27 @@ public String toString(boolean showResolved) {
453474 return result ;
454475 }
455476
477+ @ Override
478+ public String toCppOldMonkeyString () {
479+ return "std::make_unique<RegexRule>(uR\" ("
480+ + name
481+ + ")\" , uR\" ("
482+ + beforeDefinition .replaceAll ("&" , "&&" ).replaceAll ("-" , "--" )
483+ + ")\" , u'"
484+ + (breaks == Breaks .BREAK ? '÷' : '×' )
485+ + "', uR\" ("
486+ + afterDefinition .replaceAll ("&" , "&&" ).replaceAll ("-" , "--" )
487+ + ")\" )" ;
488+ }
489+
456490 // ============== Internals ================
457491 // We cannot use a single regex of the form "(?<= before) after" because
458492 // (RI RI)* RI × RI would require unbounded lookbehind.
459493 private Pattern before ;
460494 private Pattern after ;
461495 private String name ;
496+ private String beforeDefinition ;
497+ private String afterDefinition ;
462498
463499 private String resolved ;
464500 private Breaks breaks ;
@@ -474,31 +510,36 @@ public String toString(boolean showResolved) {
474510 public static class Builder {
475511 private final UnicodeProperty .Factory propFactory ;
476512 private final Target target ;
477- private XSymbolTable symbolTable ;
478513 private List <String > rawVariables = new ArrayList <String >();
479514 private Map <Double , String > xmlRules = new TreeMap <Double , String >();
480515 private Map <Double , String > htmlRules = new TreeMap <Double , String >();
481516 private List <String > lastComments = new ArrayList <String >();
482517
483518 class NamedSet {
484- NamedSet (String name , UnicodeSet set ) {
519+ NamedSet (String name , String definition , UnicodeSet set ) {
485520 this .name = name ;
521+ this .definition = definition ;
486522 this .set = set ;
487523 }
488524
489525 String name ;
526+ String definition ;
490527 UnicodeSet set ;
491528 }
492529
493- class NamedRefinedSet {
530+ public class NamedRefinedSet {
494531 public NamedRefinedSet clone () {
495532 NamedRefinedSet result = new NamedRefinedSet ();
496533 for (var term : intersectionTerms ) {
497- result .intersectionTerms .add (new NamedSet (term .name , term .set .cloneAsThawed ()));
534+ result .intersectionTerms .add (
535+ new NamedSet (term .name , term .definition , term .set .cloneAsThawed ()));
498536 }
499537 for (var subtrahend : subtrahends ) {
500538 result .subtrahends .add (
501- new NamedSet (subtrahend .name , subtrahend .set .cloneAsThawed ()));
539+ new NamedSet (
540+ subtrahend .name ,
541+ subtrahend .definition ,
542+ subtrahend .set .cloneAsThawed ()));
502543 }
503544 result .set = this .set .cloneAsThawed ();
504545 return result ;
@@ -547,6 +588,19 @@ public String getName() {
547588 .collect (Collectors .joining ());
548589 }
549590
591+ public String getDefinition () {
592+ return intersectionTerms .isEmpty ()
593+ ? "[^[]]"
594+ : "["
595+ + intersectionTerms .stream ()
596+ .map ((s ) -> s .definition )
597+ .collect (Collectors .joining ("&" ))
598+ + subtrahends .stream ()
599+ .map ((s ) -> "-" + s .definition )
600+ .collect (Collectors .joining ())
601+ + "]" ;
602+ }
603+
550604 private UnicodeSet getIntersection () {
551605 UnicodeSet result = UnicodeSet .ALL_CODE_POINTS .cloneAsThawed ();
552606 for (var term : intersectionTerms ) {
@@ -565,54 +619,11 @@ private UnicodeSet getIntersection() {
/**
 * Creates a builder for the given target and seeds the HTML rule table with the three
 * fixed entries keyed by {@code BREAK_SOT}, {@code BREAK_EOT} and {@code BREAK_ANY}.
 *
 * @param factory property factory stored for this builder
 * @param target segmentation target stored for this builder
 */
public Builder(UnicodeProperty.Factory factory, Target target) {
    propFactory = factory;
    this.target = target;
    // Double.valueOf replaces the deprecated Double constructor.
    htmlRules.put(Double.valueOf(BREAK_SOT), "sot \u00F7 ");
    htmlRules.put(Double.valueOf(BREAK_EOT), "\u00F7 eot");
    htmlRules.put(Double.valueOf(BREAK_ANY), "\u00F7 Any");
}
573626
574- // copied to make independent of ICU4J internals
// Symbol table that answers property-alias lookups from this builder's propFactory.
// NOTE(review): every line of this class is on the removed side of the change shown in
// this view; confirm whether it still exists in the file before relying on it.
575- private class MyXSymbolTable extends UnicodeSet .XSymbolTable {
// Resolves propertyName/propertyValue into `result`.
// Returns true whenever it returns at all; throws IllegalArgumentException when the
// property name is unknown and cannot be used as a Script value.
576- public boolean applyPropertyAlias (
577- String propertyName , String propertyValue , UnicodeSet result ) {
578- UnicodeProperty prop = propFactory .getProperty (propertyName );
579- if (prop == null ) {
// Unknown property with no value: retry, treating the name as a value of Script.
580- if (propertyValue .isEmpty ()) {
581- prop = propFactory .getProperty ("Script" );
582- result .clear ();
583- UnicodeSet x = prop .getSet (propertyName , result );
584- if (!x .isEmpty ()) {
585- return true ;
586- }
587- }
588- // If we cannot handle the property name, then we need to really fail.
589- // If we were to just print something and return false, then the UnicodeSet code
590- // would just evaluate this itself, and may succeed but give wrong results.
591- // For example, as long as we require "gc=Cn" and don't handle "Cn" here,
592- // falling back to built-in ICU data means that we get gc=Cn ranges from ICU
593- // rather than from the current Unicode beta.
594- throw new IllegalArgumentException (
595- "Segmenter.MyXSymbolTable: Unknown property " + propertyName );
596- }
597- // Binary properties:
598- // \p{Extended_Pictographic} is equivalent with \p{Extended_Pictographic=Yes}
599- if (propertyValue .isEmpty () && prop .isType (UnicodeProperty .BINARY_MASK )) {
600- propertyValue = "Yes" ;
601- }
602- result .clear ();
603- UnicodeSet x = prop .getSet (propertyValue , result );
// An empty result is only reported on standard output; it is not treated as an error.
604- if (x .isEmpty ()) {
605- // didn't find anything
606- System .out .println (
607- "Segmenter.MyXSymbolTable: !Empty! "
608- + propertyName
609- + "="
610- + propertyValue );
611- }
612- return true ; // mark that we handled it even if there are no results.
613- }
614- }
615-
616627 public String toString (String testName , String indent ) {
617628
618629 StringBuffer result = new StringBuffer ();
@@ -728,10 +739,15 @@ Builder addVariable(String name, String value) {
728739 + TransliteratorUtilities .toXML .transliterate (value )
729740 + "</variable>" );
730741 value = replaceVariables (value , variables );
742+ ;
731743 if (!name .endsWith ("_" )) {
732744 try {
733745 parsePosition .setIndex (0 );
734- UnicodeSet valueSet = new UnicodeSet (value , parsePosition , symbolTable );
746+ UnicodeSet valueSet =
747+ new UnicodeSet (
748+ value ,
749+ parsePosition ,
750+ IndexUnicodeProperties .make ().getXSymbolTable ());
735751 if (parsePosition .getIndex () != value .length ()) {
736752 if (SHOW_SAMPLES )
737753 System .out .println (
@@ -748,7 +764,7 @@ Builder addVariable(String name, String value) {
748764 } else {
749765 String name2 = name ;
750766 if (name2 .startsWith ("$" )) name2 = name2 .substring (1 );
751- refinePartition (new NamedSet (name2 , valueSet ));
767+ refinePartition (new NamedSet (name2 , value , valueSet ));
752768 if (SHOW_SAMPLES ) {
753769 System .out .println ("Samples for: " + name + " = " + value );
754770 System .out .println ("\t " + valueSet );
@@ -827,8 +843,7 @@ Builder addRemapRule(Double order, String before, String after, String line) {
827843 + " </rule>" );
828844 rules .put (
829845 order ,
830- new Segmenter .RemapRule (
831- replaceVariables (before , expandedVariables ), after , line ));
846+ new Segmenter .RemapRule (replaceVariables (before , variables ), after , line ));
832847 return this ;
833848 }
834849
@@ -889,9 +904,9 @@ Builder addRegexRule(
889904 rules .put (
890905 order ,
891906 new Segmenter .RegexRule (
892- replaceVariables (before , expandedVariables ),
907+ replaceVariables (before , variables ),
893908 breaks ,
894- replaceVariables (after , expandedVariables ),
909+ replaceVariables (after , variables ),
895910 line ));
896911 return this ;
897912 }
@@ -906,6 +921,7 @@ public Segmenter make() {
906921 for (Double key : rules .keySet ()) {
907922 result .add (key .doubleValue (), rules .get (key ));
908923 }
924+ result .partitionDefinition = partition ;
909925 for (var part : partition ) {
910926 if (part .getName () == null ) {
911927 throw new IllegalArgumentException ("Unclassified characters: " + part .getSet ());
@@ -952,14 +968,19 @@ private static String replaceVariables(String input, Map<String, String> variabl
952968 }
953969
954970 /** Replaces Unicode Sets with literals. */
955- public String expandUnicodeSets (String input ) {
971+ public static String expandUnicodeSets (String input ) {
956972 String result = input ;
973+ var parsePosition = new ParsePosition (0 );
957974 // replace properties
958975 // TODO really dumb parse for now, fix later
959976 for (int i = 0 ; i < result .length (); ++i ) {
960977 if (UnicodeSet .resemblesPattern (result , i )) {
961978 parsePosition .setIndex (i );
962- UnicodeSet temp = new UnicodeSet (result , parsePosition , symbolTable );
979+ UnicodeSet temp =
980+ new UnicodeSet (
981+ result ,
982+ parsePosition ,
983+ IndexUnicodeProperties .make ().getXSymbolTable ());
963984 String insert = getInsertablePattern (temp );
964985 result =
965986 result .substring (0 , i )
@@ -981,7 +1002,7 @@ public String expandUnicodeSets(String input) {
9811002 * @param temp
9821003 * @return
9831004 */
984- private String getInsertablePattern (UnicodeSet temp ) {
1005+ private static String getInsertablePattern (UnicodeSet temp ) {
9851006 temp .complement ().complement ();
9861007 if (DEBUG_REDUCE_SET_SIZE != null ) {
9871008 UnicodeSet temp2 = new UnicodeSet (temp );
@@ -1053,6 +1074,14 @@ public List<String> getRules() {
10531074 }
10541075 }
10551076
1077+ public List <NamedRefinedSet > getPartitionDefinition () {
1078+ return partitionDefinition ;
1079+ }
1080+
1081+ public List <SegmentationRule > getRules () {
1082+ return rules ;
1083+ }
1084+
10561085 // ============== Internals ================
10571086
10581087 private List <SegmentationRule > rules = new ArrayList <SegmentationRule >(1 );
0 commit comments