@@ -490,11 +490,18 @@ public static function split(
490490 bool |int $ captureOffset = false ,
491491 bool $ skipEmpty = false ,
492492 int $ limit = -1 ,
493+ bool $ utf8 = false ,
493494 ): array {
494495 $ flags = is_int ($ captureOffset ) && $ captureOffset // back compatibility
495496 ? $ captureOffset
496497 : ($ captureOffset ? PREG_SPLIT_OFFSET_CAPTURE : 0 ) | ($ skipEmpty ? PREG_SPLIT_NO_EMPTY : 0 );
497- return self ::pcre ('preg_split ' , [$ pattern , $ subject , $ limit , $ flags | PREG_SPLIT_DELIM_CAPTURE ]);
498+ $ pattern .= $ utf8 ? 'u ' : '' ;
499+ $ m = self ::pcre ('preg_split ' , [$ pattern , $ subject , $ limit , $ flags | PREG_SPLIT_DELIM_CAPTURE ]);
500+ if ($ utf8 && ($ flags & PREG_SPLIT_OFFSET_CAPTURE )) {
501+ return self ::bytesToChars ($ subject , [$ m ])[0 ];
502+ }
503+
504+ return $ m ;
498505 }
499506
500507
@@ -507,17 +514,29 @@ public static function match(
507514 bool |int $ captureOffset = false ,
508515 int $ offset = 0 ,
509516 bool $ unmatchedAsNull = false ,
517+ bool $ utf8 = false ,
510518 ): ?array {
511519 $ flags = is_int ($ captureOffset ) && $ captureOffset // back compatibility
512520 ? $ captureOffset
513521 : ($ captureOffset ? PREG_OFFSET_CAPTURE : 0 ) | ($ unmatchedAsNull ? PREG_UNMATCHED_AS_NULL : 0 );
522+ if ($ utf8 ) {
523+ $ offset = strlen (self ::substring ($ subject , 0 , $ offset ));
524+ $ pattern .= 'u ' ;
525+ }
526+
514527 if ($ offset > strlen ($ subject )) {
515528 return null ;
516529 }
517530
518- return self ::pcre ('preg_match ' , [$ pattern , $ subject , &$ m , $ flags , $ offset ])
519- ? $ m
520- : null ;
531+ if (!self ::pcre ('preg_match ' , [$ pattern , $ subject , &$ m , $ flags , $ offset ])) {
532+ return null ;
533+ }
534+
535+ if ($ utf8 && ($ flags & PREG_OFFSET_CAPTURE )) {
536+ return self ::bytesToChars ($ subject , [$ m ])[0 ];
537+ }
538+
539+ return $ m ;
521540 }
522541
523542
@@ -532,10 +551,16 @@ public static function matchAll(
532551 int $ offset = 0 ,
533552 bool $ unmatchedAsNull = false ,
534553 bool $ patternOrder = false ,
554+ bool $ utf8 = false ,
535555 ): array {
536556 $ flags = is_int ($ captureOffset ) && $ captureOffset // back compatibility
537557 ? $ captureOffset
538558 : ($ captureOffset ? PREG_OFFSET_CAPTURE : 0 ) | ($ unmatchedAsNull ? PREG_UNMATCHED_AS_NULL : 0 ) | ($ patternOrder ? PREG_PATTERN_ORDER : 0 );
559+ if ($ utf8 ) {
560+ $ offset = strlen (self ::substring ($ subject , 0 , $ offset ));
561+ $ pattern .= 'u ' ;
562+ }
563+
539564 if ($ offset > strlen ($ subject )) {
540565 return [];
541566 }
@@ -545,6 +570,10 @@ public static function matchAll(
545570 ($ flags & PREG_PATTERN_ORDER ) ? $ flags : ($ flags | PREG_SET_ORDER ),
546571 $ offset ,
547572 ]);
573+ if ($ utf8 && ($ flags & PREG_OFFSET_CAPTURE )) {
574+ return self ::bytesToChars ($ subject , $ m );
575+ }
576+
548577 return $ m ;
549578 }
550579
@@ -559,24 +588,56 @@ public static function replace(
559588 int $ limit = -1 ,
560589 bool $ captureOffset = false ,
561590 bool $ unmatchedAsNull = false ,
591+ bool $ utf8 = false ,
562592 ): string {
563593 if (is_object ($ replacement ) || is_array ($ replacement )) {
564594 if (!is_callable ($ replacement , false , $ textual )) {
565595 throw new Nette \InvalidStateException ("Callback ' $ textual' is not callable. " );
566596 }
567597
568598 $ flags = ($ captureOffset ? PREG_OFFSET_CAPTURE : 0 ) | ($ unmatchedAsNull ? PREG_UNMATCHED_AS_NULL : 0 );
599+ if ($ utf8 ) {
600+ $ pattern .= 'u ' ;
601+ if ($ captureOffset ) {
602+ $ replacement = fn ($ m ) => $ replacement (self ::bytesToChars ($ subject , [$ m ])[0 ]);
603+ }
604+ }
605+
569606 return self ::pcre ('preg_replace_callback ' , [$ pattern , $ replacement , $ subject , $ limit , 0 , $ flags ]);
570607
571608 } elseif (is_array ($ pattern ) && is_string (key ($ pattern ))) {
572609 $ replacement = array_values ($ pattern );
573610 $ pattern = array_keys ($ pattern );
574611 }
575612
613+ if ($ utf8 ) {
614+ $ pattern = array_map (fn ($ item ) => $ item . 'u ' , (array ) $ pattern );
615+ }
616+
576617 return self ::pcre ('preg_replace ' , [$ pattern , $ replacement , $ subject , $ limit ]);
577618 }
578619
579620
/**
 * Converts the byte offsets found in PREG_OFFSET_CAPTURE-style results into character offsets.
 *
 * Every entry of every group is expected to be a pair [text, byteOffset]. The conversion keeps
 * a running (byte position, character position) pair and only measures the slice of $s between
 * the previous and the current offset, so offsets that mostly grow are converted without
 * rescanning the subject from its start each time.
 *
 * Entries with a negative offset (PCRE reports -1 for a subpattern that did not participate in
 * the match) are left untouched and do not move the running position. Feeding -1 to substr()
 * would be interpreted as "count from the end of the string" and would corrupt this entry and
 * every offset converted after it.
 *
 * @param  string  $s  subject string the byte offsets refer to
 * @param  array  $groups  list of match sets, each a list of [text, byteOffset] pairs
 * @return array  the same structure with offsets expressed in characters
 */
private static function bytesToChars(string $s, array $groups): array
{
    $lastBytes = $lastChars = 0;
    foreach ($groups as &$matches) {
        foreach ($matches as &$match) {
            $bytes = $match[1];
            if ($bytes < 0) {
                // unmatched subpattern: keep the -1 sentinel, do not touch the running counters
                continue;
            }

            if ($bytes > $lastBytes) {
                // moving forward: add the characters contained in the skipped byte range
                // (self::length() is assumed to count characters, not bytes)
                $lastChars += self::length(substr($s, $lastBytes, $bytes - $lastBytes));
            } elseif ($bytes < $lastBytes) {
                // moving backward (e.g. a group that starts before the previous entry)
                $lastChars -= self::length(substr($s, $bytes, $lastBytes - $bytes));
            }

            $lastBytes = $bytes;
            $match[1] = $lastChars;
        }

        unset($match); // break the reference so the next iteration cannot alias the last entry
    }

    unset($matches);

    return $groups;
}
639+
640+
580641 /** @internal */
581642 public static function pcre (string $ func , array $ args )
582643 {
0 commit comments