@@ -477,11 +477,17 @@ public static function split(
477477 bool |int $ captureOffset = false ,
478478 bool $ skipEmpty = false ,
479479 int $ limit = -1 ,
480+ bool $ utf8 = false ,
480481 ): array {
481482 $ flags = is_int ($ captureOffset ) && $ captureOffset // back compatibility
482483 ? $ captureOffset
483484 : ($ captureOffset ? PREG_SPLIT_OFFSET_CAPTURE : 0 ) | ($ skipEmpty ? PREG_SPLIT_NO_EMPTY : 0 );
484- return self ::pcre ('preg_split ' , [$ pattern , $ subject , $ limit , $ flags | PREG_SPLIT_DELIM_CAPTURE ]);
485+ $ pattern .= $ utf8 ? 'u ' : '' ;
486+ $ m = self ::pcre ('preg_split ' , [$ pattern , $ subject , $ limit , $ flags | PREG_SPLIT_DELIM_CAPTURE ]);
487+ if ($ utf8 && ($ flags & PREG_SPLIT_OFFSET_CAPTURE )) {
488+ return self ::bytesToChars ($ subject , [$ m ])[0 ];
489+ }
490+ return $ m ;
485491 }
486492
487493
@@ -494,16 +500,25 @@ public static function match(
494500 bool |int $ captureOffset = false ,
495501 int $ offset = 0 ,
496502 bool $ unmatchedAsNull = false ,
503+ bool $ utf8 = false ,
497504 ): ?array {
498505 $ flags = is_int ($ captureOffset ) && $ captureOffset // back compatibility
499506 ? $ captureOffset
500507 : ($ captureOffset ? PREG_OFFSET_CAPTURE : 0 ) | ($ unmatchedAsNull ? PREG_UNMATCHED_AS_NULL : 0 );
508+ if ($ utf8 ) {
509+ $ offset = strlen (self ::substring ($ subject , 0 , $ offset ));
510+ $ pattern .= 'u ' ;
511+ }
501512 if ($ offset > strlen ($ subject )) {
502513 return null ;
503514 }
504- return self ::pcre ('preg_match ' , [$ pattern , $ subject , &$ m , $ flags , $ offset ])
505- ? $ m
506- : null ;
515+ if (!self ::pcre ('preg_match ' , [$ pattern , $ subject , &$ m , $ flags , $ offset ])) {
516+ return null ;
517+ }
518+ if ($ utf8 && ($ flags & PREG_OFFSET_CAPTURE )) {
519+ return self ::bytesToChars ($ subject , [$ m ])[0 ];
520+ }
521+ return $ m ;
507522 }
508523
509524
@@ -518,10 +533,15 @@ public static function matchAll(
518533 int $ offset = 0 ,
519534 bool $ unmatchedAsNull = false ,
520535 bool $ patternOrder = false ,
536+ bool $ utf8 = false ,
521537 ): array {
522538 $ flags = is_int ($ captureOffset ) && $ captureOffset // back compatibility
523539 ? $ captureOffset
524540 : ($ captureOffset ? PREG_OFFSET_CAPTURE : 0 ) | ($ unmatchedAsNull ? PREG_UNMATCHED_AS_NULL : 0 ) | ($ patternOrder ? PREG_PATTERN_ORDER : 0 );
541+ if ($ utf8 ) {
542+ $ offset = strlen (self ::substring ($ subject , 0 , $ offset ));
543+ $ pattern .= 'u ' ;
544+ }
525545 if ($ offset > strlen ($ subject )) {
526546 return [];
527547 }
@@ -530,6 +550,9 @@ public static function matchAll(
530550 ($ flags & PREG_PATTERN_ORDER ) ? $ flags : ($ flags | PREG_SET_ORDER ),
531551 $ offset ,
532552 ]);
553+ if ($ utf8 && ($ flags & PREG_OFFSET_CAPTURE )) {
554+ return self ::bytesToChars ($ subject , $ m );
555+ }
533556 return $ m ;
534557 }
535558
@@ -544,23 +567,52 @@ public static function replace(
544567 int $ limit = -1 ,
545568 bool $ captureOffset = false ,
546569 bool $ unmatchedAsNull = false ,
570+ bool $ utf8 = false ,
547571 ): string {
548572 if (is_object ($ replacement ) || is_array ($ replacement )) {
549573 if (!is_callable ($ replacement , false , $ textual )) {
550574 throw new Nette \InvalidStateException ("Callback ' $ textual' is not callable. " );
551575 }
552576 $ flags = ($ captureOffset ? PREG_OFFSET_CAPTURE : 0 ) | ($ unmatchedAsNull ? PREG_UNMATCHED_AS_NULL : 0 );
577+ if ($ utf8 ) {
578+ $ pattern .= 'u ' ;
579+ if ($ captureOffset ) {
580+ $ replacement = fn ($ m ) => $ replacement (self ::bytesToChars ($ subject , [$ m ])[0 ]);
581+ }
582+ }
553583 return self ::pcre ('preg_replace_callback ' , [$ pattern , $ replacement , $ subject , $ limit , 0 , $ flags ]);
554584
555585 } elseif (is_array ($ pattern ) && is_string (key ($ pattern ))) {
556586 $ replacement = array_values ($ pattern );
557587 $ pattern = array_keys ($ pattern );
558588 }
559589
590+ if ($ utf8 ) {
591+ $ pattern = array_map (fn ($ item ) => $ item . 'u ' , (array ) $ pattern );
592+ }
593+
560594 return self ::pcre ('preg_replace ' , [$ pattern , $ replacement , $ subject , $ limit ]);
561595 }
562596
563597
/**
 * Converts byte offsets captured by PCRE into character offsets.
 *
 * Every entry of $groups is a list of [text, byteOffset] pairs, i.e. the shape the
 * preg_* functions produce when offset capturing is enabled. The same structure is
 * returned with each non-negative byteOffset replaced by the number of characters
 * (as counted by self::length()) that precede that byte position in $s.
 *
 * Offsets are converted incrementally: only the slice between the previously seen
 * offset and the current one is measured, in either direction, so offsets need not
 * be sorted.
 *
 * @param  string  $s       subject string the byte offsets refer to
 * @param  array   $groups  list of match sets, each a list of [text, byteOffset] pairs
 * @return array   $groups with byte offsets replaced by character offsets
 */
private static function bytesToChars(string $s, array $groups): array
{
    $lastBytes = $lastChars = 0;
    foreach ($groups as &$matches) {
        foreach ($matches as &$match) {
            if ($match[1] < 0) {
                // PCRE reports offset -1 for a group that did not participate in the match.
                // It is not a position in $s: keep it untouched and do not move the cursor,
                // otherwise substr() would interpret it as "from the end of the string"
                // and corrupt every following offset.
                continue;
            }

            if ($match[1] > $lastBytes) {
                // moving forward: add the characters between the previous and current offset
                $lastChars += self::length(substr($s, $lastBytes, $match[1] - $lastBytes));
            } elseif ($match[1] < $lastBytes) {
                // moving backward (e.g. a nested group or pattern-order results): subtract them
                $lastChars -= self::length(substr($s, $match[1], $lastBytes - $match[1]));
            }

            $lastBytes = $match[1];
            $match[1] = $lastChars;
        }
        unset($match); // break the reference so it cannot leak into the next iteration
    }
    unset($matches);

    return $groups;
}
614+
615+
564616 /** @internal */
565617 public static function pcre (string $ func , array $ args )
566618 {
0 commit comments