@@ -475,11 +475,17 @@ public static function split(
475475 bool |int $ captureOffset = false ,
476476 bool $ skipEmpty = false ,
477477 int $ limit = -1 ,
478+ bool $ utf8 = false ,
478479 ): array {
479480 $ flags = is_int ($ captureOffset ) && $ captureOffset // back compatibility
480481 ? $ captureOffset
481482 : ($ captureOffset ? PREG_SPLIT_OFFSET_CAPTURE : 0 ) | ($ skipEmpty ? PREG_SPLIT_NO_EMPTY : 0 );
482- return self ::pcre ('preg_split ' , [$ pattern , $ subject , $ limit , $ flags | PREG_SPLIT_DELIM_CAPTURE ]);
483+ $ pattern .= $ utf8 ? 'u ' : '' ;
484+ $ m = self ::pcre ('preg_split ' , [$ pattern , $ subject , $ limit , $ flags | PREG_SPLIT_DELIM_CAPTURE ]);
485+ if ($ utf8 && ($ flags & PREG_SPLIT_OFFSET_CAPTURE )) {
486+ return self ::bytesToChars ($ subject , [$ m ])[0 ];
487+ }
488+ return $ m ;
483489 }
484490
485491
@@ -492,16 +498,25 @@ public static function match(
492498 bool |int $ captureOffset = false ,
493499 int $ offset = 0 ,
494500 bool $ unmatchedAsNull = false ,
501+ bool $ utf8 = false ,
495502 ): ?array {
496503 $ flags = is_int ($ captureOffset ) && $ captureOffset // back compatibility
497504 ? $ captureOffset
498505 : ($ captureOffset ? PREG_OFFSET_CAPTURE : 0 ) | ($ unmatchedAsNull ? PREG_UNMATCHED_AS_NULL : 0 );
506+ if ($ utf8 ) {
507+ $ offset = strlen (self ::substring ($ subject , 0 , $ offset ));
508+ $ pattern .= 'u ' ;
509+ }
499510 if ($ offset > strlen ($ subject )) {
500511 return null ;
501512 }
502- return self ::pcre ('preg_match ' , [$ pattern , $ subject , &$ m , $ flags , $ offset ])
503- ? $ m
504- : null ;
513+ if (!self ::pcre ('preg_match ' , [$ pattern , $ subject , &$ m , $ flags , $ offset ])) {
514+ return null ;
515+ }
516+ if ($ utf8 && ($ flags & PREG_OFFSET_CAPTURE )) {
517+ return self ::bytesToChars ($ subject , [$ m ])[0 ];
518+ }
519+ return $ m ;
505520 }
506521
507522
@@ -516,10 +531,15 @@ public static function matchAll(
516531 int $ offset = 0 ,
517532 bool $ unmatchedAsNull = false ,
518533 bool $ patternOrder = false ,
534+ bool $ utf8 = false ,
519535 ): array {
520536 $ flags = is_int ($ captureOffset ) && $ captureOffset // back compatibility
521537 ? $ captureOffset
522538 : ($ captureOffset ? PREG_OFFSET_CAPTURE : 0 ) | ($ unmatchedAsNull ? PREG_UNMATCHED_AS_NULL : 0 ) | ($ patternOrder ? PREG_PATTERN_ORDER : 0 );
539+ if ($ utf8 ) {
540+ $ offset = strlen (self ::substring ($ subject , 0 , $ offset ));
541+ $ pattern .= 'u ' ;
542+ }
523543 if ($ offset > strlen ($ subject )) {
524544 return [];
525545 }
@@ -528,6 +548,9 @@ public static function matchAll(
528548 ($ flags & PREG_PATTERN_ORDER ) ? $ flags : ($ flags | PREG_SET_ORDER ),
529549 $ offset ,
530550 ]);
551+ if ($ utf8 && ($ flags & PREG_OFFSET_CAPTURE )) {
552+ return self ::bytesToChars ($ subject , $ m );
553+ }
531554 return $ m ;
532555 }
533556
@@ -542,23 +565,52 @@ public static function replace(
542565 int $ limit = -1 ,
543566 bool $ captureOffset = false ,
544567 bool $ unmatchedAsNull = false ,
568+ bool $ utf8 = false ,
545569 ): string {
546570 if (is_object ($ replacement ) || is_array ($ replacement )) {
547571 if (!is_callable ($ replacement , false , $ textual )) {
548572 throw new Nette \InvalidStateException ("Callback ' $ textual' is not callable. " );
549573 }
550574 $ flags = ($ captureOffset ? PREG_OFFSET_CAPTURE : 0 ) | ($ unmatchedAsNull ? PREG_UNMATCHED_AS_NULL : 0 );
575+ if ($ utf8 ) {
576+ $ pattern .= 'u ' ;
577+ if ($ captureOffset ) {
578+ $ replacement = fn ($ m ) => $ replacement (self ::bytesToChars ($ subject , [$ m ])[0 ]);
579+ }
580+ }
551581 return self ::pcre ('preg_replace_callback ' , [$ pattern , $ replacement , $ subject , $ limit , 0 , $ flags ]);
552582
553583 } elseif (is_array ($ pattern ) && is_string (key ($ pattern ))) {
554584 $ replacement = array_values ($ pattern );
555585 $ pattern = array_keys ($ pattern );
556586 }
557587
588+ if ($ utf8 ) {
589+ $ pattern = array_map (fn ($ item ) => $ item . 'u ' , (array ) $ pattern );
590+ }
591+
558592 return self ::pcre ('preg_replace ' , [$ pattern , $ replacement , $ subject , $ limit ]);
559593 }
560594
561595
/**
 * Rewrites the byte offsets of offset-capturing PCRE results as character offsets.
 *
 * Every entry inside a group is expected to be a [text, byteOffset] pair, as produced
 * by the preg_* functions when an *_OFFSET_CAPTURE flag is set. Character counts are
 * obtained from self::length() (presumably a UTF-8 aware length; confirm against its
 * definition).
 *
 * Subpatterns that did not participate in the match are reported by PCRE with the
 * offset -1. Such entries are returned unchanged and do not influence the conversion
 * of the remaining entries.
 *
 * @param  string  $s  subject string the byte offsets refer to
 * @param  array  $groups  list of match sets; each set is an array of [text, byteOffset] pairs
 * @return array  $groups with every non-negative byte offset replaced by a character offset
 */
private static function bytesToChars(string $s, array $groups): array
{
    // Running cursor: $lastChars is the character offset that corresponds to the byte
    // offset $lastBytes. Only the distance between consecutive offsets is measured,
    // so the subject is not rescanned from its beginning for every entry.
    $lastBytes = $lastChars = 0;
    foreach ($groups as &$matches) {
        foreach ($matches as &$match) {
            $byteOffset = $match[1];
            if ($byteOffset < 0) {
                // Unmatched subpattern: keep the -1 marker. Feeding it to substr()
                // would address the string from its end and corrupt the cursor.
                continue;
            }

            if ($byteOffset > $lastBytes) {
                $lastChars += self::length(substr($s, $lastBytes, $byteOffset - $lastBytes));
            } elseif ($byteOffset < $lastBytes) {
                // Offsets are not necessarily ascending (e.g. a nested group following
                // an outer one), so the cursor may have to move backwards.
                $lastChars -= self::length(substr($s, $byteOffset, $lastBytes - $byteOffset));
            }

            $lastBytes = $byteOffset;
            $match[1] = $lastChars;
        }
        unset($match); // drop the dangling reference before the next set is iterated
    }
    unset($matches);

    return $groups;
}
612+
613+
562614 /** @internal */
563615 public static function pcre (string $ func , array $ args )
564616 {
0 commit comments