Skip to content

Commit 92c593d

Browse files
committed
Strings: added support for UTF8 offsets in regexp
1 parent e4043a3 commit 92c593d

File tree

5 files changed

+123
-12
lines changed

5 files changed

+123
-12
lines changed

src/Utils/Strings.php

Lines changed: 65 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -490,11 +490,18 @@ public static function split(
490490
bool|int $captureOffset = false,
491491
bool $skipEmpty = false,
492492
int $limit = -1,
493+
bool $utf8 = false,
493494
): array {
494495
$flags = is_int($captureOffset) && $captureOffset // back compatibility
495496
? $captureOffset
496497
: ($captureOffset ? PREG_SPLIT_OFFSET_CAPTURE : 0) | ($skipEmpty ? PREG_SPLIT_NO_EMPTY : 0);
497-
return self::pcre('preg_split', [$pattern, $subject, $limit, $flags | PREG_SPLIT_DELIM_CAPTURE]);
498+
$pattern .= $utf8 ? 'u' : '';
499+
$m = self::pcre('preg_split', [$pattern, $subject, $limit, $flags | PREG_SPLIT_DELIM_CAPTURE]);
500+
if ($utf8 && ($flags & PREG_SPLIT_OFFSET_CAPTURE)) {
501+
return self::bytesToChars($subject, [$m])[0];
502+
}
503+
504+
return $m;
498505
}
499506

500507

@@ -507,17 +514,29 @@ public static function match(
507514
bool|int $captureOffset = false,
508515
int $offset = 0,
509516
bool $unmatchedAsNull = false,
517+
bool $utf8 = false,
510518
): ?array {
511519
$flags = is_int($captureOffset) && $captureOffset // back compatibility
512520
? $captureOffset
513521
: ($captureOffset ? PREG_OFFSET_CAPTURE : 0) | ($unmatchedAsNull ? PREG_UNMATCHED_AS_NULL : 0);
522+
if ($utf8) {
523+
$offset = strlen(self::substring($subject, 0, $offset));
524+
$pattern .= 'u';
525+
}
526+
514527
if ($offset > strlen($subject)) {
515528
return null;
516529
}
517530

518-
return self::pcre('preg_match', [$pattern, $subject, &$m, $flags, $offset])
519-
? $m
520-
: null;
531+
if (!self::pcre('preg_match', [$pattern, $subject, &$m, $flags, $offset])) {
532+
return null;
533+
}
534+
535+
if ($utf8 && ($flags & PREG_OFFSET_CAPTURE)) {
536+
return self::bytesToChars($subject, [$m])[0];
537+
}
538+
539+
return $m;
521540
}
522541

523542

@@ -532,10 +551,16 @@ public static function matchAll(
532551
int $offset = 0,
533552
bool $unmatchedAsNull = false,
534553
bool $patternOrder = false,
554+
bool $utf8 = false,
535555
): array {
536556
$flags = is_int($captureOffset) && $captureOffset // back compatibility
537557
? $captureOffset
538558
: ($captureOffset ? PREG_OFFSET_CAPTURE : 0) | ($unmatchedAsNull ? PREG_UNMATCHED_AS_NULL : 0) | ($patternOrder ? PREG_PATTERN_ORDER : 0);
559+
if ($utf8) {
560+
$offset = strlen(self::substring($subject, 0, $offset));
561+
$pattern .= 'u';
562+
}
563+
539564
if ($offset > strlen($subject)) {
540565
return [];
541566
}
@@ -545,6 +570,10 @@ public static function matchAll(
545570
($flags & PREG_PATTERN_ORDER) ? $flags : ($flags | PREG_SET_ORDER),
546571
$offset,
547572
]);
573+
if ($utf8 && ($flags & PREG_OFFSET_CAPTURE)) {
574+
return self::bytesToChars($subject, $m);
575+
}
576+
548577
return $m;
549578
}
550579

@@ -559,24 +588,56 @@ public static function replace(
559588
int $limit = -1,
560589
bool $captureOffset = false,
561590
bool $unmatchedAsNull = false,
591+
bool $utf8 = false,
562592
): string {
563593
if (is_object($replacement) || is_array($replacement)) {
564594
if (!is_callable($replacement, false, $textual)) {
565595
throw new Nette\InvalidStateException("Callback '$textual' is not callable.");
566596
}
567597

568598
$flags = ($captureOffset ? PREG_OFFSET_CAPTURE : 0) | ($unmatchedAsNull ? PREG_UNMATCHED_AS_NULL : 0);
599+
if ($utf8) {
600+
$pattern .= 'u';
601+
if ($captureOffset) {
602+
$replacement = fn($m) => $replacement(self::bytesToChars($subject, [$m])[0]);
603+
}
604+
}
605+
569606
return self::pcre('preg_replace_callback', [$pattern, $replacement, $subject, $limit, 0, $flags]);
570607

571608
} elseif (is_array($pattern) && is_string(key($pattern))) {
572609
$replacement = array_values($pattern);
573610
$pattern = array_keys($pattern);
574611
}
575612

613+
if ($utf8) {
614+
$pattern = array_map(fn($item) => $item . 'u', (array) $pattern);
615+
}
616+
576617
return self::pcre('preg_replace', [$pattern, $replacement, $subject, $limit]);
577618
}
578619

579620

621+
/**
 * Converts the byte offsets in offset-capturing PCRE results to character offsets.
 *
 * Each entry of every group is expected to be a [text, byteOffset] pair, as produced
 * by the preg_* functions with PREG_OFFSET_CAPTURE / PREG_SPLIT_OFFSET_CAPTURE.
 * The second element of each pair is rewritten in place; the text is left untouched.
 *
 * @param  string  $s       subject the byte offsets refer to
 * @param  array   $groups  list of match sets (a single result must be wrapped in an array)
 * @return array   the same structure with byte offsets replaced by character offsets
 */
private static function bytesToChars(string $s, array $groups): array
{
	// Running cursor: $lastChars is the character offset that corresponds to byte
	// offset $lastBytes. Only the slice between two consecutive offsets is measured,
	// so the subject is not rescanned from its beginning for every entry.
	$lastBytes = $lastChars = 0;
	foreach ($groups as &$matches) {
		foreach ($matches as &$match) {
			if ($match[1] < 0) {
				// PCRE reports -1 for a subpattern that did not participate in the match.
				// It is not a position in $s: passing it to substr() would read from the
				// end of the string and corrupt the cursor for the following entries.
				continue;
			}

			if ($match[1] > $lastBytes) {
				// moving forward: add the characters between the cursor and the new offset
				$lastChars += self::length(substr($s, $lastBytes, $match[1] - $lastBytes));
			} elseif ($match[1] < $lastBytes) {
				// moving backward (e.g. a capture group inside the previous match)
				$lastChars -= self::length(substr($s, $match[1], $lastBytes - $match[1]));
			}

			$lastBytes = $match[1];
			$match[1] = $lastChars;
		}

		unset($match); // do not leave a dangling reference into the previous set
	}

	unset($matches);

	return $groups;
}
639+
640+
580641
/** @internal */
581642
public static function pcre(string $func, array $args)
582643
{

tests/Utils/Strings.match().phpt

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,22 @@ Assert::same(['hell', 'l'], Strings::match('hello world!', '#([e-l])+#'));
1919

2020
Assert::same(['hell'], Strings::match('hello world!', '#[e-l]+#'));
2121

22-
Assert::same([['hell', 0]], Strings::match('hello world!', '#[e-l]+#', PREG_OFFSET_CAPTURE));
23-
Assert::same([['hell', 0]], Strings::match('hello world!', '#[e-l]+#', captureOffset: true));
22+
Assert::same([['l', 2]], Strings::match('žluťoučký kůň', '#[e-l]+#u', PREG_OFFSET_CAPTURE));
23+
Assert::same([['l', 2]], Strings::match('žluťoučký kůň', '#[e-l]+#u', captureOffset: true));
2424

25+
Assert::same([['l', 1]], Strings::match('žluťoučký kůň', '#[e-l]+#u', captureOffset: true, utf8: true));
2526
Assert::same(['e', null], Strings::match('hello world!', '#e(x)*#', unmatchedAsNull: true));
2627
Assert::same(['e', null], Strings::match('hello world!', '#e(x)*#', 0, 0, unmatchedAsNull: true)); // $flags = 0
2728

2829
Assert::same(['ll'], Strings::match('hello world!', '#[e-l]+#', offset: 2));
2930

31+
Assert::same(['l'], Strings::match('žluťoučký kůň', '#[e-l]+#u', offset: 2));
32+
33+
Assert::same(['k'], Strings::match('žluťoučký kůň', '#[e-l]+#u', utf8: true, offset: 2));
34+
35+
Assert::same(['žluťoučký'], Strings::match('žluťoučký kůň', '#\w+#', utf8: true)); // without modifier
36+
37+
Assert::same([['k', 7]], Strings::match('žluťoučký kůň', '#[e-l]+#u', captureOffset: true, utf8: true, offset: 2));
38+
3039
Assert::null(Strings::match('hello world!', '', offset: 50));
3140
Assert::null(Strings::match('', '', offset: 1));

tests/Utils/Strings.matchAll().phpt

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,14 +45,31 @@ Assert::same([
4545
[['u', 3], ['u', 7], ['', 11], ['', 15]],
4646
], Strings::matchAll('žluťoučký kůň!', '#([a-z])([a-z]*)#u', PREG_OFFSET_CAPTURE | PREG_PATTERN_ORDER));
4747

48+
Assert::same([
49+
[['lu', 1], ['l', 1], ['u', 2]],
50+
[['ou', 4], ['o', 4], ['u', 5]],
51+
[['k', 7], ['k', 7], ['', 8]],
52+
[['k', 10], ['k', 10], ['', 11]],
53+
], Strings::matchAll('žluťoučký kůň!', '#([a-z])([a-z]*)#u', captureOffset: true, utf8: true));
54+
4855
Assert::same([
4956
[['lu', 2], ['ou', 6], ['k', 10], ['k', 14]],
5057
[['l', 2], ['o', 6], ['k', 10], ['k', 14]],
5158
[['u', 3], ['u', 7], ['', 11], ['', 15]],
5259
], Strings::matchAll('žluťoučký kůň!', '#([a-z])([a-z]*)#u', captureOffset: true, patternOrder: true));
5360

61+
Assert::same([
62+
[['lu', 1], ['ou', 4], ['k', 7], ['k', 10]],
63+
[['l', 1], ['o', 4], ['k', 7], ['k', 10]],
64+
[['u', 2], ['u', 5], ['', 8], ['', 11]],
65+
], Strings::matchAll('žluťoučký kůň!', '#([a-z])([a-z]*)#u', captureOffset: true, patternOrder: true, utf8: true));
66+
5467
Assert::same([['l'], ['k'], ['k']], Strings::matchAll('žluťoučký kůň', '#[e-l]+#u', offset: 2));
5568

69+
Assert::same([['k'], ['k']], Strings::matchAll('žluťoučký kůň', '#[e-l]+#u', offset: 2, utf8: true));
70+
71+
Assert::same([['žluťoučký'], ['kůň']], Strings::matchAll('žluťoučký kůň', '#\w+#', utf8: true)); // without modifier
72+
5673
Assert::same([['ll', 'l']], Strings::matchAll('hello world!', '#[e-l]+#', PREG_PATTERN_ORDER, 2));
5774
Assert::same([['ll', 'l']], Strings::matchAll('hello world!', '#[e-l]+#', offset: 2, patternOrder: true));
5875

tests/Utils/Strings.replace().phpt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,4 +37,10 @@ Assert::same(' !', Strings::replace('hello world!', ['#\w#']));
3737

3838
// flags & callback
3939
Assert::same('hell0o worl9d!', Strings::replace('hello world!', '#[e-l]+#', fn($m) => implode('', $m[0]), captureOffset: true));
40+
Assert::same('žl1uťoučk7ý k10ůň!', Strings::replace('žluťoučký kůň!', '#[e-l]+#u', fn($m) => implode('', $m[0]), captureOffset: true, utf8: true));
4041
Strings::replace('hello world!', '#e(x)*#', fn($m) => Assert::null($m[1]), unmatchedAsNull: true);
42+
43+
// utf-8 without modifier
44+
Assert::same('* *', Strings::replace('žluťoučký kůň', '#\w+#', fn() => '*', utf8: true));
45+
Assert::same('* *', Strings::replace('žluťoučký kůň', '#\w+#', '*', utf8: true));
46+
Assert::same('* *', Strings::replace('žluťoučký kůň', ['#\w+#'], '*', utf8: true));

tests/Utils/Strings.split().phpt

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -46,11 +46,29 @@ Assert::same([
4646
], Strings::split('a, b, c', '#(,)\s*#', PREG_SPLIT_OFFSET_CAPTURE));
4747

4848
Assert::same([
49-
['a', 0],
50-
[',', 1],
51-
['b', 3],
52-
[',', 4],
53-
['c', 6],
54-
], Strings::split('a, b, c', '#(,)\s*#', captureOffset: true));
49+
['ž', 0],
50+
['lu', 2],
51+
['ť', 4],
52+
['ou', 6],
53+
['č', 8],
54+
['k', 10],
55+
['ý ', 11],
56+
['k', 14],
57+
['ůň', 15],
58+
], Strings::split('žluťoučký kůň', '#([a-z]+)\s*#u', captureOffset: true));
59+
60+
Assert::same([
61+
['ž', 0],
62+
['lu', 1],
63+
['ť', 3],
64+
['ou', 4],
65+
['č', 6],
66+
['k', 7],
67+
['ý ', 8],
68+
['k', 10],
69+
['ůň', 11],
70+
], Strings::split('žluťoučký kůň', '#([a-z]+)\s*#u', captureOffset: true, utf8: true));
71+
72+
Assert::same(['', ' ', ''], Strings::split('žluťoučký kůň', '#\w+#', utf8: true)); // without modifier
5573

5674
Assert::same(['a', ',', 'b, c'], Strings::split('a, b, c', '#(,)\s*#', limit: 2));

0 commit comments

Comments
 (0)