Skip to content

Commit 11747d7

Browse files
committed
Strings: added support for UTF8 offsets in regexp
1 parent 619eb3e commit 11747d7

File tree

5 files changed

+114
-12
lines changed

5 files changed

+114
-12
lines changed

src/Utils/Strings.php

Lines changed: 56 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -475,11 +475,17 @@ public static function split(
475475
bool|int $captureOffset = false,
476476
bool $skipEmpty = false,
477477
int $limit = -1,
478+
bool $utf8 = false,
478479
): array {
479480
$flags = is_int($captureOffset) && $captureOffset // back compatibility
480481
? $captureOffset
481482
: ($captureOffset ? PREG_SPLIT_OFFSET_CAPTURE : 0) | ($skipEmpty ? PREG_SPLIT_NO_EMPTY : 0);
482-
return self::pcre('preg_split', [$pattern, $subject, $limit, $flags | PREG_SPLIT_DELIM_CAPTURE]);
483+
$pattern .= $utf8 ? 'u' : '';
484+
$m = self::pcre('preg_split', [$pattern, $subject, $limit, $flags | PREG_SPLIT_DELIM_CAPTURE]);
485+
if ($utf8 && ($flags & PREG_SPLIT_OFFSET_CAPTURE)) {
486+
return self::bytesToChars($subject, [$m])[0];
487+
}
488+
return $m;
483489
}
484490

485491

@@ -492,16 +498,25 @@ public static function match(
492498
bool|int $captureOffset = false,
493499
int $offset = 0,
494500
bool $unmatchedAsNull = false,
501+
bool $utf8 = false,
495502
): ?array {
496503
$flags = is_int($captureOffset) && $captureOffset // back compatibility
497504
? $captureOffset
498505
: ($captureOffset ? PREG_OFFSET_CAPTURE : 0) | ($unmatchedAsNull ? PREG_UNMATCHED_AS_NULL : 0);
506+
if ($utf8) {
507+
$offset = strlen(self::substring($subject, 0, $offset));
508+
$pattern .= 'u';
509+
}
499510
if ($offset > strlen($subject)) {
500511
return null;
501512
}
502-
return self::pcre('preg_match', [$pattern, $subject, &$m, $flags, $offset])
503-
? $m
504-
: null;
513+
if (!self::pcre('preg_match', [$pattern, $subject, &$m, $flags, $offset])) {
514+
return null;
515+
}
516+
if ($utf8 && ($flags & PREG_OFFSET_CAPTURE)) {
517+
return self::bytesToChars($subject, [$m])[0];
518+
}
519+
return $m;
505520
}
506521

507522

@@ -516,10 +531,15 @@ public static function matchAll(
516531
int $offset = 0,
517532
bool $unmatchedAsNull = false,
518533
bool $patternOrder = false,
534+
bool $utf8 = false,
519535
): array {
520536
$flags = is_int($captureOffset) && $captureOffset // back compatibility
521537
? $captureOffset
522538
: ($captureOffset ? PREG_OFFSET_CAPTURE : 0) | ($unmatchedAsNull ? PREG_UNMATCHED_AS_NULL : 0) | ($patternOrder ? PREG_PATTERN_ORDER : 0);
539+
if ($utf8) {
540+
$offset = strlen(self::substring($subject, 0, $offset));
541+
$pattern .= 'u';
542+
}
523543
if ($offset > strlen($subject)) {
524544
return [];
525545
}
@@ -528,6 +548,9 @@ public static function matchAll(
528548
($flags & PREG_PATTERN_ORDER) ? $flags : ($flags | PREG_SET_ORDER),
529549
$offset,
530550
]);
551+
if ($utf8 && ($flags & PREG_OFFSET_CAPTURE)) {
552+
return self::bytesToChars($subject, $m);
553+
}
531554
return $m;
532555
}
533556

@@ -542,23 +565,52 @@ public static function replace(
542565
int $limit = -1,
543566
bool $captureOffset = false,
544567
bool $unmatchedAsNull = false,
568+
bool $utf8 = false,
545569
): string {
546570
if (is_object($replacement) || is_array($replacement)) {
547571
if (!is_callable($replacement, false, $textual)) {
548572
throw new Nette\InvalidStateException("Callback '$textual' is not callable.");
549573
}
550574
$flags = ($captureOffset ? PREG_OFFSET_CAPTURE : 0) | ($unmatchedAsNull ? PREG_UNMATCHED_AS_NULL : 0);
575+
if ($utf8) {
576+
$pattern .= 'u';
577+
if ($captureOffset) {
578+
$replacement = fn($m) => $replacement(self::bytesToChars($subject, [$m])[0]);
579+
}
580+
}
551581
return self::pcre('preg_replace_callback', [$pattern, $replacement, $subject, $limit, 0, $flags]);
552582

553583
} elseif (is_array($pattern) && is_string(key($pattern))) {
554584
$replacement = array_values($pattern);
555585
$pattern = array_keys($pattern);
556586
}
557587

588+
if ($utf8) {
589+
$pattern = array_map(fn($item) => $item . 'u', (array) $pattern);
590+
}
591+
558592
return self::pcre('preg_replace', [$pattern, $replacement, $subject, $limit]);
559593
}
560594

561595

596+
/**
 * Converts the byte offsets in offset-capturing PCRE results to character offsets.
 *
 * Every entry of every group is expected to be a [text, byteOffset] pair, as produced
 * with PREG_OFFSET_CAPTURE or PREG_SPLIT_OFFSET_CAPTURE. The conversion is incremental:
 * only the bytes between the previously converted offset and the current one are
 * measured with self::length(), so the subject is not rescanned from its beginning
 * for each entry. Offsets may move backwards (e.g. a capture group located inside the
 * full match that precedes it), in which case the running character count is decreased.
 *
 * Subpatterns that did not participate in the match are reported by PCRE with the
 * offset -1. Such entries are passed through unchanged and do not move the running
 * position; treating -1 as a real position would make substr() read from the end of
 * the subject and corrupt the offsets of all following entries.
 *
 * @param  string  $s       subject the byte offsets refer to
 * @param  array   $groups  list of lists of [text, byteOffset] pairs
 * @return array   the same structure with each byteOffset replaced by a character offset
 */
private static function bytesToChars(string $s, array $groups): array
{
    $lastBytes = $lastChars = 0;
    foreach ($groups as &$matches) {
        foreach ($matches as &$match) {
            $bytes = $match[1];
            if ($bytes < 0) {
                // unmatched subpattern: keep the -1 marker, do not touch the counters
                continue;
            }

            if ($bytes > $lastBytes) {
                // moved forward: add the characters in the skipped byte range
                $lastChars += self::length(substr($s, $lastBytes, $bytes - $lastBytes));
            } elseif ($bytes < $lastBytes) {
                // moved backward: subtract the characters we are stepping back over
                $lastChars -= self::length(substr($s, $bytes, $lastBytes - $bytes));
            }

            $lastBytes = $bytes;
            $match[1] = $lastChars;
        }
        unset($match); // drop the dangling reference before the next inner loop
    }
    unset($matches);

    return $groups;
}
612+
613+
562614
/** @internal */
563615
public static function pcre(string $func, array $args)
564616
{

tests/Utils/Strings.match().phpt

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,22 @@ Assert::same(['hell', 'l'], Strings::match('hello world!', '#([e-l])+#'));
1919

2020
Assert::same(['hell'], Strings::match('hello world!', '#[e-l]+#'));
2121

22-
Assert::same([['hell', 0]], Strings::match('hello world!', '#[e-l]+#', PREG_OFFSET_CAPTURE));
23-
Assert::same([['hell', 0]], Strings::match('hello world!', '#[e-l]+#', captureOffset: true));
22+
Assert::same([['l', 2]], Strings::match('žluťoučký kůň', '#[e-l]+#u', PREG_OFFSET_CAPTURE));
23+
Assert::same([['l', 2]], Strings::match('žluťoučký kůň', '#[e-l]+#u', captureOffset: true));
2424

25+
Assert::same([['l', 1]], Strings::match('žluťoučký kůň', '#[e-l]+#u', captureOffset: true, utf8: true));
2526
Assert::same(['e', null], Strings::match('hello world!', '#e(x)*#', unmatchedAsNull: true));
2627
Assert::same(['e', null], Strings::match('hello world!', '#e(x)*#', 0, 0, unmatchedAsNull: true)); // $flags = 0
2728

2829
Assert::same(['ll'], Strings::match('hello world!', '#[e-l]+#', offset: 2));
2930

31+
Assert::same(['l'], Strings::match('žluťoučký kůň', '#[e-l]+#u', offset: 2));
32+
33+
Assert::same(['k'], Strings::match('žluťoučký kůň', '#[e-l]+#u', utf8: true, offset: 2));
34+
35+
Assert::same(['žluťoučký'], Strings::match('žluťoučký kůň', '#\w+#', utf8: true)); // without modifier
36+
37+
Assert::same([['k', 7]], Strings::match('žluťoučký kůň', '#[e-l]+#u', captureOffset: true, utf8: true, offset: 2));
38+
3039
Assert::null(Strings::match('hello world!', '', offset: 50));
3140
Assert::null(Strings::match('', '', offset: 1));

tests/Utils/Strings.matchAll().phpt

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,14 +45,31 @@ Assert::same([
4545
[['u', 3], ['u', 7], ['', 11], ['', 15]],
4646
], Strings::matchAll('žluťoučký kůň!', '#([a-z])([a-z]*)#u', PREG_OFFSET_CAPTURE | PREG_PATTERN_ORDER));
4747

48+
Assert::same([
49+
[['lu', 1], ['l', 1], ['u', 2]],
50+
[['ou', 4], ['o', 4], ['u', 5]],
51+
[['k', 7], ['k', 7], ['', 8]],
52+
[['k', 10], ['k', 10], ['', 11]],
53+
], Strings::matchAll('žluťoučký kůň!', '#([a-z])([a-z]*)#u', captureOffset: true, utf8: true));
54+
4855
Assert::same([
4956
[['lu', 2], ['ou', 6], ['k', 10], ['k', 14]],
5057
[['l', 2], ['o', 6], ['k', 10], ['k', 14]],
5158
[['u', 3], ['u', 7], ['', 11], ['', 15]],
5259
], Strings::matchAll('žluťoučký kůň!', '#([a-z])([a-z]*)#u', captureOffset: true, patternOrder: true));
5360

61+
Assert::same([
62+
[['lu', 1], ['ou', 4], ['k', 7], ['k', 10]],
63+
[['l', 1], ['o', 4], ['k', 7], ['k', 10]],
64+
[['u', 2], ['u', 5], ['', 8], ['', 11]],
65+
], Strings::matchAll('žluťoučký kůň!', '#([a-z])([a-z]*)#u', captureOffset: true, patternOrder: true, utf8: true));
66+
5467
Assert::same([['l'], ['k'], ['k']], Strings::matchAll('žluťoučký kůň', '#[e-l]+#u', offset: 2));
5568

69+
Assert::same([['k'], ['k']], Strings::matchAll('žluťoučký kůň', '#[e-l]+#u', offset: 2, utf8: true));
70+
71+
Assert::same([['žluťoučký'], ['kůň']], Strings::matchAll('žluťoučký kůň', '#\w+#', utf8: true)); // without modifier
72+
5673
Assert::same([['ll', 'l']], Strings::matchAll('hello world!', '#[e-l]+#', PREG_PATTERN_ORDER, 2));
5774
Assert::same([['ll', 'l']], Strings::matchAll('hello world!', '#[e-l]+#', offset: 2, patternOrder: true));
5875

tests/Utils/Strings.replace().phpt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,4 +37,10 @@ Assert::same(' !', Strings::replace('hello world!', ['#\w#']));
3737

3838
// flags & callback
3939
Assert::same('hell0o worl9d!', Strings::replace('hello world!', '#[e-l]+#', fn($m) => implode($m[0]), captureOffset: true));
40+
Assert::same('žl1uťoučk7ý k10ůň!', Strings::replace('žluťoučký kůň!', '#[e-l]+#u', fn($m) => implode($m[0]), captureOffset: true, utf8: true));
4041
Strings::replace('hello world!', '#e(x)*#', fn($m) => Assert::null($m[1]), unmatchedAsNull: true);
42+
43+
// utf-8 without modifier
44+
Assert::same('* *', Strings::replace('žluťoučký kůň', '#\w+#', fn() => '*', utf8: true));
45+
Assert::same('* *', Strings::replace('žluťoučký kůň', '#\w+#', '*', utf8: true));
46+
Assert::same('* *', Strings::replace('žluťoučký kůň', ['#\w+#'], '*', utf8: true));

tests/Utils/Strings.split().phpt

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -46,11 +46,29 @@ Assert::same([
4646
], Strings::split('a, b, c', '#(,)\s*#', PREG_SPLIT_OFFSET_CAPTURE));
4747

4848
Assert::same([
49-
['a', 0],
50-
[',', 1],
51-
['b', 3],
52-
[',', 4],
53-
['c', 6],
54-
], Strings::split('a, b, c', '#(,)\s*#', captureOffset: true));
49+
['ž', 0],
50+
['lu', 2],
51+
['ť', 4],
52+
['ou', 6],
53+
['č', 8],
54+
['k', 10],
55+
['ý ', 11],
56+
['k', 14],
57+
['ůň', 15],
58+
], Strings::split('žluťoučký kůň', '#([a-z]+)\s*#u', captureOffset: true));
59+
60+
Assert::same([
61+
['ž', 0],
62+
['lu', 1],
63+
['ť', 3],
64+
['ou', 4],
65+
['č', 6],
66+
['k', 7],
67+
['ý ', 8],
68+
['k', 10],
69+
['ůň', 11],
70+
], Strings::split('žluťoučký kůň', '#([a-z]+)\s*#u', captureOffset: true, utf8: true));
71+
72+
Assert::same(['', ' ', ''], Strings::split('žluťoučký kůň', '#\w+#', utf8: true)); // without modifier
5573

5674
Assert::same(['a', ',', 'b, c'], Strings::split('a, b, c', '#(,)\s*#', limit: 2));

0 commit comments

Comments
 (0)