Skip to content

Commit 13edc1c

Browse files
committed
Replace regex tokenizer with character scanner in JoinSpec
Signed-off-by: Simon Mundy <simon.mundy@peptolab.com>
1 parent eb7441a commit 13edc1c

File tree

1 file changed

+37
-26
lines changed

1 file changed

+37
-26
lines changed

src/Sql/Part/JoinSpec.php

Lines changed: 37 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,11 @@
1919
use function is_array;
2020
use function is_string;
2121
use function key;
22-
use function preg_match_all;
22+
use function ctype_alpha;
23+
use function ctype_alnum;
2324
use function sprintf;
2425
use function strlen;
26+
use function strtoupper;
2527
use function substr;
2628

2729
/**
@@ -31,8 +33,6 @@
3133
*/
3234
final readonly class JoinSpec
3335
{
34-
private const IDENTIFIER_PATTERN = '/\b(?!(?:AS|AND|OR|BETWEEN)\b)([a-zA-Z_]\w*+(?:\.[a-zA-Z_]\w*+)*)(?!\s*\()/i';
35-
3636
public string|TableIdentifier|Select|ExpressionInterface $table;
3737
public JoinTableType $tableType;
3838
public ?string $alias;
@@ -87,35 +87,46 @@ public function __construct(array $join)
8787
$this->onTokens = is_string($on) ? self::tokenizeOn($on) : null;
8888
}
8989

/**
 * Tokenize a string ON clause into Identifier and Literal tokens.
 *
 * The clause is scanned one byte at a time. A word that starts with a letter
 * or underscore, optionally dot-qualified (e.g. "t1.id"), becomes an
 * Identifier unless it is one of the keywords AND, OR, AS or BETWEEN
 * (case-insensitive) or is followed by an opening parenthesis, i.e. it is a
 * function call. Every other span of the input is emitted verbatim as a
 * Literal, so concatenating the token texts reproduces the input.
 *
 * @param string $on Raw ON clause text.
 * @return ArgumentInterface[] Tokens in source order; a single Literal
 *                             wrapping the input when nothing else was emitted.
 */
private static function tokenizeOn(string $on): array
{
    $tokens       = [];
    $len          = strlen($on);
    $pos          = 0;
    $literalStart = 0;

    while ($pos < $len) {
        $ch = $on[$pos];

        if ($ch !== '_' && ! ctype_alpha($ch)) {
            $pos++;

            // A token that starts with a digit ("123abc", "1e5", "0x1F") is never an
            // identifier. Consume the whole run so its letters cannot start a word.
            if (ctype_alnum($ch)) {
                while ($pos < $len && ($on[$pos] === '_' || ctype_alnum($on[$pos]))) {
                    $pos++;
                }
            }

            continue;
        }

        $wordStart = $pos++;

        while ($pos < $len) {
            $next = $on[$pos];

            if ($next === '_' || ctype_alnum($next)) {
                $pos++;
                continue;
            }

            // A dot only extends a qualified name when an identifier start follows it;
            // a trailing or doubled dot, or ".1", stays in the surrounding literal.
            if (
                $next === '.'
                && $pos + 1 < $len
                && ($on[$pos + 1] === '_' || ctype_alpha($on[$pos + 1]))
            ) {
                $pos += 2;
                continue;
            }

            break;
        }

        $word = substr($on, $wordStart, $pos - $wordStart);

        $isKeyword = match (strtoupper($word)) {
            'AND', 'OR', 'AS', 'BETWEEN' => true,
            default => false,
        };

        // Keywords and function names are left inside the pending literal span.
        if ($isKeyword || self::precedesParenthesis($on, $pos)) {
            continue;
        }

        if ($wordStart > $literalStart) {
            $tokens[] = new Literal(substr($on, $literalStart, $wordStart - $literalStart));
        }

        $tokens[]     = new Identifier($word);
        $literalStart = $pos;
    }

    if ($literalStart < $len) {
        $tokens[] = new Literal(substr($on, $literalStart));
    }

    // Only an empty input leaves the list empty; keep returning one Literal for it.
    if ($tokens === []) {
        return [new Literal($on)];
    }

    return $tokens;
}

/**
 * Report whether the first non-whitespace byte at or after $pos is "(".
 *
 * Lets "COUNT (x)" be recognised as a function call just like "COUNT(x)".
 */
private static function precedesParenthesis(string $on, int $pos): bool
{
    $len = strlen($on);

    while ($pos < $len) {
        $ch = $on[$pos];

        if ($ch === '(') {
            return true;
        }

        if ($ch !== ' ' && $ch !== "\t" && $ch !== "\n" && $ch !== "\r" && $ch !== "\f" && $ch !== "\v") {
            return false;
        }

        $pos++;
    }

    return false;
}
121132
}

0 commit comments

Comments
 (0)