Skip to content

Commit 1c7315c

Browse files
author
gitlost
committed
Fix single-byte trie. Move regex alts to tools/functions.php.
1 parent d691f24 commit 1c7315c

File tree

6 files changed

+247
-99
lines changed

6 files changed

+247
-99
lines changed

Symfony/Normalizer.php

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,21 +57,25 @@
5757
if ( version_compare( PHP_VERSION, '5.3.4', '<' ) ) {
5858
// If verbs unavailable...
5959
if ( version_compare( substr( PCRE_VERSION, 0, strspn( PCRE_VERSION, '0123456789.' ) ), '7.3', '<' ) ) {
60+
// Typically PHP 5.2.4 only (with or without UTF-8 mode).
6061
function unfc_is_valid_utf8( $str ) {
6162
return 1 !== preg_match( UNFC_REGEX_IS_INVALID_UTF8, $str ); // Very slow for PHP < 7.
6263
}
6364
} else {
65+
// Typically when UTF-8 mode is unavailable and PHP < 5.3.4, i.e. 5.2.5-17 (last), 5.3.0-3.
6466
function unfc_is_valid_utf8( $str ) {
6567
return 1 !== preg_match( UNFC_REGEX_IS_INVALID_UTF8_SKIP, $str ); // Very slow for PHP < 7.
6668
}
6769
}
6870
} else {
71+
// Typically when UTF-8 mode is unavailable and PHP >= 5.3.4; or when built against PCRE 8.32, i.e. PHP 5.3.24-29 (last), 5.4.14-40, 5.5.0-9.
6972
function unfc_is_valid_utf8( $str ) {
7073
// See https://core.trac.wordpress.org/ticket/29717#comment:11
7174
return '' === $str || '' !== htmlspecialchars( $str, ENT_NOQUOTES, 'UTF-8' );
7275
}
7376
}
7477
} else {
78+
// Typically all PHPs with UTF-8 mode available except 5.2.4 and those built against PCRE 8.32.
7579
function unfc_is_valid_utf8( $str ) {
7680
return 1 === preg_match( '//u', $str ); // Original Normalizer validity check.
7781
}

includes/class-unfc-normalize.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -228,7 +228,7 @@ static function activation_check() {
228228
* Helper to test if using UTF-8.
229229
*/
230230
function is_blog_utf8() {
231-
return in_array( get_option( 'blog_charset' ), array( 'UTF-8', 'utf-8', 'utf8', 'UTF8' ) );
231+
return in_array( get_option( 'blog_charset' ), array( 'UTF-8', 'utf-8', 'utf8', 'UTF8' ), true );
232232
}
233233

234234
/**

tests/toolsTest.php

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,74 @@ function test_utf8_regex_alts() {
7474
}
7575
}
7676

77+
/**
 * Check that runs of consecutive codepoints are folded into ranges, both for the
 * UTF-8 byte alternatives and for the Unicode character-class form.
 *
 * @ticket unfc_utf8_ranges_from_codepoints
 */
function test_utf8_ranges_from_codepoints() {
	// A single three-codepoint run collapses to one range in both output forms.
	$run = array( 0x9, 0x0a, 0xb );
	$utf8_alts = unfc_utf8_regex_alts( unfc_utf8_ranges_from_codepoints( $run ) );
	$this->assertSame( '[\x09-\x0b]', $utf8_alts );
	$unicode_chars = unfc_unicode_regex_chars_from_codepoints( $run );
	$this->assertSame( '\x09-\x0b', $unicode_chars );

	// The set is grown cumulatively: each step gives the codepoints to add and the
	// UTF-8 alternatives expected for everything accumulated so far.
	$steps = array(
		array(
			array( 0x9, 0xa, 0xb, 0xc, 0xd, 0x20 ),
			'[\x09-\x0d\x20]',
		),
		array(
			array( 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa9, 0xaa ),
			'[\x09-\x0d\x20]|\xc2[\xa1-\xa7\xa9\xaa]',
		),
		array(
			array( 0x42, 0x43, 0x5f ),
			'[\x09-\x0d\x20\x42\x43\x5f]|\xc2[\xa1-\xa7\xa9\xaa]',
		),
		array(
			array( 0xe, 0xf, 0x21, 0x22, 0x24 ),
			'[\x09-\x0f\x20-\x22\x24\x42\x43\x5f]|\xc2[\xa1-\xa7\xa9\xaa]',
		),
	);

	$accumulated = array();
	foreach ( $steps as $step ) {
		$accumulated = array_merge( $accumulated, $step[0] );
		sort( $accumulated ); // Run detection relies on ascending order.
		$utf8_alts = unfc_utf8_regex_alts( unfc_utf8_ranges_from_codepoints( $accumulated ) );
		$this->assertSame( $step[1], $utf8_alts );
	}
}
108+
109+
/**
 * Parse UnicodeData.txt, take the codepoints collected under the "Z" key, and check
 * that the UTF-8 byte alternatives and the Unicode character-class form both have
 * the expected text and agree when matched against a sample string.
 *
 * @ticket unfc_utf8_parse_unicode_data
 */
function test_utf8_parse_unicode_data() {
	$data_file = 'tests/UCD-9.0.0/UnicodeData.txt';
	$collector = __CLASS__ . '::parse_unicode_data_cb';

	$by_category = unfc_parse_unicode_data( $data_file, $collector );
	$this->assertFalse( empty( $by_category['Z'] ) );

	// Both generators detect runs between neighbouring elements, so sort first.
	sort( $by_category['Z'] );

	$utf8_ranges = unfc_utf8_ranges_from_codepoints( $by_category['Z'] );
	$utf8_alts = unfc_utf8_regex_alts( $utf8_ranges );
	$this->assertSame( '\x20|\xc2\xa0|\xe1\x9a\x80|\xe2(?:\x80[\x80-\x8a\xa8\xa9\xaf]|\x81\x9f)|\xe3\x80\x80', $utf8_alts );

	$unicode_chars = unfc_unicode_regex_chars_from_codepoints( $by_category['Z'] );
	$this->assertSame( '\x20\xa0\x{1680}\x{2000}-\x{200a}\x{2028}\x{2029}\x{202f}\x{205f}\x{3000}', $unicode_chars );

	// The byte-level pattern and the /u character class must give the same match result.
	$sample = " \x20\xe2\x80\x89";
	$byte_result = preg_match( '/' . $utf8_alts . '/', $sample );
	$unicode_result = preg_match( '/[' . $unicode_chars . ']/u', $sample );
	$this->assertSame( $byte_result, $unicode_result );
}
125+
126+
/**
 * Callback for unfc_parse_unicode_data(): files each codepoint under its general
 * category (third field of the record) and, when that category is longer than one
 * character, also under its first character (the super-category).
 *
 * Only $codepoints, $cp and $parts are used; the remaining parameters are accepted
 * because the parser passes them to every callback.
 *
 * @param array  $codepoints  Collected codepoints keyed by category, modified in place.
 * @param int    $cp          Codepoint being reported.
 * @param string $name        Name field of the record (unused).
 * @param array  $parts       Fields of the record; index 2 is read as the general category.
 * @param bool   $in_interval Unused.
 * @param int    $first_cp    Unused.
 * @param int    $last_cp     Unused.
 */
static function parse_unicode_data_cb( &$codepoints, $cp, $name, $parts, $in_interval, $first_cp, $last_cp ) {
	$general_cat = $parts[2];
	$general_cat_super = strlen( $general_cat ) > 1 ? $general_cat[0] : null;

	if ( ! isset( $codepoints[ $general_cat ] ) ) {
		$codepoints[ $general_cat ] = array();
	}
	$codepoints[ $general_cat ][] = $cp;

	if ( $general_cat_super ) {
		// Test the bucket, not the key variable: inside this branch the variable is
		// always set, so testing it made the initialisation unreachable.
		if ( ! isset( $codepoints[ $general_cat_super ] ) ) {
			$codepoints[ $general_cat_super ] = array();
		}
		$codepoints[ $general_cat_super ][] = $cp;
	}
}
144+
77145
/**
78146
* @ticket unfc_u_equivalence
79147
*/

tools/functions.php

Lines changed: 151 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -198,13 +198,19 @@ function unfc_utf8_preg_fmt_range_entry( $range_entry ) {
198198
function unfc_utf8_trie( &$trie, $range, $idx = 0, $parent = null, $fmt = null ) {
199199
$cnt = count( $range );
200200

201-
if ( $idx === $cnt - 1 && $parent ) {
202-
// If at lowest index, append formatted range to parent rather than add trie.
203-
if ( is_array( $parent[ $fmt ] ) ) {
204-
$parent[ $fmt ] = '';
201+
if ( $idx === $cnt - 1 ) {
202+
if ( $parent ) {
203+
// If at lowest index, append formatted range to parent rather than add trie.
204+
if ( is_array( $parent[ $fmt ] ) ) {
205+
$parent[ $fmt ] = '';
206+
}
207+
$parent[ $fmt ] .= unfc_utf8_preg_fmt_range_entry( $range[ $idx ] );
208+
} else {
209+
if ( ! isset( $trie[''] ) ) {
210+
$trie[''] = '';
211+
}
212+
$trie[''] .= unfc_utf8_preg_fmt_range_entry( $range[ $idx ] );
205213
}
206-
$parent[ $fmt ] .= unfc_utf8_preg_fmt_range_entry( $range[ $idx ] );
207-
208214
} elseif ( $idx < $cnt && is_array( $trie ) ) {
209215
// Create key entry (trie).
210216
$fmt = unfc_utf8_preg_fmt_range_entry( $range[ $idx ] );
@@ -257,12 +263,150 @@ function unfc_utf8_regex_alts( $ranges ) {
257263
}
258264

259265
unfc_utf8_trie( $trie, $range, $idx );
260-
//error_log( "loop trie=" . print_r( $trie, true ) );
261266
}
262267

263268
return implode( '|', unfc_utf8_trie_regex_alts( $trie ) );
264269
}
265270

271+
/**
 * Calculate the UTF-8 byte sequence ranges from unicode codepoints.
 *
 * Groups neighbouring consecutive codepoints into runs and appends one result per run:
 * a lone codepoint as a single unfc_utf8_ints() entry, a run of exactly two as two
 * separate unfc_utf8_ints() entries, and a longer run via unfc_utf8_ranges().
 *
 * @param array $codepoints Integer codepoints. Runs are only detected between neighbouring
 *                          elements, so pass them in ascending order.
 * @return array Range entries suitable for unfc_utf8_regex_alts(); empty for empty input.
 */
function unfc_utf8_ranges_from_codepoints( $codepoints ) {
	$ranges = array();

	// Nothing to encode: without this guard array_shift() yields null and a bogus
	// entry for "codepoint null" would be emitted.
	if ( empty( $codepoints ) ) {
		return $ranges;
	}

	$first = $last = array_shift( $codepoints );

	// A trailing null never equals $last + 1, so it forces the final run to be
	// flushed by the same code path as every other run.
	$codepoints[] = null;

	foreach ( $codepoints as $codepoint ) {
		if ( $codepoint === $last + 1 ) {
			// Still inside the current run; just extend it.
			$last = $codepoint;
			continue;
		}

		if ( $first === $last ) {
			$ranges[] = unfc_utf8_ints( $first );
		} elseif ( $first + 1 === $last ) {
			// A two-element run is emitted as two singles rather than a range.
			$ranges[] = unfc_utf8_ints( $first );
			$ranges[] = unfc_utf8_ints( $last );
		} else {
			unfc_utf8_ranges( $ranges, $first, $last );
		}

		$first = $last = $codepoint;
	}

	return $ranges;
}
312+
313+
314+
/**
 * Calculate the Unicode regular expression character-class contents from unicode codepoints.
 *
 * Each codepoint is formatted with unfc_unicode_preg_fmt(). Neighbouring consecutive
 * codepoints are folded into runs: a run of exactly two is written as the two characters
 * side by side, a longer run as "first-last". The result carries no enclosing brackets;
 * callers wrap it in "[...]" themselves.
 *
 * @param array $codepoints Integer codepoints. Runs are only detected between neighbouring
 *                          elements, so pass them in ascending order.
 * @return string Character-class contents; the empty string for empty input.
 */
function unfc_unicode_regex_chars_from_codepoints( $codepoints ) {
	$regex_alts = '';

	// Nothing to format: without this guard array_shift() yields null and a bogus
	// entry for "codepoint null" would be emitted.
	if ( empty( $codepoints ) ) {
		return $regex_alts;
	}

	$first = $last = array_shift( $codepoints );

	// A trailing null never equals $last + 1, so it forces the final run to be
	// flushed by the same code path as every other run.
	$codepoints[] = null;

	foreach ( $codepoints as $codepoint ) {
		if ( $codepoint === $last + 1 ) {
			// Still inside the current run; just extend it.
			$last = $codepoint;
			continue;
		}

		if ( $first === $last ) {
			$regex_alts .= unfc_unicode_preg_fmt( $first );
		} else {
			// No dash for a two-element run: listing both characters is shorter.
			$regex_alts .= unfc_unicode_preg_fmt( $first ) . ( $first + 1 === $last ? '' : '-' ) . unfc_unicode_preg_fmt( $last );
		}

		$first = $last = $codepoint;
	}

	return $regex_alts;
}
345+
346+
/**
 * Parse the Unicode data file http://www.unicode.org/Public/9.0.0/ucd/UnicodeData.txt
 * Calls the $callback to collect codepoints of interest in the passed-in $codepoints array, which is returned.
 * In particular, deals with intervals, calling the $callback for each codepoint in the interval.
 *
 * The callback is invoked as
 *     $callback( &$codepoints, $cp, $name, $parts, $in_interval, $first_cp, $last_cp )
 * once per ordinary record, once for the "First>" record of an interval (with $last_cp 0),
 * and once for every further codepoint up to and including the "Last>" record. A callback
 * return value of false is logged; parsing continues regardless.
 *
 * @param string   $file     Path of the data file.
 * @param callable $callback Collector; any PHP callable, including a 'Class::method' string.
 * @return array|false The array filled in by the callback, or false if the file could not be read.
 */
function unfc_parse_unicode_data( $file, $callback ) {

	// Read the file.

	if ( false === ( $get = file_get_contents( $file ) ) ) {
		error_log( "unfc_parse_unicode_data: failed to read file '$file'" );
		return false;
	}

	$lines = array_map( 'unfc_get_cb', explode( "\n", $get ) ); // Strip newlines.

	$first = 'First>';
	$first_len_minus = -strlen( $first );
	$last = 'Last>';
	$last_len_minus = -strlen( $last );

	// Parse the file.

	$codepoints = array();
	$line_num = 0;
	$in_interval = false;
	$first_cp = 0;
	foreach ( $lines as $line ) {
		$line_num++;
		$line = trim( $line );
		if ( '' === $line ) {
			continue;
		}
		$parts = array_map( 'trim', explode( ';', $line ) );

		// A record needs at least a codepoint and a name; anything else would yield an
		// undefined name and a garbage hexdec() codepoint.
		if ( count( $parts ) < 2 ) {
			error_log( "unfc_parse_unicode_data: malformed line line_num=$line_num" );
			continue;
		}

		$name = $parts[1];

		if ( $in_interval ) {
			if ( $last === substr( $name, $last_len_minus ) ) {
				$last_cp = hexdec( $parts[0] );
				// The first codepoint of the interval was already reported on the "First>" line.
				for ( $cp = $first_cp + 1; $cp <= $last_cp; $cp++ ) {
					// call_user_func_array() accepts every callable form on all supported PHP
					// versions; the explicit reference lets the callback fill $codepoints.
					$args = array( &$codepoints, $cp, $name, $parts, $in_interval, $first_cp, $last_cp );
					if ( false === call_user_func_array( $callback, $args ) ) {
						error_log( "unfc_parse_unicode_data: user func fail line_num=$line_num" );
					}
				}
			} else {
				error_log( "unfc_parse_unicode_data: invalid first/last pair line_num=$line_num" );
			}
			$in_interval = false;
		} else {
			$cp = hexdec( $parts[0] );
			if ( $first === substr( $name, $first_len_minus ) ) {
				$in_interval = true;
				$first_cp = $cp;
			}
			$args = array( &$codepoints, $cp, $name, $parts, $in_interval, $first_cp, 0 /*$last_cp*/ );
			if ( false === call_user_func_array( $callback, $args ) ) {
				error_log( "unfc_parse_unicode_data: user func fail line_num=$line_num" );
			}
		}
	}

	// A "First>" record with no matching "Last>" means the interval was never expanded.
	if ( $in_interval ) {
		error_log( "unfc_parse_unicode_data: unterminated interval at end of file first_cp=$first_cp" );
	}

	return $codepoints;
}
409+
266410
/**
267411
* Strip any invalid UTF-8 sequences from string.
268412
*/

tools/gen_unfc_regex_alts.php

Lines changed: 3 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -197,40 +197,7 @@ function __( $str, $td ) { return $str; }
197197

198198
// Calculate the UTF-8 byte sequence ranges from the unicode codepoints.
199199

200-
$ranges = array();
201-
$tmp_codepoints = $codepoints[ $idx ];
202-
$last = array_shift( $tmp_codepoints );
203-
$first = $last;
204-
$carry = null;
205-
foreach ( $tmp_codepoints as $codepoint ) {
206-
if ( $codepoint === $last + 1 ) {
207-
$carry = $codepoint;
208-
} else {
209-
if ( null === $carry ) {
210-
$ranges[] = unfc_utf8_ints( $last );
211-
} else {
212-
if ( $first + 1 === $carry ) {
213-
$ranges[] = unfc_utf8_ints( $first );
214-
$ranges[] = unfc_utf8_ints( $carry );
215-
} else {
216-
unfc_utf8_ranges( $ranges, $first, $carry );
217-
}
218-
$carry = null;
219-
}
220-
$first = $codepoint;
221-
}
222-
$last = $codepoint;
223-
}
224-
if ( null === $carry ) {
225-
$ranges[] = unfc_utf8_ints( $last );
226-
} else {
227-
if ( $first + 1 === $carry ) {
228-
$ranges[] = unfc_utf8_ints( $first );
229-
$ranges[] = unfc_utf8_ints( $carry );
230-
} else {
231-
unfc_utf8_ranges( $ranges, $first, $carry );
232-
}
233-
}
200+
$ranges = unfc_utf8_ranges_from_codepoints( $codepoints[ $idx ] );
234201
//error_log( "ranges=" . print_r( unfc_array_map_recursive( 'unfc_utf8_preg_fmt', $ranges ), true ) );
235202

236203
// Generate the regular expression alternatives.
@@ -262,33 +229,8 @@ function __( $str, $td ) { return $str; }
262229
$out[] = '';
263230
$out[] = 'if ( ( defined( \'WP_DEBUG\' ) && WP_DEBUG ) ) {';
264231
foreach ( $out_idxs as $idx ) {
265-
$regex_alts = '';
266-
267-
// Unicode (UTF-16) regular expression alternatives.
268-
269-
$tmp_codepoints = $codepoints[ $idx ];
270-
$last = array_shift( $tmp_codepoints );
271-
$first = $last;
272-
$carry = null;
273-
foreach ( $tmp_codepoints as $codepoint ) {
274-
if ( $codepoint === $last + 1 ) {
275-
$carry = $codepoint;
276-
} else {
277-
if ( null === $carry ) {
278-
$regex_alts .= unfc_unicode_preg_fmt( $last );
279-
} else {
280-
$regex_alts .= unfc_unicode_preg_fmt( $first ) . ( $first + 1 === $carry ? '' : '-' ) . unfc_unicode_preg_fmt( $carry );
281-
$carry = null;
282-
}
283-
$first = $codepoint;
284-
}
285-
$last = $codepoint;
286-
}
287-
if ( null === $carry ) {
288-
$regex_alts .= unfc_unicode_preg_fmt( $last );
289-
} else {
290-
$regex_alts .= unfc_unicode_preg_fmt( $first ) . ( $first + 1 === $carry ? '' : '-' ) . unfc_unicode_preg_fmt( $carry );
291-
}
232+
// Unicode (UTF-16) regular expression charset.
233+
$regex_alts = unfc_unicode_regex_chars_from_codepoints( $codepoints[ $idx ] );
292234

293235
$IDX = strtoupper( $idx );
294236
$out[] = '';

0 commit comments

Comments
 (0)