Skip to content

Commit 5e01542

Browse files
committed
Improve basename(). Avoid calling mblen() for ASCII compatible locales.
1 parent be92009 commit 5e01542

File tree

4 files changed

+165
-59
lines changed

4 files changed

+165
-59
lines changed

Zend/zend_globals.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,10 @@ struct _zend_compiler_globals {
9595
bool skip_shebang;
9696
bool increment_lineno;
9797

98+
bool variable_width_locale; /* UTF-8, Shift-JIS, Big5, ISO 2022, EUC, etc */
99+
bool ascii_compatible_locale; /* locale uses ASCII characters as singletons */
100+
/* and don't use them as lead/trail units */
101+
98102
zend_string *doc_comment;
99103
uint32_t extra_fn_flags;
100104

Zend/zend_operators.c

Lines changed: 84 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,12 +30,21 @@
3030
#include "zend_exceptions.h"
3131
#include "zend_closures.h"
3232

33+
#include <locale.h>
34+
#ifdef HAVE_LANGINFO_H
35+
# include <langinfo.h>
36+
#endif
37+
3338
#ifdef __SSE2__
3439
#include <emmintrin.h>
3540
#endif
3641

42+
#if defined(ZEND_WIN32) && !defined(ZTS) && defined(_MSC_VER)
43+
/* This performance improvement of tolower() on Windows gives 10-18% on bench.php */
44+
#define ZEND_USE_TOLOWER_L 1
45+
#endif
46+
3747
#ifdef ZEND_USE_TOLOWER_L
38-
#include <locale.h>
3948
static _locale_t current_locale = NULL;
4049
/* this is true global! may lead to strange effects on ZTS, but so may setlocale() */
4150
#define zend_tolower(c) _tolower_l(c, current_locale)
@@ -2537,13 +2546,85 @@ ZEND_API bool ZEND_FASTCALL zend_object_is_true(zval *op) /* {{{ */
25372546
}
25382547
/* }}} */
25392548

2540-
#ifdef ZEND_USE_TOLOWER_L
25412549
ZEND_API void zend_update_current_locale(void) /* {{{ */
25422550
{
2551+
#ifdef ZEND_USE_TOLOWER_L
2552+
# if defined(ZEND_WIN32) && defined(_MSC_VER)
25432553
current_locale = _get_current_locale();
2554+
# else
2555+
current_locale = uselocale(0);
2556+
# endif
2557+
#endif
2558+
#if defined(ZEND_WIN32) && defined(_MSC_VER)
2559+
if (MB_CUR_MAX > 1) {
2560+
unsigned int cp = ___lc_codepage_func();
2561+
CG(variable_width_locale) = 1;
2562+
// TODO: EUC-* are also ASCII compatible ???
2563+
CG(ascii_compatible_locale) =
2564+
cp == 65001; /* UTF-8 */
2565+
} else {
2566+
CG(variable_width_locale) = 0;
2567+
CG(ascii_compatible_locale) = 1;
2568+
}
2569+
#elif defined(MB_CUR_MAX)
2570+
/* Check if current locale uses variable width encoding */
2571+
if (MB_CUR_MAX > 1) {
2572+
#if HAVE_NL_LANGINFO
2573+
const char *charmap = nl_langinfo(CODESET);
2574+
#else
2575+
char buf[16];
2576+
const char *charmap = NULL;
2577+
const char *locale = setlocale(LC_CTYPE, NULL);
2578+
2579+
if (locale) {
2580+
const char *dot = strchr(locale, '.');
2581+
const char *modifier;
2582+
2583+
if (dot) {
2584+
dot++;
2585+
modifier = strchr(dot, '@');
2586+
if (!modifier) {
2587+
charmap = dot;
2588+
} else if (modifier - dot < sizeof(buf)) {
2589+
memcpy(buf, dot, modifier - dot);
2590+
buf[modifier - dot] = '\0';
2591+
charmap = buf;
2592+
}
2593+
}
2594+
}
2595+
#endif
2596+
CG(variable_width_locale) = 1;
2597+
CG(ascii_compatible_locale) = 0;
2598+
2599+
if (charmap) {
2600+
size_t len = strlen(charmap);
2601+
static const char *ascii_compatible_charmaps[] = {
2602+
"utf-8",
2603+
"utf8",
2604+
// TODO: EUC-* are also ASCII compatible ???
2605+
NULL
2606+
};
2607+
const char **p;
2608+
/* Check if current locale is ASCII compatible */
2609+
for (p = ascii_compatible_charmaps; *p; p++) {
2610+
if (zend_binary_strcasecmp(charmap, len, *p, strlen(*p)) == 0) {
2611+
CG(ascii_compatible_locale) = 1;
2612+
break;
2613+
}
2614+
}
2615+
}
2616+
2617+
} else {
2618+
CG(variable_width_locale) = 0;
2619+
CG(ascii_compatible_locale) = 1;
2620+
}
2621+
#else
2622+
/* We can't determine current charset. Assume the worst case */
2623+
CG(variable_width_locale) = 1;
2624+
CG(ascii_compatible_locale) = 0;
2625+
#endif
25442626
}
25452627
/* }}} */
2546-
#endif
25472628

25482629
static zend_always_inline void zend_str_tolower_impl(char *dest, const char *str, size_t length) /* {{{ */ {
25492630
unsigned char *p = (unsigned char*)str;

Zend/zend_operators.h

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -450,16 +450,7 @@ ZEND_API zend_long ZEND_FASTCALL zend_atol(const char *str, size_t str_len);
450450
#define convert_to_object_ex(zv) convert_to_object(zv)
451451
#define convert_scalar_to_number_ex(zv) convert_scalar_to_number(zv)
452452

453-
#if defined(ZEND_WIN32) && !defined(ZTS) && defined(_MSC_VER)
454-
/* This performance improvement of tolower() on Windows gives 10-18% on bench.php */
455-
#define ZEND_USE_TOLOWER_L 1
456-
#endif
457-
458-
#ifdef ZEND_USE_TOLOWER_L
459453
ZEND_API void zend_update_current_locale(void);
460-
#else
461-
#define zend_update_current_locale()
462-
#endif
463454

464455
/* The offset in bytes between the value and type fields of a zval */
465456
#define ZVAL_OFFSETOF_TYPE \

ext/standard/string.c

Lines changed: 77 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -1465,66 +1465,96 @@ PHP_FUNCTION(strtolower)
14651465
/* {{{ php_basename */
14661466
PHPAPI zend_string *php_basename(const char *s, size_t len, const char *suffix, size_t suffix_len)
14671467
{
1468-
/* State 0 is directly after a directory separator (or at the start of the string).
1469-
* State 1 is everything else. */
1470-
int state = 0;
1471-
const char *basename_start = s;
1472-
const char *basename_end = s;
1473-
while (len > 0) {
1474-
int inc_len = (*s == '\0' ? 1 : php_mblen(s, len));
1468+
const char *basename_start;
1469+
const char *basename_end;
1470+
1471+
if (CG(ascii_compatible_locale)) {
1472+
#ifdef ZEND_WIN32
1473+
if ((len >= 2) && isalpha((int)((unsigned char *)s)[0]) && (s[1] == ':')) {
1474+
s += 2;
1475+
len -= 2;
1476+
}
1477+
#endif
1478+
1479+
basename_end = s + len - 1;
1480+
1481+
/* Strip trailing slashes */
1482+
while (basename_end >= s && IS_SLASH_P(basename_end)) {
1483+
basename_end--;
1484+
}
1485+
if (basename_end < s) {
1486+
return ZSTR_EMPTY_ALLOC();
1487+
}
1488+
1489+
/* Extract filename */
1490+
basename_start = basename_end;
1491+
basename_end++;
1492+
while (basename_start > s && !IS_SLASH_P(basename_start - 1)) {
1493+
basename_start--;
1494+
}
1495+
} else {
1496+
/* State 0 is directly after a directory separator (or at the start of the string).
1497+
* State 1 is everything else. */
1498+
int state = 0;
1499+
1500+
basename_start = s;
1501+
basename_end = s;
1502+
while (len > 0) {
1503+
int inc_len = (*s == '\0' ? 1 : php_mblen(s, len));
14751504

1476-
switch (inc_len) {
1477-
case 0:
1478-
goto quit_loop;
1479-
case 1:
1505+
switch (inc_len) {
1506+
case 0:
1507+
goto quit_loop;
1508+
case 1:
14801509
#if defined(PHP_WIN32)
1481-
if (*s == '/' || *s == '\\') {
1510+
if (*s == '/' || *s == '\\') {
14821511
#else
1483-
if (*s == '/') {
1512+
if (*s == '/') {
14841513
#endif
1485-
if (state == 1) {
1486-
state = 0;
1487-
basename_end = s;
1488-
}
1514+
if (state == 1) {
1515+
state = 0;
1516+
basename_end = s;
1517+
}
14891518
#if defined(PHP_WIN32)
1490-
/* Catch relative paths in c:file.txt style. They're not to confuse
1491-
with the NTFS streams. This part ensures also, that no drive
1492-
letter traversing happens. */
1493-
} else if ((*s == ':' && (s - basename_start == 1))) {
1494-
if (state == 0) {
1495-
basename_start = s;
1496-
state = 1;
1519+
/* Catch relative paths in c:file.txt style. They're not to confuse
1520+
with the NTFS streams. This part ensures also, that no drive
1521+
letter traversing happens. */
1522+
} else if ((*s == ':' && (s - basename_start == 1))) {
1523+
if (state == 0) {
1524+
basename_start = s;
1525+
state = 1;
1526+
} else {
1527+
basename_end = s;
1528+
state = 0;
1529+
}
1530+
#endif
14971531
} else {
1498-
basename_end = s;
1499-
state = 0;
1532+
if (state == 0) {
1533+
basename_start = s;
1534+
state = 1;
1535+
}
1536+
}
1537+
break;
1538+
default:
1539+
if (inc_len < 0) {
1540+
/* If character is invalid, treat it like other non-significant characters. */
1541+
inc_len = 1;
1542+
php_mb_reset();
15001543
}
1501-
#endif
1502-
} else {
15031544
if (state == 0) {
15041545
basename_start = s;
15051546
state = 1;
15061547
}
1507-
}
1508-
break;
1509-
default:
1510-
if (inc_len < 0) {
1511-
/* If character is invalid, treat it like other non-significant characters. */
1512-
inc_len = 1;
1513-
php_mb_reset();
1514-
}
1515-
if (state == 0) {
1516-
basename_start = s;
1517-
state = 1;
1518-
}
1519-
break;
1548+
break;
1549+
}
1550+
s += inc_len;
1551+
len -= inc_len;
15201552
}
1521-
s += inc_len;
1522-
len -= inc_len;
1523-
}
15241553

15251554
quit_loop:
1526-
if (state == 1) {
1527-
basename_end = s;
1555+
if (state == 1) {
1556+
basename_end = s;
1557+
}
15281558
}
15291559

15301560
if (suffix != NULL && suffix_len < (size_t)(basename_end - basename_start) &&
@@ -4604,7 +4634,6 @@ static zend_string *try_setlocale_str(zend_long cat, zend_string *loc) {
46044634
retval = setlocale(cat, NULL);
46054635
}
46064636
# endif
4607-
zend_update_current_locale();
46084637
if (!retval) {
46094638
return NULL;
46104639
}
@@ -4615,6 +4644,7 @@ static zend_string *try_setlocale_str(zend_long cat, zend_string *loc) {
46154644

46164645
BG(locale_changed) = 1;
46174646
if (cat == LC_CTYPE || cat == LC_ALL) {
4647+
zend_update_current_locale();
46184648
if (BG(ctype_string)) {
46194649
zend_string_release_ex(BG(ctype_string), 0);
46204650
}

0 commit comments

Comments
 (0)