Skip to content

Commit e670fd7

Browse files
committed
ext/standard: speed up php_url_parse_ex2 by ~12%
Three related changes to ext/standard/url.c targeting the ctype macros on the parse_url hot path. On a 17-URL mix (17M parses per run, CPU pinned, same-session A/B), median wall time drops from 1.90s to 1.68s, a ~12% reduction and a ~13% throughput increase (8.94M/s to 10.10M/s).

1. php_replace_controlchars replaces its iscntrl() call with an inline `c < 0x20 || c == 0x7f` comparison. Callgrind showed iscntrl at ~14% of total instructions on a realistic URL workload; glibc's iscntrl goes through __ctype_b_loc() per byte for a TLS lookup and table deref, which defeats auto-vectorization. URL components are bytes, not locale-dependent text, so C/POSIX semantics are what we want regardless of the process locale. The Zend language scanner uses the same pattern (yych <= 0x1F). php_replace_controlchars runs once per component per parse, up to 7 times.

2. The scheme-validation walk uses isalpha/isdigit, which have the same __ctype_b_loc tax. I extracted the check into php_url_is_scheme_char with an inline ASCII test: `((c | 0x20) - 'a' < 26u) || (c - '0' < 10u)` for the letter/digit half, plus three literal comparisons for '+', '-' and '.'. The scheme loop runs once per byte of the scheme on every parse. A helper php_url_is_ascii_digit covers the two isdigit call sites in the port-scan loops (one in the mailto-branch port probe, one in the parse_port fallback).

3. The three branches that allocate ret->scheme all followed zend_string_init with a php_replace_controlchars call. The scheme loop above has already rejected any byte that isn't in [a-zA-Z0-9+.-], so the control-char scan on scheme is dead work. Removed from all three sites.

No behavior change under the C/POSIX locale: the inline comparisons classify every byte exactly as the ctype macros do there. Under other locales the ctype macros may classify bytes >= 0x80 differently; the inline checks deliberately ignore the locale, because URL bytes are never locale-dependent text. I checked that contaminated inputs like http://ex\x7fample.com/p\x1fath still get their control bytes replaced with underscores.
1 parent 8ad79e1 commit e670fd7

File tree

1 file changed

+16
-8
lines changed

1 file changed

+16
-8
lines changed

ext/standard/url.c

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,17 @@ PHPAPI void php_url_free(php_url *theurl)
4747
}
4848
/* }}} */
4949

50+
/* Returns true when c is a byte permitted in a URL scheme: an ASCII letter
 * (either case), an ASCII digit, '+', '-' or '.'. The test uses fixed ASCII
 * ranges instead of <ctype.h>, so the result never depends on the locale. */
static zend_always_inline bool php_url_is_scheme_char(unsigned char c)
{
	/* Setting bit 0x20 maps 'A'..'Z' onto 'a'..'z'; no other byte lands in
	 * that range, so a single range check covers both cases. */
	const unsigned char folded = (unsigned char) (c | 0x20);

	if (folded >= 'a' && folded <= 'z') {
		return true;
	}
	if (c >= '0' && c <= '9') {
		return true;
	}

	switch (c) {
		case '+':
		case '-':
		case '.':
			return true;
		default:
			return false;
	}
}
55+
56+
/* Returns true when c is one of the ASCII digits '0'..'9'. Locale-independent
 * replacement for isdigit() on raw URL bytes. */
static zend_always_inline bool php_url_is_ascii_digit(unsigned char c)
{
	return c >= '0' && c <= '9';
}
60+
5061
static void php_replace_controlchars(char *str, size_t len)
5162
{
5263
unsigned char *s = (unsigned char *)str;
@@ -55,8 +66,8 @@ static void php_replace_controlchars(char *str, size_t len)
5566
ZEND_ASSERT(str != NULL);
5667

5768
while (s < e) {
58-
if (iscntrl(*s)) {
59-
*s='_';
69+
if (UNEXPECTED(*s < 0x20 || *s == 0x7f)) {
70+
*s = '_';
6071
}
6172
s++;
6273
}
@@ -103,7 +114,7 @@ PHPAPI php_url *php_url_parse_ex2(char const *str, size_t length, bool *has_port
103114
p = s;
104115
while (p < e) {
105116
/* scheme = 1*[ lowalpha | digit | "+" | "-" | "." ] */
106-
if (!isalpha(*p) && !isdigit(*p) && *p != '+' && *p != '.' && *p != '-') {
117+
if (!php_url_is_scheme_char((unsigned char) *p)) {
107118
if (e + 1 < ue && e < binary_strcspn(s, ue, "?#")) {
108119
goto parse_port;
109120
} else if (s + 1 < ue && *s == '/' && *(s + 1) == '/') { /* relative-scheme URL */
@@ -119,7 +130,6 @@ PHPAPI php_url *php_url_parse_ex2(char const *str, size_t length, bool *has_port
119130

120131
if (e + 1 == ue) { /* only scheme is available */
121132
ret->scheme = zend_string_init(s, (e - s), 0);
122-
php_replace_controlchars(ZSTR_VAL(ret->scheme), ZSTR_LEN(ret->scheme));
123133
return ret;
124134
}
125135

@@ -132,7 +142,7 @@ PHPAPI php_url *php_url_parse_ex2(char const *str, size_t length, bool *has_port
132142
* correctly parse things like a.com:80
133143
*/
134144
p = e + 1;
135-
while (p < ue && isdigit(*p)) {
145+
while (p < ue && php_url_is_ascii_digit((unsigned char) *p)) {
136146
p++;
137147
}
138148

@@ -141,13 +151,11 @@ PHPAPI php_url *php_url_parse_ex2(char const *str, size_t length, bool *has_port
141151
}
142152

143153
ret->scheme = zend_string_init(s, (e-s), 0);
144-
php_replace_controlchars(ZSTR_VAL(ret->scheme), ZSTR_LEN(ret->scheme));
145154

146155
s = e + 1;
147156
goto just_path;
148157
} else {
149158
ret->scheme = zend_string_init(s, (e-s), 0);
150-
php_replace_controlchars(ZSTR_VAL(ret->scheme), ZSTR_LEN(ret->scheme));
151159

152160
if (e + 2 < ue && *(e + 2) == '/') {
153161
s = e + 3;
@@ -172,7 +180,7 @@ PHPAPI php_url *php_url_parse_ex2(char const *str, size_t length, bool *has_port
172180
p = e + 1;
173181
pp = p;
174182

175-
while (pp < ue && pp - p < 6 && isdigit(*pp)) {
183+
while (pp < ue && pp - p < 6 && php_url_is_ascii_digit((unsigned char) *pp)) {
176184
pp++;
177185
}
178186

0 commit comments

Comments
 (0)