-
Notifications
You must be signed in to change notification settings - Fork 8k
Fix UTF-8 handling in php_url_parse_ex2 #19506
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
1114173
9b5f44f
e75fba2
836e617
4cc7c77
d320368
deccf12
c0a2f4f
474b13f
90f8635
8a62104
388394a
7d0a1e1
4de8b14
a326c64
713c797
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
--TEST-- | ||
Uri: hostnames should be preserved in Unicode form | ||
--FILE-- | ||
<?php | ||
|
||
$parsed = parse_url('http://ουτοπία.δπθ.gr/'); | ||
var_dump($parsed['host']); | ||
?> | ||
--EXPECT-- | ||
string(24) "ουτοπία.δπθ.gr" |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -47,19 +47,26 @@ PHPAPI void php_url_free(php_url *theurl) | |
} | ||
/* }}} */ | ||
|
||
static void php_replace_controlchars(char *str, size_t len) | ||
static void php_str_to_utf8(const char *str, size_t len) | ||
{ | ||
unsigned char *s = (unsigned char *)str; | ||
unsigned char *e = (unsigned char *)str + len; | ||
|
||
ZEND_ASSERT(str != NULL); | ||
|
||
while (s < e) { | ||
if (iscntrl(*s)) { | ||
*s='_'; | ||
} | ||
s++; | ||
} | ||
zend_string *utf8; | ||
|
||
utf8 = zend_string_alloc(len * 4, 0); | ||
|
||
const unsigned char *s = (const unsigned char *)str; | ||
const unsigned char *e = s + len; | ||
unsigned char *d = (unsigned char *)ZSTR_VAL(utf8); | ||
|
||
while (s < e) { | ||
if (*s < 0x80) { // ASCII | ||
*d++ = *s++; | ||
} else { | ||
/* Convert non-ASCII bytes to UTF-8 */ | ||
*d++ = 0xEF; | ||
*d++ = 0xBF; | ||
*d++ = 0xBD; | ||
s++; | ||
} | ||
} | ||
zend_string_release(utf8); | ||
} | ||
|
||
PHPAPI php_url *php_url_parse(char const *str) | ||
|
@@ -119,7 +126,7 @@ PHPAPI php_url *php_url_parse_ex2(char const *str, size_t length, bool *has_port | |
|
||
if (e + 1 == ue) { /* only scheme is available */ | ||
ret->scheme = zend_string_init(s, (e - s), 0); | ||
php_replace_controlchars(ZSTR_VAL(ret->scheme), ZSTR_LEN(ret->scheme)); | ||
php_str_to_utf8(ZSTR_VAL(ret->scheme), ZSTR_LEN(ret->scheme)); | ||
return ret; | ||
} | ||
|
||
|
@@ -141,13 +148,13 @@ PHPAPI php_url *php_url_parse_ex2(char const *str, size_t length, bool *has_port | |
} | ||
|
||
ret->scheme = zend_string_init(s, (e-s), 0); | ||
php_replace_controlchars(ZSTR_VAL(ret->scheme), ZSTR_LEN(ret->scheme)); | ||
php_str_to_utf8(ZSTR_VAL(ret->scheme), ZSTR_LEN(ret->scheme)); | ||
|
||
s = e + 1; | ||
goto just_path; | ||
} else { | ||
ret->scheme = zend_string_init(s, (e-s), 0); | ||
php_replace_controlchars(ZSTR_VAL(ret->scheme), ZSTR_LEN(ret->scheme)); | ||
php_str_to_utf8(ZSTR_VAL(ret->scheme), ZSTR_LEN(ret->scheme)); | ||
|
||
if (e + 2 < ue && *(e + 2) == '/') { | ||
s = e + 3; | ||
|
@@ -213,14 +220,14 @@ PHPAPI php_url *php_url_parse_ex2(char const *str, size_t length, bool *has_port | |
if ((p = zend_memrchr(s, '@', (e-s)))) { | ||
if ((pp = memchr(s, ':', (p-s)))) { | ||
ret->user = zend_string_init(s, (pp-s), 0); | ||
php_replace_controlchars(ZSTR_VAL(ret->user), ZSTR_LEN(ret->user)); | ||
php_str_to_utf8(ZSTR_VAL(ret->user), ZSTR_LEN(ret->user)); | ||
|
||
pp++; | ||
ret->pass = zend_string_init(pp, (p-pp), 0); | ||
php_replace_controlchars(ZSTR_VAL(ret->pass), ZSTR_LEN(ret->pass)); | ||
php_str_to_utf8(ZSTR_VAL(ret->pass), ZSTR_LEN(ret->pass)); | ||
} else { | ||
ret->user = zend_string_init(s, (p-s), 0); | ||
php_replace_controlchars(ZSTR_VAL(ret->user), ZSTR_LEN(ret->user)); | ||
php_str_to_utf8(ZSTR_VAL(ret->user), ZSTR_LEN(ret->user)); | ||
} | ||
|
||
s = p + 1; | ||
|
@@ -269,7 +276,7 @@ PHPAPI php_url *php_url_parse_ex2(char const *str, size_t length, bool *has_port | |
} | ||
|
||
ret->host = zend_string_init(s, (p-s), 0); | ||
php_replace_controlchars(ZSTR_VAL(ret->host), ZSTR_LEN(ret->host)); | ||
php_str_to_utf8(ZSTR_VAL(ret->host), ZSTR_LEN(ret->host)); | ||
|
||
if (e == ue) { | ||
return ret; | ||
|
@@ -285,7 +292,7 @@ PHPAPI php_url *php_url_parse_ex2(char const *str, size_t length, bool *has_port | |
p++; | ||
if (p < e) { | ||
ret->fragment = zend_string_init(p, (e - p), 0); | ||
php_replace_controlchars(ZSTR_VAL(ret->fragment), ZSTR_LEN(ret->fragment)); | ||
php_str_to_utf8(ZSTR_VAL(ret->fragment), ZSTR_LEN(ret->fragment)); | ||
} else { | ||
ret->fragment = ZSTR_EMPTY_ALLOC(); | ||
} | ||
|
@@ -297,7 +304,7 @@ PHPAPI php_url *php_url_parse_ex2(char const *str, size_t length, bool *has_port | |
p++; | ||
if (p < e) { | ||
ret->query = zend_string_init(p, (e - p), 0); | ||
php_replace_controlchars(ZSTR_VAL(ret->query), ZSTR_LEN(ret->query)); | ||
php_str_to_utf8(ZSTR_VAL(ret->query), ZSTR_LEN(ret->query)); | ||
} else { | ||
ret->query = ZSTR_EMPTY_ALLOC(); | ||
} | ||
|
@@ -306,7 +313,7 @@ PHPAPI php_url *php_url_parse_ex2(char const *str, size_t length, bool *has_port | |
|
||
if (s < e || s == ue) { | ||
ret->path = zend_string_init(s, (e - s), 0); | ||
php_replace_controlchars(ZSTR_VAL(ret->path), ZSTR_LEN(ret->path)); | ||
php_str_to_utf8(ZSTR_VAL(ret->path), ZSTR_LEN(ret->path)); | ||
} | ||
|
||
return ret; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How does this work? You allocate a new string, but at the end of the function you throw it away?
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It allocates a `zend_string *utf8` with `zend_string_alloc(len * 4, 0)`.
It fills the `utf8` buffer with either the original ASCII bytes or the UTF-8 replacement character for non-ASCII bytes.
At the end, it immediately calls `zend_string_release(utf8)`; this frees the memory that was just allocated.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I understand that. But how does this solve the problem?
You never use the newly created string, and you now no longer replace control characters.
Furthermore, what's the idea behind using the replacement character? This should not be necessary.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Update: After removing `php_replace_controlchars`, the code works correctly without any issues.