Skip to content
Closed
10 changes: 10 additions & 0 deletions ext/standard/tests/url/url_utf8.phpt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
--TEST--
Uri: hostnames should be preserved in Unicode form
--FILE--
<?php

$parsed = parse_url('http://ουτοπία.δπθ.gr/');
var_dump($parsed['host']);
?>
--EXPECT--
string(24) "ουτοπία.δπθ.gr"
51 changes: 29 additions & 22 deletions ext/standard/url.c
Original file line number Diff line number Diff line change
Expand Up @@ -47,19 +47,26 @@ PHPAPI void php_url_free(php_url *theurl)
}
/* }}} */

static void php_replace_controlchars(char *str, size_t len)
static void php_str_to_utf8(const char *str, size_t len)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How does this work? You allocate a new string, but at the end of the function you throw it away?

Copy link
Author

@arshidkv12 arshidkv12 Aug 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. It allocates a zend_string *utf8 with zend_string_alloc(len * 4, 0).

    • This is intended to hold the UTF-8–encoded version of the input string.
  2. It fills utf8 with either the original ASCII bytes or, for each non-ASCII byte, the UTF-8 encoding of the replacement character (0xEF 0xBF 0xBD, i.e. U+FFFD).

  3. At the end, it immediately calls zend_string_release(utf8).

  4. This frees the memory just allocated.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I understand that. But how does this solve the problem?
You never use the newly created string, and you now no longer replace control characters.
Furthermore, what's the idea behind using the replacement character? This should not be necessary.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Update: After removing php_replace_controlchars, the code works correctly without any issues.

{
unsigned char *s = (unsigned char *)str;
unsigned char *e = (unsigned char *)str + len;

ZEND_ASSERT(str != NULL);

while (s < e) {
if (iscntrl(*s)) {
*s='_';
}
s++;
}
zend_string *utf8;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You should merge the declaration and assignment

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, I changed it to: zend_string *utf8 = zend_string_safe_alloc(len, 4, 0, 0);

utf8 = zend_string_alloc(len * 4, 0);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Never open-code a multiplication inside an allocation size argument; this can cause security issues (integer overflow -> buffer overflow).
Use zend_string_safe_alloc instead.

const unsigned char *s = (const unsigned char *)str;
const unsigned char *e = s + len;
unsigned char *d = (unsigned char *)ZSTR_VAL(utf8);

while (s < e) {
if (*s < 0x80) { // ASCII
*d++ = *s++;
} else {
/* Convert non-ASCII bytes to UTF-8 */
*d++ = 0xEF;
*d++ = 0xBF;
*d++ = 0xBD;
s++;
}
}
zend_string_release(utf8);
}

PHPAPI php_url *php_url_parse(char const *str)
Expand Down Expand Up @@ -119,7 +126,7 @@ PHPAPI php_url *php_url_parse_ex2(char const *str, size_t length, bool *has_port

if (e + 1 == ue) { /* only scheme is available */
ret->scheme = zend_string_init(s, (e - s), 0);
php_replace_controlchars(ZSTR_VAL(ret->scheme), ZSTR_LEN(ret->scheme));
php_str_to_utf8(ZSTR_VAL(ret->scheme), ZSTR_LEN(ret->scheme));
return ret;
}

Expand All @@ -141,13 +148,13 @@ PHPAPI php_url *php_url_parse_ex2(char const *str, size_t length, bool *has_port
}

ret->scheme = zend_string_init(s, (e-s), 0);
php_replace_controlchars(ZSTR_VAL(ret->scheme), ZSTR_LEN(ret->scheme));
php_str_to_utf8(ZSTR_VAL(ret->scheme), ZSTR_LEN(ret->scheme));

s = e + 1;
goto just_path;
} else {
ret->scheme = zend_string_init(s, (e-s), 0);
php_replace_controlchars(ZSTR_VAL(ret->scheme), ZSTR_LEN(ret->scheme));
php_str_to_utf8(ZSTR_VAL(ret->scheme), ZSTR_LEN(ret->scheme));

if (e + 2 < ue && *(e + 2) == '/') {
s = e + 3;
Expand Down Expand Up @@ -213,14 +220,14 @@ PHPAPI php_url *php_url_parse_ex2(char const *str, size_t length, bool *has_port
if ((p = zend_memrchr(s, '@', (e-s)))) {
if ((pp = memchr(s, ':', (p-s)))) {
ret->user = zend_string_init(s, (pp-s), 0);
php_replace_controlchars(ZSTR_VAL(ret->user), ZSTR_LEN(ret->user));
php_str_to_utf8(ZSTR_VAL(ret->user), ZSTR_LEN(ret->user));

pp++;
ret->pass = zend_string_init(pp, (p-pp), 0);
php_replace_controlchars(ZSTR_VAL(ret->pass), ZSTR_LEN(ret->pass));
php_str_to_utf8(ZSTR_VAL(ret->pass), ZSTR_LEN(ret->pass));
} else {
ret->user = zend_string_init(s, (p-s), 0);
php_replace_controlchars(ZSTR_VAL(ret->user), ZSTR_LEN(ret->user));
php_str_to_utf8(ZSTR_VAL(ret->user), ZSTR_LEN(ret->user));
}

s = p + 1;
Expand Down Expand Up @@ -269,7 +276,7 @@ PHPAPI php_url *php_url_parse_ex2(char const *str, size_t length, bool *has_port
}

ret->host = zend_string_init(s, (p-s), 0);
php_replace_controlchars(ZSTR_VAL(ret->host), ZSTR_LEN(ret->host));
php_str_to_utf8(ZSTR_VAL(ret->host), ZSTR_LEN(ret->host));

if (e == ue) {
return ret;
Expand All @@ -285,7 +292,7 @@ PHPAPI php_url *php_url_parse_ex2(char const *str, size_t length, bool *has_port
p++;
if (p < e) {
ret->fragment = zend_string_init(p, (e - p), 0);
php_replace_controlchars(ZSTR_VAL(ret->fragment), ZSTR_LEN(ret->fragment));
php_str_to_utf8(ZSTR_VAL(ret->fragment), ZSTR_LEN(ret->fragment));
} else {
ret->fragment = ZSTR_EMPTY_ALLOC();
}
Expand All @@ -297,7 +304,7 @@ PHPAPI php_url *php_url_parse_ex2(char const *str, size_t length, bool *has_port
p++;
if (p < e) {
ret->query = zend_string_init(p, (e - p), 0);
php_replace_controlchars(ZSTR_VAL(ret->query), ZSTR_LEN(ret->query));
php_str_to_utf8(ZSTR_VAL(ret->query), ZSTR_LEN(ret->query));
} else {
ret->query = ZSTR_EMPTY_ALLOC();
}
Expand All @@ -306,7 +313,7 @@ PHPAPI php_url *php_url_parse_ex2(char const *str, size_t length, bool *has_port

if (s < e || s == ue) {
ret->path = zend_string_init(s, (e - s), 0);
php_replace_controlchars(ZSTR_VAL(ret->path), ZSTR_LEN(ret->path));
php_str_to_utf8(ZSTR_VAL(ret->path), ZSTR_LEN(ret->path));
}

return ret;
Expand Down