Skip to content

Commit 4c34ef0

Browse files
committed
Add URL escaping/unescaping for winhttp URL functions.
Fixes #351
1 parent 156b952 commit 4c34ef0

3 files changed

Lines changed: 137 additions & 44 deletions

File tree

src/_native/winhttp.cpp

Lines changed: 113 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#include <Python.h>
22
#include <windows.h>
3+
#include <shlwapi.h>
34
#include <winhttp.h>
45
#include <netlistmgr.h>
56

@@ -108,7 +109,7 @@ static wchar_t **split_to_array(wchar_t *str, wchar_t sep) {
108109
}
109110

110111

111-
static int crack_url(wchar_t *url, URL_COMPONENTS *parts, int add_nuls) {
112+
static int crack_url(const wchar_t *url, URL_COMPONENTS *parts) {
112113
parts->lpszScheme = NULL;
113114
parts->lpszUserName = NULL;
114115
parts->lpszPassword = NULL;
@@ -125,29 +126,66 @@ static int crack_url(wchar_t *url, URL_COMPONENTS *parts, int add_nuls) {
125126
winhttp_error();
126127
return 0;
127128
}
128-
if (add_nuls) {
129-
if (parts->lpszScheme && parts->dwSchemeLength > 0) {
130-
parts->lpszScheme[parts->dwSchemeLength] = L'\0';
131-
}
132-
if (parts->lpszUserName && parts->dwUserNameLength > 0) {
133-
parts->lpszUserName[parts->dwUserNameLength] = L'\0';
134-
}
135-
if (parts->lpszPassword && parts->dwPasswordLength > 0) {
136-
parts->lpszPassword[parts->dwPasswordLength] = L'\0';
137-
}
138-
if (parts->lpszHostName && parts->dwHostNameLength > 0) {
139-
parts->lpszHostName[parts->dwHostNameLength] = L'\0';
140-
}
141-
if (parts->lpszUrlPath && parts->dwUrlPathLength > 0) {
142-
parts->lpszUrlPath[parts->dwUrlPathLength] = L'\0';
129+
return 1;
130+
}
131+
132+
static wchar_t *_recode_url_part(bool encode, const wchar_t *url_part, DWORD cch, bool allow_env=false)
133+
{
134+
if (!url_part) {
135+
return NULL;
136+
}
137+
// Need to copy the incoming string to ensure it's null terminated
138+
cch += 1;
139+
wchar_t *url_string = (wchar_t *)PyMem_Malloc(sizeof(wchar_t) * cch);
140+
if (!url_string) {
141+
PyErr_NoMemory();
142+
return NULL;
143+
}
144+
wcsncpy_s(url_string, cch, url_part, cch - 1);
145+
if (!url_string[0] || cch > 32767) {
146+
// Too long/empty for the API, just bail out
147+
return url_string;
148+
}
149+
if (allow_env && cch > 2 && url_string[0] == L'%' && url_string[cch - 2] == L'%') {
150+
// Looks like an environment variable, so we won't change it.
151+
return url_string;
152+
}
153+
154+
wchar_t *result = NULL;
155+
HRESULT r = E_POINTER;
156+
for (int retries = 3; retries > 0 && r == E_POINTER; --retries) {
157+
result = (wchar_t *)PyMem_Realloc(result, sizeof(wchar_t) * cch);
158+
if (!result) {
159+
PyMem_Free(url_string);
160+
PyErr_NoMemory();
161+
return NULL;
143162
}
144-
if (parts->lpszExtraInfo && parts->dwExtraInfoLength > 0) {
145-
parts->lpszExtraInfo[parts->dwExtraInfoLength] = L'\0';
163+
if (encode) {
164+
// "SEGMENT_ONLY" means we want to escape the entire string
165+
r = UrlEscapeW(url_string, result, &cch, URL_ESCAPE_SEGMENT_ONLY | URL_ESCAPE_ASCII_URI_COMPONENT);
166+
} else {
167+
r = UrlUnescapeW(url_string, result, &cch, 0);
146168
}
147169
}
148-
return 1;
170+
PyMem_Free(url_string);
171+
if (r) {
172+
err_SetFromWindowsErrWithMessage((DWORD)r);
173+
return NULL;
174+
}
175+
return result;
176+
}
177+
178+
static wchar_t *encode_url_part(const wchar_t *url_part, DWORD cch, bool allow_env=false)
179+
{
180+
return _recode_url_part(true, url_part, cch, allow_env);
149181
}
150182

183+
static wchar_t *decode_url_part(const wchar_t *url_part, DWORD cch, bool allow_env=false)
184+
{
185+
return _recode_url_part(false, url_part, cch, allow_env);
186+
}
187+
188+
151189
extern "C" {
152190

153191
#define CHECK_WINHTTP(x) if (!x) { winhttp_error(); goto exit; }
@@ -226,7 +264,6 @@ static bool winhttp_apply_proxy(HINTERNET hSession, HINTERNET hRequest, const wc
226264
PyObject *winhttp_urlopen(PyObject *, PyObject *args, PyObject *kwargs) {
227265
static const char * keywords[] = {"url", "method", "headers", "accepts", "chunksize", "on_progress", "on_cred_request", NULL};
228266
wchar_t *url = NULL;
229-
wchar_t *url2 = NULL; // a copy of url for splitting
230267
wchar_t *method = NULL;
231268
wchar_t *headers = NULL;
232269
wchar_t *accepts = NULL;
@@ -246,7 +283,11 @@ PyObject *winhttp_urlopen(PyObject *, PyObject *args, PyObject *kwargs) {
246283
uint64_t content_length;
247284
PyObject *chunks = NULL;
248285
uint64_t content_read = 0;
249-
size_t n = 0;
286+
287+
wchar_t *hostname = NULL;
288+
wchar_t *urlpath = NULL;
289+
wchar_t *user = NULL;
290+
wchar_t *pass = NULL;
250291

251292
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O&O&O&O&|nOO:winhttp_urlopen", keywords,
252293
as_utf16, &url, as_utf16, &method, as_utf16, &headers, as_utf16, &accepts, &chunksize, &on_progress, &on_cred_request)) {
@@ -264,17 +305,19 @@ PyObject *winhttp_urlopen(PyObject *, PyObject *args, PyObject *kwargs) {
264305
if (!accepts_array) {
265306
goto exit;
266307
}
267-
n = wcslen(url) + 1;
268-
url2 = (wchar_t *)PyMem_Malloc(n * sizeof(wchar_t));
269-
if (!url2) {
270-
PyErr_NoMemory();
308+
if (!crack_url(url, &url_parts)) {
309+
goto exit;
310+
}
311+
hostname = decode_url_part(url_parts.lpszHostName, url_parts.dwHostNameLength);
312+
if (!hostname) {
271313
goto exit;
272314
}
273-
wcscpy_s(url2, n, url);
274-
if (!crack_url(url2, &url_parts, 1)) {
315+
urlpath = decode_url_part(url_parts.lpszUrlPath, url_parts.dwUrlPathLength);
316+
if (!urlpath) {
275317
goto exit;
276318
}
277319

320+
278321
hSession = WinHttpOpen(
279322
NULL,
280323
WINHTTP_ACCESS_TYPE_DEFAULT_PROXY,
@@ -310,19 +353,16 @@ PyObject *winhttp_urlopen(PyObject *, PyObject *args, PyObject *kwargs) {
310353

311354
hConnection = WinHttpConnect(
312355
hSession,
313-
url_parts.lpszHostName,
356+
hostname,
314357
url_parts.nPort,
315358
0
316359
);
317360
CHECK_WINHTTP(hConnection);
318361

319-
if (url_parts.dwUrlPathLength && !url_parts.lpszUrlPath[0]) {
320-
url_parts.lpszUrlPath[0] = L'/';
321-
}
322362
hRequest = WinHttpOpenRequest(
323363
hConnection,
324364
method,
325-
url_parts.lpszUrlPath,
365+
urlpath && urlpath[0] ? urlpath : L"/",
326366
NULL,
327367
WINHTTP_NO_REFERER,
328368
accepts_array,
@@ -341,12 +381,20 @@ PyObject *winhttp_urlopen(PyObject *, PyObject *args, PyObject *kwargs) {
341381
));
342382

343383
if (url_parts.dwUserNameLength || url_parts.dwPasswordLength) {
384+
user = decode_url_part(url_parts.lpszUserName, url_parts.dwUserNameLength);
385+
if (!user) {
386+
goto exit;
387+
}
388+
pass = decode_url_part(url_parts.lpszPassword, url_parts.dwPasswordLength);
389+
if (!pass) {
390+
goto exit;
391+
}
344392
CHECK_WINHTTP(WinHttpSetCredentials(
345393
hRequest,
346394
WINHTTP_AUTH_TARGET_SERVER,
347395
WINHTTP_AUTH_SCHEME_BASIC,
348-
url_parts.lpszUserName,
349-
url_parts.lpszPassword,
396+
user,
397+
pass,
350398
NULL
351399
));
352400
}
@@ -464,11 +512,14 @@ PyObject *winhttp_urlopen(PyObject *, PyObject *args, PyObject *kwargs) {
464512
if (hSession) {
465513
WinHttpCloseHandle(hSession);
466514
}
515+
PyMem_Free(user);
516+
PyMem_Free(pass);
517+
PyMem_Free(hostname);
518+
PyMem_Free(urlpath);
467519
PyMem_Free(accepts_array);
468520
PyMem_Free(accepts);
469521
PyMem_Free(headers);
470522
PyMem_Free(method);
471-
PyMem_Free(url2);
472523
PyMem_Free(url);
473524
return result;
474525
}
@@ -510,19 +561,26 @@ PyObject *winhttp_urlsplit(PyObject *, PyObject *args, PyObject *kwargs) {
510561
return NULL;
511562
}
512563
URL_COMPONENTS url_parts = { sizeof(URL_COMPONENTS) };
513-
if (!crack_url(url, &url_parts, 0)) {
564+
if (!crack_url(url, &url_parts)) {
514565
PyMem_Free(url);
515566
return NULL;
516567
}
568+
// Deliberately not decoding host or path. We never use a blacklist, we only
569+
// match against values specified by the user, or pass it to. If they want
570+
// to provide the same URL with different encoding, that's their fault.
571+
wchar_t *user = decode_url_part(url_parts.lpszUserName, url_parts.dwUserNameLength, true);
572+
wchar_t *pass = decode_url_part(url_parts.lpszPassword, url_parts.dwPasswordLength, true);
517573
PyObject *r = Py_BuildValue("(u#u#u#u#nu#u#)",
518574
url_parts.lpszScheme, (Py_ssize_t)url_parts.dwSchemeLength,
519-
url_parts.lpszUserName, (Py_ssize_t)url_parts.dwUserNameLength,
520-
url_parts.lpszPassword, (Py_ssize_t)url_parts.dwPasswordLength,
575+
user, (Py_ssize_t)(user ? wcslen(user) : 0),
576+
pass, (Py_ssize_t)(pass ? wcslen(pass) : 0),
521577
url_parts.lpszHostName, (Py_ssize_t)url_parts.dwHostNameLength,
522578
(Py_ssize_t)url_parts.nPort,
523579
url_parts.lpszUrlPath, (Py_ssize_t)url_parts.dwUrlPathLength,
524580
url_parts.lpszExtraInfo, (Py_ssize_t)url_parts.dwExtraInfoLength
525581
);
582+
PyMem_Free(user);
583+
PyMem_Free(pass);
526584
PyMem_Free(url);
527585
return r;
528586
}
@@ -532,10 +590,12 @@ PyObject *winhttp_urlunsplit(PyObject *, PyObject *args, PyObject *kwargs) {
532590
static const char * keywords[] = {"scheme", "user", "password", "netloc", "port", "path", "extra", NULL};
533591
URL_COMPONENTS url = { sizeof(URL_COMPONENTS) };
534592
Py_ssize_t port = 0;
593+
wchar_t *user = NULL;
594+
wchar_t *pass = NULL;
535595
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O&O&O&O&nO&O&:winhttp_urlunsplit", keywords,
536596
as_utf16, &url.lpszScheme,
537-
as_utf16, &url.lpszUserName,
538-
as_utf16, &url.lpszPassword,
597+
as_utf16, &user,
598+
as_utf16, &pass,
539599
as_utf16, &url.lpszHostName,
540600
&port,
541601
as_utf16, &url.lpszUrlPath,
@@ -545,8 +605,16 @@ PyObject *winhttp_urlunsplit(PyObject *, PyObject *args, PyObject *kwargs) {
545605
}
546606
DWORD cch = 0;
547607
PyObject *r = NULL;
608+
url.lpszUserName = encode_url_part(user, user ? wcslen(user) : 0, true);
609+
if (user && !url.lpszUserName) {
610+
goto exit;
611+
}
612+
url.lpszPassword = encode_url_part(pass, pass ? wcslen(pass) : 0, true);
613+
if (pass && !url.lpszPassword) {
614+
goto exit;
615+
}
548616
url.nPort = (INTERNET_PORT)port;
549-
if (WinHttpCreateUrl(&url, ICU_ESCAPE, NULL, &cch)) {
617+
if (WinHttpCreateUrl(&url, 0, NULL, &cch)) {
550618
// Success path, because it should've failed with ERROR_INSUFFICIENT_BUFFER
551619
PyErr_SetString(PyExc_ValueError, "unable to unsplit URL");
552620
} else if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
@@ -556,14 +624,17 @@ PyObject *winhttp_urlunsplit(PyObject *, PyObject *args, PyObject *kwargs) {
556624
wchar_t *buf = (wchar_t*)PyMem_Malloc(cch * sizeof(wchar_t));
557625
if (!buf) {
558626
PyErr_NoMemory();
559-
} else if (!WinHttpCreateUrl(&url, ICU_ESCAPE, buf, &cch)) {
627+
} else if (!WinHttpCreateUrl(&url, 0, buf, &cch)) {
560628
winhttp_error();
561629
PyMem_Free(buf);
562630
} else {
563631
r = PyUnicode_FromWideChar(buf, cch);
564632
PyMem_Free(buf);
565633
}
566634
}
635+
exit:
636+
PyMem_Free(user);
637+
PyMem_Free(pass);
567638
PyMem_Free(url.lpszScheme);
568639
PyMem_Free(url.lpszUserName);
569640
PyMem_Free(url.lpszPassword);

src/manage/urlutils.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -552,9 +552,11 @@ def sanitise_url(url):
552552
if ex.winerror in (12005, 12006):
553553
return url
554554
raise
555-
p[U_USERNAME] = None
555+
u = p[U_USERNAME]
556+
if u and not (u.startswith("%") and u.endswith("%")):
557+
p[U_USERNAME] = None
556558
pw = p[U_PASSWORD]
557-
if pw and not (pw.startswith("%") and pw.startswith("%")):
559+
if pw and not (pw.startswith("%") and pw.endswith("%")):
558560
p[U_PASSWORD] = None
559561
return winhttp_urlunsplit(*p)
560562

tests/test_urlutils.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111
("https://example.com/", "https://example.com/"),
1212
("https://user@example.com/", "https://example.com/"),
1313
("https://user:placeholder@example.com/", "https://example.com/"),
14+
("https://%placeholder%@example.com/", "https://%placeholder%@example.com/"),
15+
("https://%user%:%placeholder%@example.com/", "https://%user%:%placeholder%@example.com/"),
1416
]])
1517
def test_urlsanitise(url, expect):
1618
assert expect == UU.sanitise_url(url)
@@ -30,10 +32,23 @@ def test_urlunsanitise():
3032
assert None == UU.unsanitise_url(url, candidates)
3133

3234

35+
def test_urlunsanitise_encoded():
36+
candidates = ["https://user%40example.com:place%40holder@example.com/"]
37+
url = "https://example.com/my_path"
38+
expect = "https://user%40example.com:place%40holder@example.com/my_path"
39+
actual = UU.unsanitise_url(url, candidates)
40+
print(actual)
41+
print(UU.winhttp_urlsplit(actual))
42+
assert expect == UU.unsanitise_url(url, candidates)
43+
44+
3345
def test_extract_url_auth():
3446
assert "1", "2" == UU.extract_url_auth("https://1:2@example.com")
3547
assert "1", "" == UU.extract_url_auth("https://1@example.com")
3648

49+
assert "1", "2" == UU.extract_url_auth("https://%31:%32@example.com")
50+
assert "1", "" == UU.extract_url_auth("https://%31@example.com")
51+
3752
os.environ["PYMANAGER_TEST_VALUE"] = v = str(time.time())
3853
assert "1", v == UU.extract_url_auth("https://1:%PYMANAGER_TEST_VALUE%@example.com")
3954

@@ -48,6 +63,11 @@ def test_extract_url_auth():
4863
("https://example.com/A/B/C", "//EXAMPLE.COM/A", True, "https://EXAMPLE.COM/A"),
4964
("https://example.com/A/B/C", "//EXAMPLE.COM/", None, "https://EXAMPLE.COM/"),
5065
66+
# We are intentionally blind to encoded chars.
67+
("https://example.com/A/B/C", "%2fD", False, "https://example.com/A/B/C/%2fD"),
68+
("https://example.com/A/B/C", "%2f%2fD", False, "https://example.com/A/B/C/%2f%2fD"),
69+
("https://example.com/A/B%2fC", "D", True, "https://example.com/A/D"),
70+
5171
("file:///C:/local/path", "file.json", False, "file:///C:/local/path/file.json"),
5272
("file:///C:/local/path", "file.json", True, "file:///C:/local/file.json"),
5373
("file:///C:/local/path", ".\\dir\\file.json", False, "file:///C:/local/path/dir/file.json"),

0 commit comments

Comments
 (0)