Skip to content

Commit 597ad37

Browse files
committed
Clean-up based on senglehardt review.
Also identified new edge case of port but no scheme. Not easy to handle so just documented in docstring and tests for now.
1 parent b89c89a commit 597ad37

File tree

2 files changed

+111
-40
lines changed

2 files changed

+111
-40
lines changed

domain_utils/domain_utils.py

Lines changed: 50 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -116,61 +116,81 @@ def hostname_subparts(url, include_ps=False, **kwargs):
116116
return subparts
117117

118118

119-
def get_stripped_url(url, scheme=False, non_http_scheme=None):
119+
def get_stripped_url(url, scheme=False, drop_non_http=False, use_netloc=True):
120120
"""
121-
Returns a url stripped to (scheme)?+netloc+path
121+
Returns a url stripped to just the beginning and end, or more formally,
122+
(scheme)?+netloc+path
122123
For example ``https://my.domain.net/a/path/to/a/file.html#anchor?a=1``
123124
becomes ``my.domain.net/a/path/to/a/file.html``
124125
126+
URL parsing is done using std lib
127+
`urllib.parse.urlparse <https://docs.python.org/3.8/library/urllib.parse.html>`_.
128+
Empty scheme e.g. ``my.domain.cloudfront.net`` are assumed to be http schemes.
125129
126-
URL parsing is done using std lib urllib.parse.urlparse
127-
Using netloc means that a port is included, for example,
128-
if it was in the path.
129-
The method strips just the beginning and end being stripped.
130+
If a URL has a port but no scheme, urlparse determines the scheme to
131+
be the hostname and we do not handle this special case. In this case,
132+
the url will be treated as a non_http_scheme and the return value will
133+
be determined by the ``drop_non_http`` setting.
130134
131135
:param url: URL to be parsed
132136
:type url: str
133-
:param scheme: If True, scheme will be prepended in
137+
:param scheme: If ``True``, scheme will be prepended in
134138
returned result, defaults to False
135139
:type scheme: bool, optional
136-
:param non_http_scheme: Action to take if scheme is not
140+
:param drop_non_http: Action to take if scheme is not
137141
``http`` or ``https`` e.g. ``file:`` or ``about:blank``.
138-
If None, return empty string.
139-
If ``self``, return the original URL.
140-
Default is None.
141-
:type non_http_scheme: None or ``"self"``, optional
142+
If ``True``, the result for non http urls will be an empty string
143+
If ``False``, the result for non http urls will be the original url,
144+
not further processed e.g. ``about:blank`` -> ``about:blank`` even
145+
if ``scheme=False``. The result for http urls will be the stripped
146+
url with or without the scheme as per scheme param.
147+
Default is ``False``.
148+
:type non_http_scheme: bool, optional
149+
:param use_netloc: If ``True`` urlparse's netloc will be used.
150+
If ``False`` urlparse's host will be returned. Using netloc means
151+
that a port is included, for example, if it was in the path.
152+
Default is ``True``.
153+
:type use_netloc: bool, optional
142154
143155
:return: Returns a url stripped to (scheme)?+netloc+path.
144156
Returns empty string if appropriate.
145157
:rtype: str
146158
"""
147-
if non_http_scheme not in [None, 'self']:
148-
raise ValueError('non_http_scheme must be either `None` or `self`')
149159
purl = urlparse(url)
150-
151160
_scheme = purl.scheme
161+
162+
# Handle non http schemes
163+
if _scheme not in ['http', 'https', '']:
164+
if drop_non_http is True:
165+
return ''
166+
if drop_non_http is False:
167+
return url
168+
169+
if _scheme == '':
170+
# From the docs: "urlparse recognizes a netloc only
171+
# if it is properly introduced by ‘//’". So we
172+
# prepend to get results we expect.
173+
url = '//{url}'.format(url=url)
174+
175+
purl = urlparse(url)
152176
scheme_out = ''
153-
netloc_out = purl.netloc
177+
loc_out = ''
154178
path_out = purl.path
155179

156-
if _scheme not in ['http', 'https']:
157-
if non_http_scheme == 'self':
158-
scheme = True
159-
if non_http_scheme is None:
160-
# e.g. in the case of about:blank, the path is 'blank', but we want
161-
# to return nothing
162-
path_out = ''
163-
164180
if scheme is True:
165181
if _scheme in ['http', 'https']:
166182
scheme_out = '{scheme}://'.format(scheme=_scheme)
167-
elif _scheme == '':
168-
scheme_out = ''
169183
else:
170-
scheme_out = '{scheme}:'.format(scheme=_scheme)
184+
# Should only get here if scheme is ''
185+
scheme_out = '{scheme}'.format(scheme=_scheme)
186+
187+
if use_netloc is True:
188+
loc_out = purl.netloc
189+
else:
190+
loc_out = purl.hostname
171191

172-
return '{scheme_out}{netloc_out}{path_out}'.format(
192+
return '{scheme_out}{loc_out}{path_out}'.format(
173193
scheme_out=scheme_out,
174-
netloc_out=netloc_out,
194+
loc_out=loc_out,
175195
path_out=path_out,
176-
)
196+
)

tests/test_domain_utils.py

Lines changed: 61 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -31,15 +31,28 @@ def test_get_stripped_url_path():
3131
assert result == 'my.domain.cloudfront.net/a/path/to/a/file.html'
3232

3333

34-
def test_get_stripped_url_no_path_and_non_http_scheme_self():
34+
def test_get_stripped_url_no_path_and_drop_non_http_false():
3535
url = 'https://my.domain.cloudfront.net#anchor'
36-
result = du.get_stripped_url(url, non_http_scheme='self')
36+
result = du.get_stripped_url(url)
37+
assert result == 'my.domain.cloudfront.net'
38+
39+
40+
def test_get_stripped_url_no_scheme():
41+
url = 'my.domain.cloudfront.net#anchor'
42+
result = du.get_stripped_url(url)
43+
assert result == 'my.domain.cloudfront.net'
44+
45+
46+
def test_get_stripped_url_no_scheme_and_scheme_true():
47+
url = 'my.domain.cloudfront.net#anchor'
48+
result = du.get_stripped_url(url, scheme=True)
3749
assert result == 'my.domain.cloudfront.net'
3850

3951

40-
def test_get_stripped_url_no_scheme_and_non_http_scheme_self():
52+
def test_get_stripped_url_no_scheme_and_drop_non_http_urls_true():
53+
# Note we assume that empty schemes are http urls
4154
url = 'my.domain.cloudfront.net#anchor'
42-
result = du.get_stripped_url(url, non_http_scheme='self')
55+
result = du.get_stripped_url(url, drop_non_http=True)
4356
assert result == 'my.domain.cloudfront.net'
4457

4558

@@ -57,22 +70,60 @@ def test_get_stripped_url_with_hostname_only_and_scheme():
5770

5871
def test_get_stripped_url_non_http_scheme_none():
5972
url = 'about:blank'
60-
result = du.get_stripped_url(url, non_http_scheme=None)
73+
result = du.get_stripped_url(url, drop_non_http=True)
6174
assert result == ''
6275

6376

6477
def test_get_stripped_url_non_http_scheme_return_self():
6578
url = 'about:blank'
66-
result = du.get_stripped_url(url, non_http_scheme='self')
79+
result = du.get_stripped_url(url, drop_non_http=False)
6780
assert result == url
6881

6982

70-
def test_get_stripped_url_only_accepts_correct_args_for_non_http_scheme():
71-
with pytest.raises(ValueError):
72-
du.get_stripped_url('', non_http_scheme='milk')
83+
def test_get_stripped_url_returns_port_if_present():
84+
url = 'http://my.example.com:8080/path/to/webapp.htm?aced=1'
85+
result = du.get_stripped_url(url)
86+
assert result == 'my.example.com:8080/path/to/webapp.htm'
7387

7488

75-
def test_get_stripped_url_returns_port_if_present():
89+
def test_get_stripped_url_returns_port_if_present_and_use_netloc_false():
7690
url = 'http://my.example.com:8080/path/to/webapp.htm?aced=1'
91+
result = du.get_stripped_url(url, use_netloc=False)
92+
assert result == 'my.example.com/path/to/webapp.htm'
93+
94+
95+
"""
96+
Currently don't support urls with a port but no scheme in the way we want.
97+
98+
url = 'my.example.com:8080/path/to/webapp.htm?aced=1'
99+
ParseResult(scheme='my.example.com', netloc='', path='8080/path/to/webapp.htm',...
100+
101+
The following are two tests xfailed with expected behavior and one test
102+
that documents the actual behavior
103+
"""
104+
105+
@pytest.mark.xfail(reason="""
106+
urlparse does not have a good way to handle a url with a port but no scheme.""")
107+
def test_get_stripped_with_port_when_no_scheme():
108+
url = 'my.example.com:8080/path/to/webapp.htm?aced=1'
77109
result = du.get_stripped_url(url)
78110
assert result == 'my.example.com:8080/path/to/webapp.htm'
111+
112+
113+
@pytest.mark.xfail(reason="""
114+
urlparse does not have a good way to handle a url with a port but no scheme.""")
115+
def test_get_stripped_url_with_port_when_no_scheme_and_use_netloc_false():
116+
url = 'my.example.com:8080/path/to/webapp.htm?aced=1'
117+
result = du.get_stripped_url(url, use_netloc=False)
118+
assert result == 'my.example.com/path/to/webapp.htm'
119+
120+
121+
def test_get_stripped_url_document_behavior_with_port_when_no_scheme():
122+
url = 'my.example.com:8080/path/to/webapp.htm?aced=1'
123+
result = du.get_stripped_url(url)
124+
assert result == 'my.example.com:8080/path/to/webapp.htm?aced=1'
125+
result = du.get_stripped_url(url, use_netloc=False)
126+
assert result == 'my.example.com:8080/path/to/webapp.htm?aced=1'
127+
128+
129+
# End of url with port but no scheme

0 commit comments

Comments
 (0)