Skip to content

Commit 41d0d2f

Browse files
committed
Add additional test cases, use netloc
May need to back out netloc change. Looking for input from @englehardt.
1 parent 1296f53 commit 41d0d2f

File tree

2 files changed

+52
-6
lines changed

2 files changed

+52
-6
lines changed

domain_utils/domain_utils.py

Lines changed: 33 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -118,9 +118,14 @@ def hostname_subparts(url, include_ps=False, **kwargs):
118118

119119
def get_stripped_url(url, scheme=False, non_http_scheme=None):
    """
    Returns a url stripped to (scheme)?+netloc+path

    For example ``https://my.domain.net/a/path/to/a/file.html?a=1#anchor``
    becomes ``my.domain.net/a/path/to/a/file.html``

    URL parsing is done using std lib urllib.parse.urlparse
    Using netloc means that a port is included, for example, if it was
    in the URL. This makes it cleaner to think about just the scheme,
    params, query string and fragment being stripped.

    :param url: URL to be parsed
    :type url: str
    :param scheme: If truthy, prefix the result with the URL's scheme
        (e.g. ``https://``). Default is False.
    :type scheme: bool, optional
    :param non_http_scheme: Action to take if scheme is not http or https
        e.g. file: or 'about:blank'. If None, return empty string. If
        'self', keep the URL's own scheme (regardless of ``scheme``) and
        return scheme+netloc+path, so 'about:blank' comes back unchanged.
        A URL with no scheme at all is also handled by this option.
        Default is None.
    :type non_http_scheme: None or str, optional
    :raises ValueError: if ``non_http_scheme`` is neither None nor 'self'

    :return: Returns a url stripped to (scheme)?+netloc+path. Returns
        empty string if appropriate.
    :rtype: str
    """
    if non_http_scheme not in [None, 'self']:
        raise ValueError('non_http_scheme must be either `None` or `self`')
    purl = urlparse(url)
    is_http = purl.scheme in ('http', 'https')

    if not is_http and non_http_scheme is None:
        # Documented contract: nothing is returned for non-http(s) URLs.
        # Returning early also drops the netloc and scheme, which would
        # otherwise leak through for inputs such as 'ftp://host/path' or
        # 'about:blank' combined with scheme=True.
        return ''

    scheme_out = ''
    # Past the early return, a non-http(s) URL implies non_http_scheme is
    # 'self', which always keeps the scheme. A URL without any scheme has
    # nothing to prefix.
    if purl.scheme and (scheme or not is_http):
        # Hierarchical URLs (those with a netloc) need '//' between the
        # scheme and the authority; opaque ones like 'about:blank' do not.
        separator = '://' if (is_http or purl.netloc) else ':'
        scheme_out = '{scheme}{separator}'.format(
            scheme=purl.scheme,
            separator=separator,
        )

    return '{scheme_out}{netloc_out}{path_out}'.format(
        scheme_out=scheme_out,
        netloc_out=purl.netloc,
        path_out=purl.path,
    )

tests/test_domain_utils.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,18 @@ def test_get_stripped_url_path():
3131
assert result == 'my.domain.cloudfront.net/a/path/to/a/file.html'
3232

3333

34+
def test_get_stripped_url_no_path_and_non_http_scheme_self():
    """An https URL with no path and a fragment strips down to its netloc."""
    stripped = du.get_stripped_url(
        'https://my.domain.cloudfront.net#anchor',
        non_http_scheme='self',
    )
    assert stripped == 'my.domain.cloudfront.net'
38+
39+
40+
def test_get_stripped_url_no_scheme_and_non_http_scheme_self():
    """A scheme-less URL with a fragment keeps only the host text."""
    stripped = du.get_stripped_url(
        'my.domain.cloudfront.net#anchor',
        non_http_scheme='self',
    )
    assert stripped == 'my.domain.cloudfront.net'
44+
45+
3446
def test_get_stripped_url_path_params():
3547
url = 'https://my.domain.cloudfront.net/a/path/to/a/file.html?a=1'
3648
result = du.get_stripped_url(url)
@@ -46,7 +58,7 @@ def test_get_stripped_url_with_hostname_only_and_scheme():
4658
def test_get_stripped_url_non_http_scheme_none():
    """With ``non_http_scheme=None``, 'about:blank' yields an empty string."""
    url = 'about:blank'
    result = du.get_stripped_url(url, non_http_scheme=None)
    # Compare by value: an identity check (`is`) against a str literal
    # depends on interning and raises SyntaxWarning on Python 3.8+.
    assert result == ''
5062

5163

5264
def test_get_stripped_url_non_http_scheme_return_self():
@@ -58,3 +70,9 @@ def test_get_stripped_url_non_http_scheme_return_self():
5870
def test_get_stripped_url_only_accepts_correct_args_for_non_http_scheme():
    """Any ``non_http_scheme`` other than None or 'self' raises ValueError."""
    with pytest.raises(ValueError):
        du.get_stripped_url('', non_http_scheme='milk')
73+
74+
75+
def test_get_stripped_url_returns_port_if_present():
    """The port in the netloc is kept while the query string is dropped."""
    stripped = du.get_stripped_url(
        'http://my.example.com:8080/path/to/webapp.htm?aced=1'
    )
    assert stripped == 'my.example.com:8080/path/to/webapp.htm'

0 commit comments

Comments
 (0)