@@ -116,61 +116,81 @@ def hostname_subparts(url, include_ps=False, **kwargs):
116116 return subparts
117117
118118
def get_stripped_url(url, scheme=False, drop_non_http=False, use_netloc=True):
    """
    Returns a url stripped to just the beginning and end, or more formally,
    (scheme)?+netloc+path
    For example ``https://my.domain.net/a/path/to/a/file.html#anchor?a=1``
    becomes ``my.domain.net/a/path/to/a/file.html``

    URL parsing is done using std lib
    `urllib.parse.urlparse <https://docs.python.org/3.8/library/urllib.parse.html>`_.
    Empty scheme e.g. ``my.domain.cloudfront.net`` are assumed to be http
    schemes. Protocol-relative URLs e.g. ``//my.domain.net/path`` are
    handled the same way.

    If a URL has a port but no scheme, urlparse may determine the scheme to
    be the hostname and we do not handle this special case. In this case,
    the url will be treated as a non http url and the return value will
    be determined by the ``drop_non_http`` setting.

    :param url: URL to be parsed
    :type url: str
    :param scheme: If ``True``, scheme will be prepended in
        returned result, defaults to False
    :type scheme: bool, optional
    :param drop_non_http: Action to take if scheme is not
        ``http`` or ``https`` e.g. ``file:`` or ``about:blank``.
        If ``True``, the result for non http urls will be an empty string
        If ``False``, the result for non http urls will be the original url,
        not further processed e.g. ``about:blank`` -> ``about:blank`` even
        if ``scheme=False``. The result for http urls will be the stripped
        url with or without the scheme as per scheme param.
        Default is ``False``.
    :type drop_non_http: bool, optional
    :param use_netloc: If ``True`` urlparse's netloc will be used.
        If ``False`` urlparse's hostname will be used. Using netloc means
        that a port is included, for example, if it was in the URL.
        If the URL has no host, the location part is an empty string.
        Default is ``True``.
    :type use_netloc: bool, optional

    :return: Returns a url stripped to (scheme)?+netloc+path.
        Returns empty string if appropriate.
    :rtype: str
    """
    parsed = urlparse(url)
    _scheme = parsed.scheme

    # Handle non http schemes: either drop them or hand them back untouched.
    if _scheme not in ('http', 'https', ''):
        return '' if drop_non_http else url

    if _scheme == '' and not url.startswith('//'):
        # From the docs: "urlparse recognizes a netloc only
        # if it is properly introduced by '//'". So we
        # prepend to get results we expect. URLs that already start
        # with '//' must not be prefixed again, otherwise the host
        # would end up inside the path.
        parsed = urlparse('//' + url)

    # Only http/https schemes are ever written out; an empty scheme
    # contributes nothing.
    scheme_out = ''
    if scheme and _scheme:
        scheme_out = _scheme + '://'

    if use_netloc:
        loc_out = parsed.netloc
    else:
        # hostname is None when the URL has no host; avoid emitting
        # the literal text 'None'.
        loc_out = parsed.hostname or ''

    return scheme_out + loc_out + parsed.path
0 commit comments