1111from w3lib .url import safe_url_string
1212from w3lib ._types import StrOrBytes
1313
14- _ent_re = re .compile (r'&((?P<named>[a-z\d]+)|#(?P<dec>\d+)|#x(?P<hex>[a-f\d]+))(?P<semicolon>;?)' , re .IGNORECASE )
15- _tag_re = re .compile (r'<[a-zA-Z\/!].*?>' , re .DOTALL )
16- _baseurl_re = re .compile (r'<base\s[^>]*href\s*=\s*[\"\']\s*([^\"\'\s]+)\s*[\"\']' , re .I )
17- _meta_refresh_re = re .compile (r'<meta\s[^>]*http-equiv[^>]*refresh[^>]*content\s*=\s*(?P<quote>["\'])(?P<int>(\d*\.)?\d+)\s*;\s*url=\s*(?P<url>.*?)(?P=quote)' , re .DOTALL | re .IGNORECASE )
18- _cdata_re = re .compile (r'((?P<cdata_s><!\[CDATA\[)(?P<cdata_d>.*?)(?P<cdata_e>\]\]>))' , re .DOTALL )
19-
20- HTML5_WHITESPACE = ' \t \n \r \x0c '
21-
22-
23- def replace_entities (text : AnyStr , keep : Iterable [str ] = (), remove_illegal : bool = True , encoding : str = 'utf-8' ) -> str :
14+ _ent_re = re .compile (
15+ r"&((?P<named>[a-z\d]+)|#(?P<dec>\d+)|#x(?P<hex>[a-f\d]+))(?P<semicolon>;?)" ,
16+ re .IGNORECASE ,
17+ )
18+ _tag_re = re .compile (r"<[a-zA-Z\/!].*?>" , re .DOTALL )
19+ _baseurl_re = re .compile (r"<base\s[^>]*href\s*=\s*[\"\']\s*([^\"\'\s]+)\s*[\"\']" , re .I )
20+ _meta_refresh_re = re .compile (
21+ r'<meta\s[^>]*http-equiv[^>]*refresh[^>]*content\s*=\s*(?P<quote>["\'])(?P<int>(\d*\.)?\d+)\s*;\s*url=\s*(?P<url>.*?)(?P=quote)' ,
22+ re .DOTALL | re .IGNORECASE ,
23+ )
24+ _cdata_re = re .compile (
25+ r"((?P<cdata_s><!\[CDATA\[)(?P<cdata_d>.*?)(?P<cdata_e>\]\]>))" , re .DOTALL
26+ )
27+
28+ HTML5_WHITESPACE = " \t \n \r \x0c "
29+
30+
31+ def replace_entities (
32+ text : AnyStr ,
33+ keep : Iterable [str ] = (),
34+ remove_illegal : bool = True ,
35+ encoding : str = "utf-8" ,
36+ ) -> str :
2437 """Remove entities from the given `text` by converting them to their
2538 corresponding unicode character.
2639
@@ -51,12 +64,12 @@ def replace_entities(text: AnyStr, keep: Iterable[str] = (), remove_illegal: boo
5164 def convert_entity (m : Match ) -> str :
5265 groups = m .groupdict ()
5366 number = None
54- if groups .get (' dec' ):
55- number = int (groups [' dec' ], 10 )
56- elif groups .get (' hex' ):
57- number = int (groups [' hex' ], 16 )
58- elif groups .get (' named' ):
59- entity_name = groups [' named' ]
67+ if groups .get (" dec" ):
68+ number = int (groups [" dec" ], 10 )
69+ elif groups .get (" hex" ):
70+ number = int (groups [" hex" ], 16 )
71+ elif groups .get (" named" ):
72+ entity_name = groups [" named" ]
6073 if entity_name .lower () in keep :
6174 return m .group (0 )
6275 else :
@@ -80,11 +93,12 @@ def convert_entity(m: Match) -> str:
8093
8194 return _ent_re .sub (convert_entity , to_unicode (text , encoding ))
8295
96+
def has_entities(text: AnyStr, encoding: Optional[str] = None) -> bool:
    """Return ``True`` if *text* contains at least one HTML character
    reference (named, decimal, or hexadecimal), ``False`` otherwise.

    *text* may be ``str`` or ``bytes``; bytes are decoded with *encoding*
    (presumably defaulting to UTF-8 inside ``to_unicode`` — defined
    elsewhere in this module's package).
    """
    return bool(_ent_re.search(to_unicode(text, encoding)))
8599
86100
87- def replace_tags (text : AnyStr , token : str = '' , encoding : Optional [str ] = None ) -> str :
101+ def replace_tags (text : AnyStr , token : str = "" , encoding : Optional [str ] = None ) -> str :
88102 """Replace all markup tags found in the given `text` by the given token.
89103 By default `token` is an empty string so it just removes all tags.
90104
@@ -107,11 +121,11 @@ def replace_tags(text: AnyStr, token: str = '', encoding: Optional[str] = None)
107121 return _tag_re .sub (token , to_unicode (text , encoding ))
108122
109123
110- _REMOVECOMMENTS_RE = re .compile (' <!--.*?(?:-->|$)' , re .DOTALL )
124+ _REMOVECOMMENTS_RE = re .compile (" <!--.*?(?:-->|$)" , re .DOTALL )
111125
112126
def remove_comments(text: AnyStr, encoding: Optional[str] = None) -> str:
    """Remove HTML comments.

    *text* may be ``str`` or ``bytes``; bytes are decoded with *encoding*
    via ``to_unicode`` (defined elsewhere in this module's package).

    >>> import w3lib.html
    >>> w3lib.html.remove_comments(b"test <!--textcoment--> whatever")
    'test  whatever'

    """

    utext = to_unicode(text, encoding)
    # Strips every comment, including an unterminated trailing one.
    return _REMOVECOMMENTS_RE.sub("", utext)
139+
125140
126- def remove_tags (text : AnyStr , which_ones : Iterable [str ] = (), keep : Iterable [str ] = (), encoding : Optional [str ] = None ) -> str :
127- """ Remove HTML Tags only.
141+ def remove_tags (
142+ text : AnyStr ,
143+ which_ones : Iterable [str ] = (),
144+ keep : Iterable [str ] = (),
145+ encoding : Optional [str ] = None ,
146+ ) -> str :
147+ """Remove HTML Tags only.
128148
129149 `which_ones` and `keep` are both tuples, there are four cases:
130150
@@ -190,7 +210,9 @@ def remove_tag(m: Match) -> str:
190210 return retags .sub (remove_tag , to_unicode (text , encoding ))
191211
192212
193- def remove_tags_with_content (text : AnyStr , which_ones : Iterable [str ] = (), encoding : Optional [str ] = None ) -> str :
213+ def remove_tags_with_content (
214+ text : AnyStr , which_ones : Iterable [str ] = (), encoding : Optional [str ] = None
215+ ) -> str :
194216 """Remove tags and their content.
195217
196218 `which_ones` is a tuple of which tags to remove including their content.
@@ -210,12 +232,16 @@ def remove_tags_with_content(text: AnyStr, which_ones: Iterable[str] = (), encod
210232 [r"<%s\b.*?</%s>|<%s\s*/>" % (tag , tag , tag ) for tag in which_ones ]
211233 )
212234 retags = re .compile (tags , re .DOTALL | re .IGNORECASE )
213- utext = retags .sub ('' , utext )
235+ utext = retags .sub ("" , utext )
214236 return utext
215237
216238
217- def replace_escape_chars (text : AnyStr , which_ones : Iterable [str ] = ('\n ' , '\t ' , '\r ' ), replace_by : StrOrBytes = '' , \
218- encoding : Optional [str ] = None ) -> str :
239+ def replace_escape_chars (
240+ text : AnyStr ,
241+ which_ones : Iterable [str ] = ("\n " , "\t " , "\r " ),
242+ replace_by : StrOrBytes = "" ,
243+ encoding : Optional [str ] = None ,
244+ ) -> str :
219245 """Remove escape characters.
220246
221247 `which_ones` is a tuple of which escape characters we want to remove.
@@ -232,7 +258,12 @@ def replace_escape_chars(text: AnyStr, which_ones: Iterable[str] = ('\n', '\t',
232258 return utext
233259
234260
235- def unquote_markup (text : AnyStr , keep : Iterable [str ] = (), remove_illegal : bool = True , encoding : Optional [str ] = None ) -> str :
261+ def unquote_markup (
262+ text : AnyStr ,
263+ keep : Iterable [str ] = (),
264+ remove_illegal : bool = True ,
265+ encoding : Optional [str ] = None ,
266+ ) -> str :
236267 """
237268 This function receives markup as a text (always a unicode string or
238269 a UTF-8 encoded string) and does the following:
@@ -254,7 +285,7 @@ def _get_fragments(txt: str, pattern: Pattern) -> Iterable[Union[str, Match]]:
254285 yield txt [offset :]
255286
256287 utext = to_unicode (text , encoding )
257- ret_text = ''
288+ ret_text = ""
258289 for fragment in _get_fragments (utext , _cdata_re ):
259290 if isinstance (fragment , str ):
260291 # it's not a CDATA (so we try to remove its entities)
@@ -266,7 +297,10 @@ def _get_fragments(txt: str, pattern: Pattern) -> Iterable[Union[str, Match]]:
266297 ret_text += fragment .group ("cdata_d" )
267298 return ret_text
268299
269- def get_base_url (text : AnyStr , baseurl : StrOrBytes = '' , encoding : str = 'utf-8' ) -> str :
300+
301+ def get_base_url (
302+ text : AnyStr , baseurl : StrOrBytes = "" , encoding : str = "utf-8"
303+ ) -> str :
270304 """Return the base url if declared in the given HTML `text`,
271305 relative to the given base url.
272306
@@ -284,7 +318,12 @@ def get_base_url(text: AnyStr, baseurl: StrOrBytes = '', encoding: str = 'utf-8'
284318 return safe_url_string (baseurl )
285319
286320
287- def get_meta_refresh (text : AnyStr , baseurl : str = '' , encoding : str = 'utf-8' , ignore_tags : Iterable [str ] = ('script' , 'noscript' )) -> Tuple [Optional [float ], Optional [str ]]:
321+ def get_meta_refresh (
322+ text : AnyStr ,
323+ baseurl : str = "" ,
324+ encoding : str = "utf-8" ,
325+ ignore_tags : Iterable [str ] = ("script" , "noscript" ),
326+ ) -> Tuple [Optional [float ], Optional [str ]]:
288327 """Return the http-equiv parameter of the HTML meta element from the given
289328 HTML text and return a tuple ``(interval, url)`` where interval is an integer
290329 containing the delay in seconds (or zero if not present) and url is a
0 commit comments