11# -*- coding: utf-8 -*-
22import re
3+ from html .parser import HTMLParser
4+ from html .entities import name2codepoint
5+ from html import escape
36
47from .exceptions import NotAllowedTag , InvalidHTML
58
6- try : # python 3.x
7- from html .parser import HTMLParser
8- from html .entities import name2codepoint
9- from html import escape
10-
11- basestring = str
12-
13- except ImportError : # python 2.x
14- from HTMLParser import HTMLParser
15- from htmlentitydefs import name2codepoint
16- from cgi import escape
17-
18- chr = unichr
19-
209
2110RE_WHITESPACE = re .compile (r'(\s+)' , re .UNICODE )
2211
@@ -50,31 +39,49 @@ def __init__(self):
5039 self .current_nodes = self .nodes
5140 self .parent_nodes = []
5241
42+ self .last_text_node = None
43+
44+ self .tags_path = []
45+
def add_str_node(self, s):
    """Append the text ``s`` to the current node list.

    Empty input is ignored.  Outside of ``<pre>`` every whitespace run is
    collapsed to a single space, and leading spaces are removed when there
    is no preceding text or the preceding text already ends with a space.
    Text that directly follows another text node is merged into it.

    :param s: text chunk to add
    """
    if not s:
        return

    # NOTE(review): everything in this branch is assumed to be skipped
    # inside <pre> (that is what "keep whitespace" requires) -- confirm.
    if 'pre' not in self.tags_path:  # keep whitespace in <pre>
        s = RE_WHITESPACE.sub(' ', s)

        previous_text = self.last_text_node
        if previous_text is None or previous_text.endswith(' '):
            s = s.lstrip(' ')

        if not s:
            # Nothing left after trimming: forget the previous text so the
            # next chunk is trimmed as if it started fresh.
            self.last_text_node = None
            return

        self.last_text_node = s

    siblings = self.current_nodes
    if siblings and isinstance(siblings[-1], str):
        # Merge into the preceding text node instead of adding a new one.
        siblings[-1] += s
        return

    siblings.append(s)
6166
def handle_starttag(self, tag, attrs_list):
    """Create an element node for an opening tag.

    The node is a dict with a ``'tag'`` key, an ``'attrs'`` dict when the
    tag carries attributes, and -- for non-void elements -- a ``'children'``
    list that becomes the new current node list.

    :param tag: tag name as reported by the HTML parser
    :param attrs_list: list of ``(name, value)`` attribute pairs
    :raises NotAllowedTag: if ``tag`` is not in ``ALLOWED_TAGS``
    """
    if tag not in ALLOWED_TAGS:
        raise NotAllowedTag(f'{tag!r} tag is not allowed')

    if tag in BLOCK_ELEMENTS:
        # A block element starts a new text run, so the next text chunk
        # must not look at whatever text came before it.
        self.last_text_node = None

    node = {'tag': tag}
    self.current_nodes.append(node)

    if attrs_list:
        # Later duplicates win, exactly like assigning pair by pair.
        node['attrs'] = dict(attrs_list)

    if tag not in VOID_ELEMENTS:
        # tags_path is popped in handle_endtag together with parent_nodes,
        # so it is only pushed when parent_nodes is pushed.  Pushing it for
        # void elements would leave the path unbalanced (e.g. a stale 'pre'
        # surviving after "<pre>a<br>b</pre>").
        self.tags_path.append(tag)
        self.parent_nodes.append(self.current_nodes)
        self.current_nodes = node['children'] = []
@@ -84,18 +91,16 @@ def handle_endtag(self, tag):
8491 return
8592
8693 if not len (self .parent_nodes ):
87- raise InvalidHTML ('"{}" missing start tag' .format (
88- tag
89- ))
94+ raise InvalidHTML (f'{ tag !r} missing start tag' )
9095
9196 self .current_nodes = self .parent_nodes .pop ()
9297
9398 last_node = self .current_nodes [- 1 ]
9499
95100 if last_node ['tag' ] != tag :
96- raise InvalidHTML ('"{}" tag closed instead of "{}"' . format (
97- tag , last_node [ 'tag' ]
98- ) )
101+ raise InvalidHTML (f' { tag !r } tag closed instead of { last_node [ "tag" ]!r } ' )
102+
103+ self . tags_path . pop ( )
99104
100105 if not last_node ['children' ]:
101106 last_node .pop ('children' )
@@ -117,128 +122,56 @@ def handle_charref(self, name):
def get_nodes(self):
    """Return the top-level node list built so far.

    :raises InvalidHTML: if an element is still open, i.e. ``parent_nodes``
        is not empty
    :return: ``self.nodes``
    """
    if not self.parent_nodes:
        return self.nodes

    # The innermost saved sibling list ends with the element that was
    # opened last and never closed.
    unclosed = self.parent_nodes[-1][-1]['tag']
    raise InvalidHTML(f'{unclosed!r} tag is not closed')
123128
124129
def clear_whitespace_nodes(nodes, last_text_node=None):
    """Normalise whitespace in every text node of a node tree.

    Whitespace runs in text nodes are collapsed to a single space.  Leading
    spaces are stripped when there is no preceding text (start of input or
    just after a block element) or when the preceding text already ends
    with a space.  Text nodes that become empty are dropped.

    Element dicts are reused: each one that has children gets its
    ``'children'`` entry replaced with the cleaned list.

    :param nodes: list of nodes (``str`` text or ``dict`` elements)
    :type nodes: list
    :param last_text_node: text that precedes ``nodes``, if any
    :type last_text_node: str
    :return: tuple ``(cleaned_nodes, last_text_node)``
    """
    new_nodes = []
    output = new_nodes

    # Each frame is (sibling list, next index, output list).  Walking by
    # index avoids list.pop(0), which is O(n) per call.
    stack = []
    siblings = nodes
    index = 0

    while True:
        if index >= len(siblings):
            if not stack:
                break
            siblings, index, output = stack.pop()
            continue

        node = siblings[index]
        index += 1

        if isinstance(node, dict):
            if node['tag'] in BLOCK_ELEMENTS:
                last_text_node = None

            output.append(node)

            children = node.get('children')
            if children:
                # Descend: remember where to resume, then collect the
                # cleaned children in a fresh list attached to the element.
                stack.append((siblings, index, output))
                siblings, index = children, 0
                output = node['children'] = []
            continue

        text = RE_WHITESPACE.sub(' ', node)

        if last_text_node is None or last_text_node.endswith(' '):
            text = text.lstrip(' ')

        if text:
            last_text_node = text
            output.append(text)
        else:
            last_text_node = None

    return new_nodes, last_text_node
179-
180-
def html_to_nodes(html_content):
    """Parse an HTML string into a list of nodes.

    :param html_content: HTML markup to parse
    :return: the node list produced by ``HtmlToNodesParser.get_nodes()``
    """
    parser = HtmlToNodesParser()
    parser.feed(html_content)
    # feed() may hold back trailing data it considers incomplete (for
    # example text ending in a bare "&..."); close() forces it through so
    # the end of the document is not silently dropped.
    parser.close()
    return parser.get_nodes()
188134
189135
def nodes_to_html(nodes):
    """Serialise a list of nodes into an HTML string.

    A node is either a ``str`` (emitted HTML-escaped) or a ``dict`` with a
    ``'tag'`` key and optional ``'attrs'`` (dict) and ``'children'`` (list)
    keys.  Childless elements are written as ``<tag/>`` when the tag is in
    ``VOID_ELEMENTS`` and as ``<tag></tag>`` otherwise.

    :param nodes: list of nodes to serialise
    :return: the HTML markup as a single string
    """
    out = []
    append = out.append

    # Explicit stack of (sibling list, index of the open element) frames,
    # so arbitrarily deep trees do not hit the recursion limit.
    stack = []
    curr = nodes
    i = -1

    while True:
        i += 1

        if i >= len(curr):
            if not stack:
                break
            # Finished a children list: go back to the parent's siblings
            # and close the element we descended into.
            curr, i = stack.pop()
            append(f'</{curr[i]["tag"]}>')
            continue

        node = curr[i]

        if isinstance(node, str):
            append(escape(node))
            continue

        tag = node['tag']
        append(f'<{tag}')

        attrs = node.get('attrs')
        if attrs:
            for attr, value in attrs.items():
                if value is None:
                    # HTMLParser reports value-less attributes (e.g.
                    # "<p hidden>") as None; escape(None) would raise.
                    append(f' {attr}')
                else:
                    append(f' {attr}="{escape(value)}"')

        children = node.get('children')
        if children:
            append('>')
            stack.append((curr, i))
            curr, i = children, -1
            continue

        if tag in VOID_ELEMENTS:
            append('/>')
        else:
            append(f'></{tag}>')

    return ''.join(out)
0 commit comments