1- try :
2- from html .parser import HTMLParser # python 3.x
1+ # -*- coding: utf-8 -*-
2+
3+ try : # python 3.x
4+ from html .parser import HTMLParser
35 from html .entities import name2codepoint
46 from html import escape
5- except ImportError :
6- chr = unichr
77
8- from HTMLParser import HTMLParser # python 2.x
8+ basestring = str
9+
10+ except ImportError : # python 2.x
11+ from HTMLParser import HTMLParser
912 from htmlentitydefs import name2codepoint
1013 from cgi import escape
1114
15+ chr = unichr
16+
1217from .exceptions import NotAllowedTag , InvalidHTML
1318
1419
1823 'strong' , 'u' , 'ul' , 'video'
1924]
2025
# Tag names that nodes_to_html() collapses into a self-closing form
# (e.g. ``<br/>``) instead of emitting a separate closing tag.
# Stored as a frozenset: the table is only used for membership tests, and
# a shared module-level constant should not be mutable by accident.
VOID_ELEMENTS = frozenset((
    'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen',
    'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
))
30+
2131
2232class HtmlToNodesParser (HTMLParser ):
def __init__(self):
    """Create a parser with an empty node tree and a cursor at its root."""
    HTMLParser.__init__(self)

    # The root list of parsed nodes is also the initial insertion point,
    # so both attributes start out bound to the very same list object.
    self.nodes = self.current_nodes = []

    # Stack of the child lists that enclose the current insertion point.
    self.parent_nodes = []
40+
def add_str_node(self, s):
    """Add text *s* to the current child list, merging adjacent text.

    If the most recent child is already a string, *s* is concatenated
    onto it; otherwise *s* is appended as a new child.
    """
    siblings = self.current_nodes

    # Nothing to merge with: the list is empty or ends with a non-string.
    if not siblings or not isinstance(siblings[-1], basestring):
        siblings.append(s)
        return

    siblings[-1] += s
3046
3147 def handle_starttag (self , tag , attrs_list ):
3248 if tag not in ALLOWED_TAGS :
@@ -42,30 +58,38 @@ def handle_starttag(self, tag, attrs_list):
4258
4359 node ['attrs' ] = attrs
4460
45- self .current_node_list .append (node )
46- self .parent_node_lists .append (self .current_node_list )
61+ self .current_nodes .append (node )
62+ self .parent_nodes .append (self .current_nodes )
4763
48- self .current_node_list = node ['children' ]
64+ self .current_nodes = node ['children' ]
4965
def handle_endtag(self, tag):
    """Close the innermost open element and step back to its parent list.

    Raises:
        InvalidHTML: if no element is currently open, or if *tag* does
            not match the element being closed.
    """
    # A closing tag with nothing open would otherwise surface as a bare
    # IndexError from list.pop(); report it as malformed markup instead.
    if not self.parent_nodes:
        raise InvalidHTML

    self.current_nodes = self.parent_nodes.pop()

    # The element being closed is the last node of its parent's list.
    last_node = self.current_nodes[-1]

    if last_node['tag'] != tag:
        raise InvalidHTML

    # Elements that ended up with no children drop the 'children' key.
    if not last_node['children']:
        last_node.pop('children')
76+
def handle_data(self, data):
    """Record a run of text, ignoring data that is exactly one newline."""
    if data != '\n':
        self.add_str_node(data)
5882
def handle_entityref(self, name):
    """Record the character for a named entity reference such as ``&amp;``.

    Args:
        name: the entity name as passed by the parser (e.g. ``'amp'``).

    A name missing from ``name2codepoint`` is recorded as the literal
    text ``&name;`` instead of raising ``KeyError``.
    """
    try:
        text = chr(name2codepoint[name])
    except KeyError:
        # Unknown entity: keep it as text rather than aborting the parse.
        text = '&{};'.format(name)

    self.add_str_node(text)
6185
def handle_charref(self, name):
    """Record the character for a numeric character reference.

    Args:
        name: the reference body as passed by the parser: decimal digits
            (``'65'``) or hex digits prefixed with ``x`` or ``X``
            (``'x41'``, ``'X41'``).
    """
    # Accept the hex prefix in either case; an uppercase ``X`` would
    # otherwise reach int() as a decimal literal and raise ValueError.
    if name.startswith(('x', 'X')):
        codepoint = int(name[1:], 16)
    else:
        codepoint = int(name)

    self.add_str_node(chr(codepoint))
6993
7094
7195def html_to_nodes (html_content ):
@@ -80,7 +104,7 @@ def nodes_to_html(nodes):
80104
81105 stack = []
82106 tags_stack = []
83- current_nodes = nodes
107+ current_nodes = nodes [:]
84108
85109 while True :
86110 if current_nodes :
@@ -95,7 +119,7 @@ def nodes_to_html(nodes):
95119 attrs_str = ['' ]
96120
97121 for attr , value in attrs .items ():
98- attrs_str .append ('{}="{}"' .format (attr , value ))
122+ attrs_str .append ('{}="{}"' .format (attr , escape ( value ) ))
99123 else :
100124 attrs_str = []
101125
@@ -104,20 +128,28 @@ def nodes_to_html(nodes):
104128 ' ' .join (attrs_str )
105129 ))
106130
107- children = node .get ('children' )
108-
109- if children :
110- stack .append (current_nodes )
111- current_nodes = children
131+ children = node .get ('children' , [])
132+ stack .append (current_nodes )
133+ current_nodes = children
112134 else :
113135 html_content .append (escape (node ))
114136
115137 if not current_nodes :
116138 if tags_stack :
117- html_content .append ('</{}>' .format (tags_stack .pop (- 1 )))
139+ closed_tag = tags_stack .pop ()
140+
141+ last_el = html_content [- 1 ]
142+
143+ if closed_tag in VOID_ELEMENTS and \
144+ last_el .startswith ('<{}' .format (closed_tag )) and \
145+ not last_el .endswith ('/>' ):
146+
147+ html_content [- 1 ] = last_el [:- 1 ] + '/>'
148+ else :
149+ html_content .append ('</{}>' .format (closed_tag ))
118150
119151 if stack :
120- current_nodes = stack .pop (- 1 )
152+ current_nodes = stack .pop ()
121153 else :
122154 break
123155
0 commit comments