Skip to content

Commit 420b9da

Browse files
committed
Refactoring & fix pre tag nodes
1 parent 0fcb991 commit 420b9da

File tree

3 files changed

+88
-148
lines changed

3 files changed

+88
-148
lines changed

telegraph/utils.py

Lines changed: 62 additions & 129 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,11 @@
11
# -*- coding: utf-8 -*-
22
import re
3+
from html.parser import HTMLParser
4+
from html.entities import name2codepoint
5+
from html import escape
36

47
from .exceptions import NotAllowedTag, InvalidHTML
58

6-
try: # python 3.x
7-
from html.parser import HTMLParser
8-
from html.entities import name2codepoint
9-
from html import escape
10-
11-
basestring = str
12-
13-
except ImportError: # python 2.x
14-
from HTMLParser import HTMLParser
15-
from htmlentitydefs import name2codepoint
16-
from cgi import escape
17-
18-
chr = unichr
19-
209

2110
RE_WHITESPACE = re.compile(r'(\s+)', re.UNICODE)
2211

@@ -50,31 +39,49 @@ def __init__(self):
5039
self.current_nodes = self.nodes
5140
self.parent_nodes = []
5241

42+
self.last_text_node = None
43+
44+
self.tags_path = []
45+
5346
def add_str_node(self, s):
5447
if not s:
5548
return
5649

57-
if self.current_nodes and isinstance(self.current_nodes[-1], basestring):
50+
if 'pre' not in self.tags_path: # keep whitespace in <pre>
51+
s = RE_WHITESPACE.sub(' ', s)
52+
53+
if self.last_text_node is None or self.last_text_node.endswith(' '):
54+
s = s.lstrip(' ')
55+
56+
if not s:
57+
self.last_text_node = None
58+
return
59+
60+
self.last_text_node = s
61+
62+
if self.current_nodes and isinstance(self.current_nodes[-1], str):
5863
self.current_nodes[-1] += s
5964
else:
6065
self.current_nodes.append(s)
6166

6267
def handle_starttag(self, tag, attrs_list):
6368
if tag not in ALLOWED_TAGS:
64-
raise NotAllowedTag('%s tag is not allowed' % tag)
69+
raise NotAllowedTag(f'{tag!r} tag is not allowed')
70+
71+
if tag in BLOCK_ELEMENTS:
72+
self.last_text_node = None
6573

6674
node = {'tag': tag}
75+
self.tags_path.append(tag)
76+
self.current_nodes.append(node)
6777

6878
if attrs_list:
6979
attrs = {}
80+
node['attrs'] = attrs
7081

7182
for attr, value in attrs_list:
7283
attrs[attr] = value
7384

74-
node['attrs'] = attrs
75-
76-
self.current_nodes.append(node)
77-
7885
if tag not in VOID_ELEMENTS:
7986
self.parent_nodes.append(self.current_nodes)
8087
self.current_nodes = node['children'] = []
@@ -84,18 +91,16 @@ def handle_endtag(self, tag):
8491
return
8592

8693
if not len(self.parent_nodes):
87-
raise InvalidHTML('"{}" missing start tag'.format(
88-
tag
89-
))
94+
raise InvalidHTML(f'{tag!r} missing start tag')
9095

9196
self.current_nodes = self.parent_nodes.pop()
9297

9398
last_node = self.current_nodes[-1]
9499

95100
if last_node['tag'] != tag:
96-
raise InvalidHTML('"{}" tag closed instead of "{}"'.format(
97-
tag, last_node['tag']
98-
))
101+
raise InvalidHTML(f'{tag!r} tag closed instead of {last_node["tag"]!r}')
102+
103+
self.tags_path.pop()
99104

100105
if not last_node['children']:
101106
last_node.pop('children')
@@ -117,128 +122,56 @@ def handle_charref(self, name):
117122
def get_nodes(self):
118123
if self.parent_nodes:
119124
not_closed_tag = self.parent_nodes[-1][-1]['tag']
120-
raise InvalidHTML('"{}" tag is not closed'.format(not_closed_tag))
125+
raise InvalidHTML(f'{not_closed_tag!r} tag is not closed')
121126

122127
return self.nodes
123128

124129

125-
def clear_whitespace_nodes(nodes, last_text_node=None):
126-
"""
127-
128-
:param nodes:
129-
:type nodes: list
130-
:param last_text_node:
131-
:type last_text_node: basestring
132-
:return: list
133-
"""
134-
# TODO: probably possible to move to html parser
135-
136-
stack = []
137-
current_nodes = nodes[:]
138-
139-
new_nodes = []
140-
new_children = new_nodes
141-
142-
while True:
143-
if current_nodes:
144-
node = current_nodes.pop(0)
145-
146-
if type(node) is dict:
147-
is_block_element = node['tag'] in BLOCK_ELEMENTS
148-
if is_block_element:
149-
last_text_node = None
150-
151-
new_children.append(node)
152-
153-
node_children = node.get('children')
154-
155-
if node_children:
156-
stack.append((current_nodes, new_children))
157-
current_nodes = node_children
158-
new_children = []
159-
node['children'] = new_children
160-
else:
161-
node = RE_WHITESPACE.sub(' ', node)
162-
163-
if last_text_node is None or last_text_node.endswith(' '):
164-
node = node.lstrip(' ')
165-
166-
if node:
167-
last_text_node = node
168-
new_children.append(node)
169-
else:
170-
last_text_node = None
171-
172-
if not current_nodes:
173-
if stack:
174-
current_nodes, new_children = stack.pop()
175-
else:
176-
break
177-
178-
return new_nodes, last_text_node
179-
180-
181130
def html_to_nodes(html_content):
182131
parser = HtmlToNodesParser()
183132
parser.feed(html_content)
184-
185-
nodes = parser.get_nodes()
186-
nodes, _ = clear_whitespace_nodes(nodes)
187-
return nodes
133+
return parser.get_nodes()
188134

189135

190136
def nodes_to_html(nodes):
191-
html_content = []
137+
out = []
138+
append = out.append
192139

193140
stack = []
194-
tags_stack = []
195-
current_nodes = nodes[:]
141+
curr = nodes
142+
i = -1
196143

197144
while True:
198-
if current_nodes:
199-
node = current_nodes.pop(0)
200-
201-
if type(node) is dict:
202-
tags_stack.append(node['tag'])
203-
204-
attrs = node.get('attrs')
205-
206-
if attrs:
207-
attrs_str = ['']
145+
i += 1
208146

209-
for attr, value in attrs.items():
210-
attrs_str.append('{}="{}"'.format(attr, escape(value)))
211-
else:
212-
attrs_str = []
213-
214-
html_content.append('<{}{}>'.format(
215-
node['tag'],
216-
' '.join(attrs_str)
217-
))
147+
if i >= len(curr):
148+
if not stack:
149+
break
150+
curr, i = stack.pop()
151+
append(f'</{curr[i]["tag"]}>')
152+
continue
218153

219-
children = node.get('children', [])
220-
stack.append(current_nodes)
221-
current_nodes = children
222-
else:
223-
html_content.append(escape(node))
154+
node = curr[i]
224155

225-
if not current_nodes:
226-
if tags_stack:
227-
closed_tag = tags_stack.pop()
156+
if isinstance(node, str):
157+
append(escape(node))
158+
continue
228159

229-
last_el = html_content[-1]
160+
append(f'<{node["tag"]}')
230161

231-
if closed_tag in VOID_ELEMENTS and \
232-
last_el.startswith('<{}'.format(closed_tag)) and \
233-
not last_el.endswith('/>'):
162+
if node.get('attrs'):
163+
for attr, value in node['attrs'].items():
164+
append(f' {attr}="{escape(value)}"')
234165

235-
html_content[-1] = last_el[:-1] + '/>'
236-
else:
237-
html_content.append('</{}>'.format(closed_tag))
166+
if node.get('children'):
167+
append('>')
168+
stack.append((curr, i))
169+
curr, i = node['children'], -1
170+
continue
238171

239-
if stack:
240-
current_nodes = stack.pop()
241-
else:
242-
break
172+
if node["tag"] in VOID_ELEMENTS:
173+
append('/>')
174+
else:
175+
append(f'></{node["tag"]}>')
243176

244-
return ''.join(html_content)
177+
return ''.join(out)

tests/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
from . import test_html_converter
2+
from . import test_telegraph

tests/test_html_converter.py

Lines changed: 24 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from unittest import TestCase
22

33
from telegraph.exceptions import NotAllowedTag, InvalidHTML
4-
from telegraph.utils import html_to_nodes, nodes_to_html, clear_whitespace_nodes
4+
from telegraph.utils import html_to_nodes, nodes_to_html
55

66
HTML_TEST_STR = """
77
<p>Hello, world!<br/></p>
@@ -71,6 +71,12 @@ def test_html_to_nodes_multi_line(self):
7171
HTML_MULTI_LINES_NODES_LIST
7272
)
7373

74+
def test_uppercase_tags(self):
75+
self.assertEqual(
76+
html_to_nodes("<P>Hello</P>"),
77+
[{'tag': 'p', 'children': ['Hello']}]
78+
)
79+
7480
def test_html_to_nodes_invalid_html(self):
7581
with self.assertRaises(InvalidHTML):
7682
html_to_nodes('<p><b></p></b>')
@@ -99,23 +105,11 @@ def test_nodes_to_html_blank(self):
99105
''
100106
)
101107

102-
def test_clear_whitespace_nodes(self):
103-
nodes = [
104-
'\n',
105-
{'tag': 'p', 'children': [
106-
{'tag': 'i', 'children': ['A']},
107-
{'tag': 'b', 'children': [' ']},
108-
{'tag': 'b', 'children': [
109-
'B ',
110-
{'tag': 'i', 'children': ['C']},
111-
{'tag': 'i', 'children': [{'tag': 'b'}]},
112-
' D '
113-
]},
114-
' E '
115-
]},
116-
{'tag': 'p', 'children': [' F ']},
117-
'\n'
118-
]
108+
def test_clear_whitespace(self):
109+
i = (
110+
'\n<p><i>A</i><b> </b><b>B <i>C</i><i><b></b></i>'
111+
' D </b> E </p><p> F </p>\n'
112+
)
119113
expected = [
120114
{'tag': 'p', 'children': [
121115
{'tag': 'i', 'children': ['A']},
@@ -131,7 +125,18 @@ def test_clear_whitespace_nodes(self):
131125
{'tag': 'p', 'children': ['F ']}
132126
]
133127

134-
self.assertEqual(clear_whitespace_nodes(nodes)[0], expected)
128+
self.assertEqual(html_to_nodes(i), expected)
129+
130+
def test_clear_whitespace_1(self):
131+
x = '\n<p><i>A</i><b> </b><b>B <i>C</i><i><b></b></i> D </b> E </p><p> F </p>\n'
132+
y = '<p><i>A</i><b> </b><b>B <i>C</i><i><b></b></i> D </b>E </p><p>F </p>'
133+
self.assertEqual(nodes_to_html(html_to_nodes(x)), y)
134+
135+
def test_pre_whitespace_preserved(self):
136+
self.assertEqual(
137+
html_to_nodes("<pre>\nhello\nworld</pre>"),
138+
[{'tag': 'pre', 'children': ['\nhello\nworld']}]
139+
)
135140

136141
def test_no_starttag_node(self):
137142
with self.assertRaises(InvalidHTML):

0 commit comments

Comments
 (0)