Skip to content

Commit f40cc94

Browse files
committed
Work around some issues in Python 2 caused by its poor unicode implementation (these can be seen, for example, by using ↑ as a value in a tag). For proper unicode support you should use Python 3, but this at least brings Python 2 closer to parity with Python 3.
1 parent 617b7eb commit f40cc94

File tree

3 files changed

+19
-10
lines changed

3 files changed

+19
-10
lines changed

AdvancedHTMLParser/SpecialAttributes.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
from collections import OrderedDict
99

10-
from .utils import escapeQuotes
10+
from .utils import escapeQuotes, tostr
1111

1212
__all__ = ('SpecialAttributesDict', 'AttributeNode', 'AttributeNodeMap', 'StyleAttribute' )
1313

@@ -214,7 +214,7 @@ def X__setitem__(self, name, value):
214214

215215

216216
def __str__(self):
217-
return '[ %s ]' %(' '.join([str(self.getNamedItem(name)) for name in self._attributesDict.keys()]))
217+
return '[ %s ]' %(' '.join([tostr(self.getNamedItem(name)) for name in self._attributesDict.keys()]))
218218

219219

220220

@@ -232,7 +232,7 @@ def __init__(self, styleValue, tag=None):
232232
@param styleValue <str> - A style string ( like "display: none; padding-top: 5px" )
233233
'''
234234
if isinstance(styleValue, StyleAttribute):
235-
styleValue = str(styleValue)
235+
styleValue = tostr(styleValue)
236236

237237
self._styleValue = styleValue
238238
self._styleDict = StyleAttribute.styleToDict(styleValue)

AdvancedHTMLParser/Tags.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from .constants import PREFORMATTED_TAGS, IMPLICIT_SELF_CLOSING_TAGS, TAG_NAMES_TO_ADDITIONAL_ATTRIBUTES, COMMON_JAVASCRIPT_ATTRIBUTES, ALL_JAVASCRIPT_EVENT_ATTRIBUTES, TAG_ITEM_BINARY_ATTRIBUTES, TAG_ITEM_ATTRIBUTE_LINKS, TAG_ITEM_ATTRIBUTES_SPECIAL_VALUES, TAG_ITEM_CHANGE_NAME_FROM_ATTR, TAG_ITEM_CHANGE_NAME_FROM_ITEM
1111
from .SpecialAttributes import SpecialAttributesDict, StyleAttribute, AttributeNodeMap
1212

13-
from .utils import escapeQuotes
13+
from .utils import escapeQuotes, tostr
1414

1515
__all__ = ('AdvancedTag', 'uniqueTags', 'TagCollection', 'FilterableTagCollection', 'toggleAttributesDOM', 'isTextNode', 'isTagNode')
1616

@@ -143,7 +143,7 @@ def __setattr__(self, name, value):
143143
self.setAttribute(name, "")
144144
return value
145145

146-
self.setAttribute(name, str(value))
146+
self.setAttribute(name, tostr(value))
147147
return value
148148

149149
if name == 'style' and not isinstance(value, StyleAttribute):
@@ -855,7 +855,7 @@ def getStartTag(self):
855855
'''
856856
attributeString = []
857857
for name, val in self._attributes.items():
858-
val = str(val)
858+
val = tostr(val)
859859
if val:
860860
val = escapeQuotes(val)
861861
attributeString.append('%s="%s"' %(name, val) )
@@ -932,7 +932,7 @@ def getAttributesList(self):
932932
933933
This is suitable for passing back into AdvancedTag when creating a new tag.
934934
'''
935-
return [ (str(name)[:], str(value)[:]) for name, value in self._attributes.items() ]
935+
return [ (tostr(name)[:], tostr(value)[:]) for name, value in self._attributes.items() ]
936936

937937
def getAttributesDict(self):
938938
'''
@@ -944,7 +944,7 @@ def getAttributesDict(self):
944944
@return <dict ( str(name), str(value) )> - A dict of attrName to attrValue , all as strings and copies.
945945
'''
946946

947-
return { str(name)[:] : str(value)[:] for name, value in self._attributes.items() }
947+
return { tostr(name)[:] : tostr(value)[:] for name, value in self._attributes.items() }
948948

949949
def setAttribute(self, attrName, attrValue):
950950
'''

AdvancedHTMLParser/utils.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
# Copyright (c) 2015, 2017 Tim Savannah under terms of LGPLv3
22
# Some misc utils and regular expressions
33

4+
import sys
45
import re
56

67
__all__ = ('IE_CONDITIONAL_PATTERN', 'END_HTML', 'START_HTML', 'DOCTYPE_MATCH',
7-
'stripIEConditionals', 'addStartTag', 'escapeQuotes', 'unescapeQuotes')
8+
'stripIEConditionals', 'addStartTag', 'escapeQuotes', 'unescapeQuotes', 'tostr')
89

910
IE_CONDITIONAL_PATTERN = re.compile('[<][!][-][-][ \t\r\n]*[\[][ \t\r\n]*if.*-->', re.MULTILINE)
1011

@@ -70,4 +71,12 @@ def unescapeQuotes(value):
7071
'''
7172
return value.replace('&quot;', '"')
7273

73-
74+
# Index the version tuple rather than using ".major": the named attribute only
#  exists on python 2.7+ / 3.1+, while indexing works on every interpreter.
if sys.version_info[0] < 3:
    def tostr(value):
        '''
            tostr - Convert a value to a string (python2 variant).

              Used in place of str() so that values containing non-ASCII
              characters ( for example, ↑ ) are converted via unicode()
              instead of str().

            @param value <anything> - The value to convert

            @return <str/unicode> - #value itself when it is already a str or
              unicode object, otherwise unicode(value)
        '''
        # Existing strings ( byte or unicode ) are passed through untouched
        if isinstance(value, (str, unicode)):
            return value
        return unicode(value)
else:
    def tostr(value):
        '''
            tostr - Convert a value to a string (python3 variant).

            @param value <anything> - The value to convert

            @return <str> - str(value)
        '''
        return str(value)
82+

0 commit comments

Comments
 (0)