Skip to content

Commit f82dd06

Browse files
committed
AdvancedHTMLFormatter - Add two 'slim-tag' formatters with the ability to trim <span id='abc' > into <span id='abc'> and optionally <br /> into <br/>
1 parent 64412c0 commit f82dd06

File tree

1 file changed

+145
-3
lines changed

1 file changed

+145
-3
lines changed

AdvancedHTMLParser/Formatter.py

Lines changed: 145 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
import codecs
2828

2929

30-
__all__ = ('AdvancedHTMLFormatter', 'AdvancedHTMLMiniFormatter')
30+
__all__ = ('AdvancedHTMLFormatter', 'AdvancedHTMLMiniFormatter', 'AdvancedHTMLSlimTagFormatter', 'AdvancedHTMLSlimTagMiniFormatter')
3131

3232
class AdvancedHTMLFormatter(HTMLParser):
3333
'''
@@ -204,7 +204,7 @@ def handle_endtag(self, tagName):
204204
if not foundIt:
205205
sys.stderr.write('WARNING: found close tag with no matching start.\n')
206206
return
207-
207+
208208
while inTag[-1].tagName != tagName:
209209
oldTag = inTag.pop()
210210
if oldTag.tagName in PREFORMATTED_TAGS:
@@ -284,7 +284,7 @@ def unknown_decl(self, decl):
284284
def parseFile(self, filename):
285285
'''
286286
parseFile - Parses a file and creates the DOM tree and indexes
287-
287+
288288
@param filename <str/file> - A string to a filename or a file object. If file object, it will not be closed, you must close.
289289
'''
290290
self.reset()
@@ -326,4 +326,146 @@ def __init__(self, encoding='utf-8'):
326326
def _getIndent(self):
327327
return ''
328328

329+
330+
class AdvancedTagSlim(AdvancedTag):
331+
'''
332+
AdvancedTagSlim - A special class which extends AdvancedTag, but uses
333+
334+
slim-endings (which may have parsing issues on some old/strange parsers)
335+
336+
I.e. instead of <span id="blah" > you would have <span id="blah">
337+
338+
We still by default keep <br /> as having the space because of xhtml attribute rules,
339+
but can be disabled y passing slimSelfClosing=True to __init__
340+
341+
342+
@NOTE: You should NOT use this directly, they are for use by the SlimTagFormatter s
343+
'''
344+
345+
def __init__(self, *args, **kwargs):
346+
'''
347+
__init__ - Create an AdvancedTagSlim object.
348+
349+
@see AdvancedTag
350+
351+
Extra arguments:
352+
353+
slimSelfClosing <bool> default False - If True, will use slim-endings on self-closing tags,
354+
355+
i.e. <br/> instead of <br />
356+
357+
This may break xhtml compatibility but modern browsers are okay with it.
358+
'''
359+
if 'slimSelfClosing' in kwargs:
360+
slimSelfClosing = kwargs.pop('slimSelfClosing')
361+
else:
362+
slimSelfClosing = False
363+
AdvancedTag.__init__(self, *args, **kwargs)
364+
365+
object.__setattr__(self, 'slimSelfClosing', slimSelfClosing)
366+
367+
368+
def getStartTag(self, *args, **kwargs):
369+
'''
370+
getStartTag - Override the end-spacing rules
371+
372+
@see AdvancedTag.getStartTag
373+
'''
374+
375+
ret = AdvancedTag.getStartTag(self, *args, **kwargs)
376+
377+
if ret.endswith(' >'):
378+
ret = ret[:-2] + '>'
379+
elif object.__getattribute__(self, 'slimSelfClosing') and ret.endswith(' />'):
380+
ret = ret[:-3] + '/>'
381+
382+
return ret
383+
384+
385+
class AdvancedHTMLSlimTagFormatter(AdvancedHTMLFormatter):
386+
'''
387+
AdvancedHTMLSlimTagFormatter - Formats HTML with slim start tags,
388+
which may break some xhtml-compatible parsers.
389+
390+
For example <span id="abc" > will become <span id="abc">.
391+
392+
Remainder will be pretty-printed. For mini-printing, @see AdvancedHTMLSlimTagMiniFormatter
393+
394+
If slimSelfClosing=True on __init__, <br /> will become <br/> as well
395+
'''
396+
397+
398+
def __init__(self, indent=' ', encoding='utf-8', slimSelfClosing=False):
399+
'''
400+
__init__ - Construct an AdvancedHTMLSlimTagFormatter
401+
402+
@see AdvancedHTMLFormatter
403+
404+
@param slimSelfClosing <bool> Default False - If True, will use slim self-closing tags,
405+
406+
e.x. <br /> becomes <br/>
407+
'''
408+
409+
AdvancedHTMLFormatter.__init__(self, indent=indent, encoding=encoding)
410+
411+
self.slimSelfClosing = slimSelfClosing
412+
413+
def handle_starttag(self, tagName, attributeList, isSelfClosing=False):
414+
'''
415+
handle_starttag - Handles parsing a start tag.
416+
417+
@see AdvancedHTMLFormatter.handle_starttag
418+
'''
419+
tagName = tagName.lower()
420+
inTag = self._inTag
421+
422+
if isSelfClosing is False and tagName in IMPLICIT_SELF_CLOSING_TAGS:
423+
isSelfClosing = True
424+
425+
newTag = AdvancedTagSlim(tagName, attributeList, isSelfClosing, slimSelfClosing=self.slimSelfClosing)
426+
if self.root is None:
427+
self.root = newTag
428+
elif len(inTag) > 0:
429+
inTag[-1].appendChild(newTag)
430+
else:
431+
raise MultipleRootNodeException()
432+
433+
if self.inPreformatted is 0:
434+
newTag._indent = self._getIndent()
435+
436+
if tagName in PREFORMATTED_TAGS:
437+
self.inPreformatted += 1
438+
439+
if isSelfClosing is False:
440+
inTag.append(newTag)
441+
if tagName != INVISIBLE_ROOT_TAG:
442+
self.currentIndentLevel += 1
443+
444+
445+
class AdvancedHTMLSlimTagMiniFormatter(AdvancedHTMLMiniFormatter):
446+
'''
447+
AdvancedHTMLSlimTagMiniFormatter - A "mini" formatter that
448+
removes all non-functional whitespace (including all indentations)
449+
450+
Also uses "slim" start tags, @see AdvancedHTMLSlimTagFormatter for more info
451+
'''
452+
453+
def __init__(self, encoding='utf-8', slimSelfClosing=False):
454+
'''
455+
__init__ - Create an AdvancedHTMLSlimTagMiniFormatter
456+
457+
@see AdvancedHTMLMiniFormatter
458+
459+
@param slimSelfClosing <bool> Default False - If True, will use slim self-closing tags,
460+
461+
e.x. <br /> becomes <br/>
462+
'''
463+
464+
AdvancedHTMLMiniFormatter.__init__(self, encoding=encoding)
465+
466+
self.slimSelfClosing = slimSelfClosing
467+
468+
handle_starttag = AdvancedHTMLSlimTagFormatter.handle_starttag
469+
470+
329471
#vim: set ts=4 sw=4 expandtab

0 commit comments

Comments
 (0)