Skip to content

Commit 8e853a5

Browse files
committed
Add a new AdvancedHTMLMiniFormatter to Formatter.py, which outputs only the functional whitespace of a document, stripping all excess whitespace and doing no pretty-printing. This is exposed on AdvancedHTMLParser.AdvancedHTMLParser via a new method 'getMiniHTML'.
1 parent fd6faa0 commit 8e853a5

File tree

2 files changed

+52
-9
lines changed

2 files changed

+52
-9
lines changed

AdvancedHTMLParser/Formatter.py

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
import codecs
2828

2929

30-
__all__ = ('AdvancedHTMLFormatter', )
30+
__all__ = ('AdvancedHTMLFormatter', 'AdvancedHTMLMiniFormatter')
3131

3232
class AdvancedHTMLFormatter(HTMLParser):
3333
'''
@@ -37,10 +37,11 @@ class AdvancedHTMLFormatter(HTMLParser):
3737

3838
def __init__(self, indent=' ', encoding='utf-8'):
3939
'''
40-
Create a formatter.
40+
Create a pretty formatter.
4141
42-
@param indent - Either a space/tab/newline that represents one level of indent, or an integer to use that number of spaces
43-
@param encoding - Use this encoding for the document.
42+
@param indent <str/int>, Default ' ' [4 spaces] - Either a space/tab/newline that represents one level of indent, or an integer to use that number of spaces
43+
44+
@param encoding <str/None>, Default 'utf-8', - Use this encoding for the document. None to not mess with encoding
4445
'''
4546
HTMLParser.__init__(self)
4647

@@ -307,4 +308,22 @@ def parseStr(self, html):
307308
else:
308309
self.feed(html)
309310

311+
312+
class AdvancedHTMLMiniFormatter(AdvancedHTMLFormatter):
    '''
        AdvancedHTMLMiniFormatter - Formatter variant that re-emits a document with only its
          functional whitespace, dropping all indentation and nesting spaces.
    '''

    def __init__(self, encoding='utf-8'):
        '''
            Create a mini formatter.

            @param encoding <str/None>, Default 'utf-8' - Encoding to use for the document.
              Pass None to leave encoding untouched.
        '''
        # Explicit base-class call (not super()): kept as in the original so it
        #  works regardless of whether the HTMLParser base is a new-style class.
        # The indent unit handed to the parent is always the empty string.
        AdvancedHTMLFormatter.__init__(self, encoding=encoding, indent='')

    def _getIndent(self):
        '''
            _getIndent - Indentation for the current position; always empty for this formatter.

            NOTE(review): presumably overrides the parent's indent calculation - confirm
              against AdvancedHTMLFormatter.

            @return <str> - The empty string
        '''
        return ''
328+
310329
#vim: set ts=4 sw=4 expandtab

AdvancedHTMLParser/Parser.py

Lines changed: 29 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -767,8 +767,15 @@ def doMatchFunc(em):
767767

768768
def getHTML(self):
769769
'''
770-
getHTML - Get the full HTML as contained within this tree
771-
@returns - String
770+
getHTML - Get the full HTML as contained within this tree.
771+
772+
If parsed from a document, this will contain the original whitespacing.
773+
774+
@returns - <str> of html
775+
776+
@see getFormattedHTML
777+
778+
@see getMiniHTML
772779
'''
773780
root = self.getRoot()
774781
if root is None:
@@ -799,18 +806,35 @@ def getHTML(self):
799806

800807
def getFormattedHTML(self, indent=' '):
801808
'''
802-
getFormattedHTML - Get formatted and xhtml of this document
809+
getFormattedHTML - Get formatted and xhtml of this document, replacing the original whitespace
810+
with a pretty-printed version
803811
804812
@param indent - space/tab/newline of each level of indent, or integer for how many spaces per level
805813
806-
@return - Formatted html as string
814+
@return - <str> Formatted html
815+
816+
@see getHTML - Get HTML with original whitespace
817+
818+
@see getMiniHTML - Get HTML with only functional whitespace remaining
807819
'''
808820
from .Formatter import AdvancedHTMLFormatter
809821
html = self.getHTML()
810822
formatter = AdvancedHTMLFormatter(indent, None) # Do not double-encode
811823
formatter.feed(html)
812824
return formatter.getHTML()
813-
825+
826+
def getMiniHTML(self):
    '''
        getMiniHTML - Get this document's HTML with no pretty formatting, discarding
          the original whitespace except where it is functional.

        @return <str> - HTML with only functional whitespace present
    '''
    # Imported at call time, same as the original.
    from .Formatter import AdvancedHTMLMiniFormatter

    rawHTML = self.getHTML()

    # encoding=None: do not double-encode the already-built HTML.
    miniFormatter = AdvancedHTMLMiniFormatter(encoding=None)
    miniFormatter.feed(rawHTML)

    return miniFormatter.getHTML()
814838

815839
def _reset(self):
816840
'''

0 commit comments

Comments
 (0)