|
8 | 8 | import html.parser |
9 | 9 | import debug |
10 | 10 | import filemagic as mFile |
| 11 | +import json |
11 | 12 | from typing import TYPE_CHECKING |
12 | 13 |
|
13 | 14 | if TYPE_CHECKING: |
@@ -95,36 +96,52 @@ class TextHtmlParser(html.parser.HTMLParser): |
95 | 96 | This helps return a relatively clean textual representation of the html file/content being parsed. |
96 | 97 | """ |
97 | 98 |
|
def __init__(self, tagDrops: list | dict | None = None):
    """
    Set up the state used to decide which parts of the html get captured.

    tagDrops: the rules naming elements whose contents should be discarded.
        handle_starttag walks this collection and compares each rule's tag and id
        against the start tag being parsed. It is optional, so that the parser can
        still be constructed without arguments; None is stored as an empty list.
    """
    super().__init__()
    # Normalise "no rules" to an empty list so that later iteration never sees None.
    self.tagDrops = tagDrops if tagDrops is not None else []
    # Tags whose open/close state influences capturing.
    self.monitored = ['body', 'script', 'style', 'header', 'footer', 'nav']
    # Per monitored tag: are we currently between its start and end tag.
    self.inside = {
        'body': False,
        'script': False,
        'style': False,
        'header': False,
        'footer': False,
        'nav': False,
    }
    self.bCapture = False
    self.text = ""
    self.textStripped = ""
    # Tag name of the drop region currently being skipped (None if there has been none yet).
    self.droptagType = None
    # Depth counter for the drop region; do_capture refuses to capture while it is > 0.
    self.droptagCount = 0
112 | 116 |
|
def do_capture(self):
    """
    Tell whether the text currently being parsed should be kept.

    Text is kept only while inside body, outside every one of script, style,
    header, footer and nav, and while no drop region is open (droptagCount is 0).
    """
    if not self.inside['body']:
        return False
    if self.droptagCount > 0:
        return False
    excluded = ('script', 'style', 'header', 'footer', 'nav')
    return not any(self.inside[name] for name in excluded)
120 | 124 |
|
121 | 125 | def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]): |
122 | 126 | if tag in self.monitored: |
123 | 127 | self.inside[tag] = True |
| 128 | + for tagMeta in self.tagDrops: |
| 129 | + if tag != tagMeta.tag: |
| 130 | + continue |
| 131 | + for attr in attrs: |
| 132 | + if attr[0] != 'id': |
| 133 | + continue |
| 134 | + if attr[1] == tagMeta.id: |
| 135 | + self.droptagCount += 1 |
| 136 | + self.droptagType = tag |
124 | 137 |
|
def handle_endtag(self, tag: str):
    """
    Note exit from monitored tags and unwind the drop region by one level.

    tag: name of the element being closed.
    """
    if tag in self.monitored:
        self.inside[tag] = False
    if tag != self.droptagType:
        return
    # Never let the depth go negative, e.g. on a stray end tag once the region is closed.
    self.droptagCount = max(self.droptagCount - 1, 0)
128 | 145 |
|
129 | 146 | def handle_data(self, data: str): |
130 | 147 | if self.do_capture(): |
@@ -167,7 +184,12 @@ def handle_urltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult): |
167 | 184 | ph.send_error(got.httpStatus, got.httpStatusMsg) |
168 | 185 | return |
169 | 186 | # Extract Text |
170 | | - textHtml = TextHtmlParser() |
| 187 | + tagDrops = ph.headers.get('urltext-tag-drops') |
| 188 | + if not tagDrops: |
| 189 | + tagDrops = {} |
| 190 | + else: |
| 191 | + tagDrops = json.loads(tagDrops) |
| 192 | + textHtml = TextHtmlParser(tagDrops) |
171 | 193 | textHtml.feed(got.contentData) |
172 | 194 | # Send back to client |
173 | 195 | ph.send_response(got.httpStatus) |
|
0 commit comments