|
9 | 9 | import debug |
10 | 10 | import filemagic as mFile |
11 | 11 | import json |
12 | | -from typing import TYPE_CHECKING |
| 12 | +from typing import TYPE_CHECKING, Any, cast |
13 | 13 |
|
14 | 14 | if TYPE_CHECKING: |
15 | 15 | from simpleproxy import ProxyHandler |
@@ -93,12 +93,21 @@ class TextHtmlParser(html.parser.HTMLParser): |
93 | 93 | html content, that logic wont be triggered, so also such client side dynamic content wont be |
94 | 94 | got. |
95 | 95 |
|
| 96 | + Allows the caller to specify a list of tags and their corresponding id attributes, so that contents |
| 97 | + within the specified blocks are dropped. |
| 98 | +
|
| 99 | + * this works properly only if the html being processed has proper opening and closing tags |
| 100 | + around the area of interest. |
| 101 | + * remember to specify non-overlapping tag blocks if more than one is specified for dropping. |
| 102 | + * this path is not tested, but should logically work. |
| 103 | +
|
96 | 104 | This helps return a relatively clean textual representation of the html file/content being parsed. |
97 | 105 | """ |
98 | 106 |
|
99 | | - def __init__(self, tagDrops: dict): |
| 107 | + def __init__(self, tagDrops: list[dict[str, Any]]): |
100 | 108 | super().__init__() |
101 | 109 | self.tagDrops = tagDrops |
| 110 | + print(f"DBUG:TextHtmlParser:{self.tagDrops}") |
102 | 111 | self.inside = { |
103 | 112 | 'body': False, |
104 | 113 | 'script': False, |
@@ -126,20 +135,27 @@ def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]): |
126 | 135 | if tag in self.monitored: |
127 | 136 | self.inside[tag] = True |
128 | 137 | for tagMeta in self.tagDrops: |
129 | | - if tag != tagMeta.tag: |
| 138 | + if tag != tagMeta['tag']: |
| 139 | + continue |
| 140 | + if (self.droptagCount > 0) and (self.droptagType == tag): |
| 141 | + self.droptagCount += 1 |
130 | 142 | continue |
131 | 143 | for attr in attrs: |
132 | 144 | if attr[0] != 'id': |
133 | 145 | continue |
134 | | - if attr[1] == tagMeta.id: |
| 146 | + if attr[1] == tagMeta['id']: |
135 | 147 | self.droptagCount += 1 |
136 | 148 | self.droptagType = tag |
| 149 | + print(f"DBUG:THP:Start:Tag found [{tag}:{attr[1]}]...") |
137 | 150 |
|
138 | 151 | def handle_endtag(self, tag: str): |
139 | 152 | if tag in self.monitored: |
140 | 153 | self.inside[tag] = False |
141 | | - if tag == self.droptagType: |
| 154 | + if self.droptagType and (tag == self.droptagType): |
142 | 155 | self.droptagCount -= 1 |
| 156 | + if self.droptagCount == 0: |
| 157 | + self.droptagType = None |
| 158 | + print("DBUG:THP:End:Tag found...") |
143 | 159 | if self.droptagCount < 0: |
144 | 160 | self.droptagCount = 0 |
145 | 161 |
|
@@ -186,9 +202,9 @@ def handle_urltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult): |
186 | 202 | # Extract Text |
187 | 203 | tagDrops = ph.headers.get('urltext-tag-drops') |
188 | 204 | if not tagDrops: |
189 | | - tagDrops = {} |
| 205 | + tagDrops = [] |
190 | 206 | else: |
191 | | - tagDrops = json.loads(tagDrops) |
| 207 | + tagDrops = cast(list[dict[str,Any]], json.loads(tagDrops)) |
192 | 208 | textHtml = TextHtmlParser(tagDrops) |
193 | 209 | textHtml.feed(got.contentData) |
194 | 210 | # Send back to client |
|
0 commit comments