Skip to content

Commit 2c995a4

Browse files
committed
SimpleChatTC:SimpleProxy:XMLText: initial go
Take the existing urltext logic, including its html parser, and strip it down to a simpler form.
1 parent 32429f4 commit 2c995a4

File tree

2 files changed

+78
-0
lines changed

2 files changed

+78
-0
lines changed

tools/server/public_simplechat/local.tools/simpleproxy.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
gConfigNeeded = [ '--allowed.schemes', '--allowed.domains', '--bearer.insecure' ]
5050

5151
gAllowedCalls = {
52+
"xmltext": [],
5253
"urltext": [],
5354
"urlraw": [],
5455
"pdftext": [ "pypdf" ]
@@ -139,6 +140,8 @@ def do_GET(self):
139140
self.auth_and_run(pr, mWeb.handle_urlraw)
140141
case '/urltext':
141142
self.auth_and_run(pr, mWeb.handle_urltext)
143+
case '/xmltext':
144+
self.auth_and_run(pr, mWeb.handle_xmltext)
142145
case '/pdftext':
143146
self.auth_and_run(pr, mPdf.handle_pdftext)
144147
case '/aum':

tools/server/public_simplechat/local.tools/webmagic.py

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,3 +216,78 @@ def handle_urltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):
216216
debug.dump({ 'RawText': 'yes', 'StrippedText': 'yes' }, { 'RawText': textHtml.text, 'StrippedText': textHtml.get_stripped_text() })
217217
except Exception as exc:
218218
ph.send_error(502, f"WARN:UrlTextFailed:{exc}")
219+
220+
221+
class TextXMLParser(html.parser.HTMLParser):
222+
"""
223+
A simple minded logic used to strip xml content of
224+
* all the xml tags as well as
225+
* all the contents belonging to below predefined tags like guid, enclosure, ...
226+
227+
* this works properly only if the xml being processed has proper opening and ending tags
228+
around the area of interest.
229+
230+
This helps return a relatively clean textual representation of the xml file/content being parsed.
231+
"""
232+
233+
def __init__(self, tagDrops: list[str]):
234+
super().__init__()
235+
self.tagDrops = tagDrops
236+
print(f"DBUG:TextXMLParser:{self.tagDrops}")
237+
self.insideTagDrops = {
238+
}
239+
for tag in tagDrops:
240+
self.insideTagDrops[tag] = False
241+
self.bCapture = False
242+
self.text = ""
243+
self.prefix = ""
244+
245+
def do_capture(self):
246+
"""
247+
Helps decide whether to capture contents or discard them.
248+
"""
249+
for tag in self.tagDrops:
250+
if self.insideTagDrops[tag]:
251+
return False
252+
return True
253+
254+
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]):
255+
self.prefix += " "
256+
if tag in self.tagDrops:
257+
self.insideTagDrops[tag] = True
258+
259+
def handle_endtag(self, tag: str):
260+
self.prefix = self.prefix[:-1]
261+
if tag in self.tagDrops:
262+
self.insideTagDrops[tag] = False
263+
264+
def handle_data(self, data: str):
265+
if self.do_capture():
266+
self.text += f"{self.prefix}{data}\n"
267+
268+
269+
def handle_xmltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):
    """
    Fetch the requested url and send back its xml content stripped down to plain text.

    The optional 'xmltext-tag-drops' request header carries a json list of tag names
    whose contents should be dropped from the returned text (see TextXMLParser).

    * if the url request fails, its http status and message are relayed as the error.
    * if the header is not a json list of strings, a 400 error is sent.
    * any other failure is reported as a 502 error.
    """
    try:
        # Get requested url
        got = handle_urlreq(ph, pr, "HandleXMLText")
        if not got.callOk:
            ph.send_error(got.httpStatus, got.httpStatusMsg)
            return
        # Work out the tags to drop; the header is client supplied, so validate it
        # instead of trusting it to be a list of strings.
        tagDrops: list[str] = []
        tagDropsHdr = ph.headers.get('xmltext-tag-drops')
        if tagDropsHdr:
            try:
                parsed = json.loads(tagDropsHdr)
            except ValueError as exc:
                ph.send_error(400, f"WARN:XMLTextFailed:BadTagDrops:{exc}")
                return
            if not isinstance(parsed, list) or not all(isinstance(tag, str) for tag in parsed):
                ph.send_error(400, "WARN:XMLTextFailed:BadTagDrops:expected a json list of strings")
                return
            tagDrops = cast(list[str], parsed)
        # Extract Text
        textXML = TextXMLParser(tagDrops)
        textXML.feed(got.contentData)
        # Flush whatever the parser is still buffering, else trailing data could be lost
        textXML.close()
        # Send back to client
        ph.send_response(got.httpStatus)
        ph.send_header('Content-Type', got.contentType)
        # Add CORS for browser fetch, just in case
        ph.send_header('Access-Control-Allow-Origin', '*')
        ph.end_headers()
        ph.wfile.write(textXML.text.encode('utf-8'))
        debug.dump({ 'RawText': 'yes', 'StrippedText': 'yes' }, { 'RawText': textXML.text })
    except Exception as exc:
        ph.send_error(502, f"WARN:XMLTextFailed:{exc}")

0 commit comments

Comments
 (0)