@@ -216,3 +216,78 @@ def handle_urltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):
216216 debug .dump ({ 'RawText' : 'yes' , 'StrippedText' : 'yes' }, { 'RawText' : textHtml .text , 'StrippedText' : textHtml .get_stripped_text () })
217217 except Exception as exc :
218218 ph .send_error (502 , f"WARN:UrlTextFailed:{ exc } " )
219+
220+
221+ class TextXMLParser (html .parser .HTMLParser ):
222+ """
223+ A simple minded logic used to strip xml content of
224+ * all the xml tags as well as
225+ * all the contents belonging to below predefined tags like guid, enclosure, ...
226+
227+ * this works properly only if the xml being processed has proper opening and ending tags
228+ around the area of interest.
229+
230+ This helps return a relatively clean textual representation of the xml file/content being parsed.
231+ """
232+
233+ def __init__ (self , tagDrops : list [str ]):
234+ super ().__init__ ()
235+ self .tagDrops = tagDrops
236+ print (f"DBUG:TextXMLParser:{ self .tagDrops } " )
237+ self .insideTagDrops = {
238+ }
239+ for tag in tagDrops :
240+ self .insideTagDrops [tag ] = False
241+ self .bCapture = False
242+ self .text = ""
243+ self .prefix = ""
244+
245+ def do_capture (self ):
246+ """
247+ Helps decide whether to capture contents or discard them.
248+ """
249+ for tag in self .tagDrops :
250+ if self .insideTagDrops [tag ]:
251+ return False
252+ return True
253+
254+ def handle_starttag (self , tag : str , attrs : list [tuple [str , str | None ]]):
255+ self .prefix += " "
256+ if tag in self .tagDrops :
257+ self .insideTagDrops [tag ] = True
258+
259+ def handle_endtag (self , tag : str ):
260+ self .prefix = self .prefix [:- 1 ]
261+ if tag in self .tagDrops :
262+ self .insideTagDrops [tag ] = False
263+
264+ def handle_data (self , data : str ):
265+ if self .do_capture ():
266+ self .text += f"{ self .prefix } { data } \n "
267+
268+
def handle_xmltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):
    """
    Get the requested url through handle_urlreq, strip the xml tags from its
    contents using TextXMLParser and send the remaining text back to the client.

    The optional 'xmltext-tag-drops' request header carries a json array of tag
    names; the contents enclosed by those tags are dropped from the returned text.

    ph: the proxy handler representing the current client request.
    pr: the parsed url of the current request, passed on to handle_urlreq.

    Any failure along the way is reported to the client as a 502 error.
    """
    try:
        # Get requested url
        got = handle_urlreq(ph, pr, "HandleXMLText")
        if not got.callOk:
            ph.send_error(got.httpStatus, got.httpStatusMsg)
            return
        # Decide which tags need to have their contents dropped
        rawTagDrops = ph.headers.get('xmltext-tag-drops')
        if not rawTagDrops:
            tagDrops: list[str] = []
        else:
            parsedTagDrops = json.loads(rawTagDrops)
            # The header comes from the client, so ensure it is really a list of
            # strings; a bare json string would otherwise get matched char by char.
            if not isinstance(parsedTagDrops, list) or not all(isinstance(tag, str) for tag in parsedTagDrops):
                raise ValueError("xmltext-tag-drops needs to be a json array of strings")
            tagDrops = cast(list[str], parsedTagDrops)
        # Extract Text
        textXML = TextXMLParser(tagDrops)
        textXML.feed(got.contentData)
        # Flush any text still buffered by the parser after the last tag
        textXML.close()
        # Send back to client
        ph.send_response(got.httpStatus)
        ph.send_header('Content-Type', got.contentType)
        # Add CORS for browser fetch, just in case
        ph.send_header('Access-Control-Allow-Origin', '*')
        ph.end_headers()
        ph.wfile.write(textXML.text.encode('utf-8'))
        debug.dump({ 'RawText': 'yes', 'StrippedText': 'yes' }, { 'RawText': textXML.text })
    except Exception as exc:
        ph.send_error(502, f"WARN:XMLTextFailed:{exc}")
0 commit comments