SimpleChatTC:XmlText: Cleanup initial go

hanishkvc · hanishkvc · commit 802fc737017e · 2025-11-07T04:54:40.000+05:30
At simpleproxy end

* Add the tag names hierarchy before contents of a tag

* Remember to convert the tagDrops to lower case, as the HTMLParser base
  class seems to lowercase tag names by default.

At the client ui end

* if tagDrops is undefined, remember to pass an empty list for it.

* cleanup the func description and also mention possible tagDrops
  for RSS feeds in the tool meta
diff --git a/tools/server/public_simplechat/local.tools/webmagic.py b/tools/server/public_simplechat/local.tools/webmagic.py
@@ -232,15 +232,15 @@ class TextXMLParser(html.parser.HTMLParser):
 
     def __init__(self, tagDrops: list[str]):
         super().__init__()
-        self.tagDrops = tagDrops
+        self.tagDrops = list(map(str.lower, tagDrops))
         print(f"DBUG:TextXMLParser:{self.tagDrops}")
         self.insideTagDrops = {
         }
         for tag in tagDrops:
             self.insideTagDrops[tag] = False
         self.bCapture = False
         self.text = ""
-        self.prefix = ""
+        self.prefix = []
 
     def do_capture(self):
         """
@@ -252,18 +252,18 @@ def do_capture(self):
         return True
 
     def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]):
-        self.prefix += " "
+        self.prefix.append(tag)
         if tag in self.tagDrops:
             self.insideTagDrops[tag] = True
 
     def handle_endtag(self, tag: str):
-        self.prefix = self.prefix[:-1]
+        self.prefix.pop()
         if tag in self.tagDrops:
             self.insideTagDrops[tag] = False
 
     def handle_data(self, data: str):
         if self.do_capture():
-            self.text += f"{self.prefix}{data}\n"
+            self.text += f"{':'.join(self.prefix)}:{data}\n"
 
 
 def handle_xmltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):
diff --git a/tools/server/public_simplechat/readme.md b/tools/server/public_simplechat/readme.md
@@ -463,6 +463,10 @@ plain textual content from the search result page.
 * fetch_pdf_as_text - fetch/read specified pdf file and extract its textual content
   * this depends on the pypdf python based open source library
 
+* fetch_xml_as_text - fetch/read specified xml file and extract its textual content
+  * prefixes each leaf content with its tag hierarchy
+  * allows one to specify a list of tags that are to be dropped fully.
+
 the above set of web related tool calls work by handshaking with a bundled simple local web proxy
 (/caching in future) server logic, this helps bypass the CORS restrictions applied if trying to
 directly fetch from the browser js runtime environment.
@@ -650,6 +654,7 @@ sliding window based drop off or even before they kick in, this can help in many
   or if there is no response within the configured timeout period.
   NOTE: Currently the logic supports only 1 pending tool call per chat session.
 
+* add support for fetch_xml_as_text tool call, fix importmaps in index.html
 
 #### ToDo
 
diff --git a/tools/server/public_simplechat/toolweb.mjs b/tools/server/public_simplechat/toolweb.mjs
@@ -334,11 +334,13 @@ async function fetchpdftext_setup(tcs) {
 //
 
 
+let gRSSTagDropsDefault = [ "guid", "link", "description", "image", "enclosure" ]
+
 let fetchxmltext_meta = {
         "type": "function",
         "function": {
             "name": "fetch_xml_as_text",
-            "description": "Fetch the requested xml url through a proxy server and return its text content after stripping away the xml tags, in few seconds",
+            "description": "Fetch requested xml url through a proxy server and return its cleaned up text contents. Each content is prefixed with the xml tag heirarchy that it belongs to. Will take few seconds",
             "parameters": {
                 "type": "object",
                 "properties": {
@@ -348,7 +350,7 @@ let fetchxmltext_meta = {
                     },
                     "tagDrops":{
                         "type":"string",
-                        "description":"specify a json stringified form of list of xml tags to drop"
+                        "description":`Optionally specify a json stringified list of xml tags to drop. For example for rss feeds one could use ${JSON.stringify(gRSSTagDropsDefault)} and so...`
                     }
                 },
                 "required": ["url"]
@@ -367,7 +369,11 @@ let fetchxmltext_meta = {
  * @param {any} obj
  */
 function fetchxmltext_run(chatid, toolcallid, toolname, obj) {
-    let headers = { 'xmltext-tag-drops': obj.tagDrops }
+    let tagDrops = obj.tagDrops
+    if (tagDrops == undefined) {
+        tagDrops = JSON.stringify([]) // JSON.stringify(gRSSTagDropsDefault)
+    }
+    let headers = { 'xmltext-tag-drops': tagDrops }
     return proxyserver_get_anyargs(chatid, toolcallid, toolname, obj, 'xmltext', headers);
 }