SimpleChatTC:WebTools: urltext-tag-drops python side - skel

hanishkvc · hanishkvc · commit 34fe99eba029 · 2025-11-06T15:32:19.000+05:30
Rename search-drops to urltext-tag-drops, to indicate its more
generic semantics. That is, the search drops specified in the UI by
the user will be mapped to the urltext-tag-drops header entry of a
urltext web fetch request.

Implement a crude urltext-tag-drops logic in TextHtmlParser.
If there is any mismatch between opening and closing tags in the
html being parsed, and in turn with respect to the type of tag being
targeted for dropping, the extracted text can get messed up.
diff --git a/tools/server/public_simplechat/local.tools/webmagic.py b/tools/server/public_simplechat/local.tools/webmagic.py
@@ -8,6 +8,7 @@
 import html.parser
 import debug
 import filemagic as mFile
+import json
 from typing import TYPE_CHECKING
 
 if TYPE_CHECKING:
@@ -95,36 +96,52 @@ class TextHtmlParser(html.parser.HTMLParser):
     This helps return a relatively clean textual representation of the html file/content being parsed.
     """
 
-    def __init__(self):
+    def __init__(self, tagDrops: dict):
         super().__init__()
+        self.tagDrops = tagDrops
         self.inside = {
             'body': False,
             'script': False,
             'style': False,
             'header': False,
             'footer': False,
-            'nav': False
+            'nav': False,
         }
         self.monitored = [ 'body', 'script', 'style', 'header', 'footer', 'nav' ]
         self.bCapture = False
         self.text = ""
         self.textStripped = ""
+        self.droptagType = None
+        self.droptagCount = 0
 
     def do_capture(self):
         """
         Helps decide whether to capture contents or discard them.
         """
-        if self.inside['body'] and not (self.inside['script'] or self.inside['style'] or self.inside['header'] or self.inside['footer'] or self.inside['nav']):
+        if self.inside['body'] and not (self.inside['script'] or self.inside['style'] or self.inside['header'] or self.inside['footer'] or self.inside['nav'] or (self.droptagCount > 0)):
             return True
         return False
 
     def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]):
         if tag in self.monitored:
             self.inside[tag] = True
+        for tagMeta in self.tagDrops:
+            if tag != tagMeta.tag:
+                continue
+            for attr in attrs:
+                if attr[0] != 'id':
+                    continue
+                if attr[1] == tagMeta.id:
+                    self.droptagCount += 1
+                    self.droptagType = tag
 
     def handle_endtag(self, tag: str):
         if tag in self.monitored:
             self.inside[tag] = False
+        if tag == self.droptagType:
+            self.droptagCount -= 1
+            if self.droptagCount < 0:
+                self.droptagCount = 0
 
     def handle_data(self, data: str):
         if self.do_capture():
@@ -167,7 +184,12 @@ def handle_urltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):
             ph.send_error(got.httpStatus, got.httpStatusMsg)
             return
         # Extract Text
-        textHtml = TextHtmlParser()
+        tagDrops = ph.headers.get('urltext-tag-drops')
+        if not tagDrops:
+            tagDrops = {}
+        else:
+            tagDrops = json.loads(tagDrops)
+        textHtml = TextHtmlParser(tagDrops)
         textHtml.feed(got.contentData)
         # Send back to client
         ph.send_response(got.httpStatus)
diff --git a/tools/server/public_simplechat/toolweb.mjs b/tools/server/public_simplechat/toolweb.mjs
@@ -259,7 +259,7 @@ function searchwebtext_run(chatid, toolcallid, toolname, obj) {
         searchUrl = searchUrl.replace("SEARCHWORDS", encodeURIComponent(obj.words));
         delete(obj.words)
         obj['url'] = searchUrl
-        let headers = { 'Search-Drops': get_gme().tools.searchDrops }
+        let headers = { 'urltext-tag-drops': get_gme().tools.searchDrops }
         return proxyserver_get_anyargs(chatid, toolcallid, toolname, obj, 'urltext', headers);
     }
 }

Original file line number	Diff line number	Diff line change
`@@ -259,7 +259,7 @@ function searchwebtext_run(chatid, toolcallid, toolname, obj) {`
`259`	`259`	`searchUrl = searchUrl.replace("SEARCHWORDS", encodeURIComponent(obj.words));`
`260`	`260`	`delete(obj.words)`
`261`	`261`	`obj['url'] = searchUrl`
`262`		`- let headers = { 'Search-Drops': get_gme().tools.searchDrops }`
	`262`	`+ let headers = { 'urltext-tag-drops': get_gme().tools.searchDrops }`
`263`	`263`	`return proxyserver_get_anyargs(chatid, toolcallid, toolname, obj, 'urltext', headers);`
`264`	`264`	`}`
`265`	`265`	`}`