Skip to content

Commit 34fe99e

Browse files
committed
SimpleChatTC:WebTools: urltext-tag-drops python side - skel
Rename search-drops to urltext-tag-drops, to indicate its more generic semantics. The search drops specified in the UI by the user will instead be mapped to the urltext-tag-drops header entry of a urltext web fetch request. Implement a crude urltext-tag-drops logic in TextHtmlParser. If there is any mismatch between the opening and closing tags in the html being parsed, and in turn with respect to the type of tag being targeted for dropping, things can mess up.
1 parent fe2443b commit 34fe99e

File tree

2 files changed

+27
-5
lines changed

2 files changed

+27
-5
lines changed

tools/server/public_simplechat/local.tools/webmagic.py

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import html.parser
99
import debug
1010
import filemagic as mFile
11+
import json
1112
from typing import TYPE_CHECKING
1213

1314
if TYPE_CHECKING:
@@ -95,36 +96,52 @@ class TextHtmlParser(html.parser.HTMLParser):
9596
This helps return a relatively clean textual representation of the html file/content being parsed.
9697
"""
9798

98-
def __init__(self):
99+
def __init__(self, tagDrops: list | dict | None = None):
    """
    Initialise the parser state used to extract text from html.

    tagDrops: the collection of tag-drop entries that handle_starttag iterates
        over when deciding whether an element's contents should be discarded.
        It is optional; when omitted (or None) an empty list is stored, so the
        parser can still be created without any argument.
    """
    super().__init__()
    # Stored as given; None is normalised so that later iteration is always safe.
    self.tagDrops = tagDrops if tagDrops is not None else []
    # Tracks, for each monitored tag, whether the parser is currently inside it.
    self.inside = {
        'body': False,
        'script': False,
        'style': False,
        'header': False,
        'footer': False,
        'nav': False,
    }
    # Tags whose start/end toggle the matching entry in self.inside.
    self.monitored = [ 'body', 'script', 'style', 'header', 'footer', 'nav' ]
    self.bCapture = False
    # Accumulated text and its stripped form; both start out empty.
    self.text = ""
    self.textStripped = ""
    # Tag name of the element currently being dropped (None when not dropping)
    # and the counter which keeps capture disabled while it is above zero.
    self.droptagType = None
    self.droptagCount = 0
112116

113117
def do_capture(self):
    """
    Tell whether the content seen at this point should be kept.

    Content is kept only while inside body, outside each of script, style,
    header, footer and nav, and while the tag-drop counter is not above zero.
    """
    if not self.inside['body']:
        return False
    for blocker in ('script', 'style', 'header', 'footer', 'nav'):
        if self.inside[blocker]:
            return False
    # A positive counter means a tag-drop is currently active.
    return not (self.droptagCount > 0)
120124

121125
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]):
122126
if tag in self.monitored:
123127
self.inside[tag] = True
128+
for tagMeta in self.tagDrops:
129+
if tag != tagMeta.tag:
130+
continue
131+
for attr in attrs:
132+
if attr[0] != 'id':
133+
continue
134+
if attr[1] == tagMeta.id:
135+
self.droptagCount += 1
136+
self.droptagType = tag
124137

125138
def handle_endtag(self, tag: str):
    """
    Update the parser state for a closing tag.

    A monitored tag is flagged as no longer being inside. If the tag is of the
    type currently recorded in self.droptagType, the drop counter is reduced by
    one, never going below zero.
    """
    if tag in self.monitored:
        self.inside[tag] = False
    if tag != self.droptagType:
        return
    # Clamp at zero so that surplus closing tags cannot push the counter negative.
    self.droptagCount = max(self.droptagCount - 1, 0)
128145

129146
def handle_data(self, data: str):
130147
if self.do_capture():
@@ -167,7 +184,12 @@ def handle_urltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):
167184
ph.send_error(got.httpStatus, got.httpStatusMsg)
168185
return
169186
# Extract Text
170-
textHtml = TextHtmlParser()
187+
tagDrops = ph.headers.get('urltext-tag-drops')
188+
if not tagDrops:
189+
tagDrops = {}
190+
else:
191+
tagDrops = json.loads(tagDrops)
192+
textHtml = TextHtmlParser(tagDrops)
171193
textHtml.feed(got.contentData)
172194
# Send back to client
173195
ph.send_response(got.httpStatus)

tools/server/public_simplechat/toolweb.mjs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -259,7 +259,7 @@ function searchwebtext_run(chatid, toolcallid, toolname, obj) {
259259
searchUrl = searchUrl.replace("SEARCHWORDS", encodeURIComponent(obj.words));
260260
delete(obj.words)
261261
obj['url'] = searchUrl
262-
let headers = { 'Search-Drops': get_gme().tools.searchDrops }
262+
let headers = { 'urltext-tag-drops': get_gme().tools.searchDrops }
263263
return proxyserver_get_anyargs(chatid, toolcallid, toolname, obj, 'urltext', headers);
264264
}
265265
}

0 commit comments

Comments
 (0)