Skip to content

Commit babfb96

Browse files
committed
SimpleChatTC:WebTools:UrlText:HtmlParser: tag drops - refine
Update the initial skeleton wrt the tag drops logic * had forgotten to convert the object to a json string at the client end * had confused js with python and tried accessing the dict elements using . notation rather than [] notation in python. * once the id-filtered tag to be dropped is found, from then on track all other tags of the same type (independent of id), so that start and end tags can be matched. Because the end tag call won't have attributes, all other tags of the same type need to be tracked, for proper winding and unwinding to find the matching end tag * remember to reset the tracked drop tag type to None once the matching end tag at the same depth is found. This should avoid some unnecessary unwinding. * set/fix the type wrt tagDrops explicitly to the needed depth and ensure the dummy one and any explicitly got one are of the right type. Tested with the duckduckgo search engine; the div-based unneeded header is now avoided in the returned search result.
1 parent 34fe99e commit babfb96

File tree

3 files changed

+31
-8
lines changed

3 files changed

+31
-8
lines changed

tools/server/public_simplechat/local.tools/webmagic.py

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
import debug
1010
import filemagic as mFile
1111
import json
12-
from typing import TYPE_CHECKING
12+
from typing import TYPE_CHECKING, Any, cast
1313

1414
if TYPE_CHECKING:
1515
from simpleproxy import ProxyHandler
@@ -93,12 +93,21 @@ class TextHtmlParser(html.parser.HTMLParser):
9393
html content, that logic wont be triggered, so also such client side dynamic content wont be
9494
got.
9595
96+
Allows one to specify a list of tags and their corresponding id attributes, so that contents
97+
within such specified blocks will be dropped.
98+
99+
* this works properly only if the html being processed has proper opening and ending tags
100+
around the area of interest.
101+
* remember to specify non overlapping tag blocks, if more than one specified for dropping.
102+
* this path not tested, but should logically work
103+
96104
This helps return a relatively clean textual representation of the html file/content being parsed.
97105
"""
98106

99-
def __init__(self, tagDrops: dict):
107+
def __init__(self, tagDrops: list[dict[str, Any]]):
100108
super().__init__()
101109
self.tagDrops = tagDrops
110+
print(f"DBUG:TextHtmlParser:{self.tagDrops}")
102111
self.inside = {
103112
'body': False,
104113
'script': False,
@@ -126,20 +135,27 @@ def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]):
126135
if tag in self.monitored:
127136
self.inside[tag] = True
128137
for tagMeta in self.tagDrops:
129-
if tag != tagMeta.tag:
138+
if tag != tagMeta['tag']:
139+
continue
140+
if (self.droptagCount > 0) and (self.droptagType == tag):
141+
self.droptagCount += 1
130142
continue
131143
for attr in attrs:
132144
if attr[0] != 'id':
133145
continue
134-
if attr[1] == tagMeta.id:
146+
if attr[1] == tagMeta['id']:
135147
self.droptagCount += 1
136148
self.droptagType = tag
149+
print(f"DBUG:THP:Start:Tag found [{tag}:{attr[1]}]...")
137150

138151
def handle_endtag(self, tag: str):
139152
if tag in self.monitored:
140153
self.inside[tag] = False
141-
if tag == self.droptagType:
154+
if self.droptagType and (tag == self.droptagType):
142155
self.droptagCount -= 1
156+
if self.droptagCount == 0:
157+
self.droptagType = None
158+
print("DBUG:THP:End:Tag found...")
143159
if self.droptagCount < 0:
144160
self.droptagCount = 0
145161

@@ -186,9 +202,9 @@ def handle_urltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):
186202
# Extract Text
187203
tagDrops = ph.headers.get('urltext-tag-drops')
188204
if not tagDrops:
189-
tagDrops = {}
205+
tagDrops = []
190206
else:
191-
tagDrops = json.loads(tagDrops)
207+
tagDrops = cast(list[dict[str,Any]], json.loads(tagDrops))
192208
textHtml = TextHtmlParser(tagDrops)
193209
textHtml.feed(got.contentData)
194210
# Send back to client

tools/server/public_simplechat/readme.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -590,6 +590,13 @@ SimpleProxy updates
590590
* Helpers to fetch file from local file system or the web, transparently
591591
* Help check for needed modules before a particular service path is acknowledged as available
592592
through /aum service path
593+
* urltext and related - logic to drop contents of specified tag with a given id
594+
* allow its use for the web search tool flow
595+
* set up for the default duckduckgo search result urltext plain text cleanup, and found to be working.
596+
* this works properly only if the html being processed has proper opening and ending tags
597+
around the area of interest.
598+
* remember to specify non overlapping tag blocks, if more than one specified for dropping.
599+
* this path not tested, but should logically work
593600

594601
Settings/Config default changes
595602

tools/server/public_simplechat/toolweb.mjs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -259,7 +259,7 @@ function searchwebtext_run(chatid, toolcallid, toolname, obj) {
259259
searchUrl = searchUrl.replace("SEARCHWORDS", encodeURIComponent(obj.words));
260260
delete(obj.words)
261261
obj['url'] = searchUrl
262-
let headers = { 'urltext-tag-drops': get_gme().tools.searchDrops }
262+
let headers = { 'urltext-tag-drops': JSON.stringify(get_gme().tools.searchDrops) }
263263
return proxyserver_get_anyargs(chatid, toolcallid, toolname, obj, 'urltext', headers);
264264
}
265265
}

0 commit comments

Comments
 (0)