Skip to content

Commit 802fc73

Browse files
committed
SimpleChatTC:XmlText: Cleanup initial go
At simpleproxy end * Add the tag names hierarchy before contents of a tag * Remember to convert the tagDrops to lowercase, as the HTMLParser base class seems to do that by default. At the client ui end * If undefined, remember to pass an empty list wrt tagDrops. * Cleanup the func description and also mention possible tagDrops for RSS feeds in the tool meta
1 parent f7897a4 commit 802fc73

File tree

3 files changed

+19
-8
lines changed

3 files changed

+19
-8
lines changed

tools/server/public_simplechat/local.tools/webmagic.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -232,15 +232,15 @@ class TextXMLParser(html.parser.HTMLParser):
232232

233233
def __init__(self, tagDrops: list[str]):
234234
super().__init__()
235-
self.tagDrops = tagDrops
235+
self.tagDrops = list(map(str.lower, tagDrops))
236236
print(f"DBUG:TextXMLParser:{self.tagDrops}")
237237
self.insideTagDrops = {
238238
}
239239
for tag in tagDrops:
240240
self.insideTagDrops[tag] = False
241241
self.bCapture = False
242242
self.text = ""
243-
self.prefix = ""
243+
self.prefix = []
244244

245245
def do_capture(self):
246246
"""
@@ -252,18 +252,18 @@ def do_capture(self):
252252
return True
253253

254254
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]):
255-
self.prefix += " "
255+
self.prefix.append(tag)
256256
if tag in self.tagDrops:
257257
self.insideTagDrops[tag] = True
258258

259259
def handle_endtag(self, tag: str):
260-
self.prefix = self.prefix[:-1]
260+
self.prefix.pop()
261261
if tag in self.tagDrops:
262262
self.insideTagDrops[tag] = False
263263

264264
def handle_data(self, data: str):
265265
if self.do_capture():
266-
self.text += f"{self.prefix}{data}\n"
266+
self.text += f"{':'.join(self.prefix)}:{data}\n"
267267

268268

269269
def handle_xmltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):

tools/server/public_simplechat/readme.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -463,6 +463,10 @@ plain textual content from the search result page.
463463
* fetch_pdf_as_text - fetch/read specified pdf file and extract its textual content
464464
* this depends on the pypdf python based open source library
465465

466+
* fetch_xml_as_text - fetch/read specified xml file and extract its textual content
467+
* prefixes each leaf content with its tag hierarchy
468+
* allows one to specify a list of tags that are to be dropped fully.
469+
466470
the above set of web related tool calls work by handshaking with a bundled simple local web proxy
467471
(/caching in future) server logic, this helps bypass the CORS restrictions applied if trying to
468472
directly fetch from the browser js runtime environment.
@@ -650,6 +654,7 @@ sliding window based drop off or even before they kick in, this can help in many
650654
or if there is no response within the configured timeout period.
651655
NOTE: Currently the logic supports only 1 pending tool call per chat session.
652656

657+
* add support for fetch_xml_as_text tool call, fix importmaps in index.html
653658

654659
#### ToDo
655660

tools/server/public_simplechat/toolweb.mjs

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -334,11 +334,13 @@ async function fetchpdftext_setup(tcs) {
334334
//
335335

336336

337+
let gRSSTagDropsDefault = [ "guid", "link", "description", "image", "enclosure" ]
338+
337339
let fetchxmltext_meta = {
338340
"type": "function",
339341
"function": {
340342
"name": "fetch_xml_as_text",
341-
"description": "Fetch the requested xml url through a proxy server and return its text content after stripping away the xml tags, in few seconds",
343+
"description": "Fetch requested xml url through a proxy server and return its cleaned up text contents. Each content is prefixed with the xml tag heirarchy that it belongs to. Will take few seconds",
342344
"parameters": {
343345
"type": "object",
344346
"properties": {
@@ -348,7 +350,7 @@ let fetchxmltext_meta = {
348350
},
349351
"tagDrops":{
350352
"type":"string",
351-
"description":"specify a json stringified form of list of xml tags to drop"
353+
"description":`Optionally specify a json stringified list of xml tags to drop. For example for rss feeds one could use ${JSON.stringify(gRSSTagDropsDefault)} and so...`
352354
}
353355
},
354356
"required": ["url"]
@@ -367,7 +369,11 @@ let fetchxmltext_meta = {
367369
* @param {any} obj
368370
*/
369371
function fetchxmltext_run(chatid, toolcallid, toolname, obj) {
370-
let headers = { 'xmltext-tag-drops': obj.tagDrops }
372+
let tagDrops = obj.tagDrops
373+
if (tagDrops == undefined) {
374+
tagDrops = JSON.stringify([]) // JSON.stringify(gRSSTagDropsDefault)
375+
}
376+
let headers = { 'xmltext-tag-drops': tagDrops }
371377
return proxyserver_get_anyargs(chatid, toolcallid, toolname, obj, 'xmltext', headers);
372378
}
373379

0 commit comments

Comments
 (0)