Skip to content

Commit e4d2ca0

Browse files
committed
SimpleChatTC:Pdf2Text: Make it work with a subset of pages
Initial attempt; the code flow still needs to be reviewed and tested.
1 parent 27979fe commit e4d2ca0

File tree

2 files changed

+28
-5
lines changed

2 files changed

+28
-5
lines changed

tools/server/public_simplechat/local.tools/simpleproxy.py

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -378,7 +378,7 @@ def handle_urltext(ph: ProxyHandler, pr: urllib.parse.ParseResult):
378378
ph.send_error(502, f"WARN:UrlTextFailed:{exc}")
379379

380380

381-
def process_pdf2text(url: str):
381+
def process_pdf2text(url: str, startPN: int, endPN: int):
382382
import pypdf
383383
import io
384384
urlParts = url.split('://',1)
@@ -388,7 +388,12 @@ def process_pdf2text(url: str):
388388
dPdf = fPdf.read()
389389
tPdf = ""
390390
oPdf = pypdf.PdfReader(io.BytesIO(dPdf))
391-
for (pn, pd) in enumerate(oPdf.pages):
391+
if (startPN < 0):
392+
startPN = 0
393+
if (endPN < 0) or (endPN >= len(oPdf.pages)):
394+
endPN = len(oPdf.pages)-1
395+
for i in range(startPN, endPN+1):
396+
pd = oPdf.pages[i]
392397
tPdf = tPdf + pd.extract_text()
393398
return { 'status': 200, 'msg': "Pdf2Text Response follows", 'data': tPdf }
394399

@@ -407,8 +412,18 @@ def handle_pdf2text(ph: ProxyHandler, pr: urllib.parse.ParseResult):
407412
if (not url) or (len(url) == 0):
408413
ph.send_error(400, f"WARN:HandlePdf2Text:MissingUrl!")
409414
return
415+
startP = queryParams['startPageNumber'][0]
416+
if startP:
417+
startP = int(startP)
418+
else:
419+
startP = -1
420+
endP = queryParams['endPageNumber'][0]
421+
if endP:
422+
endP = int(endP)
423+
else:
424+
endP = -1
410425
print(f"INFO:HandlePdf2Text:Processing:{url}...")
411-
gotP2T = process_pdf2text(url)
426+
gotP2T = process_pdf2text(url, startP, endP)
412427
if (gotP2T['status'] != 200):
413428
ph.send_error(gotP2T['status'], gotP2T['msg'] )
414429
return

tools/server/public_simplechat/toolweb.mjs

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -284,14 +284,22 @@ let pdf2text_meta = {
284284
"type": "function",
285285
"function": {
286286
"name": "pdf2text",
287-
"description": "Read pdf from requested local file path / web url through a proxy server and return its text content after converting pdf to text, in few seconds",
287+
"description": "Read pdf from requested local file path / web url through a proxy server and return its text content after converting pdf to text, in few seconds. One is allowed to get a part of the pdf by specifying the starting and ending page numbers",
288288
"parameters": {
289289
"type": "object",
290290
"properties": {
291291
"url":{
292292
"type":"string",
293293
"description":"local file path (file://) / web (http/https) based url of the pdf that will be got and inturn converted to text to an extent"
294-
}
294+
},
295+
"startPageNumber":{
296+
"type":"integer",
297+
"description":"Specify the starting page number within the pdf, this is optional. If not specified set to first page."
298+
},
299+
"endPageNumber":{
300+
"type":"integer",
301+
"description":"Specify the ending page number within the pdf, this is optional. If not specified set to the last page."
302+
},
295303
},
296304
"required": ["url"]
297305
}

0 commit comments

Comments
 (0)