Skip to content

Commit d4bbe05

Browse files
author
prima
committed
feat: Addition of API request for basic text extraction, and button in lite to import into TextDB as a document (only basic text files for now)
1 parent 420b95a commit d4bbe05

File tree

2 files changed

+83
-1
lines changed

2 files changed

+83
-1
lines changed

klite.embd

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21391,6 +21391,12 @@ Current version indicated by LITEVER below.
2139121391
<button title="Delete Embedding cache" class="btn btn-primary" onclick="embeddingDB.deleteAllDocuments()" style="margin:0px 0 0;">Delete Embedding cache</button>
2139221392
</div>
2139321393

21394+
<div class="settinglabel" style="padding: 4px;">
21395+
<div class="justifyleft settingsmall" title="Add document to TextDB">Add document to TextDB <span class="helpicon">?
21396+
<span class="helptext">Tries to extract the text from the document, adding (or overwriting) an existing document in the TextDB with the same name.</span></span></div>
21397+
<button title="Add document to TextDB" class="btn btn-primary" onclick="documentParser.addDocumentToTextDB()" style="margin:0px 0 0;">Add document to TextDB</button>
21398+
</div>
21399+
2139421400
<div class="settinglabel" style="padding: 4px;">
2139521401
<div class="justifyleft settingsmall" title="Search Includes Context History">Search Includes Context History <span class="helpicon">?
2139621402
<span class="helptext">If enabled, the entire story/chat history is used as a searchable document.</span></span></div>
@@ -25190,6 +25196,7 @@ let checkFinalThoughtsPrompt = `Action: {"command":{"name":"thought","args":{"me
2519025196
{
2519125197
documentdb_data += `[DOCUMENT BREAK][${documentName}]${documentContent.trim()}[DOCUMENT BREAK]`
2519225198
}
25199+
document.getElementById("documentdb_data").value = documentdb_data;
2519325200
}
2519425201

2519525202
let calcImageSizing = (aspect) => {
@@ -26230,4 +26237,39 @@ let checkFinalThoughtsPrompt = `Action: {"command":{"name":"thought","args":{"me
2623026237

2623126238
window.embeddingSearcher = new EmbeddingSearcher()
2623226239
</script>
26240+
<script type="module">
26241+
/**
 * Client-side helper that sends a locally selected file to the backend's
 * text-extraction endpoint and stores the returned text in the TextDB.
 */
class DocumentParser
{
	/**
	 * POST the file content to `/api/extra/extractText`.
	 *
	 * @param {string} content - File content as handed over by
	 *   `promptUserForLocalFile`; it is forwarded unchanged as `docData`.
	 * @returns {Promise<object>} Resolves with the parsed JSON response body.
	 *   Rejects if the request fails, is aborted, or the body is not JSON.
	 */
	extractTextFromDocument(content)
	{
		const requestOptions = {
			method: 'POST',
			headers: get_kobold_header(),
			body: JSON.stringify({ docData: content }),
		};
		// Let the page-wide abort controller cancel this request too.
		if (globalabortcontroller) {
			requestOptions.signal = globalabortcontroller.signal;
		}
		const endpoint = apply_proxy_url(`${custom_kobold_endpoint}/api/extra/extractText`);

		return fetch(endpoint, requestOptions)
			.then((response) => response.json());
	}

	/**
	 * Ask the user for a local file, extract its text through the backend and
	 * add (or overwrite) the TextDB document named after the file.
	 *
	 * Failures are logged to the console instead of surfacing as an unhandled
	 * promise rejection inside the file-picker callback.
	 */
	addDocumentToTextDB()
	{
		promptUserForLocalFile(async (fileDetails) => {
			const { fileName, content } = fileDetails;
			try
			{
				const extracted = await this.extractTextFromDocument(content);
				// Only store a document when the backend returned non-empty text.
				if (extracted?.text)
				{
					replaceDocumentFromTextDB(fileName, extracted.text);
				}
			}
			catch (err)
			{
				console.error("Add document to TextDB failed:", err);
			}
		});
	}
}
// This is a module script, so the class is not a global. The settings button
// calls the bare name `documentParser` from an inline onclick handler; inline
// handlers include `document` in their scope chain, so attaching the instance
// here makes it reachable from that handler.
document.documentParser = new DocumentParser()
26274+
</script>
2623326275
</html>

koboldcpp.py

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1619,6 +1619,29 @@ def whisper_load_model(model_filename):
16191619
ret = handle.whisper_load_model(inputs)
16201620
return ret
16211621

1622+
def extract_text(genparams):
    """Return the UTF-8 text carried by a Base64 ``data:text...`` URI.

    Args:
        genparams: Parsed request body; only its ``docData`` entry is read.

    Returns:
        The decoded text. An empty string is returned when ``docData`` is
        missing, is not a string, does not start with ``data:text``, has no
        ``,`` separating the header from the payload, or cannot be decoded
        as Base64 followed by UTF-8.
    """
    docData = genparams.get("docData", "")
    # A JSON null or other non-string value must not crash the handler.
    if not isinstance(docData, str) or not docData.startswith("data:text"):
        return ""

    # Everything after the first comma is the payload; a URI without a comma
    # has no payload to decode.
    _header, separator, payload = docData.partition(",")
    if not separator:
        return ""

    try:
        # Add padding if necessary so the length is a multiple of four.
        payload += "=" * (-len(payload) % 4)
        # binascii.Error (bad Base64) and UnicodeDecodeError (bad UTF-8) are
        # both ValueError subclasses.
        return base64.b64decode(payload).decode("UTF-8")
    except ValueError as e:
        print(f"Error decoding Base64: {str(e)}")
        return ""
1644+
16221645
def whisper_generate(genparams):
16231646
global args
16241647
prompt = genparams.get("prompt", "")
@@ -3807,6 +3830,7 @@ def do_POST(self):
38073830
is_imggen = False
38083831
is_comfyui_imggen = False
38093832
is_transcribe = False
3833+
is_extract_text = False
38103834
is_tts = False
38113835
is_embeddings = False
38123836

@@ -3848,6 +3872,9 @@ def do_POST(self):
38483872
if self.path=="/prompt":
38493873
is_comfyui_imggen = True
38503874

3875+
if self.path.endswith('/api/extra/extractText'):
3876+
is_extract_text = True
3877+
38513878
if self.path.endswith('/api/extra/transcribe') or self.path.endswith('/v1/audio/transcriptions'):
38523879
is_transcribe = True
38533880

@@ -3857,7 +3884,7 @@ def do_POST(self):
38573884
if self.path.endswith('/api/extra/embeddings') or self.path.endswith('/v1/embeddings'):
38583885
is_embeddings = True
38593886

3860-
if is_imggen or is_transcribe or is_tts or is_embeddings or api_format > 0:
3887+
if is_imggen or is_transcribe or is_tts or is_embeddings or is_extract_text or api_format > 0:
38613888
global last_req_time
38623889
last_req_time = time.time()
38633890

@@ -3978,6 +4005,19 @@ def do_POST(self):
39784005
print("Transcribe: The response could not be sent, maybe connection was terminated?")
39794006
time.sleep(0.2) #short delay
39804007
return
4008+
elif is_extract_text:
4009+
try:
4010+
gen = extract_text(genparams)
4011+
genresp = (json.dumps({"text":gen}).encode())
4012+
self.send_response(200)
4013+
self.send_header('content-length', str(len(genresp)))
4014+
self.end_headers(content_type='application/json')
4015+
self.wfile.write(genresp)
4016+
except Exception as ex:
4017+
utfprint(ex,1)
4018+
print("Extract text: The response could not be sent, maybe connection was terminated?")
4019+
time.sleep(0.2) #short delay
4020+
return
39814021
elif is_tts:
39824022
try:
39834023
gen = tts_generate(genparams)

0 commit comments

Comments
 (0)