feat: Add an API request for basic text extraction, and a button in Lite to import a file into TextDB as a document (only basic text files for now)

prima · prima · commit d4bbe05c60d5 · 2025-05-03T16:29:03.000+01:00
diff --git a/klite.embd b/klite.embd
@@ -21391,6 +21391,12 @@ Current version indicated by LITEVER below.
 					<button title="Delete Embedding cache" class="btn btn-primary" onclick="embeddingDB.deleteAllDocuments()" style="margin:0px 0 0;">Delete Embedding cache</button>
 				</div>
 
+				<div class="settinglabel" style="padding: 4px;">
+					<div class="justifyleft settingsmall" title="Add document to TextDB">Add document to TextDB <span class="helpicon">?
+						<span class="helptext">Tries to extract the text from the document, adding (or overwriting) an existing document in the TextDB with the same name.</span></span></div>
+					<button title="Add document to TextDB" class="btn btn-primary" onclick="documentParser.addDocumentToTextDB()" style="margin:0px 0 0;">Add document to TextDB</button>
+				</div>
+
 				<div class="settinglabel" style="padding: 4px;">
 					<div class="justifyleft settingsmall" title="Search Includes Context History">Search Includes Context History <span class="helpicon">?
 						<span class="helptext">If enabled, the entire story/chat history is used as a searchable document.</span></span></div>
@@ -25190,6 +25196,7 @@ let checkFinalThoughtsPrompt = `Action: {"command":{"name":"thought","args":{"me
 		{
 			documentdb_data += `[DOCUMENT BREAK][${documentName}]${documentContent.trim()}[DOCUMENT BREAK]`
 		}
+		document.getElementById("documentdb_data").value = documentdb_data;
 	}
 
 	let calcImageSizing = (aspect) => {
@@ -26230,4 +26237,39 @@ let checkFinalThoughtsPrompt = `Action: {"command":{"name":"thought","args":{"me
 
 		window.embeddingSearcher = new EmbeddingSearcher()
 </script>
+<script type="module">
+	class DocumentParser
+	{
+		extractTextFromDocument(content)
+		{
+			let reqOpt = {
+				method: 'POST',
+				headers: get_kobold_header(),
+				body: JSON.stringify({
+					docData: content
+				}),
+			};
+			if (globalabortcontroller) {
+				reqOpt.signal = globalabortcontroller.signal;
+			}
+			let sub_endpt = apply_proxy_url(`${custom_kobold_endpoint}/api/extra/extractText`);
+
+			return fetch(sub_endpt, reqOpt)
+				.then((response) => response.json())
+		}
+
+		addDocumentToTextDB()
+		{
+			promptUserForLocalFile(async (fileDetails) => {
+				let {file, fileName, ext, content} = fileDetails
+				let extractedText = await this.extractTextFromDocument(content)
+				if (!!extractedText?.text)
+				{
+					replaceDocumentFromTextDB(fileName, extractedText?.text)
+				}
+			})
+		}
+	}
+	document.documentParser = new DocumentParser() // reachable as bare `documentParser` from inline on* handlers, whose scope chain includes `document`
+</script>
 </html>
diff --git a/koboldcpp.py b/koboldcpp.py
@@ -1619,6 +1619,29 @@ def whisper_load_model(model_filename):
     ret = handle.whisper_load_model(inputs)
     return ret
 
+def extract_text(genparams):
+    global args
+    docData = genparams.get("docData", "")
+    if docData.startswith("data:text"):
+        docData = docData.split(",", 1)[1]
+    else:
+        return ""
+
+    try:
+        # Add padding if necessary
+        padding = len(docData) % 4
+        if padding != 0:
+            docData += '=' * (4 - padding)
+
+        # Decode the Base64 string
+        decoded_bytes = base64.b64decode(docData)
+        # Convert the decoded bytes to a string
+        decoded_string = decoded_bytes.decode("UTF-8")
+        return decoded_string
+    except Exception as e:
+        print(f"Error decoding Base64: {str(e)}")
+        return ""
+
 def whisper_generate(genparams):
     global args
     prompt = genparams.get("prompt", "")
@@ -3807,6 +3830,7 @@ def do_POST(self):
             is_imggen = False
             is_comfyui_imggen = False
             is_transcribe = False
+            is_extract_text = False
             is_tts = False
             is_embeddings = False
 
@@ -3848,6 +3872,9 @@ def do_POST(self):
                 if self.path=="/prompt":
                     is_comfyui_imggen = True
 
+            if self.path.endswith('/api/extra/extractText'):
+                is_extract_text = True
+
             if self.path.endswith('/api/extra/transcribe') or self.path.endswith('/v1/audio/transcriptions'):
                 is_transcribe = True
 
@@ -3857,7 +3884,7 @@ def do_POST(self):
             if self.path.endswith('/api/extra/embeddings') or self.path.endswith('/v1/embeddings'):
                 is_embeddings = True
 
-            if is_imggen or is_transcribe or is_tts or is_embeddings or api_format > 0:
+            if is_imggen or is_transcribe or is_tts or is_embeddings or is_extract_text or api_format > 0:
                 global last_req_time
                 last_req_time = time.time()
 
@@ -3978,6 +4005,19 @@ def do_POST(self):
                         print("Transcribe: The response could not be sent, maybe connection was terminated?")
                         time.sleep(0.2) #short delay
                     return
+                elif is_extract_text:
+                    try:
+                        gen = extract_text(genparams)
+                        genresp = (json.dumps({"text":gen}).encode())
+                        self.send_response(200)
+                        self.send_header('content-length', str(len(genresp)))
+                        self.end_headers(content_type='application/json')
+                        self.wfile.write(genresp)
+                    except Exception as ex:
+                        utfprint(ex,1)
+                        print("Extract text: The response could not be sent, maybe connection was terminated?")
+                        time.sleep(0.2) #short delay
+                    return
                 elif is_tts:
                     try:
                         gen = tts_generate(genparams)