Skip to content

Commit db84b7d

Browse files
author
prima
committed
feat: Added extract text support for images via OCR (based on vision LLMs) and for audio (based on Whisper)
1 parent d4bbe05 commit db84b7d

File tree

2 files changed

+25
-10
lines changed

2 files changed

+25
-10
lines changed

klite.embd

Lines changed: 14 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -26262,10 +26262,21 @@ let checkFinalThoughtsPrompt = `Action: {"command":{"name":"thought","args":{"me
2626226262
{
2626326263
promptUserForLocalFile(async (fileDetails) => {
2626426264
let {file, fileName, ext, content} = fileDetails
26265-
let extractedText = await this.extractTextFromDocument(content)
26266-
if (!!extractedText?.text)
26265+
let extractedText = undefined
26266+
26267+
if (content.startsWith("data:image"))
26268+
{
26269+
let analysisPrompt = "Perform OCR on the provided image."
26270+
extractedText = await generateAndGetTextFromPrompt(`${createInstructPrompt(analysisPrompt)}${instructendplaceholder}${!!localsettings?.inject_jailbreak_instruct ? localsettings.custom_jailbreak_text : ""}`, undefined, [content.split(",")[1]])
26271+
}
26272+
else
26273+
{
26274+
extractedText = (await this.extractTextFromDocument(content))?.text
26275+
}
26276+
26277+
if (!!extractedText)
2626726278
{
26268-
replaceDocumentFromTextDB(fileName, extractedText?.text)
26279+
replaceDocumentFromTextDB(fileName, extractedText)
2626926280
}
2627026281
})
2627126282
}

koboldcpp.py

Lines changed: 11 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1621,13 +1621,17 @@ def whisper_load_model(model_filename):
16211621

16221622
def extract_text(genparams):
16231623
global args
1624-
docData = genparams.get("docData", "")
1625-
if docData.startswith("data:text"):
1626-
docData = docData.split(",", 1)[1]
1627-
else:
1628-
return ""
1629-
1624+
16301625
try:
1626+
docData = genparams.get("docData", "")
1627+
if docData.startswith("data:text"):
1628+
docData = docData.split(",", 1)[1]
1629+
elif docData.startswith("data:audio"):
1630+
genparams["audio_data"] = docData
1631+
return whisper_generate(genparams)
1632+
else:
1633+
return ""
1634+
16311635
# Add padding if necessary
16321636
padding = len(docData) % 4
16331637
if padding != 0:
@@ -1639,7 +1643,7 @@ def extract_text(genparams):
16391643
decoded_string = decoded_bytes.decode("UTF-8")
16401644
return decoded_string
16411645
except Exception as e:
1642-
print(f"Error decoding Base64: {str(e)}")
1646+
print(f"Error extracting text: {str(e)}")
16431647
return ""
16441648

16451649
def whisper_generate(genparams):

0 commit comments

Comments
 (0)