Skip to content

Commit 2cb0937

Browse files
committed
remove language detection
1 parent 32c6724 commit 2cb0937

File tree

2 files changed

+4
-55
lines changed

2 files changed

+4
-55
lines changed

fastchat/serve/gradio_block_arena_vision.py

Lines changed: 3 additions & 40 deletions
Original file line number | Diff line number | Diff line change
@@ -233,20 +233,6 @@ def wrap_pdfchat_query(query, document):
233233

234234

235235
LLAMA_PARSE_MAX_RETRY = 2
236-
TESSERACT_SUPPORTED_LANGS = "+".join(
237-
[
238-
"en",
239-
"chi_tra",
240-
"chi_sim",
241-
"rus",
242-
"spa",
243-
"jpn",
244-
"kor",
245-
"fra",
246-
"deu", # German
247-
"vie",
248-
]
249-
)
250236
LLAMAPARSE_SUPPORTED_LANGS = {
251237
"English": "en",
252238
"Chinese": "ch_sim",
@@ -260,44 +246,21 @@ def wrap_pdfchat_query(query, document):
260246
}
261247

262248

263-
def detect_language_from_doc(pdf_file_path):
264-
from pdf2image import convert_from_path
265-
from polyglot.detect import Detector
266-
267-
import pytesseract # Google's open-source OCR tool
268-
269-
assert os.environ[
270-
"TESSDATA_PREFIX"
271-
], "Make sure to specify location of train data for Tesseract."
272-
273-
# Convert pdf into image (first page only for efficiency)
274-
images = convert_from_path(pdf_file_path)
275-
276-
extracted_text = pytesseract.image_to_string(
277-
images[0], lang=TESSERACT_SUPPORTED_LANGS
278-
)
279-
280-
languages = Detector(extracted_text, quiet=True)
281-
# return languages
282-
return [lang.name for lang in languages.languages if lang.name != "un"]
283-
284-
285249
def parse_pdf(file_path):
286250
from llama_parse import LlamaParse
287251

288252
assert (
289253
"LLAMA_CLOUD_API_KEY" in os.environ
290254
), "Make sure to specify LlamaParse API key."
291255

292-
doc_lang = detect_language_from_doc(file_path)
293-
doc_lang = LLAMAPARSE_SUPPORTED_LANGS[doc_lang[0]]
294-
295256
for _ in range(LLAMA_PARSE_MAX_RETRY):
296257
try:
297258
documents = LlamaParse(
298259
result_type="markdown",
299260
verbose=True,
300-
language=doc_lang,
261+
languages=list(
262+
LLAMAPARSE_SUPPORTED_LANGS.values()
263+
),
301264
accurate_mode=True,
302265
).load_data(file_path)
303266
assert len(documents) > 0

fastchat/serve/setup_pdfchat.sh

Lines changed: 1 addition & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -2,18 +2,4 @@
22

33
# Install Python packages
44
pip install llama-index-core llama-parse llama-index-readers-file python-dotenv
5-
pip install polyglot
6-
pip install PyICU
7-
pip install pycld2
8-
pip install pytesseract
9-
10-
pip install pdf2image
11-
12-
# Clone the Tesseract tessdata repository
13-
git clone https://github.com/tesseract-ocr/tessdata
14-
15-
# cd into tessdata and set TESSDATA_PREFIX to the current directory
16-
cd tessdata
17-
export TESSDATA_PREFIX="$(pwd)"
18-
19-
echo "TESSDATA_PREFIX is set to: $TESSDATA_PREFIX"
5+
pip install pdf2image

0 commit comments

Comments
 (0)