@@ -233,20 +233,6 @@ def wrap_pdfchat_query(query, document):
 
 
 LLAMA_PARSE_MAX_RETRY = 2
-TESSERACT_SUPPORTED_LANGS = "+".join(
-    [
-        "en",
-        "chi_tra",
-        "chi_sim",
-        "rus",
-        "spa",
-        "jpn",
-        "kor",
-        "fra",
-        "deu",  # German
-        "vie",
-    ]
-)
 LLAMAPARSE_SUPPORTED_LANGS = {
     "English": "en",
     "Chinese": "ch_sim",
@@ -260,44 +246,21 @@ def wrap_pdfchat_query(query, document):
 }
 
 
-def detect_language_from_doc(pdf_file_path):
-    from pdf2image import convert_from_path
-    from polyglot.detect import Detector
-
-    import pytesseract  # Google's open-source OCR tool
-
-    assert os.environ[
-        "TESSDATA_PREFIX"
-    ], "Make sure to specify location of train data for Tesseract."
-
-    # Convert pdf into image (first page only for efficiency)
-    images = convert_from_path(pdf_file_path)
-
-    extracted_text = pytesseract.image_to_string(
-        images[0], lang=TESSERACT_SUPPORTED_LANGS
-    )
-
-    languages = Detector(extracted_text, quiet=True)
-    # return languages
-    return [lang.name for lang in languages.languages if lang.name != "un"]
-
-
 def parse_pdf(file_path):
     from llama_parse import LlamaParse
 
     assert (
         "LLAMA_CLOUD_API_KEY" in os.environ
     ), "Make sure to specify LlamaParse API key."
 
-    doc_lang = detect_language_from_doc(file_path)
-    doc_lang = LLAMAPARSE_SUPPORTED_LANGS[doc_lang[0]]
-
     for _ in range(LLAMA_PARSE_MAX_RETRY):
         try:
             documents = LlamaParse(
                 result_type="markdown",
                 verbose=True,
-                language=doc_lang,
+                languages=list(
+                    LLAMAPARSE_SUPPORTED_LANGS.values()
+                ),
                 accurate_mode=True,
             ).load_data(file_path)
             assert len(documents) > 0
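For reference, a minimal sketch (not part of the commit) of what the new call site builds, assuming the full `LLAMAPARSE_SUPPORTED_LANGS` mapping, of which only the first two entries are visible in this hunk:

```python
# Hypothetical, trimmed-down copy of the mapping; the real dict continues
# past the two entries shown in the hunk above.
LLAMAPARSE_SUPPORTED_LANGS = {
    "English": "en",
    "Chinese": "ch_sim",
}

# The updated parse_pdf no longer OCR-detects one language per document;
# it passes every supported language code to LlamaParse as a single list.
languages = list(LLAMAPARSE_SUPPORTED_LANGS.values())
print(languages)  # ['en', 'ch_sim'] here; the full mapping yields the whole list
```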