From 605b15498405f044172ac89e951bc1d81b70b043 Mon Sep 17 00:00:00 2001 From: Mohking1 Date: Fri, 1 Aug 2025 11:31:09 +0530 Subject: [PATCH] Fixed streamlit app not running bad ocr test in pdf --- surya/scripts/streamlit_app.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/surya/scripts/streamlit_app.py b/surya/scripts/streamlit_app.py index 87b69aab..877b70b0 100644 --- a/surya/scripts/streamlit_app.py +++ b/surya/scripts/streamlit_app.py @@ -28,16 +28,22 @@ def load_predictors_cached(): def ocr_errors(pdf_file, page_count, sample_len=512, max_samples=10, max_pages=15): from pdftext.extraction import plain_text_output - with tempfile.NamedTemporaryFile(suffix=".pdf") as f: + with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as f: f.write(pdf_file.getvalue()) - f.seek(0) + f.flush() + temp_path = f.name + try: # Sample the text from the middle of the PDF page_middle = page_count // 2 page_range = range( max(page_middle - max_pages, 0), min(page_middle + max_pages, page_count) ) - text = plain_text_output(f.name, page_range=page_range) + text = plain_text_output(temp_path, page_range=page_range) + finally: + # Clean up the temporary file + import os + os.unlink(temp_path) sample_gap = len(text) // max_samples if len(text) == 0 or sample_gap == 0: