From 605b15498405f044172ac89e951bc1d81b70b043 Mon Sep 17 00:00:00 2001
From: Mohking1 <lokhandwalamohammed100@gmail.com>
Date: Fri, 1 Aug 2025 11:31:09 +0530
Subject: [PATCH] Fix Streamlit app failing to run the bad-OCR test on PDF files

---
 surya/scripts/streamlit_app.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/surya/scripts/streamlit_app.py b/surya/scripts/streamlit_app.py
index 87b69aab..877b70b0 100644
--- a/surya/scripts/streamlit_app.py
+++ b/surya/scripts/streamlit_app.py
@@ -28,16 +28,22 @@ def load_predictors_cached():
 def ocr_errors(pdf_file, page_count, sample_len=512, max_samples=10, max_pages=15):
     from pdftext.extraction import plain_text_output
 
-    with tempfile.NamedTemporaryFile(suffix=".pdf") as f:
+    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as f:
         f.write(pdf_file.getvalue())
-        f.seek(0)
+        f.flush()
+        temp_path = f.name
 
+    try:
         # Sample the text from the middle of the PDF
         page_middle = page_count // 2
         page_range = range(
             max(page_middle - max_pages, 0), min(page_middle + max_pages, page_count)
         )
-        text = plain_text_output(f.name, page_range=page_range)
+        text = plain_text_output(temp_path, page_range=page_range)
+    finally:
+        # Clean up the temporary file
+        import os
+        os.unlink(temp_path)
 
     sample_gap = len(text) // max_samples
     if len(text) == 0 or sample_gap == 0: