Skip to content

Commit 463c8ec

Browse files
committed
Display markdown and HTML outputs
1 parent 8d63f25 commit 463c8ec

File tree

1 file changed

+131
-10
lines changed

1 file changed

+131
-10
lines changed

src/gradio_app.py

Lines changed: 131 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44
import os
55
import tempfile
66
import base64
7+
import zipfile
78

89
API_BASE_URL = os.environ.get("API_BASE_URL", "http://localhost:5060")
910

@@ -289,7 +290,7 @@ def convert_to_markdown(
289290
# Validate PDF file
290291
is_valid, validation_msg = validate_pdf_file(pdf_file)
291292
if not is_valid:
292-
return validation_msg, None
293+
return validation_msg, "", None
293294

294295
progress(0, desc="Preparing conversion...")
295296

@@ -329,17 +330,74 @@ def convert_to_markdown(
329330
tmp_file.write(response.content)
330331
output_path = tmp_file.name
331332

333+
# Extract the base language markdown content from the ZIP
334+
markdown_content = ""
335+
try:
336+
with zipfile.ZipFile(output_path, "r") as zip_ref:
337+
# Look for the base markdown file (not translated versions)
338+
markdown_files = [
339+
f
340+
for f in zip_ref.namelist()
341+
if f.endswith(".md")
342+
and not any(
343+
lang.lower() in f.lower()
344+
for lang in [
345+
"arabic",
346+
"chinese",
347+
"czech",
348+
"danish",
349+
"dutch",
350+
"english",
351+
"finnish",
352+
"french",
353+
"german",
354+
"greek",
355+
"hebrew",
356+
"hindi",
357+
"hungarian",
358+
"indonesian",
359+
"italian",
360+
"japanese",
361+
"korean",
362+
"norwegian",
363+
"polish",
364+
"portuguese",
365+
"romanian",
366+
"russian",
367+
"spanish",
368+
"swedish",
369+
"thai",
370+
"turkish",
371+
"ukrainian",
372+
"vietnamese",
373+
]
374+
if target_languages_str and lang in target_languages_str
375+
)
376+
]
377+
378+
if not markdown_files:
379+
# If no base file found, just get the first .md file
380+
markdown_files = [f for f in zip_ref.namelist() if f.endswith(".md")]
381+
382+
if markdown_files:
383+
# Read the first markdown file
384+
with zip_ref.open(markdown_files[0]) as md_file:
385+
markdown_content = md_file.read().decode("utf-8", errors="ignore")
386+
except Exception as e:
387+
print(f"Error extracting markdown content: {e}")
388+
markdown_content = "Could not extract markdown content from ZIP file."
389+
332390
summary = "✅ Converted to Markdown successfully!\n"
333391
summary += "Download the ZIP file below (contains markdown, images, and segmentation data)"
334392
if target_languages_str and target_languages_str.strip():
335393
summary += f"\nIncludes translations to: {target_languages_str}"
336394

337395
progress(1.0, desc="Done!")
338-
return summary, output_path
396+
return summary, markdown_content, output_path
339397
except Exception as e:
340398
error_msg = f"❌ Error: {str(e)}"
341399
print(f"Markdown conversion error: {e}")
342-
return error_msg, None
400+
return error_msg, "", None
343401

344402

345403
def convert_to_html(
@@ -356,7 +414,7 @@ def convert_to_html(
356414
# Validate PDF file
357415
is_valid, validation_msg = validate_pdf_file(pdf_file)
358416
if not is_valid:
359-
return validation_msg, None
417+
return validation_msg, "", None
360418

361419
progress(0, desc="Preparing conversion...")
362420

@@ -396,17 +454,74 @@ def convert_to_html(
396454
tmp_file.write(response.content)
397455
output_path = tmp_file.name
398456

457+
# Extract the base language HTML content from the ZIP
458+
html_content = ""
459+
try:
460+
with zipfile.ZipFile(output_path, "r") as zip_ref:
461+
# Look for the base HTML file (not translated versions)
462+
html_files = [
463+
f
464+
for f in zip_ref.namelist()
465+
if f.endswith(".html")
466+
and not any(
467+
lang.lower() in f.lower()
468+
for lang in [
469+
"arabic",
470+
"chinese",
471+
"czech",
472+
"danish",
473+
"dutch",
474+
"english",
475+
"finnish",
476+
"french",
477+
"german",
478+
"greek",
479+
"hebrew",
480+
"hindi",
481+
"hungarian",
482+
"indonesian",
483+
"italian",
484+
"japanese",
485+
"korean",
486+
"norwegian",
487+
"polish",
488+
"portuguese",
489+
"romanian",
490+
"russian",
491+
"spanish",
492+
"swedish",
493+
"thai",
494+
"turkish",
495+
"ukrainian",
496+
"vietnamese",
497+
]
498+
if target_languages_str and lang in target_languages_str
499+
)
500+
]
501+
502+
if not html_files:
503+
# If no base file found, just get the first .html file
504+
html_files = [f for f in zip_ref.namelist() if f.endswith(".html")]
505+
506+
if html_files:
507+
# Read the first HTML file
508+
with zip_ref.open(html_files[0]) as html_file:
509+
html_content = html_file.read().decode("utf-8", errors="ignore")
510+
except Exception as e:
511+
print(f"Error extracting HTML content: {e}")
512+
html_content = "Could not extract HTML content from ZIP file."
513+
399514
summary = "✅ Converted to HTML successfully!\n"
400515
summary += "Download the ZIP file below (contains HTML, images, and segmentation data)"
401516
if target_languages_str and target_languages_str.strip():
402517
summary += f"\nIncludes translations to: {target_languages_str}"
403518

404519
progress(1.0, desc="Done!")
405-
return summary, output_path
520+
return summary, html_content, output_path
406521
except Exception as e:
407522
error_msg = f"❌ Error: {str(e)}"
408523
print(f"HTML conversion error: {e}")
409-
return error_msg, None
524+
return error_msg, "", None
410525

411526

412527
# Create the Gradio interface with modern Gradio 5 styling
@@ -631,7 +746,7 @@ def convert_to_html(
631746
extract_toc_md = gr.Checkbox(label="Add TOC", value=False)
632747
dpi_md = gr.Slider(label="DPI", minimum=72, maximum=300, value=120, step=1)
633748
target_langs_md = gr.Dropdown(
634-
label="Target Languages",
749+
label="Translate to:",
635750
choices=[
636751
"Arabic",
637752
"Chinese",
@@ -678,12 +793,15 @@ def convert_to_html(
678793

679794
with gr.Column(scale=2):
680795
md_summary = gr.Textbox(label="Summary", lines=2)
796+
md_content = gr.Textbox(
797+
label="Markdown Output (Base Language)", lines=20, elem_classes="output-text", show_copy_button=True
798+
)
681799
md_output = gr.File(label="Download ZIP (contains Markdown + images + segmentation)")
682800

683801
md_btn.click(
684802
fn=convert_to_markdown,
685803
inputs=[pdf_input_md, fast_mode_md, extract_toc_md, dpi_md, target_langs_md, translation_model_md],
686-
outputs=[md_summary, md_output],
804+
outputs=[md_summary, md_content, md_output],
687805
concurrency_limit=2,
688806
show_progress="full",
689807
)
@@ -702,7 +820,7 @@ def convert_to_html(
702820
extract_toc_html = gr.Checkbox(label="Add TOC", value=False)
703821
dpi_html = gr.Slider(label="DPI", minimum=72, maximum=300, value=120, step=1)
704822
target_langs_html = gr.Dropdown(
705-
label="Target Languages",
823+
label="Translate to:",
706824
choices=[
707825
"Arabic",
708826
"Chinese",
@@ -749,6 +867,9 @@ def convert_to_html(
749867

750868
with gr.Column(scale=2):
751869
html_summary = gr.Textbox(label="Summary", lines=2)
870+
html_content = gr.Textbox(
871+
label="HTML Output (Base Language)", lines=20, elem_classes="output-text", show_copy_button=True
872+
)
752873
html_output = gr.File(label="Download ZIP (contains HTML + images + segmentation)")
753874

754875
html_btn.click(
@@ -761,7 +882,7 @@ def convert_to_html(
761882
target_langs_html,
762883
translation_model_html,
763884
],
764-
outputs=[html_summary, html_output],
885+
outputs=[html_summary, html_content, html_output],
765886
concurrency_limit=2,
766887
show_progress="full",
767888
)

0 commit comments

Comments
 (0)