44import os
55import tempfile
66import base64
7+ import zipfile
78
89API_BASE_URL = os .environ .get ("API_BASE_URL" , "http://localhost:5060" )
910
@@ -289,7 +290,7 @@ def convert_to_markdown(
289290 # Validate PDF file
290291 is_valid , validation_msg = validate_pdf_file (pdf_file )
291292 if not is_valid :
292- return validation_msg , None
293+ return validation_msg , "" , None
293294
294295 progress (0 , desc = "Preparing conversion..." )
295296
@@ -329,17 +330,74 @@ def convert_to_markdown(
329330 tmp_file .write (response .content )
330331 output_path = tmp_file .name
331332
333+ # Extract the base language markdown content from the ZIP
334+ markdown_content = ""
335+ try :
336+ with zipfile .ZipFile (output_path , "r" ) as zip_ref :
337+ # Look for the base markdown file (not translated versions)
338+ markdown_files = [
339+ f
340+ for f in zip_ref .namelist ()
341+ if f .endswith (".md" )
342+ and not any (
343+ lang .lower () in f .lower ()
344+ for lang in [
345+ "arabic" ,
346+ "chinese" ,
347+ "czech" ,
348+ "danish" ,
349+ "dutch" ,
350+ "english" ,
351+ "finnish" ,
352+ "french" ,
353+ "german" ,
354+ "greek" ,
355+ "hebrew" ,
356+ "hindi" ,
357+ "hungarian" ,
358+ "indonesian" ,
359+ "italian" ,
360+ "japanese" ,
361+ "korean" ,
362+ "norwegian" ,
363+ "polish" ,
364+ "portuguese" ,
365+ "romanian" ,
366+ "russian" ,
367+ "spanish" ,
368+ "swedish" ,
369+ "thai" ,
370+ "turkish" ,
371+ "ukrainian" ,
372+ "vietnamese" ,
373+ ]
374+ if target_languages_str and lang in target_languages_str
375+ )
376+ ]
377+
378+ if not markdown_files :
379+ # If no base file found, just get the first .md file
380+ markdown_files = [f for f in zip_ref .namelist () if f .endswith (".md" )]
381+
382+ if markdown_files :
383+ # Read the first markdown file
384+ with zip_ref .open (markdown_files [0 ]) as md_file :
385+ markdown_content = md_file .read ().decode ("utf-8" , errors = "ignore" )
386+ except Exception as e :
387+ print (f"Error extracting markdown content: { e } " )
388+ markdown_content = "Could not extract markdown content from ZIP file."
389+
332390 summary = "✅ Converted to Markdown successfully!\n "
333391 summary += "Download the ZIP file below (contains markdown, images, and segmentation data)"
334392 if target_languages_str and target_languages_str .strip ():
335393 summary += f"\n Includes translations to: { target_languages_str } "
336394
337395 progress (1.0 , desc = "Done!" )
338- return summary , output_path
396+ return summary , markdown_content , output_path
339397 except Exception as e :
340398 error_msg = f"❌ Error: { str (e )} "
341399 print (f"Markdown conversion error: { e } " )
342- return error_msg , None
400+ return error_msg , "" , None
343401
344402
345403def convert_to_html (
@@ -356,7 +414,7 @@ def convert_to_html(
356414 # Validate PDF file
357415 is_valid , validation_msg = validate_pdf_file (pdf_file )
358416 if not is_valid :
359- return validation_msg , None
417+ return validation_msg , "" , None
360418
361419 progress (0 , desc = "Preparing conversion..." )
362420
@@ -396,17 +454,74 @@ def convert_to_html(
396454 tmp_file .write (response .content )
397455 output_path = tmp_file .name
398456
457+ # Extract the base language HTML content from the ZIP
458+ html_content = ""
459+ try :
460+ with zipfile .ZipFile (output_path , "r" ) as zip_ref :
461+ # Look for the base HTML file (not translated versions)
462+ html_files = [
463+ f
464+ for f in zip_ref .namelist ()
465+ if f .endswith (".html" )
466+ and not any (
467+ lang .lower () in f .lower ()
468+ for lang in [
469+ "arabic" ,
470+ "chinese" ,
471+ "czech" ,
472+ "danish" ,
473+ "dutch" ,
474+ "english" ,
475+ "finnish" ,
476+ "french" ,
477+ "german" ,
478+ "greek" ,
479+ "hebrew" ,
480+ "hindi" ,
481+ "hungarian" ,
482+ "indonesian" ,
483+ "italian" ,
484+ "japanese" ,
485+ "korean" ,
486+ "norwegian" ,
487+ "polish" ,
488+ "portuguese" ,
489+ "romanian" ,
490+ "russian" ,
491+ "spanish" ,
492+ "swedish" ,
493+ "thai" ,
494+ "turkish" ,
495+ "ukrainian" ,
496+ "vietnamese" ,
497+ ]
498+ if target_languages_str and lang in target_languages_str
499+ )
500+ ]
501+
502+ if not html_files :
503+ # If no base file found, just get the first .html file
504+ html_files = [f for f in zip_ref .namelist () if f .endswith (".html" )]
505+
506+ if html_files :
507+ # Read the first HTML file
508+ with zip_ref .open (html_files [0 ]) as html_file :
509+ html_content = html_file .read ().decode ("utf-8" , errors = "ignore" )
510+ except Exception as e :
511+ print (f"Error extracting HTML content: { e } " )
512+ html_content = "Could not extract HTML content from ZIP file."
513+
399514 summary = "✅ Converted to HTML successfully!\n "
400515 summary += "Download the ZIP file below (contains HTML, images, and segmentation data)"
401516 if target_languages_str and target_languages_str .strip ():
402517 summary += f"\n Includes translations to: { target_languages_str } "
403518
404519 progress (1.0 , desc = "Done!" )
405- return summary , output_path
520+ return summary , html_content , output_path
406521 except Exception as e :
407522 error_msg = f"❌ Error: { str (e )} "
408523 print (f"HTML conversion error: { e } " )
409- return error_msg , None
524+ return error_msg , "" , None
410525
411526
412527# Create the Gradio interface with modern Gradio 5 styling
@@ -631,7 +746,7 @@ def convert_to_html(
631746 extract_toc_md = gr .Checkbox (label = "Add TOC" , value = False )
632747 dpi_md = gr .Slider (label = "DPI" , minimum = 72 , maximum = 300 , value = 120 , step = 1 )
633748 target_langs_md = gr .Dropdown (
634- label = "Target Languages " ,
749+ label = "Translate to: " ,
635750 choices = [
636751 "Arabic" ,
637752 "Chinese" ,
@@ -678,12 +793,15 @@ def convert_to_html(
678793
679794 with gr .Column (scale = 2 ):
680795 md_summary = gr .Textbox (label = "Summary" , lines = 2 )
796+ md_content = gr .Textbox (
797+ label = "Markdown Output (Base Language)" , lines = 20 , elem_classes = "output-text" , show_copy_button = True
798+ )
681799 md_output = gr .File (label = "Download ZIP (contains Markdown + images + segmentation)" )
682800
683801 md_btn .click (
684802 fn = convert_to_markdown ,
685803 inputs = [pdf_input_md , fast_mode_md , extract_toc_md , dpi_md , target_langs_md , translation_model_md ],
686- outputs = [md_summary , md_output ],
804+ outputs = [md_summary , md_content , md_output ],
687805 concurrency_limit = 2 ,
688806 show_progress = "full" ,
689807 )
@@ -702,7 +820,7 @@ def convert_to_html(
702820 extract_toc_html = gr .Checkbox (label = "Add TOC" , value = False )
703821 dpi_html = gr .Slider (label = "DPI" , minimum = 72 , maximum = 300 , value = 120 , step = 1 )
704822 target_langs_html = gr .Dropdown (
705- label = "Target Languages " ,
823+ label = "Translate to: " ,
706824 choices = [
707825 "Arabic" ,
708826 "Chinese" ,
@@ -749,6 +867,9 @@ def convert_to_html(
749867
750868 with gr .Column (scale = 2 ):
751869 html_summary = gr .Textbox (label = "Summary" , lines = 2 )
870+ html_content = gr .Textbox (
871+ label = "HTML Output (Base Language)" , lines = 20 , elem_classes = "output-text" , show_copy_button = True
872+ )
752873 html_output = gr .File (label = "Download ZIP (contains HTML + images + segmentation)" )
753874
754875 html_btn .click (
@@ -761,7 +882,7 @@ def convert_to_html(
761882 target_langs_html ,
762883 translation_model_html ,
763884 ],
764- outputs = [html_summary , html_output ],
885+ outputs = [html_summary , html_content , html_output ],
765886 concurrency_limit = 2 ,
766887 show_progress = "full" ,
767888 )
0 commit comments