33import base64
44import copy
55import hashlib
6+ import html
67import json
78import mimetypes
89import os
@@ -1045,7 +1046,7 @@ def export_to_html(
10451046
10461047 text = ""
10471048 if doc is not None and add_caption and len (self .captions ):
1048- text = self .caption_text (doc )
1049+ text = html . escape ( self .caption_text (doc ) )
10491050
10501051 if len (self .data .table_cells ) == 0 :
10511052 return ""
@@ -1071,7 +1072,7 @@ def export_to_html(
10711072 if colstart != j :
10721073 continue
10731074
1074- content = cell .text .strip ()
1075+ content = html . escape ( cell .text .strip () )
10751076 celltag = "td"
10761077 if cell .column_header :
10771078 celltag = "th"
@@ -2082,6 +2083,46 @@ def export_to_markdown( # noqa: C901
20822083 previous_level = 0 # Track the previous item's level
20832084 in_list = False # Track if we're currently processing list items
20842085
2086+ # Our export markdown doesn't contain any emphasis styling:
2087+ # Bold, Italic, or Bold-Italic
2088+ # Hence, any underscore that we print into Markdown is coming from document text
2089+ # That means we need to escape it, to properly reflect content in the markdown
2090+ # However, we need to preserve underscores in image URLs
2091+ # to maintain their validity
2092+ # For example: ![image](path/to_image.png) should remain unchanged
2093+ def _escape_underscores (text ):
2094+ """Escape underscores but leave them intact in the URL.."""
2095+ # Firstly, identify all the URL patterns.
2096+ url_pattern = r"!\[.*?\]\((.*?)\)"
2097+ # Matches both inline ($...$) and block ($$...$$) LaTeX equations:
2098+ latex_pattern = r"\$\$?(?:\\.|[^$\\])*\$\$?"
2099+ combined_pattern = f"({ url_pattern } )|({ latex_pattern } )"
2100+
2101+ parts = []
2102+ last_end = 0
2103+
2104+ for match in re .finditer (combined_pattern , text ):
2105+ # Text to add before the URL (needs to be escaped)
2106+ before_url = text [last_end : match .start ()]
2107+ parts .append (re .sub (r"(?<!\\)_" , r"\_" , before_url ))
2108+
2109+ # Add the full URL part (do not escape)
2110+ parts .append (match .group (0 ))
2111+ last_end = match .end ()
2112+
2113+ # Add the final part of the text (which needs to be escaped)
2114+ if last_end < len (text ):
2115+ parts .append (re .sub (r"(?<!\\)_" , r"\_" , text [last_end :]))
2116+
2117+ return "" .join (parts )
2118+
def _append_text(
    text: str,
    do_escape_html: bool = True,
    do_escape_underscores: bool = True,
) -> None:
    """Sanitize one Markdown fragment and append it to ``mdtexts``.

    Args:
        text: Fragment to emit.
        do_escape_html: When true, ``&``, ``<`` and ``>`` are replaced by
            HTML entities (quotes are left untouched).
        do_escape_underscores: When true, and the enclosing
            ``escaping_underscores`` setting is also true, underscores are
            escaped via ``_escape_underscores``.
    """
    # Underscore escaping runs before HTML escaping.
    if do_escape_underscores and escaping_underscores:
        text = _escape_underscores(text)
    if do_escape_html:
        text = html.escape(text, quote=False)
    mdtexts.append(text)
2125+
20852126 for ix , (item , level ) in enumerate (
20862127 self .iterate_items (self .body , with_groups = True , page_no = page_no )
20872128 ):
@@ -2130,7 +2171,7 @@ def export_to_markdown( # noqa: C901
21302171 in_list = False
21312172 marker = "" if strict_text else "#"
21322173 text = f"{ marker } { item .text } "
2133- mdtexts . append (text .strip () + "\n " )
2174+ _append_text (text .strip () + "\n " )
21342175
21352176 elif (
21362177 isinstance (item , TextItem )
@@ -2143,12 +2184,12 @@ def export_to_markdown( # noqa: C901
21432184 if len (marker ) < 2 :
21442185 marker = "##"
21452186 text = f"{ marker } { item .text } \n "
2146- mdtexts . append (text .strip () + "\n " )
2187+ _append_text (text .strip () + "\n " )
21472188
21482189 elif isinstance (item , CodeItem ) and item .label in labels :
21492190 in_list = False
21502191 text = f"```\n { item .text } \n ```\n "
2151- mdtexts . append (text )
2192+ _append_text (text , do_escape_underscores = False , do_escape_html = False )
21522193
21532194 elif isinstance (item , ListItem ) and item .label in [DocItemLabel .LIST_ITEM ]:
21542195 in_list = True
@@ -2165,85 +2206,54 @@ def export_to_markdown( # noqa: C901
21652206 marker = "-" # Markdown needs only dash as item marker.
21662207
21672208 text = f"{ list_indent } { marker } { item .text } "
2168- mdtexts . append (text )
2209+ _append_text (text )
21692210
21702211 elif isinstance (item , TextItem ) and item .label in [DocItemLabel .FORMULA ]:
21712212 in_list = False
2172- mdtexts .append (f"$${ item .text } $$\n " )
2213+ _append_text (
2214+ f"$${ item .text } $$\n " ,
2215+ do_escape_underscores = False ,
2216+ do_escape_html = False ,
2217+ )
21732218
21742219 elif isinstance (item , TextItem ) and item .label in labels :
21752220 in_list = False
21762221 if len (item .text ) and text_width > 0 :
2222+ text = item .text
21772223 wrapped_text = textwrap .fill (text , width = text_width )
2178- mdtexts . append (wrapped_text + "\n " )
2224+ _append_text (wrapped_text + "\n " )
21792225 elif len (item .text ):
21802226 text = f"{ item .text } \n "
2181- mdtexts . append (text )
2227+ _append_text (text )
21822228
21832229 elif isinstance (item , TableItem ) and not strict_text :
21842230 in_list = False
2185- mdtexts . append (item .caption_text (self ))
2231+ _append_text (item .caption_text (self ))
21862232 md_table = item .export_to_markdown ()
2187- mdtexts . append ("\n " + md_table + "\n " )
2233+ _append_text ("\n " + md_table + "\n " )
21882234
21892235 elif isinstance (item , PictureItem ) and not strict_text :
21902236 in_list = False
2191- mdtexts . append (item .caption_text (self ))
2237+ _append_text (item .caption_text (self ))
21922238
21932239 line = item .export_to_markdown (
21942240 doc = self ,
21952241 image_placeholder = image_placeholder ,
21962242 image_mode = image_mode ,
21972243 )
21982244
2199- mdtexts . append (line )
2245+ _append_text (line , do_escape_html = False , do_escape_underscores = False )
22002246
22012247 elif isinstance (item , DocItem ) and item .label in labels :
22022248 in_list = False
2203- text = "<missing-text>"
2204- mdtexts . append (text )
2249+ text = "<!-- missing-text -- >"
2250+ _append_text (text , do_escape_html = False , do_escape_underscores = False )
22052251
22062252 mdtext = (delim .join (mdtexts )).strip ()
22072253 mdtext = re .sub (
22082254 r"\n\n\n+" , "\n \n " , mdtext
22092255 ) # remove cases of double or more empty lines.
22102256
2211- # Our export markdown doesn't contain any emphasis styling:
2212- # Bold, Italic, or Bold-Italic
2213- # Hence, any underscore that we print into Markdown is coming from document text
2214- # That means we need to escape it, to properly reflect content in the markdown
2215- # However, we need to preserve underscores in image URLs
2216- # to maintain their validity
2217- # For example: ![image](path/to_image.png) should remain unchanged
2218- def escape_underscores (text ):
2219- """Escape underscores but leave them intact in the URL.."""
2220- # Firstly, identify all the URL patterns.
2221- url_pattern = r"!\[.*?\]\((.*?)\)"
2222- # Matches both inline ($...$) and block ($$...$$) LaTeX equations:
2223- latex_pattern = r"\$\$?(?:\\.|[^$\\])*\$\$?"
2224- combined_pattern = f"({ url_pattern } )|({ latex_pattern } )"
2225-
2226- parts = []
2227- last_end = 0
2228-
2229- for match in re .finditer (combined_pattern , text ):
2230- # Text to add before the URL (needs to be escaped)
2231- before_url = text [last_end : match .start ()]
2232- parts .append (re .sub (r"(?<!\\)_" , r"\_" , before_url ))
2233-
2234- # Add the full URL part (do not escape)
2235- parts .append (match .group (0 ))
2236- last_end = match .end ()
2237-
2238- # Add the final part of the text (which needs to be escaped)
2239- if last_end < len (text ):
2240- parts .append (re .sub (r"(?<!\\)_" , r"\_" , text [last_end :]))
2241-
2242- return "" .join (parts )
2243-
2244- if escaping_underscores :
2245- mdtext = escape_underscores (mdtext )
2246-
22472257 return mdtext
22482258
22492259 def export_to_text ( # noqa: C901
@@ -2371,6 +2381,11 @@ def close_lists(
23712381
23722382 in_ordered_list : List [bool ] = [] # False
23732383
2384+ def _sanitize_text (text : str , do_escape_html = True ) -> str :
2385+ if do_escape_html :
2386+ text = html .escape (text , quote = False )
2387+ return text
2388+
23742389 for ix , (item , curr_level ) in enumerate (
23752390 self .iterate_items (self .body , with_groups = True , page_no = page_no )
23762391 ):
@@ -2421,14 +2436,17 @@ def close_lists(
24212436
24222437 elif isinstance (item , TextItem ) and item .label in [DocItemLabel .TITLE ]:
24232438
2424- text = f"<h1>{ item .text } </h1>"
2439+ text = f"<h1>{ _sanitize_text ( item .text ) } </h1>"
24252440 html_texts .append (text .strip ())
24262441
24272442 elif isinstance (item , SectionHeaderItem ):
24282443
24292444 section_level : int = item .level + 1
24302445
2431- text = f"<h{ (section_level )} >{ item .text } </h{ (section_level )} >"
2446+ text = (
2447+ f"<h{ (section_level )} >"
2448+ f"{ _sanitize_text (item .text )} </h{ (section_level )} >"
2449+ )
24322450 html_texts .append (text .strip ())
24332451
24342452 elif isinstance (item , TextItem ) and item .label in [
@@ -2443,31 +2461,37 @@ def close_lists(
24432461 if section_level >= 6 :
24442462 section_level = 6
24452463
2446- text = f"<h{ section_level } >{ item .text } </h{ section_level } >"
2464+ text = (
2465+ f"<h{ section_level } >{ _sanitize_text (item .text )} </h{ section_level } >"
2466+ )
24472467 html_texts .append (text .strip ())
24482468
24492469 elif isinstance (item , TextItem ) and item .label in [DocItemLabel .CODE ]:
24502470
2451- text = f"<pre>{ item .text } </pre>"
2471+ text = f"<pre>{ _sanitize_text ( item .text , do_escape_html = False ) } </pre>"
24522472 html_texts .append (text )
24532473
24542474 elif isinstance (item , ListItem ):
24552475
2456- text = f"<li>{ item .text } </li>"
2476+ text = f"<li>{ _sanitize_text ( item .text ) } </li>"
24572477 html_texts .append (text )
24582478
24592479 elif isinstance (item , TextItem ) and item .label in [DocItemLabel .LIST_ITEM ]:
24602480
2461- text = f"<li>{ item .text } </li>"
2481+ text = f"<li>{ _sanitize_text ( item .text ) } </li>"
24622482 html_texts .append (text )
24632483
24642484 elif isinstance (item , CodeItem ) and item .label in labels :
2465- text = f"<pre><code>{ item .text } </code></pre>"
2485+ text = (
2486+ "<pre><code>"
2487+ f"{ _sanitize_text (item .text , do_escape_html = False )} "
2488+ "</code></pre>"
2489+ )
24662490 html_texts .append (text .strip ())
24672491
24682492 elif isinstance (item , TextItem ) and item .label in labels :
24692493
2470- text = f"<p>{ item .text } </p>"
2494+ text = f"<p>{ _sanitize_text ( item .text ) } </p>"
24712495 html_texts .append (text .strip ())
24722496 elif isinstance (item , TableItem ):
24732497
0 commit comments