@@ -139,41 +139,61 @@ def serialize(
139139 res_parts : list [SerializationResult ] = []
140140 post_processed = False
141141
142- # Prepare the HTML based on item type
143- if isinstance (item , TitleItem ):
144- text_inner = self ._prepare_content (item .text )
145- text = get_html_tag_with_text_direction (html_tag = "h1" , text = text_inner )
142+ has_inline_repr = (
143+ item .text == ""
144+ and len (item .children ) == 1
145+ and isinstance ((child_group := item .children [0 ].resolve (doc )), InlineGroup )
146+ )
147+ if has_inline_repr :
148+ text = doc_serializer .serialize (item = child_group , visited = my_visited ).text
149+ post_processed = True
150+ else :
151+ text = item .text
152+ if not isinstance (item , (CodeItem , FormulaItem )):
153+ text = html .escape (text , quote = False )
154+ text = text .replace ("\n " , "<br>" )
146155
147- elif isinstance (item , SectionHeaderItem ):
148- section_level = min (item .level + 1 , 6 )
149- text_inner = self ._prepare_content (item .text )
156+ # Prepare the HTML based on item type
157+ if isinstance (item , (TitleItem , SectionHeaderItem )):
158+ section_level = (
159+ min (item .level + 1 , 6 ) if isinstance (item , SectionHeaderItem ) else 1
160+ )
150161 text = get_html_tag_with_text_direction (
151- html_tag = f"h{ section_level } " , text = text_inner
162+ html_tag = f"h{ section_level } " , text = text
152163 )
153164
154165 elif isinstance (item , FormulaItem ):
155166 text = self ._process_formula (
156167 item = item ,
168+ text = text ,
169+ orig = item .orig ,
157170 doc = doc ,
158171 image_mode = params .image_mode ,
159172 formula_to_mathml = params .formula_to_mathml ,
160173 is_inline_scope = is_inline_scope ,
161174 )
162175
163176 elif isinstance (item , CodeItem ):
164- text = self ._process_code (item = item , is_inline_scope = is_inline_scope )
177+ text = (
178+ f"<code>{ text } </code>"
179+ if is_inline_scope
180+ else f"<pre><code>{ text } </code></pre>"
181+ )
165182
166183 elif isinstance (item , ListItem ):
167184 # List items are handled by list serializer
168185 text_parts : list [str ] = []
169- if item_text := self ._prepare_content (item .text ):
170- item_text = doc_serializer .post_process (
171- text = item_text ,
172- formatting = item .formatting ,
173- hyperlink = item .hyperlink ,
174- )
175- post_processed = True
176- text_parts .append (item_text )
186+ if text :
187+ if has_inline_repr :
188+ text = f"\n { text } \n "
189+ else :
190+ text = doc_serializer .post_process (
191+ text = text ,
192+ formatting = item .formatting ,
193+ hyperlink = item .hyperlink ,
194+ )
195+ post_processed = True
196+ text_parts .append (text )
177197 nested_parts = [
178198 r .text
179199 for r in doc_serializer .get_parts (
@@ -184,29 +204,26 @@ def serialize(
184204 )
185205 ]
186206 text_parts .extend (nested_parts )
187- text_inner = "\n " .join (text_parts )
207+ text = "\n " .join (text_parts )
188208 if nested_parts :
189- text_inner = f"\n { text_inner } \n "
209+ text = f"\n { text } \n "
190210 text = (
191211 get_html_tag_with_text_direction (
192212 html_tag = "li" ,
193- text = text_inner ,
213+ text = text ,
194214 attrs = (
195215 {"style" : f"list-style-type: '{ item .marker } ';" }
196216 if params .show_original_list_item_marker and item .marker
197217 else {}
198218 ),
199219 )
200- if text_inner
220+ if text
201221 else ""
202222 )
203223
204- elif is_inline_scope :
205- text = self ._prepare_content (item .text )
206- else :
224+ elif not is_inline_scope :
207225 # Regular text item
208- text_inner = self ._prepare_content (item .text )
209- text = get_html_tag_with_text_direction (html_tag = "p" , text = text_inner )
226+ text = get_html_tag_with_text_direction (html_tag = "p" , text = text )
210227
211228 # Apply formatting and hyperlinks
212229 if not post_processed :
@@ -227,66 +244,44 @@ def serialize(
227244
228245 return create_ser_result (text = text , span_source = res_parts )
229246
230- def _prepare_content (
231- self , text : str , do_escape_html = True , do_replace_newline = True
232- ) -> str :
233- """Prepare text content for HTML inclusion."""
234- if do_escape_html :
235- text = html .escape (text , quote = False )
236- if do_replace_newline :
237- text = text .replace ("\n " , "<br>" )
238- return text
239-
240- def _process_code (
241- self ,
242- item : CodeItem ,
243- is_inline_scope : bool ,
244- ) -> str :
245- code_text = self ._prepare_content (
246- item .text , do_escape_html = False , do_replace_newline = False
247- )
248- if is_inline_scope :
249- text = f"<code>{ code_text } </code>"
250- else :
251- text = f"<pre><code>{ code_text } </code></pre>"
252-
253- return text
254-
def _process_formula(
    self,
    *,
    item: DocItem,
    text: str,
    orig: str,
    doc: DoclingDocument,
    image_mode: ImageRefMode,
    formula_to_mathml: bool,
    is_inline_scope: bool,
) -> str:
    """Render a formula as an HTML fragment, degrading step by step.

    Order of attempts:

    1. If ``text`` is empty but ``orig`` is not, the item has provenance and
       ``image_mode`` is ``ImageRefMode.EMBEDDED``, return the image
       fallback when one can be produced.
    2. If ``formula_to_mathml`` is set and ``text`` is non-empty, convert
       ``text`` to MathML (with a TeX ``annotation`` carrying the source).
       If the conversion raises, return the image fallback under the same
       conditions as above, else ``<pre>`` with the text.
    3. Otherwise return the raw text in ``<code>`` (inline scope) or
       ``<pre>`` (block scope), or a "Formula not decoded" placeholder when
       there is no text at all.

    Args:
        item: The item being rendered; ``item.prov`` is inspected and the
            item is handed to the image fallback helper.
        text: The formula text to render (may be empty).
        orig: The original text, used as ``alt`` text of the image fallback.
        doc: The document passed through to the image fallback helper.
        image_mode: Image fallback is only used for ``ImageRefMode.EMBEDDED``.
        formula_to_mathml: Whether to attempt MathML conversion.
        is_inline_scope: Selects inline vs. block markup.

    Returns:
        The HTML fragment for the formula.
    """
    # If formula is empty, try to use an image fallback
    if (
        text == ""
        and orig != ""
        and len(item.prov) > 0
        and image_mode == ImageRefMode.EMBEDDED
        and (
            img_fallback := self._get_formula_image_fallback(
                item=item, orig=orig, doc=doc
            )
        )
    ):
        return img_fallback

    # Try to generate MathML
    if formula_to_mathml and text:
        try:
            # Set display mode based on context
            display_mode = "inline" if is_inline_scope else "block"
            mathml_element = latex2mathml.converter.convert_to_element(
                text, display=display_mode
            )
            annotation = SubElement(
                mathml_element, "annotation", dict(encoding="TeX")
            )
            annotation.text = text
            mathml = unescape(tostring(mathml_element, encoding="unicode"))

            # Don't wrap in div for inline formulas
            # NOTE(review): the inline branch below sits in a part of the
            # diff that was elided; it is reconstructed from the comment
            # above and the block-scope return -- confirm against the file.
            if is_inline_scope:
                return mathml
            return f"<div>{mathml}</div>"

        except Exception:
            # Best-effort: any conversion failure degrades to a fallback
            # representation instead of aborting serialization.
            img_fallback = self._get_formula_image_fallback(
                item=item, orig=orig, doc=doc
            )
            if (
                image_mode == ImageRefMode.EMBEDDED
                and len(item.prov) > 0
                and img_fallback
            ):
                return img_fallback
            elif text:
                return f"<pre>{text}</pre>"
            else:
                return "<pre>Formula not decoded</pre>"

    _logger.warning("Could not parse formula with MathML")

    # Fallback options if we got here
    if text:
        # The block-scope case previously evaluated the f-string without
        # returning it, so the formula text was silently replaced by the
        # "not decoded" placeholder below.
        return f"<code>{text}</code>" if is_inline_scope else f"<pre>{text}</pre>"
    if is_inline_scope:
        return '<span class="formula-not-decoded">Formula not decoded</span>'

    return '<div class="formula-not-decoded">Formula not decoded</div>'
322319
def _get_formula_image_fallback(
    self, *, item: DocItem, orig: str, doc: DoclingDocument
) -> Optional[str]:
    """Try to get an image fallback for a formula.

    Args:
        item: The item whose image is requested via ``item.get_image``.
        orig: The original formula text, emitted as the image ``alt`` text.
        doc: The document passed to ``item.get_image``.

    Returns:
        A ``<figure><img …/></figure>`` fragment referencing the item's
        image, or ``None`` when the item has no image.
    """
    item_image = item.get_image(doc=doc)
    if item_image is None:
        return None

    img_ref = ImageRef.from_pil(item_image, dpi=72)
    # `orig` is document content going into a double-quoted attribute:
    # escape quotes and markup characters so it cannot terminate the
    # attribute or inject tags.
    alt_text = html.escape(orig, quote=True)
    return "<figure>" f'<img src="{img_ref.uri}" alt="{alt_text}" />' "</figure>"
334329
335330
0 commit comments