Skip to content

Commit c38f476

Browse files
reformatted code
Signed-off-by: Peter Staar <[email protected]>
1 parent 8c5ac50 commit c38f476

File tree

2 files changed

+63
-23
lines changed

2 files changed

+63
-23
lines changed

docling_core/transforms/serializer/latex.py

Lines changed: 51 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -118,7 +118,12 @@ def serialize(
118118
is_inline_scope: bool = False,
119119
**kwargs: Any,
120120
) -> SerializationResult:
121-
params = LaTeXParams(**kwargs)
121+
"""Serialize a ``TextItem`` into LaTeX, handling lists, titles, headers, code and formulas.
122+
123+
Applies post-processing (escape, formatting, hyperlinks) when appropriate and
124+
returns a ``SerializationResult`` ready to be joined into the document.
125+
"""
126+
LaTeXParams(**kwargs)
122127
parts: list[SerializationResult] = []
123128

124129
# Inline group passthrough
@@ -225,6 +230,7 @@ def serialize(
225230
doc: DoclingDocument,
226231
**kwargs: Any,
227232
) -> SerializationResult:
233+
"""Serialize supported annotations of ``item`` as LaTeX comments."""
228234
params = LaTeXParams(**kwargs)
229235
res_parts: list[SerializationResult] = []
230236
if not params.include_annotations:
@@ -263,6 +269,7 @@ def serialize(
263269
doc: DoclingDocument,
264270
**kwargs: Any,
265271
) -> SerializationResult:
272+
"""Serialize a ``TableItem`` into a LaTeX ``tabular`` wrapped in ``table``."""
266273
params = LaTeXParams(**kwargs)
267274
res_parts: list[SerializationResult] = []
268275

@@ -282,7 +289,9 @@ def serialize(
282289
).text
283290
else:
284291
cell_text = (
285-
_escape_latex(cell.text) if params.escape_latex else cell.text
292+
_escape_latex(cell.text)
293+
if params.escape_latex
294+
else cell.text
286295
)
287296
body_row.append(cell_text.replace("\n", " "))
288297
body_rows.append(body_row)
@@ -293,8 +302,10 @@ def serialize(
293302
ncols = max(len(r) for r in body_rows)
294303
colspec = "|" + "|".join(["l"] * ncols) + "|"
295304
lines = [f"\\begin{{tabular}}{{{colspec}}}", "\\hline"]
296-
for row in body_rows:
297-
line = " & ".join(row) + r" \\ \hline"
305+
# Use a distinct variable name to avoid shadowing the earlier
306+
# 'row' (which iterates over TableCell lists) and confusing type inference
307+
for str_row in body_rows:
308+
line = " & ".join(str_row) + r" \\ \hline"
298309
lines.append(line)
299310
lines.append("\\end{tabular}")
300311
table_text = "\n".join(lines)
@@ -311,9 +322,14 @@ def serialize(
311322
if table_text:
312323
content.append(table_text)
313324
content.append("\\end{table}")
314-
res_parts.append(create_ser_result(text="\n".join(content), span_source=item))
325+
res_parts.append(
326+
create_ser_result(text="\n".join(content), span_source=item)
327+
)
315328

316-
return create_ser_result(text="\n\n".join([r.text for r in res_parts if r.text]), span_source=res_parts)
329+
return create_ser_result(
330+
text="\n\n".join([r.text for r in res_parts if r.text]),
331+
span_source=res_parts,
332+
)
317333

318334

319335
class LaTeXPictureSerializer(BasePictureSerializer):
@@ -328,6 +344,7 @@ def serialize(
328344
doc: DoclingDocument,
329345
**kwargs: Any,
330346
) -> SerializationResult:
347+
"""Serialize a ``PictureItem`` into a LaTeX ``figure`` with optional caption and notes."""
331348
params = LaTeXParams(**kwargs)
332349
res_parts: list[SerializationResult] = []
333350

@@ -358,7 +375,9 @@ def serialize(
358375
fig_lines.append(ann_res.text)
359376

360377
fig_lines.append("\\end{figure}")
361-
res_parts.append(create_ser_result(text="\n".join(fig_lines), span_source=item))
378+
res_parts.append(
379+
create_ser_result(text="\n".join(fig_lines), span_source=item)
380+
)
362381

363382
# Optional chart data as a simple table after the figure
364383
if params.enable_chart_tables:
@@ -372,9 +391,9 @@ def serialize(
372391
temp_table = temp_doc.add_table(
373392
data=tabular_chart_annotations[0].chart_data
374393
)
375-
latex_table_content = LaTeXDocSerializer(doc=temp_doc).serialize(
376-
item=temp_table
377-
).text
394+
latex_table_content = (
395+
LaTeXDocSerializer(doc=temp_doc).serialize(item=temp_table).text
396+
)
378397
if latex_table_content:
379398
res_parts.append(
380399
create_ser_result(
@@ -383,7 +402,10 @@ def serialize(
383402
)
384403
)
385404

386-
return create_ser_result(text="\n\n".join([r.text for r in res_parts if r.text]), span_source=res_parts)
405+
return create_ser_result(
406+
text="\n\n".join([r.text for r in res_parts if r.text]),
407+
span_source=res_parts,
408+
)
387409

388410
def _serialize_image_part(
389411
self,
@@ -424,6 +446,7 @@ def serialize(
424446
doc: DoclingDocument,
425447
**kwargs: Any,
426448
) -> SerializationResult:
449+
"""Serialize a ``KeyValueItem``; emits a placeholder when not excluded."""
427450
if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
428451
return create_ser_result(text="% missing-key-value-item", span_source=item)
429452
else:
@@ -442,6 +465,7 @@ def serialize(
442465
doc: DoclingDocument,
443466
**kwargs: Any,
444467
) -> SerializationResult:
468+
"""Serialize a ``FormItem``; emits a placeholder when not excluded."""
445469
if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
446470
return create_ser_result(text="% missing-form-item", span_source=item)
447471
else:
@@ -462,6 +486,7 @@ def serialize(
462486
is_inline_scope: bool = False,
463487
**kwargs: Any,
464488
) -> SerializationResult:
489+
"""Serialize a list group into a nested ``itemize``/``enumerate`` environment."""
465490
params = LaTeXParams(**kwargs)
466491
parts = doc_serializer.get_parts(
467492
item=item,
@@ -493,6 +518,7 @@ def serialize(
493518
list_level: int = 0,
494519
**kwargs: Any,
495520
) -> SerializationResult:
521+
"""Serialize inline children joining them with spaces for LaTeX output."""
496522
parts = doc_serializer.get_parts(
497523
item=item,
498524
list_level=list_level,
@@ -515,12 +541,16 @@ def serialize(
515541
doc: DoclingDocument,
516542
**kwargs: Any,
517543
) -> SerializationResult:
544+
"""Serialize generic nodes by concatenating serialized children or a placeholder."""
518545
if isinstance(item, GroupItem):
519546
parts = doc_serializer.get_parts(item=item, **kwargs)
520547
text_res = "\n\n".join([p.text for p in parts if p.text])
521548
return create_ser_result(text=text_res, span_source=parts)
522549
else:
523-
return create_ser_result(text="% missing-text", span_source=item if isinstance(item, DocItem) else [])
550+
return create_ser_result(
551+
text="% missing-text",
552+
span_source=item if isinstance(item, DocItem) else [],
553+
)
524554

525555

526556
class LaTeXDocSerializer(DocSerializer):
@@ -542,27 +572,32 @@ class LaTeXDocSerializer(DocSerializer):
542572

543573
@override
544574
def serialize_bold(self, text: str, **kwargs: Any) -> str:
575+
"""Return LaTeX for bold text."""
545576
return f"\\textbf{{{text}}}"
546577

547578
@override
548579
def serialize_italic(self, text: str, **kwargs: Any) -> str:
580+
"""Return LaTeX for italic text."""
549581
return f"\\textit{{{text}}}"
550582

551583
@override
552584
def serialize_underline(self, text: str, **kwargs: Any) -> str:
585+
"""Return LaTeX for underlined text."""
553586
return f"\\underline{{{text}}}"
554587

555588
@override
556589
def serialize_strikethrough(self, text: str, **kwargs: Any) -> str:
557-
# Requires \usepackage[normalem]{ulem}
590+
"""Return LaTeX for strikethrough text (requires ``ulem`` package)."""
558591
return f"\\sout{{{text}}}"
559592

560593
@override
561594
def serialize_subscript(self, text: str, **kwargs: Any) -> str:
595+
"""Return LaTeX for subscript text."""
562596
return f"$_{{{text}}}$"
563597

564598
@override
565599
def serialize_superscript(self, text: str, **kwargs: Any) -> str:
600+
"""Return LaTeX for superscript text."""
566601
return f"$^{{{text}}}$"
567602

568603
@override
@@ -572,7 +607,7 @@ def serialize_hyperlink(
572607
hyperlink: Union[AnyUrl, Path],
573608
**kwargs: Any,
574609
) -> str:
575-
# Requires \usepackage{hyperref}
610+
"""Return LaTeX hyperlink command (requires ``hyperref`` package)."""
576611
return f"\\href{{{str(hyperlink)}}}{{{text}}}"
577612

578613
@override
@@ -582,6 +617,7 @@ def serialize_doc(
582617
parts: list[SerializationResult],
583618
**kwargs: Any,
584619
) -> SerializationResult:
620+
"""Assemble serialized parts into the final LaTeX document text."""
585621
text_res = "\n\n".join([p.text for p in parts if p.text])
586622
if self.requires_page_break():
587623
page_cmd = self.params.page_break_command or ""
@@ -591,6 +627,7 @@ def serialize_doc(
591627

592628
@override
593629
def requires_page_break(self) -> bool:
630+
"""Return True if page break replacement is enabled."""
594631
return self.params.page_break_command is not None
595632

596633
def post_process(

test/test_latex_serialization.py

Lines changed: 12 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -4,19 +4,19 @@
44
generate or update the golden files. Additionally, if a golden file does not
55
exist, the test will create it to bootstrap the baseline.
66
"""
7-
import yaml
7+
88
from pathlib import Path
99

10-
from docling_core.transforms.serializer.latex import (
11-
LaTeXDocSerializer,
12-
LaTeXParams,
13-
)
10+
import yaml
11+
12+
from docling_core.transforms.serializer.latex import LaTeXDocSerializer, LaTeXParams
1413
from docling_core.types.doc.base import ImageRefMode
1514
from docling_core.types.doc.document import DoclingDocument
1615

1716
from .test_data_gen_flag import GEN_TEST_DATA
1817
from .test_docling_doc import _construct_doc, _construct_rich_table_doc
1918

19+
2020
def verify_or_update(exp_file: Path, actual: str):
2121
exp_file.parent.mkdir(parents=True, exist_ok=True)
2222
# If GEN_TEST_DATA is enabled or the expected file is missing, write/update it
@@ -59,9 +59,10 @@ def test_latex_inline_and_formatting():
5959
actual = ser.serialize().text
6060
verify_or_update(exp_file=src.with_suffix(".gt.tex"), actual=actual)
6161

62+
6263
def test_dummy_doc():
6364
src = Path("test/data/doc/dummy_doc.yaml")
64-
65+
6566
# Read YAML file of manual reference doc
6667
with open(src, "r", encoding="utf-8") as fp:
6768
dict_from_yaml = yaml.safe_load(fp)
@@ -78,6 +79,7 @@ def test_dummy_doc():
7879
actual = ser.serialize().text
7980
verify_or_update(exp_file=src.with_suffix(".gt.tex"), actual=actual)
8081

82+
8183
def test_constructed_doc():
8284
doc = _construct_doc()
8385

@@ -92,6 +94,7 @@ def test_constructed_doc():
9294
src = Path("test/data/doc/construct_doc.yaml")
9395
verify_or_update(exp_file=src.with_suffix(".gt.tex"), actual=actual)
9496

97+
9598
def test_constructed_rich_table_doc():
9699
doc = _construct_rich_table_doc()
97100

@@ -105,8 +108,8 @@ def test_constructed_rich_table_doc():
105108
actual = ser.serialize().text
106109
src = Path("test/data/doc/construct_rich_table_doc.yaml")
107110
verify_or_update(exp_file=src.with_suffix(".gt.tex"), actual=actual)
108-
109-
111+
112+
110113
def test_latex_paper():
111114
src = Path("./test/data/doc/2408.09869v3_enriched.json")
112115
doc = DoclingDocument.load_from_json(src)
@@ -121,6 +124,7 @@ def test_latex_paper():
121124
actual = ser.serialize().text
122125
verify_or_update(exp_file=src.with_suffix(".gt.tex"), actual=actual)
123126

127+
124128
def test_latex_nested_lists():
125129
src = Path("./test/data/doc/polymers.json")
126130
doc = DoclingDocument.load_from_json(src)
@@ -134,4 +138,3 @@ def test_latex_nested_lists():
134138
)
135139
actual = ser.serialize().text
136140
verify_or_update(exp_file=src.with_suffix(".gt.tex"), actual=actual)
137-

0 commit comments

Comments
 (0)