Skip to content

Commit 430d1b8

Browse files
initial latex serializer
Signed-off-by: Peter Staar <[email protected]>
1 parent c38f476 commit 430d1b8

File tree

8 files changed

+295
-79
lines changed

8 files changed

+295
-79
lines changed

docling_core/transforms/serializer/latex.py

Lines changed: 111 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,10 @@
66
"""Define classes for LaTeX serialization."""
77

88
from pathlib import Path
9+
import re
910
from typing import Any, Optional, Union
1011

11-
from pydantic import AnyUrl, BaseModel
12+
from pydantic import AnyUrl, BaseModel, Field
1213
from typing_extensions import override
1314

1415
from docling_core.transforms.serializer.base import (
@@ -79,6 +80,25 @@ class LaTeXParams(CommonParams):
7980
# Escape LaTeX special characters in text
8081
escape_latex: bool = True
8182

83+
# Optional LaTeX preamble configuration
84+
# When provided, emitted before the document environment
85+
# Example: "\\documentclass[11pt,a4paper]{article}"
86+
document_class: str = r"\documentclass[11pt,a4paper]{article}"
87+
# List of packages to include. Accepts either full lines
88+
# like "\\usepackage{graphicx}" or bare package names like "graphicx".
89+
packages: list[str] = [
90+
r"\usepackage[utf8]{inputenc} % allow utf-8 input",
91+
r"\usepackage[T1]{fontenc} % use 8-bit T1 fonts",
92+
r"\usepackage{hyperref} % hyperlinks",
93+
r"\usepackage{url} % simple URL typesetting",
94+
r"\usepackage{booktabs} % professional-quality tables",
95+
r"\usepackage{amsfonts} % blackboard math symbols",
96+
r"\usepackage{nicefrac} % compact symbols for 1/2, etc.",
97+
r"\usepackage{microtype} % microtypography",
98+
r"\usepackage{xcolor} % colors",
99+
r"\usepackage{graphicx} % graphics",
100+
]
101+
82102

83103
def _escape_latex(text: str) -> str:
84104
"""Escape LaTeX special characters in text.
@@ -151,32 +171,30 @@ def serialize(
151171
text_part = f"\\item {text}"
152172
post_process = False
153173
elif isinstance(item, TitleItem):
154-
# Treat document title as an unnumbered section
174+
# Emit document title using \title{...}
155175
if post_process:
156176
text = doc_serializer.post_process(
157177
text=text,
158178
formatting=item.formatting,
159179
hyperlink=item.hyperlink,
160180
)
161-
text_part = f"\\section*{{{text}}}"
181+
text_part = f"\\title{{{text}}}"
162182
post_process = False
163183
else:
164-
# Section headers: level 0->section, 1->subsection, ... up to subparagraph
184+
# Section headers: level 1->section, 2->subsection, 3->subsubsection
185+
# Raise error for unsupported levels
165186
if post_process:
166187
text = doc_serializer.post_process(
167188
text=text,
168189
formatting=item.formatting,
169190
hyperlink=item.hyperlink,
170191
)
171-
level_map = [
172-
"section",
173-
"subsection",
174-
"subsubsection",
175-
"paragraph",
176-
"subparagraph",
177-
]
178-
idx = max(0, min(item.level, len(level_map) - 1))
179-
cmd = level_map[idx]
192+
lvl = item.level
193+
if lvl <= 0 or lvl >= 4:
194+
raise ValueError(
195+
"LaTeX serializer: SectionHeaderItem.level must be in [1, 3]"
196+
)
197+
cmd = {1: "section", 2: "subsection", 3: "subsubsection"}[lvl]
180198
text_part = f"\\{cmd}{{{text}}}"
181199
post_process = False
182200

@@ -617,19 +635,92 @@ def serialize_doc(
617635
parts: list[SerializationResult],
618636
**kwargs: Any,
619637
) -> SerializationResult:
620-
"""Assemble serialized parts into the final LaTeX document text."""
621-
text_res = "\n\n".join([p.text for p in parts if p.text])
622-
if self.requires_page_break():
623-
page_cmd = self.params.page_break_command or ""
624-
for full_match, _, _ in self._get_page_breaks(text=text_res):
625-
text_res = text_res.replace(full_match, page_cmd)
626-
return create_ser_result(text=text_res, span_source=parts)
638+
r"""Assemble serialized parts into a LaTeX document with environment wrapper.
639+
640+
Adds optional preamble lines (document class and packages), ensures the
641+
body is wrapped between "\begin{document}" and "\end{document}".
642+
"""
643+
# Merge any runtime overrides into params
644+
params = self.params.merge_with_patch(patch=kwargs)
645+
646+
# Join body content and handle page break replacement within the body
647+
body_text = "\n\n".join([p.text for p in parts if p.text])
648+
if params.page_break_command is not None:
649+
for full_match, _, _ in self._get_page_breaks(text=body_text):
650+
body_text = body_text.replace(full_match, params.page_break_command)
651+
652+
# Post-process title: move any \title{...} into the preamble
653+
# and add \maketitle after \begin{document}
654+
title_cmd, body_text, needs_maketitle = self._post_process_title(body_text)
655+
656+
# Build optional preamble
657+
preamble_lines: list[str] = []
658+
if params.document_class:
659+
preamble_lines.append(params.document_class+"\n")
660+
for pkg in params.packages:
661+
line = pkg.strip()
662+
if not line:
663+
continue
664+
if line.startswith("\\"):
665+
preamble_lines.append(line)
666+
else:
667+
preamble_lines.append(f"\\usepackage{{{line}}}")
668+
669+
# Ensure title (if any) is before \begin{document}
670+
if title_cmd:
671+
preamble_lines.append(title_cmd)
672+
673+
header = (
674+
"\n".join(preamble_lines + ["\n\\begin{document}"])
675+
if preamble_lines
676+
else "\\begin{document}"
677+
)
678+
footer = "\\end{document}"
679+
680+
# Compose final document with optional \maketitle after begin{document}
681+
body_parts: list[str] = []
682+
if needs_maketitle:
683+
body_parts.append("\\maketitle")
684+
if body_text:
685+
body_parts.append(body_text)
686+
body_block = "\n\n".join(body_parts)
687+
688+
if body_block:
689+
full_text = f"{header}\n\n{body_block}\n\n{footer}"
690+
else:
691+
full_text = f"{header}\n\n{footer}"
692+
693+
return create_ser_result(text=full_text, span_source=parts)
627694

628695
@override
629696
def requires_page_break(self) -> bool:
630697
"""Return True if page break replacement is enabled."""
631698
return self.params.page_break_command is not None
632699

700+
def _post_process_title(self, body_text: str) -> tuple[Optional[str], str, bool]:
701+
"""Detect and relocate LaTeX \title{...} commands.
702+
703+
- Extracts the first \title{...} command found in the body.
704+
- Removes all \title{...} occurrences from the body.
705+
- Returns (title_cmd, new_body_text, needs_maketitle).
706+
707+
Note: Regex assumes no nested braces inside \title{...}.
708+
"""
709+
# Match \title{...} allowing whitespace, but not nested braces
710+
pattern = re.compile(r"\\title\s*\{([^{}]*)\}", re.DOTALL)
711+
first = pattern.search(body_text)
712+
if not first:
713+
# Nothing to do
714+
return None, body_text, False
715+
716+
title_content = first.group(1)
717+
title_cmd = f"\\title{{{title_content}}}"
718+
# Remove all \title occurrences from the body
719+
new_body = pattern.sub("", body_text)
720+
# Trim excess empty lines that might remain
721+
new_body = re.sub(r"\n{3,}", "\n\n", new_body).strip()
722+
return title_cmd, new_body, True
723+
633724
def post_process(
634725
self,
635726
text: str,

0 commit comments

Comments
 (0)