|
6 | 6 | """Define classes for LaTeX serialization.""" |
7 | 7 |
|
8 | 8 | from pathlib import Path |
| 9 | +import re |
9 | 10 | from typing import Any, Optional, Union |
10 | 11 |
|
11 | | -from pydantic import AnyUrl, BaseModel |
| 12 | +from pydantic import AnyUrl, BaseModel, Field |
12 | 13 | from typing_extensions import override |
13 | 14 |
|
14 | 15 | from docling_core.transforms.serializer.base import ( |
@@ -79,6 +80,25 @@ class LaTeXParams(CommonParams): |
79 | 80 | # Escape LaTeX special characters in text |
80 | 81 | escape_latex: bool = True |
81 | 82 |
|
| 83 | + # Optional LaTeX preamble configuration |
| 84 | + # When provided, emitted before the document environment |
| 85 | + # Example: "\\documentclass[11pt,a4paper]{article}" |
| 86 | + document_class: str = r"\documentclass[11pt,a4paper]{article}" |
| 87 | + # List of packages to include. Accepts either full lines |
| 88 | + # like "\\usepackage{graphicx}" or bare package names like "graphicx". |
| 89 | + packages: list[str] = [ |
| 90 | + r"\usepackage[utf8]{inputenc} % allow utf-8 input", |
| 91 | + r"\usepackage[T1]{fontenc} % use 8-bit T1 fonts", |
| 92 | + r"\usepackage{hyperref} % hyperlinks", |
| 93 | + r"\usepackage{url} % simple URL typesetting", |
| 94 | + r"\usepackage{booktabs} % professional-quality tables", |
| 95 | + r"\usepackage{amsfonts} % blackboard math symbols", |
| 96 | + r"\usepackage{nicefrac} % compact symbols for 1/2, etc.", |
| 97 | + r"\usepackage{microtype} % microtypography", |
| 98 | + r"\usepackage{xcolor} % colors", |
| 99 | + r"\usepackage{graphicx} % graphics", |
| 100 | + ] |
| 101 | + |
82 | 102 |
|
83 | 103 | def _escape_latex(text: str) -> str: |
84 | 104 | """Escape LaTeX special characters in text. |
@@ -151,32 +171,30 @@ def serialize( |
151 | 171 | text_part = f"\\item {text}" |
152 | 172 | post_process = False |
153 | 173 | elif isinstance(item, TitleItem): |
154 | | - # Treat document title as an unnumbered section |
| 174 | + # Emit document title using \title{...} |
155 | 175 | if post_process: |
156 | 176 | text = doc_serializer.post_process( |
157 | 177 | text=text, |
158 | 178 | formatting=item.formatting, |
159 | 179 | hyperlink=item.hyperlink, |
160 | 180 | ) |
161 | | - text_part = f"\\section*{{{text}}}" |
| 181 | + text_part = f"\\title{{{text}}}" |
162 | 182 | post_process = False |
163 | 183 | else: |
164 | | - # Section headers: level 0->section, 1->subsection, ... up to subparagraph |
| 184 | + # Section headers: level 1->section, 2->subsection, 3->subsubsection |
| 185 | + # Raise error for unsupported levels |
165 | 186 | if post_process: |
166 | 187 | text = doc_serializer.post_process( |
167 | 188 | text=text, |
168 | 189 | formatting=item.formatting, |
169 | 190 | hyperlink=item.hyperlink, |
170 | 191 | ) |
171 | | - level_map = [ |
172 | | - "section", |
173 | | - "subsection", |
174 | | - "subsubsection", |
175 | | - "paragraph", |
176 | | - "subparagraph", |
177 | | - ] |
178 | | - idx = max(0, min(item.level, len(level_map) - 1)) |
179 | | - cmd = level_map[idx] |
| 192 | + lvl = item.level |
| 193 | + if lvl <= 0 or lvl >= 4: |
| 194 | + raise ValueError( |
| 195 | + "LaTeX serializer: SectionHeaderItem.level must be in [1, 3]" |
| 196 | + ) |
| 197 | + cmd = {1: "section", 2: "subsection", 3: "subsubsection"}[lvl] |
180 | 198 | text_part = f"\\{cmd}{{{text}}}" |
181 | 199 | post_process = False |
182 | 200 |
|
@@ -617,19 +635,92 @@ def serialize_doc( |
617 | 635 | parts: list[SerializationResult], |
618 | 636 | **kwargs: Any, |
619 | 637 | ) -> SerializationResult: |
620 | | - """Assemble serialized parts into the final LaTeX document text.""" |
621 | | - text_res = "\n\n".join([p.text for p in parts if p.text]) |
622 | | - if self.requires_page_break(): |
623 | | - page_cmd = self.params.page_break_command or "" |
624 | | - for full_match, _, _ in self._get_page_breaks(text=text_res): |
625 | | - text_res = text_res.replace(full_match, page_cmd) |
626 | | - return create_ser_result(text=text_res, span_source=parts) |
| 638 | + r"""Assemble serialized parts into a LaTeX document with environment wrapper. |
| 639 | +
|
| 640 | + Adds optional preamble lines (document class and packages), ensures the |
| 641 | + output starts with "\\begin{document}" and ends with "\\end{document}". |
| 642 | + """ |
| 643 | + # Merge any runtime overrides into params |
| 644 | + params = self.params.merge_with_patch(patch=kwargs) |
| 645 | + |
| 646 | + # Join body content and handle page break replacement within the body |
| 647 | + body_text = "\n\n".join([p.text for p in parts if p.text]) |
| 648 | + if params.page_break_command is not None: |
| 649 | + for full_match, _, _ in self._get_page_breaks(text=body_text): |
| 650 | + body_text = body_text.replace(full_match, params.page_break_command) |
| 651 | + |
| 652 | + # Post-process title: move any \title{...} into the preamble |
| 653 | + # and add \maketitle after \begin{document} |
| 654 | + title_cmd, body_text, needs_maketitle = self._post_process_title(body_text) |
| 655 | + |
| 656 | + # Build optional preamble |
| 657 | + preamble_lines: list[str] = [] |
| 658 | + if params.document_class: |
| 659 | + preamble_lines.append(params.document_class+"\n") |
| 660 | + for pkg in params.packages: |
| 661 | + line = pkg.strip() |
| 662 | + if not line: |
| 663 | + continue |
| 664 | + if line.startswith("\\"): |
| 665 | + preamble_lines.append(line) |
| 666 | + else: |
| 667 | + preamble_lines.append(f"\\usepackage{{{line}}}") |
| 668 | + |
| 669 | + # Ensure title (if any) is before \begin{document} |
| 670 | + if title_cmd: |
| 671 | + preamble_lines.append(title_cmd) |
| 672 | + |
| 673 | + header = ( |
| 674 | + "\n".join(preamble_lines + ["\n\\begin{document}"]) |
| 675 | + if preamble_lines |
| 676 | + else "\\begin{document}" |
| 677 | + ) |
| 678 | + footer = "\\end{document}" |
| 679 | + |
| 680 | + # Compose final document with optional \maketitle after begin{document} |
| 681 | + body_parts: list[str] = [] |
| 682 | + if needs_maketitle: |
| 683 | + body_parts.append("\\maketitle") |
| 684 | + if body_text: |
| 685 | + body_parts.append(body_text) |
| 686 | + body_block = "\n\n".join(body_parts) |
| 687 | + |
| 688 | + if body_block: |
| 689 | + full_text = f"{header}\n\n{body_block}\n\n{footer}" |
| 690 | + else: |
| 691 | + full_text = f"{header}\n\n{footer}" |
| 692 | + |
| 693 | + return create_ser_result(text=full_text, span_source=parts) |
627 | 694 |
|
628 | 695 | @override |
def requires_page_break(self) -> bool:
    """Tell whether a page-break replacement command is configured."""
    configured_command = self.params.page_break_command
    # Only ``None`` disables replacement; an empty string still counts as set.
    return configured_command is not None
632 | 699 |
|
def _post_process_title(self, body_text: str) -> tuple[Optional[str], str, bool]:
    r"""Detect and relocate LaTeX \title{...} commands.

    The body is scanned for ``\title{...}`` commands using brace matching, so
    titles that contain nested groups (e.g. ``\textbf{...}``) or escaped
    braces (``\{``, ``\}``) are captured as a whole.

    - Extracts the first ``\title{...}`` command found in the body.
    - Removes all ``\title{...}`` occurrences from the body.
    - Collapses runs of three or more newlines left behind and strips the
      body.

    Args:
        body_text: Serialized document body to scan.

    Returns:
        ``(title_cmd, new_body_text, needs_maketitle)``. When no complete
        ``\title{...}`` is found, the result is ``(None, body_text, False)``
        with the body returned unchanged.
    """
    # Match only the opening of the command; the closing brace is located by
    # counting brace depth, which a regular expression cannot do.
    opener = re.compile(r"\\title\s*\{")
    total = len(body_text)

    title_content: Optional[str] = None
    kept_chunks: list[str] = []
    cursor = 0

    while True:
        found = opener.search(body_text, cursor)
        if found is None:
            break

        # Walk forward from just after the opening brace until it is closed.
        depth = 1
        pos = found.end()
        while pos < total and depth:
            char = body_text[pos]
            if char == "\\":
                # A backslash escapes the next character (\{, \}, \\, ...),
                # so that character must not affect the brace depth.
                pos += 2
                continue
            if char == "{":
                depth += 1
            elif char == "}":
                depth -= 1
            pos += 1

        if depth:
            # Unbalanced braces: leave the remaining text untouched rather
            # than guessing where the title ends.
            break

        closing = pos - 1
        if title_content is None:
            title_content = body_text[found.end():closing]
        kept_chunks.append(body_text[cursor:found.start()])
        cursor = closing + 1

    if title_content is None:
        # Nothing to do
        return None, body_text, False

    kept_chunks.append(body_text[cursor:])
    # Trim excess empty lines that might remain after removing the commands
    new_body = re.sub(r"\n{3,}", "\n\n", "".join(kept_chunks)).strip()
    title_cmd = f"\\title{{{title_content}}}"
    return title_cmd, new_body, True
| 723 | + |
633 | 724 | def post_process( |
634 | 725 | self, |
635 | 726 | text: str, |
|
0 commit comments