Skip to content

Commit 8c5ac50

Browse files
Added the latex serializer
Signed-off-by: Peter Staar <[email protected]>
1 parent e234480 commit 8c5ac50

File tree

10 files changed

+1698
-0
lines changed

10 files changed

+1698
-0
lines changed

docling_core/transforms/serializer/latex.py

Lines changed: 614 additions & 0 deletions
Large diffs are not rendered by default.

docling_core/types/doc/document.py

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4607,6 +4607,46 @@ def save_as_html(
46074607
with open(filename, "w", encoding="utf-8") as fw:
46084608
fw.write(html_out)
46094609

4610+
def save_as_latex(
    self,
    filename: Union[str, Path],
    artifacts_dir: Optional[Path] = None,
    from_element: int = 0,
    to_element: int = sys.maxsize,
    labels: Optional[set[DocItemLabel]] = None,
    image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
    page_no: Optional[int] = None,
    included_content_layers: Optional[set[ContentLayer]] = None,
    include_annotations: bool = True,
    page_break_command: Optional[str] = None,
    enable_chart_tables: bool = True,
    indent: int = 2,
    image_placeholder: str = "% image",
    escape_latex: bool = True,
) -> None:
    """Save to LaTeX.

    Serializes the document through ``export_to_latex`` and writes the
    resulting text to ``filename`` using UTF-8 encoding.

    :param filename: Destination file; a ``str`` is converted to ``Path``.
    :param artifacts_dir: Optional artifacts directory, resolved together
        with ``filename`` by ``_get_output_paths``. The resolved directory
        is created on disk only when ``image_mode`` is
        ``ImageRefMode.REFERENCED``.
    :param from_element: Forwarded unchanged to ``export_to_latex``.
    :param to_element: Forwarded unchanged to ``export_to_latex``.
    :param labels: Forwarded unchanged to ``export_to_latex``.
    :param image_mode: Used both for preparing the document copy and for
        the serialization itself.
    :param page_no: Used both for preparing the document copy and for the
        serialization itself.
    :param included_content_layers: Forwarded unchanged to
        ``export_to_latex``.
    :param include_annotations: Forwarded unchanged to ``export_to_latex``.
    :param page_break_command: Forwarded unchanged to ``export_to_latex``.
    :param enable_chart_tables: Forwarded unchanged to ``export_to_latex``.
    :param indent: Forwarded unchanged to ``export_to_latex``.
    :param image_placeholder: Forwarded unchanged to ``export_to_latex``.
    :param escape_latex: Forwarded unchanged to ``export_to_latex``.
    """
    # Path() accepts both str and Path, so no explicit type check is needed.
    filename = Path(filename)

    resolved_artifacts_dir, reference_path = self._get_output_paths(
        filename, artifacts_dir
    )

    # The artifacts directory is only needed when images are referenced.
    if image_mode == ImageRefMode.REFERENCED:
        os.makedirs(resolved_artifacts_dir, exist_ok=True)

    # Serialize a copy prepared for the requested image mode rather than
    # this document itself.
    new_doc = self._make_copy_with_refmode(
        resolved_artifacts_dir,
        image_mode,
        page_no,
        reference_path=reference_path,
    )

    latex_out = new_doc.export_to_latex(
        from_element=from_element,
        to_element=to_element,
        labels=labels,
        enable_chart_tables=enable_chart_tables,
        image_mode=image_mode,
        page_no=page_no,
        included_content_layers=included_content_layers,
        include_annotations=include_annotations,
        page_break_command=page_break_command,
        indent=indent,
        image_placeholder=image_placeholder,
        escape_latex=escape_latex,
    )

    filename.write_text(latex_out, encoding="utf-8")
4649+
46104650
def _get_output_paths(
46114651
self, filename: Union[str, Path], artifacts_dir: Optional[Path] = None
46124652
) -> Tuple[Path, Optional[Path]]:
@@ -4703,6 +4743,57 @@ def export_to_html( # noqa: C901
47034743

47044744
return ser_res.text
47054745

4746+
def export_to_latex(
    self,
    from_element: int = 0,
    to_element: int = sys.maxsize,
    labels: Optional[set[DocItemLabel]] = None,
    enable_chart_tables: bool = True,
    image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
    page_no: Optional[int] = None,
    included_content_layers: Optional[set[ContentLayer]] = None,
    include_annotations: bool = True,
    page_break_command: Optional[str] = None,
    indent: int = 2,
    image_placeholder: str = "% image",
    escape_latex: bool = True,
) -> str:
    """Serialize to LaTeX.

    Builds a ``LaTeXParams`` object from the arguments, runs a
    ``LaTeXDocSerializer`` over this document and returns the text of the
    serialization result.

    :param from_element: Passed to the serializer params as ``start_idx``.
    :param to_element: Passed to the serializer params as ``stop_idx``.
    :param labels: Labels passed to the serializer params; when ``None``,
        ``DOCUMENT_TOKENS_EXPORT_LABELS`` is used.
    :param enable_chart_tables: Passed through to the serializer params.
    :param image_mode: Passed through to the serializer params.
    :param page_no: When given, the serializer params receive the
        single-element set ``{page_no}`` as ``pages``; otherwise ``None``.
    :param included_content_layers: Layers passed to the serializer params;
        when ``None``, ``DEFAULT_CONTENT_LAYERS`` is used.
    :param include_annotations: Passed through to the serializer params.
    :param page_break_command: Passed through to the serializer params.
    :param indent: Passed through to the serializer params.
    :param image_placeholder: Passed through to the serializer params
        (presumably the text emitted in place of an image; confirm in the
        serializer).
    :param escape_latex: Passed through to the serializer params.
    :returns: The ``text`` attribute of the serialization result.
    """
    # Imported at call time rather than at module level, as in the
    # original code.
    from docling_core.transforms.serializer.latex import (
        LaTeXDocSerializer,
        LaTeXParams,
    )

    # Fall back to the module-level defaults when no explicit selection
    # was supplied.
    effective_labels = DOCUMENT_TOKENS_EXPORT_LABELS if labels is None else labels
    if included_content_layers is None:
        effective_layers = DEFAULT_CONTENT_LAYERS
    else:
        effective_layers = included_content_layers

    page_filter = None if page_no is None else {page_no}

    latex_params = LaTeXParams(
        labels=effective_labels,
        layers=effective_layers,
        pages=page_filter,
        start_idx=from_element,
        stop_idx=to_element,
        image_mode=image_mode,
        enable_chart_tables=enable_chart_tables,
        include_annotations=include_annotations,
        page_break_command=page_break_command,
        indent=indent,
        image_placeholder=image_placeholder,
        escape_latex=escape_latex,
    )

    return LaTeXDocSerializer(doc=self, params=latex_params).serialize().text
4796+
47064797
@staticmethod
47074798
def load_from_doctags( # noqa: C901
47084799
doctag_document: DocTagsDocument, document_name: str = "Document"

test/data/doc/2408.09869v3_enriched.gt.tex

Lines changed: 447 additions & 0 deletions
Large diffs are not rendered by default.

test/data/doc/activities.gt.tex

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
\subsection{Summer activities}
2+
3+
\subsection{Swimming in the lake}
4+
5+
Duck
6+
7+
\begin{figure}[h]
8+
% image
9+
\caption{Figure 1: This is a cute duckling}
10+
\end{figure}
11+
12+
\subsection{Let's swim!}
13+
14+
To get started with swimming, first lay down in a water and try not to drown:
15+
16+
\begin{itemize}
17+
\item ∞ You can relax and look around
18+
\item ∞ Paddle about
19+
\item ∞ Enjoy summer warmth
20+
\end{itemize}
21+
22+
Also, don't forget:
23+
24+
\begin{itemize}
25+
\item 1. Wear sunglasses
26+
\item 2. Don't forget to drink water
27+
\item 3. Use sun cream
28+
\end{itemize}
29+
30+
Hmm, what else…
31+
32+
\begin{itemize}
33+
\item -Another activity item
34+
\item -Yet another one
35+
\item -Stopping it here
36+
\end{itemize}
37+
38+
Some text.
39+
40+
\begin{itemize}
41+
\item -Starting the next page with a list item.
42+
\item -Second item.
43+
\end{itemize}

test/data/doc/construct_doc.gt.tex

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
\begin{itemize}
2+
\item item of leading list
3+
\end{itemize}
4+
5+
\section*{Title of the Document}
6+
7+
Author 1
8+
Affiliation 1
9+
10+
Author 2
11+
Affiliation 2
12+
13+
\subsection{1. Introduction}
14+
15+
This paper introduces the biggest invention ever made. ...
16+
17+
\begin{itemize}
18+
\item list item 1
19+
\item list item 2
20+
\item list item 3
21+
\begin{enumerate}
22+
\item list item 3.a
23+
\item list item 3.b
24+
\item list item 3.c
25+
\begin{enumerate}
26+
\item list item 3.c.i
27+
\end{enumerate}
28+
\end{enumerate}
29+
\item list item 4
30+
\end{itemize}
31+
32+
\begin{table}[h]
33+
\caption{This is the caption of table 1.}
34+
\begin{tabular}{|l|l|l|}
35+
\hline
36+
Product & Years & Years \\ \hline
37+
Product & 2016 & 2017 \\ \hline
38+
Apple & 49823 & 695944 \\ \hline
39+
\end{tabular}
40+
\end{table}
41+
42+
\begin{figure}[h]
43+
% image
44+
\caption{This is the caption of figure 1.}
45+
\end{figure}
46+
47+
\begin{figure}[h]
48+
% image
49+
\caption{This is the caption of figure 2.}
50+
\end{figure}
51+
52+
\begin{itemize}
53+
\item item 1 of list
54+
\end{itemize}
55+
56+
\begin{itemize}
57+
\item item 1 of list after empty list
58+
\item item 2 of list after empty list
59+
\end{itemize}
60+
61+
\begin{itemize}
62+
\item item 1 of neighboring list
63+
\item item 2 of neighboring list
64+
\begin{itemize}
65+
\item item 1 of sub list
66+
\item Here a code snippet: \texttt{print("Hello world")} (to be displayed inline)
67+
Here a code snippet: \texttt{print("Hello world")} (to be displayed inline)
68+
\item Here a formula: $E=mc^2$ (to be displayed inline)
69+
Here a formula: $E=mc^2$ (to be displayed inline)
70+
\end{itemize}
71+
\end{itemize}
72+
73+
Here a code block:
74+
75+
\begin{verbatim}
76+
print("Hello world")
77+
\end{verbatim}
78+
79+
Here a formula block:
80+
81+
$$E=mc^2$$
82+
83+
% missing-key-value-item
84+
85+
% missing-form-item
86+
87+
Some formatting chops: \textbf{bold} \textit{italic} \underline{underline} \sout{strikethrough} $_{subscript}$ $^{superscript}$ \href{.}{hyperlink} \& \href{https://github.com/DS4SD/docling}{\sout{\underline{\textit{\textbf{everything at the same time.}}}}}
88+
89+
\begin{enumerate}
90+
\item Item 1 in A
91+
\item Item 2 in A
92+
\item Item 3 in A
93+
\begin{enumerate}
94+
\item Item 1 in B
95+
\item Item 2 in B
96+
\begin{enumerate}
97+
\item Item 1 in C
98+
\item Item 2 in C
99+
\end{enumerate}
100+
\item Item 3 in B
101+
\end{enumerate}
102+
\item Item 4 in A
103+
\end{enumerate}
104+
105+
\begin{itemize}
106+
\item List item without parent list group
107+
\end{itemize}
108+
109+
The end.
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
\section*{Rich tables}
2+
3+
\begin{table}[h]
4+
\begin{tabular}{|l|l|}
5+
\hline
6+
cell 0,0 & cell 0,1 \\ \hline
7+
cell 1,0 & \textit{text in italic} \\ \hline
8+
\begin{itemize} \item list item 1 \item list item 2 \end{itemize} & cell 2,1 \\ \hline
9+
cell 3,0 & \begin{table}[h] \begin{tabular}{|l|l|l|} \hline inner cell 0,0 & inner cell 0,1 & inner cell 0,2 \\ \hline inner cell 1,0 & inner cell 1,1 & inner cell 1,2 \\ \hline \end{tabular} \end{table} \\ \hline
10+
Some text in a generic group. More text in the group. & cell 4,1 \\ \hline
11+
\end{tabular}
12+
\end{table}

test/data/doc/dummy_doc.gt.tex

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
\section*{DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis}
2+
3+
\begin{figure}[h]
4+
% image
5+
\caption{Figure 1: Four examples of complex page layouts across different document categories}
6+
% annotation[classification]: bar chart
7+
% annotation[description]: ...
8+
% annotation[molecule_data]: CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1
9+
\end{figure}
10+
11+
% annotation[description]: A description annotation for this table.
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
\section*{Contribution guideline example}
2+
3+
This is simple.
4+
5+
Foo \textit{emphasis} \textbf{strong emphasis} \textit{\textbf{both}} .
6+
7+
Create your feature branch: \texttt{git checkout -b feature/AmazingFeature} .
8+
9+
\begin{enumerate}
10+
\item Pull the \href{https://github.com/docling-project/docling}{\textbf{repository}} .
11+
Pull the \href{https://github.com/docling-project/docling}{\textbf{repository}} .
12+
\item Create your feature branch ( \texttt{git checkout -b feature/AmazingFeature} )
13+
Create your feature branch ( \texttt{git checkout -b feature/AmazingFeature} )
14+
\item Commit your changes ( \texttt{git commit -m 'Add some AmazingFeature'} )
15+
Commit your changes ( \texttt{git commit -m 'Add some AmazingFeature'} )
16+
\item Push to the branch ( \texttt{git push origin feature/AmazingFeature} )
17+
Push to the branch ( \texttt{git push origin feature/AmazingFeature} )
18+
\item Open a Pull Request
19+
\item \textbf{Whole list item has same formatting}
20+
\item List item has \textit{mixed or partial} formatting
21+
List item has \textit{mixed or partial} formatting
22+
\end{enumerate}
23+
24+
\section*{\textit{Whole heading is italic}}
25+
26+
Some \texttt{formatted_code}
27+
28+
\subsection{\textit{Partially formatted} heading to\_escape \texttt{not_to_escape} $E=mc^2$ \& ampersand}
29+
30+
\textit{Partially formatted} heading to\_escape \texttt{not_to_escape} $E=mc^2$ \& ampersand
31+
32+
The end.

0 commit comments

Comments
 (0)