1+ # %% [markdown]
2+ # Customize PDF conversion by toggling OCR/backends and pipeline options.
3+ #
4+ # What this example does
5+ # - Shows several alternative configurations for the Docling PDF pipeline.
6+ # - Lets you try OCR engines (EasyOCR, Tesseract, system OCR) or no OCR.
7+ # - Converts a single sample PDF and exports results to `scratch/`.
8+ #
9+ # Prerequisites
10+ # - Install Docling and its optional OCR backends per the docs.
11+ # - Ensure you can import `docling` from your Python environment.
12+ #
13+ # How to run
14+ # - From the repository root, run: `python docs/examples/custom_convert.py`.
15+ # - Outputs are written under `scratch/` next to where you run the script.
16+ #
17+ # Choosing a configuration
18+ # - Only one configuration block should be active at a time.
19+ # - To experiment, comment out the active section and uncomment exactly one other.
20+ # - The file ships with "Docling Parse with EasyOCR" enabled as a sensible default.
21+ # - If you uncomment a backend or OCR option that is not already imported in
22+ # this file, also import its class, e.g.:
23+ # - `from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend`
24+ # - `from docling.datamodel.pipeline_options import TesseractOcrOptions, TesseractCliOcrOptions, OcrMacOptions`
25+ #
26+ # Input document
27+ # - Defaults to a single PDF from `tests/data/pdf/` in the repo.
28+ # - If you don't have the test data, update `input_doc_path` to a local PDF.
29+ #
30+ # Notes
31+ # - EasyOCR language: adjust `pipeline_options.ocr_options.lang` (e.g., ["en"], ["es"], ["en", "de"]).
32+ # - Accelerators: tune `AcceleratorOptions` to select CPU/GPU or threads.
33+ # - Exports: JSON, plain text, Markdown, and doctags are saved in `scratch/`.
34+
35+ # %%
36+
137import json
238import logging
339import time
@@ -21,9 +57,8 @@ def main():
2157
2258 ###########################################################################
2359
24- # The following sections contain a combination of PipelineOptions
25- # and PDF Backends for various configurations.
26- # Uncomment one section at the time to see the differences in the output.
60+ # The sections below demo combinations of PdfPipelineOptions and backends.
61+ # Tip: Keep exactly one section uncommented at a time to compare outputs.
2762
2863 # PyPdfium without EasyOCR
2964 # --------------------
@@ -68,8 +103,10 @@ def main():
68103 # }
69104 # )
70105
71- # Docling Parse with EasyOCR
72- # ----------------------
106+ # Docling Parse with EasyOCR (default)
107+ # -------------------------------
108+ # Enables OCR and table structure with EasyOCR, using automatic device
109+ # selection via AcceleratorOptions. Adjust languages as needed.
73110 pipeline_options = PdfPipelineOptions ()
74111 pipeline_options .do_ocr = True
75112 pipeline_options .do_table_structure = True
@@ -86,7 +123,7 @@ def main():
86123 )
87124
88125 # Docling Parse with EasyOCR (CPU only)
89- # ----------------------
126+ # -------------------------------------
90127 # pipeline_options = PdfPipelineOptions()
91128 # pipeline_options.do_ocr = True
92129 # pipeline_options.ocr_options.use_gpu = False # <-- set this.
@@ -100,7 +137,7 @@ def main():
100137 # )
101138
102139 # Docling Parse with Tesseract
103- # ----------------------
140+ # ----------------------------
104141 # pipeline_options = PdfPipelineOptions()
105142 # pipeline_options.do_ocr = True
106143 # pipeline_options.do_table_structure = True
@@ -114,7 +151,7 @@ def main():
114151 # )
115152
116153 # Docling Parse with Tesseract CLI
117- # ----------------------
154+ # --------------------------------
118155 # pipeline_options = PdfPipelineOptions()
119156 # pipeline_options.do_ocr = True
120157 # pipeline_options.do_table_structure = True
@@ -127,8 +164,8 @@ def main():
127164 # }
128165 # )
129166
130- # Docling Parse with ocrmac(Mac only)
131- # ----------------------
167+ # Docling Parse with ocrmac (macOS only)
168+ # --------------------------------------
132169 # pipeline_options = PdfPipelineOptions()
133170 # pipeline_options.do_ocr = True
134171 # pipeline_options.do_table_structure = True
@@ -154,13 +191,13 @@ def main():
154191 output_dir .mkdir (parents = True , exist_ok = True )
155192 doc_filename = conv_result .input .file .stem
156193
157- # Export Deep Search document JSON format:
194+ # Export Docling document JSON format:
158195 with (output_dir / f"{ doc_filename } .json" ).open ("w" , encoding = "utf-8" ) as fp :
159196 fp .write (json .dumps (conv_result .document .export_to_dict ()))
160197
161- # Export Text format:
198+ # Export plain text (Markdown export with strict_text=True):
162199 with (output_dir / f"{ doc_filename } .txt" ).open ("w" , encoding = "utf-8" ) as fp :
163- fp .write (conv_result .document .export_to_text ( ))
200+ fp .write (conv_result .document .export_to_markdown ( strict_text = True ))
164201
165202 # Export Markdown format:
166203 with (output_dir / f"{ doc_filename } .md" ).open ("w" , encoding = "utf-8" ) as fp :
0 commit comments