|
1 | 1 | # docling-hierarchical-pdf |
2 | 2 |
|
3 | | -[](https://img.shields.io/github/v/release/krrome/docling-hierarchical-pdf) |
4 | | -[](https://github.com/krrome/docling-hierarchical-pdf/actions/workflows/main.yml?query=branch%3Amain) |
5 | 3 | [](https://img.shields.io/github/commit-activity/m/krrome/docling-hierarchical-pdf) |
6 | 4 | [](https://img.shields.io/github/license/krrome/docling-hierarchical-pdf) |
7 | 5 |
|
8 | 6 | This package enables inference of header hierarchy in the docling PDF parsing pipeline. |
| 7 | + |
| 8 | +The docs are still a work in progress, but as a user all you need is: |
| 9 | + |
| 10 | +Install it: |
| 11 | +```bash |
| 12 | +pip install docling-hierarchical-pdf |
| 13 | +``` |
| 14 | + |
| 15 | +Use it: |
| 16 | +```python |
| 17 | +from docling.document_converter import DocumentConverter |
| 18 | +from hierarchical.postprocessor import ResultPostprocessor |
| 19 | + |
| 20 | +source = "my_file.pdf" # local path or URL of the document |
| 21 | +converter = DocumentConverter() |
| 22 | +result = converter.convert(source) |
| 23 | +# the postprocessor modifies result.document in place. |
| 24 | +ResultPostprocessor(result).process() |
| 25 | + |
| 26 | +# enjoy the reordered document - for example convert it to markdown |
| 27 | +result.document.export_to_markdown() |
| 28 | + |
| 29 | +# or use a chunker on it... |
| 30 | +``` |
| 31 | + |
| 32 | +Or, for the VLM pipeline: |
| 33 | + |
| 34 | +```python |
| 35 | +from docling.datamodel.base_models import InputFormat |
| 36 | +from docling.document_converter import DocumentConverter, PdfFormatOption |
| 37 | +from docling.pipeline.vlm_pipeline import VlmPipeline |
| 38 | +from hierarchical.postprocessor import ResultPostprocessor |
| 39 | + |
| 40 | +source = "my_scanned.pdf" # local path or URL of the document |
| 41 | +converter = DocumentConverter( |
| 42 | + format_options={ |
| 43 | + InputFormat.PDF: PdfFormatOption( |
| 44 | + pipeline_cls=VlmPipeline, |
| 45 | + ), |
| 46 | + } |
| 47 | +) |
| 48 | +result = converter.convert(source=source) |
| 49 | +ResultPostprocessor(result).process() |
| 50 | + |
| 51 | +# enjoy the reordered document - for example convert it to markdown |
| 52 | +result.document.export_to_markdown() |
| 53 | + |
| 54 | +# or use a chunker on it... |
| 55 | +``` |
0 commit comments