Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 55 additions & 10 deletions notebooks/instructlab-knowledge/instructlab-knowledge.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,7 @@
"source": [
"print(\"Files to pre-process\\n--------------------\")\n",
"for contribution in contributions:\n",
" print(f\"\\nContribution: {contribution.get(\"name\")}\")\n",
" print(f\"\\nContribution: {contribution.get('name')}\")\n",
" print(\"Files:\")\n",
" files = list((contribution['dir'] / SOURCE_DOCUMENT_DIR).glob(\"*.pdf\"))\n",
" for file in files:\n",
Expand Down Expand Up @@ -273,9 +273,13 @@
"source": [
"### Configure Docling conversion pipeline\n",
"\n",
"Next we set the configuration options for our conversion pipeline. The PDF Conversion options set here are the defaults. More information about pipeline configuration can be found on Docling.\n",
"Next we set the configuration options for our conversion pipeline. \n",
"\n",
"For a complete reference on Docling conversion pipeline configuration, see [PDFPipelineOptions](https://docling-project.github.io/docling/reference/pipeline_options/#docling.datamodel.pipeline_options.PdfPipelineOptions) and [PDFFormatOptions](https://docling-project.github.io/docling/reference/document_converter/#docling.document_converter.InputFormat.XML_JATS)."
"The standard pipeline options generally yield good and fast results for most documents. In some cases, however, alternative conversion techniques can lead to better outcomes. For instance, OCR is effective for scanned documents or images that contain text to be extracted and analyzed. In cases where other techniques didn't produce good results, using a vision-language model (VLM) may be a good option.\n",
"\n",
"The next cell contains three combinations of pipeline options: the default (standard) options, a variant that forces OCR on the entire document, and another that uses a VLM. You can comment or uncomment the corresponding code blocks to switch between them. For more information and additional conversion techniques, check our [Docling Conversion Tutorials](https://github.com/instructlab/examples/blob/main/docs/docling-conversion/README.md).\n",
"\n",
"For a complete reference on Docling's conversion pipeline configuration, check the [Examples](https://docling-project.github.io/docling/examples/) section of the official documentation, as well as the [PDFPipelineOptions](https://docling-project.github.io/docling/reference/pipeline_options/#docling.datamodel.pipeline_options.PdfPipelineOptions) and [PDFFormatOptions](https://docling-project.github.io/docling/reference/document_converter/#docling.document_converter.InputFormat.XML_JATS) reference pages."
]
},
{
Expand All @@ -285,19 +289,58 @@
"metadata": {},
"outputs": [],
"source": [
"from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions\n",
"from docling.document_converter import DocumentConverter, PdfFormatOption\n",
"from docling.datamodel.base_models import InputFormat\n",
"from docling.datamodel.pipeline_options import PdfPipelineOptions\n",
"\n",
"pipeline_options = PdfPipelineOptions() # TODO: show the options that can be set\n",
"\n",
"from docling.datamodel.pipeline_options import (\n",
" EasyOcrOptions,\n",
" PdfPipelineOptions,\n",
" VlmPipelineOptions,\n",
" smoldocling_vlm_conversion_options,\n",
")\n",
"from docling.pipeline.vlm_pipeline import VlmPipeline\n",
"from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend\n",
"\n",
"# Standard pipeline options\n",
"pipeline_options = PdfPipelineOptions()\n",
"doc_converter = DocumentConverter(\n",
" format_options={\n",
" InputFormat.PDF: PdfFormatOption(\n",
" pipeline_options=pipeline_options\n",
" )\n",
" }\n",
")"
")\n",
"\n",
"# Force OCR on the entire page\n",
"# pipeline_options = PdfPipelineOptions()\n",
"# pipeline_options.do_ocr = True\n",
"# pipeline_options.ocr_options.force_full_page_ocr = True\n",
"# pipeline_options.ocr_options.lang = [\"en\"]\n",
"# pipeline_options.ocr_options = EasyOcrOptions(force_full_page_ocr=True)\n",
"# pipeline_options.accelerator_options = AcceleratorOptions(\n",
"# num_threads=4, device=AcceleratorDevice.AUTO\n",
"# )\n",
"# doc_converter = DocumentConverter(\n",
"# format_options={\n",
"# InputFormat.PDF: PdfFormatOption(\n",
"# pipeline_options=pipeline_options,\n",
"# backend=DoclingParseV4DocumentBackend,\n",
"# )\n",
"# }\n",
"# )\n",
"\n",
"# Use the SmolDocling VLM\n",
"# pipeline_options = VlmPipelineOptions()\n",
"# pipeline_options.vlm_options = smoldocling_vlm_conversion_options\n",
"# doc_converter = DocumentConverter(\n",
"# format_options={\n",
"# InputFormat.PDF: PdfFormatOption(\n",
"# pipeline_options=pipeline_options,\n",
"# pipeline_cls=VlmPipeline,\n",
"# )\n",
"# }\n",
"# )\n",
"\n"
]
},
{
Expand Down Expand Up @@ -332,7 +375,9 @@
" with open(json_output_path, \"w\") as f:\n",
" json.dump(doc_dict, f)\n",
" print(f\"Path of JSON output is: {Path(json_output_path).resolve()}\")\n",
" json_files.append(json_output_path.resolve())"
" json_files.append(json_output_path.resolve())\n",
"\n",
" print(f\"\\nSample:\\n ${doc.export_to_text()[:500]}...\")"
]
},
{
Expand Down Expand Up @@ -817,7 +862,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
"version": "3.11.13"
}
},
"nbformat": 4,
Expand Down