diff --git a/notebooks/instructlab-knowledge/instructlab-knowledge.ipynb b/notebooks/instructlab-knowledge/instructlab-knowledge.ipynb index f98e519..7a6b493 100644 --- a/notebooks/instructlab-knowledge/instructlab-knowledge.ipynb +++ b/notebooks/instructlab-knowledge/instructlab-knowledge.ipynb @@ -239,7 +239,7 @@ "source": [ "print(\"Files to pre-process\\n--------------------\")\n", "for contribution in contributions:\n", - " print(f\"\\nContribution: {contribution.get(\"name\")}\")\n", + " print(f\"\\nContribution: {contribution.get('name')}\")\n", " print(\"Files:\")\n", " files = list((contribution['dir'] / SOURCE_DOCUMENT_DIR).glob(\"*.pdf\"))\n", " for file in files:\n", @@ -273,9 +273,13 @@ "source": [ "### Configure Docling conversion pipeline\n", "\n", - "Next we set the configuration options for our conversion pipeline. The PDF Conversion options set here are the defaults. More information about pipeline configuration can be found on Docling.\n", + "Next we set the configuration options for our conversion pipeline. \n", "\n", - "For a complete reference on Docling conversion pipeline configuration, see [PDFPipelineOptions](https://docling-project.github.io/docling/reference/pipeline_options/#docling.datamodel.pipeline_options.PdfPipelineOptions) and [PDFFormatOptions](https://docling-project.github.io/docling/reference/document_converter/#docling.document_converter.InputFormat.XML_JATS)." + "The standard pipeline options generally yield good and fast results for most documents. In some cases, however, alternative conversion techniques can lead to better outcomes. For instance, OCR is effective for scanned documents or images that contain text to be extracted and analyzed. 
When other techniques don't produce good results, a vision-language model (VLM) may be a good option.\n", + "\n", + "The next cell contains three combinations of pipeline options: the default (standard) options, a variant that forces OCR on the entire document, and another that uses a VLM. To switch between them, comment out the active code block and uncomment the one you want to use. For more information and additional conversion techniques, check our [Docling Conversion Tutorials](https://github.com/instructlab/examples/blob/main/docs/docling-conversion/README.md).\n", + "\n", + "For a complete reference on Docling's conversion pipeline configuration, check the [Examples](https://docling-project.github.io/docling/examples/) section of the official documentation, as well as the [PdfPipelineOptions](https://docling-project.github.io/docling/reference/pipeline_options/#docling.datamodel.pipeline_options.PdfPipelineOptions) and [PdfFormatOption](https://docling-project.github.io/docling/reference/document_converter/#docling.document_converter.PdfFormatOption) reference pages." 
] }, { @@ -285,19 +289,58 @@ "metadata": {}, "outputs": [], "source": [ + "from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions\n", "from docling.document_converter import DocumentConverter, PdfFormatOption\n", "from docling.datamodel.base_models import InputFormat\n", - "from docling.datamodel.pipeline_options import PdfPipelineOptions\n", - "\n", - "pipeline_options = PdfPipelineOptions() # TODO: show the options that can be set\n", - "\n", + "from docling.datamodel.pipeline_options import (\n", + " EasyOcrOptions,\n", + " PdfPipelineOptions,\n", + " VlmPipelineOptions,\n", + " smoldocling_vlm_conversion_options,\n", + ")\n", + "from docling.pipeline.vlm_pipeline import VlmPipeline\n", + "from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend\n", + "\n", + "# Standard pipeline options\n", + "pipeline_options = PdfPipelineOptions()\n", "doc_converter = DocumentConverter(\n", " format_options={\n", " InputFormat.PDF: PdfFormatOption(\n", " pipeline_options=pipeline_options\n", " )\n", " }\n", - ")" + ")\n", + "\n", + "# Force OCR on the entire page (create the OCR options first, then customize them)\n", + "# pipeline_options = PdfPipelineOptions()\n", + "# pipeline_options.do_ocr = True\n", + "# pipeline_options.ocr_options = EasyOcrOptions()\n", + "# pipeline_options.ocr_options.force_full_page_ocr = True\n", + "# pipeline_options.ocr_options.lang = [\"en\"]\n", + "# pipeline_options.accelerator_options = AcceleratorOptions(\n", + "# num_threads=4, device=AcceleratorDevice.AUTO\n", + "# )\n", + "# doc_converter = DocumentConverter(\n", + "# format_options={\n", + "# InputFormat.PDF: PdfFormatOption(\n", + "# pipeline_options=pipeline_options,\n", + "# backend=DoclingParseV4DocumentBackend,\n", + "# )\n", + "# }\n", + "# )\n", + "\n", + "# Use the SmolDocling VLM\n", + "# pipeline_options = VlmPipelineOptions()\n", + "# pipeline_options.vlm_options = smoldocling_vlm_conversion_options\n", + "# doc_converter = 
DocumentConverter(\n", + "# format_options={\n", + "# InputFormat.PDF: PdfFormatOption(\n", + "# pipeline_options=pipeline_options,\n", + "# pipeline_cls=VlmPipeline,\n", + "# )\n", + "# }\n", + "# )\n", + "\n" ] }, { @@ -332,7 +375,9 @@ " with open(json_output_path, \"w\") as f:\n", " json.dump(doc_dict, f)\n", " print(f\"Path of JSON output is: {Path(json_output_path).resolve()}\")\n", - " json_files.append(json_output_path.resolve())" + " json_files.append(json_output_path.resolve())\n", + "\n", + " print(f\"\\nSample:\\n {doc.export_to_text()[:500]}...\")" ] }, { @@ -817,7 +862,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.11.13" } }, "nbformat": 4,