Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 48 additions & 2 deletions notebooks/instructlab-knowledge/instructlab-knowledge.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -360,14 +360,23 @@
"source": [
"import json\n",
"\n",
"confidence_reports = dict()\n",
"\n",
"json_files=[]\n",
"\n",
"for contribution in contributions:\n",
" files = list((contribution[\"dir\"] / SOURCE_DOCUMENT_DIR).glob(\"*.pdf\"))\n",
" \n",
" for file in files:\n",
" doc = doc_converter.convert(source=file).document\n",
" print(f\"Converting {file}...\")\n",
" \n",
" conversion_result = doc_converter.convert(source=file)\n",
"\n",
" doc = conversion_result.document\n",
" doc_dict = doc.export_to_dict()\n",
" \n",
" confidence_reports[file] = conversion_result.confidence\n",
" \n",
" conversion_output_dir = contribution[\"dir\"] / CONVERSION_DIR\n",
" conversion_output_dir.mkdir(parents=True, exist_ok=True)\n",
" \n",
Expand All @@ -377,7 +386,44 @@
" print(f\"Path of JSON output is: {Path(json_output_path).resolve()}\")\n",
" json_files.append(json_output_path.resolve())\n",
"\n",
" print(f\"\\nSample:\\n ${doc.export_to_text()[:500]}...\")"
" print(\"Document sample:\\n\")\n",
" print(f\"{doc.export_to_text()[:500]}...\")\n",
" print()"
]
},
{
"cell_type": "markdown",
"id": "07a15341-aa4a-4471-85ac-0a49df20fb2e",
"metadata": {},
"source": [
"### Conversion confidence\n",
"\n",
"When converting a document, Docling can calculate how confident it is in the quality of the conversion. This *confidence* is expressed as both a *score* and a *grade*. The score is a numeric value between 0 and 1, and the grade is a label that can be **poor**, **fair**, **good**, or **excellent**. If Docling is unable to calculate a confidence grade, the value will be marked as *unspecified*.\n",
"\n",
"If your document receives a low score (for example, below 0.8) and a grade of *poor* or *fair*, you'll probably benefit from using a different conversion technique. In that case, go back to the *Configure Docling Conversion Pipeline* section and try selecting a different approach (e.g. forcing OCR or using a VLM) and compare the results."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f5e4aaa1-4f82-456b-ae37-0a68da10a4c7",
"metadata": {},
"outputs": [],
"source": [
"# Print a per-document summary of the conversion confidence reports gathered\n",
"# during conversion: the overall grade and score, then the below-average pages.\n",
"for file, confidence_report in confidence_reports.items():\n",
"    print(f\"Conversion confidence for {file}:\")\n",
"    \n",
"    print(f\"Average confidence: \x1b[1m{confidence_report.mean_grade.name}\033[0m (score {confidence_report.mean_score:.3f})\")\n",
"    \n",
"    # Keep the keys of the pages whose own mean score is below the document mean.\n",
"    low_score_pages = [\n",
"        page\n",
"        for page, page_confidence_report in confidence_report.pages.items()\n",
"        if page_confidence_report.mean_score < confidence_report.mean_score\n",
"    ]\n",
"\n",
"    # Page keys are shown with 1 added. When no page is below the mean, say so\n",
"    # explicitly instead of printing a label followed by an empty list.\n",
"    if low_score_pages:\n",
"        print(f\"Pages that scored lower than average: {', '.join(str(x + 1) for x in low_score_pages)}\")\n",
"    else:\n",
"        print(\"Pages that scored lower than average: none\")\n",
"    \n",
"    print()"
]
},
{
Expand Down