Add notebook

alinaryan · alinaryan · commit d625f134ca58 · 2025-05-29T10:56:14.000-04:00
Signed-off-by: Alina Ryan &lt;aliryan@redhat.com&gt;
diff --git a/notebooks/instructlab-knowledge/instructlab-knowledge.ipynb b/notebooks/instructlab-knowledge/instructlab-knowledge.ipynb
@@ -151,14 +151,81 @@
    "source": [
     "import json\n",
     "\n",
+    "converted_json_paths = []\n",
+    "\n",
     "for file in files:\n",
     "    doc = doc_converter.convert(source=file).document\n",
     "    doc_dict = doc.export_to_dict()\n",
     "\n",
     "    json_output_path = CONVERSION_OUTPUT_DIR / f\"{file.stem}.json\"\n",
     "    with open(json_output_path, \"w\") as f:\n",
     "        json.dump(doc_dict, f)\n",
-    "        print(f\"Path of JSON output is: {Path(json_output_path).resolve()}\")"
+    "        print(f\"Path of JSON output is: {Path(json_output_path).resolve()}\")\n",
+    "    converted_json_paths.append(Path(json_output_path).resolve())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "40710019-7ec9-414e-ad72-1ba672cf5fc2",
+   "metadata": {},
+   "source": [
+    "## Post-Conversion: Illuminator Analysis"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2572e2d0-94dc-4ca0-b032-3978af26c9c9",
+   "metadata": {},
+   "source": [
+    "The output of document conversion is not always perfect. Data may become distorted or corrupted during conversion, which can negatively affect a model's performance after training. Reviewing your converted data is optional but strongly recommended. The following example uses the Illuminator tool to check the tables in each converted document for common conversion issues and writes a human-readable summary to `illuminator_readable_summary.txt`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "09e07e35-befb-4ed5-9fe4-41544f88d943",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "import sys\n",
+    "from contextlib import redirect_stdout\n",
+    "from pathlib import Path\n",
+    "\n",
+    "from docling.datamodel.document import DoclingDocument\n",
+    "from utils.illuminator.analysis import analyze_docling_tables\n",
+    "from utils.illuminator.utils import generate_summary\n",
+    "\n",
+    "# Step 1: Analyze converted JSONs (results are keyed by each file's path)\n",
+    "results = {}\n",
+    "\n",
+    "for path in converted_json_paths:\n",
+    "    with open(path, \"r\", encoding=\"utf-8\") as f:\n",
+    "        doc_dict = json.load(f)\n",
+    "\n",
+    "    doc = DoclingDocument(**doc_dict)\n",
+    "    results[path] = analyze_docling_tables(doc)\n",
+    "\n",
+    "# Step 2: Save human-readable summary to a .txt file\n",
+    "summary_path = Path(\"illuminator_readable_summary.txt\")\n",
+    "\n",
+    "# generate_summary is expected to print its report, so stdout is redirected\n",
+    "# into the file for the duration of the call and restored automatically,\n",
+    "# even if it raises. UTF-8 is set explicitly so non-ASCII characters in the\n",
+    "# report do not fail on platforms whose default encoding is not UTF-8.\n",
+    "with open(summary_path, \"w\", encoding=\"utf-8\") as f, redirect_stdout(f):\n",
+    "    generate_summary(results)\n",
+    "\n",
+    "print(f\"✅ Post-conversion summary saved to: {summary_path.resolve()}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "eea0876e-ac55-45fc-93e8-3e646a6c3104",
+   "metadata": {},
+   "source": [
+    "\n",
+    "Use the output of this post-conversion step to decide whether a converted document should be excluded from chunking entirely or manually corrected before you proceed with chunking.\n"
    ]
   },
   {
@@ -681,7 +748,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": ".venv",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -695,7 +762,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.12"
+   "version": "3.13.2"
   }
  },
  "nbformat": 4,