|
151 | 151 | "source": [ |
152 | 152 | "import json\n", |
153 | 153 | "\n", |
| 154 | + "converted_json_paths = []\n", |
| 155 | + "\n", |
154 | 156 | "for file in files:\n", |
155 | 157 | " doc = doc_converter.convert(source=file).document\n", |
156 | 158 | " doc_dict = doc.export_to_dict()\n", |
157 | 159 | "\n", |
158 | 160 | " json_output_path = CONVERSION_OUTPUT_DIR / f\"{file.stem}.json\"\n", |
159 | 161 | " with open(json_output_path, \"w\") as f:\n", |
160 | 162 | " json.dump(doc_dict, f)\n", |
161 | | - " print(f\"Path of JSON output is: {Path(json_output_path).resolve()}\")" |
| 163 | + " print(f\"Path of JSON output is: {Path(json_output_path).resolve()}\")\n", |
| 164 | + " converted_json_paths.append(Path(json_output_path).resolve())" |
| 165 | + ] |
| 166 | + }, |
| 167 | + { |
| 168 | + "cell_type": "markdown", |
| 169 | + "id": "40710019-7ec9-414e-ad72-1ba672cf5fc2", |
| 170 | + "metadata": {}, |
| 171 | + "source": [ |
| 172 | + "## Post-Conversion: Illuminator Analysis" |
| 173 | + ] |
| 174 | + }, |
| 175 | + { |
| 176 | + "cell_type": "markdown", |
| 177 | + "id": "2572e2d0-94dc-4ca0-b032-3978af26c9c9", |
| 178 | + "metadata": {}, |
| 179 | + "source": [ |
| 180 | + "The output of document conversion is not always perfect. Data may become distorted or corrupted during conversion, which can negatively affect a model's performance after training. Reviewing your converted data is optional but strongly recommended. The following example uses the Illuminator tool to analyze the tables in each converted document for common conversion issues and saves a human-readable summary to `illuminator_readable_summary.txt`." |
| 181 | + ] |
| 182 | + }, |
| 183 | + { |
| 184 | + "cell_type": "code", |
| 185 | + "execution_count": null, |
| 186 | + "id": "09e07e35-befb-4ed5-9fe4-41544f88d943", |
| 187 | + "metadata": {}, |
| 188 | + "outputs": [], |
| 189 | + "source": [ |
| 190 | + "from utils.illuminator.analysis import analyze_docling_tables\n", |
| 191 | + "from utils.illuminator.utils import generate_summary\n", |
| 192 | + "from docling.datamodel.document import DoclingDocument\n", |
| 193 | + "\n", |
| 194 | + "import json\n", |
| 195 | + "import sys\n", |
| 196 | + "from pathlib import Path\n", |
| 197 | + "\n", |
| 198 | + "# Step 1: Analyze converted JSONs\n", |
| 199 | + "results = {}\n", |
| 200 | + "\n", |
| 201 | + "for path in converted_json_paths:\n", |
| 202 | + " with open(path, \"r\") as f:\n", |
| 203 | + " doc_dict = json.load(f)\n", |
| 204 | + "\n", |
| 205 | + " doc = DoclingDocument(**doc_dict)\n", |
| 206 | + " results[path] = analyze_docling_tables(doc)\n", |
| 207 | + "\n", |
| 208 | + "# Step 2: Save human-readable summary to a .txt file\n", |
| 209 | + "summary_path = Path(\"illuminator_readable_summary.txt\")\n", |
| 210 | + "\n", |
| 211 | + "with open(summary_path, \"w\") as f:\n", |
| 212 | + " original_stdout = sys.stdout\n", |
| 213 | + " sys.stdout = f\n", |
| 214 | + " try:\n", |
| 215 | + " generate_summary(results)\n", |
| 216 | + " finally:\n", |
| 217 | + " sys.stdout = original_stdout\n", |
| 218 | + "\n", |
| 219 | + "print(f\"✅ Post-conversion summary saved to: {summary_path.resolve()}\")" |
| 220 | + ] |
| 221 | + }, |
| 222 | + { |
| 223 | + "cell_type": "markdown", |
| 224 | + "id": "eea0876e-ac55-45fc-93e8-3e646a6c3104", |
| 225 | + "metadata": {}, |
| 226 | + "source": [ |
| 227 | + "\n", |
| 228 | + "Review the summary produced by this post-conversion step to decide whether each converted document should be excluded from chunking entirely or manually edited before chunking.\n" |
162 | 229 | ] |
163 | 230 | }, |
164 | 231 | { |
|
681 | 748 | ], |
682 | 749 | "metadata": { |
683 | 750 | "kernelspec": { |
684 | | - "display_name": ".venv", |
| 751 | + "display_name": "Python 3 (ipykernel)", |
685 | 752 | "language": "python", |
686 | 753 | "name": "python3" |
687 | 754 | }, |
|
695 | 762 | "name": "python", |
696 | 763 | "nbconvert_exporter": "python", |
697 | 764 | "pygments_lexer": "ipython3", |
698 | | - "version": "3.11.12" |
| 765 | + "version": "3.13.2" |
699 | 766 | } |
700 | 767 | }, |
701 | 768 | "nbformat": 4, |
|
0 commit comments