|
151 | 151 | "source": [ |
152 | 152 | "import json\n", |
153 | 153 | "\n", |
| 154 | + "converted_json_paths = []\n", |
| 155 | + "\n", |
154 | 156 | "for file in files:\n", |
155 | 157 | " doc = doc_converter.convert(source=file).document\n", |
156 | 158 | " doc_dict = doc.export_to_dict()\n", |
157 | 159 | "\n", |
158 | 160 | " json_output_path = CONVERSION_OUTPUT_DIR / f\"{file.stem}.json\"\n", |
159 | 161 | " with open(json_output_path, \"w\") as f:\n", |
160 | 162 | " json.dump(doc_dict, f)\n", |
161 | | - " print(f\"Path of JSON output is: {Path(json_output_path).resolve()}\")" |
| 163 | + " print(f\"Path of JSON output is: {Path(json_output_path).resolve()}\")\n", |
| 164 | + "    converted_json_paths.append(Path(json_output_path).resolve())" |
| 165 | + ] |
| 166 | + }, |
| 167 | + { |
| 168 | + "cell_type": "markdown", |
| 169 | + "id": "40710019-7ec9-414e-ad72-1ba672cf5fc2", |
| 170 | + "metadata": {}, |
| 171 | + "source": [ |
| 172 | + "## Post-Conversion: Illuminator Analysis" |
| 173 | + ] |
| 174 | + }, |
| 175 | + { |
| 176 | + "cell_type": "markdown", |
| 177 | + "id": "2572e2d0-94dc-4ca0-b032-3978af26c9c9", |
| 178 | + "metadata": {}, |
| 179 | + "source": [ |
| 180 | + "The output of document conversion is not always perfect. Data may become distorted or corrupted, which can negatively affect a model's performance after training. While optional, reviewing your converted data is strongly recommended. The following example explains how to use the Illuminator tool to identify common conversion issues." |
| 181 | + ] |
| 182 | + }, |
| 183 | + { |
| 184 | + "cell_type": "code", |
| 185 | + "execution_count": null, |
| 186 | + "id": "09e07e35-befb-4ed5-9fe4-41544f88d943", |
| 187 | + "metadata": {}, |
| 188 | + "outputs": [], |
| 189 | + "source": [ |
| 190 | + "from utils.illuminator.analysis import analyze_docling_tables\n", |
| 191 | + "from utils.illuminator.utils import generate_summary\n", |
| 192 | + "from docling.datamodel.document import DoclingDocument\n", |
| 193 | + "\n", |
| 194 | + "import json\n", |
| 195 | + "import sys\n", |
| 196 | + "from pathlib import Path\n", |
| 197 | + "\n", |
| 198 | + "results = {}\n", |
| 199 | + "\n", |
| 200 | + "for path in converted_json_paths:\n", |
| 201 | + " with open(path, \"r\") as f:\n", |
| 202 | + " doc_dict = json.load(f)\n", |
| 203 | + "\n", |
| 204 | + " doc = DoclingDocument(**doc_dict)\n", |
| 205 | + " results[path] = analyze_docling_tables(doc)\n", |
| 206 | + "\n", |
| 207 | + "summary_path = Path(\"illuminator_readable_summary.txt\")\n", |
| 208 | + "\n", |
| 209 | + "with open(summary_path, \"w\") as f:\n", |
| 210 | + " generate_summary(results, file=f)\n", |
| 211 | + "\n", |
| 212 | + "print(f\"✅ Post-conversion summary saved to: {summary_path.resolve()}\")" |
| 213 | + ] |
| 214 | + }, |
| 215 | + { |
| 216 | + "cell_type": "markdown", |
| 217 | + "id": "eea0876e-ac55-45fc-93e8-3e646a6c3104", |
| 218 | + "metadata": {}, |
| 219 | + "source": [ |
| 220 | + "\n", |
| 221 | + "Use the output of this post-conversion step to decide whether to exclude the converted content from chunking entirely or to manually edit it before proceeding with chunking.\n"
162 | 222 | ] |
163 | 223 | }, |
164 | 224 | { |
|
0 commit comments