62 changes: 61 additions & 1 deletion notebooks/instructlab-knowledge/instructlab-knowledge.ipynb
@@ -151,14 +151,74 @@
"source": [
"import json\n",
"\n",
"converted_json_paths = []\n",
"\n",
"for file in files:\n",
" doc = doc_converter.convert(source=file).document\n",
" doc_dict = doc.export_to_dict()\n",
"\n",
" json_output_path = CONVERSION_OUTPUT_DIR / f\"{file.stem}.json\"\n",
" with open(json_output_path, \"w\") as f:\n",
" json.dump(doc_dict, f)\n",
" print(f\"Path of JSON output is: {Path(json_output_path).resolve()}\")"
" print(f\"Path of JSON output is: {Path(json_output_path).resolve()}\")\n",
" converted_json_paths.append(Path(json_output_path).resolve())"
]
},
{
"cell_type": "markdown",
"id": "40710019-7ec9-414e-ad72-1ba672cf5fc2",
"metadata": {},
"source": [
"## Post-Conversion: Illuminator Analysis"
]
},
{
"cell_type": "markdown",
"id": "2572e2d0-94dc-4ca0-b032-3978af26c9c9",
"metadata": {},
"source": [
"The output of document conversion is not always perfect. Data may become distorted or corrupted, which can negatively affect a model's performance after training. While optional, reviewing your converted data is strongly recommended. The following example explains how to use the Illuminator tool to identify common conversion issues."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "09e07e35-befb-4ed5-9fe4-41544f88d943",
"metadata": {},
"outputs": [],
"source": [
"from utils.illuminator.analysis import analyze_docling_tables\n",
"from utils.illuminator.utils import generate_summary\n",
"from docling.datamodel.document import DoclingDocument\n",
"\n",
"import json\n",
"import sys\n",
"from pathlib import Path\n",
"\n",
"results = {}\n",
"\n",
"for path in converted_json_paths:\n",
" with open(path, \"r\") as f:\n",
" doc_dict = json.load(f)\n",
"\n",
" doc = DoclingDocument(**doc_dict)\n",
" results[path] = analyze_docling_tables(doc)\n",
"\n",
"summary_path = Path(\"illuminator_readable_summary.txt\")\n",
"\n",
"with open(summary_path, \"w\") as f:\n",
" generate_summary(results, file=f)\n",
"\n",
"print(f\"✅ Post-conversion summary saved to: {summary_path.resolve()}\")"
]
},
{
"cell_type": "markdown",
"id": "eea0876e-ac55-45fc-93e8-3e646a6c3104",
"metadata": {},
"source": [
"\n",
"The output of this post-conversion step should help determine whether to avoid using the content for chunking entirely or to manually edit it before proceeding with chunking.\n"
]
},
{
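One way to act on that report, sketched here as a follow-up to the notebook cells above (not part of this PR): partition the converted documents by whether any merged table cells were flagged, assuming the `results` dict produced in the analysis cell.

```
# Sketch only: `results` maps each converted JSON path to the dict
# returned by analyze_docling_tables for that document.
clean_paths = [
    path for path, issues in results.items()
    if not issues.get("merged_cell_pages")  # nothing flagged on any page
]
flagged_paths = [path for path in results if path not in clean_paths]

print(f"{len(clean_paths)} document(s) look safe to chunk")
print(f"{len(flagged_paths)} document(s) need manual review before chunking")
```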
@@ -35,9 +35,20 @@ pip install -r requirements.txt
```

## 🚀 Usage
### Check a Single PDF or an Entire Folder of PDFs
### Supports PDF and JSON
Illuminator works with:
- Raw PDF files (converted on the fly using Docling)
- Docling-generated JSON files (post-conversion documents)

### Analyze a Single File
```
python illuminator.py -f /path/to/pdf/document.pdf
python illuminator.py -f /path/to/document.pdf
python illuminator.py -f /path/to/document.json
```

### Analyze All Files in a Folder
```
python illuminator.py -f /path/to/folder/
```

### Save Results to a JSON File
@@ -89,4 +100,3 @@ python illuminator.py -f /path/to/pdf/folder/ -o results.json

## 🤝 Acknowledgments
Built by Alina with ❤️ for better PDF conversion workflows!

@@ -1,6 +1,7 @@
from docling.document_converter import DocumentConverter
from typing import List, Tuple, Dict, Any, Union
from log_utils import logger
from docling.datamodel.document import DoclingDocument
from typing import List, Tuple, Dict, Any, Union, Set
from .log_utils import logger
import os

def cell_is_merged(cell) -> bool:
Expand Down Expand Up @@ -40,37 +41,36 @@ def summarize_tables(doc) -> Tuple[int, List[int]]:

return num_tables, pages


def analyze_pdf_with_docling(file_path) -> Dict[str, Union[int, List[Any], set]]:
def convert_to_docling_document(file_path: str) -> DoclingDocument:
"""
Analyzes a PDF for merged table cells using the Docling converter,
and saves a Markdown version of the converted document.
Converts a PDF using Docling and saves a Markdown version of the document.

Args:
file_path: Path to the input PDF file.

Returns:
A dictionary containing:
- Total number of tables.
- Set of pages with merged cells.
- List of merged cell details (page, position, spans, text).
- Total unique pages with tables.
The converted Docling Document object.
"""
converter = DocumentConverter()
result = converter.convert(file_path)
doc = result.document

# Save Markdown output
# Save Markdown output
markdown_text = doc.export_to_markdown()
base_name = os.path.splitext(os.path.basename(file_path))[0]
md_output_path = f"{base_name}.md"
with open(md_output_path, "w") as f:
with open(md_output_path, "w", encoding="utf-8") as f:
f.write(markdown_text)

logger.info(f"📝 Markdown saved to {md_output_path}")
return doc

def analyze_docling_tables(doc_input: DoclingDocument) -> Dict[str, Union[int, List[dict], List[int], str]]:
"""
Analyzes a DoclingDocument for merged table cells, regardless of whether it came from a PDF conversion or a saved JSON export.
"""

# ⬇️ Proceed with table analysis
table_count, table_pages_list = summarize_tables(doc)
table_count, table_pages_list = summarize_tables(doc_input)
total_pages = len(set(table_pages_list)) or "Unknown"

issues = {
@@ -80,7 +80,7 @@ def analyze_pdf_with_docling(file_path) -> Dict[str, Union[int, List[Any], set]]
"page_count": total_pages
}

for i, table_item in enumerate(doc.tables):
for i, table_item in enumerate(doc_input.tables):
try:
page_number = table_pages_list[i]
except IndexError:
@@ -103,4 +103,4 @@ def analyze_pdf_with_docling(file_path) -> Dict[str, Union[int, List[Any], set]]
issues["merged_cell_pages"].add(page_number)

issues["merged_cell_pages"] = sorted(issues["merged_cell_pages"])
return issues
return issues
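A hedged usage sketch of the refactored entry points (not from this PR): conversion and analysis are now decoupled, so `analyze_docling_tables` can take a `DoclingDocument` built either from a fresh PDF conversion or from a saved JSON export. Import path follows the notebook above; file names are illustrative.

```
import json

from docling.datamodel.document import DoclingDocument
from utils.illuminator.analysis import analyze_docling_tables, convert_to_docling_document

# From a raw PDF: convert first, then analyze.
doc_from_pdf = convert_to_docling_document("report.pdf")

# From a saved Docling JSON export (e.g. json.dump(doc.export_to_dict(), f)).
with open("report.json", "r") as f:
    doc_from_json = DoclingDocument(**json.load(f))

for doc in (doc_from_pdf, doc_from_json):
    issues = analyze_docling_tables(doc)
    print(issues["table_count"], issues["merged_cell_pages"])
```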
@@ -1,8 +1,8 @@
# main.py
import argparse
from utils import get_pdf_files, save_results, generate_summary
from analysis import analyze_pdf_with_docling
from log_utils import logger
from utils import get_supported_files, save_results, generate_summary
from analysis import convert_to_docling_document, analyze_docling_tables
from .log_utils import logger

def parse_args() -> argparse.Namespace:
"""
@@ -27,25 +27,26 @@ def parse_args() -> argparse.Namespace:
)
return parser.parse_args()


def main() -> None:
"""
Main execution flow:
- Parses arguments
- Loads and analyzes PDFs
- Loads and analyzes PDFs or JSON files
- Generates and saves results
"""
args = parse_args()
pdfs = get_pdf_files(args.file)
if not pdfs:
logger.error("❌ No PDFs found to process.")
files = get_supported_files(args.file)
if not files:
logger.error("❌ No supported input files found to process.")
return

all_results = {}
for path in pdfs:
for path in files:
logger.info(f"\n🔍 Converting and analyzing: {path}\n")
try:
result = analyze_pdf_with_docling(path)
# Use Docling to convert (PDF path)
doc = convert_to_docling_document(path)
result = analyze_docling_tables(doc)
all_results[path] = result
except Exception as e:
logger.error(f"❌ Failed to process {path}: {e}")
@@ -54,4 +55,4 @@ def main() -> None:
save_results(all_results, args.output)

if __name__ == "__main__":
main()
main()
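For reference, the updated `main()` flow corresponds to invocations like these, using the `-f`/`-o` flags documented in the README (paths illustrative):

```
python illuminator.py -f ./docs/manual.pdf
python illuminator.py -f ./converted/ -o results.json
```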
@@ -3,27 +3,39 @@
import os
from datetime import datetime, timezone
from typing import List, Optional, Dict, Any
from log_utils import logger
from .log_utils import logger

MAX_PREVIEW_LENGTH = 30 # Max characters shown from cell text in summary
SUPPORTED_FILE_EXTENSIONS = [".pdf", ".json"]

def get_pdf_files(path: str) -> List[str]:
def get_supported_files(path: str) -> List[str]:
"""
Returns a list of PDF file paths from a given file or directory.
Returns a list of file paths from the given path that match supported extensions.
"""
return [
os.path.join(path, f)
for f in os.listdir(path)
if os.path.isfile(os.path.join(path, f)) and any(f.endswith(ext) for ext in SUPPORTED_FILE_EXTENSIONS)
]

def get_supported_files(path: str, extensions: List[str] = [".pdf", ".json"]) -> List[str]:
"""
Returns a list of one or more files whose extensions match the supported extensions.

Args:
path: Path to a single .pdf file or directory containing .pdf files.
path: Path to a single file or directory.
extensions: List of file extensions to include.

Returns:
List of .pdf file paths.
List of matching file paths.
"""
if os.path.isfile(path) and path.endswith(".pdf"):
if os.path.isfile(path) and any(path.endswith(ext) for ext in extensions):
return [path]
elif os.path.isdir(path):
return [
os.path.join(path, f)
for f in os.listdir(path)
if f.endswith(".pdf")
if any(f.endswith(ext) for ext in extensions)
]
return []
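Illustrative behavior of the dispatch above, assuming the variant of `get_supported_files` that takes an `extensions` parameter is the operative one (paths hypothetical):

```
get_supported_files("docs/")           # e.g. ["docs/a.pdf", "docs/b.json"]
get_supported_files("docs/a.pdf")      # ["docs/a.pdf"]
get_supported_files("docs/notes.txt")  # [] (unsupported extension)
```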

@@ -50,41 +62,48 @@ def save_results(results, output_file: str) -> None:
json.dump(results, f, indent=4)
logger.info(f"📁 Results saved to {output_file}")

def generate_summary(results) -> None:
def generate_summary(results, file=None) -> None:
"""
Prints a human-readable summary of merged table cell issues per file.
Prints a human-readable summary of merged table cell issues per file,
optionally also writing to a file.

Args:
results: Dictionary containing analysis results for each PDF.
results: Dictionary containing analysis results for each document.
file: Optional file-like object (e.g., open file or StringIO) to also write output.
"""
logger.info("📊 Summary Report")
logger.info("=" * 50)
for file, data in results.items():
logger.info(f"\n📂 File: {file}")
def out(msg):
print(msg) # Always print to stdout
if file:
print(msg, file=file) # Also write to file if provided

out("📊 Summary Report")
out("=" * 50)
for path, data in results.items():
out(f"\n📂 File: {path}")

total_tables = data.get("table_count", 0)
merged_cells = data.get("merged_table_cells", [])
tables_with_merged_cells = len(set(cell["page"] for cell in merged_cells))

if total_tables == 0:
logger.info("ℹ️ No tables detected in this document.")
out("ℹ️ No tables detected in this document.")
continue

logger.info(f"📋 Found {total_tables} table(s); {tables_with_merged_cells} table(s) have merged cells.")
out(f"📋 Found {total_tables} table(s); {tables_with_merged_cells} table(s) have merged cells.")

if not data.get("merged_cell_pages"):
logger.info("✅ Tables detected, but no merged cells found.")
out("✅ Tables detected, but no merged cells found.")
continue

pages = format_pages(data["merged_cell_pages"])
logger.info(f"⚠️ Merged Table Cells Detected on Pages: {pages}")
out(f"⚠️ Merged Table Cells Detected on Pages: {pages}")

for cell in merged_cells:
page = cell.get("page")
text = cell.get("text", "").strip()
if len(text) > MAX_PREVIEW_LENGTH:
text = text[:MAX_PREVIEW_LENGTH] + "..."
logger.info(f" - Page {page}: \"{text}\" (row={cell['row']}, column={cell['column']})")
out(f" - Page {page}: \"{text}\" (row={cell['row']}, column={cell['column']})")

def format_pages(pages) -> str:
"""
@@ -114,4 +133,4 @@ def format_pages(pages) -> str:
ranges.append(f"{start}")
else:
ranges.append(f"{start}-{pages[-1]}")
return ", ".join(ranges)
return ", ".join(ranges)