62 changes: 61 additions & 1 deletion notebooks/instructlab-knowledge/instructlab-knowledge.ipynb
@@ -151,14 +151,74 @@
"source": [
"import json\n",
"\n",
"converted_json_paths = []\n",
"\n",
"for file in files:\n",
" doc = doc_converter.convert(source=file).document\n",
" doc_dict = doc.export_to_dict()\n",
"\n",
" json_output_path = CONVERSION_OUTPUT_DIR / f\"{file.stem}.json\"\n",
" with open(json_output_path, \"w\") as f:\n",
" json.dump(doc_dict, f)\n",
" print(f\"Path of JSON output is: {Path(json_output_path).resolve()}\")"
" print(f\"Path of JSON output is: {Path(json_output_path).resolve()}\")\n",
" converted_json_paths.append(Path(json_output_path).resolve())"
]
},
{
"cell_type": "markdown",
"id": "40710019-7ec9-414e-ad72-1ba672cf5fc2",
"metadata": {},
"source": [
"## Post-Conversion: Illuminator Analysis"
]
},
{
"cell_type": "markdown",
"id": "2572e2d0-94dc-4ca0-b032-3978af26c9c9",
"metadata": {},
"source": [
"The output of document conversion is not always perfect. Data may become distorted or corrupted, which can negatively affect a model's performance after training. While optional, reviewing your converted data is strongly recommended. The following example explains how to use the Illuminator tool to identify common conversion issues."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "09e07e35-befb-4ed5-9fe4-41544f88d943",
"metadata": {},
"outputs": [],
"source": [
"from utils.illuminator.analysis import analyze_docling_tables\n",
"from utils.illuminator.utils import generate_summary\n",
"from docling.datamodel.document import DoclingDocument\n",
"\n",
"import json\n",
"import sys\n",
"from pathlib import Path\n",
"\n",
"results = {}\n",
"\n",
"for path in converted_json_paths:\n",
" with open(path, \"r\") as f:\n",
" doc_dict = json.load(f)\n",
"\n",
" doc = DoclingDocument(**doc_dict)\n",
" results[path] = analyze_docling_tables(doc)\n",
"\n",
"summary_path = Path(\"illuminator_readable_summary.txt\")\n",
"\n",
"with open(summary_path, \"w\") as f:\n",
" generate_summary(results, file=f)\n",
"\n",
"print(f\"✅ Post-conversion summary saved to: {summary_path.resolve()}\")"
]
},
{
"cell_type": "markdown",
"id": "eea0876e-ac55-45fc-93e8-3e646a6c3104",
"metadata": {},
"source": [
"\n",
"The output of this post-conversion step should help determine whether to avoid using the content for chunking entirely or to manually edit it before proceeding with chunking.\n"
]
},
{
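One way to act on that report, sketched here as a follow-up to the notebook cells above (not part of this PR): partition the converted documents by whether any merged table cells were flagged, assuming the `results` dict produced in the analysis cell.

```
# Sketch only: `results` maps each converted JSON path to the dict
# returned by analyze_docling_tables for that document.
clean_paths = [
    path for path, issues in results.items()
    if not issues.get("merged_cell_pages")  # nothing flagged on any page
]
flagged_paths = [path for path in results if path not in clean_paths]

print(f"{len(clean_paths)} document(s) look safe to chunk")
print(f"{len(flagged_paths)} document(s) need manual review before chunking")
```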
@@ -35,9 +35,20 @@ pip install -r requirements.txt
```

## 🚀 Usage
### Check a Single PDF or an Entire Folder of PDFs
### Supports PDF and JSON
Illuminator works with:
- Raw PDF files (converted on the fly using Docling)
- Docling-generated JSON files (post-conversion documents)

### Analyze a Single File
```
python illuminator.py -f /path/to/pdf/document.pdf
python illuminator.py -f /path/to/document.pdf
python illuminator.py -f /path/to/document.json
```

### Analyze All Files in a Folder
```
python illuminator.py -f /path/to/folder/
```

### Save Results to a JSON File
@@ -89,4 +100,3 @@ python illuminator.py -f /path/to/pdf/folder/ -o results.json

## 🤝 Acknowledgments
Built by Alina with ❤️ for better PDF conversion workflows!

@@ -1,6 +1,7 @@
from docling.document_converter import DocumentConverter
from typing import List, Tuple, Dict, Any, Union
from log_utils import logger
from docling.datamodel.document import DoclingDocument
from typing import List, Tuple, Dict, Any, Union, Set
from .log_utils import logger
import os

def cell_is_merged(cell) -> bool:
Expand Down Expand Up @@ -40,37 +41,36 @@ def summarize_tables(doc) -> Tuple[int, List[int]]:

return num_tables, pages


def analyze_pdf_with_docling(file_path) -> Dict[str, Union[int, List[Any], set]]:
def convert_to_docling_document(file_path: str) -> DoclingDocument:
"""
Analyzes a PDF for merged table cells using the Docling converter,
and saves a Markdown version of the converted document.
Converts a PDF using Docling and saves a Markdown version of the document.

Args:
file_path: Path to the input PDF file.

Returns:
A dictionary containing:
- Total number of tables.
- Set of pages with merged cells.
- List of merged cell details (page, position, spans, text).
- Total unique pages with tables.
The converted Docling Document object.
"""
converter = DocumentConverter()
result = converter.convert(file_path)
doc = result.document

# Save Markdown output
# Save Markdown output
markdown_text = doc.export_to_markdown()
base_name = os.path.splitext(os.path.basename(file_path))[0]
md_output_path = f"{base_name}.md"
with open(md_output_path, "w") as f:
with open(md_output_path, "w", encoding="utf-8") as f:
f.write(markdown_text)

logger.info(f"📝 Markdown saved to {md_output_path}")
return doc

def analyze_docling_tables(doc_input: DoclingDocument) -> Dict[str, Union[int, List[dict], List[int], str]]:
"""
Analyzes a DoclingDocument for merged table cells, regardless of whether it came from a PDF conversion or a saved JSON export.
"""

# ⬇️ Proceed with table analysis
table_count, table_pages_list = summarize_tables(doc)
table_count, table_pages_list = summarize_tables(doc_input)
total_pages = len(set(table_pages_list)) or "Unknown"

issues = {
@@ -80,7 +80,7 @@ def analyze_pdf_with_docling(file_path) -> Dict[str, Union[int, List[Any], set]]
"page_count": total_pages
}

for i, table_item in enumerate(doc.tables):
for i, table_item in enumerate(doc_input.tables):
try:
page_number = table_pages_list[i]
except IndexError:
@@ -103,4 +103,4 @@ def analyze_pdf_with_docling(file_path) -> Dict[str, Union[int, List[Any], set]]
issues["merged_cell_pages"].add(page_number)

issues["merged_cell_pages"] = sorted(issues["merged_cell_pages"])
return issues
return issues
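A hedged usage sketch of the refactored entry points (not from this PR): conversion and analysis are now decoupled, so `analyze_docling_tables` can take a `DoclingDocument` built either from a fresh PDF conversion or from a saved JSON export. Import path follows the notebook above; file names are illustrative.

```
import json

from docling.datamodel.document import DoclingDocument
from utils.illuminator.analysis import analyze_docling_tables, convert_to_docling_document

# From a raw PDF: convert first, then analyze.
doc_from_pdf = convert_to_docling_document("report.pdf")

# From a saved Docling JSON export (e.g. json.dump(doc.export_to_dict(), f)).
with open("report.json", "r") as f:
    doc_from_json = DoclingDocument(**json.load(f))

for doc in (doc_from_pdf, doc_from_json):
    issues = analyze_docling_tables(doc)
    print(issues["table_count"], issues["merged_cell_pages"])
```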
@@ -1,8 +1,8 @@
# main.py
import argparse
from utils import get_pdf_files, save_results, generate_summary
from analysis import analyze_pdf_with_docling
from log_utils import logger
from utils import get_supported_files, save_results, generate_summary
from analysis import convert_to_docling_document, analyze_docling_tables
from .log_utils import logger

def parse_args() -> argparse.Namespace:
"""
@@ -27,25 +27,26 @@ def parse_args() -> argparse.Namespace:
)
return parser.parse_args()


def main() -> None:
"""
Main execution flow:
- Parses arguments
- Loads and analyzes PDFs
- Loads and analyzes PDFs or JSON files
- Generates and saves results
"""
args = parse_args()
pdfs = get_pdf_files(args.file)
if not pdfs:
logger.error("❌ No PDFs found to process.")
files = get_supported_files(args.file)
if not files:
logger.error("❌ No supported input files found to process.")
return

all_results = {}
for path in pdfs:
for path in files:
logger.info(f"\n🔍 Converting and analyzing: {path}\n")
try:
result = analyze_pdf_with_docling(path)
# Use Docling to convert (PDF path)
doc = convert_to_docling_document(path)
result = analyze_docling_tables(doc)
all_results[path] = result
except Exception as e:
logger.error(f"❌ Failed to process {path}: {e}")
@@ -54,4 +55,4 @@ def main() -> None:
save_results(all_results, args.output)

if __name__ == "__main__":
main()
main()
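For reference, the updated `main()` flow corresponds to invocations like these, using the `-f`/`-o` flags documented in the README (paths illustrative):

```
python illuminator.py -f ./docs/manual.pdf
python illuminator.py -f ./converted/ -o results.json
```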
@@ -3,27 +3,39 @@
import os
from datetime import datetime, timezone
from typing import List, Optional, Dict, Any
from log_utils import logger
from .log_utils import logger

MAX_PREVIEW_LENGTH = 30 # Max characters shown from cell text in summary
SUPPORTED_FILE_EXTENSIONS = [".pdf", ".json"]

def get_pdf_files(path: str) -> List[str]:
def get_supported_files(path: str) -> List[str]:
"""
Returns a list of PDF file paths from a given file or directory.
Returns a list of file paths from the given path that match supported extensions.
"""
return [
os.path.join(path, f)
for f in os.listdir(path)
if os.path.isfile(os.path.join(path, f)) and any(f.endswith(ext) for ext in SUPPORTED_FILE_EXTENSIONS)
]

def get_supported_files(path: str, extensions: List[str] = [".pdf", ".json"]) -> List[str]:
"""
Returns a list of one or more files whose extensions match the supported extensions.

Args:
path: Path to a single .pdf file or directory containing .pdf files.
path: Path to a single file or directory.
extensions: List of file extensions to include.

Returns:
List of .pdf file paths.
List of matching file paths.
"""
if os.path.isfile(path) and path.endswith(".pdf"):
if os.path.isfile(path) and any(path.endswith(ext) for ext in extensions):
return [path]
elif os.path.isdir(path):
return [
os.path.join(path, f)
for f in os.listdir(path)
if f.endswith(".pdf")
if any(f.endswith(ext) for ext in extensions)
]
return []
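Illustrative behavior of the dispatch above, assuming the variant of `get_supported_files` that takes an `extensions` parameter is the operative one (paths hypothetical):

```
get_supported_files("docs/")           # e.g. ["docs/a.pdf", "docs/b.json"]
get_supported_files("docs/a.pdf")      # ["docs/a.pdf"]
get_supported_files("docs/notes.txt")  # [] (unsupported extension)
```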

@@ -50,41 +62,48 @@ def save_results(results, output_file: str) -> None:
json.dump(results, f, indent=4)
logger.info(f"📁 Results saved to {output_file}")

def generate_summary(results) -> None:
def generate_summary(results, file=None) -> None:
"""
Prints a human-readable summary of merged table cell issues per file.
Prints a human-readable summary of merged table cell issues per file,
optionally also writing to a file.

Args:
results: Dictionary containing analysis results for each PDF.
results: Dictionary containing analysis results for each document.
file: Optional file-like object (e.g., open file or StringIO) to also write output.
"""
logger.info("📊 Summary Report")
logger.info("=" * 50)
for file, data in results.items():
logger.info(f"\n📂 File: {file}")
def out(msg):
print(msg) # Always print to stdout
if file:
print(msg, file=file) # Also write to file if provided

out("📊 Summary Report")
out("=" * 50)
for path, data in results.items():
out(f"\n📂 File: {path}")

total_tables = data.get("table_count", 0)
merged_cells = data.get("merged_table_cells", [])
tables_with_merged_cells = len(set(cell["page"] for cell in merged_cells))

if total_tables == 0:
logger.info("ℹ️ No tables detected in this document.")
out("ℹ️ No tables detected in this document.")
continue

logger.info(f"📋 Found {total_tables} table(s); {tables_with_merged_cells} table(s) have merged cells.")
out(f"📋 Found {total_tables} table(s); {tables_with_merged_cells} table(s) have merged cells.")

if not data.get("merged_cell_pages"):
logger.info("✅ Tables detected, but no merged cells found.")
out("✅ Tables detected, but no merged cells found.")
continue

pages = format_pages(data["merged_cell_pages"])
logger.info(f"⚠️ Merged Table Cells Detected on Pages: {pages}")
out(f"⚠️ Merged Table Cells Detected on Pages: {pages}")

for cell in merged_cells:
page = cell.get("page")
text = cell.get("text", "").strip()
if len(text) > MAX_PREVIEW_LENGTH:
text = text[:MAX_PREVIEW_LENGTH] + "..."
logger.info(f" - Page {page}: \"{text}\" (row={cell['row']}, column={cell['column']})")
out(f" - Page {page}: \"{text}\" (row={cell['row']}, column={cell['column']})")

def format_pages(pages) -> str:
"""
@@ -114,4 +133,4 @@ def format_pages(pages) -> str:
ranges.append(f"{start}")
else:
ranges.append(f"{start}-{pages[-1]}")
return ", ".join(ranges)
return ", ".join(ranges)