diff --git a/notebooks/instructlab-knowledge/instructlab-knowledge.ipynb b/notebooks/instructlab-knowledge/instructlab-knowledge.ipynb index 7ebde02..a32f2a5 100644 --- a/notebooks/instructlab-knowledge/instructlab-knowledge.ipynb +++ b/notebooks/instructlab-knowledge/instructlab-knowledge.ipynb @@ -151,6 +151,8 @@ "source": [ "import json\n", "\n", + "converted_json_paths = []\n", + "\n", "for file in files:\n", " doc = doc_converter.convert(source=file).document\n", " doc_dict = doc.export_to_dict()\n", @@ -158,7 +160,65 @@ " json_output_path = CONVERSION_OUTPUT_DIR / f\"{file.stem}.json\"\n", " with open(json_output_path, \"w\") as f:\n", " json.dump(doc_dict, f)\n", - " print(f\"Path of JSON output is: {Path(json_output_path).resolve()}\")" + " print(f\"Path of JSON output is: {Path(json_output_path).resolve()}\")\n", + " converted_json_paths.append(Path(json_output_path).resolve())" + ] + }, + { + "cell_type": "markdown", + "id": "40710019-7ec9-414e-ad72-1ba672cf5fc2", + "metadata": {}, + "source": [ + "## Post-Conversion: Illuminator Analysis" + ] + }, + { + "cell_type": "markdown", + "id": "2572e2d0-94dc-4ca0-b032-3978af26c9c9", + "metadata": {}, + "source": [ + "The output of document conversion is not always perfect. Data may become distorted or corrupted, which can negatively affect a model's performance after training. While optional, reviewing your converted data is strongly recommended. The following example explains how to use the Illuminator tool to identify common conversion issues." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "09e07e35-befb-4ed5-9fe4-41544f88d943", + "metadata": {}, + "outputs": [], + "source": [ + "from utils.illuminator.analysis import analyze_docling_tables\n", + "from utils.illuminator.utils import generate_summary\n", + "from docling.datamodel.document import DoclingDocument\n", + "\n", + "import json\n", + "import sys\n", + "from pathlib import Path\n", + "\n", + "results = {}\n", + "\n", + "for path in converted_json_paths:\n", + " with open(path, \"r\") as f:\n", + " doc_dict = json.load(f)\n", + "\n", + " doc = DoclingDocument(**doc_dict)\n", + " results[path] = analyze_docling_tables(doc)\n", + "\n", + "summary_path = Path(\"illuminator_readable_summary.txt\")\n", + "\n", + "with open(summary_path, \"w\") as f:\n", + " generate_summary(results, file=f)\n", + "\n", + "print(f\"āœ… Post-conversion summary saved to: {summary_path.resolve()}\")" + ] + }, + { + "cell_type": "markdown", + "id": "eea0876e-ac55-45fc-93e8-3e646a6c3104", + "metadata": {}, + "source": [ + "\n", + "The output of this post-conversion step should help determine whether to avoid using the content for chunking entirely or to manually edit it before proceeding with chunking.\n" ] }, { diff --git a/notebooks/illuminator/.gitignore b/notebooks/instructlab-knowledge/utils/illuminator/.gitignore similarity index 100% rename from notebooks/illuminator/.gitignore rename to notebooks/instructlab-knowledge/utils/illuminator/.gitignore diff --git a/notebooks/illuminator/README.md b/notebooks/instructlab-knowledge/utils/illuminator/README.md similarity index 86% rename from notebooks/illuminator/README.md rename to notebooks/instructlab-knowledge/utils/illuminator/README.md index 04789e9..33f54d0 100644 --- a/notebooks/illuminator/README.md +++ b/notebooks/instructlab-knowledge/utils/illuminator/README.md @@ -35,9 +35,20 @@ pip install -r requirements.txt ``` ## šŸš€ Usage -### Check a Single PDF or an Entire Folder of PDFs +### 
Supports PDF and JSON +Illuminator works with: +- Raw PDF files (will convert using Docling) +- Docling-generated JSON files (post-conversion documents) + +### Analyse a Single File ``` -python illuminator.py -f /path/to/pdf/document.pdf +python illuminator.py -f /path/to/document.pdf +python illuminator.py -f /path/to/document.json +``` + +### Analyze All Files in a Folder +``` +python illuminator.py -f /path/to/folder/ ``` ### Save Results to a JSON File @@ -89,4 +100,3 @@ python illuminator.py -f /path/to/pdf/folder/ -o results.json ## šŸ¤ Acknowledgments Built by Alina with ā¤ļø for better PDF conversion workflows! - diff --git a/notebooks/illuminator/analysis.py b/notebooks/instructlab-knowledge/utils/illuminator/analysis.py similarity index 76% rename from notebooks/illuminator/analysis.py rename to notebooks/instructlab-knowledge/utils/illuminator/analysis.py index be8dc2e..a98d605 100644 --- a/notebooks/illuminator/analysis.py +++ b/notebooks/instructlab-knowledge/utils/illuminator/analysis.py @@ -1,6 +1,7 @@ from docling.document_converter import DocumentConverter -from typing import List, Tuple, Dict, Any, Union -from log_utils import logger +from docling.datamodel.document import DoclingDocument +from typing import List, Tuple, Dict, Any, Union, Set +from .log_utils import logger import os def cell_is_merged(cell) -> bool: @@ -40,37 +41,36 @@ def summarize_tables(doc) -> Tuple[int, List[int]]: return num_tables, pages - -def analyze_pdf_with_docling(file_path) -> Dict[str, Union[int, List[Any], set]]: +def convert_to_docling_document(file_path: str) -> DoclingDocument: """ - Analyzes a PDF for merged table cells using the Docling converter, - and saves a Markdown version of the converted document. + Converts a PDF using Docling and saves a Markdown version of the document. Args: file_path: Path to the input PDF file. Returns: - A dictionary containing: - - Total number of tables. - - Set of pages with merged cells. 
- - List of merged cell details (page, position, spans, text). - - Total unique pages with tables. + The converted Docling Document object. """ converter = DocumentConverter() result = converter.convert(file_path) doc = result.document - # āœ… Save Markdown output + # Save Markdown output markdown_text = doc.export_to_markdown() base_name = os.path.splitext(os.path.basename(file_path))[0] md_output_path = f"{base_name}.md" - with open(md_output_path, "w") as f: + with open(md_output_path, "w", encoding="utf-8") as f: f.write(markdown_text) logger.info(f"šŸ“ Markdown saved to {md_output_path}") + return doc + +def analyze_docling_tables(doc_input: DoclingDocument) -> Dict[str, Union[int, List[dict], List[int], str]]: + """ + Analyzes a Docling document (object or path to PDF/JSON file) for merged table cells. + """ - # ā¬‡ļø Proceed with table analysis - table_count, table_pages_list = summarize_tables(doc) + table_count, table_pages_list = summarize_tables(doc_input) total_pages = len(set(table_pages_list)) or "Unknown" issues = { @@ -80,7 +80,7 @@ def analyze_pdf_with_docling(file_path) -> Dict[str, Union[int, List[Any], set]] "page_count": total_pages } - for i, table_item in enumerate(doc.tables): + for i, table_item in enumerate(doc_input.tables): try: page_number = table_pages_list[i] except IndexError: @@ -103,4 +103,4 @@ def analyze_pdf_with_docling(file_path) -> Dict[str, Union[int, List[Any], set]] issues["merged_cell_pages"].add(page_number) issues["merged_cell_pages"] = sorted(issues["merged_cell_pages"]) - return issues + return issues \ No newline at end of file diff --git a/notebooks/illuminator/illuminator.py b/notebooks/instructlab-knowledge/utils/illuminator/illuminator.py similarity index 69% rename from notebooks/illuminator/illuminator.py rename to notebooks/instructlab-knowledge/utils/illuminator/illuminator.py index e5f63f1..03e368c 100644 --- a/notebooks/illuminator/illuminator.py +++ 
b/notebooks/instructlab-knowledge/utils/illuminator/illuminator.py @@ -1,8 +1,8 @@ # main.py import argparse -from utils import get_pdf_files, save_results, generate_summary -from analysis import analyze_pdf_with_docling -from log_utils import logger +from utils import get_supported_files, save_results, generate_summary +from analysis import convert_to_docling_document, analyze_docling_tables +from log_utils import logger def parse_args() -> argparse.Namespace: """ @@ -27,25 +27,26 @@ ) return parser.parse_args() - def main() -> None: """ Main execution flow: - Parses arguments - - Loads and analyzes PDFs + - Loads and analyzes PDFs or JSON files - Generates and saves results """ args = parse_args() - pdfs = get_pdf_files(args.file) - if not pdfs: - logger.error("āŒ No PDFs found to process.") + files = get_supported_files(args.file) + if not files: + logger.error("āŒ No supported input files found to process.") return all_results = {} - for path in pdfs: + for path in files: logger.info(f"\nšŸ” Converting and analyzing: {path}\n") try: - result = analyze_pdf_with_docling(path) + # Use Docling to convert (PDF path) + doc = convert_to_docling_document(path) + result = analyze_docling_tables(doc) all_results[path] = result except Exception as e: logger.error(f"āŒ Failed to process {path}: {e}") @@ -54,4 +55,4 @@ save_results(all_results, args.output) if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/notebooks/illuminator/log_utils.py b/notebooks/instructlab-knowledge/utils/illuminator/log_utils.py similarity index 100% rename from notebooks/illuminator/log_utils.py rename to notebooks/instructlab-knowledge/utils/illuminator/log_utils.py diff --git a/notebooks/illuminator/requirements.txt b/notebooks/instructlab-knowledge/utils/illuminator/requirements.txt similarity index 100% rename from notebooks/illuminator/requirements.txt rename to 
notebooks/instructlab-knowledge/utils/illuminator/requirements.txt diff --git a/notebooks/illuminator/utils.py b/notebooks/instructlab-knowledge/utils/illuminator/utils.py similarity index 62% rename from notebooks/illuminator/utils.py rename to notebooks/instructlab-knowledge/utils/illuminator/utils.py index d5361e3..ab617df 100644 --- a/notebooks/illuminator/utils.py +++ b/notebooks/instructlab-knowledge/utils/illuminator/utils.py @@ -3,27 +3,29 @@ import os from datetime import datetime, timezone from typing import List, Optional, Dict, Any -from log_utils import logger +from .log_utils import logger MAX_PREVIEW_LENGTH = 30 # Max characters shown from cell text in summary +SUPPORTED_FILE_EXTENSIONS = [".pdf", ".json"] -def get_pdf_files(path: str) -> List[str]: +def get_supported_files(path: str, extensions: List[str] = [".pdf", ".json"]) -> List[str]: """ - Returns a list of PDF file paths from a given file or directory. + Returns a list of file paths from the given path that match supported extensions. Args: - path: Path to a single .pdf file or directory containing .pdf files. + path: Path to a single file or directory. + extensions: List of file extensions to include. Returns: - List of .pdf file paths. + List of matching file paths. 
""" - if os.path.isfile(path) and path.endswith(".pdf"): + if os.path.isfile(path) and any(path.endswith(ext) for ext in extensions): return [path] elif os.path.isdir(path): return [ os.path.join(path, f) for f in os.listdir(path) - if f.endswith(".pdf") + if any(f.endswith(ext) for ext in extensions) ] return [] @@ -50,41 +62,48 @@ def save_results(results, output_file: str) -> None: json.dump(results, f, indent=4) logger.info(f"šŸ“ Results saved to {output_file}") -def generate_summary(results) -> None: +def generate_summary(results, file=None) -> None: """ - Prints a human-readable summary of merged table cell issues per file. + Prints a human-readable summary of merged table cell issues per file, + optionally also writing to a file. Args: - results: Dictionary containing analysis results for each PDF. + results: Dictionary containing analysis results for each document. + file: Optional file-like object (e.g., open file or StringIO) to also write output. """ - logger.info("šŸ“Š Summary Report") - logger.info("=" * 50) - for file, data in results.items(): - logger.info(f"\nšŸ“‚ File: {file}") + def out(msg): + print(msg) # Always print to notebook + if file: + print(msg, file=file) # Also write to file if provided + + out("šŸ“Š Summary Report") + out("=" * 50) + for path, data in results.items(): + out(f"\nšŸ“‚ File: {path}") total_tables = data.get("table_count", 0) merged_cells = data.get("merged_table_cells", []) tables_with_merged_cells = len(set(cell["page"] for cell in merged_cells)) if total_tables == 0: - logger.info("ā„¹ļø No tables detected in this document.") + out("ā„¹ļø No tables detected in this document.") continue - logger.info(f"šŸ“‹ Found {total_tables} table(s); {tables_with_merged_cells} table(s) have merged cells.") + out(f"šŸ“‹ Found {total_tables} table(s); {tables_with_merged_cells} table(s) have merged cells.") if not data.get("merged_cell_pages"): - logger.info("āœ… Tables detected, but no merged cells found.") + out("āœ… Tables 
detected, but no merged cells found.") continue pages = format_pages(data["merged_cell_pages"]) - logger.info(f"āš ļø Merged Table Cells Detected on Pages: {pages}") + out(f"āš ļø Merged Table Cells Detected on Pages: {pages}") for cell in merged_cells: page = cell.get("page") text = cell.get("text", "").strip() if len(text) > MAX_PREVIEW_LENGTH: text = text[:MAX_PREVIEW_LENGTH] + "..." - logger.info(f" - Page {page}: \"{text}\" (row={cell['row']}, column={cell['column']})") + out(f" - Page {page}: \"{text}\" (row={cell['row']}, column={cell['column']})") def format_pages(pages) -> str: """ @@ -114,4 +133,4 @@ def format_pages(pages) -> str: ranges.append(f"{start}") else: ranges.append(f"{start}-{pages[-1]}") - return ", ".join(ranges) + return ", ".join(ranges) \ No newline at end of file