Skip to content

Commit 420d143

Browse files
committed
Move illuminator to utils and support json as input
Signed-off-by: Alina Ryan <[email protected]>
1 parent ff26fa4 commit 420d143

File tree

7 files changed

+60
-36
lines changed

7 files changed

+60
-36
lines changed

notebooks/illuminator/.gitignore renamed to notebooks/instructlab-knowledge/utils/illuminator/.gitignore

File renamed without changes.

notebooks/illuminator/README.md renamed to notebooks/instructlab-knowledge/utils/illuminator/README.md

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,20 @@ pip install -r requirements.txt
3535
```
3636

3737
## 🚀 Usage
38-
### Check a Single PDF or an Entire Folder of PDFs
38+
### Supports PDF and JSON
39+
Illuminator works with:
40+
- Raw PDF files (converted with Docling)
41+
- Docling-generated JSON files (post-conversion documents)
42+
43+
### Analyze a Single File
3944
```
40-
python illuminator.py -f /path/to/pdf/document.pdf
45+
python illuminator.py -f /path/to/document.pdf
46+
python illuminator.py -f /path/to/document.json
47+
```
48+
49+
### Analyze All Files in a Folder
50+
```
51+
python illuminator.py -f /path/to/folder/
4152
```
4253

4354
### Save Results to a JSON File
@@ -89,4 +100,3 @@ python illuminator.py -f /path/to/pdf/folder/ -o results.json
89100

90101
## 🤝 Acknowledgments
91102
Built by Alina with ❤️ for better PDF conversion workflows!
92-

notebooks/illuminator/analysis.py renamed to notebooks/instructlab-knowledge/utils/illuminator/analysis.py

Lines changed: 17 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from docling.document_converter import DocumentConverter
2-
from typing import List, Tuple, Dict, Any, Union
2+
from docling.datamodel.document import DoclingDocument
3+
from typing import List, Tuple, Dict, Any, Union, Set
34
from log_utils import logger
45
import os
56

@@ -40,37 +41,37 @@ def summarize_tables(doc) -> Tuple[int, List[int]]:
4041

4142
return num_tables, pages
4243

43-
44-
def analyze_pdf_with_docling(file_path) -> Dict[str, Union[int, List[Any], set]]:
44+
def convert_pdf_with_docling(file_path: str) -> DoclingDocument:
4545
"""
46-
Analyzes a PDF for merged table cells using the Docling converter,
47-
and saves a Markdown version of the converted document.
46+
Converts a PDF using Docling and saves a Markdown version of the document.
4847
4948
Args:
5049
file_path: Path to the input PDF file.
5150
5251
Returns:
53-
A dictionary containing:
54-
- Total number of tables.
55-
- Set of pages with merged cells.
56-
- List of merged cell details (page, position, spans, text).
57-
- Total unique pages with tables.
52+
The converted Docling Document object.
5853
"""
5954
converter = DocumentConverter()
6055
result = converter.convert(file_path)
6156
doc = result.document
6257

63-
# Save Markdown output
58+
# Save Markdown output
6459
markdown_text = doc.export_to_markdown()
6560
base_name = os.path.splitext(os.path.basename(file_path))[0]
6661
md_output_path = f"{base_name}.md"
67-
with open(md_output_path, "w") as f:
62+
with open(md_output_path, "w", encoding="utf-8") as f:
6863
f.write(markdown_text)
6964

7065
logger.info(f"📝 Markdown saved to {md_output_path}")
66+
return doc
67+
68+
def analyze_docling_tables(doc_input: DoclingDocument) -> Dict[str, Union[int, List[dict], List[int], str]]:
69+
"""
70+
Analyzes an already-converted Docling document object for merged table cells.
71+
"""
72+
doc_input
7173

72-
# ⬇️ Proceed with table analysis
73-
table_count, table_pages_list = summarize_tables(doc)
74+
table_count, table_pages_list = summarize_tables(doc_input)
7475
total_pages = len(set(table_pages_list)) or "Unknown"
7576

7677
issues = {
@@ -80,7 +81,7 @@ def analyze_pdf_with_docling(file_path) -> Dict[str, Union[int, List[Any], set]]
8081
"page_count": total_pages
8182
}
8283

83-
for i, table_item in enumerate(doc.tables):
84+
for i, table_item in enumerate(doc_input.tables):
8485
try:
8586
page_number = table_pages_list[i]
8687
except IndexError:
@@ -103,4 +104,4 @@ def analyze_pdf_with_docling(file_path) -> Dict[str, Union[int, List[Any], set]]
103104
issues["merged_cell_pages"].add(page_number)
104105

105106
issues["merged_cell_pages"] = sorted(issues["merged_cell_pages"])
106-
return issues
107+
return issues

notebooks/illuminator/illuminator.py renamed to notebooks/instructlab-knowledge/utils/illuminator/illuminator.py

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# main.py
22
import argparse
3-
from utils import get_pdf_files, save_results, generate_summary
4-
from analysis import analyze_pdf_with_docling
3+
from utils import get_supported_files, save_results, generate_summary
4+
from analysis import convert_pdf_with_docling, analyze_docling_tables
55
from log_utils import logger
66

77
def parse_args() -> argparse.Namespace:
@@ -27,25 +27,26 @@ def parse_args() -> argparse.Namespace:
2727
)
2828
return parser.parse_args()
2929

30-
3130
def main() -> None:
3231
"""
3332
Main execution flow:
3433
- Parses arguments
35-
- Loads and analyzes PDFs
34+
- Loads and analyzes PDFs or JSONs
3635
- Generates and saves results
3736
"""
3837
args = parse_args()
39-
pdfs = get_pdf_files(args.file)
40-
if not pdfs:
41-
logger.error("❌ No PDFs found to process.")
38+
files = get_supported_files(args.file)
39+
if not files:
40+
logger.error("❌ No input files found to process.")
4241
return
4342

4443
all_results = {}
45-
for path in pdfs:
44+
for path in files:
4645
logger.info(f"\n🔍 Converting and analyzing: {path}\n")
4746
try:
48-
result = analyze_pdf_with_docling(path)
47+
# Use Docling to convert (PDF path)
48+
doc = convert_pdf_with_docling(path)
49+
result = analyze_docling_tables(doc)
4950
all_results[path] = result
5051
except Exception as e:
5152
logger.error(f"❌ Failed to process {path}: {e}")
@@ -54,4 +55,4 @@ def main() -> None:
5455
save_results(all_results, args.output)
5556

5657
if __name__ == "__main__":
57-
main()
58+
main()

notebooks/illuminator/log_utils.py renamed to notebooks/instructlab-knowledge/utils/illuminator/log_utils.py

File renamed without changes.

notebooks/illuminator/requirements.txt renamed to notebooks/instructlab-knowledge/utils/illuminator/requirements.txt

File renamed without changes.

notebooks/illuminator/utils.py renamed to notebooks/instructlab-knowledge/utils/illuminator/utils.py

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,24 +6,36 @@
66
from log_utils import logger
77

88
MAX_PREVIEW_LENGTH = 30 # Max characters shown from cell text in summary
9+
SUPPORTED_FILE_EXTENSIONS = [".pdf", ".json"]
910

10-
def get_pdf_files(path: str) -> List[str]:
11+
def get_supported_files(path: str) -> List[str]:
1112
"""
12-
Returns a list of PDF file paths from a given file or directory.
13+
Returns a list of file paths from the given path that match supported extensions.
14+
"""
15+
return [
16+
os.path.join(path, f)
17+
for f in os.listdir(path)
18+
if os.path.isfile(os.path.join(path, f)) and any(f.endswith(ext) for ext in SUPPORTED_FILE_EXTENSIONS)
19+
]
20+
21+
def get_supported_files(path: str, extensions: List[str] = [".pdf", ".json"]) -> List[str]:
22+
"""
23+
Returns the files at `path` whose names end with one of `extensions` (an empty list if none match).
1324
1425
Args:
15-
path: Path to a single .pdf file or directory containing .pdf files.
26+
path: Path to a single file or directory.
27+
extensions: List of file extensions to include.
1628
1729
Returns:
18-
List of .pdf file paths.
30+
List of matching file paths.
1931
"""
20-
if os.path.isfile(path) and path.endswith(".pdf"):
32+
if os.path.isfile(path) and any(path.endswith(ext) for ext in extensions):
2133
return [path]
2234
elif os.path.isdir(path):
2335
return [
2436
os.path.join(path, f)
2537
for f in os.listdir(path)
26-
if f.endswith(".pdf")
38+
if any(f.endswith(ext) for ext in extensions)
2739
]
2840
return []
2941

@@ -114,4 +126,4 @@ def format_pages(pages) -> str:
114126
ranges.append(f"{start}")
115127
else:
116128
ranges.append(f"{start}-{pages[-1]}")
117-
return ", ".join(ranges)
129+
return ", ".join(ranges)

0 commit comments

Comments
 (0)