Skip to content

Commit c22b8ea

Browse files
committed
Multiple improvements to example.py
1 parent 5c47d59 commit c22b8ea

File tree

1 file changed

+102
-129
lines changed

1 file changed

+102
-129
lines changed

scripts/html_chunking/example.py

Lines changed: 102 additions & 129 deletions
Original file line numberDiff line numberDiff line change
@@ -2,168 +2,141 @@
22
"""
33
Example script demonstrating the HTML chunking library.
44
5-
This script shows how to use the HTML chunker with a sample HTML document.
5+
This script reads an HTML file, splits it into chunks based on a token limit,
6+
and generates a report for visual inspection.
67
"""
78

9+
import argparse
810
import os
911
import sys
10-
import argparse
11-
from pathlib import Path
12-
from chunker import chunk_html, Chunk
13-
from tokenizer import count_html_tokens
12+
from typing import List
1413

15-
def main():
16-
"""Run the HTML chunking example."""
14+
# Imports are deferred into main() to support running the script
15+
# from within its directory, which requires a sys.path modification first.
16+
17+
def create_argument_parser() -> argparse.ArgumentParser:
18+
"""Creates and configures the argument parser."""
1719
parser = argparse.ArgumentParser(
1820
description="HTML Chunking Example",
1921
formatter_class=argparse.ArgumentDefaultsHelpFormatter
2022
)
2123
parser.add_argument(
2224
"html_file",
2325
nargs="?",
24-
default="example.html",
26+
default=os.path.join(os.path.dirname(__file__), "example.html"),
2527
help="Path to the input HTML file."
2628
)
2729
parser.add_argument(
2830
"--max-token-limit",
2931
type=int,
3032
default=500,
31-
help="Max tokens per chunk"
33+
help="Max tokens per chunk."
3234
)
3335
parser.add_argument(
3436
"-o", "--output",
3537
default="chunked_output.html",
36-
help="Output HTML file name"
38+
help="Output HTML file name for the report."
3739
)
38-
args = parser.parse_args()
40+
return parser
41+
42+
def generate_html_report(output_path: str, chunks: List['Chunk'], original_tokens: int, max_token_limit: int, count_html_tokens_func) -> None:
43+
"""Generates a single HTML file containing all chunks for review."""
44+
print(f"\nSaving all chunks to a single file: {output_path}...")
3945

40-
print("HTML Chunking Example")
41-
print("====================\n")
46+
chunk_tokens = [count_html_tokens_func(chunk.text) for chunk in chunks]
47+
avg_tokens = sum(chunk_tokens) / len(chunk_tokens) if chunk_tokens else 0
48+
min_tokens = min(chunk_tokens) if chunk_tokens else 0
49+
max_tokens = max(chunk_tokens) if chunk_tokens else 0
4250

43-
sample_path = args.html_file
44-
if not os.path.exists(sample_path):
45-
print(f"Error: Sample document {sample_path} not found.", file=sys.stderr)
51+
with open(output_path, "w", encoding="utf-8") as f:
52+
f.write("<!DOCTYPE html>\n<html>\n<head>\n<title>Chunked HTML Document</title>\n")
53+
f.write("<style>body{font-family:Arial,sans-serif;max-width:1200px;margin:20px auto;padding:0 20px}.chunk-separator{margin:20px 0;border-top:5px solid #3c82f6;padding-top:10px}.chunk-header{background-color:#f0f0f0;padding:10px;font-weight:bold;margin-bottom:10px;font-size:16px}.chunk-meta{background-color:#eaf2ff;padding:5px 10px;font-family:monospace;font-size:12px;word-wrap:break-word}.chunk-content{border:1px solid #ddd;padding:15px}</style>\n")
54+
f.write("</head>\n<body>\n<h1>Chunked HTML Document</h1>\n")
55+
56+
f.write(f"<p><strong>Original document:</strong> {original_tokens} tokens</p>\n")
57+
f.write(f"<p><strong>Split into:</strong> {len(chunks)} chunks with max {max_token_limit} tokens per chunk</p>\n")
58+
59+
f.write("<h2>Chunk Statistics</h2>\n<table border='1' cellpadding='5' style='border-collapse:collapse;width:100%;'>\n")
60+
f.write("<tr><th>Statistic</th><th>Value</th></tr>\n")
61+
f.write(f"<tr><td>Number of chunks</td><td>{len(chunks)}</td></tr>\n")
62+
f.write(f"<tr><td>Average tokens per chunk</td><td>{avg_tokens:.1f}</td></tr>\n")
63+
f.write(f"<tr><td>Minimum tokens</td><td>{min_tokens}</td></tr>\n")
64+
f.write(f"<tr><td>Maximum tokens</td><td>{max_tokens}</td></tr>\n")
65+
f.write(f"<tr><td>Chunks &lt; 100 tokens</td><td>{sum(1 for t in chunk_tokens if t < 100)}</td></tr>\n")
66+
f.write(f"<tr><td>Chunks &gt; token limit</td><td>{sum(1 for t in chunk_tokens if t > max_token_limit)}</td></tr>\n")
67+
f.write("</table>\n<br><hr><br>\n")
68+
69+
for i, chunk in enumerate(chunks, 1):
70+
token_count = chunk_tokens[i-1]
71+
if i > 1: f.write('<div class="chunk-separator"></div>\n')
72+
73+
style = " style='background-color:#FFE0E0;'" if token_count > max_token_limit else ""
74+
f.write(f'<div class="chunk-header"{style}>Chunk {i} ({token_count} tokens)</div>\n')
75+
f.write(f'<div class="chunk-meta"><strong>Source:</strong> {chunk.metadata.get("source", "N/A")}</div>\n')
76+
f.write('<div class="chunk-content">\n')
77+
f.write(chunk.text)
78+
f.write('\n</div>\n')
79+
80+
f.write("</body>\n</html>")
81+
print(f"Report saved to {output_path}")
82+
83+
def main():
84+
"""Main function to run the HTML chunking example."""
85+
from html_chunking.chunker import chunk_html, Chunk
86+
from html_chunking.tokenizer import count_html_tokens
87+
88+
parser = create_argument_parser()
89+
args = parser.parse_args()
90+
91+
print("HTML Chunking Example\n====================\n")
92+
93+
if not os.path.exists(args.html_file):
94+
print(f"Error: Sample document '{args.html_file}' not found.", file=sys.stderr)
4695
return 1
47-
96+
4897
try:
49-
with open(sample_path, "r", encoding="utf-8") as f:
98+
with open(args.html_file, "r", encoding="utf-8") as f:
5099
sample_html = f.read()
51-
except Exception as e:
52-
print(f"Error reading HTML file: {e}", file=sys.stderr)
100+
except IOError as e:
101+
print(f"Error reading HTML file '{args.html_file}': {e}", file=sys.stderr)
53102
return 1
103+
104+
original_tokens = count_html_tokens(sample_html)
105+
print(f"Original document has {original_tokens} tokens.\n")
106+
107+
print(f"Chunking with max {args.max_token_limit} tokens per chunk...\n")
108+
109+
source_url_for_example = f"file://{os.path.abspath(args.html_file)}"
110+
111+
chunks = chunk_html(
112+
html_content=sample_html,
113+
source_url=source_url_for_example,
114+
max_token_limit=args.max_token_limit
115+
)
116+
117+
print(f"Created {len(chunks)} chunks.")
54118

55-
try:
56-
print("Counting tokens in the document...")
57-
original_tokens = count_html_tokens(sample_html)
58-
print(f"Original document has {original_tokens} tokens\n")
59-
except Exception as e:
60-
print(f"Warning: Could not count tokens accurately: {e}", file=sys.stderr)
61-
original_tokens = "N/A"
119+
generate_html_report(args.output, chunks, original_tokens, args.max_token_limit, count_html_tokens)
62120

63-
print(f"Chunking with max {args.max_token_limit} tokens per chunk...")
64-
# These options are now handled internally by the improved chunker
65-
print("Chunking options:")
66-
print(" - Count tag tokens: True")
67-
print(" - Keep siblings together: True")
68-
print(" - Prepend parent section text: True\n")
121+
return 0
122+
123+
if __name__ == "__main__":
124+
# This block allows the script to be run directly from within the html_chunking
125+
# directory by ensuring the parent 'scripts' directory is in the Python path.
126+
if __package__ is None:
127+
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
128+
if project_root not in sys.path:
129+
sys.path.insert(0, project_root)
69130

70131
try:
71-
# Define a source URL for the example file. In a real pipeline,
72-
# this would come from url_mapping.json.
73-
source_url_for_example = f"file://{os.path.abspath(sample_path)}"
74-
75-
# Perform chunking using the updated function signature
76-
chunks = chunk_html(
77-
html_content=sample_html,
78-
source_url=source_url_for_example,
79-
max_token_limit=args.max_token_limit,
80-
count_tag_tokens=True
81-
)
82-
83-
print(f"Created {len(chunks)} chunks:")
84-
85-
chunk_tokens = [count_html_tokens(chunk.text) for chunk in chunks]
86-
87-
# Print info for the first 5 chunks
88-
for i, chunk in enumerate(chunks[:5], 1):
89-
print(f" Chunk {i}: {chunk_tokens[i-1]} tokens")
90-
print(f" Metadata Source: {chunk.metadata.get('source', 'N/A')}\n")
91-
92-
if len(chunks) > 5:
93-
print(f" ... and {len(chunks) - 5} more chunks\n")
94-
95-
# Save all chunks to a single file with separators
96-
output_filename = args.output
97-
print(f"\nSaving all chunks to a single file: {output_filename}...")
98-
with open(output_filename, "w", encoding="utf-8") as f:
99-
f.write("<!DOCTYPE html>\n<html>\n<head>\n")
100-
f.write("<title>Chunked HTML Document</title>\n")
101-
f.write("<style>\n")
102-
f.write("body { font-family: Arial, sans-serif; max-width: 1200px; margin: 20px auto; padding: 0 20px; }\n")
103-
f.write(".chunk-separator { margin: 20px 0; border-top: 5px solid #3c82f6; padding-top: 10px; }\n")
104-
f.write(".chunk-header { background-color: #f0f0f0; padding: 10px; font-weight: bold; margin-bottom: 10px; font-size: 16px; }\n")
105-
f.write(".chunk-meta { background-color: #eaf2ff; padding: 5px 10px; font-family: monospace; font-size: 12px; margin-bottom: 10px; word-wrap: break-word; }\n")
106-
f.write(".chunk-content { border: 1px solid #ddd; padding: 15px; }\n")
107-
f.write("</style>\n</head>\n<body>\n")
108-
109-
f.write("<h1>Chunked HTML Document</h1>\n")
110-
f.write(f"<p><strong>Original document:</strong> {original_tokens} tokens</p>\n")
111-
f.write(f"<p><strong>Split into:</strong> {len(chunks)} chunks with max {args.max_token_limit} tokens per chunk</p>\n")
112-
113-
# --- START: Re-added summary section ---
114-
f.write("<p><strong>Chunking settings (now internal):</strong></p>\n")
115-
f.write("<ul>\n")
116-
f.write(" <li>count_tag_tokens: True</li>\n")
117-
f.write(" <li>keep_siblings_together: True</li>\n")
118-
f.write(" <li>prepend_parent_section_text: True</li>\n")
119-
f.write("</ul>\n")
120-
121-
f.write("<h2>Chunk Statistics</h2>\n")
122-
f.write("<table border='1' cellpadding='5' style='border-collapse: collapse; width: 100%;'>\n")
123-
f.write("<tr><th>Statistic</th><th>Value</th></tr>\n")
124-
f.write(f"<tr><td>Number of chunks</td><td>{len(chunks)}</td></tr>\n")
125-
126-
# Calculate statistics
127-
avg_tokens = sum(chunk_tokens) / len(chunk_tokens) if chunk_tokens else 0
128-
min_tokens = min(chunk_tokens) if chunk_tokens else 0
129-
max_tokens = max(chunk_tokens) if chunk_tokens else 0
130-
131-
f.write(f"<tr><td>Average tokens per chunk</td><td>{avg_tokens:.1f}</td></tr>\n")
132-
f.write(f"<tr><td>Minimum tokens</td><td>{min_tokens}</td></tr>\n")
133-
f.write(f"<tr><td>Maximum tokens</td><td>{max_tokens}</td></tr>\n")
134-
f.write(f"<tr><td>Chunks below 100 tokens</td><td>{sum(1 for t in chunk_tokens if t < 100)}</td></tr>\n")
135-
f.write(f"<tr><td>Chunks above token limit</td><td>{sum(1 for t in chunk_tokens if t > args.max_token_limit)}</td></tr>\n")
136-
f.write("</table>\n")
137-
# --- END: Re-added summary section ---
138-
139-
f.write("<br><hr><br>\n")
140-
141-
for i, chunk in enumerate(chunks, 1):
142-
token_count = chunk_tokens[i-1]
143-
if i > 1:
144-
f.write('<div class="chunk-separator"></div>\n')
145-
146-
color_style = ""
147-
if token_count > args.max_token_limit:
148-
color_style = " style='background-color: #FFE0E0;'" # Red for oversized
149-
150-
f.write(f'<div class="chunk-header"{color_style}>Chunk {i} ({token_count} tokens)</div>\n')
151-
f.write(f'<div class="chunk-meta"><strong>Source:</strong> {chunk.metadata.get("source", "N/A")}</div>\n')
152-
f.write('<div class="chunk-content">\n')
153-
f.write(chunk.text)
154-
f.write('\n</div>\n')
155-
156-
f.write("</body>\n</html>")
157-
158-
print(f"All chunks saved to {output_filename}")
159-
132+
sys.exit(main())
133+
except ImportError as e:
134+
print(f"Error: Failed to import a required module.", file=sys.stderr)
135+
print(f"Detail: {e}", file=sys.stderr)
136+
print("\nSuggestion: Try running this script from the project's root directory using 'python -m html_chunking.example'", file=sys.stderr)
137+
sys.exit(1)
160138
except Exception as e:
161-
print(f"Error during chunking: {e}", file=sys.stderr)
139+
print(f"An unexpected error occurred: {e}", file=sys.stderr)
162140
import traceback
163141
traceback.print_exc()
164-
return 1
165-
166-
return 0
167-
168-
if __name__ == "__main__":
169-
sys.exit(main())
142+
sys.exit(1)

0 commit comments

Comments
 (0)