
Commit 5c47d59

Update example.py
1 parent abff2db commit 5c47d59

File tree: 1 file changed (+84 -78 lines)

scripts/html_chunking/example.py

Lines changed: 84 additions & 78 deletions
@@ -7,118 +7,123 @@
 
 import os
 import sys
-import json
-from chunker import chunk_html
-from tokenizer import count_html_tokens, set_custom_tokenizer
+import argparse
+from pathlib import Path
+from chunker import chunk_html, Chunk
+from tokenizer import count_html_tokens
 
 def main():
     """Run the HTML chunking example."""
+    parser = argparse.ArgumentParser(
+        description="HTML Chunking Example",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+    parser.add_argument(
+        "html_file",
+        nargs="?",
+        default="example.html",
+        help="Path to the input HTML file."
+    )
+    parser.add_argument(
+        "--max-token-limit",
+        type=int,
+        default=500,
+        help="Max tokens per chunk"
+    )
+    parser.add_argument(
+        "-o", "--output",
+        default="chunked_output.html",
+        help="Output HTML file name"
+    )
+    args = parser.parse_args()
+
     print("HTML Chunking Example")
     print("====================\n")
 
-    # Load the HTML sample document
-    sample_path = "example.html"
+    sample_path = args.html_file
     if not os.path.exists(sample_path):
-        print(f"Error: Sample document {sample_path} not found.")
-        print("Make sure example.html is in the current directory.")
+        print(f"Error: Sample document {sample_path} not found.", file=sys.stderr)
         return 1
 
     try:
         with open(sample_path, "r", encoding="utf-8") as f:
             sample_html = f.read()
     except Exception as e:
-        print(f"Error reading HTML file: {e}")
+        print(f"Error reading HTML file: {e}", file=sys.stderr)
        return 1
 
-    # Count tokens in the original document
     try:
         print("Counting tokens in the document...")
         original_tokens = count_html_tokens(sample_html)
         print(f"Original document has {original_tokens} tokens\n")
     except Exception as e:
-        print(f"Error counting tokens: {e}")
-        print("Will proceed with chunking anyway")
+        print(f"Warning: Could not count tokens accurately: {e}", file=sys.stderr)
+        original_tokens = "N/A"
+
+    print(f"Chunking with max {args.max_token_limit} tokens per chunk...")
+    # These options are now handled internally by the improved chunker
+    print("Chunking options:")
+    print(" - Count tag tokens: True")
+    print(" - Keep siblings together: True")
+    print(" - Prepend parent section text: True\n")
 
-    # Chunk with default settings
     try:
-        # First, set maximum token limit
-        max_token_limit = 500
-        print(f"Chunking with max {max_token_limit} tokens per chunk...")
-
-        # Perform chunking
+        # Define a source URL for the example file. In a real pipeline,
+        # this would come from url_mapping.json.
+        source_url_for_example = f"file://{os.path.abspath(sample_path)}"
+
+        # Perform chunking using the updated function signature
         chunks = chunk_html(
-            sample_html,
-            max_token_limit=max_token_limit,
-            count_tag_tokens=True,
-            keep_siblings_together=True,
-            prepend_parent_section_text=True
+            html_content=sample_html,
+            source_url=source_url_for_example,
+            max_token_limit=args.max_token_limit,
+            count_tag_tokens=True
         )
 
-        # Print chunk information
         print(f"Created {len(chunks)} chunks:")
 
-        # Check if chunks are reasonable
-        unreasonable_chunks = 0
-        too_small_chunks = 0
-        for i, chunk in enumerate(chunks, 1):
-            token_count = count_html_tokens(chunk)
-
-            # Check if chunk is too small (less than 20% of the limit)
-            if token_count < max_token_limit * 0.2:
-                too_small_chunks += 1
-
-            # Check if chunk is way too large
-            if token_count > max_token_limit * 1.2:
-                unreasonable_chunks += 1
-
-            # Print info for first 5 chunks
-            if i <= 5:
-                print(f" Chunk {i}: {token_count} tokens")
-                # Show a brief preview of the chunk's content
-                soup_chunk = chunk.replace("\n", " ").strip()
-                preview = soup_chunk[:100] + "..." if len(soup_chunk) > 100 else soup_chunk
-                print(f" Preview: {preview}\n")
+        chunk_tokens = [count_html_tokens(chunk.text) for chunk in chunks]
 
-        if i > 5:
+        # Print info for the first 5 chunks
+        for i, chunk in enumerate(chunks[:5], 1):
+            print(f" Chunk {i}: {chunk_tokens[i-1]} tokens")
+            print(f" Metadata Source: {chunk.metadata.get('source', 'N/A')}\n")
+
+        if len(chunks) > 5:
             print(f" ... and {len(chunks) - 5} more chunks\n")
-
-        if too_small_chunks > 0:
-            print(f"Warning: {too_small_chunks} chunks are less than 20% of the token limit.")
-
-        if unreasonable_chunks > 0:
-            print(f"Warning: {unreasonable_chunks} chunks exceed the token limit by more than 20%.")
-
+
         # Save all chunks to a single file with separators
-        print("\nSaving all chunks to a single file...")
-        with open("chunked_output.html", "w", encoding="utf-8") as f:
+        output_filename = args.output
+        print(f"\nSaving all chunks to a single file: {output_filename}...")
+        with open(output_filename, "w", encoding="utf-8") as f:
            f.write("<!DOCTYPE html>\n<html>\n<head>\n")
            f.write("<title>Chunked HTML Document</title>\n")
            f.write("<style>\n")
+            f.write("body { font-family: Arial, sans-serif; max-width: 1200px; margin: 20px auto; padding: 0 20px; }\n")
            f.write(".chunk-separator { margin: 20px 0; border-top: 5px solid #3c82f6; padding-top: 10px; }\n")
            f.write(".chunk-header { background-color: #f0f0f0; padding: 10px; font-weight: bold; margin-bottom: 10px; font-size: 16px; }\n")
+            f.write(".chunk-meta { background-color: #eaf2ff; padding: 5px 10px; font-family: monospace; font-size: 12px; margin-bottom: 10px; word-wrap: break-word; }\n")
            f.write(".chunk-content { border: 1px solid #ddd; padding: 15px; }\n")
-            f.write("body { font-family: Arial, sans-serif; max-width: 1200px; margin: 20px auto; padding: 0 20px; }\n")
-            f.write("</style>\n")
-            f.write("</head>\n<body>\n")
+            f.write("</style>\n</head>\n<body>\n")
 
            f.write("<h1>Chunked HTML Document</h1>\n")
            f.write(f"<p><strong>Original document:</strong> {original_tokens} tokens</p>\n")
-            f.write(f"<p><strong>Split into:</strong> {len(chunks)} chunks with max {max_token_limit} tokens per chunk</p>\n")
-            f.write("<p><strong>Chunking settings:</strong></p>\n")
+            f.write(f"<p><strong>Split into:</strong> {len(chunks)} chunks with max {args.max_token_limit} tokens per chunk</p>\n")
+
+            # --- START: Re-added summary section ---
+            f.write("<p><strong>Chunking settings (now internal):</strong></p>\n")
            f.write("<ul>\n")
-            f.write(" <li>count_tag_tokens: Yes</li>\n")
-            f.write(" <li>keep_siblings_together: Yes</li>\n")
-            f.write(" <li>prepend_parent_section_text: Yes</li>\n")
+            f.write(" <li>count_tag_tokens: True</li>\n")
+            f.write(" <li>keep_siblings_together: True</li>\n")
+            f.write(" <li>prepend_parent_section_text: True</li>\n")
            f.write("</ul>\n")
 
-            # Add a statistics table
            f.write("<h2>Chunk Statistics</h2>\n")
            f.write("<table border='1' cellpadding='5' style='border-collapse: collapse; width: 100%;'>\n")
            f.write("<tr><th>Statistic</th><th>Value</th></tr>\n")
            f.write(f"<tr><td>Number of chunks</td><td>{len(chunks)}</td></tr>\n")
 
-            # Calculate more statistics
-            chunk_tokens = [count_html_tokens(chunk) for chunk in chunks]
+            # Calculate statistics
            avg_tokens = sum(chunk_tokens) / len(chunk_tokens) if chunk_tokens else 0
            min_tokens = min(chunk_tokens) if chunk_tokens else 0
            max_tokens = max(chunk_tokens) if chunk_tokens else 0
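
Aside on the source URL: the new code above fabricates a file:// URL for the sample document, and the in-diff comment notes that a real pipeline would take the URL from url_mapping.json. That file's schema is not part of this commit, so the helper below is a hypothetical sketch only: the flat path-to-URL mapping shape and the lookup_source_url name are assumptions, with the same file:// fallback the example uses.

# Hypothetical helper; url_mapping.json's schema is NOT shown in this commit.
# Assumed shape: a flat JSON object mapping local paths to canonical URLs,
# e.g. {"scripts/html_chunking/example.html": "https://docs.example.com/page"}.
import json
import os

def lookup_source_url(path: str, mapping_file: str = "url_mapping.json") -> str:
    """Return the canonical URL for a local HTML file, falling back to file://."""
    with open(mapping_file, encoding="utf-8") as f:
        mapping = json.load(f)
    # Same fallback example.py uses for its sample document.
    return mapping.get(path, f"file://{os.path.abspath(path)}")

The diff continues with the second hunk below.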
@@ -127,32 +132,33 @@ def main():
            f.write(f"<tr><td>Minimum tokens</td><td>{min_tokens}</td></tr>\n")
            f.write(f"<tr><td>Maximum tokens</td><td>{max_tokens}</td></tr>\n")
            f.write(f"<tr><td>Chunks below 100 tokens</td><td>{sum(1 for t in chunk_tokens if t < 100)}</td></tr>\n")
-            f.write(f"<tr><td>Chunks above token limit</td><td>{sum(1 for t in chunk_tokens if t > max_token_limit)}</td></tr>\n")
-            f.write("</table>\n<br><hr><br>\n")
+            f.write(f"<tr><td>Chunks above token limit</td><td>{sum(1 for t in chunk_tokens if t > args.max_token_limit)}</td></tr>\n")
+            f.write("</table>\n")
+            # --- END: Re-added summary section ---
+
+            f.write("<br><hr><br>\n")
 
            for i, chunk in enumerate(chunks, 1):
-                token_count = count_html_tokens(chunk)
+                token_count = chunk_tokens[i-1]
                if i > 1:
                    f.write('<div class="chunk-separator"></div>\n')
 
-                # Add token count color based on size
-                color_class = ""
-                if token_count < max_token_limit * 0.2:
-                    color_class = " style='background-color: #FFF0F0;'" # Light red for too small
-                elif token_count > max_token_limit * 1.1:
-                    color_class = " style='background-color: #FFE0E0;'" # Red for too large
+                color_style = ""
+                if token_count > args.max_token_limit:
+                    color_style = " style='background-color: #FFE0E0;'" # Red for oversized
 
-                f.write(f'<div class="chunk-header"{color_class}>Chunk {i} ({token_count} tokens)</div>\n')
+                f.write(f'<div class="chunk-header"{color_style}>Chunk {i} ({token_count} tokens)</div>\n')
+                f.write(f'<div class="chunk-meta"><strong>Source:</strong> {chunk.metadata.get("source", "N/A")}</div>\n')
                f.write('<div class="chunk-content">\n')
-                f.write(chunk)
+                f.write(chunk.text)
                f.write('\n</div>\n')
 
            f.write("</body>\n</html>")
 
-        print(f"All chunks saved to chunked_output.html")
+        print(f"All chunks saved to {output_filename}")
 
    except Exception as e:
-        print(f"Error during chunking: {e}")
+        print(f"Error during chunking: {e}", file=sys.stderr)
        import traceback
        traceback.print_exc()
        return 1
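
For readers without chunker.py at hand, the call sites above imply the updated interface: chunk_html now takes html_content, source_url, max_token_limit, and count_tag_tokens, and returns objects exposing a .text string and a .metadata dict (hence the new Chunk import). A minimal sketch of that implied interface follows; it is inferred from this diff, not copied from chunker.py, and the defaults and docstring are assumptions.

# Inferred from example.py's call sites; not the actual chunker.py definitions.
from dataclasses import dataclass, field

@dataclass
class Chunk:
    text: str                                     # the HTML fragment for this chunk
    metadata: dict = field(default_factory=dict)  # e.g. {"source": "file:///abs/path/example.html"}

def chunk_html(html_content: str, source_url: str,
               max_token_limit: int = 500,
               count_tag_tokens: bool = True) -> list[Chunk]:
    """Split html_content into Chunks, stamping each with source_url metadata."""
    ...  # see chunker.py for the real implementation

Usage note: with the argparse defaults, running python example.py with no arguments reproduces the old hard-coded behavior (example.html in, chunked_output.html out, 500-token limit), so the change stays backward compatible. A non-default run would look like python example.py my_page.html --max-token-limit 800 -o my_chunks.html, where my_page.html and my_chunks.html are illustrative names.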

0 commit comments