|
2 | 2 | """
|
3 | 3 | Example script demonstrating the HTML chunking library.
|
4 | 4 |
|
5 |
| -This script shows how to use the HTML chunker with a sample HTML document. |
| 5 | +This script reads an HTML file, splits it into chunks based on a token limit, |
| 6 | +and generates a report for visual inspection. |
6 | 7 | """
|
7 | 8 |
|
| 9 | +import argparse |
8 | 10 | import os
|
9 | 11 | import sys
|
10 |
| -import argparse |
11 |
| -from pathlib import Path |
12 |
| -from chunker import chunk_html, Chunk |
13 |
| -from tokenizer import count_html_tokens |
| 12 | +from typing import List |
14 | 13 |
|
15 |
| -def main(): |
16 |
| - """Run the HTML chunking example.""" |
| 14 | +# Imports are deferred into main() to support running the script |
| 15 | +# from within its directory, which requires a sys.path modification first. |
| 16 | + |
def create_argument_parser() -> argparse.ArgumentParser:
    """Build the command-line interface for the chunking example.

    Returns:
        An ``argparse.ArgumentParser`` preconfigured with the input-file
        positional argument, the token-limit option, and the report
        output-file option. Defaults are shown in ``--help`` via
        ``ArgumentDefaultsHelpFormatter``.
    """
    cli = argparse.ArgumentParser(
        description="HTML Chunking Example",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    # Default to the example.html that ships next to this script, so the
    # example works regardless of the current working directory.
    default_html = os.path.join(os.path.dirname(__file__), "example.html")
    cli.add_argument(
        "html_file",
        nargs="?",
        default=default_html,
        help="Path to the input HTML file.",
    )
    cli.add_argument(
        "--max-token-limit",
        type=int,
        default=500,
        help="Max tokens per chunk.",
    )
    cli.add_argument(
        "-o",
        "--output",
        default="chunked_output.html",
        help="Output HTML file name for the report.",
    )
    return cli
| 41 | + |
def generate_html_report(output_path: str, chunks: List['Chunk'], original_tokens: int, max_token_limit: int, count_html_tokens_func) -> None:
    """Write every chunk into a single HTML page for visual review.

    The page contains a summary-statistics table followed by each chunk's
    content; chunks exceeding ``max_token_limit`` get a highlighted header.

    Args:
        output_path: Destination file for the generated report.
        chunks: Sequence of chunk objects exposing ``.text`` and ``.metadata``.
        original_tokens: Token count of the unsplit source document.
        max_token_limit: Per-chunk token budget used during chunking.
        count_html_tokens_func: Callable mapping an HTML string to its token count.
    """
    print(f"\nSaving all chunks to a single file: {output_path}...")

    sizes = [count_html_tokens_func(c.text) for c in chunks]
    avg_tokens = sum(sizes) / len(sizes) if sizes else 0
    min_tokens = min(sizes, default=0)
    max_tokens = max(sizes, default=0)

    # Assemble the whole document in memory, then write it in one call.
    parts = [
        "<!DOCTYPE html>\n<html>\n<head>\n<title>Chunked HTML Document</title>\n",
        "<style>body{font-family:Arial,sans-serif;max-width:1200px;margin:20px auto;padding:0 20px}.chunk-separator{margin:20px 0;border-top:5px solid #3c82f6;padding-top:10px}.chunk-header{background-color:#f0f0f0;padding:10px;font-weight:bold;margin-bottom:10px;font-size:16px}.chunk-meta{background-color:#eaf2ff;padding:5px 10px;font-family:monospace;font-size:12px;word-wrap:break-word}.chunk-content{border:1px solid #ddd;padding:15px}</style>\n",
        "</head>\n<body>\n<h1>Chunked HTML Document</h1>\n",
        f"<p><strong>Original document:</strong> {original_tokens} tokens</p>\n",
        f"<p><strong>Split into:</strong> {len(chunks)} chunks with max {max_token_limit} tokens per chunk</p>\n",
        "<h2>Chunk Statistics</h2>\n<table border='1' cellpadding='5' style='border-collapse:collapse;width:100%;'>\n",
        "<tr><th>Statistic</th><th>Value</th></tr>\n",
        f"<tr><td>Number of chunks</td><td>{len(chunks)}</td></tr>\n",
        f"<tr><td>Average tokens per chunk</td><td>{avg_tokens:.1f}</td></tr>\n",
        f"<tr><td>Minimum tokens</td><td>{min_tokens}</td></tr>\n",
        f"<tr><td>Maximum tokens</td><td>{max_tokens}</td></tr>\n",
        f"<tr><td>Chunks < 100 tokens</td><td>{sum(1 for t in sizes if t < 100)}</td></tr>\n",
        f"<tr><td>Chunks > token limit</td><td>{sum(1 for t in sizes if t > max_token_limit)}</td></tr>\n",
        "</table>\n<br><hr><br>\n",
    ]

    for i, (chunk, token_count) in enumerate(zip(chunks, sizes), 1):
        if i > 1:
            parts.append('<div class="chunk-separator"></div>\n')
        # Oversized chunks are flagged with a red header background.
        highlight = " style='background-color:#FFE0E0;'" if token_count > max_token_limit else ""
        parts.append(f'<div class="chunk-header"{highlight}>Chunk {i} ({token_count} tokens)</div>\n')
        parts.append(f'<div class="chunk-meta"><strong>Source:</strong> {chunk.metadata.get("source", "N/A")}</div>\n')
        parts.append('<div class="chunk-content">\n')
        parts.append(chunk.text)
        parts.append('\n</div>\n')

    parts.append("</body>\n</html>")

    with open(output_path, "w", encoding="utf-8") as report:
        report.write("".join(parts))
    print(f"Report saved to {output_path}")
| 82 | + |
def main() -> int:
    """Run the HTML chunking example end-to-end.

    Parses CLI arguments, reads the input HTML file, chunks it with the
    configured token limit, and writes an HTML report for inspection.

    Returns:
        Process exit code: 0 on success, 1 on a missing/unreadable input file.
    """
    # Deferred imports: the package directory must be on sys.path first
    # (handled in the __main__ guard), so these cannot be module-level.
    # NOTE: the previously imported `Chunk` was unused here and is dropped.
    from html_chunking.chunker import chunk_html
    from html_chunking.tokenizer import count_html_tokens

    parser = create_argument_parser()
    args = parser.parse_args()

    print("HTML Chunking Example\n====================\n")

    if not os.path.exists(args.html_file):
        print(f"Error: Sample document '{args.html_file}' not found.", file=sys.stderr)
        return 1

    try:
        with open(args.html_file, "r", encoding="utf-8") as f:
            sample_html = f.read()
    except OSError as e:  # IOError is a deprecated alias of OSError
        print(f"Error reading HTML file '{args.html_file}': {e}", file=sys.stderr)
        return 1

    original_tokens = count_html_tokens(sample_html)
    print(f"Original document has {original_tokens} tokens.\n")

    print(f"Chunking with max {args.max_token_limit} tokens per chunk...\n")

    # In a real pipeline the source URL would come from a URL mapping; for
    # the example we synthesize a file:// URL from the input path.
    source_url_for_example = f"file://{os.path.abspath(args.html_file)}"

    chunks = chunk_html(
        html_content=sample_html,
        source_url=source_url_for_example,
        max_token_limit=args.max_token_limit
    )

    print(f"Created {len(chunks)} chunks.")

    generate_html_report(args.output, chunks, original_tokens, args.max_token_limit, count_html_tokens)

    return 0
| 122 | + |
if __name__ == "__main__":
    # Allow running this script directly from within the html_chunking
    # directory: put the parent (project root) on sys.path so that the
    # deferred `html_chunking.*` imports inside main() resolve.
    if __package__ is None:
        project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        if project_root not in sys.path:
            sys.path.insert(0, project_root)

    try:
        sys.exit(main())
    except ImportError as e:
        # First message has no placeholders, so it is a plain literal (was
        # an f-string with nothing to interpolate).
        print("Error: Failed to import a required module.", file=sys.stderr)
        print(f"Detail: {e}", file=sys.stderr)
        print("\nSuggestion: Try running this script from the project's root directory using 'python -m html_chunking.example'", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        # Top-level boundary: report the failure with a traceback and exit
        # non-zero rather than letting the interpreter dump it raw.
        print(f"An unexpected error occurred: {e}", file=sys.stderr)
        import traceback
        traceback.print_exc()
        sys.exit(1)
0 commit comments