 
 import os
 import sys
-import json
-from chunker import chunk_html
-from tokenizer import count_html_tokens, set_custom_tokenizer
+import argparse
+from pathlib import Path
+from chunker import chunk_html, Chunk
+from tokenizer import count_html_tokens
 
 def main():
     """Run the HTML chunking example."""
+    parser = argparse.ArgumentParser(
+        description="HTML Chunking Example",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+    parser.add_argument(
+        "html_file",
+        nargs="?",
+        default="example.html",
+        help="Path to the input HTML file."
+    )
+    parser.add_argument(
+        "--max-token-limit",
+        type=int,
+        default=500,
+        help="Max tokens per chunk"
+    )
+    parser.add_argument(
+        "-o", "--output",
+        default="chunked_output.html",
+        help="Output HTML file name"
+    )
+    args = parser.parse_args()
+
     print("HTML Chunking Example")
     print("====================\n")
 
-    # Load the HTML sample document
-    sample_path = "example.html"
+    sample_path = args.html_file
     if not os.path.exists(sample_path):
-        print(f"Error: Sample document {sample_path} not found.")
-        print("Make sure example.html is in the current directory.")
+        print(f"Error: Sample document {sample_path} not found.", file=sys.stderr)
         return 1
 
     try:
         with open(sample_path, "r", encoding="utf-8") as f:
             sample_html = f.read()
     except Exception as e:
-        print(f"Error reading HTML file: {e}")
+        print(f"Error reading HTML file: {e}", file=sys.stderr)
         return 1
 
-    # Count tokens in the original document
     try:
         print("Counting tokens in the document...")
         original_tokens = count_html_tokens(sample_html)
         print(f"Original document has {original_tokens} tokens\n")
     except Exception as e:
-        print(f"Error counting tokens: {e}")
-        print("Will proceed with chunking anyway")
+        print(f"Warning: Could not count tokens accurately: {e}", file=sys.stderr)
+        original_tokens = "N/A"
+
+    print(f"Chunking with max {args.max_token_limit} tokens per chunk...")
+    # These options are now handled internally by the improved chunker
+    print("Chunking options:")
+    print("  - Count tag tokens: True")
+    print("  - Keep siblings together: True")
+    print("  - Prepend parent section text: True\n")
 
-    # Chunk with default settings
     try:
-        # First, set maximum token limit
-        max_token_limit = 500
-        print(f"Chunking with max {max_token_limit} tokens per chunk...")
-
-        # Perform chunking
+        # Define a source URL for the example file. In a real pipeline,
+        # this would come from url_mapping.json.
+        source_url_for_example = f"file://{os.path.abspath(sample_path)}"
+
+        # Perform chunking using the updated function signature
         chunks = chunk_html(
-            sample_html,
-            max_token_limit=max_token_limit,
-            count_tag_tokens=True,
-            keep_siblings_together=True,
-            prepend_parent_section_text=True
+            html_content=sample_html,
+            source_url=source_url_for_example,
+            max_token_limit=args.max_token_limit,
+            count_tag_tokens=True
         )
 
-        # Print chunk information
         print(f"Created {len(chunks)} chunks:")
 
-        # Check if chunks are reasonable
-        unreasonable_chunks = 0
-        too_small_chunks = 0
-        for i, chunk in enumerate(chunks, 1):
-            token_count = count_html_tokens(chunk)
-
-            # Check if chunk is too small (less than 20% of the limit)
-            if token_count < max_token_limit * 0.2:
-                too_small_chunks += 1
-
-            # Check if chunk is way too large
-            if token_count > max_token_limit * 1.2:
-                unreasonable_chunks += 1
-
-            # Print info for first 5 chunks
-            if i <= 5:
-                print(f"  Chunk {i}: {token_count} tokens")
-                # Show a brief preview of the chunk's content
-                soup_chunk = chunk.replace("\n", " ").strip()
-                preview = soup_chunk[:100] + "..." if len(soup_chunk) > 100 else soup_chunk
-                print(f"  Preview: {preview}\n")
+        chunk_tokens = [count_html_tokens(chunk.text) for chunk in chunks]
 
-        if i > 5:
+        # Print info for the first 5 chunks
+        for i, chunk in enumerate(chunks[:5], 1):
+            print(f"  Chunk {i}: {chunk_tokens[i - 1]} tokens")
+            print(f"  Metadata Source: {chunk.metadata.get('source', 'N/A')}\n")
+
+        if len(chunks) > 5:
             print(f"  ... and {len(chunks) - 5} more chunks\n")
-
-        if too_small_chunks > 0:
-            print(f"Warning: {too_small_chunks} chunks are less than 20% of the token limit.")
-
-        if unreasonable_chunks > 0:
-            print(f"Warning: {unreasonable_chunks} chunks exceed the token limit by more than 20%.")
-
+
         # Save all chunks to a single file with separators
-        print("\nSaving all chunks to a single file...")
-        with open("chunked_output.html", "w", encoding="utf-8") as f:
+        output_filename = args.output
+        print(f"\nSaving all chunks to a single file: {output_filename}...")
+        with open(output_filename, "w", encoding="utf-8") as f:
             f.write("<!DOCTYPE html>\n<html>\n<head>\n")
             f.write("<title>Chunked HTML Document</title>\n")
             f.write("<style>\n")
+            f.write("body { font-family: Arial, sans-serif; max-width: 1200px; margin: 20px auto; padding: 0 20px; }\n")
             f.write(".chunk-separator { margin: 20px 0; border-top: 5px solid #3c82f6; padding-top: 10px; }\n")
             f.write(".chunk-header { background-color: #f0f0f0; padding: 10px; font-weight: bold; margin-bottom: 10px; font-size: 16px; }\n")
+            f.write(".chunk-meta { background-color: #eaf2ff; padding: 5px 10px; font-family: monospace; font-size: 12px; margin-bottom: 10px; word-wrap: break-word; }\n")
             f.write(".chunk-content { border: 1px solid #ddd; padding: 15px; }\n")
-            f.write("body { font-family: Arial, sans-serif; max-width: 1200px; margin: 20px auto; padding: 0 20px; }\n")
-            f.write("</style>\n")
-            f.write("</head>\n<body>\n")
+            f.write("</style>\n</head>\n<body>\n")
 
             f.write("<h1>Chunked HTML Document</h1>\n")
             f.write(f"<p><strong>Original document:</strong> {original_tokens} tokens</p>\n")
-            f.write(f"<p><strong>Split into:</strong> {len(chunks)} chunks with max {max_token_limit} tokens per chunk</p>\n")
-            f.write("<p><strong>Chunking settings:</strong></p>\n")
+            f.write(f"<p><strong>Split into:</strong> {len(chunks)} chunks with max {args.max_token_limit} tokens per chunk</p>\n")
+
+            # --- START: Re-added summary section ---
+            f.write("<p><strong>Chunking settings (now internal):</strong></p>\n")
             f.write("<ul>\n")
-            f.write("  <li>count_tag_tokens: Yes</li>\n")
-            f.write("  <li>keep_siblings_together: Yes</li>\n")
-            f.write("  <li>prepend_parent_section_text: Yes</li>\n")
+            f.write("  <li>count_tag_tokens: True</li>\n")
+            f.write("  <li>keep_siblings_together: True</li>\n")
+            f.write("  <li>prepend_parent_section_text: True</li>\n")
             f.write("</ul>\n")
 
-            # Add a statistics table
             f.write("<h2>Chunk Statistics</h2>\n")
             f.write("<table border='1' cellpadding='5' style='border-collapse: collapse; width: 100%;'>\n")
             f.write("<tr><th>Statistic</th><th>Value</th></tr>\n")
             f.write(f"<tr><td>Number of chunks</td><td>{len(chunks)}</td></tr>\n")
 
-            # Calculate more statistics
-            chunk_tokens = [count_html_tokens(chunk) for chunk in chunks]
+            # Calculate statistics
             avg_tokens = sum(chunk_tokens) / len(chunk_tokens) if chunk_tokens else 0
             min_tokens = min(chunk_tokens) if chunk_tokens else 0
             max_tokens = max(chunk_tokens) if chunk_tokens else 0
@@ -127,32 +132,33 @@ def main():
             f.write(f"<tr><td>Minimum tokens</td><td>{min_tokens}</td></tr>\n")
             f.write(f"<tr><td>Maximum tokens</td><td>{max_tokens}</td></tr>\n")
             f.write(f"<tr><td>Chunks below 100 tokens</td><td>{sum(1 for t in chunk_tokens if t < 100)}</td></tr>\n")
-            f.write(f"<tr><td>Chunks above token limit</td><td>{sum(1 for t in chunk_tokens if t > max_token_limit)}</td></tr>\n")
-            f.write("</table>\n<br><hr><br>\n")
+            f.write(f"<tr><td>Chunks above token limit</td><td>{sum(1 for t in chunk_tokens if t > args.max_token_limit)}</td></tr>\n")
+            f.write("</table>\n")
+            # --- END: Re-added summary section ---
+
+            f.write("<br><hr><br>\n")
 
             for i, chunk in enumerate(chunks, 1):
-                token_count = count_html_tokens(chunk)
+                token_count = chunk_tokens[i - 1]
                 if i > 1:
                     f.write('<div class="chunk-separator"></div>\n')
 
-                # Add token count color based on size
-                color_class = ""
-                if token_count < max_token_limit * 0.2:
-                    color_class = " style='background-color: #FFF0F0;'"  # Light red for too small
-                elif token_count > max_token_limit * 1.1:
-                    color_class = " style='background-color: #FFE0E0;'"  # Red for too large
+                color_style = ""
+                if token_count > args.max_token_limit:
+                    color_style = " style='background-color: #FFE0E0;'"  # Red for oversized
 
-                f.write(f'<div class="chunk-header"{color_class}>Chunk {i} ({token_count} tokens)</div>\n')
+                f.write(f'<div class="chunk-header"{color_style}>Chunk {i} ({token_count} tokens)</div>\n')
+                f.write(f'<div class="chunk-meta"><strong>Source:</strong> {chunk.metadata.get("source", "N/A")}</div>\n')
                 f.write('<div class="chunk-content">\n')
-                f.write(chunk)
+                f.write(chunk.text)
                 f.write('\n</div>\n')
 
             f.write("</body>\n</html>")
 
-        print(f"All chunks saved to chunked_output.html")
+        print(f"All chunks saved to {output_filename}")
 
     except Exception as e:
-        print(f"Error during chunking: {e}")
+        print(f"Error during chunking: {e}", file=sys.stderr)
         import traceback
         traceback.print_exc()
         return 1
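
Note on the interface this example assumes: the diff calls `chunk_html(html_content=..., source_url=..., max_token_limit=..., count_tag_tokens=...)` and reads `chunk.text` and `chunk.metadata.get("source")` from the results. The sketch below is inferred from those call sites only; the greedy block-packing body is a naive stand-in so the example runs standalone, not the project's actual DOM-aware chunker.

```python
from dataclasses import dataclass, field
from typing import Any, Dict, List


@dataclass
class Chunk:
    text: str  # HTML fragment for this chunk (written into the chunk-content divs)
    metadata: Dict[str, Any] = field(default_factory=dict)  # e.g. {"source": source_url}


def _estimate_tokens(html: str) -> int:
    # Crude stand-in for tokenizer.count_html_tokens: whitespace-delimited words.
    return len(html.split())


def chunk_html(
    html_content: str,
    source_url: str,
    max_token_limit: int = 500,
    count_tag_tokens: bool = True,  # accepted for signature compatibility; ignored here
) -> List[Chunk]:
    # Greedily pack blank-line-separated blocks into chunks under the limit.
    # The real chunker is DOM-aware (sibling grouping, parent section text);
    # this mock only preserves the call/return shape used by example.py.
    chunks: List[Chunk] = []
    current: List[str] = []
    for block in html_content.split("\n\n"):
        candidate = "\n\n".join(current + [block])
        if current and _estimate_tokens(candidate) > max_token_limit:
            chunks.append(Chunk("\n\n".join(current), {"source": source_url}))
            current = [block]
        else:
            current.append(block)
    if current:
        chunks.append(Chunk("\n\n".join(current), {"source": source_url}))
    return chunks
```

With a stub like this on the import path, `python example.py page.html --max-token-limit 800 -o page_chunks.html` exercises the new CLI end to end (file names here are illustrative).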