 
 import os
 import sys
-import json
-from chunker import chunk_html
-from tokenizer import count_html_tokens, set_custom_tokenizer
+import argparse
+from pathlib import Path
+from chunker import chunk_html, Chunk
+from tokenizer import count_html_tokens
 
 def main():
     """Run the HTML chunking example."""
+    parser = argparse.ArgumentParser(
+        description="HTML Chunking Example",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+    parser.add_argument(
+        "html_file",
+        nargs="?",
+        default="example.html",
+        help="Path to the input HTML file."
+    )
+    parser.add_argument(
+        "--max-token-limit",
+        type=int,
+        default=500,
+        help="Max tokens per chunk"
+    )
+    parser.add_argument(
+        "-o", "--output",
+        default="chunked_output.html",
+        help="Output HTML file name"
+    )
+    args = parser.parse_args()
+
     print("HTML Chunking Example")
     print("====================\n")
 
-    # Load the HTML sample document
-    sample_path = "example.html"
+    sample_path = args.html_file
     if not os.path.exists(sample_path):
-        print(f"Error: Sample document {sample_path} not found.")
-        print("Make sure example.html is in the current directory.")
+        print(f"Error: Sample document {sample_path} not found.", file=sys.stderr)
         return 1
 
     try:
         with open(sample_path, "r", encoding="utf-8") as f:
             sample_html = f.read()
     except Exception as e:
-        print(f"Error reading HTML file: {e}")
+        print(f"Error reading HTML file: {e}", file=sys.stderr)
         return 1
 
-    # Count tokens in the original document
     try:
         print("Counting tokens in the document...")
         original_tokens = count_html_tokens(sample_html)
         print(f"Original document has {original_tokens} tokens\n")
     except Exception as e:
-        print(f"Error counting tokens: {e}")
-        print("Will proceed with chunking anyway")
+        print(f"Warning: Could not count tokens accurately: {e}", file=sys.stderr)
+        original_tokens = "N/A"
+
+    print(f"Chunking with max {args.max_token_limit} tokens per chunk...")
+    # These options are now handled internally by the improved chunker
+    print("Chunking options:")
+    print("  - Count tag tokens: True")
+    print("  - Keep siblings together: True")
+    print("  - Prepend parent section text: True\n")
 
-    # Chunk with default settings
     try:
-        # First, set maximum token limit
-        max_token_limit = 500
-        print(f"Chunking with max {max_token_limit} tokens per chunk...")
-
-        # Perform chunking
+        # Define a source URL for the example file. In a real pipeline,
+        # this would come from url_mapping.json.
+        source_url_for_example = f"file://{os.path.abspath(sample_path)}"
+
+        # Perform chunking using the updated function signature
         chunks = chunk_html(
-            sample_html,
-            max_token_limit=max_token_limit,
-            count_tag_tokens=True,
-            keep_siblings_together=True,
-            prepend_parent_section_text=True
+            html_content=sample_html,
+            source_url=source_url_for_example,
+            max_token_limit=args.max_token_limit,
+            count_tag_tokens=True
         )
 
-        # Print chunk information
         print(f"Created {len(chunks)} chunks:")
 
-        # Check if chunks are reasonable
-        unreasonable_chunks = 0
-        too_small_chunks = 0
-        for i, chunk in enumerate(chunks, 1):
-            token_count = count_html_tokens(chunk)
-
-            # Check if chunk is too small (less than 20% of the limit)
-            if token_count < max_token_limit * 0.2:
-                too_small_chunks += 1
-
-            # Check if chunk is way too large
-            if token_count > max_token_limit * 1.2:
-                unreasonable_chunks += 1
-
-            # Print info for first 5 chunks
-            if i <= 5:
-                print(f"  Chunk {i}: {token_count} tokens")
-                # Show a brief preview of the chunk's content
-                soup_chunk = chunk.replace("\n", " ").strip()
-                preview = soup_chunk[:100] + "..." if len(soup_chunk) > 100 else soup_chunk
-                print(f"  Preview: {preview}\n")
+        chunk_tokens = [count_html_tokens(chunk.text) for chunk in chunks]
 
-        if i > 5:
+        # Print info for the first 5 chunks
+        for i, chunk in enumerate(chunks[:5], 1):
+            print(f"  Chunk {i}: {chunk_tokens[i - 1]} tokens")
+            print(f"  Metadata Source: {chunk.metadata.get('source', 'N/A')}\n")
+
+        if len(chunks) > 5:
             print(f"  ... and {len(chunks) - 5} more chunks\n")
-
-        if too_small_chunks > 0:
-            print(f"Warning: {too_small_chunks} chunks are less than 20% of the token limit.")
-
-        if unreasonable_chunks > 0:
-            print(f"Warning: {unreasonable_chunks} chunks exceed the token limit by more than 20%.")
-
+
         # Save all chunks to a single file with separators
-        print("\nSaving all chunks to a single file...")
-        with open("chunked_output.html", "w", encoding="utf-8") as f:
+        output_filename = args.output
+        print(f"\nSaving all chunks to a single file: {output_filename}...")
+        with open(output_filename, "w", encoding="utf-8") as f:
             f.write("<!DOCTYPE html>\n<html>\n<head>\n")
             f.write("<title>Chunked HTML Document</title>\n")
             f.write("<style>\n")
+            f.write("body { font-family: Arial, sans-serif; max-width: 1200px; margin: 20px auto; padding: 0 20px; }\n")
             f.write(".chunk-separator { margin: 20px 0; border-top: 5px solid #3c82f6; padding-top: 10px; }\n")
             f.write(".chunk-header { background-color: #f0f0f0; padding: 10px; font-weight: bold; margin-bottom: 10px; font-size: 16px; }\n")
+            f.write(".chunk-meta { background-color: #eaf2ff; padding: 5px 10px; font-family: monospace; font-size: 12px; margin-bottom: 10px; word-wrap: break-word; }\n")
             f.write(".chunk-content { border: 1px solid #ddd; padding: 15px; }\n")
-            f.write("body { font-family: Arial, sans-serif; max-width: 1200px; margin: 20px auto; padding: 0 20px; }\n")
-            f.write("</style>\n")
-            f.write("</head>\n<body>\n")
+            f.write("</style>\n</head>\n<body>\n")
 
             f.write("<h1>Chunked HTML Document</h1>\n")
             f.write(f"<p><strong>Original document:</strong> {original_tokens} tokens</p>\n")
-            f.write(f"<p><strong>Split into:</strong> {len(chunks)} chunks with max {max_token_limit} tokens per chunk</p>\n")
-            f.write("<p><strong>Chunking settings:</strong></p>\n")
+            f.write(f"<p><strong>Split into:</strong> {len(chunks)} chunks with max {args.max_token_limit} tokens per chunk</p>\n")
+
+            # --- START: Re-added summary section ---
+            f.write("<p><strong>Chunking settings (now internal):</strong></p>\n")
             f.write("<ul>\n")
-            f.write("  <li>count_tag_tokens: Yes</li>\n")
-            f.write("  <li>keep_siblings_together: Yes</li>\n")
-            f.write("  <li>prepend_parent_section_text: Yes</li>\n")
+            f.write("  <li>count_tag_tokens: True</li>\n")
+            f.write("  <li>keep_siblings_together: True</li>\n")
+            f.write("  <li>prepend_parent_section_text: True</li>\n")
             f.write("</ul>\n")
 
-            # Add a statistics table
             f.write("<h2>Chunk Statistics</h2>\n")
             f.write("<table border='1' cellpadding='5' style='border-collapse: collapse; width: 100%;'>\n")
             f.write("<tr><th>Statistic</th><th>Value</th></tr>\n")
             f.write(f"<tr><td>Number of chunks</td><td>{len(chunks)}</td></tr>\n")
 
-            # Calculate more statistics
-            chunk_tokens = [count_html_tokens(chunk) for chunk in chunks]
+            # Calculate statistics
             avg_tokens = sum(chunk_tokens) / len(chunk_tokens) if chunk_tokens else 0
             min_tokens = min(chunk_tokens) if chunk_tokens else 0
             max_tokens = max(chunk_tokens) if chunk_tokens else 0
@@ -127,32 +132,33 @@ def main():
             f.write(f"<tr><td>Minimum tokens</td><td>{min_tokens}</td></tr>\n")
             f.write(f"<tr><td>Maximum tokens</td><td>{max_tokens}</td></tr>\n")
             f.write(f"<tr><td>Chunks below 100 tokens</td><td>{sum(1 for t in chunk_tokens if t < 100)}</td></tr>\n")
-            f.write(f"<tr><td>Chunks above token limit</td><td>{sum(1 for t in chunk_tokens if t > max_token_limit)}</td></tr>\n")
-            f.write("</table>\n<br><hr><br>\n")
+            f.write(f"<tr><td>Chunks above token limit</td><td>{sum(1 for t in chunk_tokens if t > args.max_token_limit)}</td></tr>\n")
+            f.write("</table>\n")
+            # --- END: Re-added summary section ---
+
+            f.write("<br><hr><br>\n")
 
             for i, chunk in enumerate(chunks, 1):
-                token_count = count_html_tokens(chunk)
+                token_count = chunk_tokens[i - 1]
                 if i > 1:
                     f.write('<div class="chunk-separator"></div>\n')
 
-                # Add token count color based on size
-                color_class = ""
-                if token_count < max_token_limit * 0.2:
-                    color_class = " style='background-color: #FFF0F0;'"  # Light red for too small
-                elif token_count > max_token_limit * 1.1:
-                    color_class = " style='background-color: #FFE0E0;'"  # Red for too large
+                color_style = ""
+                if token_count > args.max_token_limit:
+                    color_style = " style='background-color: #FFE0E0;'"  # Red for oversized
 
-                f.write(f'<div class="chunk-header"{color_class}>Chunk {i} ({token_count} tokens)</div>\n')
+                f.write(f'<div class="chunk-header"{color_style}>Chunk {i} ({token_count} tokens)</div>\n')
+                f.write(f'<div class="chunk-meta"><strong>Source:</strong> {chunk.metadata.get("source", "N/A")}</div>\n')
                 f.write('<div class="chunk-content">\n')
-                f.write(chunk)
+                f.write(chunk.text)
                 f.write('\n</div>\n')
 
             f.write("</body>\n</html>")
 
-        print(f"All chunks saved to chunked_output.html")
+        print(f"All chunks saved to {output_filename}")
 
     except Exception as e:
-        print(f"Error during chunking: {e}")
+        print(f"Error during chunking: {e}", file=sys.stderr)
         import traceback
         traceback.print_exc()
         return 1
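
Note on the interface this example assumes: the diff calls `chunk_html(html_content=..., source_url=..., max_token_limit=..., count_tag_tokens=...)` and reads `chunk.text` and `chunk.metadata.get("source")` from the results. The sketch below is inferred from those call sites only; the greedy block-packing body is a naive stand-in so the example runs standalone, not the project's actual DOM-aware chunker.

```python
from dataclasses import dataclass, field
from typing import Any, Dict, List


@dataclass
class Chunk:
    text: str  # HTML fragment for this chunk (written into the chunk-content divs)
    metadata: Dict[str, Any] = field(default_factory=dict)  # e.g. {"source": source_url}


def _estimate_tokens(html: str) -> int:
    # Crude stand-in for tokenizer.count_html_tokens: whitespace-delimited words.
    return len(html.split())


def chunk_html(
    html_content: str,
    source_url: str,
    max_token_limit: int = 500,
    count_tag_tokens: bool = True,  # accepted for signature compatibility; ignored here
) -> List[Chunk]:
    # Greedily pack blank-line-separated blocks into chunks under the limit.
    # The real chunker is DOM-aware (sibling grouping, parent section text);
    # this mock only preserves the call/return shape used by example.py.
    chunks: List[Chunk] = []
    current: List[str] = []
    for block in html_content.split("\n\n"):
        candidate = "\n\n".join(current + [block])
        if current and _estimate_tokens(candidate) > max_token_limit:
            chunks.append(Chunk("\n\n".join(current), {"source": source_url}))
            current = [block]
        else:
            current.append(block)
    if current:
        chunks.append(Chunk("\n\n".join(current), {"source": source_url}))
    return chunks
```

With a stub like this on the import path, `python example.py page.html --max-token-limit 800 -o page_chunks.html` exercises the new CLI end to end (file names here are illustrative).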