Skip to content

Commit 5f1af66

Browse files
committed
Add unit tests for main chunking functionality
1 parent 4e14676 commit 5f1af66

File tree

1 file changed

+190
-0
lines changed

1 file changed

+190
-0
lines changed

scripts/html_chunking/test_chunker.py

Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
import unittest
2+
from unittest.mock import patch
3+
from bs4 import BeautifulSoup
4+
5+
# Add the parent directory to the path to allow direct import of html_chunking
6+
import sys
7+
import os
8+
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
9+
10+
from html_chunking.chunker import chunk_html
11+
12+
# --- Mock Tokenizer ---
13+
# This mock tokenizer provides predictable token counts for testing purposes.
14+
def mock_count_html_tokens(html_string, count_tag_tokens=True):
    """
    Deterministic stand-in for the real HTML token counter.

    Each whitespace-separated word of the parsed text costs 1 token and,
    when ``count_tag_tokens`` is true, each tag found by the parser costs
    10 tokens. Input that is not a ``str`` is converted with ``str()``
    before it is parsed.
    """
    markup = html_string if isinstance(html_string, str) else str(html_string)
    parsed = BeautifulSoup(markup, 'html.parser')

    # Start from the word count, then add the flat per-tag cost if requested.
    total = len(parsed.get_text().split())
    if count_tag_tokens:
        total += 10 * len(parsed.find_all(True))
    return total
30+
31+
@patch('html_chunking.chunker.count_html_tokens', new=mock_count_html_tokens)
class TestHtmlChunker(unittest.TestCase):
    """Unit tests for ``chunk_html``.

    The class-level patch replaces ``html_chunking.chunker.count_html_tokens``
    with ``mock_count_html_tokens`` for every test method, so the token
    counts the chunker sees are predictable (1 per word, 10 per tag).
    """

    def test_chunk_html_small_input(self):
        """HTML below ``max_token_limit`` comes back as one unchanged chunk."""
        html = "<html><body><p>This is a small test.</p></body></html>"
        chunks = chunk_html(html, "http://example.com/small", max_token_limit=100)
        self.assertEqual(len(chunks), 1)
        self.assertEqual(chunks[0].text, html)
        self.assertEqual(chunks[0].metadata["source"], "http://example.com/small")

    def test_basic_splitting(self):
        """Ten paragraphs at a limit of 100 are expected to yield three chunks."""
        paragraphs = "".join(
            f"<p>This is paragraph {i}. It contains several words to simulate content.</p>"
            for i in range(10)
        )
        html = f"<html><body>{paragraphs}</body></html>"
        chunks = chunk_html(html, "http://example.com/basic", max_token_limit=100)
        self.assertEqual(len(chunks), 3)
        # The bound checked here is 110 although the limit passed is 100.
        # NOTE(review): presumably a tolerance for wrapper tags; confirm against the chunker.
        self.assertTrue(all(mock_count_html_tokens(c.text) <= 110 for c in chunks))
        self.assertIn("paragraph 0", chunks[0].text)
        self.assertIn("paragraph 9", chunks[-1].text)

    def test_oversized_element_splitting(self):
        """A single element larger than the limit must produce several chunks."""
        long_text = "word " * 200
        html = f"<html><body><div>{long_text}</div></body></html>"
        chunks = chunk_html(html, "http://example.com/oversized", max_token_limit=100)
        self.assertGreater(len(chunks), 1)
        # Re-extract the text of every chunk to check that content survived the split.
        full_text = "".join(
            BeautifulSoup(c.text, 'html.parser').get_text() for c in chunks
        )
        self.assertIn("word", full_text)
        self.assertGreater(len(full_text), 500)

    def test_table_splitting(self):
        """Large tables are split and every chunk keeps the table header."""
        header = "<thead><tr><th>Header 1</th><th>Header 2</th></tr></thead>"
        rows = "".join(
            f"<tr><td>Row {i} Col 1</td><td>Row {i} Col 2</td></tr>" for i in range(20)
        )
        html = f"<html><body><table>{header}<tbody>{rows}</tbody></table></body></html>"
        chunks = chunk_html(html, "http://example.com/table", max_token_limit=100)
        self.assertGreater(len(chunks), 1)
        for chunk in chunks:
            self.assertIn("<thead>", chunk.text)
            self.assertIn("Header 1", chunk.text)
            self.assertIn("</table>", chunk.text)
        # Rows must stay in document order: first row up front, last row at the end.
        self.assertIn("Row 0", chunks[0].text)
        self.assertNotIn("Row 19", chunks[0].text)
        self.assertIn("Row 19", chunks[-1].text)

    def test_list_splitting(self):
        """Large lists are split and every chunk is wrapped in its own list tag."""
        items = "".join(f"<li>Item {i} is here.</li>" for i in range(30))
        html = f"<html><body><ul>{items}</ul></body></html>"
        chunks = chunk_html(html, "http://example.com/list", max_token_limit=100)
        self.assertGreater(len(chunks), 1)
        for chunk in chunks:
            # The opening tag is matched with a trailing space, unlike "<dl>" below.
            # NOTE(review): presumably the chunker re-emits "<ul " plus attributes; confirm.
            self.assertIn("<ul ", chunk.text)
            self.assertIn("</ul>", chunk.text)
        self.assertIn("Item 0", chunks[0].text)
        self.assertIn("Item 29", chunks[-1].text)

    def test_definition_list_splitting(self):
        """A long definition list is split and every chunk keeps its ``<dl>`` wrapper."""
        items = "".join(
            f"<dt>Term {i}</dt><dd>Definition {i} is quite long and elaborate.</dd>"
            for i in range(15)
        )
        html = f"<html><body><div class='variablelist'><dl>{items}</dl></div></body></html>"
        chunks = chunk_html(html, "http://example.com/dl", max_token_limit=100)
        self.assertGreater(len(chunks), 1)
        for chunk in chunks:
            self.assertIn("<dl>", chunk.text)
            self.assertIn("</dl>", chunk.text)
        self.assertIn("Term 0", chunks[0].text)
        self.assertIn("Term 14", chunks[-1].text)

    def test_code_splitting(self):
        """A long ``<pre>`` block is split and every chunk keeps its ``<pre>`` wrapper."""
        code_lines = "\n".join(f"line_{i} = 'some code here';" for i in range(50))
        html = f"<html><body><pre>{code_lines}</pre></body></html>"
        chunks = chunk_html(html, "http://example.com/code", max_token_limit=50)
        self.assertGreater(len(chunks), 1)
        for chunk in chunks:
            # Matched with a trailing space, same as the "<ul " check in the list test.
            self.assertIn("<pre ", chunk.text)
            self.assertIn("</pre>", chunk.text)
        self.assertIn("line_0", chunks[0].text)
        self.assertIn("line_49", chunks[-1].text)
        self.assertNotIn("line_49", chunks[0].text)

    def test_heading_grouping(self):
        """Each heading must land in the same chunk as the paragraph after it."""
        sections = "".join(
            f"<h2>Title {i}</h2><p>This is paragraph for title {i}. It has text.</p>"
            for i in range(5)
        )
        html = f"<html><body>{sections}</body></html>"
        chunks = chunk_html(html, "http://example.com/headings", max_token_limit=50)
        self.assertEqual(len(chunks), 5)
        for i, chunk in enumerate(chunks):
            self.assertIn(f"Title {i}", chunk.text)
            self.assertIn(f"paragraph for title {i}", chunk.text)

    def test_paragraph_ending_with_colon_grouping(self):
        """A paragraph ending in a colon stays with the list that follows it."""
        html = ("<html><body><p>Here are the items:</p>"
                "<ul><li>Item 1</li><li>Item 2</li></ul></body></html>")
        chunks = chunk_html(html, "http://example.com/colon", max_token_limit=100)
        self.assertEqual(len(chunks), 1)
        self.assertIn("Here are the items:", chunks[0].text)
        self.assertIn("<li>Item 1</li>", chunks[0].text)

    def test_metadata_anchor_handling(self):
        """Chunk ``source`` metadata carries the expected ``#anchor`` fragment."""
        html = (
            "\n"
            "<html><body>\n"
            '<section id="intro"><h1>Intro</h1><p>Text</p></section>\n'
            '<div id="main-content">\n'
            '<h2 id="topic1">Topic 1</h2><p>Content 1</p>\n'
            "<p>More content 1</p>\n"
            "</div>\n"
            '<section id="conclusion">\n'
            "<p>Conclusion text</p>\n"
            '<h3 id="final-thoughts">Final Thoughts</h3><p>Final words.</p>\n'
            "</section>\n"
            "</body></html>\n"
        )
        chunks = chunk_html(html, "http://example.com/meta", max_token_limit=25)

        self.assertGreaterEqual(len(chunks), 3)

        self.assertEqual(chunks[0].metadata["source"], "http://example.com/meta")

        topic1_markers = ("Topic 1", "Content 1", "More content 1")
        topic1_chunks = [
            c for c in chunks if any(marker in c.text for marker in topic1_markers)
        ]
        # all() is True for an empty list, so require at least one match first;
        # otherwise the anchor check below could pass without checking anything.
        self.assertTrue(topic1_chunks, "No chunk contains the Topic 1 content")
        self.assertTrue(
            all(c.metadata["source"] == "http://example.com/meta#topic1" for c in topic1_chunks)
        )

        final_thoughts_chunk = next((c for c in chunks if "Final words" in c.text), None)

        self.assertIsNotNone(final_thoughts_chunk, "Final thoughts chunk not found")

        self.assertEqual(
            final_thoughts_chunk.metadata["source"],
            "http://example.com/meta#final-thoughts",
        )

    def test_no_anchor_found(self):
        """Without any ``id`` attributes the source URL carries no anchor."""
        html = "<html><body><p>Paragraph 1.</p><p>Paragraph 2.</p></body></html>"
        chunks = chunk_html(html, "http://example.com/no-anchor", max_token_limit=15)
        self.assertEqual(len(chunks), 2)
        self.assertEqual(chunks[0].metadata["source"], "http://example.com/no-anchor")
        self.assertEqual(chunks[1].metadata["source"], "http://example.com/no-anchor")

    def test_empty_html(self):
        """Empty or minimal HTML yields exactly one chunk holding the input as-is."""
        chunks_empty = chunk_html("", "http://example.com/empty")
        self.assertEqual(len(chunks_empty), 1)
        self.assertEqual(chunks_empty[0].text, "")

        chunks_html = chunk_html("<html></html>", "http://example.com/empty")
        self.assertEqual(len(chunks_html), 1)
        self.assertEqual(chunks_html[0].text, "<html></html>")

        chunks_body = chunk_html("<body></body>", "http://example.com/empty")
        self.assertEqual(len(chunks_body), 1)
        self.assertEqual(chunks_body[0].text, "<body></body>")
188+
189+
if __name__ == '__main__':
    # Use the default runner: it reads command-line arguments (for example a
    # test name or -v) and exits with a non-zero status when a test fails.
    # The previous argv/exit=False form ignored arguments and always exited 0.
    unittest.main()

0 commit comments

Comments
 (0)