|
| 1 | +import unittest |
| 2 | +from unittest.mock import patch |
| 3 | +from bs4 import BeautifulSoup |
| 4 | + |
| 5 | +# Add the parent directory to the path to allow direct import of html_chunking |
| 6 | +import sys |
| 7 | +import os |
| 8 | +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) |
| 9 | + |
| 10 | +from html_chunking.chunker import chunk_html |
| 11 | + |
| 12 | +# --- Mock Tokenizer --- |
| 13 | +# This mock tokenizer provides predictable token counts for testing purposes. |
def mock_count_html_tokens(html_string, count_tag_tokens=True):
    """
    A mock token counting function.

    Deterministic stand-in for the real tokenizer: every whitespace-separated
    word of visible text counts as 1 token and, when ``count_tag_tokens`` is
    true, every tag adds a flat 10 tokens. Predictable totals let the tests
    pin exact chunk counts.
    """
    markup = html_string if isinstance(html_string, str) else str(html_string)
    parsed = BeautifulSoup(markup, 'html.parser')
    # 1 token per word of the stripped text content.
    total = len(parsed.get_text().split())
    if count_tag_tokens:
        # Flat 10-token cost for each element in the tree.
        total += 10 * len(parsed.find_all(True))
    return total
| 30 | + |
@patch('html_chunking.chunker.count_html_tokens', new=mock_count_html_tokens)
class TestHtmlChunker(unittest.TestCase):
    """Behavioural tests for chunk_html, run against the deterministic mock tokenizer."""

    def test_chunk_html_small_input(self):
        """Tests that HTML smaller than the max_token_limit is not chunked."""
        html = "<html><body><p>This is a small test.</p></body></html>"
        chunks = chunk_html(html, "http://example.com/small", max_token_limit=100)
        self.assertEqual(len(chunks), 1)
        self.assertEqual(chunks[0].text, html)
        self.assertEqual(chunks[0].metadata["source"], "http://example.com/small")

    def test_basic_splitting(self):
        """Tests basic splitting of multiple paragraphs."""
        paragraphs = "".join(
            f"<p>This is paragraph {i}. It contains several words to simulate content.</p>"
            for i in range(10)
        )
        html = f"<html><body>{paragraphs}</body></html>"
        chunks = chunk_html(html, "http://example.com/basic", max_token_limit=100)
        self.assertEqual(len(chunks), 3)
        # Small tolerance above the limit is allowed for closing-tag overhead.
        for chunk in chunks:
            self.assertLessEqual(mock_count_html_tokens(chunk.text), 110)
        self.assertIn("paragraph 0", chunks[0].text)
        self.assertIn("paragraph 9", chunks[-1].text)

    def test_oversized_element_splitting(self):
        """Tests that a single element larger than the limit is recursively split."""
        html = "<html><body><div>" + "word " * 200 + "</div></body></html>"
        chunks = chunk_html(html, "http://example.com/oversized", max_token_limit=100)
        self.assertGreater(len(chunks), 1)
        # Re-joining the chunks' text should recover the bulk of the content.
        recovered = "".join(
            BeautifulSoup(chunk.text, 'html.parser').get_text() for chunk in chunks
        )
        self.assertIn("word", recovered)
        self.assertGreater(len(recovered), 500)

    def test_table_splitting(self):
        """Tests that large tables are split, preserving the header in each chunk."""
        header = "<thead><tr><th>Header 1</th><th>Header 2</th></tr></thead>"
        body_rows = "".join(
            f"<tr><td>Row {i} Col 1</td><td>Row {i} Col 2</td></tr>" for i in range(20)
        )
        html = f"<html><body><table>{header}<tbody>{body_rows}</tbody></table></body></html>"
        chunks = chunk_html(html, "http://example.com/table", max_token_limit=100)
        self.assertGreater(len(chunks), 1)
        # Every chunk must be a well-formed table carrying the repeated header.
        for chunk in chunks:
            for fragment in ("<thead>", "Header 1", "</table>"):
                self.assertIn(fragment, chunk.text)
        self.assertIn("Row 0", chunks[0].text)
        self.assertNotIn("Row 19", chunks[0].text)
        self.assertIn("Row 19", chunks[-1].text)

    def test_list_splitting(self):
        """Tests that large lists are split correctly."""
        items = "".join(f"<li>Item {i} is here.</li>" for i in range(30))
        html = f"<html><body><ul>{items}</ul></body></html>"
        chunks = chunk_html(html, "http://example.com/list", max_token_limit=100)
        self.assertGreater(len(chunks), 1)
        for chunk in chunks:
            self.assertIn("<ul ", chunk.text)
            self.assertIn("</ul>", chunk.text)
        self.assertIn("Item 0", chunks[0].text)
        self.assertIn("Item 29", chunks[-1].text)

    def test_definition_list_splitting(self):
        """Tests splitting of a definition list."""
        entries = "".join(
            f"<dt>Term {i}</dt><dd>Definition {i} is quite long and elaborate.</dd>"
            for i in range(15)
        )
        html = f"<html><body><div class='variablelist'><dl>{entries}</dl></div></body></html>"
        chunks = chunk_html(html, "http://example.com/dl", max_token_limit=100)
        self.assertGreater(len(chunks), 1)
        for chunk in chunks:
            self.assertIn("<dl>", chunk.text)
            self.assertIn("</dl>", chunk.text)
        self.assertIn("Term 0", chunks[0].text)
        self.assertIn("Term 14", chunks[-1].text)

    def test_code_splitting(self):
        """Tests that preformatted code blocks are split by lines."""
        source = "\n".join(f"line_{i} = 'some code here';" for i in range(50))
        html = f"<html><body><pre>{source}</pre></body></html>"
        chunks = chunk_html(html, "http://example.com/code", max_token_limit=50)
        self.assertGreater(len(chunks), 1)
        for chunk in chunks:
            self.assertIn("<pre ", chunk.text)
            self.assertIn("</pre>", chunk.text)
        self.assertIn("line_0", chunks[0].text)
        self.assertIn("line_49", chunks[-1].text)
        self.assertNotIn("line_49", chunks[0].text)

    def test_heading_grouping(self):
        """Tests that headings are grouped with the following element."""
        sections = "".join(
            f"<h2>Title {i}</h2><p>This is paragraph for title {i}. It has text.</p>"
            for i in range(5)
        )
        html = f"<html><body>{sections}</body></html>"
        chunks = chunk_html(html, "http://example.com/headings", max_token_limit=50)
        self.assertEqual(len(chunks), 5)
        # Each heading must land in the same chunk as its paragraph.
        for i, chunk in enumerate(chunks):
            self.assertIn(f"Title {i}", chunk.text)
            self.assertIn(f"paragraph for title {i}", chunk.text)

    def test_paragraph_ending_with_colon_grouping(self):
        """Tests grouping of a paragraph ending with a colon with the next list/table."""
        html = (
            "<html><body><p>Here are the items:</p>"
            "<ul><li>Item 1</li><li>Item 2</li></ul></body></html>"
        )
        chunks = chunk_html(html, "http://example.com/colon", max_token_limit=100)
        self.assertEqual(len(chunks), 1)
        self.assertIn("Here are the items:", chunks[0].text)
        self.assertIn("<li>Item 1</li>", chunks[0].text)

    def test_metadata_anchor_handling(self):
        """Tests the generation of source metadata with correct anchors."""
        html = """
        <html><body>
            <section id="intro"><h1>Intro</h1><p>Text</p></section>
            <div id="main-content">
                <h2 id="topic1">Topic 1</h2><p>Content 1</p>
                <p>More content 1</p>
            </div>
            <section id="conclusion">
                <p>Conclusion text</p>
                <h3 id="final-thoughts">Final Thoughts</h3><p>Final words.</p>
            </section>
        </body></html>
        """
        chunks = chunk_html(html, "http://example.com/meta", max_token_limit=25)

        self.assertGreaterEqual(len(chunks), 3)

        self.assertEqual(chunks[0].metadata["source"], "http://example.com/meta")

        # Any chunk containing topic-1 content must carry the #topic1 anchor.
        topic1_markers = ("Topic 1", "Content 1", "More content 1")
        for chunk in chunks:
            if any(marker in chunk.text for marker in topic1_markers):
                self.assertEqual(chunk.metadata["source"], "http://example.com/meta#topic1")

        final_chunk = None
        for chunk in chunks:
            if "Final words" in chunk.text:
                final_chunk = chunk
                break

        self.assertIsNotNone(final_chunk, "Final thoughts chunk not found")

        self.assertEqual(final_chunk.metadata["source"], "http://example.com/meta#final-thoughts")

    def test_no_anchor_found(self):
        """Tests that the source URL has no anchor if no IDs are present."""
        html = "<html><body><p>Paragraph 1.</p><p>Paragraph 2.</p></body></html>"
        chunks = chunk_html(html, "http://example.com/no-anchor", max_token_limit=15)
        self.assertEqual(len(chunks), 2)
        for chunk in chunks:
            self.assertEqual(chunk.metadata["source"], "http://example.com/no-anchor")

    def test_empty_html(self):
        """Tests that empty or minimal HTML does not cause errors."""
        # Each degenerate input should come back as a single verbatim chunk.
        for markup in ("", "<html></html>", "<body></body>"):
            result = chunk_html(markup, "http://example.com/empty")
            self.assertEqual(len(result), 1)
            self.assertEqual(result[0].text, markup)
| 188 | + |
# Allow running this test file directly. argv is overridden so unittest.main
# does not try to parse the caller's command-line arguments (useful under
# notebooks/IDEs), and exit=False prevents it from calling sys.exit() when
# the run finishes.
if __name__ == '__main__':
    unittest.main(argv=['first-arg-is-ignored'], exit=False)
0 commit comments