@@ -31,18 +31,19 @@ def mock_count_html_tokens(html_string, count_tag_tokens=True):
31
31
@patch ('html_chunking.chunker.count_html_tokens' , new = mock_count_html_tokens )
32
32
class TestHtmlChunker (unittest .TestCase ):
33
33
34
- def test_chunk_html_small_input (self ):
34
+ def test_small_input_no_chunking (self ):
35
35
"""Tests that HTML smaller than the max_token_limit is not chunked."""
36
- html = "<html><body><h1>My Title</h1><p>This is a small test.</p></body></html>"
36
+ html = "<html><head><title>Test Title</title></head><body><p>This is a small test.</p></body></html>"
37
37
chunks = chunk_html (html , "http://example.com/small" , max_token_limit = 100 )
38
38
self .assertEqual (len (chunks ), 1 )
39
39
self .assertEqual (chunks [0 ].text , html )
40
40
self .assertEqual (chunks [0 ].metadata ["docs_url" ], "http://example.com/small" )
41
- self .assertEqual (chunks [0 ].metadata ["title" ], "My Title" )
41
+ self .assertEqual (chunks [0 ].metadata ["title" ], "Test Title" )
42
+ self .assertEqual (chunks [0 ].metadata ["section_title" ], "Test Title" )
42
43
43
44
def test_basic_splitting (self ):
44
45
"""Tests basic splitting of multiple paragraphs."""
45
- html = "<html><body>"
46
+ html = "<html><head><title>Basic Splitting</title></head><body>"
46
47
for i in range (10 ):
47
48
html += f"<p>This is paragraph { i } . It contains several words to simulate content.</p>"
48
49
html += "</body></html>"
@@ -51,131 +52,96 @@ def test_basic_splitting(self):
51
52
self .assertTrue (all (mock_count_html_tokens (c .text ) <= 110 for c in chunks ))
52
53
self .assertIn ("paragraph 0" , chunks [0 ].text )
53
54
self .assertIn ("paragraph 9" , chunks [- 1 ].text )
55
+ self .assertEqual (chunks [0 ].metadata ["title" ], "Basic Splitting" )
54
56
55
57
def test_oversized_element_splitting (self ):
56
58
"""Tests that a single element larger than the limit is recursively split."""
57
59
long_text = "word " * 200
58
- html = f"<html><body><div>{ long_text } </div></body></html>"
60
+ html = f"<html><head><title>Oversized</title></head><body><div>{ long_text } </div></body></html>"
59
61
chunks = chunk_html (html , "http://example.com/oversized" , max_token_limit = 100 )
60
62
self .assertGreater (len (chunks ), 1 )
61
63
full_text = "" .join (BeautifulSoup (c .text , 'html.parser' ).get_text () for c in chunks )
62
64
self .assertIn ("word" , full_text )
63
65
self .assertGreater (len (full_text ), 500 )
66
+ self .assertEqual (chunks [0 ].metadata ["title" ], "Oversized" )
64
67
65
68
def test_table_splitting (self ):
66
69
"""Tests that large tables are split, preserving the header in each chunk."""
67
70
header = "<thead><tr><th>Header 1</th><th>Header 2</th></tr></thead>"
68
71
rows = "" .join ([f"<tr><td>Row { i } Col 1</td><td>Row { i } Col 2</td></tr>" for i in range (20 )])
69
- html = f"<html><body><table>{ header } <tbody>{ rows } </tbody></table></body></html>"
72
+ html = f"<html><head><title>Table Test</title></head><body><table>{ header } <tbody>{ rows } </tbody></table></body></html>"
70
73
chunks = chunk_html (html , "http://example.com/table" , max_token_limit = 100 )
71
74
self .assertGreater (len (chunks ), 1 )
72
75
for chunk in chunks :
73
76
self .assertIn ("<thead>" , chunk .text )
74
77
self .assertIn ("Header 1" , chunk .text )
75
78
self .assertIn ("</table>" , chunk .text )
79
+ self .assertEqual (chunk .metadata ["title" ], "Table Test" )
76
80
self .assertIn ("Row 0" , chunks [0 ].text )
77
81
self .assertNotIn ("Row 19" , chunks [0 ].text )
78
82
self .assertIn ("Row 19" , chunks [- 1 ].text )
79
83
80
84
def test_list_splitting (self ):
81
85
"""Tests that large lists are split correctly."""
82
86
items = "" .join ([f"<li>Item { i } is here.</li>" for i in range (30 )])
83
- html = f"<html><body><ul>{ items } </ul></body></html>"
87
+ html = f"<html><head><title>List Test</title></head><body><ul>{ items } </ul></body></html>"
84
88
chunks = chunk_html (html , "http://example.com/list" , max_token_limit = 100 )
85
89
self .assertGreater (len (chunks ), 1 )
86
90
for chunk in chunks :
87
91
self .assertIn ("<ul " , chunk .text )
88
92
self .assertIn ("</ul>" , chunk .text )
93
+ self .assertEqual (chunk .metadata ["title" ], "List Test" )
89
94
self .assertIn ("Item 0" , chunks [0 ].text )
90
95
self .assertIn ("Item 29" , chunks [- 1 ].text )
91
96
92
- def test_definition_list_splitting (self ):
93
- """Tests splitting of a definition list."""
94
- items = "" .join ([f"<dt>Term { i } </dt><dd>Definition { i } is quite long and elaborate.</dd>" for i in range (15 )])
95
- html = f"<html><body><div class='variablelist'><dl>{ items } </dl></div></body></html>"
96
- chunks = chunk_html (html , "http://example.com/dl" , max_token_limit = 100 )
97
- self .assertGreater (len (chunks ), 1 )
98
- for chunk in chunks :
99
- self .assertIn ("<dl>" , chunk .text )
100
- self .assertIn ("</dl>" , chunk .text )
101
- self .assertIn ("Term 0" , chunks [0 ].text )
102
- self .assertIn ("Term 14" , chunks [- 1 ].text )
103
-
104
- def test_code_splitting (self ):
105
- """Tests that preformatted code blocks are split by lines."""
106
- code_lines = "\n " .join ([f"line_{ i } = 'some code here';" for i in range (50 )])
107
- html = f"<html><body><pre>{ code_lines } </pre></body></html>"
108
- chunks = chunk_html (html , "http://example.com/code" , max_token_limit = 50 )
109
- self .assertGreater (len (chunks ), 1 )
110
- for chunk in chunks :
111
- self .assertIn ("<pre " , chunk .text )
112
- self .assertIn ("</pre>" , chunk .text )
113
- self .assertIn ("line_0" , chunks [0 ].text )
114
- self .assertIn ("line_49" , chunks [- 1 ].text )
115
- self .assertNotIn ("line_49" , chunks [0 ].text )
116
-
117
- def test_heading_grouping (self ):
118
- """Tests that headings are grouped with the following element."""
119
- html = "<html><body>"
120
- for i in range (5 ):
121
- html += f"<h2>Title { i } </h2><p>This is paragraph for title { i } . It has text.</p>"
122
- html += "</body></html>"
123
- chunks = chunk_html (html , "http://example.com/headings" , max_token_limit = 50 )
124
- self .assertEqual (len (chunks ), 5 )
125
- for i , chunk in enumerate (chunks ):
126
- self .assertIn (f"Title { i } " , chunk .text )
127
- self .assertIn (f"paragraph for title { i } " , chunk .text )
128
-
129
- def test_paragraph_ending_with_colon_grouping (self ):
130
- """Tests grouping of a paragraph ending with a colon with the next list/table."""
131
- html = ("<html><body><p>Here are the items:</p>"
132
- "<ul><li>Item 1</li><li>Item 2</li></ul></body></html>" )
133
- chunks = chunk_html (html , "http://example.com/colon" , max_token_limit = 100 )
134
- self .assertEqual (len (chunks ), 1 )
135
- self .assertIn ("Here are the items:" , chunks [0 ].text )
136
- self .assertIn ("<li>Item 1</li>" , chunks [0 ].text )
137
-
138
- def test_metadata_anchor_handling (self ):
139
- """Tests the generation of source metadata with correct anchors."""
97
+ def test_metadata_and_section_titles (self ):
98
+ """Tests the generation of metadata with correct anchors and section titles."""
140
99
html = """
141
- <html><body>
142
- <section id="intro"><h1>Intro </h1><p>Text</p></section>
100
+ <html><head><title>Main Document Title</title></head><body>
101
+ <section id="intro"><h1>Introduction</h1><p>Text about intro.</p></section>
143
102
<div id="main-content">
144
103
<h2 id="topic1">Topic 1</h2><p>Content 1</p>
145
- <p>More content 1</p>
104
+ <p>More content 1, still under Topic 1.</p>
146
105
</div>
147
106
<section id="conclusion">
148
- <p>Conclusion text</p>
107
+ <p>Conclusion text, still under Topic 1 technically.</p>
149
108
<h3 id="final-thoughts">Final Thoughts</h3><p>Final words.</p>
150
109
</section>
151
110
</body></html>
152
111
"""
153
112
chunks = chunk_html (html , "http://example.com/meta" , max_token_limit = 25 )
154
113
155
- self .assertGreaterEqual (len (chunks ), 3 )
114
+ self .assertGreaterEqual (len (chunks ), 4 )
156
115
157
- # The first chunk might not have a specific anchor if it's just the title
158
- self .assertIn (chunks [0 ].metadata ["docs_url" ], ["http://example.com/meta" , "http://example.com/meta#intro" ])
159
- self .assertEqual (chunks [0 ].metadata ["title" ], "Intro" )
116
+ # Check document title consistency
117
+ for chunk in chunks :
118
+ self .assertEqual (chunk .metadata ["title" ], "Main Document Title" )
119
+
120
+ # Check section titles and anchors
121
+ intro_chunk = next (c for c in chunks if "Introduction" in c .text )
122
+ self .assertIn (intro_chunk .metadata ["docs_url" ], ["http://example.com/meta#intro" , "http://example.com/meta" ])
123
+ self .assertEqual (intro_chunk .metadata ["section_title" ], "Introduction" )
160
124
161
- topic1_chunks = [c for c in chunks if "Topic 1" in c .text or "Content 1" in c .text or "More content 1" in c . text ]
125
+ topic1_chunks = [c for c in chunks if "Topic 1" in c .text or "Content 1" in c .text ]
162
126
self .assertTrue (all (c .metadata ["docs_url" ] == "http://example.com/meta#topic1" for c in topic1_chunks ))
163
-
164
- final_thoughts_chunk = next ((c for c in chunks if "Final words" in c .text ), None )
165
-
166
- self .assertIsNotNone (final_thoughts_chunk , "Final thoughts chunk not found" )
167
-
127
+ self .assertTrue (all (c .metadata ["section_title" ] == "Topic 1" for c in topic1_chunks ))
128
+
129
+ conclusion_chunk = next (c for c in chunks if "Conclusion text" in c .text )
130
+ self .assertEqual (conclusion_chunk .metadata ["section_title" ], "Topic 1" ) # Inherited from previous heading
131
+
132
+ final_thoughts_chunk = next (c for c in chunks if "Final words" in c .text )
168
133
self .assertEqual (final_thoughts_chunk .metadata ["docs_url" ], "http://example.com/meta#final-thoughts" )
169
- self .assertEqual (final_thoughts_chunk .metadata ["title" ], "Intro" )
134
+ self .assertEqual (final_thoughts_chunk .metadata ["section_title" ], "Final Thoughts" )
170
135
171
136
def test_no_anchor_found (self ):
172
137
"""Tests that the source URL has no anchor if no IDs are present."""
173
- html = "<html><body><h1>No Anchor Title</h1><p>Paragraph 1.</p><p>Paragraph 2.</p></body></html>"
138
+ html = "<html><head><title>No Anchor Title</title></head><body><p>Paragraph 1.</p><p>Paragraph 2.</p></body></html>"
174
139
chunks = chunk_html (html , "http://example.com/no-anchor" , max_token_limit = 15 )
175
140
self .assertEqual (len (chunks ), 2 )
176
141
self .assertEqual (chunks [0 ].metadata ["docs_url" ], "http://example.com/no-anchor" )
177
142
self .assertEqual (chunks [1 ].metadata ["docs_url" ], "http://example.com/no-anchor" )
178
143
self .assertEqual (chunks [0 ].metadata ["title" ], "No Anchor Title" )
144
+ self .assertEqual (chunks [0 ].metadata ["section_title" ], "No Anchor Title" )
179
145
180
146
def test_empty_html (self ):
181
147
"""Tests that empty or minimal HTML does not cause errors."""
0 commit comments