@@ -33,11 +33,12 @@ class TestHtmlChunker(unittest.TestCase):
33
33
34
34
def test_chunk_html_small_input(self):
    """Check that HTML below max_token_limit comes back as one unsplit chunk.

    The single chunk is expected to carry the input verbatim as its text,
    the page URL under the "docs_url" metadata key, and "My Title" (the
    text of the fixture's <h1>) under the "title" metadata key.
    """
    # The fixture must be well-formed HTML: a "<" followed by a space is not
    # a start tag, so the paragraph is opened with a plain "<p>".
    html = "<html><body><h1>My Title</h1><p>This is a small test.</p></body></html>"
    chunks = chunk_html(html, "http://example.com/small", max_token_limit=100)

    # Small input: exactly one chunk, and its text is the untouched input.
    self.assertEqual(len(chunks), 1)
    self.assertEqual(chunks[0].text, html)

    # Metadata: URL without any anchor, plus the document title.
    self.assertEqual(chunks[0].metadata["docs_url"], "http://example.com/small")
    self.assertEqual(chunks[0].metadata["title"], "My Title")
41
42
42
43
def test_basic_splitting (self ):
43
44
"""Tests basic splitting of multiple paragraphs."""
@@ -153,24 +154,28 @@ def test_metadata_anchor_handling(self):
153
154
154
155
self .assertGreaterEqual (len (chunks ), 3 )
155
156
156
- self .assertEqual (chunks [0 ].metadata ["source" ], "http://example.com/meta" )
157
+ # The first chunk might not have a specific anchor if it's just the title
158
+ self .assertIn (chunks [0 ].metadata ["docs_url" ], ["http://example.com/meta" , "http://example.com/meta#intro" ])
159
+ self .assertEqual (chunks [0 ].metadata ["title" ], "Intro" )
157
160
158
161
topic1_chunks = [c for c in chunks if "Topic 1" in c .text or "Content 1" in c .text or "More content 1" in c .text ]
159
- self .assertTrue (all (c .metadata ["source " ] == "http://example.com/meta#topic1" for c in topic1_chunks ))
162
+ self .assertTrue (all (c .metadata ["docs_url " ] == "http://example.com/meta#topic1" for c in topic1_chunks ))
160
163
161
164
final_thoughts_chunk = next ((c for c in chunks if "Final words" in c .text ), None )
162
165
163
166
self .assertIsNotNone (final_thoughts_chunk , "Final thoughts chunk not found" )
164
167
165
- self .assertEqual (final_thoughts_chunk .metadata ["source" ], "http://example.com/meta#final-thoughts" )
168
+ self .assertEqual (final_thoughts_chunk .metadata ["docs_url" ], "http://example.com/meta#final-thoughts" )
169
+ self .assertEqual (final_thoughts_chunk .metadata ["title" ], "Intro" )
166
170
167
171
def test_no_anchor_found(self):
    """Check that docs_url carries no #anchor when the HTML has no id attributes.

    The fixture has an <h1> and two paragraphs, none with an id. With
    max_token_limit=15 the test expects two chunks, both pointing at the
    bare page URL, and the first chunk titled "No Anchor Title".
    """
    # The fixture must be well-formed HTML: a "<" followed by a space is not
    # a start tag, so both paragraphs are opened with a plain "<p>".
    html = (
        "<html><body><h1>No Anchor Title</h1>"
        "<p>Paragraph 1.</p><p>Paragraph 2.</p></body></html>"
    )
    chunks = chunk_html(html, "http://example.com/no-anchor", max_token_limit=15)

    self.assertEqual(len(chunks), 2)

    # No element has an id, so neither chunk URL may gain a "#..." fragment.
    self.assertEqual(chunks[0].metadata["docs_url"], "http://example.com/no-anchor")
    self.assertEqual(chunks[1].metadata["docs_url"], "http://example.com/no-anchor")

    self.assertEqual(chunks[0].metadata["title"], "No Anchor Title")
174
179
175
180
def test_empty_html (self ):
176
181
"""Tests that empty or minimal HTML does not cause errors."""
0 commit comments