Skip to content

Commit 9965966

Browse files
committed
Fix title bug; simplify & update test; add section_title metadata
1 parent 36530b6 commit 9965966

File tree

4 files changed: 90 additions and 84 deletions

4 files changed: 90 additions and 84 deletions

scripts/html_chunking/chunker.py

Lines changed: 34 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from bs4 import BeautifulSoup, Tag, NavigableString
1010
import warnings
1111

12-
from tokenizer import count_html_tokens
12+
from .tokenizer import count_html_tokens
1313

1414
# Constants
1515
DEFAULT_CHARS_PER_TOKEN_RATIO = 3.5
@@ -39,9 +39,9 @@ def find_first_anchor(chunk_soup: BeautifulSoup) -> Optional[str]:
3939

4040

4141
def get_document_title(soup: BeautifulSoup) -> str:
42-
"""Extracts the document title from the <h1> tag."""
43-
h1_tag = soup.find('h1')
44-
return h1_tag.get_text(strip=True) if h1_tag else "Untitled"
42+
"""Extracts the document title from the <title> tag."""
43+
title_tag = soup.find('title')
44+
return title_tag.get_text(strip=True) if title_tag else "Untitled"
4545

4646

4747
def chunk_html(
@@ -70,14 +70,18 @@ def chunk_html(
7070

7171
try:
7272
soup = BeautifulSoup(html_content, 'html.parser')
73-
doc_title = get_document_title(soup)
73+
document_title = get_document_title(soup)
7474

7575
if count_html_tokens(html_content, options.count_tag_tokens) <= options.max_token_limit:
76-
metadata = {"docs_url": source_url, "title": doc_title}
76+
metadata = {
77+
"docs_url": source_url,
78+
"title": document_title,
79+
"section_title": document_title
80+
}
7781
return [Chunk(text=html_content, metadata=metadata)]
7882
except Exception as e:
7983
warnings.warn("Could not pre-calculate total tokens: %s. Proceeding with chunking." % e)
80-
doc_title = "Untitled"
84+
document_title = "Untitled"
8185

8286
try:
8387
body = soup.body or soup
@@ -86,9 +90,11 @@ def chunk_html(
8690
warnings.warn("A critical error occurred during semantic chunking: %s. Falling back to linear splitting." % e)
8791
string_chunks = _linear_split(html_content, options)
8892

89-
# Post-process string chunks to add stateful anchor metadata
93+
# Post-process string chunks to add stateful anchor and title metadata
9094
final_chunks = []
9195
last_seen_anchor = None
96+
last_heading_text = document_title
97+
9298
for s_chunk in string_chunks:
9399
if not s_chunk.strip():
94100
continue
@@ -101,11 +107,29 @@ def chunk_html(
101107

102108
final_anchor = last_seen_anchor
103109

110+
chunk_headings = chunk_soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
111+
if chunk_headings:
112+
last_heading_text = chunk_headings[-1].get_text(strip=True)
113+
114+
section_title = last_heading_text
115+
104116
full_source_url = f"{source_url}#{final_anchor}" if final_anchor else source_url
105-
metadata = {"docs_url": full_source_url, "title": doc_title}
117+
metadata = {
118+
"docs_url": full_source_url,
119+
"title": document_title,
120+
"section_title": section_title
121+
}
106122
final_chunks.append(Chunk(text=s_chunk, metadata=metadata))
107123

108-
return final_chunks if final_chunks else [Chunk(text=html_content, metadata={"docs_url": source_url, "title": doc_title})]
124+
if not final_chunks:
125+
metadata = {
126+
"docs_url": source_url,
127+
"title": document_title,
128+
"section_title": document_title
129+
}
130+
return [Chunk(text=html_content, metadata=metadata)]
131+
132+
return final_chunks
109133

110134

111135
def _split_element_by_children(element: Tag, options: ChunkingOptions) -> List[str]:

scripts/html_chunking/html-stripper.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,9 +111,18 @@ def strip_html_content(
111111

112112
soup = BeautifulSoup(html_content, "html.parser")
113113

114+
# Extract title from the original document's <title> tag.
115+
title_tag = soup.find('title')
116+
title_text = title_tag.get_text(strip=True) if title_tag else "Untitled"
117+
114118
if strip_mode in ['sections', 'all']:
115119
body_content = soup.body or soup
116-
new_soup = BeautifulSoup("<html><body></body></html>", "html.parser")
120+
new_soup = BeautifulSoup("<html><head></head><body></body></html>", "html.parser")
121+
122+
if new_soup.head:
123+
new_title_tag = new_soup.new_tag("title")
124+
new_title_tag.string = title_text
125+
new_soup.head.append(new_title_tag)
117126

118127
chapters = body_content.find_all("section", class_="chapter")
119128
if not chapters:

scripts/html_chunking/test_chunker.py

Lines changed: 37 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -31,18 +31,19 @@ def mock_count_html_tokens(html_string, count_tag_tokens=True):
3131
@patch('html_chunking.chunker.count_html_tokens', new=mock_count_html_tokens)
3232
class TestHtmlChunker(unittest.TestCase):
3333

34-
def test_chunk_html_small_input(self):
34+
def test_small_input_no_chunking(self):
3535
"""Tests that HTML smaller than the max_token_limit is not chunked."""
36-
html = "<html><body><h1>My Title</h1><p>This is a small test.</p></body></html>"
36+
html = "<html><head><title>Test Title</title></head><body><p>This is a small test.</p></body></html>"
3737
chunks = chunk_html(html, "http://example.com/small", max_token_limit=100)
3838
self.assertEqual(len(chunks), 1)
3939
self.assertEqual(chunks[0].text, html)
4040
self.assertEqual(chunks[0].metadata["docs_url"], "http://example.com/small")
41-
self.assertEqual(chunks[0].metadata["title"], "My Title")
41+
self.assertEqual(chunks[0].metadata["title"], "Test Title")
42+
self.assertEqual(chunks[0].metadata["section_title"], "Test Title")
4243

4344
def test_basic_splitting(self):
4445
"""Tests basic splitting of multiple paragraphs."""
45-
html = "<html><body>"
46+
html = "<html><head><title>Basic Splitting</title></head><body>"
4647
for i in range(10):
4748
html += f"<p>This is paragraph {i}. It contains several words to simulate content.</p>"
4849
html += "</body></html>"
@@ -51,131 +52,96 @@ def test_basic_splitting(self):
5152
self.assertTrue(all(mock_count_html_tokens(c.text) <= 110 for c in chunks))
5253
self.assertIn("paragraph 0", chunks[0].text)
5354
self.assertIn("paragraph 9", chunks[-1].text)
55+
self.assertEqual(chunks[0].metadata["title"], "Basic Splitting")
5456

5557
def test_oversized_element_splitting(self):
5658
"""Tests that a single element larger than the limit is recursively split."""
5759
long_text = "word " * 200
58-
html = f"<html><body><div>{long_text}</div></body></html>"
60+
html = f"<html><head><title>Oversized</title></head><body><div>{long_text}</div></body></html>"
5961
chunks = chunk_html(html, "http://example.com/oversized", max_token_limit=100)
6062
self.assertGreater(len(chunks), 1)
6163
full_text = "".join(BeautifulSoup(c.text, 'html.parser').get_text() for c in chunks)
6264
self.assertIn("word", full_text)
6365
self.assertGreater(len(full_text), 500)
66+
self.assertEqual(chunks[0].metadata["title"], "Oversized")
6467

6568
def test_table_splitting(self):
6669
"""Tests that large tables are split, preserving the header in each chunk."""
6770
header = "<thead><tr><th>Header 1</th><th>Header 2</th></tr></thead>"
6871
rows = "".join([f"<tr><td>Row {i} Col 1</td><td>Row {i} Col 2</td></tr>" for i in range(20)])
69-
html = f"<html><body><table>{header}<tbody>{rows}</tbody></table></body></html>"
72+
html = f"<html><head><title>Table Test</title></head><body><table>{header}<tbody>{rows}</tbody></table></body></html>"
7073
chunks = chunk_html(html, "http://example.com/table", max_token_limit=100)
7174
self.assertGreater(len(chunks), 1)
7275
for chunk in chunks:
7376
self.assertIn("<thead>", chunk.text)
7477
self.assertIn("Header 1", chunk.text)
7578
self.assertIn("</table>", chunk.text)
79+
self.assertEqual(chunk.metadata["title"], "Table Test")
7680
self.assertIn("Row 0", chunks[0].text)
7781
self.assertNotIn("Row 19", chunks[0].text)
7882
self.assertIn("Row 19", chunks[-1].text)
7983

8084
def test_list_splitting(self):
8185
"""Tests that large lists are split correctly."""
8286
items = "".join([f"<li>Item {i} is here.</li>" for i in range(30)])
83-
html = f"<html><body><ul>{items}</ul></body></html>"
87+
html = f"<html><head><title>List Test</title></head><body><ul>{items}</ul></body></html>"
8488
chunks = chunk_html(html, "http://example.com/list", max_token_limit=100)
8589
self.assertGreater(len(chunks), 1)
8690
for chunk in chunks:
8791
self.assertIn("<ul ", chunk.text)
8892
self.assertIn("</ul>", chunk.text)
93+
self.assertEqual(chunk.metadata["title"], "List Test")
8994
self.assertIn("Item 0", chunks[0].text)
9095
self.assertIn("Item 29", chunks[-1].text)
9196

92-
def test_definition_list_splitting(self):
93-
"""Tests splitting of a definition list."""
94-
items = "".join([f"<dt>Term {i}</dt><dd>Definition {i} is quite long and elaborate.</dd>" for i in range(15)])
95-
html = f"<html><body><div class='variablelist'><dl>{items}</dl></div></body></html>"
96-
chunks = chunk_html(html, "http://example.com/dl", max_token_limit=100)
97-
self.assertGreater(len(chunks), 1)
98-
for chunk in chunks:
99-
self.assertIn("<dl>", chunk.text)
100-
self.assertIn("</dl>", chunk.text)
101-
self.assertIn("Term 0", chunks[0].text)
102-
self.assertIn("Term 14", chunks[-1].text)
103-
104-
def test_code_splitting(self):
105-
"""Tests that preformatted code blocks are split by lines."""
106-
code_lines = "\n".join([f"line_{i} = 'some code here';" for i in range(50)])
107-
html = f"<html><body><pre>{code_lines}</pre></body></html>"
108-
chunks = chunk_html(html, "http://example.com/code", max_token_limit=50)
109-
self.assertGreater(len(chunks), 1)
110-
for chunk in chunks:
111-
self.assertIn("<pre ", chunk.text)
112-
self.assertIn("</pre>", chunk.text)
113-
self.assertIn("line_0", chunks[0].text)
114-
self.assertIn("line_49", chunks[-1].text)
115-
self.assertNotIn("line_49", chunks[0].text)
116-
117-
def test_heading_grouping(self):
118-
"""Tests that headings are grouped with the following element."""
119-
html = "<html><body>"
120-
for i in range(5):
121-
html += f"<h2>Title {i}</h2><p>This is paragraph for title {i}. It has text.</p>"
122-
html += "</body></html>"
123-
chunks = chunk_html(html, "http://example.com/headings", max_token_limit=50)
124-
self.assertEqual(len(chunks), 5)
125-
for i, chunk in enumerate(chunks):
126-
self.assertIn(f"Title {i}", chunk.text)
127-
self.assertIn(f"paragraph for title {i}", chunk.text)
128-
129-
def test_paragraph_ending_with_colon_grouping(self):
130-
"""Tests grouping of a paragraph ending with a colon with the next list/table."""
131-
html = ("<html><body><p>Here are the items:</p>"
132-
"<ul><li>Item 1</li><li>Item 2</li></ul></body></html>")
133-
chunks = chunk_html(html, "http://example.com/colon", max_token_limit=100)
134-
self.assertEqual(len(chunks), 1)
135-
self.assertIn("Here are the items:", chunks[0].text)
136-
self.assertIn("<li>Item 1</li>", chunks[0].text)
137-
138-
def test_metadata_anchor_handling(self):
139-
"""Tests the generation of source metadata with correct anchors."""
97+
def test_metadata_and_section_titles(self):
98+
"""Tests the generation of metadata with correct anchors and section titles."""
14099
html = """
141-
<html><body>
142-
<section id="intro"><h1>Intro</h1><p>Text</p></section>
100+
<html><head><title>Main Document Title</title></head><body>
101+
<section id="intro"><h1>Introduction</h1><p>Text about intro.</p></section>
143102
<div id="main-content">
144103
<h2 id="topic1">Topic 1</h2><p>Content 1</p>
145-
<p>More content 1</p>
104+
<p>More content 1, still under Topic 1.</p>
146105
</div>
147106
<section id="conclusion">
148-
<p>Conclusion text</p>
107+
<p>Conclusion text, still under Topic 1 technically.</p>
149108
<h3 id="final-thoughts">Final Thoughts</h3><p>Final words.</p>
150109
</section>
151110
</body></html>
152111
"""
153112
chunks = chunk_html(html, "http://example.com/meta", max_token_limit=25)
154113

155-
self.assertGreaterEqual(len(chunks), 3)
114+
self.assertGreaterEqual(len(chunks), 4)
156115

157-
# The first chunk might not have a specific anchor if it's just the title
158-
self.assertIn(chunks[0].metadata["docs_url"], ["http://example.com/meta", "http://example.com/meta#intro"])
159-
self.assertEqual(chunks[0].metadata["title"], "Intro")
116+
# Check document title consistency
117+
for chunk in chunks:
118+
self.assertEqual(chunk.metadata["title"], "Main Document Title")
119+
120+
# Check section titles and anchors
121+
intro_chunk = next(c for c in chunks if "Introduction" in c.text)
122+
self.assertIn(intro_chunk.metadata["docs_url"], ["http://example.com/meta#intro", "http://example.com/meta"])
123+
self.assertEqual(intro_chunk.metadata["section_title"], "Introduction")
160124

161-
topic1_chunks = [c for c in chunks if "Topic 1" in c.text or "Content 1" in c.text or "More content 1" in c.text]
125+
topic1_chunks = [c for c in chunks if "Topic 1" in c.text or "Content 1" in c.text]
162126
self.assertTrue(all(c.metadata["docs_url"] == "http://example.com/meta#topic1" for c in topic1_chunks))
163-
164-
final_thoughts_chunk = next((c for c in chunks if "Final words" in c.text), None)
165-
166-
self.assertIsNotNone(final_thoughts_chunk, "Final thoughts chunk not found")
167-
127+
self.assertTrue(all(c.metadata["section_title"] == "Topic 1" for c in topic1_chunks))
128+
129+
conclusion_chunk = next(c for c in chunks if "Conclusion text" in c.text)
130+
self.assertEqual(conclusion_chunk.metadata["section_title"], "Topic 1") # Inherited from previous heading
131+
132+
final_thoughts_chunk = next(c for c in chunks if "Final words" in c.text)
168133
self.assertEqual(final_thoughts_chunk.metadata["docs_url"], "http://example.com/meta#final-thoughts")
169-
self.assertEqual(final_thoughts_chunk.metadata["title"], "Intro")
134+
self.assertEqual(final_thoughts_chunk.metadata["section_title"], "Final Thoughts")
170135

171136
def test_no_anchor_found(self):
172137
"""Tests that the source URL has no anchor if no IDs are present."""
173-
html = "<html><body><h1>No Anchor Title</h1><p>Paragraph 1.</p><p>Paragraph 2.</p></body></html>"
138+
html = "<html><head><title>No Anchor Title</title></head><body><p>Paragraph 1.</p><p>Paragraph 2.</p></body></html>"
174139
chunks = chunk_html(html, "http://example.com/no-anchor", max_token_limit=15)
175140
self.assertEqual(len(chunks), 2)
176141
self.assertEqual(chunks[0].metadata["docs_url"], "http://example.com/no-anchor")
177142
self.assertEqual(chunks[1].metadata["docs_url"], "http://example.com/no-anchor")
178143
self.assertEqual(chunks[0].metadata["title"], "No Anchor Title")
144+
self.assertEqual(chunks[0].metadata["section_title"], "No Anchor Title")
179145

180146
def test_empty_html(self):
181147
"""Tests that empty or minimal HTML does not cause errors."""

scripts/html_embeddings/chunk_html.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import logging
77
import re
88
import sys
9+
from bs4 import BeautifulSoup
910
from pathlib import Path
1011
from typing import Dict, List, Any, Optional
1112
from urllib.parse import urlparse
@@ -88,7 +89,7 @@ def chunk_html_documents(
8889
success, chunk_count = chunk_single_html_file(
8990
input_file=html_file.resolve(),
9091
output_dir=doc_specific_output_dir, # Pass the new doc-specific dir
91-
input_base_dir=base_dir_for_relative_paths, # Pass the consistent version-level base path
92+
input_base_dir=base_dir_for_relative_paths.resolve(), # Pass the consistent version-level base path
9293
source_url=source_url,
9394
max_token_limit=max_token_limit,
9495
count_tag_tokens=count_tag_tokens,
@@ -182,15 +183,21 @@ def chunk_single_html_file(
182183

183184
chunk_count = 0
184185
for i, chunk_obj in enumerate(chunks):
186+
chunker_metadata = chunk_obj.metadata or {}
185187
chunk_metadata = {
186188
**base_metadata,
187-
**chunk_obj.metadata,
189+
"docs_url": chunker_metadata.get("docs_url"),
190+
"title": chunker_metadata.get("title"),
191+
"section_title": chunker_metadata.get("section_title"),
188192
"chunk_index": i,
189193
"total_chunks": len(chunks),
190194
"token_count": count_html_tokens(chunk_obj.text, count_tag_tokens),
191195
"source_file": str(relative_path),
192196
}
193197

198+
# Filter out any keys that have None values to keep the JSON clean
199+
chunk_metadata = {k: v for k, v in chunk_metadata.items() if v is not None}
200+
194201
chunk_data = {
195202
"id": f"{base_metadata['doc_id']}_chunk_{i:04d}",
196203
"content": chunk_obj.text,

0 commit comments

Comments
 (0)