Skip to content

Commit 80dd461

Browse files
Merge pull request #439 from max-svistunov/lcore-307-update-chunk-metadata
LCORE-307 Add docs_url and title metadata to chunks
2 parents cf43a1c + 1afc52b commit 80dd461

File tree

4 files changed

+31
-15
lines changed

4 files changed

+31
-15
lines changed

scripts/html_chunking/chunker.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,12 @@ def find_first_anchor(chunk_soup: BeautifulSoup) -> Optional[str]:
3838
return None
3939

4040

41+
def get_document_title(soup: BeautifulSoup) -> str:
    """Extracts the document title from the first <h1> tag.

    Args:
        soup: Parsed document to search for a top-level heading.

    Returns:
        The heading text with runs of whitespace collapsed to single
        spaces, or "Untitled" when there is no <h1> or it has no text.
    """
    h1_tag = soup.find('h1')
    if h1_tag is None:
        return "Untitled"
    # get_text(strip=True) strips every text node and then joins them with
    # an empty separator, so "<h1>Hello <b>World</b></h1>" would come out as
    # "HelloWorld". Take the raw text and normalise whitespace instead, which
    # keeps word boundaries without inserting spaces inside split words.
    title = " ".join(h1_tag.get_text().split())
    # An empty heading is as uninformative as a missing one.
    return title or "Untitled"
45+
46+
4147
def chunk_html(
4248
html_content: str,
4349
source_url: str,
@@ -63,13 +69,17 @@ def chunk_html(
6369
)
6470

6571
try:
72+
soup = BeautifulSoup(html_content, 'html.parser')
73+
doc_title = get_document_title(soup)
74+
6675
if count_html_tokens(html_content, options.count_tag_tokens) <= options.max_token_limit:
67-
return [Chunk(text=html_content, metadata={"source": source_url})]
76+
metadata = {"docs_url": source_url, "title": doc_title}
77+
return [Chunk(text=html_content, metadata=metadata)]
6878
except Exception as e:
6979
warnings.warn("Could not pre-calculate total tokens: %s. Proceeding with chunking." % e)
80+
doc_title = "Untitled"
7081

7182
try:
72-
soup = BeautifulSoup(html_content, 'html.parser')
7383
body = soup.body or soup
7484
string_chunks = _split_element_by_children(body, options)
7585
except Exception as e:
@@ -92,10 +102,10 @@ def chunk_html(
92102
final_anchor = last_seen_anchor
93103

94104
full_source_url = f"{source_url}#{final_anchor}" if final_anchor else source_url
95-
metadata = {"source": full_source_url}
105+
metadata = {"docs_url": full_source_url, "title": doc_title}
96106
final_chunks.append(Chunk(text=s_chunk, metadata=metadata))
97107

98-
return final_chunks if final_chunks else [Chunk(text=html_content, metadata={"source": source_url})]
108+
return final_chunks if final_chunks else [Chunk(text=html_content, metadata={"docs_url": source_url, "title": doc_title})]
99109

100110

101111
def _split_element_by_children(element: Tag, options: ChunkingOptions) -> List[str]:

scripts/html_chunking/example.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ def generate_html_report(output_path: str, chunks: List['Chunk'], original_token
7272

7373
style = " style='background-color:#FFE0E0;'" if token_count > max_token_limit else ""
7474
f.write(f'<div class="chunk-header"{style}>Chunk {i} ({token_count} tokens)</div>\n')
75-
f.write(f'<div class="chunk-meta"><strong>Source:</strong> {chunk.metadata.get("source", "N/A")}</div>\n')
75+
f.write(f'<div class="chunk-meta"><strong>Title:</strong> {chunk.metadata.get("title", "N/A")}<br><strong>Source:</strong> {chunk.metadata.get("docs_url", "N/A")}</div>\n')
7676
f.write('<div class="chunk-content">\n')
7777
f.write(chunk.text)
7878
f.write('\n</div>\n')

scripts/html_chunking/test_chunker.py

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,12 @@ class TestHtmlChunker(unittest.TestCase):
3333

3434
def test_chunk_html_small_input(self):
3535
"""Tests that HTML smaller than the max_token_limit is not chunked."""
36-
html = "<html><body><p>This is a small test.</p></body></html>"
36+
html = "<html><body><h1>My Title</h1><p>This is a small test.</p></body></html>"
3737
chunks = chunk_html(html, "http://example.com/small", max_token_limit=100)
3838
self.assertEqual(len(chunks), 1)
3939
self.assertEqual(chunks[0].text, html)
40-
self.assertEqual(chunks[0].metadata["source"], "http://example.com/small")
40+
self.assertEqual(chunks[0].metadata["docs_url"], "http://example.com/small")
41+
self.assertEqual(chunks[0].metadata["title"], "My Title")
4142

4243
def test_basic_splitting(self):
4344
"""Tests basic splitting of multiple paragraphs."""
@@ -153,24 +154,28 @@ def test_metadata_anchor_handling(self):
153154

154155
self.assertGreaterEqual(len(chunks), 3)
155156

156-
self.assertEqual(chunks[0].metadata["source"], "http://example.com/meta")
157+
# The first chunk might not have a specific anchor if it's just the title
158+
self.assertIn(chunks[0].metadata["docs_url"], ["http://example.com/meta", "http://example.com/meta#intro"])
159+
self.assertEqual(chunks[0].metadata["title"], "Intro")
157160

158161
topic1_chunks = [c for c in chunks if "Topic 1" in c.text or "Content 1" in c.text or "More content 1" in c.text]
159-
self.assertTrue(all(c.metadata["source"] == "http://example.com/meta#topic1" for c in topic1_chunks))
162+
self.assertTrue(all(c.metadata["docs_url"] == "http://example.com/meta#topic1" for c in topic1_chunks))
160163

161164
final_thoughts_chunk = next((c for c in chunks if "Final words" in c.text), None)
162165

163166
self.assertIsNotNone(final_thoughts_chunk, "Final thoughts chunk not found")
164167

165-
self.assertEqual(final_thoughts_chunk.metadata["source"], "http://example.com/meta#final-thoughts")
168+
self.assertEqual(final_thoughts_chunk.metadata["docs_url"], "http://example.com/meta#final-thoughts")
169+
self.assertEqual(final_thoughts_chunk.metadata["title"], "Intro")
166170

167171
def test_no_anchor_found(self):
168172
"""Tests that the source URL has no anchor if no IDs are present."""
169-
html = "<html><body><p>Paragraph 1.</p><p>Paragraph 2.</p></body></html>"
173+
html = "<html><body><h1>No Anchor Title</h1><p>Paragraph 1.</p><p>Paragraph 2.</p></body></html>"
170174
chunks = chunk_html(html, "http://example.com/no-anchor", max_token_limit=15)
171175
self.assertEqual(len(chunks), 2)
172-
self.assertEqual(chunks[0].metadata["source"], "http://example.com/no-anchor")
173-
self.assertEqual(chunks[1].metadata["source"], "http://example.com/no-anchor")
176+
self.assertEqual(chunks[0].metadata["docs_url"], "http://example.com/no-anchor")
177+
self.assertEqual(chunks[1].metadata["docs_url"], "http://example.com/no-anchor")
178+
self.assertEqual(chunks[0].metadata["title"], "No Anchor Title")
174179

175180
def test_empty_html(self):
176181
"""Tests that empty or minimal HTML does not cause errors."""

scripts/html_embeddings/README.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -174,11 +174,12 @@ Chunks are saved as JSON files with the following structure:
174174
"version": "4.18",
175175
"file_path": "monitoring/index.html",
176176
"doc_type": "openshift_documentation",
177-
"source": "https://docs.redhat.com/en/documentation/openshift_container_platform/4.18/html-single/monitoring/",
177+
"title": "Monitoring Guide",
178+
"docs_url": "https://docs.redhat.com/en/documentation/openshift_container_platform/4.18/html-single/monitoring/",
178179
"chunk_index": 1,
179180
"total_chunks": 45,
180181
"token_count": 375,
181-
"source_file": "monitoring/index.html",
182+
"source_file": "monitoring/index.html"
182183
}
183184
}
184185
```

0 commit comments

Comments
 (0)