Skip to content

Commit d20c216

Browse files
committed
Add semantic HTML tag style extraction support
Track semantic tags (b, strong, i, em, u, s, del, strike) and create synthetic style nodes for style_callback. This allows screen readers and other accessibility tools to detect text styling from semantic tags.

- Add SEMANTIC_STYLES constant mapping tags to CSS properties
- Track semantic tag stack during parsing
- Create mock elements with style attributes for callback
- Handle nested semantic tags correctly
- Add comprehensive test suite (13 tests)
1 parent fef66a4 commit d20c216

File tree

2 files changed

+324
-1
lines changed

2 files changed

+324
-1
lines changed

html_to_text.py

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import posixpath
77
import re
88
import sys
9-
from typing import Callable, Dict, Optional, Union
9+
from typing import Callable, Dict, Optional, Union, Any
1010

1111
from urllib.parse import unquote
1212

@@ -82,6 +82,17 @@ class HTMLParser(LXMLParser):
8282
whitespace_re = re.compile(r"\s+")
8383
_block = ("p", "div", "center", "blockquote")
8484
heading_levels = {"h1": 1, "h2": 2, "h3": 3, "h4": 4, "h5": 5, "h6": 6}
85+
# Semantic HTML tags that imply styling
86+
SEMANTIC_STYLES = {
87+
'b': {'font-weight': 'bold'},
88+
'strong': {'font-weight': 'bold'},
89+
'i': {'font-style': 'italic'},
90+
'em': {'font-style': 'italic'},
91+
'u': {'text-decoration': 'underline'},
92+
's': {'text-decoration': 'line-through'},
93+
'strike': {'text-decoration': 'line-through'},
94+
'del': {'text-decoration': 'line-through'},
95+
}
8596

8697
def __init__(
8798
self,
@@ -111,6 +122,7 @@ def __init__(
111122
self.last_start = ""
112123
self.element_stack: list[tuple[_Element, int]] = [] # Track (element, start_pos)
113124
self.link_start = 0
125+
self.semantic_style_stack: list[dict[str, Any]] = [] # Track active semantic styles
114126

115127
# Set up state machine using enum objects directly
116128
states = list(ContentState)
@@ -350,6 +362,19 @@ def handle_starttag(self, tag: str, attrs: _Attrib) -> None: # type: ignore[ove
350362
attrs=dict(attrs),
351363
)
352364
self.table_stack.append(node)
365+
# Track semantic tags for style extraction
366+
if tag in self.SEMANTIC_STYLES and self.style_callback:
367+
start_pos = (
368+
self.output.tell()
369+
+ self.startpos
370+
+ (len(self.add) if not self.is_starting else 0)
371+
+ (1 if self.final_space else 0)
372+
)
373+
self.semantic_style_stack.append({
374+
'tag': tag,
375+
'start': start_pos,
376+
'styles': self.SEMANTIC_STYLES[tag].copy()
377+
})
353378

354379
def handle_endtag(self, tag: str, item: _Element) -> None: # type: ignore[override]
355380
if "class" in item.attrib and item.attrib["class"] == "pagenum":
@@ -393,6 +418,32 @@ def handle_endtag(self, tag: str, item: _Element) -> None: # type: ignore[overr
393418
end_pos = self.output.tell() + self.startpos
394419
self.style_callback(item, start_pos, end_pos)
395420

421+
# Process semantic tags for style extraction
422+
if tag in self.SEMANTIC_STYLES and self.style_callback and self.semantic_style_stack:
423+
# Find matching tag on stack (handle nesting - pop most recent matching tag)
424+
for i in range(len(self.semantic_style_stack) - 1, -1, -1):
425+
if self.semantic_style_stack[i]['tag'] == tag:
426+
style_info = self.semantic_style_stack.pop(i)
427+
end_pos = self.output.tell() + self.startpos
428+
429+
# Only create style node if there's actual content
430+
if end_pos > style_info['start']:
431+
# Create a mock element with style attribute for the callback
432+
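# SEMANTIC_STYLES keys are already hyphenated CSS property names (e.g. font-weight); the replace() below only matters if an underscore-style key (font_weight) is ever added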
433+
css_properties = []
434+
for prop_key, prop_value in style_info['styles'].items():
435+
css_prop = prop_key.replace('_', '-')
436+
css_properties.append(f"{css_prop}: {prop_value}")
437+
style_str = '; '.join(css_properties)
438+
439+
# Create mock element and set style attribute
440+
mock_element = lxml.etree.Element(tag)
441+
mock_element.set('style', style_str)
442+
443+
# Call style callback with mock element
444+
self.style_callback(mock_element, style_info['start'], end_pos)
445+
break
446+
396447
self.last_start = tag
397448

398449
def handle_data(self, data: str, start_tag: Optional[str]) -> None: # type: ignore[override]

tests/test_semantic_tags.py

Lines changed: 272 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,272 @@
1+
"""Tests for semantic HTML tag style extraction."""
2+
import pytest
3+
from html_to_text import html_to_text
4+
from lxml import etree
5+
6+
7+
def test_bold_tag_creates_style():
    """Check that a ``<b>`` element yields exactly one bold style node.

    The callback records the tag name, style string, and offsets of every
    element it receives so the assertions can inspect them afterwards.
    """
    html = '<html><body><p>This is <b>bold</b> text</p></body></html>'
    styles_found = []

    def style_callback(element, start, end):
        styles_found.append({
            'tag': element.tag,
            # Default to '' so a style-less element fails the assertion
            # below instead of raising TypeError on ``in None``.
            'style': element.get('style', ''),
            'start': start,
            'end': end,
        })

    # Only the callback side effects are checked; the returned text is unused.
    html_to_text(html, style_callback=style_callback)

    # Exactly one style node is expected, and it must come from <b>.
    assert len(styles_found) == 1
    assert styles_found[0]['tag'] == 'b'
    assert 'font-weight: bold' in styles_found[0]['style']
28+
29+
30+
def test_italic_tag_creates_style():
    """Check that an ``<i>`` element yields exactly one italic style node."""
    html = '<html><body><p>This is <i>italic</i> text</p></body></html>'
    styles_found = []

    def style_callback(element, start, end):
        # Only the style string is asserted on, so that is all we record.
        # Default to '' so a missing attribute fails the assertion cleanly.
        styles_found.append(element.get('style', ''))

    # Only the callback side effects are checked; the returned text is unused.
    html_to_text(html, style_callback=style_callback)

    assert len(styles_found) == 1
    assert 'font-style: italic' in styles_found[0]
49+
50+
51+
def test_em_tag_creates_style():
    """Check that an ``<em>`` element yields exactly one italic style node."""
    html = '<html><body><p><em>Emphasis</em> text</p></body></html>'
    styles_found = []

    def style_callback(element, start, end):
        # Default to '' so a missing attribute fails the assertion cleanly
        # rather than raising TypeError on ``in None``.
        styles_found.append(element.get('style', ''))

    # Only the callback side effects are checked; the returned text is unused.
    html_to_text(html, style_callback=style_callback)

    assert len(styles_found) == 1
    assert 'font-style: italic' in styles_found[0]
64+
65+
66+
def test_strong_tag_creates_style():
    """Check that a ``<strong>`` element yields exactly one bold style node."""
    html = '<html><body><p><strong>Strong</strong> text</p></body></html>'
    styles_found = []

    def style_callback(element, start, end):
        # Default to '' so a missing attribute fails the assertion cleanly
        # rather than raising TypeError on ``in None``.
        styles_found.append(element.get('style', ''))

    # Only the callback side effects are checked; the returned text is unused.
    html_to_text(html, style_callback=style_callback)

    assert len(styles_found) == 1
    assert 'font-weight: bold' in styles_found[0]
79+
80+
81+
def test_underline_tag_creates_style():
    """Check that a ``<u>`` element yields exactly one underline style node."""
    html = '<html><body><p><u>Underlined</u> text</p></body></html>'
    styles_found = []

    def style_callback(element, start, end):
        # Default to '' so a missing attribute fails the assertion cleanly
        # rather than raising TypeError on ``in None``.
        styles_found.append(element.get('style', ''))

    # Only the callback side effects are checked; the returned text is unused.
    html_to_text(html, style_callback=style_callback)

    assert len(styles_found) == 1
    assert 'text-decoration: underline' in styles_found[0]
94+
95+
96+
def test_nested_semantic_tags():
    """Check that nested semantic tags produce overlapping style nodes.

    ``<i>`` is nested inside ``<b>``, so one bold and one italic node are
    expected, with the italic range lying inside the bold range.
    """
    html = '<html><body><p>Text with <b>bold and <i>bold italic</i></b></p></body></html>'
    styles_found = []

    def style_callback(element, start, end):
        styles_found.append({
            'tag': element.tag,
            # Default to '' so the substring filters below cannot raise
            # TypeError if an element without a style attribute shows up.
            'style': element.get('style', ''),
            'start': start,
            'end': end,
        })

    # Only the callback side effects are checked; the returned text is unused.
    html_to_text(html, style_callback=style_callback)

    # One node for <b> and one for <i>.
    assert len(styles_found) == 2

    bold_styles = [s for s in styles_found if 'font-weight: bold' in s['style']]
    italic_styles = [s for s in styles_found if 'font-style: italic' in s['style']]
    assert len(bold_styles) == 1
    assert len(italic_styles) == 1

    # The italic range must be contained within the bold range.
    bold, italic = bold_styles[0], italic_styles[0]
    assert italic['start'] >= bold['start']
    assert italic['end'] <= bold['end']
129+
130+
131+
def test_multiple_separate_semantic_tags():
    """Check that sibling semantic tags in one paragraph each get a node."""
    html = '<html><body><p><b>Bold</b> and <i>italic</i> and <u>underline</u></p></body></html>'
    tags = []

    def style_callback(element, start, end):
        # Only the tag names are asserted on, so that is all we record.
        tags.append(element.tag)

    # Only the callback side effects are checked; the returned text is unused.
    html_to_text(html, style_callback=style_callback)

    # Exactly three nodes, one per tag; sorting makes the check independent
    # of the order in which the callbacks fire.
    assert sorted(tags) == ['b', 'i', 'u']
152+
153+
154+
def test_semantic_tag_with_existing_style_attribute():
    """Check that a semantic tag with its own ``style`` attribute reports both.

    ``<b style="color: red">`` should trigger two callbacks: one for the
    element's inline style attribute and one for the synthetic style node
    derived from the semantic tag itself.
    """
    html = '<html><body><p><b style="color: red">Bold red</b></p></body></html>'
    styles_found = []

    def style_callback(element, start, end):
        styles_found.append({
            'tag': element.tag,
            # Default to '' so the join below cannot raise TypeError on None.
            'style': element.get('style', ''),
        })

    # Only the callback side effects are checked; the returned text is unused.
    html_to_text(html, style_callback=style_callback)

    # One callback from the inline style attribute on <b>, and one from the
    # semantic-tag tracking.
    assert len(styles_found) == 2

    # Between them the two nodes must carry both declarations.
    styles_str = ' '.join(s['style'] for s in styles_found)
    assert 'color: red' in styles_str
    assert 'font-weight: bold' in styles_str
177+
178+
179+
def test_empty_semantic_tag():
    """Check that an empty semantic tag does not produce a style node."""
    html = '<html><body><p><b></b>text</p></body></html>'
    styles_found = []

    def style_callback(element, start, end):
        styles_found.append({'tag': element.tag})

    # Only the callback side effects are checked; the returned text is unused.
    html_to_text(html, style_callback=style_callback)

    # An empty <b></b> covers no text, so no node should be reported.
    # Comparing against [] shows the unexpected entries on failure.
    assert styles_found == []
192+
193+
194+
def test_semantic_tags_positions():
    """Check that reported offsets slice the styled words out of the output."""
    html = '<html><body><p>Start <b>bold</b> end</p></body></html>'
    spans = []

    def style_callback(element, start, end):
        spans.append((start, end))

    text = html_to_text(html, style_callback=style_callback)

    # The converted paragraph, ignoring surrounding whitespace.
    assert text.strip() == "Start bold end"

    # One span is expected, and slicing the raw output with it yields "bold".
    assert len(spans) == 1
    begin, finish = spans[0]
    assert text[begin:finish] == "bold"
215+
216+
217+
def test_no_style_callback_no_processing():
    """Check that semantic tags convert without error when no callback is given."""
    markup = '<html><body><p><b>Bold</b> text</p></body></html>'

    # Conversion must succeed and produce the plain text.
    converted = html_to_text(markup)
    assert converted.strip() == "Bold text"
224+
225+
226+
def test_strikethrough_tags():
    """Check that ``<s>`` and ``<del>`` each yield a line-through style node."""
    html = '<html><body><p><s>Strike</s> <del>Deleted</del></p></body></html>'
    styles_found = []

    def style_callback(element, start, end):
        styles_found.append({
            'tag': element.tag,
            # Default to '' so a missing attribute fails the assertion cleanly
            # rather than raising TypeError on ``in None``.
            'style': element.get('style', ''),
        })

    # Only the callback side effects are checked; the returned text is unused.
    html_to_text(html, style_callback=style_callback)

    assert len(styles_found) == 2
    for style_info in styles_found:
        assert 'text-decoration: line-through' in style_info['style']
243+
244+
245+
def test_semantic_tags_with_callback():
    """Integration check: semantic tags and inline styles in one document.

    The document mixes a heading, semantic tags (``<b>``, ``<em>``, ``<i>``)
    and a ``<span>`` with an inline bold style. The callback keeps only
    nodes whose style string mentions bold or italic.
    """
    html = '''
    <html>
    <body>
    <h1>Test</h1>
    <p>This is a <b>test</b></p>
    <p><em>Emphasis</em> <i>italic</i></p>
    <p><span style="font-weight: bold">Bold using styles</span></p>
    </body>
    </html>
    '''
    nodes = []

    def callback(element, start, end):
        style = element.get('style', '')
        if 'bold' in style or 'italic' in style:
            nodes.append({
                'start': start,
                'end': end,
                'style': style,
            })

    # Only the callback side effects are checked; the returned text is unused.
    html_to_text(html, style_callback=callback)

    # Expect at least four matching nodes: <b>, <em>, <i>, and the <span>.
    assert len(nodes) >= 4

0 commit comments

Comments
 (0)