Skip to content

Commit 4e14676

Browse files
committed
Clean up code
1 parent 6449879 commit 4e14676

File tree

6 files changed

+48
-464
lines changed

6 files changed

+48
-464
lines changed

scripts/html_chunking/__init__.py

Lines changed: 13 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -13,26 +13,24 @@
1313
chunks = chunk_html(
1414
html_content,
1515
max_token_limit=500,
16-
count_tag_tokens=True,
17-
keep_siblings_together=True,
18-
prepend_parent_section_text=True
16+
count_tag_tokens=True
1917
)
2018
"""
2119

22-
from .parser import parse_html, HtmlSection, identify_special_sections
23-
from .tokenizer import count_tokens, count_html_tokens, set_custom_tokenizer, TokenCounter
24-
from .chunker import chunk_html, ChunkingOptions
20+
from .parser import HtmlSection, parse_html, identify_special_sections
21+
from .tokenizer import TokenCounter, count_html_tokens, count_tokens, set_custom_tokenizer
22+
from .chunker import ChunkingOptions, chunk_html
2523

2624
__all__ = [
27-
'chunk_html',
28-
'parse_html',
29-
'HtmlSection',
30-
'count_tokens',
31-
'count_html_tokens',
32-
'set_custom_tokenizer',
33-
'TokenCounter',
34-
'ChunkingOptions',
35-
'identify_special_sections'
25+
"chunk_html",
26+
"parse_html",
27+
"HtmlSection",
28+
"count_tokens",
29+
"count_html_tokens",
30+
"set_custom_tokenizer",
31+
"TokenCounter",
32+
"ChunkingOptions",
33+
"identify_special_sections",
3634
]
3735

3836
__version__ = '1.0.0'

scripts/html_chunking/chunker.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,17 +2,18 @@
22
HTML chunker module.
33
44
This module splits HTML content into chunks based on semantic boundaries.
5-
This version includes stateful anchor tracking for metadata generation.
65
"""
76

87
from typing import List, Dict, Any, Optional
98
from dataclasses import dataclass
109
from bs4 import BeautifulSoup, Tag, NavigableString
1110
import warnings
1211

13-
# This code assumes tokenizer.py is in the same directory or accessible in the path.
1412
from tokenizer import count_html_tokens
1513

14+
# Constants
15+
DEFAULT_CHARS_PER_TOKEN_RATIO = 3.5
16+
1617
@dataclass
1718
class ChunkingOptions:
1819
max_token_limit: int = 500
@@ -96,8 +97,6 @@ def chunk_html(
9697

9798
return final_chunks if final_chunks else [Chunk(text=html_content, metadata={"source": source_url})]
9899

99-
# --- HELPER FUNCTIONS ---
100-
# (The functions below are the same as the previous correct version)
101100

102101
def _split_element_by_children(element: Tag, options: ChunkingOptions) -> List[str]:
103102
chunks, current_chunk_elements, current_tokens = [], [], 0
@@ -285,5 +284,5 @@ def _split_code(pre_element: Tag, options: ChunkingOptions) -> List[str]:
285284

286285
def _linear_split(html_content: str, options: ChunkingOptions) -> List[str]:
287286
warnings.warn("Using linear character split as a fallback for an oversized, indivisible chunk.")
288-
chars_per_chunk = int(options.max_token_limit * 3.5)
287+
chars_per_chunk = int(options.max_token_limit * DEFAULT_CHARS_PER_TOKEN_RATIO)
289288
return [html_content[i:i + chars_per_chunk] for i in range(0, len(html_content), chars_per_chunk)]

scripts/html_chunking/html-stripper.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@
1515
from typing import List, Optional
1616
from bs4 import BeautifulSoup, Tag
1717

18+
# Constants
19+
MAX_UNWRAP_PASSES = 5
20+
1821

1922
def _aggressively_strip_tags_and_attributes(soup: BeautifulSoup, strip_links: bool) -> None:
2023
"""
@@ -28,7 +31,7 @@ def _aggressively_strip_tags_and_attributes(soup: BeautifulSoup, strip_links: bo
2831
tags_to_unwrap = [
2932
'div.titlepage', 'div.itemizedlist', 'div.variablelist',
3033
'div._additional-resources', 'span.strong', 'span.inlinemediaobject',
31-
'rh-table', 'colgroup', 'span' # Generalized span removal
34+
'rh-table', 'colgroup', 'span'
3235
]
3336
if strip_links:
3437
tags_to_unwrap.append('a')
@@ -38,7 +41,6 @@ def _aggressively_strip_tags_and_attributes(soup: BeautifulSoup, strip_links: bo
3841
try:
3942
tag.unwrap()
4043
except ValueError:
41-
# Ignore errors from trying to unwrap a tag that's already gone
4244
continue
4345

4446
# 2. Special transformation for <rh-alert>
@@ -76,7 +78,7 @@ def _aggressively_strip_tags_and_attributes(soup: BeautifulSoup, strip_links: bo
7678
del tag[attr]
7779

7880
# 4. Unwrap nested, attribute-less divs
79-
for _ in range(5): # Run multiple passes to handle deeply nested structures
81+
for _ in range(MAX_UNWRAP_PASSES): # Run multiple passes to handle deeply nested structures
8082
unwrapped_in_pass = False
8183
for tag in soup.find_all('div', attrs={}):
8284
child_elements = [c for c in tag.children if isinstance(c, Tag)]
@@ -127,7 +129,6 @@ def strip_html_content(
127129

128130
if preserve_path:
129131
try:
130-
# Find a common base to create a sensible relative path
131132
base_input_dir = os.path.abspath(os.path.dirname(input_file_path))
132133
rel_path = os.path.relpath(input_file_path, start=base_input_dir)
133134
except ValueError:

scripts/html_chunking/parser.py

Lines changed: 16 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,13 @@
77
import re
88
from dataclasses import dataclass, field
99

10+
# Constants
11+
MAX_SEARCH_DEPTH = 10
12+
MAX_PROCEDURE_SEARCH_DEPTH = 5
13+
MAX_CODE_BLOCKS = 100
14+
MAX_TABLES = 50
15+
MAX_TABLE_ROWS = 100
16+
1017

1118
@dataclass
1219
class HtmlSection:
@@ -85,7 +92,6 @@ def parse_html(html_content: str) -> Tuple[BeautifulSoup, HtmlSection]:
8592
section_stack = [root_section]
8693
current_section = root_section
8794

88-
# Find the body or use the soup itself if no body
8995
body = soup.body or soup
9096

9197
# First pass: identify all headings
@@ -99,7 +105,7 @@ def parse_html(html_content: str) -> Tuple[BeautifulSoup, HtmlSection]:
99105
all_headings.sort(key=lambda x: _get_element_position(soup, x[0]))
100106

101107
# Initialize section map to keep track of hierarchy
102-
section_map = {0: root_section} # Level 0 is the root
108+
section_map = {0: root_section}
103109

104110
# Create section hierarchy based on heading levels
105111
for heading, level in all_headings:
@@ -141,13 +147,11 @@ def parse_html(html_content: str) -> Tuple[BeautifulSoup, HtmlSection]:
141147
if not element or (isinstance(element, str) and not element.strip()):
142148
continue
143149

144-
# Check if this element is a heading that starts a new section
145150
is_section_start = False
146151
new_level = None
147152

148153
if isinstance(element, Tag) and element.name and re.match(r'h[1-6]$', element.name):
149154
level = int(element.name[1])
150-
# Find corresponding section
151155
for section in _flatten_sections(root_section):
152156
if section.heading_tag and section.heading_tag == element:
153157
is_section_start = True
@@ -156,10 +160,8 @@ def parse_html(html_content: str) -> Tuple[BeautifulSoup, HtmlSection]:
156160
break
157161

158162
if not is_section_start:
159-
# Add content to current section
160163
current_section.add_content(element)
161164
else:
162-
# No headings found, add all content to root section
163165
for element in body.children:
164166
if element:
165167
root_section.add_content(element)
@@ -170,7 +172,6 @@ def parse_html(html_content: str) -> Tuple[BeautifulSoup, HtmlSection]:
170172
soup = BeautifulSoup(html_content, 'html.parser')
171173
root_section = HtmlSection()
172174

173-
# Simple fallback: just add all content to the root section
174175
for element in soup.children:
175176
if element:
176177
root_section.add_content(element)
@@ -230,7 +231,6 @@ def identify_special_sections(soup: BeautifulSoup) -> Dict[str, List[Dict]]:
230231

231232
return special_sections
232233
except Exception as e:
233-
# Return empty sections if identification fails
234234
return {
235235
'procedures': [],
236236
'code_blocks': [],
@@ -259,57 +259,46 @@ def identify_procedure_sections(soup: BeautifulSoup) -> List[Dict]:
259259

260260
try:
261261
# Multiple ways to identify procedures
262-
# 1. Look for elements containing the word "Procedure"
263262
procedure_markers = []
264263
for element in soup.find_all(string=lambda text: text and "Procedure" in text):
265264
if element.parent and element.parent.name not in ('script', 'style'):
266265
procedure_markers.append(element)
267266

268-
# 2. Look for ordered lists that might be procedures
269267
ordered_lists = soup.find_all('ol')
270268

271-
# Track processed lists to avoid duplicates
272269
processed_lists = set()
273270

274-
# Process explicit procedure markers first
275271
for marker in procedure_markers:
276272
if not marker or not marker.parent:
277273
continue
278274

279-
# Find the nearest ordered list after the marker
280275
ol = None
281276
current = marker.parent
282277
search_depth = 0
283278

284-
# Search forward for an ordered list
285-
while current and search_depth < 5:
279+
while current and search_depth < MAX_PROCEDURE_SEARCH_DEPTH:
286280
search_depth += 1
287281
if current.name == 'ol':
288282
ol = current
289283
break
290284

291-
# Check next siblings
292285
next_sibling = current.find_next_sibling()
293286
if next_sibling and next_sibling.name == 'ol':
294287
ol = next_sibling
295288
break
296289

297-
# Check children
298290
ol_in_children = current.find('ol')
299291
if ol_in_children:
300292
ol = ol_in_children
301293
break
302294

303-
# Move to next element
304295
current = current.find_next()
305296

306297
if not ol or id(ol) in processed_lists:
307298
continue
308299

309-
# Find heading for this procedure
310300
heading = _find_closest_heading(marker.parent)
311301

312-
# Find elements between heading and procedure
313302
intro = []
314303
if heading:
315304
current = heading.find_next()
@@ -318,14 +307,12 @@ def identify_procedure_sections(soup: BeautifulSoup) -> List[Dict]:
318307
intro.append(current)
319308
current = current.find_next()
320309

321-
# Check for prerequisites section
322310
prerequisites = None
323311
for element in intro:
324312
if isinstance(element, Tag) and element.get_text() and "Prerequisites" in element.get_text():
325313
prerequisites = element
326314
break
327315

328-
# Add to procedures
329316
procedures.append({
330317
'heading': heading,
331318
'intro': intro,
@@ -334,16 +321,13 @@ def identify_procedure_sections(soup: BeautifulSoup) -> List[Dict]:
334321
'steps': ol
335322
})
336323

337-
# Mark as processed
338324
processed_lists.add(id(ol))
339325

340-
# Process remaining ordered lists that might be procedures
341326
for ol in ordered_lists:
342327
if id(ol) in processed_lists:
343328
continue
344329

345-
# Check if this has procedure-like structure:
346-
# 1. Has list items with paragraphs or code blocks
330+
# Check if this has procedure-like structure
347331
has_structure = False
348332
for li in ol.find_all('li', recursive=False):
349333
if li.find('p') or li.find('pre') or li.find('code'):
@@ -415,7 +399,7 @@ def _find_closest_heading(element: Tag) -> Optional[Tag]:
415399
current = element
416400
search_depth = 0
417401

418-
while current and search_depth < 10:
402+
while current and search_depth < MAX_SEARCH_DEPTH:
419403
search_depth += 1
420404
current = current.previous_sibling
421405

@@ -445,8 +429,8 @@ def identify_code_blocks(soup: BeautifulSoup) -> List[Dict]:
445429

446430
try:
447431
# Find all code blocks
448-
pre_tags = soup.find_all('pre', limit=100) # Limit to prevent excessive processing
449-
code_tags = soup.find_all('code', limit=100)
432+
pre_tags = soup.find_all('pre', limit=MAX_CODE_BLOCKS) # Limit to prevent excessive processing
433+
code_tags = soup.find_all('code', limit=MAX_CODE_BLOCKS)
450434

451435
# Process pre tags
452436
processed_tags = set()
@@ -518,7 +502,7 @@ def identify_tables(soup: BeautifulSoup) -> List[Dict]:
518502

519503
try:
520504
# Find all tables, including those in custom components like rh-table
521-
table_tags = soup.find_all(['table', 'rh-table'], limit=50)
505+
table_tags = soup.find_all(['table', 'rh-table'], limit=MAX_TABLES)
522506

523507
# For custom table components, extract the actual table
524508
expanded_tables = []
@@ -552,10 +536,10 @@ def identify_tables(soup: BeautifulSoup) -> List[Dict]:
552536
# Get rows not in header
553537
if header:
554538
header_rows = set(id(row) for row in header.find_all('tr'))
555-
all_rows = table.find_all('tr', limit=100)
539+
all_rows = table.find_all('tr', limit=MAX_TABLE_ROWS)
556540
rows = [row for row in all_rows if id(row) not in header_rows]
557541
else:
558-
rows = table.find_all('tr', limit=100)
542+
rows = table.find_all('tr', limit=MAX_TABLE_ROWS)
559543
except Exception:
560544
pass
561545

0 commit comments

Comments
 (0)