7
7
import re
8
8
from dataclasses import dataclass , field
9
9
10
+ # Constants: iteration and result-count limits shared by the parsing helpers
11
+ MAX_SEARCH_DEPTH = 10  # loop bound for the previous-sibling walk in _find_closest_heading
12
+ MAX_PROCEDURE_SEARCH_DEPTH = 5  # loop bound for the forward <ol> search in identify_procedure_sections
13
+ MAX_CODE_BLOCKS = 100  # find_all limit applied separately to <pre> and to <code> tags
14
+ MAX_TABLES = 50  # find_all limit for 'table' / 'rh-table' tags in identify_tables
15
+ MAX_TABLE_ROWS = 100  # find_all limit for <tr> rows fetched per table
16
+
10
17
11
18
@dataclass
12
19
class HtmlSection :
@@ -85,7 +92,6 @@ def parse_html(html_content: str) -> Tuple[BeautifulSoup, HtmlSection]:
85
92
section_stack = [root_section ]
86
93
current_section = root_section
87
94
88
- # Find the body or use the soup itself if no body
89
95
body = soup .body or soup
90
96
91
97
# First pass: identify all headings
@@ -99,7 +105,7 @@ def parse_html(html_content: str) -> Tuple[BeautifulSoup, HtmlSection]:
99
105
all_headings .sort (key = lambda x : _get_element_position (soup , x [0 ]))
100
106
101
107
# Initialize section map to keep track of hierarchy
102
- section_map = {0 : root_section } # Level 0 is the root
108
+ section_map = {0 : root_section }
103
109
104
110
# Create section hierarchy based on heading levels
105
111
for heading , level in all_headings :
@@ -141,13 +147,11 @@ def parse_html(html_content: str) -> Tuple[BeautifulSoup, HtmlSection]:
141
147
if not element or (isinstance (element , str ) and not element .strip ()):
142
148
continue
143
149
144
- # Check if this element is a heading that starts a new section
145
150
is_section_start = False
146
151
new_level = None
147
152
148
153
if isinstance (element , Tag ) and element .name and re .match (r'h[1-6]$' , element .name ):
149
154
level = int (element .name [1 ])
150
- # Find corresponding section
151
155
for section in _flatten_sections (root_section ):
152
156
if section .heading_tag and section .heading_tag == element :
153
157
is_section_start = True
@@ -156,10 +160,8 @@ def parse_html(html_content: str) -> Tuple[BeautifulSoup, HtmlSection]:
156
160
break
157
161
158
162
if not is_section_start :
159
- # Add content to current section
160
163
current_section .add_content (element )
161
164
else :
162
- # No headings found, add all content to root section
163
165
for element in body .children :
164
166
if element :
165
167
root_section .add_content (element )
@@ -170,7 +172,6 @@ def parse_html(html_content: str) -> Tuple[BeautifulSoup, HtmlSection]:
170
172
soup = BeautifulSoup (html_content , 'html.parser' )
171
173
root_section = HtmlSection ()
172
174
173
- # Simple fallback: just add all content to the root section
174
175
for element in soup .children :
175
176
if element :
176
177
root_section .add_content (element )
@@ -230,7 +231,6 @@ def identify_special_sections(soup: BeautifulSoup) -> Dict[str, List[Dict]]:
230
231
231
232
return special_sections
232
233
except Exception as e :
233
- # Return empty sections if identification fails
234
234
return {
235
235
'procedures' : [],
236
236
'code_blocks' : [],
@@ -259,57 +259,46 @@ def identify_procedure_sections(soup: BeautifulSoup) -> List[Dict]:
259
259
260
260
try :
261
261
# Multiple ways to identify procedures
262
- # 1. Look for elements containing the word "Procedure"
263
262
procedure_markers = []
264
263
for element in soup .find_all (string = lambda text : text and "Procedure" in text ):
265
264
if element .parent and element .parent .name not in ('script' , 'style' ):
266
265
procedure_markers .append (element )
267
266
268
- # 2. Look for ordered lists that might be procedures
269
267
ordered_lists = soup .find_all ('ol' )
270
268
271
- # Track processed lists to avoid duplicates
272
269
processed_lists = set ()
273
270
274
- # Process explicit procedure markers first
275
271
for marker in procedure_markers :
276
272
if not marker or not marker .parent :
277
273
continue
278
274
279
- # Find the nearest ordered list after the marker
280
275
ol = None
281
276
current = marker .parent
282
277
search_depth = 0
283
278
284
- # Search forward for an ordered list
285
- while current and search_depth < 5 :
279
+ while current and search_depth < MAX_PROCEDURE_SEARCH_DEPTH :
286
280
search_depth += 1
287
281
if current .name == 'ol' :
288
282
ol = current
289
283
break
290
284
291
- # Check next siblings
292
285
next_sibling = current .find_next_sibling ()
293
286
if next_sibling and next_sibling .name == 'ol' :
294
287
ol = next_sibling
295
288
break
296
289
297
- # Check children
298
290
ol_in_children = current .find ('ol' )
299
291
if ol_in_children :
300
292
ol = ol_in_children
301
293
break
302
294
303
- # Move to next element
304
295
current = current .find_next ()
305
296
306
297
if not ol or id (ol ) in processed_lists :
307
298
continue
308
299
309
- # Find heading for this procedure
310
300
heading = _find_closest_heading (marker .parent )
311
301
312
- # Find elements between heading and procedure
313
302
intro = []
314
303
if heading :
315
304
current = heading .find_next ()
@@ -318,14 +307,12 @@ def identify_procedure_sections(soup: BeautifulSoup) -> List[Dict]:
318
307
intro .append (current )
319
308
current = current .find_next ()
320
309
321
- # Check for prerequisites section
322
310
prerequisites = None
323
311
for element in intro :
324
312
if isinstance (element , Tag ) and element .get_text () and "Prerequisites" in element .get_text ():
325
313
prerequisites = element
326
314
break
327
315
328
- # Add to procedures
329
316
procedures .append ({
330
317
'heading' : heading ,
331
318
'intro' : intro ,
@@ -334,16 +321,13 @@ def identify_procedure_sections(soup: BeautifulSoup) -> List[Dict]:
334
321
'steps' : ol
335
322
})
336
323
337
- # Mark as processed
338
324
processed_lists .add (id (ol ))
339
325
340
- # Process remaining ordered lists that might be procedures
341
326
for ol in ordered_lists :
342
327
if id (ol ) in processed_lists :
343
328
continue
344
329
345
- # Check if this has procedure-like structure:
346
- # 1. Has list items with paragraphs or code blocks
330
+ # Check if this has procedure-like structure
347
331
has_structure = False
348
332
for li in ol .find_all ('li' , recursive = False ):
349
333
if li .find ('p' ) or li .find ('pre' ) or li .find ('code' ):
@@ -415,7 +399,7 @@ def _find_closest_heading(element: Tag) -> Optional[Tag]:
415
399
current = element
416
400
search_depth = 0
417
401
418
- while current and search_depth < 10 :
402
+ while current and search_depth < MAX_SEARCH_DEPTH :
419
403
search_depth += 1
420
404
current = current .previous_sibling
421
405
@@ -445,8 +429,8 @@ def identify_code_blocks(soup: BeautifulSoup) -> List[Dict]:
445
429
446
430
try :
447
431
# Find all code blocks
448
- pre_tags = soup .find_all ('pre' , limit = 100 ) # Limit to prevent excessive processing
449
- code_tags = soup .find_all ('code' , limit = 100 )
432
+ pre_tags = soup .find_all ('pre' , limit = MAX_CODE_BLOCKS ) # Limit to prevent excessive processing
433
+ code_tags = soup .find_all ('code' , limit = MAX_CODE_BLOCKS )
450
434
451
435
# Process pre tags
452
436
processed_tags = set ()
@@ -518,7 +502,7 @@ def identify_tables(soup: BeautifulSoup) -> List[Dict]:
518
502
519
503
try :
520
504
# Find all tables, including those in custom components like rh-table
521
- table_tags = soup .find_all (['table' , 'rh-table' ], limit = 50 )
505
+ table_tags = soup .find_all (['table' , 'rh-table' ], limit = MAX_TABLES )
522
506
523
507
# For custom table components, extract the actual table
524
508
expanded_tables = []
@@ -552,10 +536,10 @@ def identify_tables(soup: BeautifulSoup) -> List[Dict]:
552
536
# Get rows not in header
553
537
if header :
554
538
header_rows = set (id (row ) for row in header .find_all ('tr' ))
555
- all_rows = table .find_all ('tr' , limit = 100 )
539
+ all_rows = table .find_all ('tr' , limit = MAX_TABLE_ROWS )
556
540
rows = [row for row in all_rows if id (row ) not in header_rows ]
557
541
else :
558
- rows = table .find_all ('tr' , limit = 100 )
542
+ rows = table .find_all ('tr' , limit = MAX_TABLE_ROWS )
559
543
except Exception :
560
544
pass
561
545
0 commit comments