Skip to content

Commit 81686fc

Browse files
committed
Modify the script for the Portal version; make more robust
1 parent 392265a commit 81686fc

File tree

1 file changed

+46
-44
lines changed

1 file changed

+46
-44
lines changed

scripts/html_chunking/html-stripper.py

Lines changed: 46 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -2,64 +2,59 @@
22

33
"""
44
HTML content stripper for Red Hat OpenShift documentation pages.
5-
Removes navigation, headers, footers, and other unnecessary elements,
6-
keeping only the main documentation content.
5+
6+
Extracts the main documentation content by removing navigation elements,
7+
headers, footers, and other non-essential page components.
78
"""
89

910
import argparse
1011
import os
1112
import sys
1213
from pathlib import Path
14+
from typing import List, Optional
1315
from bs4 import BeautifulSoup
1416

1517

16-
def strip_html_content(input_file_path, output_dir):
18+
def strip_html_content(input_file_path: str, output_dir: str) -> Optional[str]:
1719
"""
1820
Extract the main content from an HTML file and save it to the output directory.
1921
2022
Args:
21-
input_file_path (str): Path to the HTML file to process
22-
output_dir (str): Directory to save the cleaned HTML file
23+
input_file_path: Path to the HTML file to process
24+
output_dir: Directory to save the cleaned HTML file
2325
2426
Returns:
25-
str: Path to the cleaned HTML file
27+
Path to the cleaned HTML file or None if processing failed
2628
"""
2729
try:
28-
with open(input_file_path, 'r', encoding='utf-8') as file:
30+
with open(input_file_path, "r", encoding="utf-8") as file:
2931
html_content = file.read()
3032

31-
soup = BeautifulSoup(html_content, 'html.parser')
32-
33-
new_soup = BeautifulSoup('<html><body></body></html>', 'html.parser')
33+
soup = BeautifulSoup(html_content, "html.parser")
34+
new_soup = BeautifulSoup("<html><body></body></html>", "html.parser")
3435

35-
page_header = soup.find('div', class_='page-header')
36+
# Capture breadcrumbs if they exist
37+
breadcrumb = soup.find("ol", class_="breadcrumb hide-for-print")
38+
if breadcrumb:
39+
new_soup.body.append(breadcrumb)
3640

37-
# Find the main content div
38-
main_content = soup.find('div', class_='col-xs-12 col-sm-9 col-md-9 main')
39-
if not main_content:
40-
main_content = soup.find('div', class_='main')
41+
# Find all "chapter" sections that contain the main content
42+
chapters = soup.find_all("section", class_="chapter")
4143

42-
if not page_header and not main_content:
43-
print(f"Warning: Could not identify required content in {input_file_path}")
44+
if not chapters:
45+
print(f"Warning: No <section class='chapter'> found in {input_file_path}")
4446
return None
4547

46-
if main_content:
47-
toc = main_content.find('div', id='toc')
48-
if toc:
49-
toc.extract()
50-
51-
if page_header:
52-
new_soup.body.append(page_header)
53-
54-
if main_content:
55-
new_soup.body.append(main_content)
48+
# Add each chapter to our new document
49+
for chapter in chapters:
50+
new_soup.body.append(chapter)
5651

52+
# Create output path
5753
rel_path = os.path.relpath(input_file_path)
5854
output_file_path = os.path.join(output_dir, rel_path)
59-
6055
os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
6156

62-
with open(output_file_path, 'w', encoding='utf-8') as file:
57+
with open(output_file_path, "w", encoding="utf-8") as file:
6358
file.write(str(new_soup))
6459

6560
print(f"Cleaned HTML saved to {output_file_path}")
@@ -70,14 +65,16 @@ def strip_html_content(input_file_path, output_dir):
7065
return None
7166

7267

73-
def process_directory(input_dir, output_dir, exclusion_list=None):
68+
def process_directory(
69+
input_dir: str, output_dir: str, exclusion_list: Optional[List[str]] = None
70+
) -> None:
7471
"""
7572
Process all HTML files in a directory and its subdirectories.
7673
7774
Args:
78-
input_dir (str): Directory containing HTML files to process
79-
output_dir (str): Directory to save cleaned HTML files
80-
exclusion_list (list): List of file paths to exclude
75+
input_dir: Directory containing HTML files to process
76+
output_dir: Directory to save cleaned HTML files
77+
exclusion_list: List of file paths to exclude
8178
"""
8279
if exclusion_list is None:
8380
exclusion_list = []
@@ -87,7 +84,7 @@ def process_directory(input_dir, output_dir, exclusion_list=None):
8784

8885
for root, _, files in os.walk(input_dir):
8986
for file in files:
90-
if file.endswith('.html'):
87+
if file.endswith(".html"):
9188
file_path = os.path.join(root, file)
9289

9390
if file_path in exclusion_list:
@@ -102,37 +99,42 @@ def process_directory(input_dir, output_dir, exclusion_list=None):
10299
print(f"Processed {processed_files} HTML files, skipped {skipped_files} files.")
103100

104101

105-
def main():
102+
def main() -> None:
103+
"""Parse command line arguments and run the HTML content stripper."""
106104
parser = argparse.ArgumentParser(
107105
description="Strip unnecessary content from HTML documentation files."
108106
)
109107

110108
parser.add_argument(
111-
'--input', '-i', required=True,
112-
help="HTML file or directory to process"
109+
"--input", "-i", required=True, help="HTML file or directory to process"
113110
)
114111
parser.add_argument(
115-
'--output-dir', '-o', default='clean_html',
116-
help="Directory to save cleaned HTML files (default: 'clean_html')"
112+
"--output-dir",
113+
"-o",
114+
default="clean_html",
115+
help="Directory to save cleaned HTML files (default: 'clean_html')",
117116
)
118117
parser.add_argument(
119-
'--exclude', '-e', nargs='+', default=[],
120-
help="Files to exclude from processing"
118+
"--exclude",
119+
"-e",
120+
nargs="+",
121+
default=[],
122+
help="Files to exclude from processing",
121123
)
122124

123125
args = parser.parse_args()
124126

125-
# Determine if input is a file or directory
127+
# Check if input path exists
126128
input_path = Path(args.input)
127129
if not input_path.exists():
128130
print(f"Error: Input path {args.input} does not exist.")
129131
sys.exit(1)
130132

133+
# Process single file or directory
131134
if input_path.is_file():
132-
if not input_path.name.endswith('.html'):
135+
if not input_path.name.endswith(".html"):
133136
print(f"Error: Input file {args.input} is not an HTML file.")
134137
sys.exit(1)
135-
136138
strip_html_content(str(input_path), args.output_dir)
137139
else:
138140
process_directory(str(input_path), args.output_dir, args.exclude)

0 commit comments

Comments (0)