Commit 6f35905
Merge pull request #237 from max-svistunov/ols-1499-remove-html-ballast
OLS-1499 Add script for removing ballast content from OpenShift documentation
2 parents b76a81e + 52ecf0d commit 6f35905

File tree

1 file changed: 151 additions & 0 deletions
@@ -0,0 +1,151 @@
#!/usr/bin/env python3

"""
HTML content stripper for Red Hat OpenShift documentation pages.

Extracts the main documentation content by removing navigation elements,
headers, footers, and other non-essential page components.
"""

import argparse
import os
import sys
from pathlib import Path
from typing import List, Optional, Tuple
from bs4 import BeautifulSoup


def strip_html_content(input_file_path: str, output_dir: str, preserve_path: bool = True) -> Optional[str]:
    """
    Extract the main content from an HTML file and save it to the output directory.

    Args:
        input_file_path: Path to the HTML file to process
        output_dir: Directory to save the cleaned HTML file
        preserve_path: Whether to preserve the directory structure (True for directory
            processing, False for single-file processing)

    Returns:
        Path to the cleaned HTML file, or None if processing failed
    """
    try:
        with open(input_file_path, "r", encoding="utf-8") as file:
            html_content = file.read()

        soup = BeautifulSoup(html_content, "html.parser")
        new_soup = BeautifulSoup("<html><body></body></html>", "html.parser")

        # Capture breadcrumbs if they exist
        breadcrumb = soup.find("ol", class_="breadcrumb hide-for-print")
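        # Note: a multi-word class_ string matches the exact class attribute
        # value "breadcrumb hide-for-print"; breadcrumbs marked up with
        # different classes are simply not carried over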
        if breadcrumb:
            new_soup.body.append(breadcrumb)

        # Find all "chapter" sections that contain the main content
        chapters = soup.find_all("section", class_="chapter")

        if not chapters:
            print(f"Warning: No <section class='chapter'> found in {input_file_path}")
            return None

        # Add each chapter to our new document
        for chapter in chapters:
            new_soup.body.append(chapter)
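            # (append() detaches each section from the source tree and moves
            # it into the new document, so no explicit copy is needed)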

        # Create output path based on whether we're preserving directory structure
        if preserve_path:
            rel_path = os.path.relpath(input_file_path)
            output_file_path = os.path.join(output_dir, rel_path)
            os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
        else:
            filename = os.path.basename(input_file_path)
            output_file_path = os.path.join(output_dir, filename)
            os.makedirs(output_dir, exist_ok=True)

        with open(output_file_path, "w", encoding="utf-8") as file:
            file.write(str(new_soup))

        print(f"Cleaned HTML saved to {output_file_path}")
        return output_file_path

    except Exception as e:
        print(f"Error processing {input_file_path}: {str(e)}")
        return None


def process_directory(
    input_dir: str, output_dir: str, exclusion_list: Optional[List[str]] = None
) -> None:
    """
    Process all HTML files in a directory and its subdirectories.

    Args:
        input_dir: Directory containing HTML files to process
        output_dir: Directory to save cleaned HTML files
        exclusion_list: List of file paths to exclude
    """
    if exclusion_list is None:
        exclusion_list = []

    processed_files = 0
    skipped_files = 0

    for root, _, files in os.walk(input_dir):
        for file in files:
            if file.endswith(".html"):
                file_path = os.path.join(root, file)

                if file_path in exclusion_list:
                    print(f"Skipping excluded file: {file_path}")
                    skipped_files += 1
                    continue

                result = strip_html_content(file_path, output_dir, preserve_path=True)
                if result:
                    processed_files += 1

    print(f"Processed {processed_files} HTML files, skipped {skipped_files} files.")


def main() -> None:
    """Parse command-line arguments and run the HTML content stripper."""
    parser = argparse.ArgumentParser(
        description="Strip unnecessary content from HTML documentation files."
    )

    parser.add_argument(
        "--input", "-i", required=True, help="HTML file or directory to process"
    )
    parser.add_argument(
        "--output-dir",
        "-o",
        default="clean_html",
        help="Directory to save cleaned HTML files (default: 'clean_html')",
    )
    parser.add_argument(
        "--exclude",
        "-e",
        nargs="+",
        default=[],
        help="Files to exclude from processing",
    )

    args = parser.parse_args()

    # Check if input path exists
    input_path = Path(args.input)
    if not input_path.exists():
        print(f"Error: Input path {args.input} does not exist.")
        sys.exit(1)

    # Process single file or directory
    if input_path.is_file():
        if not input_path.name.endswith(".html"):
            print(f"Error: Input file {args.input} is not an HTML file.")
            sys.exit(1)
        strip_html_content(str(input_path), args.output_dir, preserve_path=False)
    else:
        process_directory(str(input_path), args.output_dir, args.exclude)


if __name__ == "__main__":
    main()
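
For reference, a minimal sketch (not part of the commit) of the same extraction logic run against an inline HTML snippet; the markup below is only illustrative of the page structure the script targets:

from bs4 import BeautifulSoup

sample = """
<html><body>
  <nav>site navigation (dropped)</nav>
  <ol class="breadcrumb hide-for-print"><li>OpenShift</li></ol>
  <section class="chapter"><h1>Installing</h1><p>Main content.</p></section>
  <footer>footer (dropped)</footer>
</body></html>
"""

soup = BeautifulSoup(sample, "html.parser")
new_soup = BeautifulSoup("<html><body></body></html>", "html.parser")

breadcrumb = soup.find("ol", class_="breadcrumb hide-for-print")
if breadcrumb:
    new_soup.body.append(breadcrumb)

for chapter in soup.find_all("section", class_="chapter"):
    new_soup.body.append(chapter)

# Only the breadcrumb and the chapter section survive:
print(new_soup)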
