Skip to content

Commit 392265a

Browse files
committed
Add script for removing ballast content from OpenShift documentation
1 parent 1442074 commit 392265a

File tree

1 file changed

+142
-0
lines changed

1 file changed

+142
-0
lines changed
Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
#!/usr/bin/env python3
2+
3+
"""
4+
HTML content stripper for Red Hat OpenShift documentation pages.
5+
Removes navigation, headers, footers, and other unnecessary elements,
6+
keeping only the main documentation content.
7+
"""
8+
9+
import argparse
10+
import os
11+
import sys
12+
from pathlib import Path
13+
from bs4 import BeautifulSoup
14+
15+
16+
def _relative_output_path(input_file_path):
    """Return the input path as a relative path that stays below its parent.

    The result is meant to be joined under an output directory, so every
    component that could climb out of it ('..', '.', a leading root) is
    dropped while the remaining directory structure is kept.
    """
    try:
        rel_path = os.path.relpath(input_file_path)
    except ValueError:
        # On Windows relpath() raises when the input lives on a different
        # drive than the current directory; fall back to the drive-less path.
        rel_path = os.path.splitdrive(os.path.abspath(input_file_path))[1]

    parts = [
        part
        for part in os.path.normpath(rel_path).split(os.sep)
        if part not in ('', os.curdir, os.pardir)
    ]
    if not parts:
        return os.path.basename(input_file_path)
    return os.path.join(*parts)


def strip_html_content(input_file_path, output_dir):
    """
    Extract the main content from an HTML file and save it to the output directory.

    The page header (``div.page-header``) and the main content div are copied
    into a fresh ``<html><body>`` skeleton; the table of contents
    (``div#toc``) inside the main content is removed first. The cleaned file
    is written under ``output_dir`` at the input's path relative to the
    current working directory. Inputs outside the current directory keep
    their directory structure but never escape ``output_dir``.

    Args:
        input_file_path (str): Path to the HTML file to process
        output_dir (str): Directory to save the cleaned HTML file

    Returns:
        str: Path to the cleaned HTML file, or None when neither the page
        header nor the main content could be found or when any error occurred
        (a message is printed in both cases).
    """
    try:
        with open(input_file_path, 'r', encoding='utf-8') as file:
            html_content = file.read()

        soup = BeautifulSoup(html_content, 'html.parser')

        page_header = soup.find('div', class_='page-header')

        # Prefer the exact class string of the main column; fall back to any
        # div carrying the "main" class.
        main_content = soup.find('div', class_='col-xs-12 col-sm-9 col-md-9 main')
        if not main_content:
            main_content = soup.find('div', class_='main')

        if not page_header and not main_content:
            print(f"Warning: Could not identify required content in {input_file_path}")
            return None

        if main_content:
            toc = main_content.find('div', id='toc')
            if toc:
                toc.extract()

        # Appending a tag moves it out of the source tree, so the header is
        # placed before the main content, in that order.
        new_soup = BeautifulSoup('<html><body></body></html>', 'html.parser')
        for fragment in (page_header, main_content):
            if fragment:
                new_soup.body.append(fragment)

        # A plain relpath() may start with '..' for inputs outside the current
        # directory, which would place the result outside output_dir.
        output_file_path = os.path.join(
            output_dir, _relative_output_path(input_file_path)
        )

        os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

        with open(output_file_path, 'w', encoding='utf-8') as file:
            file.write(str(new_soup))

        print(f"Cleaned HTML saved to {output_file_path}")
        return output_file_path

    except Exception as e:
        # Per-file boundary: report and return None so that a batch run can
        # continue with the next file.
        print(f"Error processing {input_file_path}: {str(e)}")
        return None
71+
72+
73+
def process_directory(input_dir, output_dir, exclusion_list=None):
    """
    Process all HTML files in a directory and its subdirectories.

    Every file whose name ends with ``.html`` is passed to
    ``strip_html_content`` unless it is excluded. Exclusions are compared as
    absolute, normalised paths, so ``docs/a.html``, ``./docs/a.html`` and the
    absolute form all refer to the same file. If the output directory lies
    inside the input tree it is not descended into, so results of an earlier
    run are not processed again. A summary line is printed at the end.

    Args:
        input_dir (str): Directory containing HTML files to process
        output_dir (str): Directory to save cleaned HTML files
        exclusion_list (list): List of file paths to exclude
    """
    excluded = {os.path.abspath(path) for path in (exclusion_list or [])}
    output_root = os.path.abspath(output_dir)

    processed_files = 0
    skipped_files = 0

    for root, dirs, files in os.walk(input_dir):
        # Prune in place (top-down walk) so os.walk never enters the output
        # directory.
        dirs[:] = [
            name for name in dirs
            if os.path.abspath(os.path.join(root, name)) != output_root
        ]

        for file in files:
            if not file.endswith('.html'):
                continue

            file_path = os.path.join(root, file)

            if os.path.abspath(file_path) in excluded:
                print(f"Skipping excluded file: {file_path}")
                skipped_files += 1
                continue

            # strip_html_content returns None on failure; only successes count.
            if strip_html_content(file_path, output_dir):
                processed_files += 1

    print(f"Processed {processed_files} HTML files, skipped {skipped_files} files.")
103+
104+
105+
def main():
    """Command-line entry point: clean one HTML file or a whole directory.

    Reads ``--input``/``-i`` (required), ``--output-dir``/``-o`` and
    ``--exclude``/``-e`` from the command line. Exits with status 1 when the
    input path does not exist or when a single input file does not end with
    ``.html``. The exclusion list is only passed on for directory input.
    """
    cli = argparse.ArgumentParser(
        description="Strip unnecessary content from HTML documentation files."
    )
    cli.add_argument(
        '--input', '-i',
        required=True,
        help="HTML file or directory to process",
    )
    cli.add_argument(
        '--output-dir', '-o',
        default='clean_html',
        help="Directory to save cleaned HTML files (default: 'clean_html')",
    )
    cli.add_argument(
        '--exclude', '-e',
        nargs='+',
        default=[],
        help="Files to exclude from processing",
    )
    options = cli.parse_args()

    source = Path(options.input)
    if not source.exists():
        print(f"Error: Input path {options.input} does not exist.")
        sys.exit(1)

    # Anything that exists but is not a regular file is handled as a directory.
    if not source.is_file():
        process_directory(str(source), options.output_dir, options.exclude)
        return

    if not source.name.endswith('.html'):
        print(f"Error: Input file {options.input} is not an HTML file.")
        sys.exit(1)

    strip_html_content(str(source), options.output_dir)
139+
140+
141+
# Run the command-line interface only when executed as a script, so that
# importing this module has no side effects.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)