2
2
3
3
"""
4
4
HTML content stripper for Red Hat OpenShift documentation pages.
5
- Removes navigation, headers, footers, and other unnecessary elements,
6
- keeping only the main documentation content.
5
+
6
+ Extracts the main documentation content by removing navigation elements,
7
+ headers, footers, and other non-essential page components.
7
8
"""
8
9
9
10
import argparse
10
11
import os
11
12
import sys
12
13
from pathlib import Path
14
+ from typing import List , Optional
13
15
from bs4 import BeautifulSoup
14
16
15
17
16
- def strip_html_content (input_file_path , output_dir ) :
18
+ def strip_html_content (input_file_path : str , output_dir : str ) -> Optional [ str ] :
17
19
"""
18
20
Extract the main content from an HTML file and save it to the output directory.
19
21
20
22
Args:
21
- input_file_path (str) : Path to the HTML file to process
22
- output_dir (str) : Directory to save the cleaned HTML file
23
+ input_file_path: Path to the HTML file to process
24
+ output_dir: Directory to save the cleaned HTML file
23
25
24
26
Returns:
25
- str: Path to the cleaned HTML file
27
+ Path to the cleaned HTML file or None if processing failed
26
28
"""
27
29
try :
28
- with open (input_file_path , 'r' , encoding = ' utf-8' ) as file :
30
+ with open (input_file_path , "r" , encoding = " utf-8" ) as file :
29
31
html_content = file .read ()
30
32
31
- soup = BeautifulSoup (html_content , 'html.parser' )
32
-
33
- new_soup = BeautifulSoup ('<html><body></body></html>' , 'html.parser' )
33
+ soup = BeautifulSoup (html_content , "html.parser" )
34
+ new_soup = BeautifulSoup ("<html><body></body></html>" , "html.parser" )
34
35
35
- page_header = soup .find ('div' , class_ = 'page-header' )
36
+ # Capture breadcrumbs if they exist
37
+ breadcrumb = soup .find ("ol" , class_ = "breadcrumb hide-for-print" )
38
+ if breadcrumb :
39
+ new_soup .body .append (breadcrumb )
36
40
37
- # Find the main content div
38
- main_content = soup .find ('div' , class_ = 'col-xs-12 col-sm-9 col-md-9 main' )
39
- if not main_content :
40
- main_content = soup .find ('div' , class_ = 'main' )
41
+ # Find all "chapter" sections that contain the main content
42
+ chapters = soup .find_all ("section" , class_ = "chapter" )
41
43
42
- if not page_header and not main_content :
43
- print (f"Warning: Could not identify required content in { input_file_path } " )
44
+ if not chapters :
45
+ print (f"Warning: No <section class='chapter'> found in { input_file_path } " )
44
46
return None
45
47
46
- if main_content :
47
- toc = main_content .find ('div' , id = 'toc' )
48
- if toc :
49
- toc .extract ()
50
-
51
- if page_header :
52
- new_soup .body .append (page_header )
53
-
54
- if main_content :
55
- new_soup .body .append (main_content )
48
+ # Add each chapter to our new document
49
+ for chapter in chapters :
50
+ new_soup .body .append (chapter )
56
51
52
+ # Create output path
57
53
rel_path = os .path .relpath (input_file_path )
58
54
output_file_path = os .path .join (output_dir , rel_path )
59
-
60
55
os .makedirs (os .path .dirname (output_file_path ), exist_ok = True )
61
56
62
- with open (output_file_path , 'w' , encoding = ' utf-8' ) as file :
57
+ with open (output_file_path , "w" , encoding = " utf-8" ) as file :
63
58
file .write (str (new_soup ))
64
59
65
60
print (f"Cleaned HTML saved to { output_file_path } " )
@@ -70,14 +65,16 @@ def strip_html_content(input_file_path, output_dir):
70
65
return None
71
66
72
67
73
- def process_directory (input_dir , output_dir , exclusion_list = None ):
68
+ def process_directory (
69
+ input_dir : str , output_dir : str , exclusion_list : Optional [List [str ]] = None
70
+ ) -> None :
74
71
"""
75
72
Process all HTML files in a directory and its subdirectories.
76
73
77
74
Args:
78
- input_dir (str) : Directory containing HTML files to process
79
- output_dir (str) : Directory to save cleaned HTML files
80
- exclusion_list (list) : List of file paths to exclude
75
+ input_dir: Directory containing HTML files to process
76
+ output_dir: Directory to save cleaned HTML files
77
+ exclusion_list: List of file paths to exclude
81
78
"""
82
79
if exclusion_list is None :
83
80
exclusion_list = []
@@ -87,7 +84,7 @@ def process_directory(input_dir, output_dir, exclusion_list=None):
87
84
88
85
for root , _ , files in os .walk (input_dir ):
89
86
for file in files :
90
- if file .endswith (' .html' ):
87
+ if file .endswith (" .html" ):
91
88
file_path = os .path .join (root , file )
92
89
93
90
if file_path in exclusion_list :
@@ -102,37 +99,42 @@ def process_directory(input_dir, output_dir, exclusion_list=None):
102
99
print (f"Processed { processed_files } HTML files, skipped { skipped_files } files." )
103
100
104
101
105
def main() -> None:
    """Parse command line arguments and run the HTML content stripper.

    Command line options:
        --input / -i: HTML file or directory to process (required).
        --output-dir / -o: Directory to save cleaned HTML files
            (default: 'clean_html').
        --exclude / -e: Files to exclude from processing; only forwarded
            when the input is a directory.

    Exits with status 1 when the input path does not exist, when a single
    input file does not have a name ending in ``.html``, or when processing
    that single file fails (``strip_html_content`` returns ``None``).
    """
    parser = argparse.ArgumentParser(
        description="Strip unnecessary content from HTML documentation files."
    )
    parser.add_argument(
        "--input", "-i", required=True, help="HTML file or directory to process"
    )
    parser.add_argument(
        "--output-dir",
        "-o",
        default="clean_html",
        help="Directory to save cleaned HTML files (default: 'clean_html')",
    )
    parser.add_argument(
        "--exclude",
        "-e",
        nargs="+",
        default=[],
        help="Files to exclude from processing",
    )

    args = parser.parse_args()

    # Check if input path exists
    input_path = Path(args.input)
    if not input_path.exists():
        # Diagnostics go to stderr so they do not mix with normal output.
        print(f"Error: Input path {args.input} does not exist.", file=sys.stderr)
        sys.exit(1)

    # Process single file or directory
    if input_path.is_file():
        if not input_path.name.endswith(".html"):
            print(
                f"Error: Input file {args.input} is not an HTML file.",
                file=sys.stderr,
            )
            sys.exit(1)
        # strip_html_content reports its own warning/error and returns None on
        # failure; surface that as a non-zero exit status instead of ignoring it.
        if strip_html_content(str(input_path), args.output_dir) is None:
            sys.exit(1)
    else:
        process_directory(str(input_path), args.output_dir, args.exclude)
0 commit comments