Skip to content

Commit 52ecf0d

Browse files
committed
Don't recreate dir structure for single file; fix bug in dir creation
1 parent 81686fc commit 52ecf0d

File tree

1 file changed

+15
-8
lines changed

1 file changed

+15
-8
lines changed

scripts/html_chunking/html-stripper.py

Lines changed: 15 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -11,17 +11,19 @@
1111
import os
1212
import sys
1313
from pathlib import Path
14-
from typing import List, Optional
14+
from typing import List, Optional, Tuple
1515
from bs4 import BeautifulSoup
1616

1717

18-
def strip_html_content(input_file_path: str, output_dir: str) -> Optional[str]:
18+
def strip_html_content(input_file_path: str, output_dir: str, preserve_path: bool = True) -> Optional[str]:
1919
"""
2020
Extract the main content from an HTML file and save it to the output directory.
2121
2222
Args:
2323
input_file_path: Path to the HTML file to process
2424
output_dir: Directory to save the cleaned HTML file
25+
preserve_path: Whether to preserve the directory structure (True for directory processing,
26+
False for single file processing)
2527
2628
Returns:
2729
Path to the cleaned HTML file or None if processing failed
@@ -49,10 +51,15 @@ def strip_html_content(input_file_path: str, output_dir: str) -> Optional[str]:
4951
for chapter in chapters:
5052
new_soup.body.append(chapter)
5153

52-
# Create output path
53-
rel_path = os.path.relpath(input_file_path)
54-
output_file_path = os.path.join(output_dir, rel_path)
55-
os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
54+
# Create output path based on whether we're preserving directory structure
55+
if preserve_path:
56+
rel_path = os.path.relpath(input_file_path)
57+
output_file_path = os.path.join(output_dir, rel_path)
58+
os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
59+
else:
60+
filename = os.path.basename(input_file_path)
61+
output_file_path = os.path.join(output_dir, filename)
62+
os.makedirs(output_dir, exist_ok=True)
5663

5764
with open(output_file_path, "w", encoding="utf-8") as file:
5865
file.write(str(new_soup))
@@ -92,7 +99,7 @@ def process_directory(
9299
skipped_files += 1
93100
continue
94101

95-
result = strip_html_content(file_path, output_dir)
102+
result = strip_html_content(file_path, output_dir, preserve_path=True)
96103
if result:
97104
processed_files += 1
98105

@@ -135,7 +142,7 @@ def main() -> None:
135142
if not input_path.name.endswith(".html"):
136143
print(f"Error: Input file {args.input} is not an HTML file.")
137144
sys.exit(1)
138-
strip_html_content(str(input_path), args.output_dir)
145+
strip_html_content(str(input_path), args.output_dir, preserve_path=False)
139146
else:
140147
process_directory(str(input_path), args.output_dir, args.exclude)
141148

0 commit comments

Comments
 (0)