|
11 | 11 | import os
|
12 | 12 | import sys
|
13 | 13 | from pathlib import Path
|
14 |
| -from typing import List, Optional |
| 14 | +from typing import List, Optional, Tuple |
15 | 15 | from bs4 import BeautifulSoup
|
16 | 16 |
|
17 | 17 |
|
18 |
| -def strip_html_content(input_file_path: str, output_dir: str) -> Optional[str]: |
| 18 | +def strip_html_content(input_file_path: str, output_dir: str, preserve_path: bool = True) -> Optional[str]: |
19 | 19 | """
|
20 | 20 | Extract the main content from an HTML file and save it to the output directory.
|
21 | 21 |
|
22 | 22 | Args:
|
23 | 23 | input_file_path: Path to the HTML file to process
|
24 | 24 | output_dir: Directory to save the cleaned HTML file
|
| 25 | + preserve_path: Whether to preserve the directory structure (True for directory processing, |
| 26 | + False for single file processing) |
25 | 27 |
|
26 | 28 | Returns:
|
27 | 29 | Path to the cleaned HTML file or None if processing failed
|
@@ -49,10 +51,15 @@ def strip_html_content(input_file_path: str, output_dir: str) -> Optional[str]:
|
49 | 51 | for chapter in chapters:
|
50 | 52 | new_soup.body.append(chapter)
|
51 | 53 |
|
52 |
| - # Create output path |
53 |
| - rel_path = os.path.relpath(input_file_path) |
54 |
| - output_file_path = os.path.join(output_dir, rel_path) |
55 |
| - os.makedirs(os.path.dirname(output_file_path), exist_ok=True) |
| 54 | + # Create output path based on whether we're preserving directory structure |
| 55 | + if preserve_path: |
| 56 | + rel_path = os.path.relpath(input_file_path) |
| 57 | + output_file_path = os.path.join(output_dir, rel_path) |
| 58 | + os.makedirs(os.path.dirname(output_file_path), exist_ok=True) |
| 59 | + else: |
| 60 | + filename = os.path.basename(input_file_path) |
| 61 | + output_file_path = os.path.join(output_dir, filename) |
| 62 | + os.makedirs(output_dir, exist_ok=True) |
56 | 63 |
|
57 | 64 | with open(output_file_path, "w", encoding="utf-8") as file:
|
58 | 65 | file.write(str(new_soup))
|
@@ -92,7 +99,7 @@ def process_directory(
|
92 | 99 | skipped_files += 1
|
93 | 100 | continue
|
94 | 101 |
|
95 |
| - result = strip_html_content(file_path, output_dir) |
| 102 | + result = strip_html_content(file_path, output_dir, preserve_path=True) |
96 | 103 | if result:
|
97 | 104 | processed_files += 1
|
98 | 105 |
|
@@ -135,7 +142,7 @@ def main() -> None:
|
135 | 142 | if not input_path.name.endswith(".html"):
|
136 | 143 | print(f"Error: Input file {args.input} is not an HTML file.")
|
137 | 144 | sys.exit(1)
|
138 |
| - strip_html_content(str(input_path), args.output_dir) |
| 145 | + strip_html_content(str(input_path), args.output_dir, preserve_path=False) |
139 | 146 | else:
|
140 | 147 | process_directory(str(input_path), args.output_dir, args.exclude)
|
141 | 148 |
|
|
0 commit comments