|
import os
import re
import subprocess
import sys

from bs4 import BeautifulSoup
| 5 | + |
| 6 | + |
def process_html_file(file_path, output_dir, markdown_dir):
    """Clean one HTML page and convert the cleaned page to Markdown.

    The cleaned HTML is written to ``output_dir`` under the input's base
    name; the Markdown is written to ``markdown_dir`` under the same base
    name with a ``.md`` extension.

    Args:
        file_path: Path of the HTML file to read (decoded as UTF-8).
        output_dir: Existing directory that receives the cleaned HTML.
        markdown_dir: Existing directory that receives the Markdown file.

    Raises:
        FileNotFoundError: If the ``html2text`` command cannot be found.
        subprocess.CalledProcessError: If ``html2text`` exits with a
            non-zero status.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()

    soup = BeautifulSoup(html_content, 'html.parser')

    # Remove every <footer>, then every <nav>. The second lookup runs after
    # the first removal so elements nested in a removed footer are not
    # touched twice.
    for tag_name in ('footer', 'nav'):
        for tag in soup.find_all(tag_name):
            tag.decompose()

    # Only the first <hr> and the first breadcrumb list are removed.
    hr = soup.find('hr')
    if hr:
        hr.decompose()

    ul = soup.find('ul', class_='wy-breadcrumbs')
    if ul:
        ul.decompose()

    for headerlink in soup.find_all('a', class_='headerlink'):
        headerlink.decompose()

    # Replace each highlighted code block with a plain paragraph wrapped in
    # "start"/"finish" marker words; format_code_elements() looks for these
    # markers in the Markdown produced below.
    for block in soup.find_all('div', class_='highlight-default notranslate'):
        marked = soup.new_tag("p")
        marked.append("start ")
        marked.append(block.get_text().strip())
        marked.append(" finish")
        block.replace_with(marked)

    clean_html = str(soup)

    base_filename = os.path.basename(file_path)
    output_path = os.path.join(output_dir, base_filename)

    with open(output_path, 'w', encoding='utf-8') as file:
        file.write(clean_html)
    print(f"Processed: {file_path} -> {output_path}")

    clean_filename = os.path.splitext(base_filename)[0] + ".md"
    md_output_path = os.path.join(markdown_dir, clean_filename)

    # Run html2text with an argument list (no shell), so paths containing
    # spaces or shell metacharacters are passed through verbatim. The child
    # process writes its raw output straight into the Markdown file.
    with open(md_output_path, 'wb') as md_file:
        subprocess.run(
            ['html2text', '--ignore-images', output_path],
            stdout=md_file,
            check=True,
        )

    # On Windows the file is read back with the platform default encoding;
    # elsewhere it is read as UTF-8.
    read_encoding = None if sys.platform.startswith('win') else 'utf-8'
    with open(md_output_path, 'r', encoding=read_encoding) as file:
        md_content = file.read()

    clean_md = format_code_elements(md_content)
    clean_md = clean_md.replace("\n### ", "\n## ")
    clean_md = clean_md.replace("<", "\\<")  # fix tags errors
    clean_md = clean_md.replace(">", "\\>")  # fix tags errors
    clean_md = clean_md.replace("````", "```")
    clean_md = delete_beginning(clean_md)

    with open(md_output_path, 'w', encoding='utf-8') as file:
        file.write(clean_md)

    print(f"Processed: {output_path} -> {md_output_path}\n")
| 73 | + |
| 74 | + |
def _format_code_snippet(code: str) -> str:
    """Collapse whitespace in *code* and break ``] [`` pairs onto new lines."""
    snippet = re.sub(r'\s+', ' ', code).strip()
    # Continuation lines are indented by the width of the first three words
    # (one trailing space each). Snippets with fewer than three words simply
    # use the words that exist instead of raising IndexError.
    indent = ' ' * sum(len(word) + 1 for word in snippet.split()[:3])
    return snippet.replace('] [', ']\n' + indent + '[')


def format_code_elements(text: str) -> str:
    """Turn every ``start ... finish`` span in *text* into a fenced ``sh`` block.

    The text is scanned left to right. Each span that begins with the literal
    word ``start`` and ends at the next literal ``finish`` is replaced by::

        ```sh
        <whitespace-normalised snippet>
        ```

    Any occurrence of the bare words counts as a marker, including ones that
    are part of ordinary prose. Scanning stops at the first ``start`` that has
    no following ``finish``; the remainder of the text is left untouched.

    Args:
        text: Markdown text containing zero or more marked spans.

    Returns:
        The text with every complete marked span replaced.
    """
    start_marker = "start"
    finish_marker = "finish"

    pieces = []
    cursor = 0
    while True:
        start_index = text.find(start_marker, cursor)
        if start_index == -1:
            break

        code_begin = start_index + len(start_marker)
        end_index = text.find(finish_marker, code_begin)
        if end_index == -1:
            break

        # Rebuild the output in a single pass so that a replacement can never
        # be re-matched or altered by the handling of a later span.
        pieces.append(text[cursor:start_index])
        pieces.append("```sh\n" + _format_code_snippet(text[code_begin:end_index]) + "\n```")
        cursor = end_index + len(finish_marker)

    pieces.append(text[cursor:])
    return "".join(pieces)
| 105 | + |
| 106 | + |
def delete_beginning(text: str) -> str:
    """Remove the text between the Commands and Sub-commands headings.

    Everything after the first ``## Commands`` heading, up to and including
    the first ``## Sub-commands`` heading, is cut out, so the content that
    followed ``## Sub-commands`` ends up directly under ``## Commands``.

    Args:
        text: Markdown text to process.

    Returns:
        The trimmed text, or *text* unchanged when either heading is missing
        or ``## Sub-commands`` does not come after ``## Commands``.
    """
    commands_heading = "## Commands"
    subcommands_heading = "## Sub-commands"

    start_index = text.find(commands_heading)
    end_index = text.find(subcommands_heading)
    # end_index is -1 when the second heading is missing, which is also
    # smaller than any valid start_index.
    if start_index == -1 or end_index < start_index:
        return text

    # Cut exactly the located span; identical text elsewhere in the document
    # is left alone.
    keep_until = start_index + len(commands_heading)
    resume_at = end_index + len(subcommands_heading)
    return text[:keep_until] + text[resume_at:]
| 114 | + |
| 115 | + |
def process_html_files_in_directory(directory, output_dir, markdown_dir):
    """Run process_html_file() on every ``.html`` file directly in *directory*.

    Both output directories are created first if they do not exist yet.
    Sub-directories of *directory* are not searched.

    Args:
        directory: Directory containing the source HTML files.
        output_dir: Directory for the cleaned HTML files.
        markdown_dir: Directory for the generated Markdown files.
    """
    # exist_ok avoids the check-then-create race of testing os.path.exists
    # before calling makedirs.
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(markdown_dir, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sort so runs are
    # reproducible.
    for file_name in sorted(os.listdir(directory)):
        if file_name.endswith('.html'):
            file_path = os.path.join(directory, file_name)
            process_html_file(file_path, output_dir, markdown_dir)
| 127 | + |
| 128 | + |
def main():
    """Command-line entry point.

    Expects exactly three command-line arguments, in this order:
    ``input_html_directory``, ``output_html_directory`` and
    ``output_md_directory``, and passes them to
    process_html_files_in_directory().

    Raises:
        ValueError: If the number of command-line arguments is not three.
    """
    arguments = sys.argv[1:]
    if len(arguments) != 3:
        # The message is assembled explicitly so that its content does not
        # depend on how this source file happens to be indented.
        raise ValueError(
            f"Expected exactly 3 arguments, got {len(arguments)}.\n"
            "Required arguments, in order:\n"
            "  - input_html_directory\n"
            "  - output_html_directory\n"
            "  - output_md_directory"
        )

    input_html_directory, output_html_directory, output_md_directory = arguments

    process_html_files_in_directory(
        input_html_directory,
        output_html_directory,
        output_md_directory,
    )


if __name__ == "__main__":
    main()
0 commit comments