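"""Convert a gpt-crawler JSON dump of HTML pages into a single curated
Markdown file, formatting the dataset in parallel chunks."""
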
import json
import logging
from bs4 import BeautifulSoup
from markdownify import markdownify as md
from concurrent.futures import ThreadPoolExecutor

class HTMLToMarkdownConverter:
    def __init__(self, strip_tags=None, convert_links=True):
        """Initialize the converter with configuration options."""
        self.strip_tags = strip_tags or []
        self.convert_links = convert_links

    def convert(self, html_content):
        """Convert HTML content to Markdown."""
        try:
            curated_html = self.curate_content(html_content)
            # markdownify's `strip` option lists tags to drop from the output;
            # stripping anchor tags is how link conversion is disabled.
            strip = list(self.strip_tags)
            if not self.convert_links:
                strip.append('a')
            return md(curated_html, strip=strip)
        except Exception as e:
            logging.error(f"Error in HTML to Markdown conversion: {e}")
            return ""

    def curate_content(self, html):
        """Curate the HTML content before conversion."""
        soup = BeautifulSoup(html, 'html.parser')
        # Implement curation logic here based on the nature of the content.
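        # For example (a sketch, not part of the original logic), one might
        # drop script and style tags, which carry nothing useful in Markdown:
        #   for tag in soup(['script', 'style']):
        #       tag.decompose()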
        return str(soup)

class DatasetFormatter:
    def __init__(self, converter):
        self.converter = converter

    def format_entry(self, entry):
        """Format a single entry from the dataset."""
        try:
            title = entry.get('title', 'Untitled')
            url = entry.get('url', '')
            html_content = entry.get('html', '')
            markdown_content = self.converter.convert(html_content)
            return self.structure_markdown(title, url, markdown_content)
        except Exception as e:
            logging.error(f"Error formatting entry: {e}")
            return ""

    def structure_markdown(self, title, url, content):
        """Structure the Markdown content with a title header and source link."""
        structured_content = f"## {title}\n\n"
        if url:
            structured_content += f"[Read More]({url})\n\n"
        structured_content += content
        return structured_content

    def format_dataset(self, data):
        """Format the entire dataset, separating entries with blank lines."""
        return "\n\n".join(self.format_entry(entry) for entry in data)

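# A minimal usage sketch for the classes above (hypothetical entry data):
#   formatter = DatasetFormatter(HTMLToMarkdownConverter())
#   print(formatter.format_entry({'title': 'Intro', 'url': '', 'html': '<p>Hi</p>'}))
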
def load_json(file_path):
    """Load the JSON dataset from disk."""
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

def save_output_in_chunks(file_path, contents, chunk_size=1024):
    """Write the formatted contents to disk, flushing after large writes."""
    with open(file_path, 'w', encoding='utf-8') as file:
        for content in contents:
            file.write(content)
            if len(content) > chunk_size:
                file.flush()  # Flush after writing a large chunk.

def chunk_dataset(data, chunk_size):
    """Yield successive chunks of up to `chunk_size` entries each."""
    for i in range(0, len(data), chunk_size):
        yield data[i:i + chunk_size]

def process_chunk(chunk):
    """Process a single chunk of the dataset."""
    formatter = DatasetFormatter(HTMLToMarkdownConverter())
    return formatter.format_dataset(chunk)

def main():
    logging.basicConfig(level=logging.INFO)
    try:
        original_data = load_json('transformers_documentation-gpt-crawler_output.json')
        chunk_size = 200  # Number of entries per chunk; adjust as needed.
        max_threads = 10  # Maximum number of worker threads; adjust as needed.

        chunks = list(chunk_dataset(original_data, chunk_size))

        # Format the chunks in parallel; executor.map preserves chunk order.
        with ThreadPoolExecutor(max_workers=max_threads) as executor:
            results = executor.map(process_chunk, chunks)
            # Append a blank line to each chunk so entries do not run
            # together across chunk boundaries.
            formatted_contents = [result + "\n\n" for result in results]

        save_output_in_chunks('transformers_documentation-gpt-crawler-curated_markdown.md',
                              formatted_contents)
        logging.info("Content formatted and saved in chunks successfully.")
    except Exception as e:
        logging.error(f"An error occurred: {e}")


if __name__ == "__main__":
    main()