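"""Convert a gpt-crawler JSON dump of HTML pages into a single curated
Markdown file, formatting the dataset in parallel chunks."""
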
import json
import logging
from bs4 import BeautifulSoup
from markdownify import markdownify as md
from concurrent.futures import ThreadPoolExecutor

class HTMLToMarkdownConverter:
    def __init__(self, strip_tags=None, convert_links=True):
        """Initialize the converter with configuration options."""
        self.strip_tags = strip_tags or []
        self.convert_links = convert_links

    def convert(self, html_content):
        """Convert HTML content to Markdown."""
        try:
            curated_html = self.curate_content(html_content)
            # markdownify's `strip` option lists tags to drop from the output;
            # stripping anchor tags is how link conversion is disabled.
            strip = list(self.strip_tags)
            if not self.convert_links:
                strip.append('a')
            return md(curated_html, strip=strip)
        except Exception as e:
            logging.error(f"Error in HTML to Markdown conversion: {e}")
            return ""

    def curate_content(self, html):
        """Curate the HTML content before conversion."""
        soup = BeautifulSoup(html, 'html.parser')
        # Implement curation logic here based on the nature of the content.
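        # For example (a sketch, not part of the original logic), one might
        # drop script and style tags, which carry nothing useful in Markdown:
        #   for tag in soup(['script', 'style']):
        #       tag.decompose()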
        return str(soup)

class DatasetFormatter:
    def __init__(self, converter):
        self.converter = converter

    def format_entry(self, entry):
        """Format a single entry from the dataset."""
        try:
            title = entry.get('title', 'Untitled')
            url = entry.get('url', '')
            html_content = entry.get('html', '')
            markdown_content = self.converter.convert(html_content)
            return self.structure_markdown(title, url, markdown_content)
        except Exception as e:
            logging.error(f"Error formatting entry: {e}")
            return ""

    def structure_markdown(self, title, url, content):
        """Structure the Markdown content with a title header and source link."""
        structured_content = f"## {title}\n\n"
        if url:
            structured_content += f"[Read More]({url})\n\n"
        structured_content += content
        return structured_content

    def format_dataset(self, data):
        """Format the entire dataset, separating entries with blank lines."""
        return "\n\n".join(self.format_entry(entry) for entry in data)

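# A minimal usage sketch for the classes above (hypothetical entry data):
#   formatter = DatasetFormatter(HTMLToMarkdownConverter())
#   print(formatter.format_entry({'title': 'Intro', 'url': '', 'html': '<p>Hi</p>'}))
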
def load_json(file_path):
    """Load the JSON dataset from disk."""
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

def save_output_in_chunks(file_path, contents, chunk_size=1024):
    """Write the formatted contents to disk, flushing after large writes."""
    with open(file_path, 'w', encoding='utf-8') as file:
        for content in contents:
            file.write(content)
            if len(content) > chunk_size:
                file.flush()  # Flush after writing a large chunk.

def chunk_dataset(data, chunk_size):
    """Yield successive chunks of up to `chunk_size` entries each."""
    for i in range(0, len(data), chunk_size):
        yield data[i:i + chunk_size]

def process_chunk(chunk):
    """Process a single chunk of the dataset."""
    formatter = DatasetFormatter(HTMLToMarkdownConverter())
    return formatter.format_dataset(chunk)

def main():
    logging.basicConfig(level=logging.INFO)
    try:
        original_data = load_json('transformers_documentation-gpt-crawler_output.json')
        chunk_size = 200  # Number of entries per chunk; adjust as needed.
        max_threads = 10  # Maximum number of worker threads; adjust as needed.

        chunks = list(chunk_dataset(original_data, chunk_size))

        # Format the chunks in parallel; executor.map preserves chunk order.
        with ThreadPoolExecutor(max_workers=max_threads) as executor:
            results = executor.map(process_chunk, chunks)
            # Append a blank line to each chunk so entries do not run
            # together across chunk boundaries.
            formatted_contents = [result + "\n\n" for result in results]

        save_output_in_chunks('transformers_documentation-gpt-crawler-curated_markdown.md',
                              formatted_contents)
        logging.info("Content formatted and saved in chunks successfully.")
    except Exception as e:
        logging.error(f"An error occurred: {e}")


if __name__ == "__main__":
    main()