Skip to content

Commit 7b22c89

Browse files
authored
Create conv_html_to_markdown.py
1 parent b8f46c4 commit 7b22c89

File tree

1 file changed

+104
-0
lines changed

1 file changed

+104
-0
lines changed

conv_html_to_markdown.py

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
import json
2+
import logging
3+
from bs4 import BeautifulSoup
4+
from markdownify import markdownify as md
5+
from concurrent.futures import ThreadPoolExecutor
6+
7+
class HTMLToMarkdownConverter:
    """Convert HTML documents to Markdown, with optional tag stripping.

    Relies on the file-level imports of ``BeautifulSoup`` (bs4) and
    ``markdownify.markdownify`` (aliased ``md``).
    """

    def __init__(self, strip_tags=None, convert_links=True):
        """Initialize converter with configuration options.

        Args:
            strip_tags: Optional list of tag names to remove from the
                Markdown output (passed to markdownify's ``strip`` option).
            convert_links: When False, ``<a>`` tags are stripped instead of
                being converted to Markdown links.
        """
        self.strip_tags = strip_tags or []
        self.convert_links = convert_links

    def convert(self, html_content):
        """Convert HTML content to Markdown; return "" on failure.

        BUG FIX: markdownify's keyword for tags to strip is ``strip``, not
        ``strip_tags``, and it has no ``convert_links`` option — unknown
        options are absorbed silently, so neither setting previously had any
        effect. Link conversion is now disabled by stripping ``a`` tags.
        """
        try:
            curated_html = self.curate_content(html_content)
            strip = list(self.strip_tags)
            if not self.convert_links and 'a' not in strip:
                strip.append('a')
            # ``strip=None`` (the markdownify default) converts everything.
            return md(curated_html, strip=strip or None)
        except Exception as e:
            logging.error(f"Error in HTML to Markdown conversion: {e}")
            return ""

    def curate_content(self, html):
        """Curate the HTML content before conversion.

        Currently a pass-through: parsing and re-serializing with
        BeautifulSoup also normalizes malformed markup as a side effect.
        """
        soup = BeautifulSoup(html, 'html.parser')
        # Implement specific curation logic here based on the content nature.
        return str(soup)
27+
28+
class DatasetFormatter:
    """Format crawled dataset entries into a single Markdown document."""

    def __init__(self, converter):
        """Store the converter used for HTML-to-Markdown conversion.

        Args:
            converter: Any object exposing ``convert(html) -> str``.
        """
        self.converter = converter

    def format_entry(self, entry):
        """Format a single entry from the dataset.

        Args:
            entry: Mapping with optional ``title``, ``url`` and ``html`` keys.

        Returns:
            The structured Markdown for the entry, or "" on failure (the
            error is logged rather than raised).
        """
        try:
            title = entry.get('title', 'Untitled')
            url = entry.get('url', '')
            html_content = entry.get('html', '')
            markdown_content = self.converter.convert(html_content)
            return self.structure_markdown(title, url, markdown_content)
        except Exception as e:
            logging.error(f"Error formatting entry: {e}")
            return ""

    def structure_markdown(self, title, url, content):
        """Structure the Markdown content with a header and optional link."""
        structured_content = f"## {title}\n\n"
        if url:
            structured_content += f"[Read More]({url})\n\n"
        structured_content += content
        return structured_content

    def format_dataset(self, data):
        """Format the entire dataset into one Markdown string.

        Uses ``str.join`` instead of repeated ``+=`` concatenation, which
        is quadratic in the worst case for large datasets.
        """
        return "".join(self.format_entry(entry) for entry in data)
58+
59+
def load_json(file_path):
    """Load and return the parsed contents of a JSON file.

    Opens the file with an explicit UTF-8 encoding so behavior does not
    depend on the platform's default locale encoding.

    Args:
        file_path: Path to the JSON file to read.

    Returns:
        The deserialized JSON value.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)
63+
64+
def save_output_in_chunks(file_path, contents, chunk_size=1024):
    """Write each piece of *contents* to *file_path* in order.

    Args:
        file_path: Destination path (overwritten if it exists).
        contents: Iterable of strings to write sequentially.
        chunk_size: Pieces longer than this trigger an explicit flush.
    """
    # Explicit UTF-8 so the output file does not depend on the platform's
    # default locale encoding.
    with open(file_path, 'w', encoding='utf-8') as file:
        for content in contents:
            file.write(content)
            if len(content) > chunk_size:
                file.flush()  # Flush after writing a large chunk.
71+
72+
def chunk_dataset(data, chunk_size):
    """Split *data* into consecutive slices of at most *chunk_size* items.

    Yields the slices lazily; the final chunk may be shorter than
    *chunk_size* when ``len(data)`` is not an exact multiple of it.
    """
    total = len(data)
    for offset in range(0, total, chunk_size):
        yield data[offset:offset + chunk_size]
76+
77+
def process_chunk(chunk):
    """Format one chunk of dataset entries into Markdown.

    Builds a fresh converter/formatter pair per chunk so concurrent calls
    share no state.
    """
    converter = HTMLToMarkdownConverter()
    return DatasetFormatter(converter).format_dataset(chunk)
81+
82+
def main():
    """Load the crawled JSON dataset, convert it to Markdown across a
    thread pool, and save the combined result to disk.

    All failures are caught at this top-level boundary and logged.
    """
    logging.basicConfig(level=logging.INFO)
    try:
        original_data = load_json('transformers_documentation-gpt-crawler_output.json')
        chunk_size = 200   # Entries per chunk; adjust as needed.
        max_threads = 10   # Upper bound on worker threads; adjust as needed.

        chunks = list(chunk_dataset(original_data, chunk_size))

        # ``executor.map`` preserves input order, so chunks come back in
        # the same order they were submitted.
        with ThreadPoolExecutor(max_workers=max_threads) as executor:
            formatted_contents = list(executor.map(process_chunk, chunks))

        # BUG FIX: the previous version computed '\n'.join(formatted_contents)
        # into an unused variable and wrote the raw list anyway. The dead
        # code is removed; the on-disk output is unchanged.
        save_output_in_chunks(
            'transformers_documentation-gpt-crawler-curated_markdown.md',
            formatted_contents,
        )
        logging.info("Content formatted and saved in chunks successfully.")
    except Exception as e:
        logging.error(f"An error occurred: {e}")


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)