-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_exploration.py
More file actions
306 lines (242 loc) · 10.4 KB
/
data_exploration.py
File metadata and controls
306 lines (242 loc) · 10.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
import os
import re
from bs4 import BeautifulSoup
import json
from pathlib import Path

# NOTE(review): `re` and `Path` are imported but not used anywhere in this
# file — confirm against the full repository before removing.

# Directory containing the downloaded wiki HTML files to process.
DATA_DIR = "./data"

# Directory where the extracted article content (JSON) is written.
OUTPUT_DIR = "./extracted_content"

# Create output directory if it doesn't exist (no-op when already present).
os.makedirs(OUTPUT_DIR, exist_ok=True)
def extract_article_content(html_content):
    """Pull the readable article body out of a wiki HTML page.

    Returns a dict with the page title and a list of
    ``{"type": tag_name, "text": text}`` entries, or a placeholder
    content string when the article container is missing.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # The <title> usually reads "Article | Site name"; keep the left part.
    title_tag = soup.find('title')
    if title_tag:
        title = title_tag.text.split('|')[0].strip()
    else:
        title = "Unknown Title"

    # MediaWiki renders the article body inside div.mw-parser-output.
    body = soup.find('div', class_='mw-parser-output')
    if body is None:
        return {"title": title, "content": "No content found"}

    wanted_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'ul', 'ol', 'dl']
    parts = []
    for node in body.children:
        # Plain-text nodes (NavigableString) carry no .name; skip them.
        if not hasattr(node, 'name'):
            continue
        # Infoboxes and navigation templates live in <aside>/<table>.
        if node.name in ('aside', 'table'):
            continue
        # Also skip any element whose class list mentions "infobox".
        classes = node.get('class') if hasattr(node, 'get') else None
        if classes and any('infobox' in cls for cls in classes):
            continue
        if node.name in wanted_tags:
            # Drop the "[edit]" links MediaWiki injects into headings.
            for edit_link in node.find_all(class_='mw-editsection'):
                edit_link.decompose()
            text = node.get_text().strip()
            if text:
                parts.append({"type": node.name, "text": text})

    return {
        "title": title,
        "content": parts
    }
def process_html_files():
    """Extract article content from every HTML file in DATA_DIR.

    Each page is parsed with extract_article_content() and the result is
    written as ``<same basename>.json`` under OUTPUT_DIR.  Failures on
    individual files are reported and skipped so one bad page does not
    abort the whole batch.
    """
    html_files = [f for f in os.listdir(DATA_DIR) if f.endswith('.html')]
    print(f"Found {len(html_files)} HTML files to process")

    for i, filename in enumerate(html_files):
        # Light progress indicator: report every 10th file.
        # Fix: the message previously printed the literal text "(unknown)"
        # instead of the file actually being processed.
        if i % 10 == 0:
            print(f"Processing file {i+1}/{len(html_files)}: {filename}")

        file_path = os.path.join(DATA_DIR, filename)
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                html_content = f.read()

            article_data = extract_article_content(html_content)

            # Mirror the input basename with a .json extension.
            output_filename = os.path.splitext(filename)[0] + '.json'
            output_path = os.path.join(OUTPUT_DIR, output_filename)
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(article_data, f, ensure_ascii=False, indent=2)
        except Exception as e:
            # Best-effort batch: name the failing file (was "(unknown)")
            # and continue with the rest.
            print(f"Error processing {filename}: {e}")

    print(f"Processed {len(html_files)} files. Extracted content saved to {OUTPUT_DIR}")
def analyze_content_size():
    """Compare the total size of extracted JSON against the source HTML."""
    html_files = [f for f in os.listdir(DATA_DIR) if f.endswith('.html')]
    json_files = [f for f in os.listdir(OUTPUT_DIR) if f.endswith('.json')]

    if not json_files:
        print("No extracted content files found. Run process_html_files() first.")
        return

    total_html_size = 0
    total_json_size = 0
    for html_file in html_files:
        json_name = os.path.splitext(html_file)[0] + '.json'
        json_path = os.path.join(OUTPUT_DIR, json_name)
        # Only count pairs where extraction actually produced output.
        if not os.path.exists(json_path):
            continue
        total_html_size += os.path.getsize(os.path.join(DATA_DIR, html_file))
        total_json_size += os.path.getsize(json_path)

    if total_html_size > 0:
        saved = total_html_size - total_json_size
        reduction_percentage = (saved / total_html_size) * 100
        print(f"Original HTML total size: {total_html_size / 1024:.2f} KB")
        print(f"Extracted content total size: {total_json_size / 1024:.2f} KB")
        print(f"Size reduction: {reduction_percentage:.2f}%")
def sample_extracted_content(num_samples=3):
    """Print a short preview of up to *num_samples* extracted articles."""
    json_files = [f for f in os.listdir(OUTPUT_DIR) if f.endswith('.json')]
    if not json_files:
        print("No extracted content files found. Run process_html_files() first.")
        return

    # Preview whichever files happen to come first in directory order.
    for sample in json_files[:num_samples]:
        with open(os.path.join(OUTPUT_DIR, sample), 'r', encoding='utf-8') as f:
            data = json.load(f)

        print(f"\n--- Sample: {sample} ---")
        print(f"Title: {data['title']}")
        print("Content preview:")
        # Show only the first five elements, truncated to 100 chars each.
        for element in data['content'][:5]:
            if isinstance(element, dict):
                print(f" {element['type']}: {element['text'][:100]}...")
            else:
                print(f" {element}")
        print(f" ... ({len(data['content'])} elements total)")
def create_minimal_html_version():
    """Render each extracted JSON article as a small standalone HTML page.

    Reads every .json file in OUTPUT_DIR and writes a matching .html file
    into ./minimal_html containing inline CSS, a heading, the article
    body, and a footer linking back to the original wiki page.
    """
    minimal_html_dir = "./minimal_html"
    os.makedirs(minimal_html_dir, exist_ok=True)
    json_files = [f for f in os.listdir(OUTPUT_DIR) if f.endswith('.json')]
    if not json_files:
        print("No extracted content files found. Run process_html_files() first.")
        return
    for json_file in json_files:
        json_path = os.path.join(OUTPUT_DIR, json_file)
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        # Page skeleton with inline styling; article elements are appended
        # to this string below.
        html_content = f"""<!DOCTYPE html>
<html>
<head>
<title>{data['title']}</title>
<style>
body {{
font-family: Arial, sans-serif;
line-height: 1.6;
max-width: 800px;
margin: 0 auto;
padding: 20px;
color: #333;
}}
h1, h2, h3, h4 {{
color: #444;
}}
h1 {{
border-bottom: 1px solid #ddd;
padding-bottom: 10px;
}}
p {{
margin-bottom: 16px;
}}
.quote {{
font-style: italic;
margin: 20px 0;
padding-left: 20px;
border-left: 4px solid #ddd;
}}
.footer {{
margin-top: 30px;
padding-top: 10px;
border-top: 1px solid #ddd;
font-size: 0.9em;
color: #666;
}}
</style>
</head>
<body>
<h1>{data['title']}</h1>
"""
        # Heuristic: a very long first paragraph with many newlines is
        # usually leaked infobox text rather than prose, so skip it.
        content_elements = data['content']
        skip_first = False
        if isinstance(content_elements, list) and len(content_elements) > 0:
            if content_elements[0]['type'] == 'p' and len(content_elements[0]['text']) > 500 and content_elements[0]['text'].count('\n') > 10:
                skip_first = True
        # Convert each extracted element back into simple HTML.
        for i, element in enumerate(content_elements):
            # Non-dict entries (e.g. the "No content found" placeholder
            # string) cannot be rendered; skip them.
            if not isinstance(element, dict):
                continue
            # Skip the first element if flagged as infobox leakage above.
            if i == 0 and skip_first:
                continue
            element_type = element['type']
            text = element['text']
            if element_type in ['h1', 'h2', 'h3', 'h4']:
                html_content += f" <{element_type}>{text}</{element_type}>\n"
            elif element_type == 'p':
                html_content += f" <p>{text}</p>\n"
            elif element_type == 'dl':
                # Definition lists are rendered as styled quote blocks.
                html_content += f' <div class="quote">{text}</div>\n'
            elif element_type == 'ul':
                # Extracted list text is newline-separated; one <li> per line.
                items = text.split('\n')
                html_content += " <ul>\n"
                for item in items:
                    if item.strip():
                        html_content += f" <li>{item.strip()}</li>\n"
                html_content += " </ul>\n"
            elif element_type == 'ol':
                # Same treatment for ordered lists.
                items = text.split('\n')
                html_content += " <ol>\n"
                for item in items:
                    if item.strip():
                        html_content += f" <li>{item.strip()}</li>\n"
                html_content += " </ol>\n"
        # Footer linking back to the source wiki article; the wiki URL slug
        # is presumed to equal the local basename — TODO confirm.
        original_filename = os.path.splitext(json_file)[0] + '.html'
        html_content += f"""
<div class="footer">
<p>This is a minimal version of the article content extracted from the Civilization 5 Wiki.</p>
<p>Original article: <a href="https://civilization.fandom.com/wiki/{original_filename.replace('.html', '')}">{data['title']}</a></p>
</div>
</body>
</html>"""
        # Write next to the other minimal pages, reusing the JSON basename.
        html_filename = os.path.splitext(json_file)[0] + '.html'
        html_path = os.path.join(minimal_html_dir, html_filename)
        with open(html_path, 'w', encoding='utf-8') as f:
            f.write(html_content)
    print(f"Created improved minimal HTML versions in {minimal_html_dir}")
if __name__ == "__main__":
    # Full pipeline: extract -> measure -> preview -> render.
    print("Starting data exploration...")
    # Process the HTML files into JSON under OUTPUT_DIR
    process_html_files()
    # Analyze the content size reduction vs. the original HTML
    analyze_content_size()
    # Show some samples of what was extracted
    sample_extracted_content()
    # Create minimal standalone HTML versions
    create_minimal_html_version()
    print("Data exploration complete!")