# batch_compare.py
# Compare live web pages against their archived (Wayback / Archive-It) captures
# and append a per-URL change summary to a CSV file.
### Import relevant libraries ###
import requests ## <- library for making http requests
from bs4 import BeautifulSoup ## <- library for scraping info from webpages
import difflib ## <- library for comparing text strings
import csv ## <- to read and write csv files
import os ## <- to write files
### Define constants ###
ARCHIVE_PREFIX = "https://wayback.archive-it.org/9618/20250701131117/"  ## <- Wayback prefix of the capture of interest; each live URL is appended to it to form the archived URL
INPUT_CSV = "dummy.csv"  ## <- Header-less CSV whose first column holds the full URLs to compare, one per row
OUTPUT_CSV = "site_diff_summary.csv"  ## <- CSV that results are appended to (and that the final summary is read back from)
### Defines the conditions for fetching the html ###
def fetch_html(url):
    """Download *url* and return the response body as text.

    The request uses a 10 second timeout, and HTTP error statuses are
    treated as failures. Any exception raised while fetching is reported
    on stdout and turned into a ``None`` return, so callers can skip the
    page instead of crashing.
    """
    try:
        reply = requests.get(url, timeout=10)  ## <- give up after 10 seconds
        reply.raise_for_status()  ## <- turn 4xx/5xx responses into exceptions
        body = reply.text
    except Exception as err:
        print(f"❌ Failed to fetch {url}: {err}")
        return None
    return body
### Extract the text blocks from the html ###
def extract_tagged_blocks(html):
    """Turn an HTML document into a list of ``"<tag> text"`` strings.

    ``script``, ``style`` and ``noscript`` elements are removed first. Every
    heading (h1-h6), ``p``, ``li`` and ``div`` element is then rendered to
    its whitespace-stripped text, and elements with no text are dropped.

    NOTE: a container such as ``div`` also yields the text of its
    descendants, so the same text can show up in more than one block.
    """
    soup = BeautifulSoup(html, 'html.parser')

    ## Drop elements whose content is never visible page text.
    for hidden in soup(["script", "style", "noscript"]):
        hidden.decompose()

    wanted_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'li', 'div']
    labelled = (
        (node.name, node.get_text(separator=' ', strip=True))
        for node in soup.find_all(wanted_tags)
    )
    return [f"<{name}> {content}" for name, content in labelled if content]
### Define a function to compare live to archived sites ###
def compare_sites(live_url, archive_url):
    """Compare the text blocks of a live page with its archived capture.

    Both pages are downloaded with ``fetch_html`` and reduced to tagged text
    blocks with ``extract_tagged_blocks``. The two block lists are diffed
    with ``difflib.SequenceMatcher``, using the archive as the "before"
    sequence and the live page as the "after" sequence.

    Args:
        live_url: URL of the page as it is served today.
        archive_url: URL of the archived capture of the same page.

    Returns:
        ``None`` if either page could not be fetched (or came back empty).
        Otherwise a dict with the keys ``"URL"``,
        ``"Percentage of Content Changed"`` (a string such as ``"12.5%"``),
        ``"Additions"``, ``"Deletions"`` and ``"Inline Changes"``.
    """
    live_html = fetch_html(live_url)
    archive_html = fetch_html(archive_url)
    if not live_html or not archive_html:
        return None  # Skip comparison if either fetch failed

    live_blocks = extract_tagged_blocks(live_html)
    archive_blocks = extract_tagged_blocks(archive_html)

    additions = deletions = inline_changes = 0
    # Opcodes describe how to turn sequence a (archive) into sequence b (live):
    #   'insert'  -> b[j1:j2] exists only in the live page     => additions
    #   'delete'  -> a[i1:i2] exists only in the archived page => deletions
    #   'replace' -> a[i1:i2] was rewritten as b[j1:j2]        => inline changes
    matcher = difflib.SequenceMatcher(None, archive_blocks, live_blocks)
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == 'insert':
            additions += j2 - j1
        elif tag == 'delete':
            deletions += i2 - i1
        elif tag == 'replace':
            inline_changes += max(i2 - i1, j2 - j1)

    # Rough heuristic only: the denominator counts *distinct* blocks across
    # both pages while the numerator counts every changed position, so pages
    # with many repeated blocks can score above 100%.
    total_blocks = len(set(archive_blocks + live_blocks))
    total_changes = additions + deletions + inline_changes
    percent_changed = (total_changes / total_blocks * 100) if total_blocks > 0 else 0.0

    return {
        "URL": live_url,
        "Percentage of Content Changed": f"{percent_changed:.1f}%",
        "Additions": additions,
        "Deletions": deletions,
        "Inline Changes": inline_changes,
    }
### Write all the results in a csv file ###
def main():
    """Compare every URL listed in INPUT_CSV and summarise the results.

    For each non-blank row of INPUT_CSV the first column is taken as the live
    URL, the archived URL is built by prefixing it with ARCHIVE_PREFIX, and
    the result of ``compare_sites`` is appended to OUTPUT_CSV. A header row
    is written when OUTPUT_CSV does not exist yet or is empty.

    Afterwards OUTPUT_CSV is read back and a 10-bin histogram of the
    "Percentage of Content Changed" column is printed. Because results are
    appended, the histogram also covers rows left over from earlier runs.
    """
    columns = ["URL", "Percentage of Content Changed", "Additions", "Deletions", "Inline Changes"]

    # A missing OR empty output file needs the header; without it the
    # DictReader pass below would treat the first result row as the header.
    write_header = not os.path.exists(OUTPUT_CSV) or os.path.getsize(OUTPUT_CSV) == 0

    with open(INPUT_CSV, newline='', encoding='utf-8') as infile, \
            open(OUTPUT_CSV, "a", newline='', encoding='utf-8') as outfile:
        reader = csv.reader(infile)
        writer = csv.writer(outfile)
        if write_header:
            writer.writerow(columns)

        for row in reader:
            if not row or not row[0].strip():
                continue  # skip blank lines
            live_url = row[0].strip()
            archive_url = ARCHIVE_PREFIX + live_url
            print(f"🔍 Comparing: {live_url}")
            result = compare_sites(live_url, archive_url)
            if result:
                writer.writerow([result[column] for column in columns])
                print(f"✅ Done: {live_url}")  ## <- the sites were successfully compared
            else:
                print(f"⚠️ Skipped: {live_url}")  ## <- the sites were not successfully compared

    ### The output file is closed (and flushed) at this point, so the summary
    ### pass below sees every row written above. Placeholder summary statistic.
    bins = [0] * 10  # 10 bins: 0-10%, 10-20%, ..., 90-100%
    with open(OUTPUT_CSV, newline='', encoding='utf-8') as summary_file:
        for row in csv.DictReader(summary_file):
            # A short or malformed row may lack the column (missing key or None).
            percent_str = (row.get("Percentage of Content Changed") or "").strip().rstrip('%')
            try:
                pct = float(percent_str)
                index = int(pct // 10)
            except (ValueError, OverflowError):
                continue  # skip malformed, NaN or infinite percentages
            # Clamp into the valid bin range: values >= 100% land in the last
            # bin and negative values land in the first instead of wrapping.
            bins[min(max(index, 0), 9)] += 1

    ### Print histogram summary ###
    print("📊 Change Summary:")
    for i, count in enumerate(bins):
        low = i * 10
        high = low + 10
        print(f" {low:2d}–{high:3d}% : {count} URLs")
### Run the batch comparison only when executed as a script, not on import ###
if __name__ == "__main__":
    main()