Skip to content

Commit 83ff0ae

Browse files
authored
Merge pull request #455 from Iamrodos/fix-133
Avoid rewriting unchanged JSON files for labels, milestones, releases…
2 parents 8b7512c + 5739ac0 commit 83ff0ae

File tree

2 files changed

+283
-11
lines changed

2 files changed

+283
-11
lines changed

github_backup/github_backup.py

Lines changed: 85 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -1898,11 +1898,21 @@ def backup_milestones(args, repo_cwd, repository, repos_template):
18981898
for milestone in _milestones:
18991899
milestones[milestone["number"]] = milestone
19001900

1901-
logger.info("Saving {0} milestones to disk".format(len(list(milestones.keys()))))
1901+
written_count = 0
19021902
for number, milestone in list(milestones.items()):
19031903
milestone_file = "{0}/{1}.json".format(milestone_cwd, number)
1904-
with codecs.open(milestone_file, "w", encoding="utf-8") as f:
1905-
json_dump(milestone, f)
1904+
if json_dump_if_changed(milestone, milestone_file):
1905+
written_count += 1
1906+
1907+
total = len(milestones)
1908+
if written_count == total:
1909+
logger.info("Saved {0} milestones to disk".format(total))
1910+
elif written_count == 0:
1911+
logger.info("{0} milestones unchanged, skipped write".format(total))
1912+
else:
1913+
logger.info("Saved {0} of {1} milestones to disk ({2} unchanged)".format(
1914+
written_count, total, total - written_count
1915+
))
19061916

19071917

19081918
def backup_labels(args, repo_cwd, repository, repos_template):
@@ -1955,19 +1965,17 @@ def backup_releases(args, repo_cwd, repository, repos_template, include_assets=F
19551965
reverse=True,
19561966
)
19571967
releases = releases[: args.number_of_latest_releases]
1958-
logger.info("Saving the latest {0} releases to disk".format(len(releases)))
1959-
else:
1960-
logger.info("Saving {0} releases to disk".format(len(releases)))
19611968

19621969
# for each release, store it
1970+
written_count = 0
19631971
for release in releases:
19641972
release_name = release["tag_name"]
19651973
release_name_safe = release_name.replace("/", "__")
19661974
output_filepath = os.path.join(
19671975
release_cwd, "{0}.json".format(release_name_safe)
19681976
)
1969-
with codecs.open(output_filepath, "w+", encoding="utf-8") as f:
1970-
json_dump(release, f)
1977+
if json_dump_if_changed(release, output_filepath):
1978+
written_count += 1
19711979

19721980
if include_assets:
19731981
assets = retrieve_data(args, release["assets_url"])
@@ -1984,6 +1992,17 @@ def backup_releases(args, repo_cwd, repository, repos_template, include_assets=F
19841992
fine=True if args.token_fine is not None else False,
19851993
)
19861994

1995+
# Log the results
1996+
total = len(releases)
1997+
if written_count == total:
1998+
logger.info("Saved {0} releases to disk".format(total))
1999+
elif written_count == 0:
2000+
logger.info("{0} releases unchanged, skipped write".format(total))
2001+
else:
2002+
logger.info("Saved {0} of {1} releases to disk ({2} unchanged)".format(
2003+
written_count, total, total - written_count
2004+
))
2005+
19872006

19882007
def fetch_repository(
19892008
name,
@@ -2108,9 +2127,10 @@ def _backup_data(args, name, template, output_file, output_directory):
21082127
mkdir_p(output_directory)
21092128
data = retrieve_data(args, template)
21102129

2111-
logger.info("Writing {0} {1} to disk".format(len(data), name))
2112-
with codecs.open(output_file, "w", encoding="utf-8") as f:
2113-
json_dump(data, f)
2130+
if json_dump_if_changed(data, output_file):
2131+
logger.info("Saved {0} {1} to disk".format(len(data), name))
2132+
else:
2133+
logger.info("{0} {1} unchanged, skipped write".format(len(data), name))
21142134

21152135

21162136
def json_dump(data, output_file):
@@ -2122,3 +2142,57 @@ def json_dump(data, output_file):
21222142
indent=4,
21232143
separators=(",", ": "),
21242144
)
2145+
2146+
2147+
def json_dump_if_changed(data, output_file_path):
    """
    Write JSON data to file only if content has changed.

    Compares the serialized JSON data with the existing file content
    and only writes if different. This prevents unnecessary file
    modification timestamp updates and disk writes.

    Uses atomic writes (temp file + replace) to prevent corruption
    if the process is interrupted during the write.

    Args:
        data: The data to serialize as JSON
        output_file_path: The path to the output file

    Returns:
        True if file was written (content changed or new file)
        False if write was skipped (content unchanged)
    """
    # Serialize new data with consistent formatting matching json_dump()
    new_content = json.dumps(
        data,
        ensure_ascii=False,
        sort_keys=True,
        indent=4,
        separators=(",", ": "),
    )

    # Check if file exists and compare content
    if os.path.exists(output_file_path):
        try:
            with codecs.open(output_file_path, "r", encoding="utf-8") as f:
                existing_content = f.read()
            if existing_content == new_content:
                logger.debug(
                    "Content unchanged, skipping write: {0}".format(output_file_path)
                )
                return False
        except (OSError, UnicodeDecodeError) as e:
            # If we can't read the existing file, write the new one
            logger.debug(
                "Error reading existing file {0}, will overwrite: {1}".format(
                    output_file_path, e
                )
            )

    # Write the file atomically: write a sibling temp file, then move it over
    # the destination. os.replace() (not os.rename()) is used because it
    # atomically overwrites an existing destination on both POSIX and Windows;
    # os.rename() raises FileExistsError on Windows when the target exists,
    # which is exactly the changed-content case this function handles.
    temp_file = output_file_path + ".temp"
    try:
        with codecs.open(temp_file, "w", encoding="utf-8") as f:
            f.write(new_content)
        os.replace(temp_file, output_file_path)
    except Exception:
        # Don't leave a stale .temp file behind if the write or move failed.
        try:
            os.remove(temp_file)
        except OSError:
            pass
        raise
    return True

tests/test_json_dump_if_changed.py

Lines changed: 198 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,198 @@
1+
"""Tests for json_dump_if_changed functionality."""
2+
3+
import codecs
4+
import json
5+
import os
6+
import tempfile
7+
8+
import pytest
9+
10+
from github_backup import github_backup
11+
12+
13+
class TestJsonDumpIfChanged:
    """Test suite for json_dump_if_changed function.

    Exercises the write/skip decision, atomic temp-file handling,
    formatting parity with json_dump(), and Unicode/error fallbacks.
    """

    def test_writes_new_file(self):
        """Should write file when it doesn't exist."""
        with tempfile.TemporaryDirectory() as tmpdir:
            output_file = os.path.join(tmpdir, "test.json")
            test_data = {"key": "value", "number": 42}

            result = github_backup.json_dump_if_changed(test_data, output_file)

            assert result is True
            assert os.path.exists(output_file)

            # Verify content matches expected format
            with codecs.open(output_file, "r", encoding="utf-8") as f:
                content = f.read()
            loaded = json.loads(content)
            assert loaded == test_data

    def test_skips_unchanged_file(self):
        """Should skip write when content is identical."""
        with tempfile.TemporaryDirectory() as tmpdir:
            output_file = os.path.join(tmpdir, "test.json")
            test_data = {"key": "value", "number": 42}

            # First write
            result1 = github_backup.json_dump_if_changed(test_data, output_file)
            assert result1 is True

            # Get the initial mtime
            mtime1 = os.path.getmtime(output_file)

            # Second write with same data
            result2 = github_backup.json_dump_if_changed(test_data, output_file)
            assert result2 is False

            # File should not have been modified
            # NOTE(review): relies on the file genuinely not being rewritten;
            # equal mtimes alone could false-pass on coarse-resolution
            # filesystems if both writes land in the same tick.
            mtime2 = os.path.getmtime(output_file)
            assert mtime1 == mtime2

    def test_writes_when_content_changed(self):
        """Should write file when content has changed."""
        with tempfile.TemporaryDirectory() as tmpdir:
            output_file = os.path.join(tmpdir, "test.json")
            test_data1 = {"key": "value1"}
            test_data2 = {"key": "value2"}

            # First write
            result1 = github_backup.json_dump_if_changed(test_data1, output_file)
            assert result1 is True

            # Second write with different data
            result2 = github_backup.json_dump_if_changed(test_data2, output_file)
            assert result2 is True

            # Verify new content
            with codecs.open(output_file, "r", encoding="utf-8") as f:
                loaded = json.load(f)
            assert loaded == test_data2

    def test_uses_consistent_formatting(self):
        """Should use same JSON formatting as json_dump."""
        with tempfile.TemporaryDirectory() as tmpdir:
            output_file = os.path.join(tmpdir, "test.json")
            test_data = {"z": "last", "a": "first", "m": "middle"}

            github_backup.json_dump_if_changed(test_data, output_file)

            with codecs.open(output_file, "r", encoding="utf-8") as f:
                content = f.read()

            # Check for consistent formatting:
            # - sorted keys
            # - 4-space indent
            # - comma-colon-space separator
            expected = json.dumps(
                test_data,
                ensure_ascii=False,
                sort_keys=True,
                indent=4,
                separators=(",", ": "),
            )
            assert content == expected

    def test_atomic_write_always_used(self):
        """Should always use temp file and rename for atomic writes."""
        with tempfile.TemporaryDirectory() as tmpdir:
            output_file = os.path.join(tmpdir, "test.json")
            test_data = {"key": "value"}

            result = github_backup.json_dump_if_changed(test_data, output_file)

            assert result is True
            assert os.path.exists(output_file)

            # Temp file should not exist after atomic write
            temp_file = output_file + ".temp"
            assert not os.path.exists(temp_file)

            # Verify content
            with codecs.open(output_file, "r", encoding="utf-8") as f:
                loaded = json.load(f)
            assert loaded == test_data

    def test_handles_unicode_content(self):
        """Should correctly handle Unicode content."""
        with tempfile.TemporaryDirectory() as tmpdir:
            output_file = os.path.join(tmpdir, "test.json")
            test_data = {
                "emoji": "🚀",
                "chinese": "你好",
                "arabic": "مرحبا",
                "cyrillic": "Привет",
            }

            result = github_backup.json_dump_if_changed(test_data, output_file)
            assert result is True

            # Verify Unicode is preserved
            with codecs.open(output_file, "r", encoding="utf-8") as f:
                loaded = json.load(f)
            assert loaded == test_data

            # Second write should skip
            result2 = github_backup.json_dump_if_changed(test_data, output_file)
            assert result2 is False

    def test_handles_complex_nested_data(self):
        """Should handle complex nested data structures."""
        with tempfile.TemporaryDirectory() as tmpdir:
            output_file = os.path.join(tmpdir, "test.json")
            test_data = {
                "users": [
                    {"id": 1, "name": "Alice", "tags": ["admin", "user"]},
                    {"id": 2, "name": "Bob", "tags": ["user"]},
                ],
                "metadata": {"version": "1.0", "nested": {"deep": {"value": 42}}},
            }

            result = github_backup.json_dump_if_changed(test_data, output_file)
            assert result is True

            # Verify structure is preserved
            with codecs.open(output_file, "r", encoding="utf-8") as f:
                loaded = json.load(f)
            assert loaded == test_data

    def test_overwrites_on_unicode_decode_error(self):
        """Should overwrite if existing file has invalid UTF-8."""
        with tempfile.TemporaryDirectory() as tmpdir:
            output_file = os.path.join(tmpdir, "test.json")
            test_data = {"key": "value"}

            # Write invalid UTF-8 bytes
            with open(output_file, "wb") as f:
                f.write(b"\xff\xfe invalid utf-8")

            # Should catch UnicodeDecodeError and overwrite
            result = github_backup.json_dump_if_changed(test_data, output_file)
            assert result is True

            # Verify new content was written
            with codecs.open(output_file, "r", encoding="utf-8") as f:
                loaded = json.load(f)
            assert loaded == test_data

    def test_key_order_independence(self):
        """Should treat differently-ordered dicts as same if keys/values match."""
        with tempfile.TemporaryDirectory() as tmpdir:
            output_file = os.path.join(tmpdir, "test.json")

            # Write first dict
            data1 = {"z": 1, "a": 2, "m": 3}
            github_backup.json_dump_if_changed(data1, output_file)

            # Try to write same data but different order
            data2 = {"a": 2, "m": 3, "z": 1}
            result = github_backup.json_dump_if_changed(data2, output_file)

            # Should skip because content is the same (keys are sorted)
            assert result is False
195+
196+
197+
# Allow running this test module directly (outside a pytest invocation).
if __name__ == "__main__":
    pytest.main([__file__, "-v"])

0 commit comments

Comments
 (0)