
Commit 6358ce6

Refactor compare_and_extract_chunks and improve test coverage
To do: Remove additional comments after approval. Thank you for helping me with my first file system operations contribution!
1 parent e81ef60 commit 6358ce6

2 files changed: +125 -91 lines

src/borg/archive.py

Lines changed: 70 additions & 26 deletions

@@ -719,38 +719,82 @@ def extract_helper(self, item, path, hlm, *, dry_run=False):
                 # In this case, we *want* to extract twice, because there is no other way.
                 pass
 
-    def compare_and_extract_chunks(self, item, fs_path):
+    def compare_and_extract_chunks(self, item, fs_path, st=None, *, pi=None, sparse=False):
         """Compare file chunks and patch if needed. Returns True if patching succeeded."""
-        try:
-            st = os.stat(fs_path, follow_symlinks=False)
-            if not stat.S_ISREG(st.st_mode):
-                return False
-
-            with open(fs_path, "rb+") as fs_file:
-                chunk_offset = 0
-                for chunk_entry in item.chunks:
-                    chunkid_A = chunk_entry.id
-                    size = chunk_entry.size
+        if st is None or not stat.S_ISREG(st.st_mode):
+            return False
 
-                    fs_file.seek(chunk_offset)
-                    data_F = fs_file.read(size)
+        try:
+            # First pass: Build fs chunks list
+            fs_chunks = []
+            offset = 0
+            with backup_io("open"):
+                fs_file = open(fs_path, "rb")
+            with fs_file:
+                for chunk in item.chunks:
+                    with backup_io("seek"):
+                        fs_file.seek(offset)
+                    with backup_io("read"):
+                        data = fs_file.read(chunk.size)
+                    if len(data) != chunk.size:
+                        fs_chunks.append(None)
+                    else:
+                        fs_chunks.append(ChunkListEntry(id=self.key.id_hash(data), size=chunk.size))
+                    offset += chunk.size
 
-                    needs_update = True
-                    if len(data_F) == size:
-                        chunkid_F = self.key.id_hash(data_F)
-                        needs_update = chunkid_A != chunkid_F
+            # Compare chunks and collect needed chunk IDs
+            needed_chunks = []
+            for fs_chunk, item_chunk in zip(fs_chunks, item.chunks):
+                if fs_chunk is None or fs_chunk.id != item_chunk.id:
+                    needed_chunks.append(item_chunk)
 
-                    if needs_update:
-                        chunk_data = b"".join(self.pipeline.fetch_many([chunkid_A], ro_type=ROBJ_FILE_STREAM))
-                        fs_file.seek(chunk_offset)
-                        fs_file.write(chunk_data)
+            if not needed_chunks:
+                return True
 
-                    chunk_offset += size
+            # Fetch all needed chunks and iterate through ALL of them
+            chunk_data_iter = self.pipeline.fetch_many(
+                [chunk.id for chunk in needed_chunks], is_preloaded=True, ro_type=ROBJ_FILE_STREAM
+            )
 
-                fs_file.truncate(item.size)
-                return True
+            # Second pass: Update file and consume EVERY chunk from the iterator
+            offset = 0
+            item_chunk_size = 0
+            with backup_io("open"):
+                fs_file = open(fs_path, "rb+")
+            with fs_file:
+                for fs_chunk, item_chunk in zip(fs_chunks, item.chunks):
+                    with backup_io("seek"):
+                        fs_file.seek(offset)
+                    if fs_chunk is not None and fs_chunk.id == item_chunk.id:
+                        offset += item_chunk.size
+                        item_chunk_size += item_chunk.size
+                    else:
+                        chunk_data = next(chunk_data_iter)
+                        if pi:
+                            pi.show(increase=len(chunk_data), info=[remove_surrogates(item.path)])
+                        with backup_io("write"):
+                            if sparse and not chunk_data.strip(b"\0"):
+                                fs_file.seek(len(chunk_data), 1)  # Seek over sparse section
+                                offset += len(chunk_data)
+                            else:
+                                fs_file.write(chunk_data)
+                                offset += len(chunk_data)
+                            item_chunk_size += len(chunk_data)
+                with backup_io("truncate_and_attrs"):
+                    fs_file.truncate(item.size)
+                    fs_file.flush()
+                    self.restore_attrs(fs_path, item, fd=fs_file.fileno())
+
+            # Size verification like extract_item
+            if "size" in item and item.size != item_chunk_size:
+                raise BackupError(f"Size inconsistency detected: size {item.size}, chunks size {item_chunk_size}")
+
+            # Damaged chunks check like extract_item
+            if "chunks_healthy" in item and not item.chunks_healthy:
+                raise BackupError("File has damaged (all-zero) chunks. Try running borg check --repair.")
 
-        except (OSError, Exception):
+            return True
+        except OSError:
             return False
 
     def extract_item(
@@ -855,7 +899,7 @@ def make_parent(path):
         with self.extract_helper(item, path, hlm) as hardlink_set:
             if hardlink_set:
                 return
-            if self.compare_and_extract_chunks(item, path):
+            if self.compare_and_extract_chunks(item, path, pi=pi, sparse=sparse):
                 return
 
             with backup_io("open"):
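For orientation, the control flow of the refactored method can be illustrated with a small, self-contained sketch. This is not borg code: Chunk, chunk_id, and compare_and_patch are hypothetical stand-ins, and the repository fetch is replaced by payloads carried on the chunk objects. It shows the same two-pass idea: hash what is already on disk, then rewrite only the chunks whose IDs differ and truncate the tail.

import hashlib
from dataclasses import dataclass


# Hypothetical stand-in for borg's ChunkListEntry; data is kept inline only for this sketch.
@dataclass
class Chunk:
    id: bytes
    size: int
    data: bytes


def chunk_id(data: bytes) -> bytes:
    return hashlib.sha256(data).digest()


def compare_and_patch(path, archive_chunks):
    """Patch the file at `path` in place to match `archive_chunks`; return bytes rewritten."""
    # First pass: hash what is already on disk, chunk by chunk.
    on_disk = []
    with open(path, "rb") as f:
        for chunk in archive_chunks:
            data = f.read(chunk.size)
            on_disk.append(chunk_id(data) if len(data) == chunk.size else None)

    # Second pass: rewrite only the chunks whose IDs do not match, then drop any trailing bytes.
    rewritten = 0
    offset = 0
    with open(path, "rb+") as f:
        for disk_id, chunk in zip(on_disk, archive_chunks):
            if disk_id != chunk.id:
                f.seek(offset)
                f.write(chunk.data)  # in borg this data comes from the repository pipeline
                rewritten += chunk.size
            offset += chunk.size
        f.truncate(offset)
    return rewritten


if __name__ == "__main__":
    payloads = [b"1111", b"2222"]
    chunks = [Chunk(chunk_id(p), len(p), p) for p in payloads]
    with open("demo.bin", "wb") as f:
        f.write(b"1111XXXXtrailing")  # second chunk is stale, plus extra bytes on disk
    print(compare_and_patch("demo.bin", chunks))  # 4: only the second chunk is rewritten
    with open("demo.bin", "rb") as f:
        print(f.read())  # b'11112222'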

src/borg/testsuite/archive_test.py

Lines changed: 55 additions & 65 deletions

@@ -426,27 +426,31 @@ def __init__(self):
     extractor.pipeline = cache
     extractor.key = key
     extractor.cwd = str(tmpdir)
+    extractor.restore_attrs = Mock()
 
     # Track fetched chunks across tests
     fetched_chunks = []
 
-    def create_mock_chunks(test_data, chunk_size=512):
-        """Helper function to create mock chunks from test data"""
+    def create_mock_chunks(item_data, chunk_size=4):
+        """Helper function to create mock chunks from archive data"""
         chunks = []
-        for i in range(0, len(test_data), chunk_size):
-            chunk_data = test_data[i : i + chunk_size]
+        for i in range(0, len(item_data), chunk_size):
+            chunk_data = item_data[i : i + chunk_size]
             chunk_id = key.id_hash(chunk_data)
             chunks.append(Mock(id=chunk_id, size=len(chunk_data)))
             cache.objects[chunk_id] = chunk_data
 
-        item = Mock(chunks=chunks, size=len(test_data))
-        target_path = str(tmpdir.join("test.txt"))
-        return item, target_path
+        item = Mock(spec=["chunks", "size", "__contains__", "get"])
+        item.chunks = chunks  # Use actual list for chunks
+        item.size = len(item_data)
+        item.__contains__ = lambda self, item: item == "size"
 
-    def mock_fetch_many(chunk_ids, ro_type):
+        return item, str(tmpdir.join("test.txt"))
+
+    def mock_fetch_many(chunk_ids, is_preloaded=True, ro_type=None):
         """Helper function to track and mock chunk fetching"""
         fetched_chunks.extend(chunk_ids)
-        return [cache.objects[chunk_id] for chunk_id in chunk_ids]
+        return iter([cache.objects[chunk_id] for chunk_id in chunk_ids])
 
     def clear_fetched_chunks():
         """Helper function to clear tracked chunks between tests"""
@@ -462,99 +466,85 @@ def get_fetched_chunks():
 
 
 @pytest.mark.parametrize(
-    "name, test_data, initial_data, expected_fetched_chunks, expected_success",
+    "name, item_data, fs_data, expected_fetched_chunks",
     [
         (
             "no_changes",
-            b"A" * 512,  # One complete chunk, no changes needed
-            b"A" * 512,  # Identical content
+            b"1111",  # One complete chunk, no changes needed
+            b"1111",  # Identical content
             0,  # No chunks should be fetched
-            True,
         ),
         (
             "single_chunk_change",
-            b"A" * 512 + b"B" * 512,  # Two chunks
-            b"A" * 512 + b"X" * 512,  # Second chunk different
+            b"11112222",  # Two chunks
+            b"1111XXXX",  # Second chunk different
            1,  # Only second chunk should be fetched
-            True,
         ),
         (
             "cross_boundary_change",
-            b"A" * 512 + b"B" * 512,  # Two chunks
-            b"A" * 500 + b"X" * 24,  # Change crosses chunk boundary
+            b"11112222",  # Two chunks
+            b"111XX22",  # Change crosses chunk boundary
             2,  # Both chunks need update
-            True,
         ),
         (
             "exact_multiple_chunks",
-            b"A" * 512 + b"B" * 512 + b"C" * 512,  # Three complete chunks
-            b"A" * 512 + b"X" * 512 + b"C" * 512,  # Middle chunk different
+            b"11112222333",  # Three chunks (last one partial)
+            b"1111XXXX333",  # Middle chunk different
             1,  # Only middle chunk fetched
-            True,
         ),
         (
             "first_chunk_change",
-            b"A" * 512 + b"B" * 512,  # Two chunks
-            b"X" * 512 + b"B" * 512,  # First chunk different
+            b"11112222",  # Two chunks
+            b"XXXX2222",  # First chunk different
             1,  # Only first chunk should be fetched
-            True,
         ),
         (
             "all_chunks_different",
-            b"A" * 512 + b"B" * 512,  # Two chunks
-            b"X" * 512 + b"Y" * 512,  # Both chunks different
+            b"11112222",  # Two chunks
+            b"XXXXYYYY",  # Both chunks different
             2,  # Both chunks should be fetched
-            True,
         ),
         (
             "partial_last_chunk",
-            b"A" * 512 + b"B" * 100,  # One full chunk + partial
-            b"A" * 512 + b"X" * 100,  # Partial chunk different
+            b"111122",  # One full chunk + partial
+            b"1111XX",  # Partial chunk different
             1,  # Only second chunk should be fetched
-            True,
         ),
     ],
 )
-def test_compare_and_extract_chunks(
-    setup_extractor, name, test_data, initial_data, expected_fetched_chunks, expected_success
-):
+def test_compare_and_extract_chunks(setup_extractor, name, item_data, fs_data, expected_fetched_chunks):
     """Test chunk comparison and extraction"""
     extractor, key, cache, tmpdir, create_mock_chunks, get_fetched_chunks, clear_fetched_chunks = setup_extractor
     clear_fetched_chunks()
 
-    item, target_path = create_mock_chunks(test_data, chunk_size=512)
+    chunk_size = 4
+    item, target_path = create_mock_chunks(item_data, chunk_size=chunk_size)
 
     original_chunk_ids = [chunk.id for chunk in item.chunks]
 
     # Write initial file state
     with open(target_path, "wb") as f:
-        f.write(initial_data)
-
-    result = extractor.compare_and_extract_chunks(item, target_path)
-    assert result == expected_success
-
-    if expected_success:
-        # Verify only the expected chunks were fetched
-        fetched_chunks = get_fetched_chunks()
-        assert (
-            len(fetched_chunks) == expected_fetched_chunks
-        ), f"Expected {expected_fetched_chunks} chunks to be fetched, got {len(fetched_chunks)}"
-
-        # For single chunk changes, verify it's the correct chunk
-        if expected_fetched_chunks == 1:
-            # Find which chunk should have changed by comparing initial_data with test_data
-            for i, (orig_chunk, mod_chunk) in enumerate(
-                zip(
-                    [test_data[i : i + 512] for i in range(0, len(test_data), 512)],
-                    [initial_data[i : i + 512] for i in range(0, len(initial_data), 512)],
-                )
-            ):
-                if orig_chunk != mod_chunk:
-                    assert (
-                        fetched_chunks[0] == original_chunk_ids[i]
-                    ), f"Wrong chunk fetched. Expected chunk at position {i}"
-                    break
-
-        # Verify final content
-        with open(target_path, "rb") as f:
-            assert f.read() == test_data
+        f.write(fs_data)
+
+    st = os.stat(target_path)
+    result = extractor.compare_and_extract_chunks(item, target_path, st=st)  # Pass st parameter
+    assert result
+
+    # Verify only the expected chunks were fetched
+    fetched_chunks = get_fetched_chunks()
+    assert len(fetched_chunks) == expected_fetched_chunks
+
+    # For single chunk changes, verify it's the correct chunk
+    if expected_fetched_chunks == 1:
+        item_chunks = [item_data[i : i + chunk_size] for i in range(0, len(item_data), chunk_size)]
+        fs_chunks = [fs_data[i : i + chunk_size] for i in range(0, len(fs_data), chunk_size)]
+
+        # Find which chunk should have changed by comparing item_data with fs_data
+        for i, (item_chunk, fs_chunk) in enumerate(zip(item_chunks, fs_chunks)):
+            if item_chunk != fs_chunk:
+                assert fetched_chunks[0] == original_chunk_ids[i]
+                break
+
+    # Verify final content
+    with open(target_path, "rb") as f:
+        assert f.read() == item_data
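One detail of the updated fixture worth noting: mock_fetch_many now returns an iterator, because the refactored method consumes repository data with next() instead of joining a list. A tiny illustration of that pattern, with names local to this sketch (fake_fetch_many and store are not part of the test suite):

def fake_fetch_many(store, chunk_ids):
    """Yield payloads lazily, mirroring how the test's mock_fetch_many behaves."""
    return iter([store[cid] for cid in chunk_ids])


store = {b"id-1": b"1111", b"id-2": b"2222"}
chunk_iter = fake_fetch_many(store, [b"id-2"])
assert next(chunk_iter) == b"2222"  # the method under test pulls one payload per mismatched chunk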
