
Commit 2344db1

Refactor compare_and_extract_chunks
1 parent 6358ce6 commit 2344db1

File tree: 2 files changed, +22 −27 lines

src/borg/archive.py

Lines changed: 19 additions & 24 deletions
@@ -727,67 +727,62 @@ def compare_and_extract_chunks(self, item, fs_path, st=None, *, pi=None, sparse=
         try:
             # First pass: Build fs chunks list
             fs_chunks = []
-            offset = 0
             with backup_io("open"):
                 fs_file = open(fs_path, "rb")
             with fs_file:
                 for chunk in item.chunks:
-                    with backup_io("seek"):
-                        fs_file.seek(offset)
                     with backup_io("read"):
                         data = fs_file.read(chunk.size)
-                    if len(data) != chunk.size:
-                        fs_chunks.append(None)
-                    else:
-                        fs_chunks.append(ChunkListEntry(id=self.key.id_hash(data), size=chunk.size))
-                    offset += chunk.size
+
+                    fs_chunks.append(ChunkListEntry(id=self.key.id_hash(data), size=len(data)))
 
             # Compare chunks and collect needed chunk IDs
             needed_chunks = []
             for fs_chunk, item_chunk in zip(fs_chunks, item.chunks):
-                if fs_chunk is None or fs_chunk.id != item_chunk.id:
+                if fs_chunk != item_chunk:
                     needed_chunks.append(item_chunk)
 
             if not needed_chunks:
                 return True
 
             # Fetch all needed chunks and iterate through ALL of them
-            chunk_data_iter = self.pipeline.fetch_many(
-                [chunk.id for chunk in needed_chunks], is_preloaded=True, ro_type=ROBJ_FILE_STREAM
-            )
+            chunk_data_iter = self.pipeline.fetch_many([chunk.id for chunk in needed_chunks], ro_type=ROBJ_FILE_STREAM)
 
-            # Second pass: Update file and consume EVERY chunk from the iterator
+            # Second pass: Update file
             offset = 0
-            item_chunk_size = 0
             with backup_io("open"):
                 fs_file = open(fs_path, "rb+")
             with fs_file:
                 for fs_chunk, item_chunk in zip(fs_chunks, item.chunks):
-                    with backup_io("seek"):
-                        fs_file.seek(offset)
-                    if fs_chunk is not None and fs_chunk.id == item_chunk.id:
+                    if fs_chunk == item_chunk:
+                        with backup_io("seek"):
+                            fs_file.seek(offset)
                         offset += item_chunk.size
-                        item_chunk_size += item_chunk.size
                     else:
                         chunk_data = next(chunk_data_iter)
-                        if pi:
-                            pi.show(increase=len(chunk_data), info=[remove_surrogates(item.path)])
+                        # Need explicit seek before write in rb+ mode to ensure correct file position Without an
+                        # explicit seek before write, the file contents got corrupted (wrote chunks in wrong positions)
+                        with backup_io("seek"):
+                            fs_file.seek(offset)
+
                         with backup_io("write"):
                             if sparse and not chunk_data.strip(b"\0"):
-                                fs_file.seek(len(chunk_data), 1)  # Seek over sparse section
+                                fs_file.write(b"\0" * len(chunk_data))  # Write zeros to overwrite old content
                                 offset += len(chunk_data)
                             else:
                                 fs_file.write(chunk_data)
                                 offset += len(chunk_data)
-                        item_chunk_size += len(chunk_data)
+                        if pi:  # <-- Moved outside to track progress in both cases
+                            pi.show(increase=len(chunk_data), info=[remove_surrogates(item.path)])
+
             with backup_io("truncate_and_attrs"):
                 fs_file.truncate(item.size)
                 fs_file.flush()
                 self.restore_attrs(fs_path, item, fd=fs_file.fileno())
 
             # Size verification like extract_item
-            if "size" in item and item.size != item_chunk_size:
-                raise BackupError(f"Size inconsistency detected: size {item.size}, chunks size {item_chunk_size}")
+            if "size" in item and item.size != offset:
+                raise BackupError(f"Size inconsistency detected: size {item.size}, chunks size {offset}")
 
             # Damaged chunks check like extract_item
             if "chunks_healthy" in item and not item.chunks_healthy:

src/borg/testsuite/archive_test.py

Lines changed: 3 additions & 3 deletions
@@ -12,7 +12,7 @@
 from ..archive import Archive, CacheChunkBuffer, RobustUnpacker, valid_msgpacked_dict, ITEM_KEYS, Statistics
 from ..archive import BackupOSError, backup_io, backup_io_iter, get_item_uid_gid
 from ..helpers import msgpack
-from ..item import Item, ArchiveItem
+from ..item import Item, ArchiveItem, ChunkListEntry
 from ..manifest import Manifest
 from ..platform import uid2user, gid2group, is_win32
 
@@ -437,7 +437,7 @@ def create_mock_chunks(item_data, chunk_size=4):
         for i in range(0, len(item_data), chunk_size):
             chunk_data = item_data[i : i + chunk_size]
             chunk_id = key.id_hash(chunk_data)
-            chunks.append(Mock(id=chunk_id, size=len(chunk_data)))
+            chunks.append(ChunkListEntry(id=chunk_id, size=len(chunk_data)))
             cache.objects[chunk_id] = chunk_data
 
         item = Mock(spec=["chunks", "size", "__contains__", "get"])
@@ -447,7 +447,7 @@ def create_mock_chunks(item_data, chunk_size=4):
 
         return item, str(tmpdir.join("test.txt"))
 
-    def mock_fetch_many(chunk_ids, is_preloaded=True, ro_type=None):
+    def mock_fetch_many(chunk_ids, ro_type=None):
         """Helper function to track and mock chunk fetching"""
        fetched_chunks.extend(chunk_ids)
        return iter([cache.objects[chunk_id] for chunk_id in chunk_ids])
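
The fixtures switch from `Mock` chunk objects to real `ChunkListEntry` instances because the refactored code compares whole entries (`fs_chunk != item_chunk`) rather than comparing ids. A minimal sketch of why that matters, assuming `ChunkListEntry` behaves like a namedtuple of `(id, size)` as the constructor calls above suggest:

```python
from collections import namedtuple
from unittest.mock import Mock

# Assumption: ChunkListEntry is a namedtuple-like type with fields (id, size).
ChunkListEntry = namedtuple("ChunkListEntry", "id size")

a = ChunkListEntry(id=b"abc", size=4)
b = ChunkListEntry(id=b"abc", size=4)
print(a == b)    # True: namedtuples compare field by field

m1 = Mock(id=b"abc", size=4)
m2 = Mock(id=b"abc", size=4)
print(m1 == m2)  # False: Mock instances compare by identity, so a whole-entry
                 # comparison against mocked chunks would always report a mismatch
```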
