Skip to content

Commit e199290

Browse files
ttaylorrgitster
authored andcommitted
pack-objects: only perform verbatim reuse on the preferred pack
When reusing objects from source pack(s), write_reused_pack_verbatim() is responsible for reusing objects whole eword_t's at a time. It works by taking the longest continuous run of objects from the beginning of each source pack that the caller wants, and reuses the entirety of that section from each pack. This is based on the assumption that we don't have any gaps within the region. This assumption relieves us from having to patch any OFS_DELTAs, since we know that there aren't any gaps between any delta and its base in that region. To illustrate why this assumption is necessary, suppose we have some pack P, which has objects X, Y, and Z. If the MIDX's copy of Y was selected from a pack other than P, then the bit corresponding to object Y will appear earlier in the bitmap than the bits corresponding to X and Z. If pack-objects already has or will use the copy of Y from the pack it was selected from in the MIDX, then it is an error to reuse all objects between X and Z in the source pack. Doing so will cause us to reuse Y from a different pack than the one which represents Y in the MIDX, causing us to either: - include the object twice, assuming that the caller wants Y in the pack, or - include the object once, resulting in us packing more objects than necessary. This regression comes from ca0fd69 (pack-objects: prepare `write_reused_pack_verbatim()` for multi-pack reuse, 2023-12-14), which incorrectly assumed that there would be no gaps in reusable regions of non-preferred packs. Instead, we can only safely perform the whole-word reuse optimization on the preferred pack, where we know with certainty that no gaps exist in that region of the bitmap. We can still reuse objects from non-preferred packs, but we have to inspect them individually in write_reused_pack() to ensure that any gaps that may exist are accounted for. This allows us to simplify the implementation of write_reused_pack_verbatim() back to almost its pre-multi-pack reuse form, since we can now assume that the beginning of the pack appears at the beginning of the bitmap, meaning that we don't have to account for any bits up to the first word boundary (like we had to special case in ca0fd69). The only significant changes from the pre-ca0fd69e37 implementation are: - that we can no longer inspect words up to the end of reuse_packfile_bitmap->word_alloc, since we only want to look at words whose bits all correspond to objects in the given packfile, and - that we return early when given a reuse_packfile which is not preferred, making the call a noop. In the future, it might be possible to restore this optimization if we could guarantee that some reuse packs don't contain any gaps by construction (similar to the "disjoint packs" idea in very early versions of multi-pack reuse). Helped-by: Jeff King <[email protected]> Signed-off-by: Taylor Blau <[email protected]> Signed-off-by: Junio C Hamano <[email protected]>
1 parent 57f35cf commit e199290

File tree

2 files changed

+41
-56
lines changed

2 files changed

+41
-56
lines changed

builtin/pack-objects.c

Lines changed: 40 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -1100,78 +1100,64 @@ static void write_reused_pack_one(struct packed_git *reuse_packfile,
11001100

11011101
static size_t write_reused_pack_verbatim(struct bitmapped_pack *reuse_packfile,
11021102
struct hashfile *out,
1103-
off_t pack_start,
11041103
struct pack_window **w_curs)
11051104
{
1106-
size_t pos = reuse_packfile->bitmap_pos;
1105+
size_t pos = 0;
11071106
size_t end;
11081107

1109-
if (pos % BITS_IN_EWORD) {
1110-
size_t word_pos = (pos / BITS_IN_EWORD);
1111-
size_t offset = pos % BITS_IN_EWORD;
1112-
size_t last;
1113-
eword_t word = reuse_packfile_bitmap->words[word_pos];
1114-
1115-
if (offset + reuse_packfile->bitmap_nr < BITS_IN_EWORD)
1116-
last = offset + reuse_packfile->bitmap_nr;
1117-
else
1118-
last = BITS_IN_EWORD;
1119-
1120-
for (; offset < last; offset++) {
1121-
if (word >> offset == 0)
1122-
return word_pos;
1123-
if (!bitmap_get(reuse_packfile_bitmap,
1124-
word_pos * BITS_IN_EWORD + offset))
1125-
return word_pos;
1126-
}
1127-
1128-
pos += BITS_IN_EWORD - (pos % BITS_IN_EWORD);
1108+
if (reuse_packfile->bitmap_pos) {
1109+
/*
1110+
* We can't reuse whole chunks verbatim out of
1111+
* non-preferred packs since we can't guarantee that
1112+
* all duplicate objects were resolved in favor of
1113+
* that pack.
1114+
*
1115+
* Even if we have a whole eword_t worth of bits that
1116+
* could be reused, there may be objects between the
1117+
* objects corresponding to the first and last bit of
1118+
* that word which were selected from a different
1119+
* pack, causing us to send duplicate or unwanted
1120+
* objects.
1121+
*
1122+
* Handle non-preferred packs from within
1123+
* write_reused_pack(), which inspects and reuses
1124+
* individual bits.
1125+
*/
1126+
return reuse_packfile->bitmap_pos / BITS_IN_EWORD;
11291127
}
11301128

11311129
/*
1132-
* Now we're going to copy as many whole eword_t's as possible.
1133-
* "end" is the index of the last whole eword_t we copy, but
1134-
* there may be additional bits to process. Those are handled
1135-
* individually by write_reused_pack().
1130+
* Only read through the last word whose bits all correspond
1131+
* to objects in the given packfile, since we must stop at a
1132+
* word boundary.
11361133
*
1137-
* Begin by advancing to the first word boundary in range of the
1138-
* bit positions occupied by objects in "reuse_packfile". Then
1139-
* pick the last word boundary in the same range. If we have at
1140-
* least one word's worth of bits to process, continue on.
1134+
* If there is no whole word to read (i.e. the packfile
1135+
* contains fewer than BITS_IN_EWORD objects), then we'll
1136+
* inspect bits one-by-one in write_reused_pack().
11411137
*/
1142-
end = reuse_packfile->bitmap_pos + reuse_packfile->bitmap_nr;
1143-
if (end % BITS_IN_EWORD)
1144-
end -= end % BITS_IN_EWORD;
1145-
if (pos >= end)
1146-
return reuse_packfile->bitmap_pos / BITS_IN_EWORD;
1147-
1148-
while (pos < end &&
1149-
reuse_packfile_bitmap->words[pos / BITS_IN_EWORD] == (eword_t)~0)
1150-
pos += BITS_IN_EWORD;
1138+
end = reuse_packfile->bitmap_nr / BITS_IN_EWORD;
1139+
if (reuse_packfile_bitmap->word_alloc < end)
1140+
BUG("fewer words than expected in reuse_packfile_bitmap");
11511141

1152-
if (pos > end)
1153-
pos = end;
1142+
while (pos < end && reuse_packfile_bitmap->words[pos] == (eword_t)~0)
1143+
pos++;
11541144

1155-
if (reuse_packfile->bitmap_pos < pos) {
1156-
off_t pack_start_off = pack_pos_to_offset(reuse_packfile->p, 0);
1157-
off_t pack_end_off = pack_pos_to_offset(reuse_packfile->p,
1158-
pos - reuse_packfile->bitmap_pos);
1145+
if (pos) {
1146+
off_t to_write;
11591147

1160-
written += pos - reuse_packfile->bitmap_pos;
1148+
written = (pos * BITS_IN_EWORD);
1149+
to_write = pack_pos_to_offset(reuse_packfile->p, written)
1150+
- sizeof(struct pack_header);
11611151

11621152
/* We're recording one chunk, not one object. */
1163-
record_reused_object(pack_start_off,
1164-
pack_start_off - (hashfile_total(out) - pack_start));
1153+
record_reused_object(sizeof(struct pack_header), 0);
11651154
hashflush(out);
11661155
copy_pack_data(out, reuse_packfile->p, w_curs,
1167-
pack_start_off, pack_end_off - pack_start_off);
1156+
sizeof(struct pack_header), to_write);
11681157

11691158
display_progress(progress_state, written);
11701159
}
1171-
if (pos % BITS_IN_EWORD)
1172-
BUG("attempted to jump past a word boundary to %"PRIuMAX,
1173-
(uintmax_t)pos);
1174-
return pos / BITS_IN_EWORD;
1160+
return pos;
11751161
}
11761162

11771163
static void write_reused_pack(struct bitmapped_pack *reuse_packfile,
@@ -1183,8 +1169,7 @@ static void write_reused_pack(struct bitmapped_pack *reuse_packfile,
11831169
struct pack_window *w_curs = NULL;
11841170

11851171
if (allow_ofs_delta)
1186-
i = write_reused_pack_verbatim(reuse_packfile, f, pack_start,
1187-
&w_curs);
1172+
i = write_reused_pack_verbatim(reuse_packfile, f, &w_curs);
11881173

11891174
for (; i < reuse_packfile_bitmap->word_alloc; ++i) {
11901175
eword_t word = reuse_packfile_bitmap->words[i];

t/t5332-multi-pack-reuse.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -259,7 +259,7 @@ test_expect_success 'duplicate objects' '
259259
)
260260
'
261261

262-
test_expect_failure 'duplicate objects with verbatim reuse' '
262+
test_expect_success 'duplicate objects with verbatim reuse' '
263263
git init duplicate-objects-verbatim &&
264264
(
265265
cd duplicate-objects-verbatim &&

0 commit comments

Comments
 (0)