Skip to content

Commit bb514de

Browse files
peff authored and gitster committed
pack-objects: improve partial packfile reuse
The old code to reuse deltas from an existing packfile just tried to dump a whole segment of the pack verbatim. That's faster than the traditional way of actually adding objects to the packing list, but it didn't kick in very often.

This new code is really going for a middle ground: do _some_ per-object work, but way less than we'd traditionally do.

The general strategy of the new code is to make a bitmap of objects from the packfile we'll include, and then iterate over it, writing out each object exactly as it is in our on-disk pack, but _not_ adding it to our packlist (which costs memory, and increases the search space for deltas).

One complication is that if we're omitting some objects, we can't set a delta against a base that we're not sending. So we have to check each object in try_partial_reuse() to make sure we have its delta base.

About performance, in the worst case we might have interleaved objects that we are sending or not sending, and we'd have as many chunks as objects. But in practice we send big chunks.

For instance, packing torvalds/linux on GitHub servers now reused 6.5M objects, but only needed ~50k chunks.

Helped-by: Jonathan Tan <[email protected]>
Signed-off-by: Jeff King <[email protected]>
Signed-off-by: Christian Couder <[email protected]>
Signed-off-by: Junio C Hamano <[email protected]>
1 parent ff48302 commit bb514de

File tree

3 files changed

+281
-86
lines changed

3 files changed

+281
-86
lines changed

builtin/pack-objects.c

Lines changed: 170 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ static struct progress *progress_state;
9292

9393
static struct packed_git *reuse_packfile;
9494
static uint32_t reuse_packfile_objects;
95-
static off_t reuse_packfile_offset;
95+
static struct bitmap *reuse_packfile_bitmap;
9696

9797
static int use_bitmap_index_default = 1;
9898
static int use_bitmap_index = -1;
@@ -785,57 +785,185 @@ static struct object_entry **compute_write_order(void)
785785
return wo;
786786
}
787787

788-
static off_t write_reused_pack(struct hashfile *f)
788+
789+
/*
 * A reused set of objects. All objects in a chunk have the same
 * relative position in the original packfile and the generated
 * packfile, so one entry is enough to describe the whole run.
 */
struct reused_chunk {
	/*
	 * The offset of the first object of this chunk in the original
	 * packfile.
	 */
	off_t original;
	/*
	 * How far the chunk moved: "original" minus the offset of the
	 * same object in the generated packfile.  (The per-object caller
	 * passes "offset - hashfile_total(out)", and the OFS_DELTA fixup
	 * arithmetic depends on this sign.)
	 */
	off_t difference;
};

/* Growable array of recorded chunks, appended to in recording order. */
static struct reused_chunk *reused_chunks;
/* Number of entries of reused_chunks currently in use. */
static int reused_chunks_nr;
/* Number of entries currently allocated for reused_chunks. */
static int reused_chunks_alloc;
805+
806+
static void record_reused_object(off_t where, off_t offset)
807+
{
808+
if (reused_chunks_nr && reused_chunks[reused_chunks_nr-1].difference == offset)
809+
return;
810+
811+
ALLOC_GROW(reused_chunks, reused_chunks_nr + 1,
812+
reused_chunks_alloc);
813+
reused_chunks[reused_chunks_nr].original = where;
814+
reused_chunks[reused_chunks_nr].difference = offset;
815+
reused_chunks_nr++;
816+
}
817+
818+
/*
819+
* Binary search to find the chunk that "where" is in. Note
820+
* that we're not looking for an exact match, just the first
821+
* chunk that contains it (which implicitly ends at the start
822+
* of the next chunk.
823+
*/
824+
static off_t find_reused_offset(off_t where)
789825
{
790-
unsigned char buffer[8192];
791-
off_t to_write, total;
792-
int fd;
826+
int lo = 0, hi = reused_chunks_nr;
827+
while (lo < hi) {
828+
int mi = lo + ((hi - lo) / 2);
829+
if (where == reused_chunks[mi].original)
830+
return reused_chunks[mi].difference;
831+
if (where < reused_chunks[mi].original)
832+
hi = mi;
833+
else
834+
lo = mi + 1;
835+
}
793836

794-
if (!is_pack_valid(reuse_packfile))
795-
die(_("packfile is invalid: %s"), reuse_packfile->pack_name);
837+
/*
838+
* The first chunk starts at zero, so we can't have gone below
839+
* there.
840+
*/
841+
assert(lo);
842+
return reused_chunks[lo-1].difference;
843+
}
796844

797-
fd = git_open(reuse_packfile->pack_name);
798-
if (fd < 0)
799-
die_errno(_("unable to open packfile for reuse: %s"),
800-
reuse_packfile->pack_name);
845+
static void write_reused_pack_one(size_t pos, struct hashfile *out,
846+
struct pack_window **w_curs)
847+
{
848+
off_t offset, next, cur;
849+
enum object_type type;
850+
unsigned long size;
801851

802-
if (lseek(fd, sizeof(struct pack_header), SEEK_SET) == -1)
803-
die_errno(_("unable to seek in reused packfile"));
852+
offset = reuse_packfile->revindex[pos].offset;
853+
next = reuse_packfile->revindex[pos + 1].offset;
804854

805-
if (reuse_packfile_offset < 0)
806-
reuse_packfile_offset = reuse_packfile->pack_size - the_hash_algo->rawsz;
855+
record_reused_object(offset, offset - hashfile_total(out));
807856

808-
total = to_write = reuse_packfile_offset - sizeof(struct pack_header);
857+
cur = offset;
858+
type = unpack_object_header(reuse_packfile, w_curs, &cur, &size);
859+
assert(type >= 0);
809860

810-
while (to_write) {
811-
int read_pack = xread(fd, buffer, sizeof(buffer));
861+
if (type == OBJ_OFS_DELTA) {
862+
off_t base_offset;
863+
off_t fixup;
864+
865+
unsigned char header[MAX_PACK_OBJECT_HEADER];
866+
unsigned len;
867+
868+
base_offset = get_delta_base(reuse_packfile, w_curs, &cur, type, offset);
869+
assert(base_offset != 0);
870+
871+
/* Convert to REF_DELTA if we must... */
872+
if (!allow_ofs_delta) {
873+
int base_pos = find_revindex_position(reuse_packfile, base_offset);
874+
const unsigned char *base_sha1 =
875+
nth_packed_object_sha1(reuse_packfile,
876+
reuse_packfile->revindex[base_pos].nr);
877+
878+
len = encode_in_pack_object_header(header, sizeof(header),
879+
OBJ_REF_DELTA, size);
880+
hashwrite(out, header, len);
881+
hashwrite(out, base_sha1, 20);
882+
copy_pack_data(out, reuse_packfile, w_curs, cur, next - cur);
883+
return;
884+
}
812885

813-
if (read_pack <= 0)
814-
die_errno(_("unable to read from reused packfile"));
886+
/* Otherwise see if we need to rewrite the offset... */
887+
fixup = find_reused_offset(offset) -
888+
find_reused_offset(base_offset);
889+
if (fixup) {
890+
unsigned char ofs_header[10];
891+
unsigned i, ofs_len;
892+
off_t ofs = offset - base_offset - fixup;
815893

816-
if (read_pack > to_write)
817-
read_pack = to_write;
894+
len = encode_in_pack_object_header(header, sizeof(header),
895+
OBJ_OFS_DELTA, size);
818896

819-
hashwrite(f, buffer, read_pack);
820-
to_write -= read_pack;
897+
i = sizeof(ofs_header) - 1;
898+
ofs_header[i] = ofs & 127;
899+
while (ofs >>= 7)
900+
ofs_header[--i] = 128 | (--ofs & 127);
901+
902+
ofs_len = sizeof(ofs_header) - i;
903+
904+
hashwrite(out, header, len);
905+
hashwrite(out, ofs_header + sizeof(ofs_header) - ofs_len, ofs_len);
906+
copy_pack_data(out, reuse_packfile, w_curs, cur, next - cur);
907+
return;
908+
}
909+
910+
/* ...otherwise we have no fixup, and can write it verbatim */
911+
}
912+
913+
copy_pack_data(out, reuse_packfile, w_curs, offset, next - offset);
914+
}
915+
916+
static size_t write_reused_pack_verbatim(struct hashfile *out,
917+
struct pack_window **w_curs)
918+
{
919+
size_t pos = 0;
920+
921+
while (pos < reuse_packfile_bitmap->word_alloc &&
922+
reuse_packfile_bitmap->words[pos] == (eword_t)~0)
923+
pos++;
924+
925+
if (pos) {
926+
off_t to_write;
927+
928+
written = (pos * BITS_IN_EWORD);
929+
to_write = reuse_packfile->revindex[written].offset
930+
- sizeof(struct pack_header);
931+
932+
/* We're recording one chunk, not one object. */
933+
record_reused_object(sizeof(struct pack_header), 0);
934+
hashflush(out);
935+
copy_pack_data(out, reuse_packfile, w_curs,
936+
sizeof(struct pack_header), to_write);
821937

822-
/*
823-
* We don't know the actual number of objects written,
824-
* only how many bytes written, how many bytes total, and
825-
* how many objects total. So we can fake it by pretending all
826-
* objects we are writing are the same size. This gives us a
827-
* smooth progress meter, and at the end it matches the true
828-
* answer.
829-
*/
830-
written = reuse_packfile_objects *
831-
(((double)(total - to_write)) / total);
832938
display_progress(progress_state, written);
833939
}
940+
return pos;
941+
}
942+
943+
static void write_reused_pack(struct hashfile *f)
944+
{
945+
size_t i = 0;
946+
uint32_t offset;
947+
struct pack_window *w_curs = NULL;
948+
949+
if (allow_ofs_delta)
950+
i = write_reused_pack_verbatim(f, &w_curs);
951+
952+
for (; i < reuse_packfile_bitmap->word_alloc; ++i) {
953+
eword_t word = reuse_packfile_bitmap->words[i];
954+
size_t pos = (i * BITS_IN_EWORD);
955+
956+
for (offset = 0; offset < BITS_IN_EWORD; ++offset) {
957+
if ((word >> offset) == 0)
958+
break;
959+
960+
offset += ewah_bit_ctz64(word >> offset);
961+
write_reused_pack_one(pos + offset, f, &w_curs);
962+
display_progress(progress_state, ++written);
963+
}
964+
}
834965

835-
close(fd);
836-
written = reuse_packfile_objects;
837-
display_progress(progress_state, written);
838-
return reuse_packfile_offset - sizeof(struct pack_header);
966+
unuse_pack(&w_curs);
839967
}
840968

841969
static const char no_split_warning[] = N_(
@@ -868,11 +996,9 @@ static void write_pack_file(void)
868996
offset = write_pack_header(f, nr_remaining);
869997

870998
if (reuse_packfile) {
871-
off_t packfile_size;
872999
assert(pack_to_stdout);
873-
874-
packfile_size = write_reused_pack(f);
875-
offset += packfile_size;
1000+
write_reused_pack(f);
1001+
offset = hashfile_total(f);
8761002
}
8771003

8781004
nr_written = 0;
@@ -2677,6 +2803,7 @@ static void prepare_pack(int window, int depth)
26772803

26782804
if (nr_deltas && n > 1) {
26792805
unsigned nr_done = 0;
2806+
26802807
if (progress)
26812808
progress_state = start_progress(_("Compressing objects"),
26822809
nr_deltas);
@@ -3062,7 +3189,6 @@ static int pack_options_allow_reuse(void)
30623189
{
30633190
return allow_pack_reuse &&
30643191
pack_to_stdout &&
3065-
allow_ofs_delta &&
30663192
!ignore_packed_keep_on_disk &&
30673193
!ignore_packed_keep_in_core &&
30683194
(!local || !have_non_local_packs) &&
@@ -3079,7 +3205,7 @@ static int get_object_list_from_bitmap(struct rev_info *revs)
30793205
bitmap_git,
30803206
&reuse_packfile,
30813207
&reuse_packfile_objects,
3082-
&reuse_packfile_offset)) {
3208+
&reuse_packfile_bitmap)) {
30833209
assert(reuse_packfile_objects);
30843210
nr_result += reuse_packfile_objects;
30853211
display_progress(progress_state, nr_result);

0 commit comments

Comments
 (0)