Skip to content

Commit 0fabafd

Browse files
ttaylorr authored and gitster committed
builtin/repack.c: add '--geometric' option
Often it is useful to both:

  - have relatively few packfiles in a repository, and

  - avoid having so few packfiles in a repository that we repack its
    entire contents regularly

This patch implements a '--geometric=<n>' option in 'git repack'. This allows the caller to specify that they would like each pack to be at least a factor times as large as the previous largest pack (by object count).

Concretely, say that a repository has 'n' packfiles, labeled P1, P2, ..., up to Pn. Each packfile Pi has an object count equal to 'objects(Pi)'. With a geometric factor of 'r', it should be that:

    objects(Pi) >= r*objects(P(i-1))

for all i in [2, n], where the packs are sorted by

    objects(P1) <= objects(P2) <= ... <= objects(Pn).

Since finding a true optimal repacking is NP-hard, we approximate it along two directions:

  1. We assume that there is a cutoff of packs _before starting the repack_ where everything to the right of that cut-off already forms a geometric progression (or no cutoff exists and everything must be repacked).

  2. We assume that everything smaller than the cutoff count must be repacked. This forms our base assumption, but it can also cause even the "heavy" packs to get repacked, e.g., if we have 6 packs containing the following number of objects:

    1, 1, 1, 2, 4, 32

then we would place the cutoff between '1, 1' and '1, 2, 4, 32', rolling up the first two packs into a pack with 2 objects. That breaks our progression and leaves us:

    2, 1, 2, 4, 32
      ^

(where the '^' indicates the position of our split). To restore a progression, we move the split forward (towards larger packs) joining each pack into our new pack until a geometric progression is restored. Here, that looks like:

    2, 1, 2, 4, 32  ~>  3, 2, 4, 32  ~>  5, 4, 32  ~> ... ~>  9, 32
      ^                   ^                ^                    ^

This has the advantage of not repacking the heavy-side of packs too often while also only creating one new pack at a time.
Another wrinkle is that we assume that loose, indexed, and reflog'd objects are insignificant, and lump them into any new pack that we create. This can lead to non-idempotent results. Suggested-by: Derrick Stolee <[email protected]> Signed-off-by: Taylor Blau <[email protected]> Reviewed-by: Jeff King <[email protected]> Signed-off-by: Junio C Hamano <[email protected]>
1 parent 20b031f commit 0fabafd

File tree

3 files changed

+343
-4
lines changed

3 files changed

+343
-4
lines changed

Documentation/git-repack.txt

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,29 @@ depth is 4095.
165165
Pass the `--delta-islands` option to `git-pack-objects`, see
166166
linkgit:git-pack-objects[1].
167167

168+
-g=<factor>::
169+
--geometric=<factor>::
170+
Arrange resulting pack structure so that each successive pack
171+
contains at least `<factor>` times as many objects as the
172+
next-largest pack.
173+
+
174+
`git repack` ensures this by determining a "cut" of packfiles that need
175+
to be repacked into one in order to ensure a geometric progression. It
176+
picks the smallest set of packfiles such that as many of the larger
177+
packfiles (by count of objects contained in that pack) may be left
178+
intact.
179+
+
180+
Unlike other repack modes, the set of objects to pack is determined
181+
uniquely by the set of packs being "rolled-up"; in other words, the
182+
packs determined to need to be combined in order to restore a geometric
183+
progression.
184+
+
185+
When `--unpacked` is specified, loose objects are implicitly included in
186+
this "roll-up", without respect to their reachability. This is subject
187+
to change in the future. This option (implying a drastically different
188+
repack mode) is not guaranteed to work with all other combinations of
189+
options to `git repack`.
190+
168191
Configuration
169192
-------------
170193

builtin/repack.c

Lines changed: 183 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -297,13 +297,132 @@ static void repack_promisor_objects(const struct pack_objects_args *args,
297297
#define ALL_INTO_ONE 1
298298
#define LOOSEN_UNREACHABLE 2
299299

300+
/*
 * Working set for a geometric repack: the candidate packs plus the index
 * at which they are divided.
 */
struct pack_geometry {
	/* Candidate packs; init_pack_geometry() sorts them by ascending weight. */
	struct packed_git **pack;

	/* Number of entries of 'pack' that are in use. */
	uint32_t pack_nr;

	/* Number of entries allocated for 'pack' (maintained by ALLOC_GROW). */
	uint32_t pack_alloc;

	/*
	 * Packs at indexes [0, split) get rolled up into the new pack;
	 * packs at indexes [split, pack_nr) are left alone.
	 */
	uint32_t split;
};
305+
306+
static uint32_t geometry_pack_weight(struct packed_git *p)
307+
{
308+
if (open_pack_index(p))
309+
die(_("cannot open index for %s"), p->pack_name);
310+
return p->num_objects;
311+
}
312+
313+
/*
 * Comparison callback for sorting an array of 'struct packed_git *' by
 * ascending weight (see geometry_pack_weight()).
 *
 * Returns -1, 0 or 1 when the first pack weighs less than, the same as,
 * or more than the second one, respectively.
 */
static int geometry_cmp(const void *va, const void *vb)
{
	struct packed_git *lhs = *(struct packed_git **)va;
	struct packed_git *rhs = *(struct packed_git **)vb;
	uint32_t lhs_weight = geometry_pack_weight(lhs);
	uint32_t rhs_weight = geometry_pack_weight(rhs);

	/* Three-way compare; avoids the wraparound a subtraction could cause. */
	return (lhs_weight > rhs_weight) - (lhs_weight < rhs_weight);
}
324+
325+
static void init_pack_geometry(struct pack_geometry **geometry_p)
326+
{
327+
struct packed_git *p;
328+
struct pack_geometry *geometry;
329+
330+
*geometry_p = xcalloc(1, sizeof(struct pack_geometry));
331+
geometry = *geometry_p;
332+
333+
for (p = get_all_packs(the_repository); p; p = p->next) {
334+
if (!pack_kept_objects && p->pack_keep)
335+
continue;
336+
337+
ALLOC_GROW(geometry->pack,
338+
geometry->pack_nr + 1,
339+
geometry->pack_alloc);
340+
341+
geometry->pack[geometry->pack_nr] = p;
342+
geometry->pack_nr++;
343+
}
344+
345+
QSORT(geometry->pack, geometry->pack_nr, geometry_cmp);
346+
}
347+
348+
static void split_pack_geometry(struct pack_geometry *geometry, int factor)
349+
{
350+
uint32_t i;
351+
uint32_t split;
352+
off_t total_size = 0;
353+
354+
if (geometry->pack_nr <= 1) {
355+
geometry->split = geometry->pack_nr;
356+
return;
357+
}
358+
359+
split = geometry->pack_nr - 1;
360+
361+
/*
362+
* First, count the number of packs (in descending order of size) which
363+
* already form a geometric progression.
364+
*/
365+
for (i = geometry->pack_nr - 1; i > 0; i--) {
366+
struct packed_git *ours = geometry->pack[i];
367+
struct packed_git *prev = geometry->pack[i - 1];
368+
if (geometry_pack_weight(ours) >= factor * geometry_pack_weight(prev))
369+
split--;
370+
else
371+
break;
372+
}
373+
374+
if (split) {
375+
/*
376+
* Move the split one to the right, since the top element in the
377+
* last-compared pair can't be in the progression. Only do this
378+
* when we split in the middle of the array (otherwise if we got
379+
* to the end, then the split is in the right place).
380+
*/
381+
split++;
382+
}
383+
384+
/*
385+
* Then, anything to the left of 'split' must be in a new pack. But,
386+
* creating that new pack may cause packs in the heavy half to no longer
387+
* form a geometric progression.
388+
*
389+
* Compute an expected size of the new pack, and then determine how many
390+
* packs in the heavy half need to be joined into it (if any) to restore
391+
* the geometric progression.
392+
*/
393+
for (i = 0; i < split; i++)
394+
total_size += geometry_pack_weight(geometry->pack[i]);
395+
for (i = split; i < geometry->pack_nr; i++) {
396+
struct packed_git *ours = geometry->pack[i];
397+
if (geometry_pack_weight(ours) < factor * total_size) {
398+
split++;
399+
total_size += geometry_pack_weight(ours);
400+
} else
401+
break;
402+
}
403+
404+
geometry->split = split;
405+
}
406+
407+
static void clear_pack_geometry(struct pack_geometry *geometry)
408+
{
409+
if (!geometry)
410+
return;
411+
412+
free(geometry->pack);
413+
geometry->pack_nr = 0;
414+
geometry->pack_alloc = 0;
415+
geometry->split = 0;
416+
}
417+
300418
int cmd_repack(int argc, const char **argv, const char *prefix)
301419
{
302420
struct child_process cmd = CHILD_PROCESS_INIT;
303421
struct string_list_item *item;
304422
struct string_list names = STRING_LIST_INIT_DUP;
305423
struct string_list rollback = STRING_LIST_INIT_NODUP;
306424
struct string_list existing_packs = STRING_LIST_INIT_DUP;
425+
struct pack_geometry *geometry = NULL;
307426
struct strbuf line = STRBUF_INIT;
308427
int i, ext, ret;
309428
FILE *out;
@@ -316,6 +435,7 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
316435
struct string_list keep_pack_list = STRING_LIST_INIT_NODUP;
317436
int no_update_server_info = 0;
318437
struct pack_objects_args po_args = {NULL};
438+
int geometric_factor = 0;
319439

320440
struct option builtin_repack_options[] = {
321441
OPT_BIT('a', NULL, &pack_everything,
@@ -356,6 +476,8 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
356476
N_("repack objects in packs marked with .keep")),
357477
OPT_STRING_LIST(0, "keep-pack", &keep_pack_list, N_("name"),
358478
N_("do not repack this pack")),
479+
OPT_INTEGER('g', "geometric", &geometric_factor,
480+
N_("find a geometric progression with factor <N>")),
359481
OPT_END()
360482
};
361483

@@ -382,6 +504,13 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
382504
if (write_bitmaps && !(pack_everything & ALL_INTO_ONE))
383505
die(_(incremental_bitmap_conflict_error));
384506

507+
if (geometric_factor) {
508+
if (pack_everything)
509+
die(_("--geometric is incompatible with -A, -a"));
510+
init_pack_geometry(&geometry);
511+
split_pack_geometry(geometry, geometric_factor);
512+
}
513+
385514
packdir = mkpathdup("%s/pack", get_object_directory());
386515
packtmp = mkpathdup("%s/.tmp-%d-pack", packdir, (int)getpid());
387516

@@ -396,9 +525,19 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
396525
strvec_pushf(&cmd.args, "--keep-pack=%s",
397526
keep_pack_list.items[i].string);
398527
strvec_push(&cmd.args, "--non-empty");
399-
strvec_push(&cmd.args, "--all");
400-
strvec_push(&cmd.args, "--reflog");
401-
strvec_push(&cmd.args, "--indexed-objects");
528+
if (!geometry) {
529+
/*
530+
* 'git pack-objects' will pick up all objects loose or packed
531+
* (either rolling them up or leaving them alone), so don't pass
532+
* these options.
533+
*
534+
* The implementation of 'git pack-objects --stdin-packs'
535+
* makes them redundant (and the two are incompatible).
536+
*/
537+
strvec_push(&cmd.args, "--all");
538+
strvec_push(&cmd.args, "--reflog");
539+
strvec_push(&cmd.args, "--indexed-objects");
540+
}
402541
if (has_promisor_remote())
403542
strvec_push(&cmd.args, "--exclude-promisor-objects");
404543
if (write_bitmaps > 0)
@@ -429,17 +568,37 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
429568
strvec_push(&cmd.env_array, "GIT_REF_PARANOIA=1");
430569
}
431570
}
571+
} else if (geometry) {
572+
strvec_push(&cmd.args, "--stdin-packs");
573+
strvec_push(&cmd.args, "--unpacked");
432574
} else {
433575
strvec_push(&cmd.args, "--unpacked");
434576
strvec_push(&cmd.args, "--incremental");
435577
}
436578

437-
cmd.no_stdin = 1;
579+
if (geometry)
580+
cmd.in = -1;
581+
else
582+
cmd.no_stdin = 1;
438583

439584
ret = start_command(&cmd);
440585
if (ret)
441586
return ret;
442587

588+
if (geometry) {
589+
FILE *in = xfdopen(cmd.in, "w");
590+
/*
591+
* The resulting pack should contain all objects in packs that
592+
* are going to be rolled up, but exclude objects in packs which
593+
* are being left alone.
594+
*/
595+
for (i = 0; i < geometry->split; i++)
596+
fprintf(in, "%s\n", pack_basename(geometry->pack[i]));
597+
for (i = geometry->split; i < geometry->pack_nr; i++)
598+
fprintf(in, "^%s\n", pack_basename(geometry->pack[i]));
599+
fclose(in);
600+
}
601+
443602
out = xfdopen(cmd.out, "r");
444603
while (strbuf_getline_lf(&line, out) != EOF) {
445604
if (line.len != the_hash_algo->hexsz)
@@ -507,6 +666,25 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
507666
if (!string_list_has_string(&names, sha1))
508667
remove_redundant_pack(packdir, item->string);
509668
}
669+
670+
if (geometry) {
671+
struct strbuf buf = STRBUF_INIT;
672+
673+
uint32_t i;
674+
for (i = 0; i < geometry->split; i++) {
675+
struct packed_git *p = geometry->pack[i];
676+
if (string_list_has_string(&names,
677+
hash_to_hex(p->hash)))
678+
continue;
679+
680+
strbuf_reset(&buf);
681+
strbuf_addstr(&buf, pack_basename(p));
682+
strbuf_strip_suffix(&buf, ".pack");
683+
684+
remove_redundant_pack(packdir, buf.buf);
685+
}
686+
strbuf_release(&buf);
687+
}
510688
if (!po_args.quiet && isatty(2))
511689
opts |= PRUNE_PACKED_VERBOSE;
512690
prune_packed_objects(opts);
@@ -528,6 +706,7 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
528706
string_list_clear(&names, 0);
529707
string_list_clear(&rollback, 0);
530708
string_list_clear(&existing_packs, 0);
709+
clear_pack_geometry(geometry);
531710
strbuf_release(&line);
532711

533712
return 0;

0 commit comments

Comments
 (0)