Skip to content

Commit a52d459

Browse files
KarthikNayak authored and gitster committed
bundle: fix non-linear performance scaling with refs
The 'git bundle create' command has non-linear performance with the number of refs in the repository. Benchmarking the command shows that a large portion of the time (~75%) is spent in the `object_array_remove_duplicates()` function. The `object_array_remove_duplicates()` function was added in b2a6d1c (bundle: allow the same ref to be given more than once, 2009-01-17) to skip duplicate refs provided by the user from being written to the bundle. Since this is an O(N^2) algorithm, in repos with a large number of references, this can take up a large amount of time. Let's instead use a 'strset' to skip duplicates inside `write_bundle_refs()`. This improves the performance by around 6 times when tested against a repository with 100000 refs: Benchmark 1: bundle (refcount = 100000, revision = master) Time (mean ± σ): 14.653 s ± 0.203 s [User: 13.940 s, System: 0.762 s] Range (min … max): 14.237 s … 14.920 s 10 runs Benchmark 2: bundle (refcount = 100000, revision = HEAD) Time (mean ± σ): 2.394 s ± 0.023 s [User: 1.684 s, System: 0.798 s] Range (min … max): 2.364 s … 2.425 s 10 runs Summary bundle (refcount = 100000, revision = HEAD) ran 6.12 ± 0.10 times faster than bundle (refcount = 100000, revision = master) Previously, `object_array_remove_duplicates()` ensured that both the refname and the object it pointed to were checked for duplicates. The new approach, implemented within `write_bundle_refs()`, eliminates duplicate refnames without comparing the objects they reference. This works because, for bundle creation, we only need to prevent duplicate refs from being written to the bundle header. The `revs->pending` array can contain duplicates of multiple types. First, references which resolve to the same refname. For example, "git bundle create out.bdl master master" or "git bundle create out.bdl refs/heads/master refs/heads/master" or "git bundle create out.bdl master refs/heads/master". 
In these scenarios we want to prevent writing "refs/heads/master" twice to the bundle header. Since both the refnames here would point to the same object (unless there is a race), we do not need to check equality of the object. Second, refnames which are duplicates but do not point to the same object. This can happen when we use an exclusion criterion. For example, with "git bundle create out.bdl master master^!", `revs->pending` would contain two elements, both with refname set to "master". However, they would point to an INTERESTING and an UNINTERESTING object, respectively. Since we only write refnames with INTERESTING objects to the bundle header, we perform our duplicate checks only on such objects. Signed-off-by: Karthik Nayak <[email protected]> Signed-off-by: Junio C Hamano <[email protected]>
1 parent 09d86e0 commit a52d459

File tree

4 files changed

+7
-44
lines changed

4 files changed

+7
-44
lines changed

bundle.c

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -384,6 +384,7 @@ static int write_bundle_refs(int bundle_fd, struct rev_info *revs)
384384
{
385385
int i;
386386
int ref_count = 0;
387+
struct strset objects = STRSET_INIT;
387388

388389
for (i = 0; i < revs->pending.nr; i++) {
389390
struct object_array_entry *e = revs->pending.objects + i;
@@ -401,6 +402,9 @@ static int write_bundle_refs(int bundle_fd, struct rev_info *revs)
401402
flag = 0;
402403
display_ref = (flag & REF_ISSYMREF) ? e->name : ref;
403404

405+
if (strset_contains(&objects, display_ref))
406+
goto skip_write_ref;
407+
404408
if (e->item->type == OBJ_TAG &&
405409
!is_tag_in_date_range(e->item, revs)) {
406410
e->item->flags |= UNINTERESTING;
@@ -423,6 +427,7 @@ static int write_bundle_refs(int bundle_fd, struct rev_info *revs)
423427
}
424428

425429
ref_count++;
430+
strset_add(&objects, display_ref);
426431
write_or_die(bundle_fd, oid_to_hex(&e->item->oid), the_hash_algo->hexsz);
427432
write_or_die(bundle_fd, " ", 1);
428433
write_or_die(bundle_fd, display_ref, strlen(display_ref));
@@ -431,6 +436,8 @@ static int write_bundle_refs(int bundle_fd, struct rev_info *revs)
431436
free(ref);
432437
}
433438

439+
strset_clear(&objects);
440+
434441
/* end header */
435442
write_or_die(bundle_fd, "\n", 1);
436443
return ref_count;
@@ -566,7 +573,6 @@ int create_bundle(struct repository *r, const char *path,
566573
*/
567574
revs.blob_objects = revs.tree_objects = 0;
568575
traverse_commit_list(&revs, write_bundle_prerequisites, NULL, &bpi);
569-
object_array_remove_duplicates(&revs_copy.pending);
570576

571577
/* write bundle refs */
572578
ref_count = write_bundle_refs(bundle_fd, &revs_copy);

object.c

Lines changed: 0 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -491,39 +491,6 @@ void object_array_clear(struct object_array *array)
491491
array->nr = array->alloc = 0;
492492
}
493493

494-
/*
495-
* Return true if array already contains an entry.
496-
*/
497-
static int contains_object(struct object_array *array,
498-
const struct object *item, const char *name)
499-
{
500-
unsigned nr = array->nr, i;
501-
struct object_array_entry *object = array->objects;
502-
503-
for (i = 0; i < nr; i++, object++)
504-
if (item == object->item && !strcmp(object->name, name))
505-
return 1;
506-
return 0;
507-
}
508-
509-
void object_array_remove_duplicates(struct object_array *array)
510-
{
511-
unsigned nr = array->nr, src;
512-
struct object_array_entry *objects = array->objects;
513-
514-
array->nr = 0;
515-
for (src = 0; src < nr; src++) {
516-
if (!contains_object(array, objects[src].item,
517-
objects[src].name)) {
518-
if (src != array->nr)
519-
objects[array->nr] = objects[src];
520-
array->nr++;
521-
} else {
522-
object_array_release_entry(&objects[src]);
523-
}
524-
}
525-
}
526-
527494
void clear_object_flags(unsigned flags)
528495
{
529496
int i;

object.h

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -324,12 +324,6 @@ typedef int (*object_array_each_func_t)(struct object_array_entry *, void *);
324324
void object_array_filter(struct object_array *array,
325325
object_array_each_func_t want, void *cb_data);
326326

327-
/*
328-
* Remove from array all but the first entry with a given name.
329-
* Warning: this function uses an O(N^2) algorithm.
330-
*/
331-
void object_array_remove_duplicates(struct object_array *array);
332-
333327
/*
334328
* Remove any objects from the array, freeing all used memory; afterwards
335329
* the array is ready to store more objects with add_object_array().

t/t6020-bundle-misc.sh

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -684,7 +684,6 @@ test_expect_success 'create bundle with duplicate refnames' '
684684
test_cmp expect actual
685685
'
686686

687-
# This exhibits a bug, since the same refname is now added to the bundle twice.
688687
test_expect_success 'create bundle with duplicate refnames and --all' '
689688
git bundle create out.bdl --all "main" "main" &&
690689
@@ -701,7 +700,6 @@ test_expect_success 'create bundle with duplicate refnames and --all' '
701700
<TAG-2> refs/tags/v2
702701
<TAG-3> refs/tags/v3
703702
<COMMIT-P> HEAD
704-
<COMMIT-P> refs/heads/main
705703
EOF
706704
test_cmp expect actual
707705
'
@@ -717,15 +715,13 @@ test_expect_success 'create bundle with duplicate exlusion refnames' '
717715
test_cmp expect actual
718716
'
719717

720-
# This exhibits a bug, since the same refname is now added to the bundle twice.
721718
test_expect_success 'create bundle with duplicate refname short-form' '
722719
git bundle create out.bdl "main" "main" "refs/heads/main" "refs/heads/main" &&
723720
724721
git bundle list-heads out.bdl |
725722
make_user_friendly_and_stable_output >actual &&
726723
cat >expect <<-\EOF &&
727724
<COMMIT-P> refs/heads/main
728-
<COMMIT-P> refs/heads/main
729725
EOF
730726
test_cmp expect actual
731727
'

0 commit comments

Comments
 (0)