Skip to content

Commit a56b946

Browse files
garimasi514 and gitster
authored and committed
revision.c: use Bloom filters to speed up path based revision walks
Revision walk will now use Bloom filters for commits to speed up revision walks for a particular path (for computing history for that path), if they are present in the commit-graph file. We load the Bloom filters during the prepare_revision_walk step, currently only when dealing with a single pathspec. Extending it to work with multiple pathspecs can be explored and built on top of this series in the future. While comparing trees in rev_compare_tree(), if the Bloom filter says that the file is not different between the two trees, we don't need to compute the expensive diff. This is where we get our performance gains. The other response of the Bloom filter is 'maybe', in which case we fall back to the full diff calculation to determine if the path was changed in the commit. We do not try to use Bloom filters when the '--walk-reflogs' option is specified. The '--walk-reflogs' option does not walk the commit ancestry chain like the rest of the options. Incorporating the performance gains when walking reflog entries would add more complexity, and can be explored in a later series. Performance Gains: We tested the performance of `git log -- <path>` on the git repo, the linux and some internal large repos, with a variety of paths of varying depths. On the git and linux repos: - we observed a 2x to 5x speed up. On a large internal repo with files seated 6-10 levels deep in the tree: - we observed 10x to 20x speed ups, with some paths going up to 28 times faster. Helped-by: Derrick Stolee <[email protected]> Helped-by: SZEDER Gábor <[email protected]> Helped-by: Jonathan Tan <[email protected]> Signed-off-by: Garima Singh <[email protected]> Signed-off-by: Junio C Hamano <[email protected]>
1 parent d38e07b commit a56b946

File tree

4 files changed

+118
-2
lines changed

4 files changed

+118
-2
lines changed

bloom.c

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -253,3 +253,23 @@ struct bloom_filter *get_bloom_filter(struct repository *r,
253253

254254
return filter;
255255
}
256+
257+
int bloom_filter_contains(const struct bloom_filter *filter,
258+
const struct bloom_key *key,
259+
const struct bloom_filter_settings *settings)
260+
{
261+
int i;
262+
uint64_t mod = filter->len * BITS_PER_WORD;
263+
264+
if (!mod)
265+
return -1;
266+
267+
for (i = 0; i < settings->num_hashes; i++) {
268+
uint64_t hash_mod = key->hashes[i] % mod;
269+
uint64_t block_pos = hash_mod / BITS_PER_WORD;
270+
if (!(filter->data[block_pos] & get_bitmask(hash_mod)))
271+
return 0;
272+
}
273+
274+
return 1;
275+
}

bloom.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,4 +83,8 @@ struct bloom_filter *get_bloom_filter(struct repository *r,
8383
struct commit *c,
8484
int compute_if_not_present);
8585

86+
/*
 * Test whether the hashes in `key` are all present in `filter`, probing
 * settings->num_hashes bit positions.
 *
 * Returns -1 if the filter is empty (no answer possible), 0 if at least
 * one probed bit is unset, and 1 if every probed bit is set.
 */
int bloom_filter_contains(const struct bloom_filter *filter,
87+
const struct bloom_key *key,
88+
const struct bloom_filter_settings *settings);
89+
8690
#endif

revision.c

Lines changed: 83 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
#include "prio-queue.h"
3030
#include "hashmap.h"
3131
#include "utf8.h"
32+
#include "bloom.h"
3233

3334
volatile show_early_output_fn_t show_early_output;
3435

@@ -624,11 +625,80 @@ static void file_change(struct diff_options *options,
624625
options->flags.has_changes = 1;
625626
}
626627

628+
static void prepare_to_use_bloom_filter(struct rev_info *revs)
629+
{
630+
struct pathspec_item *pi;
631+
char *path_alloc = NULL;
632+
const char *path;
633+
int last_index;
634+
int len;
635+
636+
if (!revs->commits)
637+
return;
638+
639+
repo_parse_commit(revs->repo, revs->commits->item);
640+
641+
if (!revs->repo->objects->commit_graph)
642+
return;
643+
644+
revs->bloom_filter_settings = revs->repo->objects->commit_graph->bloom_filter_settings;
645+
if (!revs->bloom_filter_settings)
646+
return;
647+
648+
pi = &revs->pruning.pathspec.items[0];
649+
last_index = pi->len - 1;
650+
651+
/* remove single trailing slash from path, if needed */
652+
if (pi->match[last_index] == '/') {
653+
path_alloc = xstrdup(pi->match);
654+
path_alloc[last_index] = '\0';
655+
path = path_alloc;
656+
} else
657+
path = pi->match;
658+
659+
len = strlen(path);
660+
661+
revs->bloom_key = xmalloc(sizeof(struct bloom_key));
662+
fill_bloom_key(path, len, revs->bloom_key, revs->bloom_filter_settings);
663+
664+
free(path_alloc);
665+
}
666+
667+
static int check_maybe_different_in_bloom_filter(struct rev_info *revs,
668+
struct commit *commit)
669+
{
670+
struct bloom_filter *filter;
671+
int result;
672+
673+
if (!revs->repo->objects->commit_graph)
674+
return -1;
675+
676+
if (commit->generation == GENERATION_NUMBER_INFINITY)
677+
return -1;
678+
679+
filter = get_bloom_filter(revs->repo, commit, 0);
680+
681+
if (!filter) {
682+
return -1;
683+
}
684+
685+
if (!filter->len) {
686+
return -1;
687+
}
688+
689+
result = bloom_filter_contains(filter,
690+
revs->bloom_key,
691+
revs->bloom_filter_settings);
692+
693+
return result;
694+
}
695+
627696
static int rev_compare_tree(struct rev_info *revs,
628-
struct commit *parent, struct commit *commit)
697+
struct commit *parent, struct commit *commit, int nth_parent)
629698
{
630699
struct tree *t1 = get_commit_tree(parent);
631700
struct tree *t2 = get_commit_tree(commit);
701+
int bloom_ret = 1;
632702

633703
if (!t1)
634704
return REV_TREE_NEW;
@@ -653,11 +723,19 @@ static int rev_compare_tree(struct rev_info *revs,
653723
return REV_TREE_SAME;
654724
}
655725

726+
if (revs->bloom_key && !nth_parent) {
727+
bloom_ret = check_maybe_different_in_bloom_filter(revs, commit);
728+
729+
if (bloom_ret == 0)
730+
return REV_TREE_SAME;
731+
}
732+
656733
tree_difference = REV_TREE_SAME;
657734
revs->pruning.flags.has_changes = 0;
658735
if (diff_tree_oid(&t1->object.oid, &t2->object.oid, "",
659736
&revs->pruning) < 0)
660737
return REV_TREE_DIFFERENT;
738+
661739
return tree_difference;
662740
}
663741

@@ -855,7 +933,7 @@ static void try_to_simplify_commit(struct rev_info *revs, struct commit *commit)
855933
die("cannot simplify commit %s (because of %s)",
856934
oid_to_hex(&commit->object.oid),
857935
oid_to_hex(&p->object.oid));
858-
switch (rev_compare_tree(revs, p, commit)) {
936+
switch (rev_compare_tree(revs, p, commit, nth_parent)) {
859937
case REV_TREE_SAME:
860938
if (!revs->simplify_history || !relevant_commit(p)) {
861939
/* Even if a merge with an uninteresting
@@ -3362,6 +3440,8 @@ int prepare_revision_walk(struct rev_info *revs)
33623440
FOR_EACH_OBJECT_PROMISOR_ONLY);
33633441
}
33643442

3443+
if (revs->pruning.pathspec.nr == 1 && !revs->reflog_info)
3444+
prepare_to_use_bloom_filter(revs);
33653445
if (revs->no_walk != REVISION_WALK_NO_WALK_UNSORTED)
33663446
commit_list_sort_by_date(&revs->commits);
33673447
if (revs->no_walk)
@@ -3379,6 +3459,7 @@ int prepare_revision_walk(struct rev_info *revs)
33793459
simplify_merges(revs);
33803460
if (revs->children.name)
33813461
set_children(revs);
3462+
33823463
return 0;
33833464
}
33843465

revision.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,8 @@ struct repository;
5656
struct rev_info;
5757
struct string_list;
5858
struct saved_parents;
59+
struct bloom_key;
60+
struct bloom_filter_settings;
5961
define_shared_commit_slab(revision_sources, char *);
6062

6163
struct rev_cmdline_info {
@@ -291,6 +293,15 @@ struct rev_info {
291293
struct revision_sources *sources;
292294

293295
struct topo_walk_info *topo_walk_info;
296+
297+
/* Commit graph bloom filter fields */
298+
/* The bloom filter key for the pathspec */
299+
struct bloom_key *bloom_key;
300+
/*
301+
* The bloom filter settings used to generate the key.
302+
* This is loaded from the commit-graph being used.
303+
*/
304+
struct bloom_filter_settings *bloom_filter_settings;
294305
};
295306

296307
int ref_excluded(struct string_list *, const char *path);

0 commit comments

Comments
 (0)