Skip to content

Commit 288ed98

Browse files
committed
Merge branch 'tb/bloom-improvements'
"git commit-graph write" learned to limit the number of bloom filters that are computed from scratch with the --max-new-filters option. * tb/bloom-improvements: commit-graph: introduce 'commitGraph.maxNewFilters' builtin/commit-graph.c: introduce '--max-new-filters=<n>' commit-graph: rename 'split_commit_graph_opts' bloom: encode out-of-bounds filters as non-empty bloom/diff: properly short-circuit on max_changes bloom: use provided 'struct bloom_filter_settings' bloom: split 'get_bloom_filter()' in two commit-graph.c: store maximum changed paths commit-graph: respect 'commitGraph.readChangedPaths' t/helper/test-read-graph.c: prepare repo settings commit-graph: pass a 'struct repository *' in more places t4216: use an '&&'-chain commit-graph: introduce 'get_bloom_filter_settings()'
2 parents c5a8f1e + d356d5d commit 288ed98

22 files changed

+508
-123
lines changed

Documentation/config.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -340,6 +340,8 @@ include::config/column.txt[]
340340

341341
include::config/commit.txt[]
342342

343+
include::config/commitgraph.txt[]
344+
343345
include::config/credential.txt[]
344346

345347
include::config/completion.txt[]

Documentation/config/commitgraph.txt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
commitGraph.maxNewFilters::
2+
Specifies the default value for the `--max-new-filters` option of `git
3+
commit-graph write` (c.f., linkgit:git-commit-graph[1]).
4+
5+
commitGraph.readChangedPaths::
6+
If true, then git will use the changed-path Bloom filters in the
7+
commit-graph file (if it exists, and they are present). Defaults to
8+
true. See linkgit:git-commit-graph[1] for more information.

Documentation/git-commit-graph.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,13 @@ this option is given, future commit-graph writes will automatically assume
6767
that this option was intended. Use `--no-changed-paths` to stop storing this
6868
data.
6969
+
70+
With the `--max-new-filters=<n>` option, generate at most `n` new Bloom
71+
filters (if `--changed-paths` is specified). If `n` is `-1`, no limit is
72+
enforced. Only commits present in the new layer count against this
73+
limit. To retroactively compute Bloom filters over earlier layers, it is
74+
advised to use `--split=replace`. Overrides the `commitGraph.maxNewFilters`
75+
configuration.
76+
+
7077
With the `--split[=<strategy>]` option, write the commit-graph as a
7178
chain of multiple commit-graph files stored in
7279
`<dir>/info/commit-graphs`. Commit-graph layers are merged based on the

Documentation/technical/commit-graph-format.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ CHUNK DATA:
125125
* The rest of the chunk is the concatenation of all the computed Bloom
126126
filters for the commits in lexicographic order.
127127
* Note: Commits with no changes or more than 512 changes have Bloom filters
128-
of length zero.
128+
of length one, with either all bits set to zero or one respectively.
129129
* The BDAT chunk is present if and only if BIDX is present.
130130

131131
Base Graphs List (ID: {'B', 'A', 'S', 'E'}) [Optional]

blame.c

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1276,7 +1276,7 @@ static int maybe_changed_path(struct repository *r,
12761276
if (commit_graph_generation(origin->commit) == GENERATION_NUMBER_INFINITY)
12771277
return 1;
12781278

1279-
filter = get_bloom_filter(r, origin->commit, 0);
1279+
filter = get_bloom_filter(r, origin->commit);
12801280

12811281
if (!filter)
12821282
return 1;
@@ -2892,16 +2892,18 @@ void setup_blame_bloom_data(struct blame_scoreboard *sb,
28922892
const char *path)
28932893
{
28942894
struct blame_bloom_data *bd;
2895+
struct bloom_filter_settings *bs;
28952896

28962897
if (!sb->repo->objects->commit_graph)
28972898
return;
28982899

2899-
if (!sb->repo->objects->commit_graph->bloom_filter_settings)
2900+
bs = get_bloom_filter_settings(sb->repo);
2901+
if (!bs)
29002902
return;
29012903

29022904
bd = xmalloc(sizeof(struct blame_bloom_data));
29032905

2904-
bd->settings = sb->repo->objects->commit_graph->bloom_filter_settings;
2906+
bd->settings = bs;
29052907

29062908
bd->alloc = 4;
29072909
bd->nr = 0;

bloom.c

Lines changed: 43 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ static int load_bloom_filter_from_graph(struct commit_graph *g,
3838
while (graph_pos < g->num_commits_in_base)
3939
g = g->base_graph;
4040

41-
/* The commit graph commit 'c' lives in doesn't carry bloom filters. */
41+
/* The commit graph commit 'c' lives in doesn't carry Bloom filters. */
4242
if (!g->chunk_bloom_indexes)
4343
return 0;
4444

@@ -177,15 +177,25 @@ static int pathmap_cmp(const void *hashmap_cmp_fn_data,
177177
return strcmp(e1->path, e2->path);
178178
}
179179

180-
struct bloom_filter *get_bloom_filter(struct repository *r,
181-
struct commit *c,
182-
int compute_if_not_present)
180+
static void init_truncated_large_filter(struct bloom_filter *filter)
181+
{
182+
filter->data = xmalloc(1);
183+
filter->data[0] = 0xFF;
184+
filter->len = 1;
185+
}
186+
187+
struct bloom_filter *get_or_compute_bloom_filter(struct repository *r,
188+
struct commit *c,
189+
int compute_if_not_present,
190+
const struct bloom_filter_settings *settings,
191+
enum bloom_filter_computed *computed)
183192
{
184193
struct bloom_filter *filter;
185-
struct bloom_filter_settings settings = DEFAULT_BLOOM_FILTER_SETTINGS;
186194
int i;
187195
struct diff_options diffopt;
188-
int max_changes = 512;
196+
197+
if (computed)
198+
*computed = BLOOM_NOT_COMPUTED;
189199

190200
if (!bloom_filters.slab_size)
191201
return NULL;
@@ -194,20 +204,19 @@ struct bloom_filter *get_bloom_filter(struct repository *r,
194204

195205
if (!filter->data) {
196206
load_commit_graph_info(r, c);
197-
if (commit_graph_position(c) != COMMIT_NOT_FROM_GRAPH &&
198-
r->objects->commit_graph->chunk_bloom_indexes)
207+
if (commit_graph_position(c) != COMMIT_NOT_FROM_GRAPH)
199208
load_bloom_filter_from_graph(r->objects->commit_graph, filter, c);
200209
}
201210

202-
if (filter->data)
211+
if (filter->data && filter->len)
203212
return filter;
204213
if (!compute_if_not_present)
205214
return NULL;
206215

207216
repo_diff_setup(r, &diffopt);
208217
diffopt.flags.recursive = 1;
209218
diffopt.detect_rename = 0;
210-
diffopt.max_changes = max_changes;
219+
diffopt.max_changes = settings->max_changed_paths;
211220
diff_setup_done(&diffopt);
212221

213222
/* ensure commit is parsed so we have parent information */
@@ -219,7 +228,7 @@ struct bloom_filter *get_bloom_filter(struct repository *r,
219228
diff_tree_oid(NULL, &c->object.oid, "", &diffopt);
220229
diffcore_std(&diffopt);
221230

222-
if (diffopt.num_changes <= max_changes) {
231+
if (diff_queued_diff.nr <= settings->max_changed_paths) {
223232
struct hashmap pathmap;
224233
struct pathmap_hash_entry *e;
225234
struct hashmap_iter iter;
@@ -256,23 +265,41 @@ struct bloom_filter *get_bloom_filter(struct repository *r,
256265
diff_free_filepair(diff_queued_diff.queue[i]);
257266
}
258267

259-
filter->len = (hashmap_get_size(&pathmap) * settings.bits_per_entry + BITS_PER_WORD - 1) / BITS_PER_WORD;
268+
if (hashmap_get_size(&pathmap) > settings->max_changed_paths) {
269+
init_truncated_large_filter(filter);
270+
if (computed)
271+
*computed |= BLOOM_TRUNC_LARGE;
272+
goto cleanup;
273+
}
274+
275+
filter->len = (hashmap_get_size(&pathmap) * settings->bits_per_entry + BITS_PER_WORD - 1) / BITS_PER_WORD;
276+
if (!filter->len) {
277+
if (computed)
278+
*computed |= BLOOM_TRUNC_EMPTY;
279+
filter->len = 1;
280+
}
260281
filter->data = xcalloc(filter->len, sizeof(unsigned char));
261282

262283
hashmap_for_each_entry(&pathmap, &iter, e, entry) {
263284
struct bloom_key key;
264-
fill_bloom_key(e->path, strlen(e->path), &key, &settings);
265-
add_key_to_filter(&key, filter, &settings);
285+
fill_bloom_key(e->path, strlen(e->path), &key, settings);
286+
add_key_to_filter(&key, filter, settings);
266287
}
267288

289+
cleanup:
268290
hashmap_free_entries(&pathmap, struct pathmap_hash_entry, entry);
269291
} else {
270292
for (i = 0; i < diff_queued_diff.nr; i++)
271293
diff_free_filepair(diff_queued_diff.queue[i]);
272-
filter->data = NULL;
273-
filter->len = 0;
294+
init_truncated_large_filter(filter);
295+
296+
if (computed)
297+
*computed |= BLOOM_TRUNC_LARGE;
274298
}
275299

300+
if (computed)
301+
*computed |= BLOOM_COMPUTED;
302+
276303
free(diff_queued_diff.queue);
277304
DIFF_QUEUE_CLEAR(&diff_queued_diff);
278305

bloom.h

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,18 @@ struct bloom_filter_settings {
2828
* that contain n*b bits.
2929
*/
3030
uint32_t bits_per_entry;
31+
32+
/*
33+
* The maximum number of changed paths per commit
34+
* before declaring a Bloom filter to be too-large.
35+
*
36+
* Not written to the commit-graph file.
37+
*/
38+
uint32_t max_changed_paths;
3139
};
3240

33-
#define DEFAULT_BLOOM_FILTER_SETTINGS { 1, 7, 10 }
41+
#define DEFAULT_BLOOM_MAX_CHANGES 512
42+
#define DEFAULT_BLOOM_FILTER_SETTINGS { 1, 7, 10, DEFAULT_BLOOM_MAX_CHANGES }
3443
#define BITS_PER_WORD 8
3544
#define BLOOMDATA_CHUNK_HEADER_SIZE 3 * sizeof(uint32_t)
3645

@@ -80,9 +89,21 @@ void add_key_to_filter(const struct bloom_key *key,
8089

8190
void init_bloom_filters(void);
8291

83-
struct bloom_filter *get_bloom_filter(struct repository *r,
84-
struct commit *c,
85-
int compute_if_not_present);
92+
enum bloom_filter_computed {
93+
BLOOM_NOT_COMPUTED = (1 << 0),
94+
BLOOM_COMPUTED = (1 << 1),
95+
BLOOM_TRUNC_LARGE = (1 << 2),
96+
BLOOM_TRUNC_EMPTY = (1 << 3),
97+
};
98+
99+
struct bloom_filter *get_or_compute_bloom_filter(struct repository *r,
100+
struct commit *c,
101+
int compute_if_not_present,
102+
const struct bloom_filter_settings *settings,
103+
enum bloom_filter_computed *computed);
104+
105+
#define get_bloom_filter(r, c) get_or_compute_bloom_filter( \
106+
(r), (c), 0, NULL, NULL)
86107

87108
int bloom_filter_contains(const struct bloom_filter *filter,
88109
const struct bloom_key *key,

builtin/commit-graph.c

Lines changed: 50 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@ static char const * const builtin_commit_graph_usage[] = {
1313
N_("git commit-graph verify [--object-dir <objdir>] [--shallow] [--[no-]progress]"),
1414
N_("git commit-graph write [--object-dir <objdir>] [--append] "
1515
"[--split[=<strategy>]] [--reachable|--stdin-packs|--stdin-commits] "
16-
"[--changed-paths] [--[no-]progress] <split options>"),
16+
"[--changed-paths] [--[no-]max-new-filters <n>] [--[no-]progress] "
17+
"<split options>"),
1718
NULL
1819
};
1920

@@ -25,7 +26,8 @@ static const char * const builtin_commit_graph_verify_usage[] = {
2526
static const char * const builtin_commit_graph_write_usage[] = {
2627
N_("git commit-graph write [--object-dir <objdir>] [--append] "
2728
"[--split[=<strategy>]] [--reachable|--stdin-packs|--stdin-commits] "
28-
"[--changed-paths] [--[no-]progress] <split options>"),
29+
"[--changed-paths] [--[no-]max-new-filters <n>] [--[no-]progress] "
30+
"<split options>"),
2931
NULL
3032
};
3133

@@ -106,7 +108,7 @@ static int graph_verify(int argc, const char **argv)
106108
FREE_AND_NULL(graph_name);
107109

108110
if (open_ok)
109-
graph = load_commit_graph_one_fd_st(fd, &st, odb);
111+
graph = load_commit_graph_one_fd_st(the_repository, fd, &st, odb);
110112
else
111113
graph = read_commit_graph_one(the_repository, odb);
112114

@@ -119,7 +121,7 @@ static int graph_verify(int argc, const char **argv)
119121
}
120122

121123
extern int read_replace_refs;
122-
static struct split_commit_graph_opts split_opts;
124+
static struct commit_graph_opts write_opts;
123125

124126
static int write_option_parse_split(const struct option *opt, const char *arg,
125127
int unset)
@@ -162,6 +164,35 @@ static int read_one_commit(struct oidset *commits, struct progress *progress,
162164
return 0;
163165
}
164166

167+
static int write_option_max_new_filters(const struct option *opt,
168+
const char *arg,
169+
int unset)
170+
{
171+
int *to = opt->value;
172+
if (unset)
173+
*to = -1;
174+
else {
175+
const char *s;
176+
*to = strtol(arg, (char **)&s, 10);
177+
if (*s)
178+
return error(_("%s expects a numerical value"),
179+
optname(opt, opt->flags));
180+
}
181+
return 0;
182+
}
183+
184+
static int git_commit_graph_write_config(const char *var, const char *value,
185+
void *cb)
186+
{
187+
if (!strcmp(var, "commitgraph.maxnewfilters"))
188+
write_opts.max_new_filters = git_config_int(var, value);
189+
/*
190+
* No need to fall-back to 'git_default_config', since this was already
191+
* called in 'cmd_commit_graph()'.
192+
*/
193+
return 0;
194+
}
195+
165196
static int graph_write(int argc, const char **argv)
166197
{
167198
struct string_list pack_indexes = STRING_LIST_INIT_NODUP;
@@ -187,27 +218,33 @@ static int graph_write(int argc, const char **argv)
187218
OPT_BOOL(0, "changed-paths", &opts.enable_changed_paths,
188219
N_("enable computation for changed paths")),
189220
OPT_BOOL(0, "progress", &opts.progress, N_("force progress reporting")),
190-
OPT_CALLBACK_F(0, "split", &split_opts.flags, NULL,
221+
OPT_CALLBACK_F(0, "split", &write_opts.split_flags, NULL,
191222
N_("allow writing an incremental commit-graph file"),
192223
PARSE_OPT_OPTARG | PARSE_OPT_NONEG,
193224
write_option_parse_split),
194-
OPT_INTEGER(0, "max-commits", &split_opts.max_commits,
225+
OPT_INTEGER(0, "max-commits", &write_opts.max_commits,
195226
N_("maximum number of commits in a non-base split commit-graph")),
196-
OPT_INTEGER(0, "size-multiple", &split_opts.size_multiple,
227+
OPT_INTEGER(0, "size-multiple", &write_opts.size_multiple,
197228
N_("maximum ratio between two levels of a split commit-graph")),
198-
OPT_EXPIRY_DATE(0, "expire-time", &split_opts.expire_time,
229+
OPT_EXPIRY_DATE(0, "expire-time", &write_opts.expire_time,
199230
N_("only expire files older than a given date-time")),
231+
OPT_CALLBACK_F(0, "max-new-filters", &write_opts.max_new_filters,
232+
NULL, N_("maximum number of changed-path Bloom filters to compute"),
233+
0, write_option_max_new_filters),
200234
OPT_END(),
201235
};
202236

203237
opts.progress = isatty(2);
204238
opts.enable_changed_paths = -1;
205-
split_opts.size_multiple = 2;
206-
split_opts.max_commits = 0;
207-
split_opts.expire_time = 0;
239+
write_opts.size_multiple = 2;
240+
write_opts.max_commits = 0;
241+
write_opts.expire_time = 0;
242+
write_opts.max_new_filters = -1;
208243

209244
trace2_cmd_mode("write");
210245

246+
git_config(git_commit_graph_write_config, &opts);
247+
211248
argc = parse_options(argc, argv, NULL,
212249
builtin_commit_graph_write_options,
213250
builtin_commit_graph_write_usage, 0);
@@ -232,7 +269,7 @@ static int graph_write(int argc, const char **argv)
232269
odb = find_odb(the_repository, opts.obj_dir);
233270

234271
if (opts.reachable) {
235-
if (write_commit_graph_reachable(odb, flags, &split_opts))
272+
if (write_commit_graph_reachable(odb, flags, &write_opts))
236273
return 1;
237274
return 0;
238275
}
@@ -261,7 +298,7 @@ static int graph_write(int argc, const char **argv)
261298
opts.stdin_packs ? &pack_indexes : NULL,
262299
opts.stdin_commits ? &commits : NULL,
263300
flags,
264-
&split_opts))
301+
&write_opts))
265302
result = 1;
266303

267304
cleanup:

0 commit comments

Comments
 (0)