Skip to content

Commit 6b7e02f

Browse files
dschoGit for Windows Build Agent
authored andcommitted
pack-objects: create new name-hash algorithm (#5157)
This is an updated version of gitgitgadget#1785, intended for early consumption into Git for Windows. The idea here is to add a new `--full-name-hash` option to `git pack-objects` and `git repack`. This adjusts the name-hash value used for finding delta bases in such a way that uses the full path name with a lower likelihood of collisions than the default name-hash algorithm. In many repositories with name-hash collisions and many versions of those paths, this can significantly reduce the size of a full repack. It can also help in certain cases of `git push`, but only if the pack is already artificially inflated by name-hash collisions; cases that find "sibling" deltas as better choices become worse with `--full-name-hash`. Thus, this option is currently recommended for full repacks of large repos, and on client machines without reachability bitmaps. Some care is taken to ignore this option when using bitmaps, either writing bitmaps or using a bitmap walk during reads. The bitmap file format contains name-hash values, but no way to indicate which function is used, so compatibility is a concern for bitmaps. Future work could explore this idea. After this PR is merged, then the more-involved `--path-walk` option may be considered.
2 parents 7e3ac2d + 9601e22 commit 6b7e02f

20 files changed

+258
-13
lines changed

Documentation/git-pack-objects.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ SYNOPSIS
1616
[--cruft] [--cruft-expiration=<time>]
1717
[--stdout [--filter=<filter-spec>] | <base-name>]
1818
[--shallow] [--keep-true-parents] [--[no-]sparse]
19-
[--path-walk] < <object-list>
19+
[--path-walk] [--full-name-hash] < <object-list>
2020

2121

2222
DESCRIPTION

Documentation/git-repack.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ SYNOPSIS
1111
[verse]
1212
'git repack' [-a] [-A] [-d] [-f] [-F] [-l] [-n] [-q] [-b] [-m]
1313
[--window=<n>] [--depth=<n>] [--threads=<n>] [--keep-pack=<pack-name>]
14-
[--write-midx] [--path-walk]
14+
[--write-midx] [--path-walk] [--full-name-hash]
1515

1616
DESCRIPTION
1717
-----------

Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -824,6 +824,7 @@ TEST_BUILTINS_OBJS += test-lazy-init-name-hash.o
824824
TEST_BUILTINS_OBJS += test-match-trees.o
825825
TEST_BUILTINS_OBJS += test-mergesort.o
826826
TEST_BUILTINS_OBJS += test-mktemp.o
827+
TEST_BUILTINS_OBJS += test-name-hash.o
827828
TEST_BUILTINS_OBJS += test-online-cpus.o
828829
TEST_BUILTINS_OBJS += test-pack-mtimes.o
829830
TEST_BUILTINS_OBJS += test-parse-options.o

builtin/pack-objects.c

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,7 @@ static const char *pack_usage[] = {
192192
" [--cruft] [--cruft-expiration=<time>]\n"
193193
" [--stdout [--filter=<filter-spec>] | <base-name>]\n"
194194
" [--shallow] [--keep-true-parents] [--[no-]sparse]\n"
195-
" [--path-walk] < <object-list>"),
195+
" [--path-walk] [--full-name-hash] < <object-list>"),
196196
NULL
197197
};
198198

@@ -276,6 +276,14 @@ struct configured_exclusion {
276276
static struct oidmap configured_exclusions;
277277

278278
static struct oidset excluded_by_config;
279+
static int use_full_name_hash = -1;
280+
281+
static inline uint32_t pack_name_hash_fn(const char *name)
282+
{
283+
if (use_full_name_hash)
284+
return pack_full_name_hash(name);
285+
return pack_name_hash(name);
286+
}
279287

280288
/*
281289
* stats
@@ -1709,7 +1717,7 @@ static int add_object_entry(const struct object_id *oid, enum object_type type,
17091717
return 0;
17101718
}
17111719

1712-
create_object_entry(oid, type, pack_name_hash(name),
1720+
create_object_entry(oid, type, pack_name_hash_fn(name),
17131721
exclude, name && no_try_delta(name),
17141722
found_pack, found_offset);
17151723
return 1;
@@ -1923,7 +1931,7 @@ static void add_preferred_base_object(const char *name)
19231931
{
19241932
struct pbase_tree *it;
19251933
size_t cmplen;
1926-
unsigned hash = pack_name_hash(name);
1934+
unsigned hash = pack_name_hash_fn(name);
19271935

19281936
if (!num_preferred_base || check_pbase_path(hash))
19291937
return;
@@ -3644,7 +3652,7 @@ static void show_object_pack_hint(struct object *object, const char *name,
36443652
* here using a now in order to perhaps improve the delta selection
36453653
* process.
36463654
*/
3647-
oe->hash = pack_name_hash(name);
3655+
oe->hash = pack_name_hash_fn(name);
36483656
oe->no_try_delta = name && no_try_delta(name);
36493657

36503658
stdin_packs_hints_nr++;
@@ -3794,7 +3802,7 @@ static void add_cruft_object_entry(const struct object_id *oid, enum object_type
37943802
entry = packlist_find(&to_pack, oid);
37953803
if (entry) {
37963804
if (name) {
3797-
entry->hash = pack_name_hash(name);
3805+
entry->hash = pack_name_hash_fn(name);
37983806
entry->no_try_delta = no_try_delta(name);
37993807
}
38003808
} else {
@@ -3817,7 +3825,7 @@ static void add_cruft_object_entry(const struct object_id *oid, enum object_type
38173825
return;
38183826
}
38193827

3820-
entry = create_object_entry(oid, type, pack_name_hash(name),
3828+
entry = create_object_entry(oid, type, pack_name_hash_fn(name),
38213829
0, name && no_try_delta(name),
38223830
pack, offset);
38233831
}
@@ -4739,6 +4747,8 @@ int cmd_pack_objects(int argc,
47394747
OPT_STRING_LIST(0, "uri-protocol", &uri_protocols,
47404748
N_("protocol"),
47414749
N_("exclude any configured uploadpack.blobpackfileuri with this protocol")),
4750+
OPT_BOOL(0, "full-name-hash", &use_full_name_hash,
4751+
N_("(EXPERIMENTAL!) optimize delta compression across identical path names over time")),
47424752
OPT_END(),
47434753
};
47444754

@@ -4917,6 +4927,11 @@ int cmd_pack_objects(int argc,
49174927
if (pack_to_stdout || !rev_list_all)
49184928
write_bitmap_index = 0;
49194929

4930+
if (write_bitmap_index && use_full_name_hash > 0)
4931+
die(_("currently, the --full-name-hash option is incompatible with --write-bitmap-index"));
4932+
if (use_full_name_hash < 0)
4933+
use_full_name_hash = git_env_bool("GIT_TEST_FULL_NAME_HASH", 0);
4934+
49204935
if (use_delta_islands)
49214936
strvec_push(&rp, "--topo-order");
49224937

builtin/repack.c

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ static char *packdir, *packtmp_name, *packtmp;
4141
static const char *const git_repack_usage[] = {
4242
N_("git repack [-a] [-A] [-d] [-f] [-F] [-l] [-n] [-q] [-b] [-m]\n"
4343
"[--window=<n>] [--depth=<n>] [--threads=<n>] [--keep-pack=<pack-name>]\n"
44-
"[--write-midx] [--path-walk]"),
44+
"[--write-midx] [--path-walk] [--full-name-hash]"),
4545
NULL
4646
};
4747

@@ -61,6 +61,7 @@ struct pack_objects_args {
6161
int quiet;
6262
int local;
6363
int path_walk;
64+
int full_name_hash;
6465
struct list_objects_filter_options filter_options;
6566
};
6667

@@ -311,6 +312,8 @@ static void prepare_pack_objects(struct child_process *cmd,
311312
strvec_pushf(&cmd->args, "--no-reuse-object");
312313
if (args->path_walk)
313314
strvec_pushf(&cmd->args, "--path-walk");
315+
if (args->full_name_hash)
316+
strvec_pushf(&cmd->args, "--full-name-hash");
314317
if (args->local)
315318
strvec_push(&cmd->args, "--local");
316319
if (args->quiet)
@@ -1210,6 +1213,8 @@ int cmd_repack(int argc,
12101213
N_("pass --no-reuse-object to git-pack-objects")),
12111214
OPT_BOOL(0, "path-walk", &po_args.path_walk,
12121215
N_("pass --path-walk to git-pack-objects")),
1216+
OPT_BOOL(0, "full-name-hash", &po_args.full_name_hash,
1217+
N_("(EXPERIMENTAL!) pass --full-name-hash to git-pack-objects")),
12131218
OPT_NEGBIT('n', NULL, &run_update_server_info,
12141219
N_("do not run git-update-server-info"), 1),
12151220
OPT__QUIET(&po_args.quiet, N_("be quiet")),

ci/run-build-and-tests.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ linux-TEST-vars)
2626
export GIT_TEST_CHECKOUT_WORKERS=2
2727
export GIT_TEST_PACK_USE_BITMAP_BOUNDARY_TRAVERSAL=1
2828
export GIT_TEST_PACK_PATH_WALK=1
29+
export GIT_TEST_FULL_NAME_HASH=1
2930
;;
3031
linux-clang)
3132
export GIT_TEST_DEFAULT_HASH=sha1

pack-objects.h

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,27 @@ static inline uint32_t pack_name_hash(const char *name)
219219
return hash;
220220
}
221221

222+
static inline uint32_t pack_full_name_hash(const char *name)
223+
{
224+
const uint32_t bigp = 1234572167U;
225+
uint32_t c, hash = bigp;
226+
227+
if (!name)
228+
return 0;
229+
230+
/*
231+
* Do the simplest thing that will resemble pseudo-randomness: add
232+
* random multiples of a large prime number with a binary shift.
233+
* The goal is not to be cryptographic, but to be generally
234+
* uniformly distributed.
235+
*/
236+
while ((c = *name++) != 0) {
237+
hash += c * bigp;
238+
hash = (hash >> 5) | (hash << 27);
239+
}
240+
return hash;
241+
}
242+
222243
static inline enum object_type oe_type(const struct object_entry *e)
223244
{
224245
return e->type_valid ? e->type_ : OBJ_BAD;

t/README

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -495,6 +495,10 @@ a test and then fails then the whole test run will abort. This can help to make
495495
sure the expected tests are executed and not silently skipped when their
496496
dependency breaks or is simply not present in a new environment.
497497

498+
GIT_TEST_FULL_NAME_HASH=<boolean>, when true, sets the default name-hash
499+
function in 'git pack-objects' to be the one used by the --full-name-hash
500+
option.
501+
498502
Naming Tests
499503
------------
500504

t/helper/test-name-hash.c

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
/*
2+
* test-name-hash.c: Read a list of paths over stdin and report on their
3+
* name-hash and full name-hash.
4+
*/
5+
6+
#include "test-tool.h"
7+
#include "git-compat-util.h"
8+
#include "pack-objects.h"
9+
#include "strbuf.h"
10+
11+
int cmd__name_hash(int argc UNUSED, const char **argv UNUSED)
12+
{
13+
struct strbuf line = STRBUF_INIT;
14+
15+
while (!strbuf_getline(&line, stdin)) {
16+
uint32_t name_hash = pack_name_hash(line.buf);
17+
uint32_t full_hash = pack_full_name_hash(line.buf);
18+
19+
printf("%10"PRIu32"\t%10"PRIu32"\t%s\n", name_hash, full_hash, line.buf);
20+
}
21+
22+
strbuf_release(&line);
23+
return 0;
24+
}

t/helper/test-tool.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ static struct test_cmd cmds[] = {
4444
{ "match-trees", cmd__match_trees },
4545
{ "mergesort", cmd__mergesort },
4646
{ "mktemp", cmd__mktemp },
47+
{ "name-hash", cmd__name_hash },
4748
{ "online-cpus", cmd__online_cpus },
4849
{ "pack-mtimes", cmd__pack_mtimes },
4950
{ "parse-options", cmd__parse_options },

0 commit comments

Comments
 (0)