Skip to content

Commit 410334e

Browse files
derrickstoleegitster
authored andcommitted
read-cache: use hashfile instead of git_hash_ctx
The do_write_index() method in read-cache.c has its own hashing logic and buffering mechanism. Specifically, the ce_write() method was introduced by 4990aad (Speed up index file writing by chunking it nicely, 2005-04-20) and similar mechanisms were introduced a few months later in c38138c (git-pack-objects: write the pack files with a SHA1 csum, 2005-06-26). Based on the timing, in the early days of the Git codebase, I figured that these roughly equivalent code paths were never unified only because it got lost in the shuffle. The hashfile API has since been used extensively in other file formats, such as pack-indexes, multi-pack-indexes, and commit-graphs. Therefore, it seems prudent to unify the index writing code to use the same mechanism. I discovered this disparity while trying to create a new index format that uses the chunk-format API. That API uses a hashfile as its base, so it is incompatible with the custom code in read-cache.c. This rewrite is rather straightforward. It replaces all writes to the temporary file with writes to the hashfile struct. This takes care of many of the direct interactions with the_hash_algo. There are still some git_hash_ctx uses remaining: the extension headers are hashed for use in the End of Index Entries (EOIE) extension. This use of the git_hash_ctx is left as-is. There are multiple reasons to not use a hashfile here, including the fact that the data is not actually writing to a file, just a hash computation. These hashes do not block our adoption of the chunk-format API in a future change to the index, so leave it as-is. The internals of the algorithms are mostly identical. Previously, the hashfile API used a smaller 8KB buffer instead of the 128KB buffer from read-cache.c. The previous change already unified these sizes. There is one subtle point: we do not pass the CSUM_FSYNC to the finalize_hashfile() method, which differs from most consumers of the hashfile API. The extra fsync() call indicated by this flag causes a significant peformance degradation that is noticeable for quick commands that write the index, such as "git add". Other consumers can absorb this cost with their more complicated data structure organization, and further writing structures such as pack-files and commit-graphs is rarely in the critical path for common user interactions. Some static methods become orphaned in this diff, so I marked them as MAYBE_UNUSED. The diff is much harder to read if they are deleted during this change. Instead, they will be deleted in the following change. In addition to the test suite passing, I computed indexes using the previous binaries and the binaries compiled after this change, and found the index data to be exactly equal. Finally, I did extensive performance testing of "git update-index --force-write" on repos of various sizes, including one with over 2 million paths at HEAD. These tests demonstrated less than 1% difference in behavior. As expected, the performance should be considered unchanged. The previous changes to increase the hashfile buffer size from 8K to 128K ensured this change would not create a peformance regression. Signed-off-by: Derrick Stolee <[email protected]> Signed-off-by: Junio C Hamano <[email protected]>
1 parent 2ca245f commit 410334e

File tree

1 file changed

+66
-71
lines changed

1 file changed

+66
-71
lines changed

read-cache.c

Lines changed: 66 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
#include "thread-utils.h"
2727
#include "progress.h"
2828
#include "sparse-index.h"
29+
#include "csum-file.h"
2930

3031
/* Mask for the name length in ce_flags in the on-disk index */
3132

@@ -2525,6 +2526,7 @@ int repo_index_has_changes(struct repository *repo,
25252526
static unsigned char write_buffer[WRITE_BUFFER_SIZE];
25262527
static unsigned long write_buffer_len;
25272528

2529+
MAYBE_UNUSED
25282530
static int ce_write_flush(git_hash_ctx *context, int fd)
25292531
{
25302532
unsigned int buffered = write_buffer_len;
@@ -2537,6 +2539,7 @@ static int ce_write_flush(git_hash_ctx *context, int fd)
25372539
return 0;
25382540
}
25392541

2542+
MAYBE_UNUSED
25402543
static int ce_write(git_hash_ctx *context, int fd, void *data, unsigned int len)
25412544
{
25422545
while (len) {
@@ -2559,19 +2562,24 @@ static int ce_write(git_hash_ctx *context, int fd, void *data, unsigned int len)
25592562
return 0;
25602563
}
25612564

2562-
static int write_index_ext_header(git_hash_ctx *context, git_hash_ctx *eoie_context,
2563-
int fd, unsigned int ext, unsigned int sz)
2565+
static int write_index_ext_header(struct hashfile *f,
2566+
git_hash_ctx *eoie_f,
2567+
unsigned int ext,
2568+
unsigned int sz)
25642569
{
2565-
ext = htonl(ext);
2566-
sz = htonl(sz);
2567-
if (eoie_context) {
2568-
the_hash_algo->update_fn(eoie_context, &ext, 4);
2569-
the_hash_algo->update_fn(eoie_context, &sz, 4);
2570+
hashwrite_be32(f, ext);
2571+
hashwrite_be32(f, sz);
2572+
2573+
if (eoie_f) {
2574+
ext = htonl(ext);
2575+
sz = htonl(sz);
2576+
the_hash_algo->update_fn(eoie_f, &ext, sizeof(ext));
2577+
the_hash_algo->update_fn(eoie_f, &sz, sizeof(sz));
25702578
}
2571-
return ((ce_write(context, fd, &ext, 4) < 0) ||
2572-
(ce_write(context, fd, &sz, 4) < 0)) ? -1 : 0;
2579+
return 0;
25732580
}
25742581

2582+
MAYBE_UNUSED
25752583
static int ce_flush(git_hash_ctx *context, int fd, unsigned char *hash)
25762584
{
25772585
unsigned int left = write_buffer_len;
@@ -2673,11 +2681,10 @@ static void copy_cache_entry_to_ondisk(struct ondisk_cache_entry *ondisk,
26732681
}
26742682
}
26752683

2676-
static int ce_write_entry(git_hash_ctx *c, int fd, struct cache_entry *ce,
2684+
static int ce_write_entry(struct hashfile *f, struct cache_entry *ce,
26772685
struct strbuf *previous_name, struct ondisk_cache_entry *ondisk)
26782686
{
26792687
int size;
2680-
int result;
26812688
unsigned int saved_namelen;
26822689
int stripped_name = 0;
26832690
static unsigned char padding[8] = { 0x00 };
@@ -2693,11 +2700,9 @@ static int ce_write_entry(git_hash_ctx *c, int fd, struct cache_entry *ce,
26932700
if (!previous_name) {
26942701
int len = ce_namelen(ce);
26952702
copy_cache_entry_to_ondisk(ondisk, ce);
2696-
result = ce_write(c, fd, ondisk, size);
2697-
if (!result)
2698-
result = ce_write(c, fd, ce->name, len);
2699-
if (!result)
2700-
result = ce_write(c, fd, padding, align_padding_size(size, len));
2703+
hashwrite(f, ondisk, size);
2704+
hashwrite(f, ce->name, len);
2705+
hashwrite(f, padding, align_padding_size(size, len));
27012706
} else {
27022707
int common, to_remove, prefix_size;
27032708
unsigned char to_remove_vi[16];
@@ -2711,13 +2716,10 @@ static int ce_write_entry(git_hash_ctx *c, int fd, struct cache_entry *ce,
27112716
prefix_size = encode_varint(to_remove, to_remove_vi);
27122717

27132718
copy_cache_entry_to_ondisk(ondisk, ce);
2714-
result = ce_write(c, fd, ondisk, size);
2715-
if (!result)
2716-
result = ce_write(c, fd, to_remove_vi, prefix_size);
2717-
if (!result)
2718-
result = ce_write(c, fd, ce->name + common, ce_namelen(ce) - common);
2719-
if (!result)
2720-
result = ce_write(c, fd, padding, 1);
2719+
hashwrite(f, ondisk, size);
2720+
hashwrite(f, to_remove_vi, prefix_size);
2721+
hashwrite(f, ce->name + common, ce_namelen(ce) - common);
2722+
hashwrite(f, padding, 1);
27212723

27222724
strbuf_splice(previous_name, common, to_remove,
27232725
ce->name + common, ce_namelen(ce) - common);
@@ -2727,7 +2729,7 @@ static int ce_write_entry(git_hash_ctx *c, int fd, struct cache_entry *ce,
27272729
ce->ce_flags &= ~CE_STRIP_NAME;
27282730
}
27292731

2730-
return result;
2732+
return 0;
27312733
}
27322734

27332735
/*
@@ -2839,8 +2841,8 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
28392841
int strip_extensions)
28402842
{
28412843
uint64_t start = getnanotime();
2842-
int newfd = tempfile->fd;
2843-
git_hash_ctx c, eoie_c;
2844+
struct hashfile *f;
2845+
git_hash_ctx *eoie_c = NULL;
28442846
struct cache_header hdr;
28452847
int i, err = 0, removed, extended, hdr_version;
28462848
struct cache_entry **cache = istate->cache;
@@ -2854,6 +2856,8 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
28542856
struct index_entry_offset_table *ieot = NULL;
28552857
int nr, nr_threads;
28562858

2859+
f = hashfd(tempfile->fd, tempfile->filename.buf);
2860+
28572861
for (i = removed = extended = 0; i < entries; i++) {
28582862
if (cache[i]->ce_flags & CE_REMOVE)
28592863
removed++;
@@ -2882,9 +2886,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
28822886
hdr.hdr_version = htonl(hdr_version);
28832887
hdr.hdr_entries = htonl(entries - removed);
28842888

2885-
the_hash_algo->init_fn(&c);
2886-
if (ce_write(&c, newfd, &hdr, sizeof(hdr)) < 0)
2887-
return -1;
2889+
hashwrite(f, &hdr, sizeof(hdr));
28882890

28892891
if (!HAVE_THREADS || git_config_get_index_threads(&nr_threads))
28902892
nr_threads = 1;
@@ -2919,12 +2921,8 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
29192921
}
29202922
}
29212923

2922-
offset = lseek(newfd, 0, SEEK_CUR);
2923-
if (offset < 0) {
2924-
free(ieot);
2925-
return -1;
2926-
}
2927-
offset += write_buffer_len;
2924+
offset = hashfile_total(f);
2925+
29282926
nr = 0;
29292927
previous_name = (hdr_version == 4) ? &previous_name_buf : NULL;
29302928

@@ -2959,14 +2957,10 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
29592957
if (previous_name)
29602958
previous_name->buf[0] = 0;
29612959
nr = 0;
2962-
offset = lseek(newfd, 0, SEEK_CUR);
2963-
if (offset < 0) {
2964-
free(ieot);
2965-
return -1;
2966-
}
2967-
offset += write_buffer_len;
2960+
2961+
offset = hashfile_total(f);
29682962
}
2969-
if (ce_write_entry(&c, newfd, ce, previous_name, (struct ondisk_cache_entry *)&ondisk) < 0)
2963+
if (ce_write_entry(f, ce, previous_name, (struct ondisk_cache_entry *)&ondisk) < 0)
29702964
err = -1;
29712965

29722966
if (err)
@@ -2985,14 +2979,16 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
29852979
return err;
29862980
}
29872981

2988-
/* Write extension data here */
2989-
offset = lseek(newfd, 0, SEEK_CUR);
2990-
if (offset < 0) {
2991-
free(ieot);
2992-
return -1;
2982+
offset = hashfile_total(f);
2983+
2984+
/*
2985+
* The extension headers must be hashed on their own for the
2986+
* EOIE extension. Create a hashfile here to compute that hash.
2987+
*/
2988+
if (offset && record_eoie()) {
2989+
CALLOC_ARRAY(eoie_c, 1);
2990+
the_hash_algo->init_fn(eoie_c);
29932991
}
2994-
offset += write_buffer_len;
2995-
the_hash_algo->init_fn(&eoie_c);
29962992

29972993
/*
29982994
* Lets write out CACHE_EXT_INDEXENTRYOFFSETTABLE first so that we
@@ -3005,8 +3001,8 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
30053001
struct strbuf sb = STRBUF_INIT;
30063002

30073003
write_ieot_extension(&sb, ieot);
3008-
err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_INDEXENTRYOFFSETTABLE, sb.len) < 0
3009-
|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
3004+
err = write_index_ext_header(f, eoie_c, CACHE_EXT_INDEXENTRYOFFSETTABLE, sb.len) < 0;
3005+
hashwrite(f, sb.buf, sb.len);
30103006
strbuf_release(&sb);
30113007
free(ieot);
30123008
if (err)
@@ -3018,9 +3014,9 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
30183014
struct strbuf sb = STRBUF_INIT;
30193015

30203016
err = write_link_extension(&sb, istate) < 0 ||
3021-
write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_LINK,
3022-
sb.len) < 0 ||
3023-
ce_write(&c, newfd, sb.buf, sb.len) < 0;
3017+
write_index_ext_header(f, eoie_c, CACHE_EXT_LINK,
3018+
sb.len) < 0;
3019+
hashwrite(f, sb.buf, sb.len);
30243020
strbuf_release(&sb);
30253021
if (err)
30263022
return -1;
@@ -3029,8 +3025,8 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
30293025
struct strbuf sb = STRBUF_INIT;
30303026

30313027
cache_tree_write(&sb, istate->cache_tree);
3032-
err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_TREE, sb.len) < 0
3033-
|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
3028+
err = write_index_ext_header(f, eoie_c, CACHE_EXT_TREE, sb.len) < 0;
3029+
hashwrite(f, sb.buf, sb.len);
30343030
strbuf_release(&sb);
30353031
if (err)
30363032
return -1;
@@ -3039,9 +3035,9 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
30393035
struct strbuf sb = STRBUF_INIT;
30403036

30413037
resolve_undo_write(&sb, istate->resolve_undo);
3042-
err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_RESOLVE_UNDO,
3043-
sb.len) < 0
3044-
|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
3038+
err = write_index_ext_header(f, eoie_c, CACHE_EXT_RESOLVE_UNDO,
3039+
sb.len) < 0;
3040+
hashwrite(f, sb.buf, sb.len);
30453041
strbuf_release(&sb);
30463042
if (err)
30473043
return -1;
@@ -3050,9 +3046,9 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
30503046
struct strbuf sb = STRBUF_INIT;
30513047

30523048
write_untracked_extension(&sb, istate->untracked);
3053-
err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_UNTRACKED,
3054-
sb.len) < 0 ||
3055-
ce_write(&c, newfd, sb.buf, sb.len) < 0;
3049+
err = write_index_ext_header(f, eoie_c, CACHE_EXT_UNTRACKED,
3050+
sb.len) < 0;
3051+
hashwrite(f, sb.buf, sb.len);
30563052
strbuf_release(&sb);
30573053
if (err)
30583054
return -1;
@@ -3061,14 +3057,14 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
30613057
struct strbuf sb = STRBUF_INIT;
30623058

30633059
write_fsmonitor_extension(&sb, istate);
3064-
err = write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_FSMONITOR, sb.len) < 0
3065-
|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
3060+
err = write_index_ext_header(f, eoie_c, CACHE_EXT_FSMONITOR, sb.len) < 0;
3061+
hashwrite(f, sb.buf, sb.len);
30663062
strbuf_release(&sb);
30673063
if (err)
30683064
return -1;
30693065
}
30703066
if (istate->sparse_index) {
3071-
if (write_index_ext_header(&c, &eoie_c, newfd, CACHE_EXT_SPARSE_DIRECTORIES, 0) < 0)
3067+
if (write_index_ext_header(f, eoie_c, CACHE_EXT_SPARSE_DIRECTORIES, 0) < 0)
30723068
return -1;
30733069
}
30743070

@@ -3078,19 +3074,18 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
30783074
* read. Write it out regardless of the strip_extensions parameter as we need it
30793075
* when loading the shared index.
30803076
*/
3081-
if (offset && record_eoie()) {
3077+
if (eoie_c) {
30823078
struct strbuf sb = STRBUF_INIT;
30833079

3084-
write_eoie_extension(&sb, &eoie_c, offset);
3085-
err = write_index_ext_header(&c, NULL, newfd, CACHE_EXT_ENDOFINDEXENTRIES, sb.len) < 0
3086-
|| ce_write(&c, newfd, sb.buf, sb.len) < 0;
3080+
write_eoie_extension(&sb, eoie_c, offset);
3081+
err = write_index_ext_header(f, NULL, CACHE_EXT_ENDOFINDEXENTRIES, sb.len) < 0;
3082+
hashwrite(f, sb.buf, sb.len);
30873083
strbuf_release(&sb);
30883084
if (err)
30893085
return -1;
30903086
}
30913087

3092-
if (ce_flush(&c, newfd, istate->oid.hash))
3093-
return -1;
3088+
finalize_hashfile(f, istate->oid.hash, CSUM_HASH_IN_STREAM);
30943089
if (close_tempfile_gently(tempfile)) {
30953090
error(_("could not close '%s'"), get_tempfile_path(tempfile));
30963091
return -1;

0 commit comments

Comments
 (0)