Skip to content

Commit 568508e

Browse files
committed
bulk-checkin: replace fast-import based implementation
This extends the earlier approach of streaming a large file directly from the filesystem to its own packfile, and allows "git add" to send large files directly into a single pack. Older code used to spawn fast-import for this; the new bulk-checkin API replaces it.

Signed-off-by: Junio C Hamano <gitster@pobox.com>
1 parent 6c52614 commit 568508e

File tree

11 files changed

+403
-78
lines changed

11 files changed

+403
-78
lines changed

Makefile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -505,6 +505,7 @@ LIB_H += argv-array.h
505505
LIB_H += attr.h
506506
LIB_H += blob.h
507507
LIB_H += builtin.h
508+
LIB_H += bulk-checkin.h
508509
LIB_H += cache.h
509510
LIB_H += cache-tree.h
510511
LIB_H += color.h
@@ -591,6 +592,7 @@ LIB_OBJS += base85.o
591592
LIB_OBJS += bisect.o
592593
LIB_OBJS += blob.o
593594
LIB_OBJS += branch.o
595+
LIB_OBJS += bulk-checkin.o
594596
LIB_OBJS += bundle.o
595597
LIB_OBJS += cache-tree.o
596598
LIB_OBJS += color.o

builtin/add.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include "diff.h"
1414
#include "diffcore.h"
1515
#include "revision.h"
16+
#include "bulk-checkin.h"
1617

1718
static const char * const builtin_add_usage[] = {
1819
"git add [options] [--] <filepattern>...",
@@ -458,11 +459,15 @@ int cmd_add(int argc, const char **argv, const char *prefix)
458459
free(seen);
459460
}
460461

462+
plug_bulk_checkin();
463+
461464
exit_status |= add_files_to_cache(prefix, pathspec, flags);
462465

463466
if (add_new_files)
464467
exit_status |= add_files(&dir, flags);
465468

469+
unplug_bulk_checkin();
470+
466471
finish:
467472
if (active_cache_changed) {
468473
if (write_cache(newfd, active_cache, active_nr) ||

builtin/pack-objects.c

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ static struct pack_idx_option pack_idx_opts;
7676
static const char *base_name;
7777
static int progress = 1;
7878
static int window = 10;
79-
static unsigned long pack_size_limit, pack_size_limit_cfg;
79+
static unsigned long pack_size_limit;
8080
static int depth = 50;
8181
static int delta_search_threads;
8282
static int pack_to_stdout;
@@ -2009,10 +2009,6 @@ static int git_pack_config(const char *k, const char *v, void *cb)
20092009
pack_idx_opts.version);
20102010
return 0;
20112011
}
2012-
if (!strcmp(k, "pack.packsizelimit")) {
2013-
pack_size_limit_cfg = git_config_ulong(k, v);
2014-
return 0;
2015-
}
20162012
return git_default_config(k, v, cb);
20172013
}
20182014

bulk-checkin.c

Lines changed: 275 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,275 @@
1+
/*
2+
* Copyright (c) 2011, Google Inc.
3+
*/
4+
#include "bulk-checkin.h"
5+
#include "csum-file.h"
6+
#include "pack.h"
7+
8+
static int pack_compression_level = Z_DEFAULT_COMPRESSION;
9+
10+
static struct bulk_checkin_state {
11+
unsigned plugged:1;
12+
13+
char *pack_tmp_name;
14+
struct sha1file *f;
15+
off_t offset;
16+
struct pack_idx_option pack_idx_opts;
17+
18+
struct pack_idx_entry **written;
19+
uint32_t alloc_written;
20+
uint32_t nr_written;
21+
} state;
22+
23+
static void finish_bulk_checkin(struct bulk_checkin_state *state)
24+
{
25+
unsigned char sha1[20];
26+
char packname[PATH_MAX];
27+
int i;
28+
29+
if (!state->f)
30+
return;
31+
32+
if (state->nr_written == 0) {
33+
close(state->f->fd);
34+
unlink(state->pack_tmp_name);
35+
goto clear_exit;
36+
} else if (state->nr_written == 1) {
37+
sha1close(state->f, sha1, CSUM_FSYNC);
38+
} else {
39+
int fd = sha1close(state->f, sha1, 0);
40+
fixup_pack_header_footer(fd, sha1, state->pack_tmp_name,
41+
state->nr_written, sha1,
42+
state->offset);
43+
close(fd);
44+
}
45+
46+
sprintf(packname, "%s/pack/pack-", get_object_directory());
47+
finish_tmp_packfile(packname, state->pack_tmp_name,
48+
state->written, state->nr_written,
49+
&state->pack_idx_opts, sha1);
50+
for (i = 0; i < state->nr_written; i++)
51+
free(state->written[i]);
52+
53+
clear_exit:
54+
free(state->written);
55+
memset(state, 0, sizeof(*state));
56+
57+
/* Make objects we just wrote available to ourselves */
58+
reprepare_packed_git();
59+
}
60+
61+
static int already_written(struct bulk_checkin_state *state, unsigned char sha1[])
62+
{
63+
int i;
64+
65+
/* The object may already exist in the repository */
66+
if (has_sha1_file(sha1))
67+
return 1;
68+
69+
/* Might want to keep the list sorted */
70+
for (i = 0; i < state->nr_written; i++)
71+
if (!hashcmp(state->written[i]->sha1, sha1))
72+
return 1;
73+
74+
/* This is a new object we need to keep */
75+
return 0;
76+
}
77+
78+
/*
 * Read the contents from fd for size bytes, streaming it to the
 * packfile in state while updating the hash in ctx. Signal a failure
 * by returning a negative value when the resulting pack would exceed
 * the pack size limit and this is not the first object in the pack,
 * so that the caller can discard what we wrote from the current pack
 * by truncating it and opening a new one. The caller will then call
 * us again after rewinding the input fd.
 *
 * The value behind the already_hashed_to pointer must be left
 * untouched by the caller between such calls: it records how far
 * into the input the hash has been fed, so that no byte is hashed
 * twice when we are called again. This way, the caller does not have
 * to checkpoint its hash status before calling us just in case we ask
 * it to call us again with a new pack.
 *
 * Nothing is written to the pack unless flags has HASH_WRITE_OBJECT;
 * the input is still read, hashed and deflated in that case.
 *
 * Returns 0 when the whole object has been processed and -1 when the
 * size limit was hit as described above; dies on read or deflate
 * errors.
 */
static int stream_to_pack(struct bulk_checkin_state *state,
			  git_SHA_CTX *ctx, off_t *already_hashed_to,
			  int fd, size_t size, enum object_type type,
			  const char *path, unsigned flags)
{
	git_zstream s;
	/*
	 * s.next_in keeps pointing into ibuf while deflate has not
	 * consumed all of it, so the buffer has to outlive a single
	 * loop iteration.
	 */
	unsigned char ibuf[16384];
	unsigned char obuf[16384];
	unsigned hdrlen;
	int status = Z_OK;
	int write_object = (flags & HASH_WRITE_OBJECT);
	off_t offset = 0;

	memset(&s, 0, sizeof(s));
	git_deflate_init(&s, pack_compression_level);

	/* the in-pack object header shares the first output buffer */
	hdrlen = encode_in_pack_object_header(type, size, obuf);
	s.next_out = obuf + hdrlen;
	s.avail_out = sizeof(obuf) - hdrlen;

	while (status != Z_STREAM_END) {
		if (size && !s.avail_in) {
			ssize_t rsize = size < sizeof(ibuf) ? size : sizeof(ibuf);

			/* a short read() is not an error; keep reading */
			if (read_in_full(fd, ibuf, rsize) != rsize)
				die("failed to read %d bytes from '%s'",
				    (int)rsize, path);
			offset += rsize;
			if (*already_hashed_to < offset) {
				size_t hsize = offset - *already_hashed_to;
				if ((size_t)rsize < hsize)
					hsize = rsize;
				/*
				 * Only the tail of this chunk is still
				 * unhashed when hsize < rsize.
				 */
				if (hsize)
					git_SHA1_Update(ctx,
							ibuf + (rsize - hsize),
							hsize);
				*already_hashed_to = offset;
			}
			s.next_in = ibuf;
			s.avail_in = rsize;
			size -= rsize;
		}

		/* ask for the end of the stream once all input was read */
		status = git_deflate(&s, size ? 0 : Z_FINISH);

		if (!s.avail_out || status == Z_STREAM_END) {
			if (write_object) {
				size_t written = s.next_out - obuf;

				/* would we bust the size limit? */
				if (state->nr_written &&
				    pack_size_limit_cfg &&
				    pack_size_limit_cfg < state->offset + written) {
					git_deflate_abort(&s);
					return -1;
				}

				sha1write(state->f, obuf, written);
				state->offset += written;
			}
			s.next_out = obuf;
			s.avail_out = sizeof(obuf);
		}

		switch (status) {
		case Z_OK:
		case Z_BUF_ERROR:
		case Z_STREAM_END:
			continue;
		default:
			die("unexpected deflate failure: %d", status);
		}
	}
	git_deflate_end(&s);
	return 0;
}
167+
168+
/* Lazily create backing packfile for the state */
169+
static void prepare_to_stream(struct bulk_checkin_state *state,
170+
unsigned flags)
171+
{
172+
if (!(flags & HASH_WRITE_OBJECT) || state->f)
173+
return;
174+
175+
state->f = create_tmp_packfile(&state->pack_tmp_name);
176+
reset_pack_idx_option(&state->pack_idx_opts);
177+
178+
/* Pretend we are going to write only one object */
179+
state->offset = write_pack_header(state->f, 1);
180+
if (!state->offset)
181+
die_errno("unable to write pack header");
182+
}
183+
184+
static int deflate_to_pack(struct bulk_checkin_state *state,
185+
unsigned char result_sha1[],
186+
int fd, size_t size,
187+
enum object_type type, const char *path,
188+
unsigned flags)
189+
{
190+
off_t seekback, already_hashed_to;
191+
git_SHA_CTX ctx;
192+
unsigned char obuf[16384];
193+
unsigned header_len;
194+
struct sha1file_checkpoint checkpoint;
195+
struct pack_idx_entry *idx = NULL;
196+
197+
seekback = lseek(fd, 0, SEEK_CUR);
198+
if (seekback == (off_t) -1)
199+
return error("cannot find the current offset");
200+
201+
header_len = sprintf((char *)obuf, "%s %" PRIuMAX,
202+
typename(type), (uintmax_t)size) + 1;
203+
git_SHA1_Init(&ctx);
204+
git_SHA1_Update(&ctx, obuf, header_len);
205+
206+
/* Note: idx is non-NULL when we are writing */
207+
if ((flags & HASH_WRITE_OBJECT) != 0)
208+
idx = xcalloc(1, sizeof(*idx));
209+
210+
already_hashed_to = 0;
211+
212+
while (1) {
213+
prepare_to_stream(state, flags);
214+
if (idx) {
215+
sha1file_checkpoint(state->f, &checkpoint);
216+
idx->offset = state->offset;
217+
crc32_begin(state->f);
218+
}
219+
if (!stream_to_pack(state, &ctx, &already_hashed_to,
220+
fd, size, type, path, flags))
221+
break;
222+
/*
223+
* Writing this object to the current pack will make
224+
* it too big; we need to truncate it, start a new
225+
* pack, and write into it.
226+
*/
227+
if (!idx)
228+
die("BUG: should not happen");
229+
sha1file_truncate(state->f, &checkpoint);
230+
state->offset = checkpoint.offset;
231+
finish_bulk_checkin(state);
232+
if (lseek(fd, seekback, SEEK_SET) == (off_t) -1)
233+
return error("cannot seek back");
234+
}
235+
git_SHA1_Final(result_sha1, &ctx);
236+
if (!idx)
237+
return 0;
238+
239+
idx->crc32 = crc32_end(state->f);
240+
if (already_written(state, result_sha1)) {
241+
sha1file_truncate(state->f, &checkpoint);
242+
state->offset = checkpoint.offset;
243+
free(idx);
244+
} else {
245+
hashcpy(idx->sha1, result_sha1);
246+
ALLOC_GROW(state->written,
247+
state->nr_written + 1,
248+
state->alloc_written);
249+
state->written[state->nr_written++] = idx;
250+
}
251+
return 0;
252+
}
253+
254+
int index_bulk_checkin(unsigned char *sha1,
255+
int fd, size_t size, enum object_type type,
256+
const char *path, unsigned flags)
257+
{
258+
int status = deflate_to_pack(&state, sha1, fd, size, type,
259+
path, flags);
260+
if (!state.plugged)
261+
finish_bulk_checkin(&state);
262+
return status;
263+
}
264+
265+
void plug_bulk_checkin(void)
266+
{
267+
state.plugged = 1;
268+
}
269+
270+
void unplug_bulk_checkin(void)
271+
{
272+
state.plugged = 0;
273+
if (state.f)
274+
finish_bulk_checkin(&state);
275+
}

bulk-checkin.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
/*
2+
* Copyright (c) 2011, Google Inc.
3+
*/
4+
#ifndef BULK_CHECKIN_H
5+
#define BULK_CHECKIN_H
6+
7+
#include "cache.h"
8+
9+
/*
 * Hash "size" bytes read from "fd" as an object of the given "type"
 * and store the object name in "sha1".  With HASH_WRITE_OBJECT in
 * "flags" the object is also streamed into a packfile.  "path" is
 * only used in error messages.
 */
extern int index_bulk_checkin(unsigned char sha1[],
			      int fd, size_t size, enum object_type type,
			      const char *path, unsigned flags);

/*
 * Between plug_bulk_checkin() and unplug_bulk_checkin() the pack
 * written by index_bulk_checkin() is kept open across calls;
 * unplugging finishes it.
 */
extern void plug_bulk_checkin(void);
extern void unplug_bulk_checkin(void);
15+
16+
#endif

cache.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ int git_inflate(git_zstream *, int flush);
3535
void git_deflate_init(git_zstream *, int level);
3636
void git_deflate_init_gzip(git_zstream *, int level);
3737
void git_deflate_end(git_zstream *);
38+
int git_deflate_abort(git_zstream *);
3839
int git_deflate_end_gently(git_zstream *);
3940
int git_deflate(git_zstream *, int flush);
4041
unsigned long git_deflate_bound(git_zstream *, unsigned long);
@@ -598,6 +599,7 @@ extern size_t packed_git_window_size;
598599
extern size_t packed_git_limit;
599600
extern size_t delta_base_cache_limit;
600601
extern unsigned long big_file_threshold;
602+
extern unsigned long pack_size_limit_cfg;
601603
extern int read_replace_refs;
602604
extern int fsync_object_files;
603605
extern int core_preload_index;

config.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -797,6 +797,10 @@ int git_default_config(const char *var, const char *value, void *dummy)
797797
return 0;
798798
}
799799

800+
if (!strcmp(var, "pack.packsizelimit")) {
801+
pack_size_limit_cfg = git_config_ulong(var, value);
802+
return 0;
803+
}
800804
/* Add other config variables here and to Documentation/config.txt. */
801805
return 0;
802806
}

environment.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ char *notes_ref_name;
6060
int grafts_replace_parents = 1;
6161
int core_apply_sparse_checkout;
6262
struct startup_info *startup_info;
63+
unsigned long pack_size_limit_cfg;
6364

6465
/* Parallel index stat data preload? */
6566
int core_preload_index = 0;

0 commit comments

Comments
 (0)