Skip to content

Commit 4dd1fbc

Browse files
committed
Bigfile: teach "git add" to send a large file straight to a pack
When adding a new content to the repository, we have always slurped the blob in its entirety in-core first, and computed the object name and compressed it into a loose object file. Handling large binary files (e.g. video and audio asset for games) has been problematic because of this design.

At the middle level of "git add" callchain is an internal API index_fd() that takes an open file descriptor to read from the working tree file being added with its size. Teach it to call out to fast-import when adding a large blob.

The write-out codepath in entry.c::write_entry() should be taught to stream, instead of reading everything in core. This should not be so hard to implement, especially if we limit ourselves only to loose object files and non-delta representation in packfiles.

Signed-off-by: Junio C Hamano <[email protected]>
1 parent 7b41e1e commit 4dd1fbc

File tree

2 files changed

+110
-1
lines changed

2 files changed

+110
-1
lines changed

sha1_file.c

Lines changed: 83 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#include "pack.h"
1212
#include "blob.h"
1313
#include "commit.h"
14+
#include "run-command.h"
1415
#include "tag.h"
1516
#include "tree.h"
1617
#include "tree-walk.h"
@@ -2658,6 +2659,85 @@ static int index_core(unsigned char *sha1, int fd, size_t size,
26582659
return ret;
26592660
}
26602661

2662+
/*
2663+
* This creates one packfile per large blob, because the caller
2664+
* immediately wants the result sha1, and fast-import can report the
2665+
* object name via marks mechanism only by closing the created
2666+
* packfile.
2667+
*
2668+
* This also bypasses the usual "convert-to-git" dance, and that is on
2669+
* purpose. We could write a streaming version of the converting
2670+
* functions and insert that before feeding the data to fast-import
2671+
* (or equivalent in-core API described above), but the primary
2672+
* motivation for trying to stream from the working tree file and to
2673+
* avoid mmaping it in core is to deal with large binary blobs, and
2674+
* by definition they do _not_ want to get any conversion.
2675+
*/
2676+
static int index_stream(unsigned char *sha1, int fd, size_t size,
2677+
enum object_type type, const char *path,
2678+
unsigned flags)
2679+
{
2680+
struct child_process fast_import;
2681+
char export_marks[512];
2682+
const char *argv[] = { "fast-import", "--quiet", export_marks, NULL };
2683+
char tmpfile[512];
2684+
char fast_import_cmd[512];
2685+
char buf[512];
2686+
int len, tmpfd;
2687+
2688+
strcpy(tmpfile, git_path("hashstream_XXXXXX"));
2689+
tmpfd = git_mkstemp_mode(tmpfile, 0600);
2690+
if (tmpfd < 0)
2691+
die_errno("cannot create tempfile: %s", tmpfile);
2692+
if (close(tmpfd))
2693+
die_errno("cannot close tempfile: %s", tmpfile);
2694+
sprintf(export_marks, "--export-marks=%s", tmpfile);
2695+
2696+
memset(&fast_import, 0, sizeof(fast_import));
2697+
fast_import.in = -1;
2698+
fast_import.argv = argv;
2699+
fast_import.git_cmd = 1;
2700+
if (start_command(&fast_import))
2701+
die_errno("index-stream: git fast-import failed");
2702+
2703+
len = sprintf(fast_import_cmd, "blob\nmark :1\ndata %lu\n",
2704+
(unsigned long) size);
2705+
write_or_whine(fast_import.in, fast_import_cmd, len,
2706+
"index-stream: feeding fast-import");
2707+
while (size) {
2708+
char buf[10240];
2709+
size_t sz = size < sizeof(buf) ? size : sizeof(buf);
2710+
size_t actual;
2711+
2712+
actual = read_in_full(fd, buf, sz);
2713+
if (actual < 0)
2714+
die_errno("index-stream: reading input");
2715+
if (write_in_full(fast_import.in, buf, actual) != actual)
2716+
die_errno("index-stream: feeding fast-import");
2717+
size -= actual;
2718+
}
2719+
if (close(fast_import.in))
2720+
die_errno("index-stream: closing fast-import");
2721+
if (finish_command(&fast_import))
2722+
die_errno("index-stream: finishing fast-import");
2723+
2724+
tmpfd = open(tmpfile, O_RDONLY);
2725+
if (tmpfd < 0)
2726+
die_errno("index-stream: cannot open fast-import mark");
2727+
len = read(tmpfd, buf, sizeof(buf));
2728+
if (len < 0)
2729+
die_errno("index-stream: reading fast-import mark");
2730+
if (close(tmpfd) < 0)
2731+
die_errno("index-stream: closing fast-import mark");
2732+
if (unlink(tmpfile))
2733+
die_errno("index-stream: unlinking fast-import mark");
2734+
if (len != 44 ||
2735+
memcmp(":1 ", buf, 3) ||
2736+
get_sha1_hex(buf + 3, sha1))
2737+
die_errno("index-stream: unexpected fast-import mark: <%s>", buf);
2738+
return 0;
2739+
}
2740+
26612741
int index_fd(unsigned char *sha1, int fd, struct stat *st,
26622742
enum object_type type, const char *path, unsigned flags)
26632743
{
@@ -2666,8 +2746,10 @@ int index_fd(unsigned char *sha1, int fd, struct stat *st,
26662746

26672747
if (!S_ISREG(st->st_mode))
26682748
ret = index_pipe(sha1, fd, type, path, flags);
2669-
else
2749+
else if (size <= big_file_threshold || type != OBJ_BLOB)
26702750
ret = index_core(sha1, fd, size, type, path, flags);
2751+
else
2752+
ret = index_stream(sha1, fd, size, type, path, flags);
26712753
close(fd);
26722754
return ret;
26732755
}

t/t1050-large.sh

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
#!/bin/sh
2+
# Copyright (c) 2011, Google Inc.
3+
4+
test_description='adding and checking out large blobs'
5+
6+
. ./test-lib.sh
7+
8+
test_expect_success setup '
9+
git config core.bigfilethreshold 200k &&
10+
echo X | dd of=large bs=1k seek=2000
11+
'
12+
13+
test_expect_success 'add a large file' '
14+
git add large &&
15+
# make sure we got a packfile and no loose objects
16+
test -f .git/objects/pack/pack-*.pack &&
17+
test ! -f .git/objects/??/??????????????????????????????????????
18+
'
19+
20+
test_expect_success 'checkout a large file' '
21+
large=$(git rev-parse :large) &&
22+
git update-index --add --cacheinfo 100644 $large another &&
23+
git checkout another &&
24+
cmp large another ;# this must not be test_cmp
25+
'
26+
27+
test_done

0 commit comments

Comments
 (0)