Skip to content

Commit 5cfe425

Browse files
committed
Merge branch 'jc/bigfile'
* jc/bigfile:
  Bigfile: teach "git add" to send a large file straight to a pack
  index_fd(): split into two helper functions
  index_fd(): turn write_object and format_check arguments into one flag
2 parents 84da3e2 + 4dd1fbc commit 5cfe425

File tree

7 files changed

+164
-31
lines changed

7 files changed

+164
-31
lines changed

builtin/hash-object.c

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,11 @@ static void hash_fd(int fd, const char *type, int write_object, const char *path
1414
{
1515
struct stat st;
1616
unsigned char sha1[20];
17+
unsigned flags = (HASH_FORMAT_CHECK |
18+
(write_object ? HASH_WRITE_OBJECT : 0));
19+
1720
if (fstat(fd, &st) < 0 ||
18-
index_fd(sha1, fd, &st, write_object, type_from_string(type), path, 1))
21+
index_fd(sha1, fd, &st, type_from_string(type), path, flags))
1922
die(write_object
2023
? "Unable to add %s to database"
2124
: "Unable to hash %s", path);

builtin/update-index.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,8 @@ static int add_one_path(struct cache_entry *old, const char *path, int len, stru
9999
fill_stat_cache_info(ce, st);
100100
ce->ce_mode = ce_mode_from_stat(old, st->st_mode);
101101

102-
if (index_path(ce->sha1, path, st, !info_only))
102+
if (index_path(ce->sha1, path, st,
103+
info_only ? 0 : HASH_WRITE_OBJECT))
103104
return -1;
104105
option = allow_add ? ADD_CACHE_OK_TO_ADD : 0;
105106
option |= allow_replace ? ADD_CACHE_OK_TO_REPLACE : 0;

cache.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -518,8 +518,11 @@ struct pathspec {
518518
extern int init_pathspec(struct pathspec *, const char **);
519519
extern void free_pathspec(struct pathspec *);
520520
extern int ce_path_match(const struct cache_entry *ce, const struct pathspec *pathspec);
521-
extern int index_fd(unsigned char *sha1, int fd, struct stat *st, int write_object, enum object_type type, const char *path, int format_check);
522-
extern int index_path(unsigned char *sha1, const char *path, struct stat *st, int write_object);
521+
522+
#define HASH_WRITE_OBJECT 1
523+
#define HASH_FORMAT_CHECK 2
524+
extern int index_fd(unsigned char *sha1, int fd, struct stat *st, enum object_type type, const char *path, unsigned flags);
525+
extern int index_path(unsigned char *sha1, const char *path, struct stat *st, unsigned flags);
523526
extern void fill_stat_cache_info(struct cache_entry *ce, struct stat *st);
524527

525528
#define REFRESH_REALLY 0x0001 /* ignore_valid */

notes-merge.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -707,7 +707,7 @@ int notes_merge_commit(struct notes_merge_options *o,
707707
/* write file as blob, and add to partial_tree */
708708
if (stat(ent->name, &st))
709709
die_errno("Failed to stat '%s'", ent->name);
710-
if (index_path(blob_sha1, ent->name, &st, 1))
710+
if (index_path(blob_sha1, ent->name, &st, HASH_WRITE_OBJECT))
711711
die("Failed to write blob object from '%s'", ent->name);
712712
if (add_note(partial_tree, obj_sha1, blob_sha1, NULL))
713713
die("Failed to add resolved note '%s' to notes tree",

read-cache.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ static int ce_compare_data(struct cache_entry *ce, struct stat *st)
9292

9393
if (fd >= 0) {
9494
unsigned char sha1[20];
95-
if (!index_fd(sha1, fd, st, 0, OBJ_BLOB, ce->name, 0))
95+
if (!index_fd(sha1, fd, st, OBJ_BLOB, ce->name, 0))
9696
match = hashcmp(sha1, ce->sha1);
9797
/* index_fd() closed the file descriptor already */
9898
}
@@ -641,7 +641,7 @@ int add_to_index(struct index_state *istate, const char *path, struct stat *st,
641641
return 0;
642642
}
643643
if (!intent_only) {
644-
if (index_path(ce->sha1, path, st, 1))
644+
if (index_path(ce->sha1, path, st, HASH_WRITE_OBJECT))
645645
return error("unable to index file %s", path);
646646
} else
647647
record_intent_to_add(ce);

sha1_file.c

Lines changed: 123 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#include "pack.h"
1212
#include "blob.h"
1313
#include "commit.h"
14+
#include "run-command.h"
1415
#include "tag.h"
1516
#include "tree.h"
1617
#include "tree-walk.h"
@@ -2578,10 +2579,11 @@ static void check_tag(const void *buf, size_t size)
25782579
}
25792580

25802581
static int index_mem(unsigned char *sha1, void *buf, size_t size,
2581-
int write_object, enum object_type type,
2582-
const char *path, int format_check)
2582+
enum object_type type,
2583+
const char *path, unsigned flags)
25832584
{
25842585
int ret, re_allocated = 0;
2586+
int write_object = flags & HASH_WRITE_OBJECT;
25852587

25862588
if (!type)
25872589
type = OBJ_BLOB;
@@ -2597,7 +2599,7 @@ static int index_mem(unsigned char *sha1, void *buf, size_t size,
25972599
re_allocated = 1;
25982600
}
25992601
}
2600-
if (format_check) {
2602+
if (flags & HASH_FORMAT_CHECK) {
26012603
if (type == OBJ_TREE)
26022604
check_tree(buf, size);
26032605
if (type == OBJ_COMMIT)
@@ -2615,44 +2617,141 @@ static int index_mem(unsigned char *sha1, void *buf, size_t size,
26152617
return ret;
26162618
}
26172619

2620+
static int index_pipe(unsigned char *sha1, int fd, enum object_type type,
2621+
const char *path, unsigned flags)
2622+
{
2623+
struct strbuf sbuf = STRBUF_INIT;
2624+
int ret;
2625+
2626+
if (strbuf_read(&sbuf, fd, 4096) >= 0)
2627+
ret = index_mem(sha1, sbuf.buf, sbuf.len, type, path, flags);
2628+
else
2629+
ret = -1;
2630+
strbuf_release(&sbuf);
2631+
return ret;
2632+
}
2633+
26182634
#define SMALL_FILE_SIZE (32*1024)
26192635

2620-
int index_fd(unsigned char *sha1, int fd, struct stat *st, int write_object,
2621-
enum object_type type, const char *path, int format_check)
2636+
static int index_core(unsigned char *sha1, int fd, size_t size,
2637+
enum object_type type, const char *path,
2638+
unsigned flags)
26222639
{
26232640
int ret;
2624-
size_t size = xsize_t(st->st_size);
26252641

2626-
if (!S_ISREG(st->st_mode)) {
2627-
struct strbuf sbuf = STRBUF_INIT;
2628-
if (strbuf_read(&sbuf, fd, 4096) >= 0)
2629-
ret = index_mem(sha1, sbuf.buf, sbuf.len, write_object,
2630-
type, path, format_check);
2631-
else
2632-
ret = -1;
2633-
strbuf_release(&sbuf);
2634-
} else if (!size) {
2635-
ret = index_mem(sha1, NULL, size, write_object, type, path,
2636-
format_check);
2642+
if (!size) {
2643+
ret = index_mem(sha1, NULL, size, type, path, flags);
26372644
} else if (size <= SMALL_FILE_SIZE) {
26382645
char *buf = xmalloc(size);
26392646
if (size == read_in_full(fd, buf, size))
2640-
ret = index_mem(sha1, buf, size, write_object, type,
2641-
path, format_check);
2647+
ret = index_mem(sha1, buf, size, type, path, flags);
26422648
else
26432649
ret = error("short read %s", strerror(errno));
26442650
free(buf);
26452651
} else {
26462652
void *buf = xmmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);
2647-
ret = index_mem(sha1, buf, size, write_object, type, path,
2648-
format_check);
2653+
ret = index_mem(sha1, buf, size, type, path, flags);
26492654
munmap(buf, size);
26502655
}
2656+
return ret;
2657+
}
2658+
2659+
/*
2660+
* This creates one packfile per large blob, because the caller
2661+
* immediately wants the result sha1, and fast-import can report the
2662+
* object name via marks mechanism only by closing the created
2663+
* packfile.
2664+
*
2665+
* This also bypasses the usual "convert-to-git" dance, and that is on
2666+
* purpose. We could write a streaming version of the converting
2667+
* functions and insert that before feeding the data to fast-import
2668+
* (or equivalent in-core API described above), but the primary
2669+
* motivation for trying to stream from the working tree file and to
2670+
* avoid mmaping it in core is to deal with large binary blobs, and
2671+
* by definition they do _not_ want to get any conversion.
2672+
*/
2673+
static int index_stream(unsigned char *sha1, int fd, size_t size,
2674+
enum object_type type, const char *path,
2675+
unsigned flags)
2676+
{
2677+
struct child_process fast_import;
2678+
char export_marks[512];
2679+
const char *argv[] = { "fast-import", "--quiet", export_marks, NULL };
2680+
char tmpfile[512];
2681+
char fast_import_cmd[512];
2682+
char buf[512];
2683+
int len, tmpfd;
2684+
2685+
strcpy(tmpfile, git_path("hashstream_XXXXXX"));
2686+
tmpfd = git_mkstemp_mode(tmpfile, 0600);
2687+
if (tmpfd < 0)
2688+
die_errno("cannot create tempfile: %s", tmpfile);
2689+
if (close(tmpfd))
2690+
die_errno("cannot close tempfile: %s", tmpfile);
2691+
sprintf(export_marks, "--export-marks=%s", tmpfile);
2692+
2693+
memset(&fast_import, 0, sizeof(fast_import));
2694+
fast_import.in = -1;
2695+
fast_import.argv = argv;
2696+
fast_import.git_cmd = 1;
2697+
if (start_command(&fast_import))
2698+
die_errno("index-stream: git fast-import failed");
2699+
2700+
len = sprintf(fast_import_cmd, "blob\nmark :1\ndata %lu\n",
2701+
(unsigned long) size);
2702+
write_or_whine(fast_import.in, fast_import_cmd, len,
2703+
"index-stream: feeding fast-import");
2704+
while (size) {
2705+
char buf[10240];
2706+
size_t sz = size < sizeof(buf) ? size : sizeof(buf);
2707+
size_t actual;
2708+
2709+
actual = read_in_full(fd, buf, sz);
2710+
if (actual < 0)
2711+
die_errno("index-stream: reading input");
2712+
if (write_in_full(fast_import.in, buf, actual) != actual)
2713+
die_errno("index-stream: feeding fast-import");
2714+
size -= actual;
2715+
}
2716+
if (close(fast_import.in))
2717+
die_errno("index-stream: closing fast-import");
2718+
if (finish_command(&fast_import))
2719+
die_errno("index-stream: finishing fast-import");
2720+
2721+
tmpfd = open(tmpfile, O_RDONLY);
2722+
if (tmpfd < 0)
2723+
die_errno("index-stream: cannot open fast-import mark");
2724+
len = read(tmpfd, buf, sizeof(buf));
2725+
if (len < 0)
2726+
die_errno("index-stream: reading fast-import mark");
2727+
if (close(tmpfd) < 0)
2728+
die_errno("index-stream: closing fast-import mark");
2729+
if (unlink(tmpfile))
2730+
die_errno("index-stream: unlinking fast-import mark");
2731+
if (len != 44 ||
2732+
memcmp(":1 ", buf, 3) ||
2733+
get_sha1_hex(buf + 3, sha1))
2734+
die_errno("index-stream: unexpected fast-import mark: <%s>", buf);
2735+
return 0;
2736+
}
2737+
2738+
int index_fd(unsigned char *sha1, int fd, struct stat *st,
2739+
enum object_type type, const char *path, unsigned flags)
2740+
{
2741+
int ret;
2742+
size_t size = xsize_t(st->st_size);
2743+
2744+
if (!S_ISREG(st->st_mode))
2745+
ret = index_pipe(sha1, fd, type, path, flags);
2746+
else if (size <= big_file_threshold || type != OBJ_BLOB)
2747+
ret = index_core(sha1, fd, size, type, path, flags);
2748+
else
2749+
ret = index_stream(sha1, fd, size, type, path, flags);
26512750
close(fd);
26522751
return ret;
26532752
}
26542753

2655-
int index_path(unsigned char *sha1, const char *path, struct stat *st, int write_object)
2754+
int index_path(unsigned char *sha1, const char *path, struct stat *st, unsigned flags)
26562755
{
26572756
int fd;
26582757
struct strbuf sb = STRBUF_INIT;
@@ -2663,7 +2762,7 @@ int index_path(unsigned char *sha1, const char *path, struct stat *st, int write
26632762
if (fd < 0)
26642763
return error("open(\"%s\"): %s", path,
26652764
strerror(errno));
2666-
if (index_fd(sha1, fd, st, write_object, OBJ_BLOB, path, 0) < 0)
2765+
if (index_fd(sha1, fd, st, OBJ_BLOB, path, flags) < 0)
26672766
return error("%s: failed to insert into database",
26682767
path);
26692768
break;
@@ -2673,7 +2772,7 @@ int index_path(unsigned char *sha1, const char *path, struct stat *st, int write
26732772
return error("readlink(\"%s\"): %s", path,
26742773
errstr);
26752774
}
2676-
if (!write_object)
2775+
if (!(flags & HASH_WRITE_OBJECT))
26772776
hash_sha1_file(sb.buf, sb.len, blob_type, sha1);
26782777
else if (write_sha1_file(sb.buf, sb.len, blob_type, sha1))
26792778
return error("%s: failed to insert into database",

t/t1050-large.sh

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
#!/bin/sh
2+
# Copyright (c) 2011, Google Inc.
3+
4+
test_description='adding and checking out large blobs'
5+
6+
. ./test-lib.sh
7+
8+
test_expect_success setup '
9+
git config core.bigfilethreshold 200k &&
10+
echo X | dd of=large bs=1k seek=2000
11+
'
12+
13+
test_expect_success 'add a large file' '
14+
git add large &&
15+
# make sure we got a packfile and no loose objects
16+
test -f .git/objects/pack/pack-*.pack &&
17+
test ! -f .git/objects/??/??????????????????????????????????????
18+
'
19+
20+
test_expect_success 'checkout a large file' '
21+
large=$(git rev-parse :large) &&
22+
git update-index --add --cacheinfo 100644 $large another &&
23+
git checkout another &&
24+
cmp large another ;# this must not be test_cmp
25+
'
26+
27+
test_done

0 commit comments

Comments
 (0)