Skip to content

Commit f0d8900

Browse files
committed
Merge branch 'sp/stream-clean-filter'
When running a required clean filter, we do not have to mmap the original before feeding the filter. Instead, stream the file contents directly to the filter and process its output. * sp/stream-clean-filter: sha1_file: don't convert off_t to size_t too early to avoid potential die() convert: stream from fd to required clean filter to reduce used address space copy_fd(): do not close the input file descriptor mmap_limit: introduce GIT_MMAP_LIMIT to allow testing expected mmap size memory_limit: use git_env_ulong() to parse GIT_ALLOC_LIMIT config.c: add git_env_ulong() to parse environment variable convert: drop arguments other than 'path' from would_convert_to_git()
2 parents 9342f49 + 9079ab7 commit f0d8900

File tree

10 files changed

+164
-52
lines changed

10 files changed

+164
-52
lines changed

cache.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1324,6 +1324,7 @@ extern int git_config_rename_section_in_file(const char *, const char *, const c
13241324
extern const char *git_etc_gitconfig(void);
13251325
extern int check_repository_format_version(const char *var, const char *value, void *cb);
13261326
extern int git_env_bool(const char *, int);
1327+
extern unsigned long git_env_ulong(const char *, unsigned long);
13271328
extern int git_config_system(void);
13281329
extern int config_error_nonbool(const char *);
13291330
#if defined(__GNUC__)

config.c

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1139,12 +1139,28 @@ const char *git_etc_gitconfig(void)
11391139
return system_wide;
11401140
}
11411141

1142+
/*
1143+
* Parse environment variable 'k' as a boolean (in various
1144+
* possible spellings); if missing, use the default value 'def'.
1145+
*/
11421146
int git_env_bool(const char *k, int def)
11431147
{
11441148
const char *v = getenv(k);
11451149
return v ? git_config_bool(k, v) : def;
11461150
}
11471151

1152+
/*
1153+
* Parse environment variable 'k' as ulong with possibly a unit
1154+
* suffix; if missing, use the default value 'val'.
1155+
*/
1156+
unsigned long git_env_ulong(const char *k, unsigned long val)
1157+
{
1158+
const char *v = getenv(k);
1159+
if (v && !git_parse_ulong(v, &val))
1160+
die("failed to parse %s", k);
1161+
return val;
1162+
}
1163+
11481164
int git_config_system(void)
11491165
{
11501166
return !git_env_bool("GIT_CONFIG_NOSYSTEM", 0);

convert.c

Lines changed: 49 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -312,11 +312,12 @@ static int crlf_to_worktree(const char *path, const char *src, size_t len,
312312
struct filter_params {
313313
const char *src;
314314
unsigned long size;
315+
int fd;
315316
const char *cmd;
316317
const char *path;
317318
};
318319

319-
static int filter_buffer(int in, int out, void *data)
320+
static int filter_buffer_or_fd(int in, int out, void *data)
320321
{
321322
/*
322323
* Spawn cmd and feed the buffer contents through its stdin.
@@ -354,7 +355,12 @@ static int filter_buffer(int in, int out, void *data)
354355

355356
sigchain_push(SIGPIPE, SIG_IGN);
356357

357-
write_err = (write_in_full(child_process.in, params->src, params->size) < 0);
358+
if (params->src) {
359+
write_err = (write_in_full(child_process.in, params->src, params->size) < 0);
360+
} else {
361+
write_err = copy_fd(params->fd, child_process.in);
362+
}
363+
358364
if (close(child_process.in))
359365
write_err = 1;
360366
if (write_err)
@@ -370,7 +376,7 @@ static int filter_buffer(int in, int out, void *data)
370376
return (write_err || status);
371377
}
372378

373-
static int apply_filter(const char *path, const char *src, size_t len,
379+
static int apply_filter(const char *path, const char *src, size_t len, int fd,
374380
struct strbuf *dst, const char *cmd)
375381
{
376382
/*
@@ -391,11 +397,12 @@ static int apply_filter(const char *path, const char *src, size_t len,
391397
return 1;
392398

393399
memset(&async, 0, sizeof(async));
394-
async.proc = filter_buffer;
400+
async.proc = filter_buffer_or_fd;
395401
async.data = &params;
396402
async.out = -1;
397403
params.src = src;
398404
params.size = len;
405+
params.fd = fd;
399406
params.cmd = cmd;
400407
params.path = path;
401408

@@ -746,6 +753,25 @@ static void convert_attrs(struct conv_attrs *ca, const char *path)
746753
}
747754
}
748755

756+
int would_convert_to_git_filter_fd(const char *path)
757+
{
758+
struct conv_attrs ca;
759+
760+
convert_attrs(&ca, path);
761+
if (!ca.drv)
762+
return 0;
763+
764+
/*
765+
* Apply a filter to an fd only if the filter is required to succeed.
766+
* We must die if the filter fails, because the original data before
767+
* filtering is not available.
768+
*/
769+
if (!ca.drv->required)
770+
return 0;
771+
772+
return apply_filter(path, NULL, 0, -1, NULL, ca.drv->clean);
773+
}
774+
749775
int convert_to_git(const char *path, const char *src, size_t len,
750776
struct strbuf *dst, enum safe_crlf checksafe)
751777
{
@@ -760,7 +786,7 @@ int convert_to_git(const char *path, const char *src, size_t len,
760786
required = ca.drv->required;
761787
}
762788

763-
ret |= apply_filter(path, src, len, dst, filter);
789+
ret |= apply_filter(path, src, len, -1, dst, filter);
764790
if (!ret && required)
765791
die("%s: clean filter '%s' failed", path, ca.drv->name);
766792

@@ -777,6 +803,23 @@ int convert_to_git(const char *path, const char *src, size_t len,
777803
return ret | ident_to_git(path, src, len, dst, ca.ident);
778804
}
779805

806+
void convert_to_git_filter_fd(const char *path, int fd, struct strbuf *dst,
807+
enum safe_crlf checksafe)
808+
{
809+
struct conv_attrs ca;
810+
convert_attrs(&ca, path);
811+
812+
assert(ca.drv);
813+
assert(ca.drv->clean);
814+
815+
if (!apply_filter(path, NULL, 0, fd, dst, ca.drv->clean))
816+
die("%s: clean filter '%s' failed", path, ca.drv->name);
817+
818+
ca.crlf_action = input_crlf_action(ca.crlf_action, ca.eol_attr);
819+
crlf_to_git(path, dst->buf, dst->len, dst, ca.crlf_action, checksafe);
820+
ident_to_git(path, dst->buf, dst->len, dst, ca.ident);
821+
}
822+
780823
static int convert_to_working_tree_internal(const char *path, const char *src,
781824
size_t len, struct strbuf *dst,
782825
int normalizing)
@@ -810,7 +853,7 @@ static int convert_to_working_tree_internal(const char *path, const char *src,
810853
}
811854
}
812855

813-
ret_filter = apply_filter(path, src, len, dst, filter);
856+
ret_filter = apply_filter(path, src, len, -1, dst, filter);
814857
if (!ret_filter && required)
815858
die("%s: smudge filter %s failed", path, ca.drv->name);
816859

convert.h

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,11 +40,15 @@ extern int convert_to_working_tree(const char *path, const char *src,
4040
size_t len, struct strbuf *dst);
4141
extern int renormalize_buffer(const char *path, const char *src, size_t len,
4242
struct strbuf *dst);
43-
static inline int would_convert_to_git(const char *path, const char *src,
44-
size_t len, enum safe_crlf checksafe)
43+
static inline int would_convert_to_git(const char *path)
4544
{
46-
return convert_to_git(path, src, len, NULL, checksafe);
45+
return convert_to_git(path, NULL, 0, NULL, 0);
4746
}
47+
/* Precondition: would_convert_to_git_filter_fd(path) == true */
48+
extern void convert_to_git_filter_fd(const char *path, int fd,
49+
struct strbuf *dst,
50+
enum safe_crlf checksafe);
51+
extern int would_convert_to_git_filter_fd(const char *path);
4852

4953
/*****************************************************************
5054
*

copy.c

Lines changed: 5 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -4,34 +4,17 @@ int copy_fd(int ifd, int ofd)
44
{
55
while (1) {
66
char buffer[8192];
7-
char *buf = buffer;
87
ssize_t len = xread(ifd, buffer, sizeof(buffer));
98
if (!len)
109
break;
1110
if (len < 0) {
12-
int read_error = errno;
13-
close(ifd);
1411
return error("copy-fd: read returned %s",
15-
strerror(read_error));
16-
}
17-
while (len) {
18-
int written = xwrite(ofd, buf, len);
19-
if (written > 0) {
20-
buf += written;
21-
len -= written;
22-
}
23-
else if (!written) {
24-
close(ifd);
25-
return error("copy-fd: write returned 0");
26-
} else {
27-
int write_error = errno;
28-
close(ifd);
29-
return error("copy-fd: write returned %s",
30-
strerror(write_error));
31-
}
12+
strerror(errno));
3213
}
14+
if (write_in_full(ofd, buffer, len) < 0)
15+
return error("copy-fd: write returned %s",
16+
strerror(errno));
3317
}
34-
close(ifd);
3518
return 0;
3619
}
3720

@@ -60,6 +43,7 @@ int copy_file(const char *dst, const char *src, int mode)
6043
return fdo;
6144
}
6245
status = copy_fd(fdi, fdo);
46+
close(fdi);
6347
if (close(fdo) != 0)
6448
return error("%s: close error: %s", dst, strerror(errno));
6549

lockfile.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -224,8 +224,11 @@ int hold_lock_file_for_append(struct lock_file *lk, const char *path, int flags)
224224
} else if (copy_fd(orig_fd, fd)) {
225225
if (flags & LOCK_DIE_ON_ERROR)
226226
exit(128);
227+
close(orig_fd);
227228
close(fd);
228229
return -1;
230+
} else {
231+
close(orig_fd);
229232
}
230233
return fd;
231234
}

sha1_file.c

Lines changed: 53 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -663,10 +663,26 @@ void release_pack_memory(size_t need)
663663
; /* nothing */
664664
}
665665

666+
static void mmap_limit_check(size_t length)
667+
{
668+
static size_t limit = 0;
669+
if (!limit) {
670+
limit = git_env_ulong("GIT_MMAP_LIMIT", 0);
671+
if (!limit)
672+
limit = SIZE_MAX;
673+
}
674+
if (length > limit)
675+
die("attempting to mmap %"PRIuMAX" over limit %"PRIuMAX,
676+
(uintmax_t)length, (uintmax_t)limit);
677+
}
678+
666679
void *xmmap(void *start, size_t length,
667680
int prot, int flags, int fd, off_t offset)
668681
{
669-
void *ret = mmap(start, length, prot, flags, fd, offset);
682+
void *ret;
683+
684+
mmap_limit_check(length);
685+
ret = mmap(start, length, prot, flags, fd, offset);
670686
if (ret == MAP_FAILED) {
671687
if (!length)
672688
return NULL;
@@ -3076,6 +3092,29 @@ static int index_mem(unsigned char *sha1, void *buf, size_t size,
30763092
return ret;
30773093
}
30783094

3095+
static int index_stream_convert_blob(unsigned char *sha1, int fd,
3096+
const char *path, unsigned flags)
3097+
{
3098+
int ret;
3099+
const int write_object = flags & HASH_WRITE_OBJECT;
3100+
struct strbuf sbuf = STRBUF_INIT;
3101+
3102+
assert(path);
3103+
assert(would_convert_to_git_filter_fd(path));
3104+
3105+
convert_to_git_filter_fd(path, fd, &sbuf,
3106+
write_object ? safe_crlf : SAFE_CRLF_FALSE);
3107+
3108+
if (write_object)
3109+
ret = write_sha1_file(sbuf.buf, sbuf.len, typename(OBJ_BLOB),
3110+
sha1);
3111+
else
3112+
ret = hash_sha1_file(sbuf.buf, sbuf.len, typename(OBJ_BLOB),
3113+
sha1);
3114+
strbuf_release(&sbuf);
3115+
return ret;
3116+
}
3117+
30793118
static int index_pipe(unsigned char *sha1, int fd, enum object_type type,
30803119
const char *path, unsigned flags)
30813120
{
@@ -3141,15 +3180,22 @@ int index_fd(unsigned char *sha1, int fd, struct stat *st,
31413180
enum object_type type, const char *path, unsigned flags)
31423181
{
31433182
int ret;
3144-
size_t size = xsize_t(st->st_size);
31453183

3146-
if (!S_ISREG(st->st_mode))
3184+
/*
3185+
* Call xsize_t() only when needed to avoid potentially unnecessary
3186+
* die() for large files.
3187+
*/
3188+
if (type == OBJ_BLOB && path && would_convert_to_git_filter_fd(path))
3189+
ret = index_stream_convert_blob(sha1, fd, path, flags);
3190+
else if (!S_ISREG(st->st_mode))
31473191
ret = index_pipe(sha1, fd, type, path, flags);
3148-
else if (size <= big_file_threshold || type != OBJ_BLOB ||
3149-
(path && would_convert_to_git(path, NULL, 0, 0)))
3150-
ret = index_core(sha1, fd, size, type, path, flags);
3192+
else if (st->st_size <= big_file_threshold || type != OBJ_BLOB ||
3193+
(path && would_convert_to_git(path)))
3194+
ret = index_core(sha1, fd, xsize_t(st->st_size), type, path,
3195+
flags);
31513196
else
3152-
ret = index_stream(sha1, fd, size, type, path, flags);
3197+
ret = index_stream(sha1, fd, xsize_t(st->st_size), type, path,
3198+
flags);
31533199
close(fd);
31543200
return ret;
31553201
}

t/t0021-conversion.sh

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -153,17 +153,23 @@ test_expect_success 'filter shell-escaped filenames' '
153153
:
154154
'
155155

156-
test_expect_success 'required filter success' '
157-
git config filter.required.smudge cat &&
158-
git config filter.required.clean cat &&
156+
test_expect_success 'required filter should filter data' '
157+
git config filter.required.smudge ./rot13.sh &&
158+
git config filter.required.clean ./rot13.sh &&
159159
git config filter.required.required true &&
160160
161161
echo "*.r filter=required" >.gitattributes &&
162162
163-
echo test >test.r &&
163+
cat test.o >test.r &&
164164
git add test.r &&
165+
165166
rm -f test.r &&
166-
git checkout -- test.r
167+
git checkout -- test.r &&
168+
cmp test.o test.r &&
169+
170+
./rot13.sh <test.o >expected &&
171+
git cat-file blob :test.r >actual &&
172+
cmp expected actual
167173
'
168174

169175
test_expect_success 'required filter smudge failure' '
@@ -190,6 +196,14 @@ test_expect_success 'required filter clean failure' '
190196
test_must_fail git add test.fc
191197
'
192198

199+
test_expect_success 'filtering large input to small output should use little memory' '
200+
git config filter.devnull.clean "cat >/dev/null" &&
201+
git config filter.devnull.required true &&
202+
for i in $(test_seq 1 30); do printf "%1048576d" 1; done >30MB &&
203+
echo "30MB filter=devnull" >.gitattributes &&
204+
GIT_MMAP_LIMIT=1m GIT_ALLOC_LIMIT=1m git add 30MB
205+
'
206+
193207
test_expect_success EXPENSIVE 'filter large file' '
194208
git config filter.largefile.smudge cat &&
195209
git config filter.largefile.clean cat &&

t/t1050-large.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ test_expect_success setup '
1313
echo X | dd of=large2 bs=1k seek=2000 &&
1414
echo X | dd of=large3 bs=1k seek=2000 &&
1515
echo Y | dd of=huge bs=1k seek=2500 &&
16-
GIT_ALLOC_LIMIT=1500 &&
16+
GIT_ALLOC_LIMIT=1500k &&
1717
export GIT_ALLOC_LIMIT
1818
'
1919

0 commit comments

Comments
 (0)