Skip to content

Commit a1bf5ca

Browse files
chiyutianyi authored and gitster committed
unpack-objects: low memory footprint for get_data() in dry_run mode
As the name implies, "get_data(size)" will allocate and return a given amount of memory. Allocating memory for a large blob object may cause the system to run out of memory. To prepare for replacing calls of "get_data()" to unpack large blob objects in later commits, refactor "get_data()" to reduce its memory footprint in dry_run mode. Because in dry_run mode "get_data()" is only used to check the integrity of the data, and the returned buffer is not used at all, we can allocate a smaller buffer and use it as the zstream output. Make the function return NULL in dry-run mode, as no callers use the returned buffer. The "find [...]objects/?? -type f | wc -l" test idiom being used here is adapted from the same "find" use added to another test in d9545c7 (fast-import: implement unpack limit, 2016-04-25). Suggested-by: Jiang Xin <[email protected]> Signed-off-by: Han Xin <[email protected]> Signed-off-by: Ævar Arnfjörð Bjarmason <[email protected]> Signed-off-by: Junio C Hamano <[email protected]>
1 parent ab336e8 commit a1bf5ca

File tree

2 files changed

+67
-11
lines changed

2 files changed

+67
-11
lines changed

builtin/unpack-objects.c

Lines changed: 26 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -97,15 +97,27 @@ static void use(int bytes)
9797
display_throughput(progress, consumed_bytes);
9898
}
9999

100+
/*
101+
* Decompress zstream from the standard input into a newly
102+
* allocated buffer of specified size and return the buffer.
103+
* The caller is responsible to free the returned buffer.
104+
*
105+
* But for dry_run mode, "get_data()" is only used to check the
106+
* integrity of data, and the returned buffer is not used at all.
107+
* Therefore, in dry_run mode, "get_data()" will release the small
108+
* allocated buffer which is reused to hold temporary zstream output
109+
* and return NULL instead of returning garbage data.
110+
*/
100111
static void *get_data(unsigned long size)
101112
{
102113
git_zstream stream;
103-
void *buf = xmallocz(size);
114+
unsigned long bufsize = dry_run && size > 8192 ? 8192 : size;
115+
void *buf = xmallocz(bufsize);
104116

105117
memset(&stream, 0, sizeof(stream));
106118

107119
stream.next_out = buf;
108-
stream.avail_out = size;
120+
stream.avail_out = bufsize;
109121
stream.next_in = fill(1);
110122
stream.avail_in = len;
111123
git_inflate_init(&stream);
@@ -125,8 +137,17 @@ static void *get_data(unsigned long size)
125137
}
126138
stream.next_in = fill(1);
127139
stream.avail_in = len;
140+
if (dry_run) {
141+
/* reuse the buffer in dry_run mode */
142+
stream.next_out = buf;
143+
stream.avail_out = bufsize > size - stream.total_out ?
144+
size - stream.total_out :
145+
bufsize;
146+
}
128147
}
129148
git_inflate_end(&stream);
149+
if (dry_run)
150+
FREE_AND_NULL(buf);
130151
return buf;
131152
}
132153

@@ -326,10 +347,8 @@ static void unpack_non_delta_entry(enum object_type type, unsigned long size,
326347
{
327348
void *buf = get_data(size);
328349

329-
if (!dry_run && buf)
350+
if (buf)
330351
write_object(nr, type, buf, size);
331-
else
332-
free(buf);
333352
}
334353

335354
static int resolve_against_held(unsigned nr, const struct object_id *base,
@@ -359,10 +378,8 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
359378
oidread(&base_oid, fill(the_hash_algo->rawsz));
360379
use(the_hash_algo->rawsz);
361380
delta_data = get_data(delta_size);
362-
if (dry_run || !delta_data) {
363-
free(delta_data);
381+
if (!delta_data)
364382
return;
365-
}
366383
if (has_object_file(&base_oid))
367384
; /* Ok we have this one */
368385
else if (resolve_against_held(nr, &base_oid,
@@ -398,10 +415,8 @@ static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
398415
die("offset value out of bound for delta base object");
399416

400417
delta_data = get_data(delta_size);
401-
if (dry_run || !delta_data) {
402-
free(delta_data);
418+
if (!delta_data)
403419
return;
404-
}
405420
lo = 0;
406421
hi = nr;
407422
while (lo < hi) {

t/t5351-unpack-large-objects.sh

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
#!/bin/sh
#
# Copyright (c) 2022 Han Xin
#

test_description='git unpack-objects with large objects'

. ./test-lib.sh

# Create a fresh bare destination repo, removed again when the test ends.
prepare_dest () {
	test_when_finished "rm -rf dest.git" &&
	git init --bare dest.git
}

test_expect_success "create large objects (1.5 MB) and PACK" '
	test-tool genrandom foo 1500000 >big-blob &&
	test_commit --append foo big-blob &&
	test-tool genrandom bar 1500000 >big-blob &&
	test_commit --append bar big-blob &&
	PACK=$(echo HEAD | git pack-objects --revs pack)
'

test_expect_success 'set memory limitation to 1MB' '
	GIT_ALLOC_LIMIT=1m &&
	export GIT_ALLOC_LIMIT
'

# A non-dry-run unpack must allocate each full blob, so the 1MB
# allocation limit makes it die on the 1.5MB blobs.
test_expect_success 'unpack-objects failed under memory limitation' '
	prepare_dest &&
	test_must_fail git -C dest.git unpack-objects <pack-$PACK.pack 2>err &&
	grep "fatal: attempting to allocate" err
'

# In dry-run mode get_data() reuses a small buffer, so the same pack
# passes the integrity check within the limit and writes no objects.
test_expect_success 'unpack-objects works with memory limitation in dry-run mode' '
	prepare_dest &&
	git -C dest.git unpack-objects -n <pack-$PACK.pack &&
	test_stdout_line_count = 0 find dest.git/objects -type f &&
	test_dir_is_empty dest.git/objects/pack
'

test_done

0 commit comments

Comments (0)