Skip to content

Commit d9bae1a

Browse files
peffgitster
authored andcommitted
diff: cache textconv output
Running a textconv filter can take a long time. It's particularly bad for a large file which needs to be spooled to disk, but even for small files, the fork+exec overhead can add up for something like "git log -p". This patch uses the notes-cache mechanism to keep a fast cache of textconv output. Caches are stored in refs/notes/textconv/$x, where $x is the userdiff driver defined in gitattributes. Caching is enabled only if diff.$x.cachetextconv is true. In my test repo, on a commit with 45 jpg and avi files changed and a textconv to show their exif tags: [before] $ time git show >/dev/null real 0m13.724s user 0m12.057s sys 0m1.624s [after, first run] $ git config diff.mfo.cachetextconv true $ time git show >/dev/null real 0m14.252s user 0m12.197s sys 0m1.800s [after, subsequent runs] $ time git show >/dev/null real 0m0.352s user 0m0.148s sys 0m0.200s So for a slight (3.8%) cost on the first run, we achieve an almost 40x speed up on subsequent runs. Signed-off-by: Jeff King <[email protected]> Signed-off-by: Junio C Hamano <[email protected]>
1 parent 840383b commit d9bae1a

File tree

5 files changed

+185
-9
lines changed

5 files changed

+185
-9
lines changed

Documentation/gitattributes.txt

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -414,6 +414,26 @@ because it quickly conveys the changes you have made), you
414414
should generate it separately and send it as a comment _in
415415
addition to_ the usual binary diff that you might send.
416416

417+
Because text conversion can be slow, especially when doing a
418+
large number of them with `git log -p`, git provides a mechanism
419+
to cache the output and use it in future diffs. To enable
420+
caching, set the "cachetextconv" variable in your diff driver's
421+
config. For example:
422+
423+
------------------------
424+
[diff "jpg"]
425+
textconv = exif
426+
cachetextconv = true
427+
------------------------
428+
429+
This will cache the result of running "exif" on each blob
430+
indefinitely. If you change the textconv config variable for a
431+
diff driver, git will automatically invalidate the cache entries
432+
and re-run the textconv filter. If you want to invalidate the
433+
cache manually (e.g., because your version of "exif" was updated
434+
and now produces better output), you can remove the cache
435+
manually with `git update-ref -d refs/notes/textconv/jpg` (where
436+
"jpg" is the name of the diff driver, as in the example above).
417437

418438
Performing a three-way merge
419439
~~~~~~~~~~~~~~~~~~~~~~~~~~~~

diff.c

Lines changed: 43 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ static char diff_colors[][COLOR_MAXLEN] = {
4343
};
4444

4545
static void diff_filespec_load_driver(struct diff_filespec *one);
46-
static size_t fill_textconv(const char *cmd,
46+
static size_t fill_textconv(struct userdiff_driver *driver,
4747
struct diff_filespec *df, char **outbuf);
4848

4949
static int parse_diff_color_slot(const char *var, int ofs)
@@ -466,8 +466,8 @@ static void emit_rewrite_diff(const char *name_a,
466466
const char *name_b,
467467
struct diff_filespec *one,
468468
struct diff_filespec *two,
469-
const char *textconv_one,
470-
const char *textconv_two,
469+
struct userdiff_driver *textconv_one,
470+
struct userdiff_driver *textconv_two,
471471
struct diff_options *o)
472472
{
473473
int lc_a, lc_b;
@@ -1569,14 +1569,26 @@ void diff_set_mnemonic_prefix(struct diff_options *options, const char *a, const
15691569
options->b_prefix = b;
15701570
}
15711571

1572-
static const char *get_textconv(struct diff_filespec *one)
1572+
static struct userdiff_driver *get_textconv(struct diff_filespec *one)
15731573
{
15741574
if (!DIFF_FILE_VALID(one))
15751575
return NULL;
15761576
if (!S_ISREG(one->mode))
15771577
return NULL;
15781578
diff_filespec_load_driver(one);
1579-
return one->driver->textconv;
1579+
if (!one->driver->textconv)
1580+
return NULL;
1581+
1582+
if (one->driver->textconv_want_cache && !one->driver->textconv_cache) {
1583+
struct notes_cache *c = xmalloc(sizeof(*c));
1584+
struct strbuf name = STRBUF_INIT;
1585+
1586+
strbuf_addf(&name, "textconv/%s", one->driver->name);
1587+
notes_cache_init(c, name.buf, one->driver->textconv);
1588+
one->driver->textconv_cache = c;
1589+
}
1590+
1591+
return one->driver;
15801592
}
15811593

15821594
static void builtin_diff(const char *name_a,
@@ -1593,7 +1605,8 @@ static void builtin_diff(const char *name_a,
15931605
const char *set = diff_get_color_opt(o, DIFF_METAINFO);
15941606
const char *reset = diff_get_color_opt(o, DIFF_RESET);
15951607
const char *a_prefix, *b_prefix;
1596-
const char *textconv_one = NULL, *textconv_two = NULL;
1608+
struct userdiff_driver *textconv_one = NULL;
1609+
struct userdiff_driver *textconv_two = NULL;
15971610
struct strbuf header = STRBUF_INIT;
15981611

15991612
if (DIFF_OPT_TST(o, SUBMODULE_LOG) &&
@@ -3888,13 +3901,13 @@ static char *run_textconv(const char *pgm, struct diff_filespec *spec,
38883901
return strbuf_detach(&buf, outsize);
38893902
}
38903903

3891-
static size_t fill_textconv(const char *cmd,
3904+
static size_t fill_textconv(struct userdiff_driver *driver,
38923905
struct diff_filespec *df,
38933906
char **outbuf)
38943907
{
38953908
size_t size;
38963909

3897-
if (!cmd) {
3910+
if (!driver || !driver->textconv) {
38983911
if (!DIFF_FILE_VALID(df)) {
38993912
*outbuf = "";
39003913
return 0;
@@ -3905,8 +3918,29 @@ static size_t fill_textconv(const char *cmd,
39053918
return df->size;
39063919
}
39073920

3908-
*outbuf = run_textconv(cmd, df, &size);
3921+
if (driver->textconv_cache) {
3922+
*outbuf = notes_cache_get(driver->textconv_cache, df->sha1,
3923+
&size);
3924+
if (*outbuf)
3925+
return size;
3926+
}
3927+
3928+
*outbuf = run_textconv(driver->textconv, df, &size);
39093929
if (!*outbuf)
39103930
die("unable to read files to diff");
3931+
3932+
if (driver->textconv_cache) {
3933+
/* ignore errors, as we might be in a readonly repository */
3934+
notes_cache_put(driver->textconv_cache, df->sha1, *outbuf,
3935+
size);
3936+
/*
3937+
* we could save up changes and flush them all at the end,
3938+
* but we would need an extra call after all diffing is done.
3939+
* Since generating a cache entry is the slow path anyway,
3940+
* this extra overhead probably isn't a big deal.
3941+
*/
3942+
notes_cache_write(driver->textconv_cache);
3943+
}
3944+
39113945
return size;
39123946
}

t/t4042-diff-textconv-caching.sh

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
#!/bin/sh
2+
3+
test_description='test textconv caching'
4+
. ./test-lib.sh
5+
6+
cat >helper <<'EOF'
7+
#!/bin/sh
8+
sed 's/^/converted: /' "$@" >helper.out
9+
cat helper.out
10+
EOF
11+
chmod +x helper
12+
13+
test_expect_success 'setup' '
14+
echo foo content 1 >foo.bin &&
15+
echo bar content 1 >bar.bin &&
16+
git add . &&
17+
git commit -m one &&
18+
echo foo content 2 >foo.bin &&
19+
echo bar content 2 >bar.bin &&
20+
git commit -a -m two &&
21+
echo "*.bin diff=magic" >.gitattributes &&
22+
git config diff.magic.textconv ./helper &&
23+
git config diff.magic.cachetextconv true
24+
'
25+
26+
cat >expect <<EOF
27+
diff --git a/bar.bin b/bar.bin
28+
index fcf9166..28283d5 100644
29+
--- a/bar.bin
30+
+++ b/bar.bin
31+
@@ -1 +1 @@
32+
-converted: bar content 1
33+
+converted: bar content 2
34+
diff --git a/foo.bin b/foo.bin
35+
index d5b9fe3..1345db2 100644
36+
--- a/foo.bin
37+
+++ b/foo.bin
38+
@@ -1 +1 @@
39+
-converted: foo content 1
40+
+converted: foo content 2
41+
EOF
42+
43+
test_expect_success 'first textconv works' '
44+
git diff HEAD^ HEAD >actual &&
45+
test_cmp expect actual
46+
'
47+
48+
test_expect_success 'cached textconv produces same output' '
49+
git diff HEAD^ HEAD >actual &&
50+
test_cmp expect actual
51+
'
52+
53+
test_expect_success 'cached textconv does not run helper' '
54+
rm -f helper.out &&
55+
git diff HEAD^ HEAD >actual &&
56+
test_cmp expect actual &&
57+
! test -r helper.out
58+
'
59+
60+
cat >expect <<EOF
61+
diff --git a/bar.bin b/bar.bin
62+
index fcf9166..28283d5 100644
63+
--- a/bar.bin
64+
+++ b/bar.bin
65+
@@ -1,2 +1,2 @@
66+
converted: other
67+
-converted: bar content 1
68+
+converted: bar content 2
69+
diff --git a/foo.bin b/foo.bin
70+
index d5b9fe3..1345db2 100644
71+
--- a/foo.bin
72+
+++ b/foo.bin
73+
@@ -1,2 +1,2 @@
74+
converted: other
75+
-converted: foo content 1
76+
+converted: foo content 2
77+
EOF
78+
test_expect_success 'changing textconv invalidates cache' '
79+
echo other >other &&
80+
git config diff.magic.textconv "./helper other" &&
81+
git diff HEAD^ HEAD >actual &&
82+
test_cmp expect actual
83+
'
84+
85+
cat >expect <<EOF
86+
diff --git a/bar.bin b/bar.bin
87+
index fcf9166..28283d5 100644
88+
--- a/bar.bin
89+
+++ b/bar.bin
90+
@@ -1,2 +1,2 @@
91+
converted: other
92+
-converted: bar content 1
93+
+converted: bar content 2
94+
diff --git a/foo.bin b/foo.bin
95+
index d5b9fe3..1345db2 100644
96+
--- a/foo.bin
97+
+++ b/foo.bin
98+
@@ -1 +1 @@
99+
-converted: foo content 1
100+
+converted: foo content 2
101+
EOF
102+
test_expect_success 'switching diff driver produces correct results' '
103+
git config diff.moremagic.textconv ./helper &&
104+
echo foo.bin diff=moremagic >>.gitattributes &&
105+
git diff HEAD^ HEAD >actual &&
106+
test_cmp expect actual
107+
'
108+
109+
test_done

userdiff.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
#include "cache.h"
12
#include "userdiff.h"
23
#include "cache.h"
34
#include "attr.h"
@@ -167,6 +168,12 @@ static int parse_tristate(int *b, const char *k, const char *v)
167168
return 1;
168169
}
169170

171+
static int parse_bool(int *b, const char *k, const char *v)
172+
{
173+
*b = git_config_bool(k, v);
174+
return 1;
175+
}
176+
170177
int userdiff_config(const char *k, const char *v)
171178
{
172179
struct userdiff_driver *drv;
@@ -181,6 +188,8 @@ int userdiff_config(const char *k, const char *v)
181188
return parse_string(&drv->external, k, v);
182189
if ((drv = parse_driver(k, v, "textconv")))
183190
return parse_string(&drv->textconv, k, v);
191+
if ((drv = parse_driver(k, v, "cachetextconv")))
192+
return parse_bool(&drv->textconv_want_cache, k, v);
184193
if ((drv = parse_driver(k, v, "wordregex")))
185194
return parse_string(&drv->word_regex, k, v);
186195

userdiff.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
#ifndef USERDIFF_H
22
#define USERDIFF_H
33

4+
#include "notes-cache.h"
5+
46
struct userdiff_funcname {
57
const char *pattern;
68
int cflags;
@@ -13,6 +15,8 @@ struct userdiff_driver {
1315
struct userdiff_funcname funcname;
1416
const char *word_regex;
1517
const char *textconv;
18+
struct notes_cache *textconv_cache;
19+
int textconv_want_cache;
1620
};
1721

1822
int userdiff_config(const char *k, const char *v);

0 commit comments

Comments
 (0)