Skip to content

Commit 86bfce1

Browse files
committed
range-diff: first rudimentary implementation
At this stage, `git range-diff` can determine corresponding commits of two related commit ranges. This makes use of the recently introduced implementation of the Hungarian algorithm. The core of this patch is a straight port of the ideas of tbdiff, the apparently dormant project at https://github.com/trast/tbdiff. The output does not at all match `tbdiff`'s output yet, as this patch really concentrates on getting the patch matching part right. Note: due to differences in the diff algorithm (`tbdiff` uses the Python module `difflib`, Git uses its xdiff fork), the cost matrix calculated by `range-diff` is different (but very similar) to the one calculated by `tbdiff`. Therefore, it is possible that they find different matching commits in corner cases (e.g. when a patch was split into two patches of roughly equal length). Signed-off-by: Johannes Schindelin <[email protected]>
1 parent d609980 commit 86bfce1

File tree

4 files changed

+359
-3
lines changed

4 files changed

+359
-3
lines changed

Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -904,6 +904,7 @@ LIB_OBJS += progress.o
904904
LIB_OBJS += prompt.o
905905
LIB_OBJS += protocol.o
906906
LIB_OBJS += quote.o
907+
LIB_OBJS += range-diff.o
907908
LIB_OBJS += reachable.o
908909
LIB_OBJS += read-cache.o
909910
LIB_OBJS += reflog-walk.o

builtin/range-diff.c

Lines changed: 44 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#include "cache.h"
22
#include "builtin.h"
33
#include "parse-options.h"
4+
#include "range-diff.h"
45

56
static const char * const builtin_range_diff_usage[] = {
67
N_("git range-diff [<options>] <old-base>..<old-tip> <new-base>..<new-tip>"),
@@ -17,9 +18,49 @@ int cmd_range_diff(int argc, const char **argv, const char *prefix)
1718
N_("Percentage by which creation is weighted")),
1819
OPT_END()
1920
};
21+
int res = 0;
22+
struct strbuf range1 = STRBUF_INIT, range2 = STRBUF_INIT;
2023

21-
argc = parse_options(argc, argv, NULL, options,
22-
builtin_range_diff_usage, 0);
24+
argc = parse_options(argc, argv, NULL, options, builtin_range_diff_usage,
25+
0);
2326

24-
return 0;
27+
if (argc == 2) {
28+
if (!strstr(argv[0], ".."))
29+
warning(_("no .. in range: '%s'"), argv[0]);
30+
strbuf_addstr(&range1, argv[0]);
31+
32+
if (!strstr(argv[1], ".."))
33+
warning(_("no .. in range: '%s'"), argv[1]);
34+
strbuf_addstr(&range2, argv[1]);
35+
} else if (argc == 3) {
36+
strbuf_addf(&range1, "%s..%s", argv[0], argv[1]);
37+
strbuf_addf(&range2, "%s..%s", argv[0], argv[2]);
38+
} else if (argc == 1) {
39+
const char *b = strstr(argv[0], "..."), *a = argv[0];
40+
int a_len;
41+
42+
if (!b)
43+
die(_("single arg format requires a symmetric range"));
44+
45+
a_len = (int)(b - a);
46+
if (!a_len) {
47+
a = "HEAD";
48+
a_len = strlen(a);
49+
}
50+
b += 3;
51+
if (!*b)
52+
b = "HEAD";
53+
strbuf_addf(&range1, "%s..%.*s", b, a_len, a);
54+
strbuf_addf(&range2, "%.*s..%s", a_len, a, b);
55+
} else {
56+
error(_("need two commit ranges"));
57+
usage_with_options(builtin_range_diff_usage, options);
58+
}
59+
60+
res = show_range_diff(range1.buf, range2.buf, creation_factor);
61+
62+
strbuf_release(&range1);
63+
strbuf_release(&range2);
64+
65+
return res;
2566
}

range-diff.c

Lines changed: 307 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,307 @@
1+
#include "cache.h"
2+
#include "range-diff.h"
3+
#include "string-list.h"
4+
#include "run-command.h"
5+
#include "argv-array.h"
6+
#include "hashmap.h"
7+
#include "xdiff-interface.h"
8+
#include "linear-assignment.h"
9+
10+
struct patch_util {
11+
/* For the search for an exact match */
12+
struct hashmap_entry e;
13+
const char *diff, *patch;
14+
15+
int i;
16+
int diffsize;
17+
size_t diff_offset;
18+
/* the index of the matching item in the other branch, or -1 */
19+
int matching;
20+
struct object_id oid;
21+
};
22+
23+
/*
24+
* Reads the patches into a string list, with the `util` field being populated
25+
* as struct object_id (will need to be free()d).
26+
*/
27+
static int read_patches(const char *range, struct string_list *list)
28+
{
29+
struct child_process cp = CHILD_PROCESS_INIT;
30+
FILE *in;
31+
struct strbuf buf = STRBUF_INIT, line = STRBUF_INIT;
32+
struct patch_util *util = NULL;
33+
int in_header = 1;
34+
35+
argv_array_pushl(&cp.args, "log", "--no-color", "-p", "--no-merges",
36+
"--reverse", "--date-order", "--decorate=no",
37+
"--no-abbrev-commit", range,
38+
NULL);
39+
cp.out = -1;
40+
cp.no_stdin = 1;
41+
cp.git_cmd = 1;
42+
43+
if (start_command(&cp))
44+
return error_errno(_("could not start `log`"));
45+
in = fdopen(cp.out, "r");
46+
if (!in) {
47+
error_errno(_("could not read `log` output"));
48+
finish_command(&cp);
49+
return -1;
50+
}
51+
52+
while (strbuf_getline(&line, in) != EOF) {
53+
const char *p;
54+
55+
if (skip_prefix(line.buf, "commit ", &p)) {
56+
if (util) {
57+
string_list_append(list, buf.buf)->util = util;
58+
strbuf_reset(&buf);
59+
}
60+
util = xcalloc(sizeof(*util), 1);
61+
if (get_oid(p, &util->oid)) {
62+
error(_("could not parse commit '%s'"), p);
63+
free(util);
64+
string_list_clear(list, 1);
65+
strbuf_release(&buf);
66+
strbuf_release(&line);
67+
fclose(in);
68+
finish_command(&cp);
69+
return -1;
70+
}
71+
util->matching = -1;
72+
in_header = 1;
73+
continue;
74+
}
75+
76+
if (starts_with(line.buf, "diff --git")) {
77+
in_header = 0;
78+
strbuf_addch(&buf, '\n');
79+
if (!util->diff_offset)
80+
util->diff_offset = buf.len;
81+
strbuf_addbuf(&buf, &line);
82+
} else if (in_header) {
83+
if (starts_with(line.buf, "Author: ")) {
84+
strbuf_addbuf(&buf, &line);
85+
strbuf_addstr(&buf, "\n\n");
86+
} else if (starts_with(line.buf, " ")) {
87+
strbuf_addbuf(&buf, &line);
88+
strbuf_addch(&buf, '\n');
89+
}
90+
continue;
91+
} else if (starts_with(line.buf, "@@ "))
92+
strbuf_addstr(&buf, "@@");
93+
else if (line.buf[0] && !starts_with(line.buf, "index "))
94+
/*
95+
* A completely blank (not ' \n', which is context)
96+
* line is not valid in a diff. We skip it
97+
* silently, because this neatly handles the blank
98+
* separator line between commits in git-log
99+
* output.
100+
*/
101+
strbuf_addbuf(&buf, &line);
102+
else
103+
continue;
104+
105+
strbuf_addch(&buf, '\n');
106+
util->diffsize++;
107+
}
108+
fclose(in);
109+
strbuf_release(&line);
110+
111+
if (util)
112+
string_list_append(list, buf.buf)->util = util;
113+
strbuf_release(&buf);
114+
115+
if (finish_command(&cp))
116+
return -1;
117+
118+
return 0;
119+
}
120+
121+
static int patch_util_cmp(const void *dummy, const struct patch_util *a,
122+
const struct patch_util *b, const char *keydata)
123+
{
124+
return strcmp(a->diff, keydata ? keydata : b->diff);
125+
}
126+
127+
static void find_exact_matches(struct string_list *a, struct string_list *b)
128+
{
129+
struct hashmap map;
130+
int i;
131+
132+
hashmap_init(&map, (hashmap_cmp_fn)patch_util_cmp, NULL, 0);
133+
134+
/* First, add the patches of a to a hash map */
135+
for (i = 0; i < a->nr; i++) {
136+
struct patch_util *util = a->items[i].util;
137+
138+
util->i = i;
139+
util->patch = a->items[i].string;
140+
util->diff = util->patch + util->diff_offset;
141+
hashmap_entry_init(util, strhash(util->diff));
142+
hashmap_add(&map, util);
143+
}
144+
145+
/* Now try to find exact matches in b */
146+
for (i = 0; i < b->nr; i++) {
147+
struct patch_util *util = b->items[i].util, *other;
148+
149+
util->i = i;
150+
util->patch = b->items[i].string;
151+
util->diff = util->patch + util->diff_offset;
152+
hashmap_entry_init(util, strhash(util->diff));
153+
other = hashmap_remove(&map, util, NULL);
154+
if (other) {
155+
if (other->matching >= 0)
156+
BUG("already assigned!");
157+
158+
other->matching = i;
159+
util->matching = other->i;
160+
}
161+
}
162+
163+
hashmap_free(&map, 0);
164+
}
165+
166+
/*
 * Line callback for the diff machinery: `data` points to an int counter
 * that is bumped once per call. The line itself is ignored.
 */
static void diffsize_consume(void *data, char *line, unsigned long len)
{
	int *counter = data;

	*counter += 1;
}
170+
171+
static int diffsize(const char *a, const char *b)
172+
{
173+
xpparam_t pp = { 0 };
174+
xdemitconf_t cfg = { 0 };
175+
mmfile_t mf1, mf2;
176+
int count = 0;
177+
178+
mf1.ptr = (char *)a;
179+
mf1.size = strlen(a);
180+
mf2.ptr = (char *)b;
181+
mf2.size = strlen(b);
182+
183+
cfg.ctxlen = 3;
184+
if (!xdi_diff_outf(&mf1, &mf2, diffsize_consume, &count, &pp, &cfg))
185+
return count;
186+
187+
error(_("failed to generate diff"));
188+
return COST_MAX;
189+
}
190+
191+
static void get_correspondences(struct string_list *a, struct string_list *b,
192+
int creation_factor)
193+
{
194+
int n = a->nr + b->nr;
195+
int *cost, c, *a2b, *b2a;
196+
int i, j;
197+
198+
ALLOC_ARRAY(cost, st_mult(n, n));
199+
ALLOC_ARRAY(a2b, n);
200+
ALLOC_ARRAY(b2a, n);
201+
202+
for (i = 0; i < a->nr; i++) {
203+
struct patch_util *a_util = a->items[i].util;
204+
205+
for (j = 0; j < b->nr; j++) {
206+
struct patch_util *b_util = b->items[j].util;
207+
208+
if (a_util->matching == j)
209+
c = 0;
210+
else if (a_util->matching < 0 && b_util->matching < 0)
211+
c = diffsize(a_util->diff, b_util->diff);
212+
else
213+
c = COST_MAX;
214+
cost[i + n * j] = c;
215+
}
216+
217+
c = a_util->matching < 0 ?
218+
a_util->diffsize * creation_factor / 100 : COST_MAX;
219+
for (j = b->nr; j < n; j++)
220+
cost[i + n * j] = c;
221+
}
222+
223+
for (j = 0; j < b->nr; j++) {
224+
struct patch_util *util = b->items[j].util;
225+
226+
c = util->matching < 0 ?
227+
util->diffsize * creation_factor / 100 : COST_MAX;
228+
for (i = a->nr; i < n; i++)
229+
cost[i + n * j] = c;
230+
}
231+
232+
for (i = a->nr; i < n; i++)
233+
for (j = b->nr; j < n; j++)
234+
cost[i + n * j] = 0;
235+
236+
compute_assignment(n, n, cost, a2b, b2a);
237+
238+
for (i = 0; i < a->nr; i++)
239+
if (a2b[i] >= 0 && a2b[i] < b->nr) {
240+
struct patch_util *a_util = a->items[i].util;
241+
struct patch_util *b_util = b->items[a2b[i]].util;
242+
243+
a_util->matching = a2b[i];
244+
b_util->matching = i;
245+
}
246+
247+
free(cost);
248+
free(a2b);
249+
free(b2a);
250+
}
251+
252+
static const char *short_oid(struct patch_util *util)
253+
{
254+
return find_unique_abbrev(util->oid.hash, DEFAULT_ABBREV);
255+
}
256+
257+
static void output(struct string_list *a, struct string_list *b)
258+
{
259+
int i;
260+
261+
for (i = 0; i < b->nr; i++) {
262+
struct patch_util *util = b->items[i].util, *prev;
263+
264+
if (util->matching < 0)
265+
printf("-: -------- > %d: %s\n",
266+
i + 1, short_oid(util));
267+
else {
268+
prev = a->items[util->matching].util;
269+
printf("%d: %s ! %d: %s\n",
270+
util->matching + 1, short_oid(prev),
271+
i + 1, short_oid(util));
272+
}
273+
}
274+
275+
for (i = 0; i < a->nr; i++) {
276+
struct patch_util *util = a->items[i].util;
277+
278+
if (util->matching < 0)
279+
printf("%d: %s < -: --------\n",
280+
i + 1, short_oid(util));
281+
}
282+
}
283+
284+
int show_range_diff(const char *range1, const char *range2,
285+
int creation_factor)
286+
{
287+
int res = 0;
288+
289+
struct string_list branch1 = STRING_LIST_INIT_DUP;
290+
struct string_list branch2 = STRING_LIST_INIT_DUP;
291+
292+
if (read_patches(range1, &branch1))
293+
res = error(_("could not parse log for '%s'"), range1);
294+
if (!res && read_patches(range2, &branch2))
295+
res = error(_("could not parse log for '%s'"), range2);
296+
297+
if (!res) {
298+
find_exact_matches(&branch1, &branch2);
299+
get_correspondences(&branch1, &branch2, creation_factor);
300+
output(&branch1, &branch2);
301+
}
302+
303+
string_list_clear(&branch1, 1);
304+
string_list_clear(&branch2, 1);
305+
306+
return res;
307+
}

range-diff.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
#ifndef RANGE_DIFF_H
#define RANGE_DIFF_H

/*
 * Compares the commits of two ranges and prints, one line per commit, which
 * commits of `range1` correspond to which commits of `range2`.
 *
 * `range1` and `range2` are each passed as a revision argument to
 * `git log`. `creation_factor` is the percentage by which the size of a
 * patch is weighted when deciding whether to leave it unmatched instead of
 * pairing it with a patch of the other range.
 *
 * Returns 0 on success and a non-zero value if either range could not be
 * read.
 */
int show_range_diff(const char *range1, const char *range2,
		    int creation_factor);

#endif

0 commit comments

Comments
 (0)