Skip to content

Commit e1bc8dc

Browse files
committed
Merge branch 'jc/diffcore'
* jc/diffcore:
  diffcore-delta.c: Ignore CR in CRLF for text files
  diffcore-delta.c: update the comment on the algorithm.
  diffcore_filespec: add is_binary
  diffcore_count_changes: pass diffcore_filespec
2 parents 792d237 + b9905fe commit e1bc8dc

File tree

6 files changed

+76
-23
lines changed

6 files changed

+76
-23
lines changed

diff.c

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3005,6 +3005,22 @@ void diffcore_std(struct diff_options *options)
30053005
{
30063006
if (options->quiet)
30073007
return;
3008+
3009+
/*
3010+
* break/rename count similarity differently depending on
3011+
* the binary-ness.
3012+
*/
3013+
if ((options->break_opt != -1) || (options->detect_rename)) {
3014+
struct diff_queue_struct *q = &diff_queued_diff;
3015+
int i;
3016+
3017+
for (i = 0; i < q->nr; i++) {
3018+
struct diff_filepair *p = q->queue[i];
3019+
p->one->is_binary = file_is_binary(p->one);
3020+
p->two->is_binary = file_is_binary(p->two);
3021+
}
3022+
}
3023+
30083024
if (options->break_opt != -1)
30093025
diffcore_break(options->break_opt);
30103026
if (options->detect_rename)

diffcore-break.c

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,8 +66,7 @@ static int should_break(struct diff_filespec *src,
6666
if (base_size < MINIMUM_BREAK_SIZE)
6767
return 0; /* we do not break too small filepair */
6868

69-
if (diffcore_count_changes(src->data, src->size,
70-
dst->data, dst->size,
69+
if (diffcore_count_changes(src, dst,
7170
NULL, NULL,
7271
0,
7372
&src_copied, &literal_added))

diffcore-delta.c

Lines changed: 22 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -5,23 +5,20 @@
55
/*
66
* Idea here is very simple.
77
*
8-
* We have total of (sz-N+1) N-byte overlapping sequences in buf whose
9-
* size is sz. If the same N-byte sequence appears in both source and
10-
* destination, we say the byte that starts that sequence is shared
11-
* between them (i.e. copied from source to destination).
8+
* Almost all data we are interested in are text, but sometimes we have
9+
* to deal with binary data. So we cut them into chunks delimited by
10+
* LF byte, or 64-byte sequence, whichever comes first, and hash them.
1211
*
13-
* For each possible N-byte sequence, if the source buffer has more
14-
* instances of it than the destination buffer, that means the
15-
* difference are the number of bytes not copied from source to
16-
* destination. If the counts are the same, everything was copied
17-
* from source to destination. If the destination has more,
18-
* everything was copied, and destination added more.
12+
* For those chunks, if the source buffer has more instances of it
13+
* than the destination buffer, that means the difference is the
14+
* number of bytes not copied from source to destination. If the
15+
* counts are the same, everything was copied from source to
16+
* destination. If the destination has more, everything was copied,
17+
* and destination added more.
1918
*
2019
* We are doing an approximation so we do not really have to waste
2120
* memory by actually storing the sequence. We just hash them into
2221
* somewhere around 2^16 hashbuckets and count the occurrences.
23-
*
24-
* The length of the sequence is arbitrarily set to 8 for now.
2522
*/
2623

2724
/* Wild guess at the initial hash size */
@@ -125,11 +122,14 @@ static struct spanhash_top *add_spanhash(struct spanhash_top *top,
125122
}
126123
}
127124

128-
static struct spanhash_top *hash_chars(unsigned char *buf, unsigned int sz)
125+
static struct spanhash_top *hash_chars(struct diff_filespec *one)
129126
{
130127
int i, n;
131128
unsigned int accum1, accum2, hashval;
132129
struct spanhash_top *hash;
130+
unsigned char *buf = one->data;
131+
unsigned int sz = one->size;
132+
int is_text = !one->is_binary;
133133

134134
i = INITIAL_HASH_SIZE;
135135
hash = xmalloc(sizeof(*hash) + sizeof(struct spanhash) * (1<<i));
@@ -143,6 +143,11 @@ static struct spanhash_top *hash_chars(unsigned char *buf, unsigned int sz)
143143
unsigned int c = *buf++;
144144
unsigned int old_1 = accum1;
145145
sz--;
146+
147+
/* Ignore CR in CRLF sequence if text */
148+
if (is_text && c == '\r' && sz && *buf == '\n')
149+
continue;
150+
146151
accum1 = (accum1 << 7) ^ (accum2 >> 25);
147152
accum2 = (accum2 << 7) ^ (old_1 >> 25);
148153
accum1 += c;
@@ -156,8 +161,8 @@ static struct spanhash_top *hash_chars(unsigned char *buf, unsigned int sz)
156161
return hash;
157162
}
158163

159-
int diffcore_count_changes(void *src, unsigned long src_size,
160-
void *dst, unsigned long dst_size,
164+
int diffcore_count_changes(struct diff_filespec *src,
165+
struct diff_filespec *dst,
161166
void **src_count_p,
162167
void **dst_count_p,
163168
unsigned long delta_limit,
@@ -172,14 +177,14 @@ int diffcore_count_changes(void *src, unsigned long src_size,
172177
if (src_count_p)
173178
src_count = *src_count_p;
174179
if (!src_count) {
175-
src_count = hash_chars(src, src_size);
180+
src_count = hash_chars(src);
176181
if (src_count_p)
177182
*src_count_p = src_count;
178183
}
179184
if (dst_count_p)
180185
dst_count = *dst_count_p;
181186
if (!dst_count) {
182-
dst_count = hash_chars(dst, dst_size);
187+
dst_count = hash_chars(dst);
183188
if (dst_count_p)
184189
*dst_count_p = dst_count;
185190
}

diffcore-rename.c

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -190,8 +190,7 @@ static int estimate_similarity(struct diff_filespec *src,
190190

191191
delta_limit = (unsigned long)
192192
(base_size * (MAX_SCORE-minimum_score) / MAX_SCORE);
193-
if (diffcore_count_changes(src->data, src->size,
194-
dst->data, dst->size,
193+
if (diffcore_count_changes(src, dst,
195194
&src->cnt_data, &dst->cnt_data,
196195
delta_limit,
197196
&src_copied, &literal_added))

diffcore.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ struct diff_filespec {
3737
#define DIFF_FILE_VALID(spec) (((spec)->mode) != 0)
3838
unsigned should_free : 1; /* data should be free()'ed */
3939
unsigned should_munmap : 1; /* data should be munmap()'ed */
40+
unsigned is_binary : 1; /* data should be considered "binary" */
4041
};
4142

4243
extern struct diff_filespec *alloc_filespec(const char *);
@@ -103,8 +104,8 @@ void diff_debug_queue(const char *, struct diff_queue_struct *);
103104
#define diff_debug_queue(a,b) do {} while(0)
104105
#endif
105106

106-
extern int diffcore_count_changes(void *src, unsigned long src_size,
107-
void *dst, unsigned long dst_size,
107+
extern int diffcore_count_changes(struct diff_filespec *src,
108+
struct diff_filespec *dst,
108109
void **src_count_p,
109110
void **dst_count_p,
110111
unsigned long delta_limit,

t/t0022-crlf-rename.sh

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
#!/bin/sh
2+
3+
test_description='ignore CR in CRLF sequence while computing similiarity'
4+
5+
. ./test-lib.sh
6+
7+
test_expect_success setup '
8+
9+
cat ../t0022-crlf-rename.sh >sample &&
10+
git add sample &&
11+
12+
test_tick &&
13+
git commit -m Initial &&
14+
15+
sed -e "s/\$//" ../t0022-crlf-rename.sh >elpmas &&
16+
git add elpmas &&
17+
rm -f sample &&
18+
19+
test_tick &&
20+
git commit -a -m Second
21+
22+
'
23+
24+
test_expect_success 'diff -M' '
25+
26+
git diff-tree -M -r --name-status HEAD^ HEAD |
27+
sed -e "s/R[0-9]*/RNUM/" >actual &&
28+
echo "RNUM sample elpmas" >expect &&
29+
diff -u expect actual
30+
31+
'
32+
33+
test_done

0 commit comments

Comments
 (0)