Skip to content

Commit 6cbc478

Browse files
committed
Merge branch 'jh/add-index-entry-optim'
"git checkout" that handles a lot of paths has been optimized by reducing the number of unnecessary checks of paths in the has_dir_name() function. * jh/add-index-entry-optim: read-cache: speed up has_dir_name (part 2) read-cache: speed up has_dir_name (part 1) read-cache: speed up add_index_entry during checkout p0006-read-tree-checkout: perf test to time read-tree read-cache: add strcmp_offset function
2 parents 864033a + b986df5 commit 6cbc478

File tree

10 files changed

+446
-2
lines changed

10 files changed

+446
-2
lines changed

Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -639,6 +639,7 @@ TEST_PROGRAMS_NEED_X += test-scrap-cache-tree
639639
TEST_PROGRAMS_NEED_X += test-sha1
640640
TEST_PROGRAMS_NEED_X += test-sha1-array
641641
TEST_PROGRAMS_NEED_X += test-sigchain
642+
TEST_PROGRAMS_NEED_X += test-strcmp-offset
642643
TEST_PROGRAMS_NEED_X += test-string-list
643644
TEST_PROGRAMS_NEED_X += test-submodule-config
644645
TEST_PROGRAMS_NEED_X += test-subprocess

cache.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -599,6 +599,7 @@ extern int write_locked_index(struct index_state *, struct lock_file *lock, unsi
599599
extern int discard_index(struct index_state *);
600600
extern int unmerged_index(const struct index_state *);
601601
extern int verify_path(const char *path);
602+
extern int strcmp_offset(const char *s1, const char *s2, size_t *first_change);
602603
extern int index_dir_exists(struct index_state *istate, const char *name, int namelen);
603604
extern void adjust_dirname_case(struct index_state *istate, char *name);
604605
extern struct cache_entry *index_file_exists(struct index_state *istate, const char *name, int namelen, int igncase);

read-cache.c

Lines changed: 137 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -887,9 +887,32 @@ static int has_file_name(struct index_state *istate,
887887
return retval;
888888
}
889889

890+
891+
/*
892+
* Like strcmp(), but also return the offset of the first change.
893+
* If strings are equal, return the length.
894+
*/
895+
int strcmp_offset(const char *s1, const char *s2, size_t *first_change)
896+
{
897+
size_t k;
898+
899+
if (!first_change)
900+
return strcmp(s1, s2);
901+
902+
for (k = 0; s1[k] == s2[k]; k++)
903+
if (s1[k] == '\0')
904+
break;
905+
906+
*first_change = k;
907+
return (unsigned char)s1[k] - (unsigned char)s2[k];
908+
}
909+
890910
/*
891911
* Do we have another file with a pathname that is a proper
892912
* subset of the name we're trying to add?
913+
*
914+
* That is, is there another file in the index with a path
915+
* that matches a sub-directory in the given entry?
893916
*/
894917
static int has_dir_name(struct index_state *istate,
895918
const struct cache_entry *ce, int pos, int ok_to_replace)
@@ -898,9 +921,51 @@ static int has_dir_name(struct index_state *istate,
898921
int stage = ce_stage(ce);
899922
const char *name = ce->name;
900923
const char *slash = name + ce_namelen(ce);
924+
size_t len_eq_last;
925+
int cmp_last = 0;
926+
927+
/*
928+
* We are frequently called during an iteration on a sorted
929+
* list of pathnames and while building a new index. Therefore,
930+
* there is a high probability that this entry will eventually
931+
* be appended to the index, rather than inserted in the middle.
932+
* If we can confirm that, we can avoid binary searches on the
933+
* components of the pathname.
934+
*
935+
* Compare the entry's full path with the last path in the index.
936+
*/
937+
if (istate->cache_nr > 0) {
938+
cmp_last = strcmp_offset(name,
939+
istate->cache[istate->cache_nr - 1]->name,
940+
&len_eq_last);
941+
if (cmp_last > 0) {
942+
if (len_eq_last == 0) {
943+
/*
944+
* The entry sorts AFTER the last one in the
945+
* index and their paths have no common prefix,
946+
* so there cannot be a F/D conflict.
947+
*/
948+
return retval;
949+
} else {
950+
/*
951+
* The entry sorts AFTER the last one in the
952+
* index, but has a common prefix. Fall through
953+
* to the loop below to disect the entry's path
954+
* and see where the difference is.
955+
*/
956+
}
957+
} else if (cmp_last == 0) {
958+
/*
959+
* The entry exactly matches the last one in the
960+
* index, but because of multiple stage and CE_REMOVE
961+
* items, we fall through and let the regular search
962+
* code handle it.
963+
*/
964+
}
965+
}
901966

902967
for (;;) {
903-
int len;
968+
size_t len;
904969

905970
for (;;) {
906971
if (*--slash == '/')
@@ -910,6 +975,67 @@ static int has_dir_name(struct index_state *istate,
910975
}
911976
len = slash - name;
912977

978+
if (cmp_last > 0) {
979+
/*
980+
* (len + 1) is a directory boundary (including
981+
* the trailing slash). And since the loop is
982+
* decrementing "slash", the first iteration is
983+
* the longest directory prefix; subsequent
984+
* iterations consider parent directories.
985+
*/
986+
987+
if (len + 1 <= len_eq_last) {
988+
/*
989+
* The directory prefix (including the trailing
990+
* slash) also appears as a prefix in the last
991+
* entry, so the remainder cannot collide (because
992+
* strcmp said the whole path was greater).
993+
*
994+
* EQ: last: xxx/A
995+
* this: xxx/B
996+
*
997+
* LT: last: xxx/file_A
998+
* this: xxx/file_B
999+
*/
1000+
return retval;
1001+
}
1002+
1003+
if (len > len_eq_last) {
1004+
/*
1005+
* This part of the directory prefix (excluding
1006+
* the trailing slash) is longer than the known
1007+
* equal portions, so this sub-directory cannot
1008+
* collide with a file.
1009+
*
1010+
* GT: last: xxxA
1011+
* this: xxxB/file
1012+
*/
1013+
return retval;
1014+
}
1015+
1016+
if (istate->cache_nr > 0 &&
1017+
ce_namelen(istate->cache[istate->cache_nr - 1]) > len) {
1018+
/*
1019+
* The directory prefix lines up with part of
1020+
* a longer file or directory name, but sorts
1021+
* after it, so this sub-directory cannot
1022+
* collide with a file.
1023+
*
1024+
* last: xxx/yy-file (because '-' sorts before '/')
1025+
* this: xxx/yy/abc
1026+
*/
1027+
return retval;
1028+
}
1029+
1030+
/*
1031+
* This is a possible collision. Fall through and
1032+
* let the regular search code handle it.
1033+
*
1034+
* last: xxx
1035+
* this: xxx/file
1036+
*/
1037+
}
1038+
9131039
pos = index_name_stage_pos(istate, name, len, stage);
9141040
if (pos >= 0) {
9151041
/*
@@ -1001,7 +1127,16 @@ static int add_index_entry_with_check(struct index_state *istate, struct cache_e
10011127

10021128
if (!(option & ADD_CACHE_KEEP_CACHE_TREE))
10031129
cache_tree_invalidate_path(istate, ce->name);
1004-
pos = index_name_stage_pos(istate, ce->name, ce_namelen(ce), ce_stage(ce));
1130+
1131+
/*
1132+
* If this entry's path sorts after the last entry in the index,
1133+
* we can avoid searching for it.
1134+
*/
1135+
if (istate->cache_nr > 0 &&
1136+
strcmp(ce->name, istate->cache[istate->cache_nr - 1]->name) > 0)
1137+
pos = -istate->cache_nr - 1;
1138+
else
1139+
pos = index_name_stage_pos(istate, ce->name, ce_namelen(ce), ce_stage(ce));
10051140

10061141
/* existing match? Just replace it. */
10071142
if (pos >= 0) {

t/helper/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
/test-sha1
2929
/test-sha1-array
3030
/test-sigchain
31+
/test-strcmp-offset
3132
/test-string-list
3233
/test-submodule-config
3334
/test-subprocess

t/helper/test-strcmp-offset.c

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
#include "cache.h"
2+
3+
int cmd_main(int argc, const char **argv)
4+
{
5+
int result;
6+
size_t offset;
7+
8+
if (!argv[1] || !argv[2])
9+
die("usage: %s <string1> <string2>", argv[0]);
10+
11+
result = strcmp_offset(argv[1], argv[2], &offset);
12+
13+
/*
14+
* Because differnt CRTs behave differently, only rely on signs
15+
* of the result values.
16+
*/
17+
result = (result < 0 ? -1 :
18+
result > 0 ? 1 :
19+
0);
20+
printf("%d %"PRIuMAX"\n", result, (uintmax_t)offset);
21+
return 0;
22+
}

t/perf/p0006-read-tree-checkout.sh

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
#!/bin/sh
2+
#
3+
# This test measures the performance of various read-tree
4+
# and checkout operations. It is primarily interested in
5+
# the algorithmic costs of index operations and recursive
6+
# tree traversal -- and NOT disk I/O on thousands of files.
7+
8+
test_description="Tests performance of read-tree"
9+
10+
. ./perf-lib.sh
11+
12+
test_perf_default_repo
13+
14+
# If the test repo was generated by ./repos/many-files.sh
15+
# then we know something about the data shape and branches,
16+
# so we can isolate testing to the ballast-related commits
17+
# and setup sparse-checkout so we don't have to populate
18+
# the ballast files and directories.
19+
#
20+
# Otherwise, we make some general assumptions about the
21+
# repo and consider the entire history of the current
22+
# branch to be the ballast.
23+
24+
test_expect_success "setup repo" '
25+
if git rev-parse --verify refs/heads/p0006-ballast^{commit}
26+
then
27+
echo Assuming synthetic repo from many-files.sh
28+
git branch br_base master
29+
git branch br_ballast p0006-ballast^
30+
git branch br_ballast_alias p0006-ballast^
31+
git branch br_ballast_plus_1 p0006-ballast
32+
git config --local core.sparsecheckout 1
33+
cat >.git/info/sparse-checkout <<-EOF
34+
/*
35+
!ballast/*
36+
EOF
37+
else
38+
echo Assuming non-synthetic repo...
39+
git branch br_base $(git rev-list HEAD | tail -n 1)
40+
git branch br_ballast HEAD^ || error "no ancestor commit from current head"
41+
git branch br_ballast_alias HEAD^
42+
git branch br_ballast_plus_1 HEAD
43+
fi &&
44+
git checkout -q br_ballast &&
45+
nr_files=$(git ls-files | wc -l)
46+
'
47+
48+
test_perf "read-tree br_base br_ballast ($nr_files)" '
49+
git read-tree -m br_base br_ballast -n
50+
'
51+
52+
test_perf "switch between br_base br_ballast ($nr_files)" '
53+
git checkout -q br_base &&
54+
git checkout -q br_ballast
55+
'
56+
57+
test_perf "switch between br_ballast br_ballast_plus_1 ($nr_files)" '
58+
git checkout -q br_ballast_plus_1 &&
59+
git checkout -q br_ballast
60+
'
61+
62+
test_perf "switch between aliases ($nr_files)" '
63+
git checkout -q br_ballast_alias &&
64+
git checkout -q br_ballast
65+
'
66+
67+
test_done

t/perf/repos/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
gen-*/

t/perf/repos/inflate-repo.sh

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
#!/bin/sh
2+
# Inflate the size of an EXISTING repo.
3+
#
4+
# This script should be run inside the worktree of a TEST repo.
5+
# It will use the contents of the current HEAD to generate a
6+
# commit containing copies of the current worktree such that the
7+
# total size of the commit has at least <target_size> files.
8+
#
9+
# Usage: [-t target_size] [-b branch_name]
10+
11+
set -e
12+
13+
target_size=10000
14+
branch_name=p0006-ballast
15+
ballast=ballast
16+
17+
while test "$#" -ne 0
18+
do
19+
case "$1" in
20+
-b)
21+
shift;
22+
test "$#" -ne 0 || { echo 'error: -b requires an argument' >&2; exit 1; }
23+
branch_name=$1;
24+
shift ;;
25+
-t)
26+
shift;
27+
test "$#" -ne 0 || { echo 'error: -t requires an argument' >&2; exit 1; }
28+
target_size=$1;
29+
shift ;;
30+
*)
31+
echo "error: unknown option '$1'" >&2; exit 1 ;;
32+
esac
33+
done
34+
35+
git ls-tree -r HEAD >GEN_src_list
36+
nr_src_files=$(cat GEN_src_list | wc -l)
37+
38+
src_branch=$(git symbolic-ref --short HEAD)
39+
40+
echo "Branch $src_branch initially has $nr_src_files files."
41+
42+
if test $target_size -le $nr_src_files
43+
then
44+
echo "Repository already exceeds target size $target_size."
45+
rm GEN_src_list
46+
exit 1
47+
fi
48+
49+
# Create well-known branch and add 1 file change to start
50+
# if off before the ballast.
51+
git checkout -b $branch_name HEAD
52+
echo "$target_size" > inflate-repo.params
53+
git add inflate-repo.params
54+
git commit -q -m params
55+
56+
# Create ballast for in our branch.
57+
copy=1
58+
nr_files=$nr_src_files
59+
while test $nr_files -lt $target_size
60+
do
61+
sed -e "s| | $ballast/$copy/|" <GEN_src_list |
62+
git update-index --index-info
63+
64+
nr_files=$(expr $nr_files + $nr_src_files)
65+
copy=$(expr $copy + 1)
66+
done
67+
rm GEN_src_list
68+
git commit -q -m "ballast"
69+
70+
# Modify 1 file and commit.
71+
echo "$target_size" >> inflate-repo.params
72+
git add inflate-repo.params
73+
git commit -q -m "ballast plus 1"
74+
75+
nr_files=$(git ls-files | wc -l)
76+
77+
# Checkout master to put repo in canonical state (because
78+
# the perf test may need to clone and enable sparse-checkout
79+
# before attempting to checkout a commit with the ballast
80+
# (because it may contain 100K directories and 1M files)).
81+
git checkout $src_branch
82+
83+
echo "Repository inflated. Branch $branch_name has $nr_files files."
84+
85+
exit 0

0 commit comments

Comments
 (0)