Skip to content

Commit 79d4875

Browse files
committed
Avoid redundant rsync operations
In non-cluster environments in a high-availability configuration, there is only one target server to restore repository data onto. However, the current implementation performs one rsync task per node in the replication network. In a setup with one primary instance and two passive replicas, this amounts to three rsync tasks with identical or almost identical file lists. Only the rsync task that transfers repository data from the backup snapshot onto the target server is needed; the additional per-replica rsync operations are unnecessary. Avoiding these redundant rsync tasks reduces the runtime of ghe-restore-repositories by about 12% for a customer with roughly 1 TB of repository data. With GHE_PARALLEL_ENABLED=yes configured, this change also prevents a race condition, which could occur when multiple rsync processes try to write to the same file on the target server at the same time.
1 parent 11c0e5f commit 79d4875

File tree

1 file changed

+42
-35
lines changed

1 file changed

+42
-35
lines changed

share/github-backup-utils/ghe-restore-repositories

Lines changed: 42 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -102,42 +102,49 @@ done > $tmp_list
102102
IFS=$OLDIFS
103103
bm_end "$(basename $0) - Building network list"
104104

105-
# The server returns a list of routes:
106-
#
107-
# a/nw/a8/3f/02/100000855 dgit-node1 dgit-node2 dgit-node3
108-
# a/nw/a8/bc/8d/100000880 dgit-node1 dgit-node2 dgit-node4
109-
# a/nw/a5/06/81/100000659 dgit-node3 dgit-node2 dgit-node4
110-
# ...
111-
#
112-
# One route per line.
113-
#
114-
# NOTE: The route generation is performed on the appliance as it is considerably
115-
# more performant than performing over an SSH pipe.
116-
#
117-
bm_start "$(basename $0) - Transferring network list"
118-
cat $tmp_list | ghe-ssh "$GHE_HOSTNAME" -- sponge $remote_tmp_list
119-
cat $tmp_list | ghe_debug
120-
bm_end "$(basename $0) - Transferring network list"
121-
122-
bm_start "$(basename $0) - Generating routes"
123-
restore_routes_script="github-env ./bin/dgit-cluster-restore-routes"
124-
if ghe-ssh "$GHE_HOSTNAME" test -e /usr/local/share/enterprise/ghe-restore-network-routes; then
125-
restore_routes_script="/usr/local/share/enterprise/ghe-restore-network-routes"
105+
if $CLUSTER; then
106+
# The server returns a list of routes:
107+
#
108+
# a/nw/a8/3f/02/100000855 dgit-node1 dgit-node2 dgit-node3
109+
# a/nw/a8/bc/8d/100000880 dgit-node1 dgit-node2 dgit-node4
110+
# a/nw/a5/06/81/100000659 dgit-node3 dgit-node2 dgit-node4
111+
# ...
112+
#
113+
# One route per line.
114+
#
115+
# NOTE: The route generation is performed on the appliance as it is considerably
116+
# more performant than performing over an SSH pipe.
117+
#
118+
bm_start "$(basename $0) - Transferring network list"
119+
cat $tmp_list | ghe-ssh "$GHE_HOSTNAME" -- sponge $remote_tmp_list
120+
cat $tmp_list | ghe_debug
121+
bm_end "$(basename $0) - Transferring network list"
122+
123+
bm_start "$(basename $0) - Generating routes"
124+
restore_routes_script="github-env ./bin/dgit-cluster-restore-routes"
125+
if ghe-ssh "$GHE_HOSTNAME" test -e /usr/local/share/enterprise/ghe-restore-network-routes; then
126+
restore_routes_script="/usr/local/share/enterprise/ghe-restore-network-routes"
127+
fi
128+
echo "cat $remote_tmp_list | $restore_routes_script | grep 'git-server-' > $remote_routes_list" | ghe-ssh "$GHE_HOSTNAME" -- /bin/bash
129+
ghe-ssh "$GHE_HOSTNAME" -- cat $remote_routes_list | ghe_debug
130+
bm_end "$(basename $0) - Generating routes"
131+
132+
bm_start "$(basename $0) - Fetching routes"
133+
ghe-ssh "$GHE_HOSTNAME" -- gzip -c $remote_routes_list | gzip -d > $routes_list
134+
cat $routes_list | ghe_debug
135+
bm_end "$(basename $0) - Fetching routes"
136+
137+
bm_start "$(basename $0) - Processing routes"
138+
139+
cat $routes_list | awk -v tempdir="$tempdir" '{ for(i=2;i<=NF;i++){ print $1 > (tempdir"/"$i".rsync") }}'
140+
cat $routes_list | awk '{ n = split($1, p, "/"); printf p[n] " /data/repositories/" $1; $1=""; print $0}' > $to_restore
141+
ghe_debug "\n$(find "$tempdir" -maxdepth 1 -name '*.rsync')"
142+
bm_end "$(basename $0) - Processing routes"
143+
else
144+
# In noncluster setups, the primary instance owns all repository networks, so all network paths
145+
# are to be synchronized to the primary instance.
146+
cp "$tmp_list" "$tempdir/git-server-primary.rsync"
126147
fi
127-
echo "cat $remote_tmp_list | $restore_routes_script | grep 'git-server-' > $remote_routes_list" | ghe-ssh "$GHE_HOSTNAME" -- /bin/bash
128-
ghe-ssh "$GHE_HOSTNAME" -- cat $remote_routes_list | ghe_debug
129-
bm_end "$(basename $0) - Generating routes"
130-
131-
bm_start "$(basename $0) - Fetching routes"
132-
ghe-ssh "$GHE_HOSTNAME" -- gzip -c $remote_routes_list | gzip -d > $routes_list
133-
cat $routes_list | ghe_debug
134-
bm_end "$(basename $0) - Fetching routes"
135-
136-
bm_start "$(basename $0) - Processing routes"
137-
cat $routes_list | awk -v tempdir="$tempdir" '{ for(i=2;i<=NF;i++){ print $1 > (tempdir"/"$i".rsync") }}'
138-
cat $routes_list | awk '{ n = split($1, p, "/"); printf p[n] " /data/repositories/" $1; $1=""; print $0}' > $to_restore
139-
ghe_debug "\n$(find "$tempdir" -maxdepth 1 -name '*.rsync')"
140-
bm_end "$(basename $0) - Processing routes"
141148

142149
if [ -z "$(find "$tempdir" -maxdepth 1 -name '*.rsync')" ]; then
143150
log_warn "Warning: no routes found, skipping repositories restore ..."

0 commit comments

Comments
 (0)