Run uniq | sort | uniq

Patrick Reynolds · Patrick Reynolds · commit 239e9927eed4 · 2020-01-30T22:38:44.000-05:00
The data we're sorting has clusters of duplicates in the input, because
`dirname` reduces all repos in the same network (i.e., forks) to the same
network path.  Running `uniq` before `sort` eliminates those duplicates,
which means `sort` requires less CPU and RAM to do its thing.

We still need `uniq` on the output end, because `uniq` only collapses
adjacent duplicate lines and there's no guarantee that all duplicates in
the input are clustered.

I've run tests, and the cost of `uniq` is small enough that it does no
harm if the input has no duplicates at all.
diff --git a/share/github-backup-utils/ghe-backup-repositories b/share/github-backup-utils/ghe-backup-repositories
@@ -366,8 +366,8 @@ bm_end "$(basename $0) - Special Data Directories Sync"
 
 if [ -z "$GHE_SKIP_ROUTE_VERIFICATION" ]; then
   bm_start "$(basename $0) - Verifying Routes"
-  cat $tempdir/*.rsync | sort | uniq > $tempdir/source_routes
-  (cd $backup_dir/ && find * -mindepth 5 -maxdepth 6 -type d -name \*.git | fix_paths_for_ghe_version | sort | uniq) > $tempdir/destination_routes
+  cat $tempdir/*.rsync | uniq | sort | uniq > $tempdir/source_routes
+  (cd $backup_dir/ && find * -mindepth 5 -maxdepth 6 -type d -name \*.git | fix_paths_for_ghe_version | uniq | sort | uniq) > $tempdir/destination_routes
 
   git --no-pager diff --unified=0 --no-prefix -- $tempdir/source_routes $tempdir/destination_routes || echo "Warning: One or more repository networks and/or gists were not found on the source appliance. Please contact GitHub Enterprise Support for assistance."
 
diff --git a/share/github-backup-utils/ghe-backup-storage b/share/github-backup-utils/ghe-backup-storage
@@ -141,8 +141,8 @@ bm_end "$(basename $0) - Storage object sync"
 if [ -z "$GHE_SKIP_ROUTE_VERIFICATION" ]; then
   bm_start "$(basename $0) - Verifying Routes"
 
-  cat $tempdir/*.rsync | sort | uniq > $tempdir/source_routes
-  (cd $backup_dir/ && find * -mindepth 3 -maxdepth 3 -type f -print | sort | uniq) > $tempdir/destination_routes
+  cat $tempdir/*.rsync | uniq | sort | uniq > $tempdir/source_routes
+  (cd $backup_dir/ && find * -mindepth 3 -maxdepth 3 -type f -print | uniq | sort | uniq) > $tempdir/destination_routes
 
   git --no-pager diff --unified=0 --no-prefix -- $tempdir/source_routes $tempdir/destination_routes || echo "Warning: One or more storage objects were not found on the source appliance. Please contact GitHub Enterprise Support for assistance."