Skip to content

Commit 6596df8

Browse files
Merge pull request #254 from github/ls/improve-purge
improve git-purge-files script
2 parents 22f02f2 + 2cd1b3d commit 6596df8

File tree

2 files changed

+208
-34
lines changed

2 files changed

+208
-34
lines changed

scripts/git-purge-files

Lines changed: 137 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,46 +1,149 @@
11
#!/usr/bin/perl
22
#
3-
# Purge files from Git repositories.
4-
#
5-
# Attention:
6-
# You want to run this script on a case sensitive file-system (e.g.
7-
# ext4 on Linux). Otherwise the resulting Git repository will not
8-
# contain changes that modify the casing of file paths.
9-
#
10-
# Usage:
11-
# git-purge-files [path-regex1] [path-regex2] ...
12-
#
13-
# Examples:
14-
# Remove the file "test.bin" from all directories:
15-
# git-purge-path "/test.bin$"
16-
#
17-
# Remove all "*.bin" files from all directories:
18-
# git-purge-path "\.bin$"
19-
#
20-
# Remove all files in the "/foo" directory:
21-
# git-purge-path "^/foo/$"
22-
#
23-
# Author: Lars Schneider, https://github.com/larsxschneider
3+
# Purge files from Git repositories
244
#
255

6+
use 5.010;
267
use strict;
278
use warnings;
9+
use version;
10+
use Getopt::Std;
11+
use File::Temp qw/ tempdir /;
12+
13+
# Print the manual page to STDERR and terminate the script.
#
# Takes no arguments and never returns: the process always exits with
# status 1, no matter whether the help was requested explicitly (-h) or
# shown because option parsing failed.
sub usage() {
    # The heredoc below interpolates. Every literal `$`, `@` and `\` of the
    # help text therefore has to be escaped; an unescaped `$"` would be
    # replaced by the list separator and `\.` would lose its backslash.
    print STDERR <<END;
NAME
    git-purge-files - Purge files from Git repositories

SYNOPSIS
    git-purge-files [-c] [-d] [-h] [<path-regex>] ...


DESCRIPTION
    This command purges files from a Git history by rewriting all
    commits. Please note that this changes all commit hashes in the
    history and therefore all branches and tags.

    You want to run this script on a case sensitive file-system (e.g.
    ext4 on Linux). Otherwise the resulting Git repository will not
    contain changes that modify the casing of file paths.

OPTIONS
    <path-regex>...
        A list of regular expressions that defines what files should
        be purged from the history. Use a `/` to anchor a path to the
        root of the repository.

    -c
        Run in checking mode. The script will run the underlying
        `git fast-export | git fast-import` command without any
        modifications to the data stream. Afterwards the input
        repository is compared against the output repository.

        For large repositories we recommend running this script in
        checking mode (-c) first in order to determine if it can
        run in the much faster diff mode (-d).

        ATTENTION: Although we run a check here, the repository
        under test is rewritten and potentially modified!

    -d
        Enable diff mode. This makes the underlying `git fast-export`
        output only the file differences between two commits. This
        mode is quicker but more error prone. It is not recommended
        in production usage.

        See examples for potential problems here:
        https://public-inbox.org/git/CABPp-BFLJ48BZ97Y9mr4i3q7HMqjq18cXMgSYdxqD1cMzH8Spg\@mail.gmail.com/

    -h
        This help.

EXAMPLES
    o Remove the file "test.bin" from all directories:

        \$ git-purge-files "/test.bin\$"

    o Remove all "*.bin" files from all directories:

        \$ git-purge-files "\\.bin\$"

    o Remove all files in the "/foo" directory:

        \$ git-purge-files "^/foo/"
END
    exit(1);
}
77+
78+
# Command line flags, filled in by getopts():
#   -h  print the help, -d  diff mode, -c  checking mode.
our ($opt_h, $opt_d, $opt_c);
getopts("hdc") or usage();
usage() if $opt_h;

# Pick the dotted version number out of the `git --version` output.
my ($git_version) = `git --version` =~ /([0-9]+(?:[.][0-9]+)+)/;
die "Unable to determine the Git version. Is git installed and in the PATH?\n"
    unless defined $git_version;

my $export_opts = "--all --no-data --progress=1000 --signed-tags=warn-strip --tag-of-filtered-object=rewrite --use-done-feature";
# --reencode is only passed to Git 2.23.0 or newer.
$export_opts .= " --reencode=no" if version->parse($git_version) >= version->parse('2.23.0');
# Unless diff mode (-d) was requested, export the full tree of every commit.
$export_opts .= " --full-tree" if not $opt_d;

my $import_opts = "--done --force --quiet";

if ($opt_c) {
    say "Checking 'git fast-export | git fast-import' pipeline... ";

    # Print the changed files, author, committer, branches, and commit message
    # for every commit of the Git repository. We intentionally do not output
    # and compare any hashes here as commit and tree hashes can change due to
    # slightly different object serialization methods in older Git clients.
    # E.g. directories have been encoded as 40000 instead of 04000 for a brief
    # period in ~2009 and "git fast-export | git fast-import" would fix that
    # which would lead to different hashes.
    my $git_log = "git log --all --numstat --full-history --format='%nauthor: %an <%ae> %at%ncommitter: %cn <%ce> %ct%nbranch: %S%nbody: %B%n%n---' --no-renames";

    # The directory is deliberately not cleaned up: on failure the user is
    # pointed to the files in it.
    my $tmp = tempdir('git-purge-files-XXXXX', TMPDIR => 1);

    # system() returns a non-zero value on failure, hence the first failing
    # step short-circuits the chain and selects the failure branch.
    if (
        system("$git_log > $tmp/expected") or
        system("git fast-export $export_opts | git fast-import $import_opts") or
        system("$git_log > $tmp/result") or
        system("diff $tmp/expected $tmp/result")
    ) {
        say "";
        say "Failure! Rewriting the repository with `git-purge-files` might alter the history.";
        say "Inspect the following files to review the difference:";
        say "  - $tmp/expected";
        say "  - $tmp/result";
        say "Try to omit the `-d` option!" if ($opt_d);
        exit 1;
    } else {
        say "Success!";
        exit 0;
    }
} else {
    say "Purging files...\n";

    # Without any path regex there is nothing to purge.
    exit 0 if (@ARGV == 0);

    # All given expressions are combined into one alternation that is
    # compiled once, up front.
    my $joined_regex = join( "|", @ARGV );
    my $path_regex   = qr/$joined_regex/;
    my $start_time   = time;

    open( my $pipe_in, "-|", "git fast-export $export_opts" )
        or die "Cannot start git fast-export: $!\n";
    open( my $pipe_out, "|-", "git fast-import $import_opts" )
        or die "Cannot start git fast-import: $!\n";

    # The stream carries byte counts, so it must be treated as raw bytes.
    binmode($pipe_in);
    binmode($pipe_out);

    LOOP: while ( my $cmd = <$pipe_in> ) {
        my $data = "";
        if ( $cmd =~ /^data ([0-9]+)$/ ) {
            # Copy data blocks verbatim; their content must never be
            # interpreted as stream commands.
            my $skip_bytes = $1;
            my $got = read($pipe_in, $data, $skip_bytes);
            die "Truncated data block in git fast-export stream\n"
                unless defined $got and $got == $skip_bytes;
        }
        elsif ( $cmd =~ /^M [0-9]{6} [0-9a-f]{40} (.+)$/ ) {
            # Drop the file modification if its path (with a leading slash
            # that marks the repository root) matches a purge expression.
            my $pathname = $1;
            next LOOP if ("/" . $pathname) =~ $path_regex;
        }
        print {$pipe_out} $cmd . $data
            or die "Cannot write to git fast-import: $!\n";
    }

    # close() on a pipe waits for the child process and reports its exit
    # status, so failures of either command are not silently ignored and
    # the import is finished before success is reported.
    close($pipe_in)
        or die "git fast-export failed (exit status " . ($? >> 8) . ")\n";
    close($pipe_out)
        or die "git fast-import failed (exit status " . ($? >> 8) . ")\n";

    my $duration = time - $start_time;
    say "Done! Execution time: $duration s";
}
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
#!/usr/bin/env bash
2+
3+
out=/dev/null
4+
5+
# Run a command and abort the test script if the command fails.
#
# Arguments: the command line to run. All arguments are joined with spaces
# and evaluated with `eval`; stdout of the command is redirected to $out
# (/dev/null if $out is unset or empty).
#
# On failure "FAILURE: <script name>: <command>" is printed and the shell
# exits with status 1. On success the function returns normally.
function test_expect_success {
    if ! eval "$* >${out:-/dev/null}"; then
        # Apply basename to the script path only. The command may contain
        # slashes, which would otherwise strip the script name from the
        # message.
        echo "FAILURE: $(basename "${BASH_SOURCE[0]}"): $*"
        exit 1
    fi
}
11+
12+
# Run a command that is expected to fail and abort the test script if the
# command succeeds.
#
# Arguments: the command line to run. All arguments are joined with spaces
# and evaluated with `eval`; stdout of the command is redirected to $out
# (/dev/null if $out is unset or empty).
#
# If the command succeeds, "SUCCESS although FAILURE expected: <script
# name>: <command>" is printed and the shell exits with status 1. If the
# command fails the function returns normally.
function test_expect_failure {
    if eval "$* >${out:-/dev/null}"; then
        # Apply basename to the script path only. The command may contain
        # slashes, which would otherwise strip the script name from the
        # message.
        echo "SUCCESS although FAILURE expected: $(basename "${BASH_SOURCE[0]}"): $*"
        exit 1
    fi
}
18+
19+
# Directory that contains the git-purge-files script (the parent of the
# directory this test script lives in).
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")/..">/dev/null && pwd)"

# An empty $script_dir would turn $test_dir into "/tmp", which is removed
# recursively below. Refuse to continue in that case.
if [ -z "$script_dir" ]; then
    echo "FAILURE: unable to determine the script directory"
    exit 1
fi
test_dir="$script_dir/tmp"

# Create a fresh repository in $test_dir and change into it (via pushd).
#
# The history consists of three commits: a directory "foo" with a file, a
# symlink "bar" pointing to "foo", and finally "bar" replaced by a real
# directory with a file of the same name as the one in "foo".
function create_test_repo {
    rm -rf "$test_dir"
    mkdir "$test_dir"
    # Never run the git commands below outside of the test directory.
    pushd "$test_dir" >/dev/null || exit 1

    git init -q .

    mkdir foo
    echo "foo" >foo/baz
    git add .
    git commit -qm "add foo dir with file"

    ln -s foo bar
    git add .
    git commit -qm "add bar dir as link"

    rm bar
    mkdir bar
    echo "bar" >bar/baz
    git add .
    git commit -qm "remove link and make bar dir real"
}

# The check is expected to pass without diff mode.
create_test_repo
test_expect_success ../git-purge-files -c
popd >/dev/null

# The check is expected to fail in diff mode (-d),
# see https://public-inbox.org/git/CABPp-BFLJ48BZ97Y9mr4i3q7HMqjq18cXMgSYdxqD1cMzH8Spg@mail.gmail.com/
create_test_repo
test_expect_failure ../git-purge-files -c -d
popd >/dev/null

0 commit comments

Comments
 (0)