|
1 | 1 | #!/usr/bin/perl
|
2 | 2 | #
|
3 |
| -# Purge files from Git repositories. |
4 |
| -# |
5 |
| -# Attention: |
6 |
| -# You want to run this script on a case sensitive file-system (e.g. |
7 |
| -# ext4 on Linux). Otherwise the resulting Git repository will not |
8 |
| -# contain changes that modify the casing of file paths. |
9 |
| -# |
10 |
| -# Usage: |
11 |
| -# git-purge-files [path-regex1] [path-regex2] ... |
12 |
| -# |
13 |
| -# Examples: |
14 |
| -# Remove the file "test.bin" from all directories: |
15 |
| -# git-purge-path "/test.bin$" |
16 |
| -# |
17 |
| -# Remove all "*.bin" files from all directories: |
18 |
| -# git-purge-path "\.bin$" |
19 |
| -# |
20 |
| -# Remove all files in the "/foo" directory: |
21 |
| -# git-purge-path "^/foo/$" |
22 |
| -# |
23 |
| -# Author: Lars Schneider, https://github.com/larsxschneider |
| 3 | +# Purge files from Git repositories |
24 | 4 | #
|
25 | 5 |
|
| 6 | +use 5.010; |
26 | 7 | use strict;
|
27 | 8 | use warnings;
|
| 9 | +use version; |
| 10 | +use Getopt::Std; |
| 11 | +use File::Temp qw/ tempdir /; |
| 12 | + |
| 13 | +sub usage() { |
| 14 | + print STDERR <<END; |
| 15 | +NAME |
| 16 | + git-purge-files - Purge files from Git repositories |
| 17 | +
|
| 18 | +SYNOPSIS |
| 19 | + git-purge-files [-c] [-d] [-h] [<path-regex>] ... |
| 20 | +
|
| 21 | +
|
| 22 | +DESCRIPTION |
| 23 | + This command purges files from a Git history by rewriting all |
| 24 | + commits. Please note that this changes all commit hashes in the |
| 25 | + history and therefore all branches and tags. |
| 26 | +
|
| 27 | + You want to run this script on a case sensitive file-system (e.g. |
| 28 | + ext4 on Linux). Otherwise the resulting Git repository will not |
| 29 | + contain changes that modify the casing of file paths. |
| 30 | +
|
| 31 | +OPTIONS |
| 32 | + <path-regex>... |
| 33 | + A list of regular expressions that defines what files should |
| 34 | + be purged from the history. Use a `/` to anchor a path to the |
| 35 | + root of the repository. |
| 36 | +
|
| 37 | + -c |
| 38 | + Run in checking mode. The script will run the underlying |
| 39 | + `git fast-export | git fast-import` command without any |
| 40 | + modifications to the data stream. Afterwards the input |
| 41 | + repository is compared against the output repository. |
| 42 | +
|
| 43 | + For large repositories we recommend running this script in |
| 44 | + checking mode (-c) first in order to determine if it can |
| 45 | + run in the much faster diff mode (-d). |
| 46 | +
|
| 47 | + ATTENTION: Although we run a check here, the repository |
| 48 | + under test is rewritten and potentially modified! |
| 49 | +
|
| 50 | + -d |
| 51 | + Enable diff mode. This makes the underlying `git fast-export` |
| 52 | + output only the file differences between two commits. This |
| 53 | + mode is quicker but more error prone. It is not recommended |
| 54 | + for production use. |
| 55 | +
|
| 56 | + See examples for potential problems here: |
| 57 | + https://public-inbox.org/git/CABPp-BFLJ48BZ97Y9mr4i3q7HMqjq18cXMgSYdxqD1cMzH8Spg\@mail.gmail.com/ |
| 58 | +
|
| 59 | + -h |
| 60 | + This help. |
28 | 61 |
|
29 |
| -my $path_regex = join( "|", @ARGV ); |
| 62 | +EXAMPLES |
| 63 | + o Remove the file "test.bin" from all directories: |
30 | 64 |
|
31 |
| -open( my $pipe_in, "git fast-export --progress=10000 --no-data --all --signed-tags=warn-strip --tag-of-filtered-object=rewrite |" ) or die $!; |
32 |
| -open( my $pipe_out, "| git fast-import --force --quiet" ) or die $!; |
| 65 | + \$ git-purge-files "/test.bin\$" |
| 66 | +
|
| 67 | + o Remove all "*.bin" files from all directories: |
| 68 | +
|
| 69 | + \$ git-purge-files "\\.bin\$" |
| 70 | +
|
| 71 | + o Remove all files in the "/foo" directory: |
| 72 | +
|
| 73 | + \$ git-purge-files "^/foo/" |
| 74 | +END |
| 75 | + exit(1); |
| 76 | +} |
| 77 | + |
| 78 | +our($opt_h, $opt_d, $opt_c); |
| 79 | +getopts("hdc") or usage(); |
| 80 | +usage if $opt_h; |
| 81 | + |
| 82 | +my ($git_version) = `git --version` =~ /([0-9]+([.][0-9]+)+)/; |
| 83 | + |
| 84 | +my $export_opts = "--all --no-data --progress=1000 --signed-tags=warn-strip --tag-of-filtered-object=rewrite --use-done-feature"; |
| 85 | +$export_opts .= " --reencode=no" if (version->parse($git_version) ge version->parse('2.23.0')); |
| 86 | +$export_opts .= " --full-tree" if (not $opt_d); |
| 87 | + |
| 88 | +print $export_opts; |
| 89 | + |
| 90 | +my $import_opts = "--done --force --quiet"; |
| 91 | + |
| 92 | +if ($opt_c) { |
| 93 | + say "Checking 'git fast-export | git fast-import' pipeline... "; |
| 94 | + |
| 95 | + # Print the changed files, author, committer, branches, and commit message |
| 96 | + # for every commit of the Git repository. We intentionally do not output |
| 97 | + # and compare any hashes here as commit and tree hashes can change due to |
| 98 | + # slightly different object serialization methods in older Git clients. |
| 99 | + # E.g. directories have been encoded as 40000 instead of 04000 for a brief |
| 100 | + # period in ~2009 and "git fast-export | git fast-import" would fix that |
| 101 | + # which would lead to different hashes. |
| 102 | + my $git_log = "git log --all --numstat --full-history --format='%nauthor: %an <%ae> %at%ncommitter: %cn <%ce> %ct%nbranch: %S%nbody: %B%n%n---' --no-renames"; |
| 103 | + my $tmp = tempdir('git-purge-files-XXXXX', TMPDIR => 1); |
| 104 | + |
| 105 | + if ( |
| 106 | + system("$git_log > $tmp/expected") or |
| 107 | + system("git fast-export $export_opts | git fast-import $import_opts") or |
| 108 | + system("$git_log > $tmp/result") or |
| 109 | + system("diff $tmp/expected $tmp/result") |
| 110 | + ) { |
| 111 | + say ""; |
| 112 | + say "Failure! Rewriting the repository with `git-purge-files` might alter the history."; |
| 113 | + say "Inspect the following files to review the difference:"; |
| 114 | + say " - $tmp/expected"; |
| 115 | + say " - $tmp/result"; |
| 116 | + say "Try to omit the `-d` option!" if ($opt_d); |
| 117 | + exit 1; |
| 118 | + } else { |
| 119 | + say "Success!"; |
| 120 | + exit 0; |
33 | 121 |
|
34 |
| -LOOP: while ( my $cmd = <$pipe_in> ) { |
35 |
| - my $data = ""; |
36 |
| - if ( $cmd =~ /^data ([0-9]+)$/ ) { |
37 |
| - # skip data blocks |
38 |
| - my $skip_bytes = $1; |
39 |
| - read($pipe_in, $data, $skip_bytes); |
40 | 122 | }
|
41 |
| - elsif ( $cmd =~ /^M [0-9]{6} [0-9a-f]{40} (.+)$/ ) { |
42 |
| - my $pathname = $1; |
43 |
| - next LOOP if ("/" . $pathname) =~ /$path_regex/o |
| 123 | +} else { |
| 124 | + say "Purging files...\n"; |
| 125 | + |
| 126 | + exit 0 if (@ARGV == 0); |
| 127 | + my $path_regex = join( "|", @ARGV ); |
| 128 | + my $start_time = time; |
| 129 | + |
| 130 | + open( my $pipe_in, "git fast-export $export_opts |" ) or die $!; |
| 131 | + open( my $pipe_out, "| git fast-import $import_opts" ) or die $!; |
| 132 | + |
| 133 | + LOOP: while ( my $cmd = <$pipe_in> ) { |
| 134 | + my $data = ""; |
| 135 | + if ( $cmd =~ /^data ([0-9]+)$/ ) { |
| 136 | + # skip data blocks |
| 137 | + my $skip_bytes = $1; |
| 138 | + read($pipe_in, $data, $skip_bytes); |
| 139 | + } |
| 140 | + elsif ( $cmd =~ /^M [0-9]{6} [0-9a-f]{40} (.+)$/ ) { |
| 141 | + my $pathname = $1; |
| 142 | + next LOOP if ("/" . $pathname) =~ /$path_regex/o |
| 143 | + } |
| 144 | + print {$pipe_out} $cmd . $data; |
44 | 145 | }
|
45 |
| - print {$pipe_out} $cmd . $data; |
| 146 | + |
| 147 | + my $duration = time - $start_time; |
| 148 | + say "Done! Execution time: $duration s"; |
46 | 149 | }
|
0 commit comments