211 changes: 211 additions & 0 deletions bin/backup.pl
@@ -0,0 +1,211 @@
use strict;
use warnings;
use v5.36;

use feature qw< state >;
use Getopt::Long;
use MetaCPAN::Logger qw< :log :dlog >;
use Cpanel::JSON::XS qw< decode_json encode_json >;
use DateTime ();
use IO::Zlib ();
use Path::Tiny qw< path >;
use Try::Tiny qw< catch try >;

use MetaCPAN::ES;
use MetaCPAN::Ingest qw< home >;

# config

# args
my $batch_size = 100;
my $size       = 1000;
my $index      = "cpan";

my ( $type, $purge, $dry_run, $restore );
GetOptions(
    "batch_size=i" => \$batch_size,
    "purge"        => \$purge,
    "dry_run"      => \$dry_run,
    "size=i"       => \$size,
    "index=s"      => \$index,
    "type=s"       => \$type,
    "restore=s"    => \$restore,
);

# setup
my $home = path( home() );

run_restore() if $restore;
run_purge()   if $purge;
run_backup()  unless $restore or $purge;

1;

###

sub run_restore () {
    my $restore_path = path($restore);
    $restore_path->exists or die "$restore doesn't exist\n";

    log_info { 'Restoring from ', $restore_path };

    my $fh = IO::Zlib->new( $restore_path->stringify, 'rb' );

    my %es_store;
    my %bulk_store;

    while ( my $line = $fh->readline ) {
        state $line_count = 0;
        ++$line_count;

        my $raw;
        try { $raw = decode_json($line) }
        catch {
            log_warn {"cannot decode JSON: $line --- $_"};
        };
        next unless defined $raw;

        # Create our bulk helper if we need one; in case a backup has
        # mixed _index or _type values, create a new bulk helper for
        # each combination.
        my $key = $raw->{_index} . $raw->{_type};

        $es_store{$key} ||= MetaCPAN::ES->new(
            index => $raw->{_index},
            type  => $raw->{_type},
        );
        my $es = $es_store{$key};

        $bulk_store{$key} ||= $es->bulk( max_count => $batch_size );
        my $bulk = $bulk_store{$key};

        my $parent = $raw->{fields}->{_parent};

        if ( $raw->{_type} eq 'author' ) {

            # Hack for dodgy lat / lon's
            if ( my $loc = $raw->{_source}->{location} ) {
                my $lat = $loc->[1];
                my $lon = $loc->[0];

                if ( $lat > 90 or $lat < -90 ) {

                    # Invalid latitude
                    delete $raw->{_source}->{location};
                }
                elsif ( $lon > 180 or $lon < -180 ) {

                    # Invalid longitude
                    delete $raw->{_source}->{location};
                }
            }
        }

        if ( $es->exists( id => $raw->{_id} ) ) {
            $bulk->update( {
                id            => $raw->{_id},
                doc           => $raw->{_source},
                doc_as_upsert => 1,
            } );
        }
        else {
            $bulk->create( {
                id => $raw->{_id},
                $parent ? ( parent => $parent ) : (),
                source => $raw->{_source},
            } );
        }
    }

    # Flush anything left over, just in case, then refresh so the
    # restored documents become visible.
    $_->flush         for values %bulk_store;
    $_->index_refresh for values %es_store;

    log_info {'done'};
}

sub run_purge () {
    my $now    = DateTime->now;
    my $backup = $home->child(qw< var backup >);

    $backup->visit(
        sub {
            my $file = shift;
            return if $file->is_dir;

            my $mtime = DateTime->from_epoch( epoch => $file->stat->mtime );

            # keep a daily backup for one week
            return if $mtime > $now->clone->subtract( days => 7 );

            # after that keep weekly backups
            if ( $mtime->clone->truncate( to => 'week' )
                != $mtime->clone->truncate( to => 'day' ) )
            {
                log_info {"Removing old backup $file"};
                return log_info {'Not (dry run)'} if $dry_run;
                $file->remove;
            }
        },
        { recurse => 1 }
    );
}

sub run_backup () {
    my $filename = join( '-',
        DateTime->now->strftime('%F'),
        grep {defined} $index, $type );

    my $file = $home->child( qw< var backup >, "$filename.json.gz" );
    $file->parent->mkpath unless -e $file->parent;

    my $fh = IO::Zlib->new( "$file", 'wb4' );

    my $es = MetaCPAN::ES->new(
        index => $index,
        ( $type ? ( type => $type ) : () )
    );
    my $scroll = $es->scroll(
        size   => $size,
        fields => [qw< _parent _source >],
        scroll => '1m',
    );

    log_info { 'Backing up ', $scroll->total, ' documents' };

    while ( my $result = $scroll->next ) {
        print $fh encode_json($result), $/;
    }

    close $fh;
    log_info {'done'};
}

__END__

=head1 NAME

MetaCPAN::Script::Backup - Backup indices and types

=head1 SYNOPSIS

    $ bin/backup.pl --index user --type account

    $ bin/backup.pl --purge

    $ bin/backup.pl --restore path

=head1 DESCRIPTION

Creates C<.json.gz> files in C<var/backup>. These files contain
one record per line.

=head2 purge

Purges old backups. Daily backups are kept for one week; after that,
only backups whose modification time falls on the start of a week are
kept (weekly backups).
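
Since each backup file stores one JSON document per line, a dump can be
inspected or post-processed without going through the restore path. A
minimal sketch, assuming a hypothetical dump at
C<var/backup/2024-01-01-cpan.json.gz> (the pattern C<run_backup> produces):

    use strict;
    use warnings;
    use Cpanel::JSON::XS qw< decode_json >;
    use IO::Zlib ();

    # Hypothetical file; substitute any real dump under var/backup.
    my $fh = IO::Zlib->new( 'var/backup/2024-01-01-cpan.json.gz', 'rb' )
        or die "cannot open backup file\n";

    while ( my $line = $fh->readline ) {
        my $record = decode_json($line);    # one document per line
        printf "%s/%s/%s\n",
            $record->{_index}, $record->{_type}, $record->{_id};
    }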
2 changes: 1 addition & 1 deletion lib/MetaCPAN/ES.pm
@@ -22,7 +22,7 @@ sub new ( $class, %args ) {
             nodes => [$node],
         ),
         index => $index,
-        type => $args{type},
+        ( $args{type} ? ( type => $args{type} ) : () ),
     }, $class;
 }

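The replacement line relies on a standard Perl idiom: a ternary that
expands to either a key/value pair or the empty list inside a list
constructor, so the C<type> key is omitted entirely (rather than set to
C<undef>) when no type was given. A self-contained illustration of the
idiom:

    use strict;
    use warnings;

    my %args = ( type => undef );

    # Old behaviour: the key is always present, possibly undef.
    my %old = ( index => 'cpan', type => $args{type} );

    # New behaviour: the key only exists when the value is true.
    my %new = ( index => 'cpan', ( $args{type} ? ( type => $args{type} ) : () ) );

    print exists $old{type} ? "old: type key present\n" : "old: no type key\n";
    print exists $new{type} ? "new: type key present\n" : "new: no type key\n";

This matters to bin/backup.pl, which now constructs MetaCPAN::ES objects
both with and without a type.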
16 changes: 16 additions & 0 deletions lib/MetaCPAN/Ingest.pm
@@ -6,6 +6,7 @@ use v5.36;
 
 use Digest::SHA;
 use Encode qw< decode_utf8 >;
+use IPC::Run3 ();
 use LWP::UserAgent;
 use Path::Tiny qw< path >;
 use PAUSE::Permissions ();
@@ -27,6 +28,7 @@ use Sub::Exporter -setup => {
         extract_section
         fix_version
         handle_error
+        home
         minion
         numify_version
         read_00whois
@@ -137,6 +139,20 @@ sub handle_error ( $error, $die_always ) {
     Carp::croak $error if $die_always;
 }
 
+sub home () {
+    IPC::Run3::run3(
+        [ qw< git rev-parse --show-toplevel > ],    # TODO: use alternative persistent path that's accessible from the container
+        \undef, \my $stdout, \my $stderr
+    );
+
+    die $stderr if $?;
+
+    chomp $stdout;
+    die "Failed to find git dir: '$stdout'" unless -d $stdout;
+
+    return $stdout;
+}
+
 sub minion () {
     require Mojo::Server;
     return Mojo::Server->new->build_app('MetaCPAN::API')->minion;
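The new C<home()> helper locates the repository root by shelling out to
C<git rev-parse --show-toplevel>. C<IPC::Run3::run3> takes the command as
an array reference followed by references for stdin, stdout, and stderr;
passing C<\undef> closes the child's stdin, and the scalar references
capture its output. A minimal standalone sketch of the same pattern:

    use strict;
    use warnings;
    use IPC::Run3 ();

    # Run a command, closing stdin and capturing stdout/stderr.
    IPC::Run3::run3(
        [ qw< git rev-parse --show-toplevel > ],
        \undef, \my $stdout, \my $stderr
    );
    die "git failed: $stderr" if $?;

    chomp $stdout;
    print "repository root: $stdout\n";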