@@ -2,61 +2,191 @@ package MetaCPAN::Script::Role::Contributor;
22
33use Moose::Role;
44
5+ use Log::Contextual qw( :log ) ;
56use MetaCPAN::ESConfig qw( es_doc_path ) ;
6- use MetaCPAN::Util qw( digest true false ) ;
7+ use MetaCPAN::Util qw( true false ) ;
78use Ref::Util qw( is_arrayref ) ;
89
9- sub get_cpan_author_contributors {
10- my ( $self , $author , $release , $distribution ) = @_ ;
11- my @ret ;
12- my $es = $self -> es;
13-
14- my $type = $self -> model-> doc(' release' );
15- my $data ;
16- eval {
17- $data = $type -> get_contributors( $author , $release );
18- 1;
19- } or return [];
20-
21- for my $d ( @{ $data -> {contributors } } ) {
22- next unless exists $d -> {pauseid };
23-
24- # skip existing records
25- my $id = digest( $d -> {pauseid }, $release );
26- my $exists = $es -> exists ( es_doc_path(' contributor' ), id => $id , );
27- next if $exists ;
28-
29- $d -> {release_author } = $author ;
30- $d -> {release_name } = $release ;
31- $d -> {distribution } = $distribution ;
32- push @ret , $d ;
10+ sub update_contributors {
11+ my ( $self , $query ) = @_ ;
12+
13+ my $scroll = $self -> es-> scroll_helper(
14+ es_doc_path(' release' ),
15+ body => {
16+ query => $query ,
17+ sort => [' _doc' ],
18+ _source => [ qw<
19+ name
20+ author
21+ distribution
22+ metadata.author
23+ metadata.x_contributors
24+ > ],
25+ },
26+ );
27+
28+ my $bulk = $self -> es-> bulk_helper( es_doc_path(' contributor' ) );
29+
30+ while ( my $release = $scroll -> next ) {
31+ log_debug { ' updating contributors for ' . $release -> {_source }{name } };
32+ my $actions = $self -> release_contributor_update_actions(
33+ $release -> {_source } );
34+ for my $action (@$actions ) {
35+ $bulk -> add_action(%$action );
36+ }
3337 }
3438
35- return \ @ret ;
39+ $bulk -> flush ;
3640}
3741
38- sub update_release_contirbutors {
39- my ( $self , $data , $timeout ) = @_ ;
40- return unless $data and is_arrayref($data );
41-
42- my $bulk = $self -> es-> bulk_helper( es_doc_path(' contributor' ),
43- timeout => $timeout || ' 5m' , );
44-
45- for my $d ( @{$data } ) {
46- my $id = digest( $d -> {pauseid }, $d -> {release_name } );
47- $bulk -> update( {
48- id => $id ,
49- doc => {
50- pauseid => $d -> {pauseid },
51- release_name => $d -> {release_name },
52- release_author => $d -> {release_author },
53- distribution => $d -> {distribution },
42+ sub release_contributor_update_actions {
43+ my ( $self , $release ) = @_ ;
44+ my @actions ;
45+
46+ my $res = $self -> es-> search(
47+ es_doc_path(' contributor' ),
48+ body => {
49+ query => {
50+ bool => {
51+ must => [
52+ { term => { release_name => $release -> {name } } },
53+ { term => { release_author => $release -> {author } } },
54+ ],
55+ }
5456 },
55- doc_as_upsert => true,
56- } );
57+ sort => [' _doc' ],
58+ size => 500,
59+ _source => false,
60+ },
61+ );
62+ my @ids = map $_ -> {_id }, @{ $res -> {hits }{hits } };
63+ push @actions , map +{ delete => { id => $_ } }, @ids ;
64+
65+ my $contribs = $self -> get_contributors($release );
66+ my @docs = map {
67+ ;
68+ my $contrib = $_ ;
69+ {
70+ release_name => $release -> {name },
71+ release_author => $release -> {author },
72+ distribution => $release -> {distribution },
73+ map +( defined $contrib -> {$_ } ? ( $_ => $contrib -> {$_ } ) : () ),
74+ qw( pauseid name email)
75+ };
76+ } @$contribs ;
77+ push @actions , map +{ create => { _source => $_ } }, @docs ;
78+ return \@actions ;
79+ }
80+
81+ sub get_contributors {
82+ my ( $self , $release ) = @_ ;
83+
84+ my $author_name = $release -> {author };
85+ my $contribs = $release -> {metadata }{x_contributors } || [];
86+ my $authors = $release -> {metadata }{author } || [];
87+
88+ for ( \( $contribs , $authors ) ) {
89+
90+ # If a sole contributor is a string upgrade it to an array...
91+ $$_ = [$$_ ]
92+ if !ref $$_ ;
93+
94+ # but if it's any other kind of value don't die trying to parse it.
95+ $$_ = []
96+ unless Ref::Util::is_arrayref($$_ );
5797 }
98+ $authors = [ grep { $_ ne ' unknown' } @$authors ];
5899
59- $bulk -> flush;
100+ my $author = eval {
101+ $self -> es-> get_source( es_doc_path(' author' ), id => $author_name );
102+ }
103+ or return [];
104+
105+ my $author_email = $author -> {email };
106+
107+ my $author_info = {
108+ email => [
109+ lc " $author_name \@ cpan.org" ,
110+ (
111+ Ref::Util::is_arrayref($author_email )
112+ ? @{$author_email }
113+ : $author_email
114+ ),
115+ ],
116+ name => $author_name ,
117+ };
118+ my %seen = map { $_ => $author_info }
119+ ( @{ $author_info -> {email } }, $author_info -> {name }, );
120+
121+ my @contribs = map {
122+ my $name = $_ ;
123+ my $email ;
124+ if ( $name =~ s /\s *<([^<>]+@[^<>]+)>// ) {
125+ $email = $1 ;
126+ }
127+ my $info ;
128+ my $dupe ;
129+ if ( $email and $info = $seen {$email } ) {
130+ $dupe = 1;
131+ }
132+ elsif ( $info = $seen {$name } ) {
133+ $dupe = 1;
134+ }
135+ else {
136+ $info = {
137+ name => $name ,
138+ email => [],
139+ };
140+ }
141+ $seen {$name } ||= $info ;
142+ if ($email ) {
143+ push @{ $info -> {email } }, $email
144+ unless grep { $_ eq $email } @{ $info -> {email } };
145+ $seen {$email } ||= $info ;
146+ }
147+ $dupe ? () : $info ;
148+ } ( @$authors , @$contribs );
149+
150+ my %want_email ;
151+ for my $contrib (@contribs ) {
152+
153+ # heuristic to autofill pause accounts
154+ if ( !$contrib -> {pauseid } ) {
155+ my ($pauseid )
156+ = map { / ^(.*)\@ cpan\. org$ / ? $1 : () }
157+ @{ $contrib -> {email } };
158+ $contrib -> {pauseid } = uc $pauseid
159+ if $pauseid ;
160+
161+ }
162+
163+ push @{ $want_email {$_ } }, $contrib for @{ $contrib -> {email } };
164+ }
165+
166+ if (%want_email ) {
167+ my $check_author = $self -> es-> search(
168+ es_doc_path(' author' ),
169+ body => {
170+ query => { terms => { email => [ sort keys %want_email ] } },
171+ _source => [ ' email' , ' pauseid' ],
172+ size => 100,
173+ },
174+ );
175+
176+ for my $author ( @{ $check_author -> {hits }{hits } } ) {
177+ my $emails = $author -> {_source }{email };
178+ $emails = [$emails ]
179+ if !ref $emails ;
180+ my $pauseid = uc $author -> {_source }{pauseid };
181+ for my $email (@$emails ) {
182+ for my $contrib ( @{ $want_email {$email } } ) {
183+ $contrib -> {pauseid } = $pauseid ;
184+ }
185+ }
186+ }
187+ }
188+
189+ return \@contribs ;
60190}
61191
62192no Moose::Role;
0 commit comments