@@ -47,6 +47,8 @@ def __init__(self):
4747 self .activity_by_year_week_peak = 0
4848
4949 self .authors = {} # name -> {commits, first_commit_stamp, last_commit_stamp, last_active_day, active_days, lines_added, lines_removed}
50+ self .author_emails = {} # email -> canonical_author_name
51+ self .author_name_counts = {} # email -> {name -> count}
5052
5153 self .total_commits = 0
5254 self .total_files = 0
@@ -135,6 +137,28 @@ def save_cache(self, cachefile):
135137
136138
137139class GitDataCollector (DataCollector ):
def get_canonical_author(self, author, email):
    """Return the canonical author name to use for ``email``.

    Every call records one use of ``author`` under ``email`` in
    ``self.author_name_counts`` and then stores the most frequently
    recorded name for that email in ``self.author_emails``. When several
    names are tied, the one that was recorded first wins.

    Args:
        author: Author name exactly as it appeared in the git output.
        email: Email address that accompanied ``author``.

    Returns:
        The name currently recorded most often for ``email``, or
        ``author`` unchanged when ``email`` is empty.
    """
    # An empty email (git prints such identities as "Name <>") identifies
    # nobody. Using it as a key would merge every email-less author into a
    # single canonical name, so such authors are passed through untouched
    # and nothing is recorded for them.
    if not email:
        return author

    # Count this use of the name; setdefault covers the first time the
    # email is seen.
    counts = self.author_name_counts.setdefault(email, {})
    counts[author] = counts.get(author, 0) + 1

    # max() returns the first maximal key in insertion order, so a tie is
    # resolved in favour of the earliest-seen name.
    canonical = max(counts, key=counts.get)
    self.author_emails[email] = canonical
    return canonical
161+
138162 def collect (self , dir ):
139163 DataCollector .collect (self , dir )
140164
@@ -193,8 +217,8 @@ def collect(self, dir):
193217 ]
194218 prev = None
195219 for tag in reversed (tags_sorted_by_date_desc ):
196- # Modify command to only include commits within our range
197- cmd = f'git shortlog -s "{ tag } "'
220+ # Modify command to only include commits within our range and include email
221+ cmd = f'git shortlog -s -e "{ tag } "'
198222 if prev is not None :
199223 cmd += f' "^{ prev } "'
200224 # Intersect with our commit range
@@ -210,7 +234,17 @@ def collect(self, dir):
210234 if len (parts ) < 3 :
211235 continue
212236 commits = int (parts [1 ])
213- author = parts [2 ]
237+ author_and_email = parts [2 ]
238+ # Parse "Name <email>" format
239+ if "<" in author_and_email and ">" in author_and_email :
240+ author , mail = author_and_email .split ("<" , 1 )
241+ author = author .rstrip ()
242+ mail = mail .rstrip (">" )
243+ # Get canonical author name based on email
244+ author = self .get_canonical_author (author , mail )
245+ else :
246+ # Fallback if no email found
247+ author = author_and_email
214248 self .tags [tag ]["commits" ] += commits
215249 self .tags [tag ]["authors" ][author ] = commits
216250
@@ -234,6 +268,10 @@ def collect(self, dir):
234268 author , mail = parts [4 ].split ("<" , 1 )
235269 author = author .rstrip ()
236270 mail = mail .rstrip (">" )
271+
272+ # Get canonical author name based on email
273+ author = self .get_canonical_author (author , mail )
274+
237275 domain = "?"
238276 if mail .find ("@" ) != - 1 :
239277 domain = mail .rsplit ("@" , 1 )[1 ]
@@ -301,7 +339,11 @@ def collect(self, dir):
301339
302340 # author stats
303341 if author not in self .authors :
304- self .authors [author ] = {}
342+ self .authors [author ] = {
343+ "lines_added" : 0 ,
344+ "lines_removed" : 0 ,
345+ "commits" : 0 ,
346+ }
305347 # commits, note again that commits may be in any date order because of cherry-picking and patches
306348 if "last_commit_stamp" not in self .authors [author ]:
307349 self .authors [author ]["last_commit_stamp" ] = stamp
@@ -544,7 +586,7 @@ def collect(self, dir):
544586 # committed what, not just through mainline)
545587 lines = get_pipe_output (
546588 [
547- 'git log --shortstat --date-order --pretty=format:"%%at %%aN" %s'
589+ 'git log --shortstat --date-order --pretty=format:"%%at %%aN <%%aE> " %s'
548590 % (get_log_range ("HEAD" , False ))
549591 ]
550592 ).split ("\n " )
@@ -558,13 +600,26 @@ def collect(self, dir):
558600 if len (line ) == 0 :
559601 continue
560602
561- # <stamp> <author>
603+ # <stamp> <author> <email>
562604 if re .search ("files? changed" , line ) is None :
563605 pos = line .find (" " )
564606 if pos != - 1 :
565607 try :
566608 oldstamp = stamp
567- (stamp , author ) = (int (line [:pos ]), line [pos + 1 :])
609+ stamp_str , author_and_email = line [:pos ], line [pos + 1 :]
610+ stamp = int (stamp_str )
611+
612+ # Parse "Name <email>" format
613+ if "<" in author_and_email and ">" in author_and_email :
614+ author , mail = author_and_email .split ("<" , 1 )
615+ author = author .rstrip ()
616+ mail = mail .rstrip (">" )
617+ # Get canonical author name based on email
618+ author = self .get_canonical_author (author , mail )
619+ else :
620+ # Fallback if no email found (shouldn't happen with new format)
621+ author = author_and_email
622+
568623 if oldstamp > stamp :
569624 # clock skew, keep old timestamp to avoid having ugly graph
570625 stamp = oldstamp
0 commit comments