Skip to content

Commit 42711b8

Browse files
jackbravoshenxianpeng
authored and committed
feat: Merge author by email
1 parent e20ae3a commit 42711b8

File tree

1 file changed

+62
-7
lines changed

1 file changed

+62
-7
lines changed

gitstats/main.py

Lines changed: 62 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,8 @@ def __init__(self):
4747
self.activity_by_year_week_peak = 0
4848

4949
self.authors = {} # name -> {commits, first_commit_stamp, last_commit_stamp, last_active_day, active_days, lines_added, lines_removed}
50+
self.author_emails = {} # email -> canonical_author_name
51+
self.author_name_counts = {} # email -> {name -> count}
5052

5153
self.total_commits = 0
5254
self.total_files = 0
@@ -135,6 +137,28 @@ def save_cache(self, cachefile):
135137

136138

137139
class GitDataCollector(DataCollector):
140+
def get_canonical_author(self, author, email):
141+
"""
142+
Get the canonical author name for a given email.
143+
If multiple names are used with the same email, uses the most frequently used name.
144+
"""
145+
if email not in self.author_emails:
146+
# First time seeing this email
147+
self.author_emails[email] = author
148+
self.author_name_counts[email] = {author: 1}
149+
return author
150+
151+
# Track name usage for this email
152+
if author not in self.author_name_counts[email]:
153+
self.author_name_counts[email][author] = 0
154+
self.author_name_counts[email][author] += 1
155+
156+
# Update canonical name to the most frequently used one
157+
most_used_name = max(self.author_name_counts[email].items(), key=lambda x: x[1])[0]
158+
self.author_emails[email] = most_used_name
159+
160+
return self.author_emails[email]
161+
138162
def collect(self, dir):
139163
DataCollector.collect(self, dir)
140164

@@ -193,8 +217,8 @@ def collect(self, dir):
193217
]
194218
prev = None
195219
for tag in reversed(tags_sorted_by_date_desc):
196-
# Modify command to only include commits within our range
197-
cmd = f'git shortlog -s "{tag}"'
220+
# Modify command to only include commits within our range and include email
221+
cmd = f'git shortlog -s -e "{tag}"'
198222
if prev is not None:
199223
cmd += f' "^{prev}"'
200224
# Intersect with our commit range
@@ -210,7 +234,17 @@ def collect(self, dir):
210234
if len(parts) < 3:
211235
continue
212236
commits = int(parts[1])
213-
author = parts[2]
237+
author_and_email = parts[2]
238+
# Parse "Name <email>" format
239+
if "<" in author_and_email and ">" in author_and_email:
240+
author, mail = author_and_email.split("<", 1)
241+
author = author.rstrip()
242+
mail = mail.rstrip(">")
243+
# Get canonical author name based on email
244+
author = self.get_canonical_author(author, mail)
245+
else:
246+
# Fallback if no email found
247+
author = author_and_email
214248
self.tags[tag]["commits"] += commits
215249
self.tags[tag]["authors"][author] = commits
216250

@@ -234,6 +268,10 @@ def collect(self, dir):
234268
author, mail = parts[4].split("<", 1)
235269
author = author.rstrip()
236270
mail = mail.rstrip(">")
271+
272+
# Get canonical author name based on email
273+
author = self.get_canonical_author(author, mail)
274+
237275
domain = "?"
238276
if mail.find("@") != -1:
239277
domain = mail.rsplit("@", 1)[1]
@@ -301,7 +339,11 @@ def collect(self, dir):
301339

302340
# author stats
303341
if author not in self.authors:
304-
self.authors[author] = {}
342+
self.authors[author] = {
343+
"lines_added": 0,
344+
"lines_removed": 0,
345+
"commits": 0,
346+
}
305347
# commits, note again that commits may be in any date order because of cherry-picking and patches
306348
if "last_commit_stamp" not in self.authors[author]:
307349
self.authors[author]["last_commit_stamp"] = stamp
@@ -544,7 +586,7 @@ def collect(self, dir):
544586
# committed what, not just through mainline)
545587
lines = get_pipe_output(
546588
[
547-
'git log --shortstat --date-order --pretty=format:"%%at %%aN" %s'
589+
'git log --shortstat --date-order --pretty=format:"%%at %%aN <%%aE>" %s'
548590
% (get_log_range("HEAD", False))
549591
]
550592
).split("\n")
@@ -558,13 +600,26 @@ def collect(self, dir):
558600
if len(line) == 0:
559601
continue
560602

561-
# <stamp> <author>
603+
# <stamp> <author> <email>
562604
if re.search("files? changed", line) is None:
563605
pos = line.find(" ")
564606
if pos != -1:
565607
try:
566608
oldstamp = stamp
567-
(stamp, author) = (int(line[:pos]), line[pos + 1 :])
609+
stamp_str, author_and_email = line[:pos], line[pos + 1:]
610+
stamp = int(stamp_str)
611+
612+
# Parse "Name <email>" format
613+
if "<" in author_and_email and ">" in author_and_email:
614+
author, mail = author_and_email.split("<", 1)
615+
author = author.rstrip()
616+
mail = mail.rstrip(">")
617+
# Get canonical author name based on email
618+
author = self.get_canonical_author(author, mail)
619+
else:
620+
# Fallback if no email found (shouldn't happen with new format)
621+
author = author_and_email
622+
568623
if oldstamp > stamp:
569624
# clock skew, keep old timestamp to avoid having ugly graph
570625
stamp = oldstamp

0 commit comments

Comments
 (0)