Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 64 additions & 7 deletions gitstats/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ def __init__(self):
self.activity_by_year_week_peak = 0

self.authors = {} # name -> {commits, first_commit_stamp, last_commit_stamp, last_active_day, active_days, lines_added, lines_removed}
self.author_emails = {} # email -> canonical_author_name
self.author_name_counts = {} # email -> {name -> count}

self.total_commits = 0
self.total_files = 0
Expand Down Expand Up @@ -135,6 +137,30 @@ def save_cache(self, cachefile):


class GitDataCollector(DataCollector):
def get_canonical_author(self, author: str, email: str) -> str:
    """
    Return the canonical author name for a given email.

    Each call records one use of ``author`` for ``email`` in
    ``self.author_name_counts`` and then stores the most frequently used
    name for that email in ``self.author_emails``. When several names are
    tied, the one that was recorded first for the email wins.

    An empty ``email`` carries no identity information, so ``author`` is
    returned unchanged and nothing is recorded; otherwise every author
    committing without an email would be merged into one person.

    Note: the canonical name for an email can change between calls as the
    usage counts evolve.

    :param author: author name exactly as reported by git
    :param email: author email used as the identity key
    :return: the name currently considered canonical for ``email``
    """
    if not email:
        return author

    # One counter dict per email; created on first sight of the email.
    name_counts = self.author_name_counts.setdefault(email, {})
    name_counts[author] = name_counts.get(author, 0) + 1

    # max() keeps the first maximal key in insertion order, so on a tie
    # the earliest-seen name stays canonical.
    canonical = max(name_counts, key=name_counts.get)
    self.author_emails[email] = canonical
    return canonical

def collect(self, dir):
DataCollector.collect(self, dir)

Expand Down Expand Up @@ -193,8 +219,8 @@ def collect(self, dir):
]
prev = None
for tag in reversed(tags_sorted_by_date_desc):
# Modify command to only include commits within our range
cmd = f'git shortlog -s "{tag}"'
# Modify command to only include commits within our range and include email
cmd = f'git shortlog -s -e "{tag}"'
if prev is not None:
Comment on lines +222 to 224
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Canonicalization can undercount per-tag author commits when multiple names share an email

When `git shortlog -s -e` emits multiple lines for the same email (one per distinct author name), `get_canonical_author()` can map all of them to the same canonical author. In that case, the assignment:

self.tags[tag]["authors"][author] = commits

will overwrite the previous entry instead of summing, so the per-author counts for that tag become incorrect even though self.tags[tag]["commits"] is correct.

Consider accumulating instead of assigning:

-                self.tags[tag]["commits"] += commits
-                self.tags[tag]["authors"][author] = commits
+                self.tags[tag]["commits"] += commits
+                self.tags[tag]["authors"][author] = (
+                    self.tags[tag]["authors"].get(author, 0) + commits
+                )

This preserves the intended “merge by email” semantics for tag statistics.

Also applies to: 239-251

🤖 Prompt for AI Agents
In gitstats/main.py around lines 222-224 (and similarly for 239-251), the code
assigns self.tags[tag]["authors"][author] = commits after canonicalizing by
email which overwrites counts when multiple names map to the same canonical
author; change the assignment to accumulate the commit counts instead (e.g.,
read current = self.tags[tag]["authors"].get(author, 0) and set to current +
commits) and apply the same accumulation logic in the other block so per-tag
author counts are summed rather than overwritten.

cmd += f' "^{prev}"'
# Intersect with our commit range
Expand All @@ -210,7 +236,17 @@ def collect(self, dir):
if len(parts) < 3:
continue
commits = int(parts[1])
author = parts[2]
author_and_email = parts[2]
# Parse "Name <email>" format
if "<" in author_and_email and ">" in author_and_email:
author, mail = author_and_email.split("<", 1)
author = author.rstrip()
mail = mail.rstrip(">")
# Get canonical author name based on email
author = self.get_canonical_author(author, mail)
else:
# Fallback if no email found
author = author_and_email
self.tags[tag]["commits"] += commits
self.tags[tag]["authors"][author] = commits

Expand All @@ -234,6 +270,10 @@ def collect(self, dir):
author, mail = parts[4].split("<", 1)
author = author.rstrip()
mail = mail.rstrip(">")

# Get canonical author name based on email
author = self.get_canonical_author(author, mail)

domain = "?"
if mail.find("@") != -1:
domain = mail.rsplit("@", 1)[1]
Expand Down Expand Up @@ -301,7 +341,11 @@ def collect(self, dir):

# author stats
if author not in self.authors:
self.authors[author] = {}
self.authors[author] = {
"lines_added": 0,
"lines_removed": 0,
"commits": 0,
}
# commits, note again that commits may be in any date order because of cherry-picking and patches
if "last_commit_stamp" not in self.authors[author]:
self.authors[author]["last_commit_stamp"] = stamp
Expand Down Expand Up @@ -544,7 +588,7 @@ def collect(self, dir):
# committed what, not just through mainline)
lines = get_pipe_output(
[
'git log --shortstat --date-order --pretty=format:"%%at %%aN" %s'
'git log --shortstat --date-order --pretty=format:"%%at %%aN <%%aE>" %s'
% (get_log_range("HEAD", False))
]
).split("\n")
Expand All @@ -558,13 +602,26 @@ def collect(self, dir):
if len(line) == 0:
continue

# <stamp> <author>
# <stamp> <author> <email>
if re.search("files? changed", line) is None:
pos = line.find(" ")
if pos != -1:
try:
oldstamp = stamp
(stamp, author) = (int(line[:pos]), line[pos + 1 :])
stamp_str, author_and_email = line[:pos], line[pos + 1 :]
stamp = int(stamp_str)

# Parse "Name <email>" format
if "<" in author_and_email and ">" in author_and_email:
author, mail = author_and_email.split("<", 1)
author = author.rstrip()
mail = mail.rstrip(">")
# Get canonical author name based on email
author = self.get_canonical_author(author, mail)
else:
# Fallback if no email found (shouldn't happen with new format)
author = author_and_email

if oldstamp > stamp:
# clock skew, keep old timestamp to avoid having ugly graph
stamp = oldstamp
Expand Down