diff --git a/gitstats/main.py b/gitstats/main.py index 7a7b603..205fca1 100755 --- a/gitstats/main.py +++ b/gitstats/main.py @@ -47,6 +47,8 @@ def __init__(self): self.activity_by_year_week_peak = 0 self.authors = {} # name -> {commits, first_commit_stamp, last_commit_stamp, last_active_day, active_days, lines_added, lines_removed} + self.author_emails = {} # email -> canonical_author_name + self.author_name_counts = {} # email -> {name -> count} self.total_commits = 0 self.total_files = 0 @@ -135,6 +137,30 @@ def save_cache(self, cachefile): class GitDataCollector(DataCollector): + def get_canonical_author(self, author, email): + """ + Get the canonical author name for a given email. + If multiple names are used with the same email, uses the most frequently used name. + """ + if email not in self.author_emails: + # First time seeing this email + self.author_emails[email] = author + self.author_name_counts[email] = {author: 1} + return author + + # Track name usage for this email + if author not in self.author_name_counts[email]: + self.author_name_counts[email][author] = 0 + self.author_name_counts[email][author] += 1 + + # Update canonical name to the most frequently used one + most_used_name = max( + self.author_name_counts[email].items(), key=lambda x: x[1] + )[0] + self.author_emails[email] = most_used_name + + return self.author_emails[email] + def collect(self, dir): DataCollector.collect(self, dir) @@ -193,8 +219,8 @@ def collect(self, dir): ] prev = None for tag in reversed(tags_sorted_by_date_desc): - # Modify command to only include commits within our range - cmd = f'git shortlog -s "{tag}"' + # Modify command to only include commits within our range and include email + cmd = f'git shortlog -s -e "{tag}"' if prev is not None: cmd += f' "^{prev}"' # Intersect with our commit range @@ -210,7 +236,17 @@ def collect(self, dir): if len(parts) < 3: continue commits = int(parts[1]) - author = parts[2] + author_and_email = parts[2] + # Parse "Name <email>" format + if 
"<" in author_and_email and ">" in author_and_email: + author, mail = author_and_email.split("<", 1) + author = author.rstrip() + mail = mail.rstrip(">") + # Get canonical author name based on email + author = self.get_canonical_author(author, mail) + else: + # Fallback if no email found + author = author_and_email self.tags[tag]["commits"] += commits self.tags[tag]["authors"][author] = commits @@ -234,6 +270,10 @@ def collect(self, dir): author, mail = parts[4].split("<", 1) author = author.rstrip() mail = mail.rstrip(">") + + # Get canonical author name based on email + author = self.get_canonical_author(author, mail) + domain = "?" if mail.find("@") != -1: domain = mail.rsplit("@", 1)[1] @@ -301,7 +341,11 @@ def collect(self, dir): # author stats if author not in self.authors: - self.authors[author] = {} + self.authors[author] = { + "lines_added": 0, + "lines_removed": 0, + "commits": 0, + } # commits, note again that commits may be in any date order because of cherry-picking and patches if "last_commit_stamp" not in self.authors[author]: self.authors[author]["last_commit_stamp"] = stamp @@ -544,7 +588,7 @@ def collect(self, dir): # committed what, not just through mainline) lines = get_pipe_output( [ - 'git log --shortstat --date-order --pretty=format:"%%at %%aN" %s' + 'git log --shortstat --date-order --pretty=format:"%%at %%aN <%%aE>" %s' % (get_log_range("HEAD", False)) ] ).split("\n") @@ -558,13 +602,26 @@ def collect(self, dir): if len(line) == 0: continue - # + # if re.search("files? 
changed", line) is None: pos = line.find(" ") if pos != -1: try: oldstamp = stamp - (stamp, author) = (int(line[:pos]), line[pos + 1 :]) + stamp_str, author_and_email = line[:pos], line[pos + 1 :] + stamp = int(stamp_str) + + # Parse "Name <email>" format + if "<" in author_and_email and ">" in author_and_email: + author, mail = author_and_email.split("<", 1) + author = author.rstrip() + mail = mail.rstrip(">") + # Get canonical author name based on email + author = self.get_canonical_author(author, mail) + else: + # Fallback if no email found (shouldn't happen with new format) + author = author_and_email + if oldstamp > stamp: # clock skew, keep old timestamp to avoid having ugly graph stamp = oldstamp