Skip to content

Commit b6efc9f

Browse files
authored
Merge pull request #1102 from FilippoSimini/main
add summarize-commits.yml and summarize_commits.py
2 parents fdcc28b + 0e2edda commit b6efc9f

File tree

2 files changed

+138
-0
lines changed

2 files changed

+138
-0
lines changed
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
name: Export All Commits

on:
  workflow_dispatch: # Allows you to run this manually from the Actions tab

jobs:
  get-history:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout Repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 0 # Crucial: 0 fetches all history for all branches and tags

      - name: Extract Log for Each File
        run: |
          # Find every Markdown file outside hidden paths (this also skips .git)
          # and append its git log to commit_history.txt.
          # Log layout: each commit starts with "__|__", its fields are separated
          # by "||", and a "_/^\_" line closes the section of each file.
          # "IFS=" keeps leading/trailing whitespace in file names intact.
          find . -type f -name "*.md" -not -path '*/.*' | while IFS= read -r file; do
            git log --stat --pretty=format:"__|__$file||%H||%s||%an||%ae||%ar||%ad||%B" -- "$file" >> commit_history.txt
            echo "_/^\_" >> commit_history.txt
          # When the log is non-empty, drop its last line (the closing separator).
          done && [ -s commit_history.txt ] && sed -i '$d' commit_history.txt

      - name: Install dependencies
        run: |
          python3 -m pip install --upgrade pip
          pip install pandas

      - name: Process with Python
        run: |
          # Set PYTHONPATH to the current directory so imports work correctly
          export PYTHONPATH=$PYTHONPATH:$(pwd)
          python3 scripts/summarize_commits.py

      - name: Upload Results
        uses: actions/upload-artifact@v4
        with:
          name: repository-commit-log
          path: |
            commit_history.txt
            commits_summary.csv

scripts/summarize_commits.py

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
import pandas as pd
2+
import re
3+
import datetime
4+
5+
6+
# URL prefix prepended to a docs-relative file path to build its `github_url`.
base_github_url = "https://github.com/argonne-lcf/user-guides/blob/main/docs/"
# URL prefix that replaces `base_github_url` when building the `url` column.
base_doc_url = "https://docs.alcf.anl.gov/"
8+
9+
10+
def split_git_log(l, delim=r'\|\|', file_path=None):
    """Split one formatted ``git log --stat`` entry into its fields.

    The entry is cut on ``delim`` with ``str.split`` (a literal match, not a
    regular expression). Every field but the last is returned unchanged. The
    last field is expected to hold the full commit message followed by the
    ``--stat`` output; it is separated into the stripped message text and the
    integer that follows the last ``<path> | `` marker.

    Args:
        l: Text of a single log entry.
        delim: Literal field separator. NOTE(review): the default is the
            four-character text backslash-pipe-backslash-pipe, which never
            matches a plain ``||``; pass ``'||'`` explicitly for input that
            uses that separator.
        file_path: Unused; kept for backward compatibility.

    Returns:
        list: The leading fields, followed by the message (always a ``str``)
        and the edit count (``int``; ``0`` when no count can be parsed).
    """
    splits = l.split(delim)
    # A stat line has the shape "<newline><space><path> | <count> <bar>".
    fullmsg_stat = re.split(r'\n [^\n]* \| ', splits[-1])
    fullmsg = fullmsg_stat[0].strip()
    stat = 0
    # Read a count only when a stat marker was found; otherwise a message
    # that merely starts with a number would be mistaken for the count.
    if len(fullmsg_stat) > 1:
        try:
            stat = int(fullmsg_stat[-1].split(' ')[0])
        except ValueError:
            # The text after the marker does not start with an integer.
            stat = 0
    return splits[:-1] + [fullmsg, stat]
18+
19+
20+
def create_df_commits(commits_output, d=r'\|\|', s=r'__\|__'):
    """Parse the git log text of one file into a DataFrame of commits.

    ``commits_output`` is cut into entries on ``s`` and the text before the
    first marker is discarded. Each entry is parsed by ``split_git_log`` with
    ``d`` as the field separator, giving one row per commit.

    Args:
        commits_output: Log text holding zero or more commit entries.
        d: Literal field separator inside an entry.
        s: Literal marker that starts each entry.
            NOTE(review): both defaults contain literal backslashes and are
            matched verbatim by ``str.split``; ``main`` passes ``'||'`` and
            ``'__|__'`` explicitly.

    Returns:
        pandas.DataFrame: Columns ``file_path``, ``commit_hash`` (cut to
        seven characters), ``message_title``, ``author_name``,
        ``author_email``, ``relative_date``, ``commit_date`` (converted to
        UTC, then made timezone-naive), ``full_message`` and ``num_edits``.
    """
    # The first piece precedes any entry marker, so it is not a commit.
    entries = commits_output.split(s)[1:]
    commits_parsed = [split_git_log(entry, delim=d) for entry in entries]
    columns = [
        'file_path',
        'commit_hash',
        'message_title',
        'author_name',
        'author_email',
        'relative_date',
        'commit_date',
        'full_message',
        'num_edits',
    ]
    df_comm = pd.DataFrame(commits_parsed, columns=columns)
    df_comm['commit_hash'] = df_comm['commit_hash'].str[:7]
    # Normalise every timestamp to UTC first so that dropping the timezone
    # leaves comparable naive values.
    commit_dates = pd.to_datetime(
        df_comm['commit_date'], format="%a %b %d %H:%M:%S %Y %z", utc=True
    )
    df_comm['commit_date'] = commit_dates.dt.tz_localize(None)
    return df_comm
28+
29+
30+
def process_file_commits(df_comm):
    """Summarise the commit history of a single file as a one-row DataFrame.

    Args:
        df_comm: Commits of one file, with at least the columns
            ``file_path``, ``author_email``, ``commit_date`` and
            ``num_edits``.

    Returns:
        pandas.DataFrame: One row with the columns ``edits_total``,
        ``edits_this_year``, ``commits_total``, ``commits_this_year``,
        ``date_last_commit``, ``top4_authors_w_most_edits``,
        ``author_w_most_edits``, ``author_w_most_edits_this_year``,
        ``system``, ``name``, ``github_url`` and ``url``.

    Raises:
        IndexError: If ``df_comm`` has no rows.
    """
    # edits and activity
    edits_total, edits_this_year = agg_col_ever_and_this_year(
        df_comm, col='num_edits', func='sum'
    )
    commits_total, commits_this_year = agg_col_ever_and_this_year(
        df_comm, col='num_edits', func='count'
    )

    # authors, identified by e-mail address
    user_col = 'author_email'
    top4_authors_w_most_edits = sort_authors_by_number_of_edits(
        df_comm, in_the_last_year=False, user_col=user_col
    )[:4].tolist()
    try:
        author_w_most_edits_this_year = sort_authors_by_number_of_edits(
            df_comm, in_the_last_year=True, user_col=user_col
        )[0]
    except IndexError:
        # Nobody committed to this file within the last year.
        author_w_most_edits_this_year = None

    # system and page name, taken from the path relative to ./docs/
    file_path = df_comm['file_path'].values[0].removeprefix('./docs/')
    path_parts = file_path.split('/')

    # Escape the dot so that only a real ".md" suffix becomes a trailing "/".
    page_path = re.sub(r'\.md$', '/', file_path)

    row = {
        'edits_total': edits_total,
        'edits_this_year': edits_this_year,
        'commits_total': commits_total,
        'commits_this_year': commits_this_year,
        'date_last_commit': df_comm['commit_date'].max(),
        'top4_authors_w_most_edits': top4_authors_w_most_edits,
        'author_w_most_edits': top4_authors_w_most_edits[0],
        'author_w_most_edits_this_year': author_w_most_edits_this_year,
        'system': path_parts[0],
        'name': path_parts[-1],
        'github_url': base_github_url + file_path,
        'url': base_doc_url + page_path,
    }
    return pd.DataFrame([row])
62+
63+
64+
def sort_authors_by_number_of_edits(df_comm, in_the_last_year=True, user_col='author_name'):
    """Rank authors by the sum of their ``num_edits``, largest first.

    Args:
        df_comm: Commits with the columns ``num_edits``, ``commit_date`` and
            the column named by ``user_col``.
        in_the_last_year: When true, only commits newer than 365 days before
            the current time are counted.
        user_col: Column that identifies the author.

    Returns:
        numpy.ndarray: Author identifiers ordered by decreasing edit total;
        empty when no commit qualifies.
    """
    commits = df_comm
    if in_the_last_year:
        # create_df_commits stores commit_date as naive UTC, so the cutoff is
        # computed in UTC as well rather than in the machine's local time.
        now_utc = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
        one_year_ago = now_utc - datetime.timedelta(days=365)
        commits = commits[commits['commit_date'] > one_year_ago]
    authors_by_num_edits = commits.groupby(user_col)['num_edits'].sum()
    return authors_by_num_edits.sort_values(ascending=False).index.values
72+
73+
74+
def agg_col_ever_and_this_year(df_comm, col, func='sum'):
    """Aggregate a column over all commits and over the last 365 days.

    Args:
        df_comm: Commits with a ``commit_date`` column and the column ``col``.
        col: Name of the column to aggregate.
        func: Aggregation passed to ``Series.agg`` (for example ``'sum'`` or
            ``'count'``).

    Returns:
        tuple: ``(total_ever, total_last_365_days)``.
    """
    # create_df_commits stores commit_date as naive UTC, so the cutoff is
    # computed in UTC as well rather than in the machine's local time.
    now_utc = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    one_year_ago = now_utc - datetime.timedelta(days=365)
    recent = df_comm[df_comm['commit_date'] > one_year_ago]
    tot_this_year = recent[col].agg(func)
    tot_ever = df_comm[col].agg(func)
    return tot_ever, tot_this_year
80+
81+
82+
def main(commit_history_path, d='||', s='__|__', file_delim=r'_/^\_',
         output_path='commits_summary.csv'):
    """Build a per-file commit summary CSV from an exported git log.

    The log file is split on ``file_delim`` into one section per file. Each
    section is parsed with ``create_df_commits`` and summarised with
    ``process_file_commits``; the resulting rows are sorted by ``system``
    (ascending) and ``date_last_commit`` (newest first) and written as CSV.

    Args:
        commit_history_path: Path of the UTF-8 log file to read.
        d: Literal field separator inside a commit entry.
        s: Literal marker that starts each commit entry.
        file_delim: Literal text separating the sections of different files.
        output_path: Path of the CSV file to write.

    Raises:
        ValueError: If the log contains no commit entries at all.
    """
    with open(commit_history_path, 'r', encoding='utf-8') as file:
        commit_history = file.read().split(file_delim)

    log_entries = []
    for commits_output in commit_history:
        df_comm = create_df_commits(commits_output, d=d, s=s)
        if df_comm.empty:
            # A section without any commit entry cannot be summarised.
            continue
        log_entries.append(process_file_commits(df_comm))

    if not log_entries:
        # pd.concat rejects an empty list; fail with a clearer message.
        raise ValueError(f"no commit entries found in {commit_history_path!r}")

    summary = pd.concat(log_entries)
    summary = summary.sort_values(
        ['system', 'date_last_commit'], ascending=[True, False]
    ).reset_index(drop=True)
    summary.to_csv(output_path, index=False)
95+
96+
97+
if __name__ == "__main__":
    # Run as a script: summarise the log file found at this relative path.
    main(commit_history_path="commit_history.txt")

0 commit comments

Comments
 (0)