
Commit 80db614

Add step to weekly workflow to consolidate GitHub metrics in overview spreadsheet (#21)
* add workflow
* cleanup
* add shared constants
* wip
* undo comment
1 parent 0cda807 commit 80db614

10 files changed: +158 additions, -31 deletions

.github/workflows/manual.yaml

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-name: Manual collection
+name: Manual Collection
 
 on:
   workflow_dispatch:

.github/workflows/weekly_collection.yaml

Lines changed: 6 additions & 2 deletions
@@ -25,12 +25,16 @@ jobs:
         run: |
           uv pip install -U pip
           uv pip install .
-      - name: Collect Github Data
+      - name: Collect GitHub Data
         run: |
           uv run github-analytics collect -v -q -t ${{ secrets.GITHUB_TOKEN }} -m -c weekly.yaml
         env:
           PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}
-
+      - name: Consolidate GitHub Data
+        run: |
+          uv run github-analytics consolidate -v -c weekly.yaml
+        env:
+          PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}
   alert:
     needs: [weekly_github_collection]
     runs-on: ubuntu-latest

github_analytics/__main__.py

Lines changed: 43 additions & 2 deletions
@@ -9,6 +9,7 @@
 
 import yaml
 
+from github_analytics.consolidate import consolidate_metrics
 from github_analytics.main import collect_projects, collect_traffic
 from github_analytics.summarize import summarize_metrics
 

@@ -140,6 +141,19 @@ def _summarize(args, parser):
     )
 
 
+def _consolidate(args, parser):
+    config = _load_config(args.config_file)
+    output_folder = args.output_folder or config.get('output_folder', '.')
+    projects = config['projects']
+
+    consolidate_metrics(
+        projects=projects,
+        output_folder=output_folder,
+        dry_run=args.dry_run,
+        verbose=args.verbose,
+    )
+
+
 def _get_parser():
     # Logging
     logging_args = argparse.ArgumentParser(add_help=False)

@@ -163,7 +177,7 @@ def _get_parser():
     action = parser.add_subparsers(title='action')
     action.required = True
 
-    # collect
+    # Collect
     collect = action.add_parser('collect', help='Collect github metrics.', parents=[logging_args])
     collect.set_defaults(action=_collect)
 

@@ -172,7 +186,7 @@ def _get_parser():
         '--output-folder',
         type=str,
         required=False,
-        help='Output folder path. Defaults to .',
+        help='Output folder path. Defaults to output folder in config-file.',
     )
     collect.add_argument('-t', '--token', type=str, required=False, help='Github Token to use.')
     collect.add_argument(

@@ -203,6 +217,31 @@ def _get_parser():
         action='store_false',
         help='Start from scratch instead of incrementing over existing data.',
     )
+    # Consolidate
+    consolidate = action.add_parser(
+        'consolidate', help='Consolidate github metrics', parents=[logging_args]
+    )
+    consolidate.set_defaults(action=_consolidate)
+    consolidate.add_argument(
+        '-c',
+        '--config-file',
+        type=str,
+        default='config.yaml',
+        help='Path to the configuration file.',
+    )
+    consolidate.add_argument(
+        '-d',
+        '--dry-run',
+        action='store_true',
+        help='Do not actually create the conslidated overview file. Just calculate it.',
+    )
+    consolidate.add_argument(
+        '-o',
+        '--output-folder',
+        type=str,
+        required=False,
+        help='Output folder path. Defaults to output folder in config-file.',
+    )
 
     # Traffic
     traffic = action.add_parser(

@@ -229,6 +268,8 @@ def _get_parser():
         help='Projects to collect. Defaults to ALL if not given',
     )
     traffic.add_argument('-r', '--repositories', nargs='*', help='List of repositories to add.')
+
+    # Summarize
     summarize = action.add_parser(
         'summarize', help='Summarize the github analytics information.', parents=[logging_args]
     )
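
The consolidate subparser mirrors the existing collect wiring: _consolidate loads the config file, resolves the output folder from --output-folder or the config's output_folder key (falling back to '.'), and forwards the configured project list to consolidate_metrics. A rough sketch of the equivalent direct call is shown below; the config contents are hypothetical and the use of yaml.safe_load is an assumption standing in for the package's _load_config helper.

import yaml

from github_analytics.consolidate import consolidate_metrics

# Hypothetical configuration, standing in for the repository's weekly.yaml.
config = yaml.safe_load("""
output_folder: 'gdrive://analytics/spreadsheets'
projects:
  - ProjectA
  - ProjectB
""")

consolidate_metrics(
    projects=config['projects'],
    output_folder=config.get('output_folder', '.'),  # same fallback as _consolidate
    dry_run=True,   # compute the overview without uploading it
    verbose=True,
)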

github_analytics/consolidate.py

Lines changed: 67 additions & 0 deletions
@@ -0,0 +1,67 @@
+"""Consolidate Overview Function."""
+
+import logging
+import os
+
+import pandas as pd
+from tqdm import tqdm
+
+from github_analytics.constants import (
+    ECOSYSTEM_COLUMN_NAME,
+    METRIC_COLUMN_NAME,
+    METRICS_SHEET_NAME,
+    VALUE_COLUMN_NAME,
+)
+from github_analytics.output import create_spreadsheet, load_spreadsheet
+
+OUTPUT_FILENAME = 'Consolidated_Overview'
+SHEET_NAME = 'Overview'
+
+LOGGER = logging.getLogger(__name__)
+
+
+def consolidate_metrics(projects, output_folder, dry_run=False, verbose=True):
+    """Consolidate GitHub Metrics from multiple spreadsheets on Google Drive.
+
+    Args:
+        projects (list[str]):
+            List of projects/ecosysems to consolidate. The project must
+            exactly match the file in the Google Drive folder.
+
+        output_path (str):
+            Output path on Google Drive that contains the Google Spreasheets.
+
+        dry_run (bool):
+            Whether of not to actually upload the results to Google Drive.
+            If True, it just calculate the results. Defaults to False.
+
+        verbose (bool):
+            If True, will output the dataframes of the summary metrics
+            (one dataframe for each sheet). Defaults to False.
+    """
+    rows = []
+    for project in tqdm(projects):
+        row_info = {ECOSYSTEM_COLUMN_NAME: project}
+        filepath = os.path.join(output_folder, project)
+        df = load_spreadsheet(filepath, sheet_name=METRICS_SHEET_NAME)
+        row = df[[METRIC_COLUMN_NAME, VALUE_COLUMN_NAME]].T
+        row = row.reset_index(drop=True)
+
+        row = row.rename(columns=row.iloc[0])
+        row = row.drop(labels=row.index[0])
+
+        row_values = row.to_dict(orient='records')
+        row_values = row_values[0]
+        row_info.update(row_values)
+        if verbose:
+            LOGGER.info(f' {project} values: {row_info}')
+        rows.append(row_info)
+
+    consolidated_df = pd.DataFrame(rows)
+    sheets = {SHEET_NAME: consolidated_df}
+    if verbose:
+        LOGGER.info(f'Sheet Name: {SHEET_NAME}')
+        LOGGER.info(consolidated_df.to_string())
+    if not dry_run:
+        output_path = os.path.join(output_folder, OUTPUT_FILENAME)
+        create_spreadsheet(output_path=output_path, sheets=sheets)
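
For reference, this is the reshaping consolidate_metrics applies to each project's 'Metrics' sheet: the long metric/value table produced by compute_metrics is transposed into a single wide row keyed by metric name, which then becomes one row of the consolidated overview. A minimal standalone sketch with toy values (not repository data):

import pandas as pd

# Toy 'Metrics' sheet in the same long format compute_metrics produces.
metrics = pd.DataFrame({
    'metric': ['num_issues', 'num_stargazers'],
    'value': [120, 45],
    'description': ['Total number of Issues', 'Total number of Stargazers'],
})

row = metrics[['metric', 'value']].T   # two rows: metric names, then values
row = row.reset_index(drop=True)
row = row.rename(columns=row.iloc[0])  # first row becomes the column names
row = row.drop(labels=row.index[0])    # keep only the values row

print(row.to_dict(orient='records')[0])
# {'num_issues': 120, 'num_stargazers': 45}

Each resulting dict is merged into row_info together with the Ecosystem column, so the final Overview sheet ends up with one row per project and one column per metric.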

github_analytics/constants.py

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+"""Shared constants between functions."""
+
+ECOSYSTEM_COLUMN_NAME = 'Ecosystem'
+
+METRIC_COLUMN_NAME = 'metric'
+VALUE_COLUMN_NAME = 'value'
+
+METRICS_SHEET_NAME = 'Metrics'

github_analytics/drive.py

Lines changed: 0 additions & 1 deletion
@@ -27,7 +27,6 @@ def split_drive_path(path):
     """Extract the folder and filename from the google drive path string."""
     assert is_drive_path(path), f'{path} is not a google drive path'
     folder, filename = path[9:].split('/')
-
     return folder, filename
 
 

github_analytics/main.py

Lines changed: 2 additions & 1 deletion
@@ -6,6 +6,7 @@
 
 import pandas as pd
 
+from github_analytics.constants import METRICS_SHEET_NAME
 from github_analytics.drive import get_or_create_gdrive_folder
 from github_analytics.github.repository import RepositoryClient
 from github_analytics.github.repository_owner import RepositoryOwnerClient

@@ -227,7 +228,7 @@ def collect_project_metrics(
     }
     if add_metrics:
         metrics = compute_metrics(issues, pull_requests, users, contributors, stargazers)
-        sheets = dict({'Metrics': metrics}, **sheets)
+        sheets = dict({METRICS_SHEET_NAME: metrics}, **sheets)
 
     if output_path:
         create_spreadsheet(output_path, sheets)

github_analytics/metrics.py

Lines changed: 20 additions & 18 deletions
@@ -2,6 +2,8 @@
 
 import pandas as pd
 
+from github_analytics.constants import METRIC_COLUMN_NAME, VALUE_COLUMN_NAME
+
 
 def compute_metrics(issues, pull_requests, users, contributors, stargazers):
     """Compute metrics for the given data.

@@ -39,48 +41,48 @@ def compute_metrics(issues, pull_requests, users, contributors, stargazers):
 
     return pd.DataFrame([
         {
-            'metric': 'num_issues',
-            'value': num_issues,
+            METRIC_COLUMN_NAME: 'num_issues',
+            VALUE_COLUMN_NAME: num_issues,
             'description': 'Total number of Issues',
         },
         {
-            'metric': 'num_pull_requests',
-            'value': num_pull_requests,
+            METRIC_COLUMN_NAME: 'num_pull_requests',
+            VALUE_COLUMN_NAME: num_pull_requests,
             'description': 'Total number of Pull Requests',
         },
         {
-            'metric': 'num_users',
-            'value': num_users,
+            METRIC_COLUMN_NAME: 'num_users',
+            VALUE_COLUMN_NAME: num_users,
             'description': 'Total number of Issue Users',
         },
         {
-            'metric': 'num_contgributors',
-            'value': num_contributors,
+            METRIC_COLUMN_NAME: 'num_contgributors',
+            VALUE_COLUMN_NAME: num_contributors,
             'description': 'Total number of Contributors',
         },
         {
-            'metric': 'num_stargazers',
-            'value': num_stargazers,
+            METRIC_COLUMN_NAME: 'num_stargazers',
+            VALUE_COLUMN_NAME: num_stargazers,
             'description': 'Total number of Stargazers',
         },
         {
-            'metric': 'num_non_contributor_users',
-            'value': num_non_contrib_users,
+            METRIC_COLUMN_NAME: 'num_non_contributor_users',
+            VALUE_COLUMN_NAME: num_non_contrib_users,
             'description': 'Total number of Users that are not Contributors',
         },
         {
-            'metric': 'num_non_contributor_stargazers',
-            'value': num_non_contrib_stars,
+            METRIC_COLUMN_NAME: 'num_non_contributor_stargazers',
+            VALUE_COLUMN_NAME: num_non_contrib_stars,
             'description': 'Total number of Stargazers that are not Contributors',
         },
         {
-            'metric': 'USR',
-            'value': usr,
+            METRIC_COLUMN_NAME: 'USR',
+            VALUE_COLUMN_NAME: usr,
             'description': 'Users / Stargazers ratio',
         },
         {
-            'metric': 'USR-C',
-            'value': usrc,
+            METRIC_COLUMN_NAME: 'USR-C',
+            VALUE_COLUMN_NAME: usrc,
             'description': 'USR Excluding Contributors',
         },
     ])

github_analytics/output.py

Lines changed: 8 additions & 3 deletions
@@ -97,10 +97,15 @@ def load_spreadsheet(spreadsheet, sheet_name=None):
         path = spreadsheet
 
     sheets = pd.read_excel(spreadsheet, sheet_name=sheet_name)
-    for sheet in sheets.values():  # noqa
+    if not sheet_name:
+        for sheet in sheets.values():  # noqa
+            for column in DATE_COLUMNS:
+                if column in sheet:
+                    sheet[column] = pd.to_datetime(sheet[column], utc=True).dt.tz_convert(None)
+    else:
         for column in DATE_COLUMNS:
-            if column in sheet:
-                sheet[column] = pd.to_datetime(sheet[column], utc=True).dt.tz_convert(None)
+            if column in sheets:
+                sheets[column] = pd.to_datetime(sheets[column], utc=True).dt.tz_convert(None)
 
     LOGGER.info('Loaded spreadsheet %s', path)
 
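The new branch exists because pandas.read_excel returns different shapes depending on sheet_name: a dict mapping sheet names to DataFrames when sheet_name is None (the only case the old loop handled), and a single DataFrame when a sheet name is given, which is how consolidate_metrics now calls load_spreadsheet. A minimal self-contained sketch of the difference, using a hypothetical in-memory workbook:

import io

import pandas as pd

# Build a tiny workbook in memory (hypothetical data) so the example is
# self-contained; writing requires an Excel engine such as openpyxl.
buffer = io.BytesIO()
pd.DataFrame({'metric': ['num_issues'], 'value': [3]}).to_excel(
    buffer, sheet_name='Metrics', index=False
)

buffer.seek(0)
all_sheets = pd.read_excel(buffer, sheet_name=None)       # dict: {sheet name: DataFrame}
buffer.seek(0)
one_sheet = pd.read_excel(buffer, sheet_name='Metrics')   # single DataFrame

print(type(all_sheets))  # <class 'dict'>
print(type(one_sheet))   # <class 'pandas.core.frame.DataFrame'>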

github_analytics/summarize.py

Lines changed: 3 additions & 3 deletions
@@ -5,6 +5,7 @@
 
 import pandas as pd
 
+from github_analytics.constants import ECOSYSTEM_COLUMN_NAME
 from github_analytics.output import create_spreadsheet, load_spreadsheet
 from github_analytics.time_utils import get_current_year, get_min_max_dt_in_year
 

@@ -13,7 +14,6 @@
 
 LOGGER = logging.getLogger(__name__)
 
-ECOSYSTEM_COLUMN_NAME = 'Ecosystem'
 TOTAL_COLUMN_NAME = 'Total Since Beginning'
 OUTPUT_FILENAME = 'GitHub_Summary'
 SHEET_NAMES = ['Unique users', 'User issues', 'vendor-mapping']

@@ -60,10 +60,10 @@ def summarize_metrics(
 
         dry_run (bool):
             Whether of not to actually upload the summary results.
-            If true, it just calculate the summary results. Defaults to False.
+            If True, it just calculate the summary results. Defaults to False.
 
         verbose (bool):
-            If true, will output the dataframes of the summary metrics
+            If True, will output the dataframes of the summary metrics
             (one dataframe for each sheet). Defaults to False.
 
     """
