diff --git a/.github/workflows/manual.yaml b/.github/workflows/manual.yaml
index e423c05..055b614 100644
--- a/.github/workflows/manual.yaml
+++ b/.github/workflows/manual.yaml
@@ -1,4 +1,4 @@
-name: Manual collection
+name: Manual Collection
 
 on:
   workflow_dispatch:
diff --git a/.github/workflows/weekly_collection.yaml b/.github/workflows/weekly_collection.yaml
index f0c1f98..526f335 100644
--- a/.github/workflows/weekly_collection.yaml
+++ b/.github/workflows/weekly_collection.yaml
@@ -25,12 +25,16 @@ jobs:
         run: |
           uv pip install -U pip
           uv pip install .
-      - name: Collect Github Data
+      - name: Collect GitHub Data
         run: |
           uv run github-analytics collect -v -q -t ${{ secrets.GITHUB_TOKEN }} -m -c weekly.yaml
         env:
           PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}
-
+      - name: Consolidate GitHub Data
+        run: |
+          uv run github-analytics consolidate -v -c weekly.yaml
+        env:
+          PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}
   alert:
     needs: [weekly_github_collection]
     runs-on: ubuntu-latest
diff --git a/github_analytics/__main__.py b/github_analytics/__main__.py
index 05e101a..9d19bfe 100644
--- a/github_analytics/__main__.py
+++ b/github_analytics/__main__.py
@@ -9,6 +9,7 @@
 
 import yaml
 
+from github_analytics.consolidate import consolidate_metrics
 from github_analytics.main import collect_projects, collect_traffic
 from github_analytics.summarize import summarize_metrics
 
@@ -140,6 +141,19 @@ def _summarize(args, parser):
     )
 
 
+def _consolidate(args, parser):
+    config = _load_config(args.config_file)
+    output_folder = args.output_folder or config.get('output_folder', '.')
+    projects = config['projects']
+
+    consolidate_metrics(
+        projects=projects,
+        output_folder=output_folder,
+        dry_run=args.dry_run,
+        verbose=args.verbose,
+    )
+
+
 def _get_parser():
     # Logging
     logging_args = argparse.ArgumentParser(add_help=False)
@@ -163,7 +177,7 @@
     action = parser.add_subparsers(title='action')
     action.required = True
 
-    # collect
+    # Collect
    collect = action.add_parser('collect', help='Collect github metrics.', parents=[logging_args])
    collect.set_defaults(action=_collect)
 
@@ -172,7 +186,7 @@
         '--output-folder',
         type=str,
         required=False,
-        help='Output folder path. Defaults to .',
+        help='Output folder path. Defaults to the output folder in the config file.',
     )
     collect.add_argument('-t', '--token', type=str, required=False, help='Github Token to use.')
     collect.add_argument(
@@ -203,6 +217,31 @@
         action='store_false',
         help='Start from scratch instead of incrementing over existing data.',
     )
+    # Consolidate
+    consolidate = action.add_parser(
+        'consolidate', help='Consolidate github metrics.', parents=[logging_args]
+    )
+    consolidate.set_defaults(action=_consolidate)
+    consolidate.add_argument(
+        '-c',
+        '--config-file',
+        type=str,
+        default='config.yaml',
+        help='Path to the configuration file.',
+    )
+    consolidate.add_argument(
+        '-d',
+        '--dry-run',
+        action='store_true',
+        help='Do not actually create the consolidated overview file. Just calculate it.',
+    )
+    consolidate.add_argument(
+        '-o',
+        '--output-folder',
+        type=str,
+        required=False,
+        help='Output folder path. Defaults to the output folder in the config file.',
+    )
 
     # Traffic
     traffic = action.add_parser(
@@ -229,6 +268,8 @@
         help='Projects to collect. Defaults to ALL if not given',
     )
     traffic.add_argument('-r', '--repositories', nargs='*', help='List of repositories to add.')
+
+    # Summarize
     summarize = action.add_parser(
         'summarize', help='Summarize the github analytics information.', parents=[logging_args]
     )
diff --git a/github_analytics/consolidate.py b/github_analytics/consolidate.py
new file mode 100644
index 0000000..1bdf68a
--- /dev/null
+++ b/github_analytics/consolidate.py
@@ -0,0 +1,67 @@
+"""Functions to consolidate GitHub metrics into a single overview."""
+
+import logging
+import os
+
+import pandas as pd
+from tqdm import tqdm
+
+from github_analytics.constants import (
+    ECOSYSTEM_COLUMN_NAME,
+    METRIC_COLUMN_NAME,
+    METRICS_SHEET_NAME,
+    VALUE_COLUMN_NAME,
+)
+from github_analytics.output import create_spreadsheet, load_spreadsheet
+
+OUTPUT_FILENAME = 'Consolidated_Overview'
+SHEET_NAME = 'Overview'
+
+LOGGER = logging.getLogger(__name__)
+
+
+def consolidate_metrics(projects, output_folder, dry_run=False, verbose=True):
+    """Consolidate GitHub metrics from multiple spreadsheets on Google Drive.
+
+    Args:
+        projects (list[str]):
+            List of projects/ecosystems to consolidate. Each project name must
+            exactly match a spreadsheet filename in the Google Drive folder.
+
+        output_folder (str):
+            Output folder on Google Drive that contains the Google Spreadsheets.
+
+        dry_run (bool):
+            Whether or not to actually upload the results to Google Drive.
+            If True, it just calculates the results. Defaults to False.
+
+        verbose (bool):
+            If True, log each project's metric values and the resulting
+            consolidated overview dataframe. Defaults to True.
+    """
+    rows = []
+    for project in tqdm(projects):
+        row_info = {ECOSYSTEM_COLUMN_NAME: project}
+        filepath = os.path.join(output_folder, project)
+        df = load_spreadsheet(filepath, sheet_name=METRICS_SHEET_NAME)
+        row = df[[METRIC_COLUMN_NAME, VALUE_COLUMN_NAME]].T
+        row = row.reset_index(drop=True)
+
+        row = row.rename(columns=row.iloc[0])
+        row = row.drop(labels=row.index[0])
+
+        row_values = row.to_dict(orient='records')
+        row_values = row_values[0]
+        row_info.update(row_values)
+        if verbose:
+            LOGGER.info(' %s values: %s', project, row_info)
+        rows.append(row_info)
+
+    consolidated_df = pd.DataFrame(rows)
+    sheets = {SHEET_NAME: consolidated_df}
+    if verbose:
+        LOGGER.info('Sheet Name: %s', SHEET_NAME)
+        LOGGER.info(consolidated_df.to_string())
+    if not dry_run:
+        output_path = os.path.join(output_folder, OUTPUT_FILENAME)
+        create_spreadsheet(output_path=output_path, sheets=sheets)
diff --git a/github_analytics/constants.py b/github_analytics/constants.py
new file mode 100644
index 0000000..c1ee717
--- /dev/null
+++ b/github_analytics/constants.py
@@ -0,0 +1,8 @@
+"""Constants shared between github_analytics modules."""
+
+ECOSYSTEM_COLUMN_NAME = 'Ecosystem'
+
+METRIC_COLUMN_NAME = 'metric'
+VALUE_COLUMN_NAME = 'value'
+
+METRICS_SHEET_NAME = 'Metrics'
diff --git a/github_analytics/drive.py b/github_analytics/drive.py
index 0a0332a..d7b201e 100644
--- a/github_analytics/drive.py
+++ b/github_analytics/drive.py
@@ -27,7 +27,6 @@ def split_drive_path(path):
     """Extract the folder and filename from the google drive path string."""
     assert is_drive_path(path), f'{path} is not a google drive path'
     folder, filename = path[9:].split('/')
-
     return folder, filename
 
 
diff --git a/github_analytics/main.py b/github_analytics/main.py
index 85a3ed2..e75a6ed 100644
--- a/github_analytics/main.py
+++ b/github_analytics/main.py
@@ -6,6 +6,7 @@
 
 import pandas as pd
 
+from github_analytics.constants import METRICS_SHEET_NAME
 from github_analytics.drive import get_or_create_gdrive_folder
 from github_analytics.github.repository import RepositoryClient
 from github_analytics.github.repository_owner import RepositoryOwnerClient
@@ -227,7 +228,7 @@ def collect_project_metrics(
     }
     if add_metrics:
         metrics = compute_metrics(issues, pull_requests, users, contributors, stargazers)
-        sheets = dict({'Metrics': metrics}, **sheets)
+        sheets = dict({METRICS_SHEET_NAME: metrics}, **sheets)
 
     if output_path:
         create_spreadsheet(output_path, sheets)
diff --git a/github_analytics/metrics.py b/github_analytics/metrics.py
index 4726ba2..16ba62b 100644
--- a/github_analytics/metrics.py
+++ b/github_analytics/metrics.py
@@ -2,6 +2,8 @@
 
 import pandas as pd
 
+from github_analytics.constants import METRIC_COLUMN_NAME, VALUE_COLUMN_NAME
+
 
 def compute_metrics(issues, pull_requests, users, contributors, stargazers):
     """Compute metrics for the given data.
@@ -39,48 +41,48 @@
 
     return pd.DataFrame([
         {
-            'metric': 'num_issues',
-            'value': num_issues,
+            METRIC_COLUMN_NAME: 'num_issues',
+            VALUE_COLUMN_NAME: num_issues,
             'description': 'Total number of Issues',
         },
         {
-            'metric': 'num_pull_requests',
-            'value': num_pull_requests,
+            METRIC_COLUMN_NAME: 'num_pull_requests',
+            VALUE_COLUMN_NAME: num_pull_requests,
             'description': 'Total number of Pull Requests',
         },
         {
-            'metric': 'num_users',
-            'value': num_users,
+            METRIC_COLUMN_NAME: 'num_users',
+            VALUE_COLUMN_NAME: num_users,
             'description': 'Total number of Issue Users',
         },
         {
-            'metric': 'num_contgributors',
-            'value': num_contributors,
+            METRIC_COLUMN_NAME: 'num_contributors',
+            VALUE_COLUMN_NAME: num_contributors,
             'description': 'Total number of Contributors',
         },
         {
-            'metric': 'num_stargazers',
-            'value': num_stargazers,
+            METRIC_COLUMN_NAME: 'num_stargazers',
+            VALUE_COLUMN_NAME: num_stargazers,
             'description': 'Total number of Stargazers',
         },
         {
-            'metric': 'num_non_contributor_users',
-            'value': num_non_contrib_users,
+            METRIC_COLUMN_NAME: 'num_non_contributor_users',
+            VALUE_COLUMN_NAME: num_non_contrib_users,
             'description': 'Total number of Users that are not Contributors',
         },
         {
-            'metric': 'num_non_contributor_stargazers',
-            'value': num_non_contrib_stars,
+            METRIC_COLUMN_NAME: 'num_non_contributor_stargazers',
+            VALUE_COLUMN_NAME: num_non_contrib_stars,
             'description': 'Total number of Stargazers that are not Contributors',
         },
         {
-            'metric': 'USR',
-            'value': usr,
+            METRIC_COLUMN_NAME: 'USR',
+            VALUE_COLUMN_NAME: usr,
             'description': 'Users / Stargazers ratio',
         },
         {
-            'metric': 'USR-C',
-            'value': usrc,
+            METRIC_COLUMN_NAME: 'USR-C',
+            VALUE_COLUMN_NAME: usrc,
             'description': 'USR Excluding Contributors',
         },
     ])
diff --git a/github_analytics/output.py b/github_analytics/output.py
index 946ef16..8167a1f 100644
--- a/github_analytics/output.py
+++ b/github_analytics/output.py
@@ -97,10 +97,15 @@ def load_spreadsheet(spreadsheet, sheet_name=None):
         path = spreadsheet
 
     sheets = pd.read_excel(spreadsheet, sheet_name=sheet_name)
-    for sheet in sheets.values():  # noqa
+    if sheet_name is None:
+        for sheet in sheets.values():  # noqa
+            for column in DATE_COLUMNS:
+                if column in sheet:
+                    sheet[column] = pd.to_datetime(sheet[column], utc=True).dt.tz_convert(None)
+    else:
         for column in DATE_COLUMNS:
-            if column in sheet:
-                sheet[column] = pd.to_datetime(sheet[column], utc=True).dt.tz_convert(None)
+            if column in sheets:
+                sheets[column] = pd.to_datetime(sheets[column], utc=True).dt.tz_convert(None)
 
     LOGGER.info('Loaded spreadsheet %s', path)
 
diff --git a/github_analytics/summarize.py b/github_analytics/summarize.py
index 21891e7..ac98929 100644
--- a/github_analytics/summarize.py
+++ b/github_analytics/summarize.py
@@ -5,6 +5,7 @@
 
 import pandas as pd
 
+from github_analytics.constants import ECOSYSTEM_COLUMN_NAME
 from github_analytics.output import create_spreadsheet, load_spreadsheet
 from github_analytics.time_utils import get_current_year, get_min_max_dt_in_year
 
@@ -13,7 +14,6 @@
 
 LOGGER = logging.getLogger(__name__)
 
-ECOSYSTEM_COLUMN_NAME = 'Ecosystem'
 TOTAL_COLUMN_NAME = 'Total Since Beginning'
 OUTPUT_FILENAME = 'GitHub_Summary'
 SHEET_NAMES = ['Unique users', 'User issues', 'vendor-mapping']
@@ -60,10 +60,10 @@
 
         dry_run (bool):
             Whether of not to actually upload the summary results.
-            If true, it just calculate the summary results. Defaults to False.
+            If True, it just calculates the summary results. Defaults to False.
 
         verbose (bool):
-            If true, will output the dataframes of the summary metrics
+            If True, will output the dataframes of the summary metrics
             (one dataframe for each sheet). Defaults to False.
     """
 
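
For reference, the reshaping step inside `consolidate_metrics` can be run in isolation. Each project's 'Metrics' sheet stores one metric per row; the sketch below uses made-up metric values and a hypothetical project name standing in for a spreadsheet loaded from Google Drive, and shows how the transpose/rename/drop sequence flattens the sheet into one overview row:

```python
import pandas as pd

# Toy stand-in for a per-project 'Metrics' sheet: one metric per row.
# These values and the project name are made up for illustration.
metrics = pd.DataFrame([
    {'metric': 'num_issues', 'value': 120, 'description': 'Total number of Issues'},
    {'metric': 'num_stargazers', 'value': 450, 'description': 'Total number of Stargazers'},
])

row_info = {'Ecosystem': 'example-project'}  # hypothetical project name

# Transpose so metric names become columns and values become a single row,
# mirroring what consolidate_metrics does for each project.
row = metrics[['metric', 'value']].T
row = row.reset_index(drop=True)
row = row.rename(columns=row.iloc[0])  # first row holds the metric names
row = row.drop(labels=row.index[0])    # drop the names row, keeping the values

row_info.update(row.to_dict(orient='records')[0])
print(row_info)
# {'Ecosystem': 'example-project', 'num_issues': 120, 'num_stargazers': 450}
```

After the transpose, the first row carries the metric names, so promoting it to column labels and dropping it leaves a single row of values; `pd.DataFrame(rows)` can then stack one such row per project into the consolidated 'Overview' sheet.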