diff --git a/.github/workflows/daily_collection.yaml b/.github/workflows/daily_collection.yaml index 0de6ec7..28ad27a 100644 --- a/.github/workflows/daily_collection.yaml +++ b/.github/workflows/daily_collection.yaml @@ -13,6 +13,7 @@ on: jobs: daily_github_collection: + environment: daily runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 @@ -21,16 +22,18 @@ jobs: with: enable-cache: true activate-environment: true - - name: Install pip and dependencies + - name: Install dependencies run: | - uv pip install -U pip uv pip install . - name: Collect GitHub Data run: | - uv run gitmetrics collect -v -q -t ${{ secrets.GITHUB_TOKEN }} -m -c daily.yaml + uv run gitmetrics collect -q \ + --add-metrics \ + --config-file daily.yaml \ + --token ${{ secrets.GITHUB_TOKEN }} \ + --output-folder ${{ secrets.OUTPUT_FOLDER }} env: PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }} - alert: needs: [daily_github_collection] runs-on: ubuntu-latest @@ -42,9 +45,8 @@ jobs: with: enable-cache: true activate-environment: true - - name: Install pip and dependencies + - name: Install dependencies run: | - uv pip install -U pip uv pip install .[dev] - name: Slack alert if failure run: python -m gitmetrics.slack_utils -r ${{ github.run_id }} -c ${{ github.event.inputs.slack_channel || 'sdv-alerts' }} diff --git a/.github/workflows/daily_summarize.yaml b/.github/workflows/daily_summarize.yaml index 43088dc..59a575c 100644 --- a/.github/workflows/daily_summarize.yaml +++ b/.github/workflows/daily_summarize.yaml @@ -13,6 +13,7 @@ on: jobs: summarize: + environment: daily runs-on: ubuntu-latest timeout-minutes: 5 steps: @@ -22,14 +23,13 @@ jobs: with: enable-cache: true activate-environment: true - - name: Install pip and dependencies + - name: Install dependencies run: | - uv pip install -U pip uv pip install . 
- name: Run Summarize run: | uv run gitmetrics summarize \ - --input-folder gdrive://1ZvsuVbFAUk3BN-n6Pv_lUBLwviHZSxM2 + --input-folder ${{ secrets.OUTPUT_FOLDER }} env: PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }} - uses: actions/checkout@v4 @@ -60,9 +60,8 @@ jobs: with: enable-cache: true activate-environment: true - - name: Install pip and dependencies + - name: Install dependencies run: | - uv pip install -U pip uv pip install .[dev] - name: Slack alert if failure run: | diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml index 8883312..f9e4f1d 100644 --- a/.github/workflows/lint.yaml +++ b/.github/workflows/lint.yaml @@ -18,9 +18,8 @@ jobs: with: enable-cache: true activate-environment: true - - name: Install pip and dependencies + - name: Install dependencies run: | - uv pip install -U pip uv pip install .[dev] - name: Run lint checks run: uv run invoke lint \ No newline at end of file diff --git a/.github/workflows/traffic_collection.yaml b/.github/workflows/traffic_collection.yaml index 8cb06a8..1717a4c 100644 --- a/.github/workflows/traffic_collection.yaml +++ b/.github/workflows/traffic_collection.yaml @@ -13,6 +13,7 @@ on: jobs: daily_traffic_collection: + environment: traffic runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 @@ -21,13 +22,15 @@ jobs: with: enable-cache: true activate-environment: true - - name: Install pip and dependencies + - name: Install dependencies run: | - uv pip install -U pip uv pip install . 
- name: Collect GitHub Traffic Data run: | - uv run gitmetrics traffic -v -t ${{ secrets.GH_TRAFFIC_TOKEN }} -c traffic_config.yaml + uv run gitmetrics traffic \ + --config-file traffic_config.yaml \ + --token ${{ secrets.GH_TRAFFIC_TOKEN }} \ + --output-folder ${{ secrets.OUTPUT_FOLDER }} env: PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }} alert: @@ -41,9 +44,8 @@ jobs: with: enable-cache: true activate-environment: true - - name: Install pip and dependencies + - name: Install dependencies run: | - uv pip install -U pip uv pip install .[dev] - name: Slack alert if failure run: python -m gitmetrics.slack_utils -r ${{ github.run_id }} -c ${{ github.event.inputs.slack_channel || 'sdv-alerts' }} diff --git a/.github/workflows/weekly_collection.yaml b/.github/workflows/weekly_collection.yaml index 8d552b5..da2bbb3 100644 --- a/.github/workflows/weekly_collection.yaml +++ b/.github/workflows/weekly_collection.yaml @@ -13,6 +13,7 @@ on: jobs: weekly_github_collection: + environment: weekly runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 @@ -21,18 +22,23 @@ jobs: with: enable-cache: true activate-environment: true - - name: Install pip and dependencies + - name: Install dependencies run: | - uv pip install -U pip uv pip install . 
- name: Collect GitHub Data run: | - uv run gitmetrics collect -v -q -t ${{ secrets.GITHUB_TOKEN }} -m -c weekly.yaml + uv run gitmetrics collect -q \ + --add-metrics \ + --config-file weekly.yaml \ + --token ${{ secrets.GITHUB_TOKEN }} \ + --output-folder ${{ secrets.OUTPUT_FOLDER }} env: PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }} - name: Consolidate GitHub Data run: | - uv run gitmetrics consolidate -v -c weekly.yaml + uv run gitmetrics consolidate \ + --config-file weekly.yaml \ + --output-folder ${{ secrets.OUTPUT_FOLDER }} env: PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }} alert: @@ -46,9 +52,8 @@ jobs: with: enable-cache: true activate-environment: true - - name: Install pip and dependencies + - name: Install dependencies run: | - uv pip install -U pip uv pip install .[dev] - name: Slack alert if failure run: python -m gitmetrics.slack_utils -r ${{ github.run_id }} -c ${{ github.event.inputs.slack_channel || 'sdv-alerts' }} diff --git a/README.md b/README.md index e1bf817..3744fa5 100644 --- a/README.md +++ b/README.md @@ -1,127 +1,17 @@ -# GitMetrics - -Scripts to extract multiple metrics from GitHub Projects. - -## Install - -```bash -pip install git+ssh://git@github.com/datacebo/gitmetrics -``` - -### Development - -For development, clone the repository and install `dev-requirements.txt`: - -```bash -git clone git@github.com:datacebo/gitmetrics -cd gitmetrics -pip install -e .[test,dev] -``` +
+
+

+ This repository is part of The Synthetic Data Vault Project, a project from DataCebo. +

+
-# Local Usage - -To collect metrics from GitHub by running `gitmetrics` on your computer you need to provide: - -1. A GitHub Token. Documentation about how to create a Personal Access Token can be found - [here](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token) -2. A list of GitHub Repositories for which to collect the metrics. The repositories need - to be given as `{org-name}/{repo-name}`, like `sdv-dev/SDV`. -3. (Optional) A filename where the output will be stored. If a name containing the `.xlsx` - extension is given (like `path/to/my-filename.xlsx`), it will be used as provided. - Otherwise, a filename will be created as `github-metrics-{name}-{today}.xlsx` within - the same folder where the script is run. For example, if `sdv` is passed as the name, - and the script is run on November, 9th, 2021, the output file will be - `github-metrics-sdv-2021-11-09.xlsx`. - -## Python Interface - -In order to run the collection script from python, the `collect_project_metrics` function -needs to be imported from the `gitmetrics` package and executed passing the values -indicated above. - -**NOTE**: For detailed output, logging must be enabled as shown in the example below. 
- -```python3 ->>> import logging ->>> logging.basicConfig(level=logging.INFO) ->>> from gitmetrics import collect_project_metrics ->>> repositories = ['sdv-dev/RDT', 'sdv-dev/SDV', 'sdv-dev/Copulas', 'sdv-dev/CTGAN'] ->>> output_name = 'sdv-dev' ->>> token = '' ->>> collect_project_metrics(token, repositories, output_name) -INFO:gitmetrics.main:Getting information for repository sdv-dev/RDT -100%|███████████████████████████████████████████████████████████████| 143/143 [00:00<00:00, 195.00it/s] -100%|███████████████████████████████████████████████████████████████| 182/182 [00:00<00:00, 364.64it/s] -100%|███████████████████████████████████████████████████████████████| 37/37 [00:00<00:00, 91020.09it/s] -INFO:gitmetrics.main:Getting information for repository sdv-dev/SDV -100%|███████████████████████████████████████████████████████████████| 389/389 [00:02<00:00, 193.20it/s] -100%|███████████████████████████████████████████████████████████████| 219/219 [00:00<00:00, 231.17it/s] -100%|███████████████████████████████████████████████████████████████| 561/561 [00:03<00:00, 158.39it/s] -INFO:gitmetrics.main:Getting information for repository sdv-dev/Copulas -100%|███████████████████████████████████████████████████████████████| 138/138 [00:00<00:00, 333.27it/s] -100%|███████████████████████████████████████████████████████████████| 143/143 [00:00<00:00, 287.29it/s] -100%|███████████████████████████████████████████████████████████████| 245/245 [00:01<00:00, 204.88it/s] -INFO:gitmetrics.main:Getting information for repository sdv-dev/CTGAN -100%|███████████████████████████████████████████████████████████████| 113/113 [00:00<00:00, 287.26it/s] -100%|██████████████████████████████████████████████████████████████| 64/64 [00:00<00:00, 134824.44it/s] -100%|███████████████████████████████████████████████████████████████| 498/498 [00:02<00:00, 171.11it/s] -INFO:gitmetrics.main:Getting 164 missing users - 99%|██████████████████████████████████████████████████████████████▌| 163/164 
[00:01<00:00, 121.99it/s] -INFO:gitmetrics.output:Creating file github-metrics-sdv-dev-2021-11-12.xlsx -``` +# GitMetrics - -## Command Line Interface - -In order to run the collection script from the command line, the `gitmetrics collect` command -must be called passing the following optional arguments: - -- `-c / --config-file CONFIG_FILE`: Path to the config file to use. Defaults to `config.yaml`. - Format of the `config.yaml` file is documented below. -- `-o / --output-folder OUTPUT_FILDER`: Path to the folder in which spreadsheets will be created. - Defaults to the value given in the config file, or to `'.'` if there is none, and supports - `gdrive://` format for Google Drive folders. -- `-p / --projects PROJECT [PROJECT [PROJECT...]]`: Names of the projects to pull. These will be - used to search for repository lists inside the config file. If not given, defaults to all the - projects found in the config file. -- `-r / --repositories REPOSITORY [REPOSITORY [REPOSITORY...]]`: Optional, list of repositories - to extract for the indicated project. If this is given, one and only one `project` must be - passed, which will be used as the name for the output spreadsheet. -- `-m / --add-metrics`: If indicated, add a `Metrics` tab with the project metrics to the - spreadsheet. -- `-n / --not-incremental`: If indicated, collect data from scratch instead of doing it - incrementally over the existing data. -- `-t / --token`: GitHub token to use. If not given, it will be requested in a prompt. -- `-l / --logfile LOGFILE`: Write logs to the indicated logfile. -- `-v / --verbose`: Be more verbose. 
- -```bash -$ gitmetrics github -p sdv-dev -c config.yaml -Please input your GitHub Token: -2021-11-12 15:42:43,100 - INFO - Getting information for repository sdv-dev/RDT -100%|███████████████████████████████████████████████████████████████| 143/143 [00:00<00:00, 300.87it/s] -100%|███████████████████████████████████████████████████████████████| 182/182 [00:00<00:00, 324.25it/s] -100%|███████████████████████████████████████████████████████████████| 37/37 [00:00<00:00, 88276.02it/s] -2021-11-12 15:42:45,862 - INFO - Getting information for repository sdv-dev/SDV -100%|███████████████████████████████████████████████████████████████| 389/389 [00:01<00:00, 203.20it/s] -100%|███████████████████████████████████████████████████████████████| 219/219 [00:00<00:00, 228.34it/s] -100%|███████████████████████████████████████████████████████████████| 561/561 [00:03<00:00, 152.64it/s] -2021-11-12 15:42:54,465 - INFO - Getting information for repository sdv-dev/CTGAN -100%|███████████████████████████████████████████████████████████████| 113/113 [00:00<00:00, 283.67it/s] -100%|██████████████████████████████████████████████████████████████| 64/64 [00:00<00:00, 134486.70it/s] -100%|███████████████████████████████████████████████████████████████| 498/498 [00:02<00:00, 179.84it/s] -2021-11-12 15:42:59,545 - INFO - Getting information for repository sdv-dev/Copulas -100%|███████████████████████████████████████████████████████████████| 138/138 [00:00<00:00, 318.99it/s] -100%|███████████████████████████████████████████████████████████████| 143/143 [00:00<00:00, 303.94it/s] -100%|███████████████████████████████████████████████████████████████| 245/245 [00:01<00:00, 170.51it/s] -2021-11-12 15:43:04,178 - INFO - Getting 164 missing users - 99%|██████████████████████████████████████████████████████████████▌| 163/164 [00:01<00:00, 110.06it/s] -2021-11-12 15:43:05,688 - INFO - Creating file github-metrics-sdv-dev-2021-11-12.xlsx -``` +**GitMetrics** extracts metrics from GitHub Projects, 
generating spreadsheets with repository analytics. ## Output -The result is a spreadsheet that will contain 5 tabs: +The result is a spreadsheet that will contain 5 tabs (for each given project): - **Issues**: Where all the issues are listed, including data about @@ -139,28 +29,73 @@ The result is a spreadsheet that will contain 5 tabs: Where the unique users that stargazed the repositories are listed with all the information existing in their profile -Optionally, and additional spreadsheet called **Metrics** will be created with the +Optionally, an additional spreadsheet called **Metrics** will be created with the aggregation metrics for the entire project. -## Google Drive Integration -GitMetrics is capable of reading and writing results in Google Spreadsheets. +# Install +Install gitmetrics using pip: +```shell +pip install git+ssh://git@github.com/datacebo/gitmetrics +``` + +## Local Usage +Collect metrics from GitHub by running `gitmetrics` on your computer. You need to provide the following: + +1. A GitHub Token. Documentation about how to create a Personal Access Token can be found + [here](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token) +2. A list of GitHub Repositories for which to collect the metrics, defined in a YAML file. The repositories need to be given as `{org-name}/{repo-name}`, (e.g. `sdv-dev/SDV`). See [daily.yaml](./daily.yaml) for an example. +3. (__Optional__) A filename where the output will be stored. If a name containing the `.xlsx` + extension is given (like `path/to/my-filename.xlsx`), it will be used as provided. + Otherwise, a filename will be created as `github-metrics-{name}-{today}.xlsx` within + the same folder where the script is run. + - For example, if `sdv` is passed as the name, + and the script is run on November, 9th, 2021, the output file will be + `github-metrics-sdv-2021-11-09.xlsx`. 
+ +You can run gitmetrics with the following CLI command: -For this to work, the following things are required: +```shell +gitmetrics collect --token {GITHUB_TOKEN} --add-metrics --config-file daily.yaml --output-folder {OUTPUT_FOLDER} +``` + +## Google Drive Integration + +GitMetrics is capable of reading and writing results in Google Spreadsheets. The following is required: 1. The `output_path` needs to be given as a Google Drive path with the following format: - `gdrive://<folder-id>/<filename>`. For example: `gdrive://1OPhUPTFWN994QnbcrSojQ9Egf9s7MuHV/sdv-dev` + `gdrive://<folder-id>/<filename>`. 2. A set of Google Drive Credentials need to be provided in the format required by `PyDrive`. The credentials can be stored in a `credentials.json` file within the working directory, alongside the corresponding `settings.yaml` file, or passed via the `PYDRIVE_CREDENTIALS` environment variable. - -# GitMetrics Configuration - -The GitMetrics script can be configured using a YAML file that indicates which repositories -to collect and where to store the collected data, as well as when to execute the collection -of data using GitHub Actions. - -For more details about how to configure this, check the [CONFIGURATION.md](CONFIGURATION.md) -document. + - See [instructions from PyDrive](https://pythonhosted.org/PyDrive/quickstart.html). + +## Workflows +1. **Weekly Collection**: On a weekly basis, this workflow collects GitHub metrics for the repositories defined in [weekly.yaml](./weekly.yaml). +2. **Daily Collection**: On a daily basis, this workflow collects GitHub metrics for the repositories defined in [daily.yaml](./daily.yaml). +3. **Daily Summarize**: On a daily basis, this workflow summarizes the GitHub metrics (from the daily collection). The summarized data is published to a GitHub repo: [GitHub_Summary.xlsx](https://github.com/sdv-dev/sdv-dev.github.io/blob/gatsby-home/assets/GitHub_Summary.xlsx) + +--- + + +
+
+ +[The Synthetic Data Vault Project](https://sdv.dev) was first created at MIT's [Data to AI Lab]( +https://dai.lids.mit.edu/) in 2016. After 4 years of research and traction with enterprise, we +created [DataCebo](https://datacebo.com) in 2020 with the goal of growing the project. +Today, DataCebo is the proud developer of SDV, the largest ecosystem for +synthetic data generation & evaluation. It is home to multiple libraries that support synthetic +data, including: + +* 🔄 Data discovery & transformation. Reverse the transforms to reproduce realistic data. +* 🧠 Multiple machine learning models -- ranging from Copulas to Deep Learning -- to create tabular, + multi table and time series data. +* 📊 Measuring quality and privacy of synthetic data, and comparing different synthetic data generation models. \ No newline at end of file diff --git a/config.yaml b/config.yaml index 1d4c5fe..795ceb0 100644 --- a/config.yaml +++ b/config.yaml @@ -1,4 +1,3 @@ -output_folder: gdrive://1THOw8GIdnF2tTwKfj6dyDA9iS_HHd6XV projects: sdv-dev: - sdv-dev/SDV diff --git a/daily.yaml b/daily.yaml index 1e2f4a1..9eeb1b9 100644 --- a/daily.yaml +++ b/daily.yaml @@ -1,4 +1,3 @@ -output_folder: gdrive://1ZvsuVbFAUk3BN-n6Pv_lUBLwviHZSxM2 import_config: config.yaml projects: sdv-dev: diff --git a/gitmetrics/__main__.py b/gitmetrics/__main__.py index 8634469..963d297 100644 --- a/gitmetrics/__main__.py +++ b/gitmetrics/__main__.py @@ -78,12 +78,10 @@ def _collect(args, parser): projects[project] = config_projects[project] - output_folder = args.output_folder or config.get('output_folder', '.') - collect_projects( token=token, projects=projects, - output_folder=output_folder, + output_folder=args.output_folder, quiet=args.quiet, incremental=args.incremental, add_metrics=args.add_metrics, @@ -118,12 +116,10 @@ def _traffic_collection(args, parser): projects[project] = config_projects[project] - output_folder = args.output_folder or config.get('output_folder', '.') - collect_traffic( token=token, projects=projects, 
- output_folder=output_folder, + output_folder=args.output_folder, ) @@ -143,12 +139,11 @@ def _summarize(args, parser): def _consolidate(args, parser): config = _load_config(args.config_file) - output_folder = args.output_folder or config.get('output_folder', '.') projects = config['projects'] consolidate_metrics( projects=projects, - output_folder=output_folder, + output_folder=args.output_folder, dry_run=args.dry_run, verbose=args.verbose, ) @@ -185,8 +180,8 @@ def _get_parser(): '-o', '--output-folder', type=str, - required=False, - help='Output folder path. Defaults to output folder in config-file.', + required=True, + help='Output folder path.', ) collect.add_argument('-t', '--token', type=str, required=False, help='GitHub Token to use.') collect.add_argument( @@ -239,8 +234,8 @@ def _get_parser(): '-o', '--output-folder', type=str, - required=False, - help='Output folder path. Defaults to output folder in config-file.', + required=True, + help='Output folder path.', ) # Traffic @@ -258,7 +253,7 @@ def _get_parser(): help='Path to the configuration file.', ) traffic.add_argument( - '-o', '--output-folder', type=str, required=False, help='Output folder path.' + '-o', '--output-folder', type=str, required=True, help='Output folder path.' 
) traffic.add_argument( '-p', @@ -286,7 +281,7 @@ def _get_parser(): '--input-folder', type=str, required=True, - help='Path to the folder containing xslx files, with the calculated GitHub metrics.', + help='Path to the folder containing xlsx files, with the calculated GitHub metrics.', ) summarize.add_argument( '-d', diff --git a/summarize_config.yaml b/summarize_config.yaml index 71205c1..02613c9 100644 --- a/summarize_config.yaml +++ b/summarize_config.yaml @@ -1,4 +1,3 @@ -output-folder: gdrive://1ZvsuVbFAUk3BN-n6Pv_lUBLwviHZSxM2 projects: - ecosystem: "sdv" base_project: "sdv" diff --git a/traffic_config.yaml b/traffic_config.yaml index 0bce55b..5577df3 100644 --- a/traffic_config.yaml +++ b/traffic_config.yaml @@ -1,4 +1,3 @@ -output_folder: gdrive://17PsWi_gDy55Ofz5QXFQtCEyBcM3v3BDJ projects: sdv-dev: - sdv-dev/SDV diff --git a/weekly.yaml b/weekly.yaml index 9234cb9..0ea2c96 100644 --- a/weekly.yaml +++ b/weekly.yaml @@ -1,4 +1,3 @@ -output_folder: gdrive://1MxHzo-QmxnypvekJTPZeSy4mQi4pAID2 import_config: config.yaml projects: PyTorchLightning: