diff --git a/.github/workflows/daily_collection.yaml b/.github/workflows/daily_collection.yaml
index 0de6ec7..28ad27a 100644
--- a/.github/workflows/daily_collection.yaml
+++ b/.github/workflows/daily_collection.yaml
@@ -13,6 +13,7 @@ on:
jobs:
daily_github_collection:
+ environment: daily
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
@@ -21,16 +22,18 @@ jobs:
with:
enable-cache: true
activate-environment: true
- - name: Install pip and dependencies
+ - name: Install dependencies
run: |
- uv pip install -U pip
uv pip install .
- name: Collect GitHub Data
run: |
- uv run gitmetrics collect -v -q -t ${{ secrets.GITHUB_TOKEN }} -m -c daily.yaml
+ uv run gitmetrics collect -q \
+ --add-metrics \
+ --config-file daily.yaml \
+ --token ${{ secrets.GITHUB_TOKEN }} \
+ --output-folder ${{ secrets.OUTPUT_FOLDER }}
env:
PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}
-
alert:
needs: [daily_github_collection]
runs-on: ubuntu-latest
@@ -42,9 +45,8 @@ jobs:
with:
enable-cache: true
activate-environment: true
- - name: Install pip and dependencies
+ - name: Install dependencies
run: |
- uv pip install -U pip
uv pip install .[dev]
- name: Slack alert if failure
run: python -m gitmetrics.slack_utils -r ${{ github.run_id }} -c ${{ github.event.inputs.slack_channel || 'sdv-alerts' }}
diff --git a/.github/workflows/daily_summarize.yaml b/.github/workflows/daily_summarize.yaml
index 43088dc..59a575c 100644
--- a/.github/workflows/daily_summarize.yaml
+++ b/.github/workflows/daily_summarize.yaml
@@ -13,6 +13,7 @@ on:
jobs:
summarize:
+ environment: daily
runs-on: ubuntu-latest
timeout-minutes: 5
steps:
@@ -22,14 +23,13 @@ jobs:
with:
enable-cache: true
activate-environment: true
- - name: Install pip and dependencies
+ - name: Install dependencies
run: |
- uv pip install -U pip
uv pip install .
- name: Run Summarize
run: |
uv run gitmetrics summarize \
- --input-folder gdrive://1ZvsuVbFAUk3BN-n6Pv_lUBLwviHZSxM2
+ --input-folder ${{ secrets.OUTPUT_FOLDER }}
env:
PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}
- uses: actions/checkout@v4
@@ -60,9 +60,8 @@ jobs:
with:
enable-cache: true
activate-environment: true
- - name: Install pip and dependencies
+ - name: Install dependencies
run: |
- uv pip install -U pip
uv pip install .[dev]
- name: Slack alert if failure
run: |
diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml
index 8883312..f9e4f1d 100644
--- a/.github/workflows/lint.yaml
+++ b/.github/workflows/lint.yaml
@@ -18,9 +18,8 @@ jobs:
with:
enable-cache: true
activate-environment: true
- - name: Install pip and dependencies
+ - name: Install dependencies
run: |
- uv pip install -U pip
uv pip install .[dev]
- name: Run lint checks
run: uv run invoke lint
\ No newline at end of file
diff --git a/.github/workflows/traffic_collection.yaml b/.github/workflows/traffic_collection.yaml
index 8cb06a8..1717a4c 100644
--- a/.github/workflows/traffic_collection.yaml
+++ b/.github/workflows/traffic_collection.yaml
@@ -13,6 +13,7 @@ on:
jobs:
daily_traffic_collection:
+ environment: traffic
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
@@ -21,13 +22,15 @@ jobs:
with:
enable-cache: true
activate-environment: true
- - name: Install pip and dependencies
+ - name: Install dependencies
run: |
- uv pip install -U pip
uv pip install .
- name: Collect GitHub Traffic Data
run: |
- uv run gitmetrics traffic -v -t ${{ secrets.GH_TRAFFIC_TOKEN }} -c traffic_config.yaml
+ uv run gitmetrics traffic \
+ --config-file traffic_config.yaml \
+ --token ${{ secrets.GH_TRAFFIC_TOKEN }} \
+ --output-folder ${{ secrets.OUTPUT_FOLDER }}
env:
PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}
alert:
@@ -41,9 +44,8 @@ jobs:
with:
enable-cache: true
activate-environment: true
- - name: Install pip and dependencies
+ - name: Install dependencies
run: |
- uv pip install -U pip
uv pip install .[dev]
- name: Slack alert if failure
run: python -m gitmetrics.slack_utils -r ${{ github.run_id }} -c ${{ github.event.inputs.slack_channel || 'sdv-alerts' }}
diff --git a/.github/workflows/weekly_collection.yaml b/.github/workflows/weekly_collection.yaml
index 8d552b5..da2bbb3 100644
--- a/.github/workflows/weekly_collection.yaml
+++ b/.github/workflows/weekly_collection.yaml
@@ -13,6 +13,7 @@ on:
jobs:
weekly_github_collection:
+ environment: weekly
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
@@ -21,18 +22,23 @@ jobs:
with:
enable-cache: true
activate-environment: true
- - name: Install pip and dependencies
+ - name: Install dependencies
run: |
- uv pip install -U pip
uv pip install .
- name: Collect GitHub Data
run: |
- uv run gitmetrics collect -v -q -t ${{ secrets.GITHUB_TOKEN }} -m -c weekly.yaml
+ uv run gitmetrics collect -q \
+ --add-metrics \
+ --config-file weekly.yaml \
+ --token ${{ secrets.GITHUB_TOKEN }} \
+ --output-folder ${{ secrets.OUTPUT_FOLDER }}
env:
PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}
- name: Consolidate GitHub Data
run: |
- uv run gitmetrics consolidate -v -c weekly.yaml
+ uv run gitmetrics consolidate \
+ --config-file weekly.yaml \
+ --output-folder ${{ secrets.OUTPUT_FOLDER }}
env:
PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}
alert:
@@ -46,9 +52,8 @@ jobs:
with:
enable-cache: true
activate-environment: true
- - name: Install pip and dependencies
+ - name: Install dependencies
run: |
- uv pip install -U pip
uv pip install .[dev]
- name: Slack alert if failure
run: python -m gitmetrics.slack_utils -r ${{ github.run_id }} -c ${{ github.event.inputs.slack_channel || 'sdv-alerts' }}
diff --git a/README.md b/README.md
index e1bf817..3744fa5 100644
--- a/README.md
+++ b/README.md
@@ -1,127 +1,17 @@
-# GitMetrics
-
-Scripts to extract multiple metrics from GitHub Projects.
-
-## Install
-
-```bash
-pip install git+ssh://git@github.com/datacebo/gitmetrics
-```
-
-### Development
-
-For development, clone the repository and install `dev-requirements.txt`:
-
-```bash
-git clone git@github.com:datacebo/gitmetrics
-cd gitmetrics
-pip install -e .[test,dev]
-```
+
+
+
+ This repository is part of The Synthetic Data Vault Project, a project from DataCebo.
+
+
-# Local Usage
-
-To collect metrics from GitHub by running `gitmetrics` on your computer you need to provide:
-
-1. A GitHub Token. Documentation about how to create a Personal Access Token can be found
- [here](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token)
-2. A list of GitHub Repositories for which to collect the metrics. The repositories need
- to be given as `{org-name}/{repo-name}`, like `sdv-dev/SDV`.
-3. (Optional) A filename where the output will be stored. If a name containing the `.xlsx`
- extension is given (like `path/to/my-filename.xlsx`), it will be used as provided.
- Otherwise, a filename will be created as `github-metrics-{name}-{today}.xlsx` within
- the same folder where the script is run. For example, if `sdv` is passed as the name,
- and the script is run on November, 9th, 2021, the output file will be
- `github-metrics-sdv-2021-11-09.xlsx`.
-
-## Python Interface
-
-In order to run the collection script from python, the `collect_project_metrics` function
-needs to be imported from the `gitmetrics` package and executed passing the values
-indicated above.
-
-**NOTE**: For detailed output, logging must be enabled as shown in the example below.
-
-```python3
->>> import logging
->>> logging.basicConfig(level=logging.INFO)
->>> from gitmetrics import collect_project_metrics
->>> repositories = ['sdv-dev/RDT', 'sdv-dev/SDV', 'sdv-dev/Copulas', 'sdv-dev/CTGAN']
->>> output_name = 'sdv-dev'
->>> token = '
'
->>> collect_project_metrics(token, repositories, output_name)
-INFO:gitmetrics.main:Getting information for repository sdv-dev/RDT
-100%|███████████████████████████████████████████████████████████████| 143/143 [00:00<00:00, 195.00it/s]
-100%|███████████████████████████████████████████████████████████████| 182/182 [00:00<00:00, 364.64it/s]
-100%|███████████████████████████████████████████████████████████████| 37/37 [00:00<00:00, 91020.09it/s]
-INFO:gitmetrics.main:Getting information for repository sdv-dev/SDV
-100%|███████████████████████████████████████████████████████████████| 389/389 [00:02<00:00, 193.20it/s]
-100%|███████████████████████████████████████████████████████████████| 219/219 [00:00<00:00, 231.17it/s]
-100%|███████████████████████████████████████████████████████████████| 561/561 [00:03<00:00, 158.39it/s]
-INFO:gitmetrics.main:Getting information for repository sdv-dev/Copulas
-100%|███████████████████████████████████████████████████████████████| 138/138 [00:00<00:00, 333.27it/s]
-100%|███████████████████████████████████████████████████████████████| 143/143 [00:00<00:00, 287.29it/s]
-100%|███████████████████████████████████████████████████████████████| 245/245 [00:01<00:00, 204.88it/s]
-INFO:gitmetrics.main:Getting information for repository sdv-dev/CTGAN
-100%|███████████████████████████████████████████████████████████████| 113/113 [00:00<00:00, 287.26it/s]
-100%|██████████████████████████████████████████████████████████████| 64/64 [00:00<00:00, 134824.44it/s]
-100%|███████████████████████████████████████████████████████████████| 498/498 [00:02<00:00, 171.11it/s]
-INFO:gitmetrics.main:Getting 164 missing users
- 99%|██████████████████████████████████████████████████████████████▌| 163/164 [00:01<00:00, 121.99it/s]
-INFO:gitmetrics.output:Creating file github-metrics-sdv-dev-2021-11-12.xlsx
-```
+# GitMetrics
-
-## Command Line Interface
-
-In order to run the collection script from the command line, the `gitmetrics collect` command
-must be called passing the following optional arguments:
-
-- `-c / --config-file CONFIG_FILE`: Path to the config file to use. Defaults to `config.yaml`.
- Format of the `config.yaml` file is documented below.
-- `-o / --output-folder OUTPUT_FILDER`: Path to the folder in which spreadsheets will be created.
- Defaults to the value given in the config file, or to `'.'` if there is none, and supports
- `gdrive://` format for Google Drive folders.
-- `-p / --projects PROJECT [PROJECT [PROJECT...]]`: Names of the projects to pull. These will be
- used to search for repository lists inside the config file. If not given, defaults to all the
- projects found in the config file.
-- `-r / --repositories REPOSITORY [REPOSITORY [REPOSITORY...]]`: Optional, list of repositories
- to extract for the indicated project. If this is given, one and only one `project` must be
- passed, which will be used as the name for the output spreadsheet.
-- `-m / --add-metrics`: If indicated, add a `Metrics` tab with the project metrics to the
- spreadsheet.
-- `-n / --not-incremental`: If indicated, collect data from scratch instead of doing it
- incrementally over the existing data.
-- `-t / --token`: GitHub token to use. If not given, it will be requested in a prompt.
-- `-l / --logfile LOGFILE`: Write logs to the indicated logfile.
-- `-v / --verbose`: Be more verbose.
-
-```bash
-$ gitmetrics github -p sdv-dev -c config.yaml
-Please input your GitHub Token:
-2021-11-12 15:42:43,100 - INFO - Getting information for repository sdv-dev/RDT
-100%|███████████████████████████████████████████████████████████████| 143/143 [00:00<00:00, 300.87it/s]
-100%|███████████████████████████████████████████████████████████████| 182/182 [00:00<00:00, 324.25it/s]
-100%|███████████████████████████████████████████████████████████████| 37/37 [00:00<00:00, 88276.02it/s]
-2021-11-12 15:42:45,862 - INFO - Getting information for repository sdv-dev/SDV
-100%|███████████████████████████████████████████████████████████████| 389/389 [00:01<00:00, 203.20it/s]
-100%|███████████████████████████████████████████████████████████████| 219/219 [00:00<00:00, 228.34it/s]
-100%|███████████████████████████████████████████████████████████████| 561/561 [00:03<00:00, 152.64it/s]
-2021-11-12 15:42:54,465 - INFO - Getting information for repository sdv-dev/CTGAN
-100%|███████████████████████████████████████████████████████████████| 113/113 [00:00<00:00, 283.67it/s]
-100%|██████████████████████████████████████████████████████████████| 64/64 [00:00<00:00, 134486.70it/s]
-100%|███████████████████████████████████████████████████████████████| 498/498 [00:02<00:00, 179.84it/s]
-2021-11-12 15:42:59,545 - INFO - Getting information for repository sdv-dev/Copulas
-100%|███████████████████████████████████████████████████████████████| 138/138 [00:00<00:00, 318.99it/s]
-100%|███████████████████████████████████████████████████████████████| 143/143 [00:00<00:00, 303.94it/s]
-100%|███████████████████████████████████████████████████████████████| 245/245 [00:01<00:00, 170.51it/s]
-2021-11-12 15:43:04,178 - INFO - Getting 164 missing users
- 99%|██████████████████████████████████████████████████████████████▌| 163/164 [00:01<00:00, 110.06it/s]
-2021-11-12 15:43:05,688 - INFO - Creating file github-metrics-sdv-dev-2021-11-12.xlsx
-```
+**GitMetrics** extracts metrics from GitHub Projects, generating spreadsheets with repository analytics.
## Output
-The result is a spreadsheet that will contain 5 tabs:
+The result is a spreadsheet that will contain 5 tabs (for each given project):
- **Issues**:
Where all the issues are listed, including data about
@@ -139,28 +29,73 @@ The result is a spreadsheet that will contain 5 tabs:
Where the unique users that stargazed the repositories
are listed with all the information existing in their profile
-Optionally, and additional spreadsheet called **Metrics** will be created with the
+Optionally, an additional tab called **Metrics** will be created with the
aggregation metrics for the entire project.
-## Google Drive Integration
-GitMetrics is capable of reading and writing results in Google Spreadsheets.
+# Install
+Install gitmetrics using pip:
+```shell
+pip install git+ssh://git@github.com/datacebo/gitmetrics
+```
+
+## Local Usage
+Collect metrics from GitHub by running `gitmetrics` on your computer. You need to provide the following:
+
+1. A GitHub Token. Documentation about how to create a Personal Access Token can be found
+ [here](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token)
+2. A list of GitHub Repositories for which to collect the metrics, defined in a YAML file. The repositories need to be given as `{org-name}/{repo-name}` (e.g. `sdv-dev/SDV`). See [daily.yaml](./daily.yaml) for an example.
+3. (__Optional__) A filename where the output will be stored. If a name containing the `.xlsx`
+ extension is given (like `path/to/my-filename.xlsx`), it will be used as provided.
+ Otherwise, a filename will be created as `github-metrics-{name}-{today}.xlsx` within
+ the same folder where the script is run.
+ - For example, if `sdv` is passed as the name,
+ and the script is run on November 9th, 2021, the output file will be
+ `github-metrics-sdv-2021-11-09.xlsx`.
+
+You can run gitmetrics with the following CLI command:
-For this to work, the following things are required:
+```shell
+gitmetrics collect --token {GITHUB_TOKEN} --add-metrics --config-file daily.yaml --output-folder {OUTPUT_FOLDER}
+```
+
+## Google Drive Integration
+
+GitMetrics is capable of reading and writing results in Google Spreadsheets. The following is required:
1. The `output_path` needs to be given as a Google Drive path with the following format:
- `gdrive:///`. For example: `gdrive://1OPhUPTFWN994QnbcrSojQ9Egf9s7MuHV/sdv-dev`
+ `gdrive://<folder-id>/<filename>`.
2. A set of Google Drive Credentials need to be provided in the format required by `PyDrive`. The
credentials can be stored in a `credentials.json` file within the working directory, alongside
the corresponding `settings.yaml` file, or passed via the `PYDRIVE_CREDENTIALS` environment
variable.
-
-# GitMetrics Configuration
-
-The GitMetrics script can be configured using a YAML file that indicates which repositories
-to collect and where to store the collected data, as well as when to execute the collection
-of data using GitHub Actions.
-
-For more details about how to configure this, check the [CONFIGURATION.md](CONFIGURATION.md)
-document.
+ - See [instructions from PyDrive](https://pythonhosted.org/PyDrive/quickstart.html).
+
+## Workflows
+1. **Weekly Collection**: On a weekly basis, this workflow collects GitHub metrics for the repositories defined in [weekly.yaml](./weekly.yaml).
+2. **Daily Collection**: On a daily basis, this workflow collects GitHub metrics for the repositories defined in [daily.yaml](./daily.yaml).
+3. **Daily Summarize**: On a daily basis, this workflow summarizes the GitHub metrics (from the daily collection). The summarized data is published to a GitHub repo: [GitHub_Summary.xlsx](https://github.com/sdv-dev/sdv-dev.github.io/blob/gatsby-home/assets/GitHub_Summary.xlsx).
+
+---
+
+
+
+
+
+[The Synthetic Data Vault Project](https://sdv.dev) was first created at MIT's [Data to AI Lab](
+https://dai.lids.mit.edu/) in 2016. After 4 years of research and traction with enterprise, we
+created [DataCebo](https://datacebo.com) in 2020 with the goal of growing the project.
+Today, DataCebo is the proud developer of SDV, the largest ecosystem for
+synthetic data generation & evaluation. It is home to multiple libraries that support synthetic
+data, including:
+
+* 🔄 Data discovery & transformation. Reverse the transforms to reproduce realistic data.
+* 🧠 Multiple machine learning models -- ranging from Copulas to Deep Learning -- to create tabular,
+ multi table and time series data.
+* 📊 Measuring quality and privacy of synthetic data, and comparing different synthetic data
\ No newline at end of file
diff --git a/config.yaml b/config.yaml
index 1d4c5fe..795ceb0 100644
--- a/config.yaml
+++ b/config.yaml
@@ -1,4 +1,3 @@
-output_folder: gdrive://1THOw8GIdnF2tTwKfj6dyDA9iS_HHd6XV
projects:
sdv-dev:
- sdv-dev/SDV
diff --git a/daily.yaml b/daily.yaml
index 1e2f4a1..9eeb1b9 100644
--- a/daily.yaml
+++ b/daily.yaml
@@ -1,4 +1,3 @@
-output_folder: gdrive://1ZvsuVbFAUk3BN-n6Pv_lUBLwviHZSxM2
import_config: config.yaml
projects:
sdv-dev:
diff --git a/gitmetrics/__main__.py b/gitmetrics/__main__.py
index 8634469..963d297 100644
--- a/gitmetrics/__main__.py
+++ b/gitmetrics/__main__.py
@@ -78,12 +78,10 @@ def _collect(args, parser):
projects[project] = config_projects[project]
- output_folder = args.output_folder or config.get('output_folder', '.')
-
collect_projects(
token=token,
projects=projects,
- output_folder=output_folder,
+ output_folder=args.output_folder,
quiet=args.quiet,
incremental=args.incremental,
add_metrics=args.add_metrics,
@@ -118,12 +116,10 @@ def _traffic_collection(args, parser):
projects[project] = config_projects[project]
- output_folder = args.output_folder or config.get('output_folder', '.')
-
collect_traffic(
token=token,
projects=projects,
- output_folder=output_folder,
+ output_folder=args.output_folder,
)
@@ -143,12 +139,11 @@ def _summarize(args, parser):
def _consolidate(args, parser):
config = _load_config(args.config_file)
- output_folder = args.output_folder or config.get('output_folder', '.')
projects = config['projects']
consolidate_metrics(
projects=projects,
- output_folder=output_folder,
+ output_folder=args.output_folder,
dry_run=args.dry_run,
verbose=args.verbose,
)
@@ -185,8 +180,8 @@ def _get_parser():
'-o',
'--output-folder',
type=str,
- required=False,
- help='Output folder path. Defaults to output folder in config-file.',
+ required=True,
+ help='Output folder path.',
)
collect.add_argument('-t', '--token', type=str, required=False, help='GitHub Token to use.')
collect.add_argument(
@@ -239,8 +234,8 @@ def _get_parser():
'-o',
'--output-folder',
type=str,
- required=False,
- help='Output folder path. Defaults to output folder in config-file.',
+ required=True,
+ help='Output folder path.',
)
# Traffic
@@ -258,7 +253,7 @@ def _get_parser():
help='Path to the configuration file.',
)
traffic.add_argument(
- '-o', '--output-folder', type=str, required=False, help='Output folder path.'
+ '-o', '--output-folder', type=str, required=True, help='Output folder path.'
)
traffic.add_argument(
'-p',
@@ -286,7 +281,7 @@ def _get_parser():
'--input-folder',
type=str,
required=True,
- help='Path to the folder containing xslx files, with the calculated GitHub metrics.',
+ help='Path to the folder containing xlsx files, with the calculated GitHub metrics.',
)
summarize.add_argument(
'-d',
diff --git a/summarize_config.yaml b/summarize_config.yaml
index 71205c1..02613c9 100644
--- a/summarize_config.yaml
+++ b/summarize_config.yaml
@@ -1,4 +1,3 @@
-output-folder: gdrive://1ZvsuVbFAUk3BN-n6Pv_lUBLwviHZSxM2
projects:
- ecosystem: "sdv"
base_project: "sdv"
diff --git a/traffic_config.yaml b/traffic_config.yaml
index 0bce55b..5577df3 100644
--- a/traffic_config.yaml
+++ b/traffic_config.yaml
@@ -1,4 +1,3 @@
-output_folder: gdrive://17PsWi_gDy55Ofz5QXFQtCEyBcM3v3BDJ
projects:
sdv-dev:
- sdv-dev/SDV
diff --git a/weekly.yaml b/weekly.yaml
index 9234cb9..0ea2c96 100644
--- a/weekly.yaml
+++ b/weekly.yaml
@@ -1,4 +1,3 @@
-output_folder: gdrive://1MxHzo-QmxnypvekJTPZeSy4mQi4pAID2
import_config: config.yaml
projects:
PyTorchLightning: