diff --git a/.github/workflows/daily.yaml b/.github/workflows/daily.yaml
index cf43f27..55dd278 100644
--- a/.github/workflows/daily.yaml
+++ b/.github/workflows/daily.yaml
@@ -23,3 +23,4 @@ jobs:
           github-analytics collect -v -q -t ${{ secrets.GITHUB_TOKEN }} -m -c daily.yaml
         env:
           PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}
+
diff --git a/.github/workflows/traffic_collection.yaml b/.github/workflows/traffic_collection.yaml
new file mode 100644
index 0000000..fa3d0ac
--- /dev/null
+++ b/.github/workflows/traffic_collection.yaml
@@ -0,0 +1,49 @@
+name: Biweekly Traffic Collection
+
+on:
+  workflow_dispatch:
+    inputs:
+      slack_channel:
+        description: Slack channel to post the error message to if the build fails.
+        required: false
+        default: "sdv-alerts-debug"
+
+  schedule:
+    - cron: "0 0 */14 * *" # Runs at midnight UTC on days 1, 15 and 29 of each month
+
+jobs:
+  collect_traffic:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python 3.13
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.13'
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install .
+      - name: Collect Github Traffic Data
+        run: |
+          github-analytics traffic -v -t ${{ secrets.PERSONAL_ACCESS_TOKEN }} -c traffic_config.yaml
+        env:
+          PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}
+  alert:
+    needs: [collect_traffic]
+    runs-on: ubuntu-latest
+    if: failure()
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.13'
+      - name: Install slack dependencies
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install invoke
+          python -m pip install -e .[dev]
+      - name: Slack alert if failure
+        run: python -m github_analytics.slack_utils -r ${{ github.run_id }} -c ${{ github.event.inputs.slack_channel || 'sdv-alerts' }}
+        env:
+          SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
diff --git a/github_analytics/__main__.py b/github_analytics/__main__.py
index 24aaa91..0ae90a2 100644
--- a/github_analytics/__main__.py
+++ b/github_analytics/__main__.py
@@ -9,7 +9,7 @@
 
 import yaml
 
-from github_analytics.main import collect_projects
+from github_analytics.main import collect_projects, collect_traffic
 
 LOGGER = logging.getLogger(__name__)
 
@@ -87,6 +87,43 @@ def _collect(args, parser):
     )
 
 
+def _traffic_collection(args, parser):
+    token = args.token or os.getenv('GITHUB_TOKEN')
+    if token is None:
+        token = input('Please input your Github Token: ')
+
+    config = _load_config(args.config_file)
+    config_projects = config['projects']
+
+    projects = {}
+    if args.repositories:
+        if not args.projects:
+            parser.error('If repositories are given, a project name must be provided.')
+        elif len(args.projects) > 1:
+            parser.error('If repositories are given, only one project name must be provided.')
+
+        projects = {args.projects[0]: args.repositories}
+
+    elif not args.projects:
+        projects = config_projects
+
+    else:
+        for project in args.projects:
+            if project not in config_projects:
+                LOGGER.error('Unknown project %s', project)
+                return
+
+            projects[project] = config_projects[project]
+
+    output_folder = args.output_folder or config.get('output_folder', '.')
+
+    collect_traffic(
+        token=token,
+        projects=projects,
+        output_folder=output_folder,
+    )
+
+
 def _get_parser():
     # Logging
     logging_args = argparse.ArgumentParser(add_help=False)
@@ -151,6 +188,32 @@
         help='Start from scratch instead of incrementing over existing data.',
     )
 
+    # Traffic
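+    # Illustrative invocation of the subparser defined below (the token value
+    # is a placeholder):
+    #   github-analytics traffic -t <GITHUB_TOKEN> -c traffic_config.yaml -p sdv-dev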
+    traffic = action.add_parser(
+        'traffic', help='Collect github traffic metrics.', parents=[logging_args]
+    )
+    traffic.set_defaults(action=_traffic_collection)
+
+    traffic.add_argument('-t', '--token', type=str, required=False, help='Github Token to use.')
+    traffic.add_argument(
+        '-c',
+        '--config-file',
+        type=str,
+        default='traffic_config.yaml',
+        help='Path to the configuration file.',
+    )
+    traffic.add_argument(
+        '-o', '--output-folder', type=str, required=False, help='Output folder path.'
+    )
+    traffic.add_argument(
+        '-p',
+        '--projects',
+        type=str,
+        nargs='*',
+        help='Projects to collect. Defaults to ALL if not given.',
+    )
+    traffic.add_argument('-r', '--repositories', nargs='*', help='List of repositories to collect.')
+
     return parser
diff --git a/github_analytics/drive.py b/github_analytics/drive.py
index 0a136a3..0a0332a 100644
--- a/github_analytics/drive.py
+++ b/github_analytics/drive.py
@@ -15,11 +15,12 @@
 PYDRIVE_CREDENTIALS = 'PYDRIVE_CREDENTIALS'
 
 LOGGER = logging.getLogger(__name__)
+GDRIVE_LINK = 'gdrive://'
 
 
 def is_drive_path(path):
     """Tell if the drive is a Google Drive path or not."""
-    return path.startswith('gdrive://')
+    return path.startswith(GDRIVE_LINK)
 
 
 def split_drive_path(path):
@@ -119,3 +120,43 @@
     drive_file = _find_file(drive, filename, folder)
     drive_file.FetchContent(mimetype=XLSX_MIMETYPE)
     return drive_file.content
+
+
+def get_or_create_gdrive_folder(parent_folder: str, folder_name: str) -> str:
+    """Check if a folder exists in Google Drive, create it if not, and return its ID.
+
+    Args:
+        parent_folder (str):
+            ID of the parent Google Drive folder; a ``gdrive://`` prefix is
+            stripped if present.
+        folder_name (str):
+            Name of the folder to check or create.
+
+    Returns:
+        str:
+            The Google Drive folder ID.
+    """
+    drive = _get_drive_client()
+
+    if parent_folder.startswith(GDRIVE_LINK):
+        parent_folder = parent_folder.replace(GDRIVE_LINK, '')
+
+    # Check if the folder already exists under the parent
+    query = {
+        'q': f"title = '{folder_name}' and mimeType = 'application/vnd.google-apps.folder' "
+        f"and '{parent_folder}' in parents and trashed = false"
+    }
+    folders = drive.ListFile(query).GetList()
+
+    if folders:
+        return folders[0]['id']  # Return the existing folder ID
+
+    # Create the folder if it does not exist
+    folder_metadata = {
+        'title': folder_name,
+        'mimeType': 'application/vnd.google-apps.folder',
+        'parents': [{'id': parent_folder}],
+    }
+    folder = drive.CreateFile(folder_metadata)
+    folder.Upload()
+
+    return folder['id']
diff --git a/github_analytics/github/traffic.py b/github_analytics/github/traffic.py
new file mode 100644
index 0000000..e05fad8
--- /dev/null
+++ b/github_analytics/github/traffic.py
@@ -0,0 +1,197 @@
+"""Traffic client for retrieving GitHub traffic data."""
+
+import logging
+
+import pandas as pd
+import requests
+
+logging.basicConfig(level=logging.INFO)
+LOGGER = logging.getLogger(__name__)
+
+GITHUB_API_URL = 'https://api.github.com'
+
+
+class TrafficClient:
+    """Client to fetch traffic data (referrers, paths, views and clones) for a repository.
+
+    Args:
+        token (str):
+            GitHub personal access token for authentication.
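+
+    Example (illustrative; the token value is a placeholder)::
+
+        client = TrafficClient('<GITHUB_TOKEN>')
+        referrers = client.get_traffic_referrers('sdv-dev/SDV')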
+    """
+
+    def __init__(self, token):
+        self.token = token
+        self.headers = {
+            'Authorization': f'token {token}',
+            'Accept': 'application/vnd.github.v3+json',
+        }
+
+    def _get_traffic_data(self, repo: str, endpoint: str):
+        """Fetch traffic data from GitHub's REST API.
+
+        Args:
+            repo (str):
+                The repository in the format "owner/repo".
+            endpoint (str):
+                The traffic API endpoint (e.g., "popular/referrers", "popular/paths", "views" or
+                "clones").
+
+        Returns:
+            list or dict:
+                The JSON-decoded response containing traffic data.
+
+        Raises:
+            RuntimeError:
+                If the API request fails.
+        """
+        url = f'{GITHUB_API_URL}/repos/{repo}/traffic/{endpoint}'
+        LOGGER.info(f'Fetching traffic data from: {url}')
+
+        response = requests.get(url, headers=self.headers)
+
+        if response.status_code == 200:
+            LOGGER.info(f'Successfully retrieved {endpoint} data for {repo}.')
+            return response.json()
+        else:
+            LOGGER.error(f'GitHub API Error ({response.status_code}): {response.json()}')
+            raise RuntimeError(f'GitHub API Error ({response.status_code}): {response.json()}')
+
+    def get_traffic_referrers(self, repo: str) -> pd.DataFrame:
+        """Fetch the top referring domains that send traffic to the given repository.
+
+        Args:
+            repo (str):
+                The repository in the format "owner/repo".
+
+        Returns:
+            pd.DataFrame:
+                DataFrame containing referrer traffic details with columns:
+                - `site`: Source domain.
+                - `views`: Number of views.
+                - `unique_visitors`: Number of unique visitors.
+        """
+        LOGGER.info(f'Fetching traffic referrers for {repo}.')
+        data = self._get_traffic_data(repo, 'popular/referrers')
+        df = pd.DataFrame(data, columns=['referrer', 'count', 'uniques'])
+        df = df.rename(
+            columns={'referrer': 'site', 'count': 'views', 'uniques': 'unique_visitors'},
+        )
+        LOGGER.info(f'Retrieved {len(df)} referrer records for {repo}.')
+        return df
+
+    def get_traffic_paths(self, repo: str) -> pd.DataFrame:
+        """Fetch the most visited paths in the given repository.
+
+        Args:
+            repo (str):
+                The repository in the format "owner/repo".
+
+        Returns:
+            pd.DataFrame:
+                DataFrame containing popular paths with columns:
+                - `content`: The visited path.
+                - `title`: Page title.
+                - `views`: Number of views.
+                - `unique_visitors`: Number of unique visitors.
+        """
+        LOGGER.info(f'Fetching traffic paths for {repo}.')
+        data = self._get_traffic_data(repo, 'popular/paths')
+        df = pd.DataFrame(data, columns=['path', 'title', 'count', 'uniques'])
+        df = df.rename(
+            columns={'path': 'content', 'count': 'views', 'uniques': 'unique_visitors'},
+        )
+        LOGGER.info(f'Retrieved {len(df)} path records for {repo}.')
+        return df
+
+    def get_traffic_views(self, repo: str) -> pd.DataFrame:
+        """Fetch the number of views for the given repository over time.
+
+        Args:
+            repo (str):
+                The repository in the format "owner/repo".
+
+        Returns:
+            pd.DataFrame:
+                DataFrame containing repository views with columns:
+                - `timestamp`: Date of views.
+                - `views`: Number of views.
+                - `unique_visitors`: Number of unique visitors.
+        """
+        data = self._get_traffic_data(repo, 'views')
+        df = pd.DataFrame(data['views'], columns=['timestamp', 'count', 'uniques'])
+        df = df.rename(columns={'count': 'views', 'uniques': 'unique_visitors'})
+        LOGGER.info(f'Retrieved {len(df)} views for {repo}.')
+        return df
+
+    def get_traffic_clones(self, repo: str) -> pd.DataFrame:
+        """Fetch the number of repository clones over time.
+
+        Args:
+            repo (str):
+                The repository in the format "owner/repo".
+
+        Returns:
+            pd.DataFrame:
+                DataFrame containing repository clones with columns:
+                - `timestamp`: Date of clones.
+                - `clones`: Number of clones.
+                - `unique_cloners`: Number of unique cloners.
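+
+        Note:
+            GitHub's traffic API only returns views and clones for the most
+            recent 14 days, which is why the collection workflow runs biweekly.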
+ """ + data = self._get_traffic_data(repo, 'clones') + df = pd.DataFrame(data['clones'], columns=['timestamp', 'count', 'uniques']) + df = df.rename(columns={'count': 'clones', 'uniques': 'unique_cloners'}) + LOGGER.info(f'Retrieved {len(df)} clones for {repo}.') + return df + + def generate_timeframe(cls, traffic_data): + """Generates a timeframe DataFrame with the start and end timestamps from traffic data. + + Args: + traffic_data (dict[str, pd.DataFrame]): + Dictionary containing traffic data, including "Traffic Visitors" and + "Traffic Git Clones". + + Returns: + pd.DataFrame: + A DataFrame with a single row containing 'Start Date' and 'End Date'. + """ + start_date = None + end_date = None + all_timestamps = [] + + if 'Traffic Visitors' in traffic_data and not traffic_data['Traffic Visitors'].empty: + all_timestamps.extend(traffic_data['Traffic Visitors']['timestamp'].tolist()) + + if 'Traffic Git Clones' in traffic_data and not traffic_data['Traffic Git Clones'].empty: + all_timestamps.extend(traffic_data['Traffic Git Clones']['timestamp'].tolist()) + + if all_timestamps: + start_date = min(all_timestamps) + end_date = max(all_timestamps) + + return pd.DataFrame({'Start Date': [start_date], 'End Date': [end_date]}) + + def get_all_traffic(self, repo: str) -> dict[str, pd.DataFrame]: + """Fetches all available traffic data for the given repository. + + Args: + repo (str): + The repository in the format "owner/repo". + + Returns: + dict[str, pd.DataFrame]: + A dictionary containing traffic data: + - `"referrers"`: DataFrame with referrer traffic. + - `"paths"`: DataFrame with popular paths. + - `"views"`: DataFrame with repository views over time. + - `"clones"`: DataFrame with repository clones over time. + """ + traffic_data = { + 'Traffic Referring Sites': self.get_traffic_referrers(repo), + 'Traffic Popular Content': self.get_traffic_paths(repo), + 'Traffic Visitors': self.get_traffic_views(repo), + 'Traffic Git Clones': self.get_traffic_clones(repo), + } + traffic_data['Timeframe'] = self.generate_timeframe(traffic_data) + return traffic_data diff --git a/github_analytics/main.py b/github_analytics/main.py index 9c15e46..6db7464 100644 --- a/github_analytics/main.py +++ b/github_analytics/main.py @@ -1,18 +1,23 @@ """Main script.""" +import datetime import logging import pathlib import pandas as pd +from github_analytics.drive import get_or_create_gdrive_folder from github_analytics.github.repository import RepositoryClient from github_analytics.github.repository_owner import RepositoryOwnerClient +from github_analytics.github.traffic import TrafficClient from github_analytics.github.users import UsersClient from github_analytics.metrics import compute_metrics from github_analytics.output import create_spreadsheet, load_spreadsheet LOGGER = logging.getLogger(__name__) +GDRIVE_LINK = 'gdrive://' + USER_COLUMNS = [ 'user', 'name', @@ -256,9 +261,66 @@ def collect_projects( raise ValueError('No projects have been passed') for project, repositories in projects.items(): - if output_folder.startswith('gdrive://'): + if output_folder.startswith(GDRIVE_LINK): project_path = f'{output_folder}/{project}' else: project_path = str(pathlib.Path(output_folder) / project) collect_project_metrics(token, repositories, project_path, quiet, incremental, add_metrics) + + +def collect_traffic(token, projects, output_folder): + """Collect github metrics for multiple projects. + + Args: + token (str): + Github token to use. 
+    """
+    timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
+    for project, repositories in projects.items():
+        for repository in repositories:
+            repository_name = repository.split('/')[-1]
+            if output_folder.startswith(GDRIVE_LINK):
+                repo_folder = get_or_create_gdrive_folder(output_folder, repository_name)
+                repo_path = f'{GDRIVE_LINK}{repo_folder}/{timestamp}'
+            else:
+                repo_path = str(pathlib.Path(output_folder) / project / repository_name)
+
+            collect_project_traffic(token, repository, repo_path)
+
+
+def collect_project_traffic(token, repository, repo_path):
+    """Collect traffic data (referrers, paths, views and clones) for a GitHub repository.
+
+    Args:
+        token (str):
+            GitHub token for authentication.
+        repository (str):
+            Repository name such as "owner/repository".
+        repo_path (str):
+            Output path (local or ``gdrive://``) to store the results.
+
+    Returns:
+        dict[str, pd.DataFrame] or None:
+            The collected traffic data if no repo_path is given, otherwise None.
+    """
+    client = TrafficClient(token)
+    try:
+        traffic_data = client.get_all_traffic(repository)
+    except Exception as e:
+        LOGGER.warning(f'Failed to fetch traffic data for {repository}: {e}')
+        return None
+
+    if repo_path:
+        create_spreadsheet(repo_path, traffic_data)
+        return None
+
+    return traffic_data
diff --git a/github_analytics/slack_utils.py b/github_analytics/slack_utils.py
new file mode 100644
index 0000000..ed0ef56
--- /dev/null
+++ b/github_analytics/slack_utils.py
@@ -0,0 +1,98 @@
+"""Utility functions for Slack integration."""
+
+import argparse
+import os
+
+from slack_sdk import WebClient
+
+GITHUB_URL_PREFIX = 'https://github.com/datacebo/github-analytics/actions/runs/'
+DEFAULT_SLACK_CHANNEL = 'sdv-alerts-debug'
+
+
+def _get_slack_client():
+    """Create an authenticated Slack client.
+
+    Returns:
+        WebClient:
+            An authenticated Slack WebClient instance.
+    """
+    token = os.getenv('SLACK_TOKEN')
+    client = WebClient(token=token)
+    return client
+
+
+def post_slack_message(channel, text):
+    """Post a message to a Slack channel.
+
+    Args:
+        channel (str):
+            The name of the channel to post to.
+        text (str):
+            The message to send to the channel.
+
+    Returns:
+        SlackResponse:
+            Response from the Slack API call.
+    """
+    client = _get_slack_client()
+    response = client.chat_postMessage(channel=channel, text=text)
+    if not response['ok']:
+        error = response.get('error', 'unknown_error')
+        msg = f'{error} occurred trying to post message to {channel}'
+        raise RuntimeError(msg)
+
+    return response
+
+
+def post_slack_message_in_thread(channel, text, thread_ts):
+    """Post a message as a threaded reply in a Slack channel.
+
+    Args:
+        channel (str):
+            The name of the channel to post to.
+        text (str):
+            The message to send as a reply in the thread.
+        thread_ts (str):
+            The timestamp of the message that starts the thread.
+
+    Returns:
+        SlackResponse:
+            Response from the Slack API call.
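+
+    Example (illustrative; assumes ``SLACK_TOKEN`` is set in the environment)::
+
+        response = post_slack_message('sdv-alerts-debug', 'Build started.')
+        reply = post_slack_message_in_thread(
+            'sdv-alerts-debug', 'Build finished.', response['ts']
+        )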
+ """ + client = _get_slack_client() + response = client.chat_postMessage(channel=channel, text=text, thread_ts=thread_ts) + if not response['ok']: + error = response.get('error', 'unknown_error') + msg = f'{error} occurred trying to post threaded message to {channel}' + raise RuntimeError(msg) + + return response + + +def send_alert(args): + """Send an alert message to a slack channel.""" + url = GITHUB_URL_PREFIX + args.run_id + message = f'Github Analytics build failed :fire: :dumpster-fire: :fire: See errors <{url}|here>' + post_slack_message(args.channel, message) + + +def get_parser(): + """Get the parser.""" + parser = argparse.ArgumentParser(description='Function to alert when a Github workflow fails.') + parser.add_argument('-r', '--run-id', type=str, help='The id of the github run.') + parser.add_argument( + '-c', + '--channel', + type=str, + help='The slack channel to post to.', + default=DEFAULT_SLACK_CHANNEL, + ) + parser.set_defaults(action=send_alert) + + return parser + + +if __name__ == '__main__': + parser = get_parser() + args = parser.parse_args() + args.action(args) diff --git a/pyproject.toml b/pyproject.toml index 4481180..cbbb73d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,7 +34,8 @@ include = ['github_analytics', 'github_analytics.*'] [project.optional-dependencies] dev = [ "ruff>=0.9.8", - "invoke" + "invoke", + "slack-sdk>=3.34,<4.0", ] [tool.ruff] diff --git a/traffic_config.yaml b/traffic_config.yaml new file mode 100644 index 0000000..0bce55b --- /dev/null +++ b/traffic_config.yaml @@ -0,0 +1,11 @@ +output_folder: gdrive://17PsWi_gDy55Ofz5QXFQtCEyBcM3v3BDJ +projects: + sdv-dev: + - sdv-dev/SDV + - sdv-dev/RDT + - sdv-dev/SDMetrics + - sdv-dev/SDGym + - sdv-dev/Copulas + - sdv-dev/CTGAN + - sdv-dev/TGAN + - sdv-dev/DeepEcho diff --git a/weekly.yaml b/weekly.yaml index 256e87d..86ea0ac 100644 --- a/weekly.yaml +++ b/weekly.yaml @@ -15,7 +15,7 @@ projects: gretel: huggingface: mariadb: - mostly: + mostly-ai: pandas: prefect: pycaret: