Skip to content
Merged
1 change: 1 addition & 0 deletions .github/workflows/daily.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,4 @@ jobs:
github-analytics collect -v -q -t ${{ secrets.GITHUB_TOKEN }} -m -c daily.yaml
env:
PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}

49 changes: 49 additions & 0 deletions .github/workflows/traffic_collection.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Biweekly collection of GitHub traffic metrics via the github-analytics CLI.
# Runs every 14 days and on demand; posts to Slack if the collection job fails.
name: Biweekly Traffic collection

on:
  workflow_dispatch:
    inputs:
      slack_channel:
        description: Slack channel to post the error message to if the builds fail.
        required: false
        default: "sdv-alerts-debug"

  schedule:
    - cron: "0 0 */14 * *" # Runs every 14 days at midnight UTC

jobs:
  collect_traffic:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      # NOTE: this job has no matrix, so the step name no longer references
      # ${{ matrix.python-version }}; the version is pinned below.
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.13'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install .
      - name: Collect Github Traffic Data
        run: |
          github-analytics traffic -v -t ${{ secrets.PERSONAL_ACCESS_TOKEN }} -c traffic_config.yaml
        env:
          PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}

  alert:
    needs: [collect_traffic]
    runs-on: ubuntu-latest
    if: failure()
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.13'
      - name: Install slack dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install invoke
          # A plain (non-editable) install is sufficient in CI.
          python -m pip install .[dev]
      - name: Slack alert if failure
        run: python -m github_analytics.slack_utils -r ${{ github.run_id }} -c ${{ github.event.inputs.slack_channel || 'sdv-alerts' }}
        env:
          SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
65 changes: 64 additions & 1 deletion github_analytics/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

import yaml

from github_analytics.main import collect_projects
from github_analytics.main import collect_projects, collect_traffic

LOGGER = logging.getLogger(__name__)

Expand Down Expand Up @@ -87,6 +87,43 @@ def _collect(args, parser):
)


def _traffic_collection(args, parser):
    """Run the ``traffic`` CLI action.

    Resolves the GitHub token (CLI flag, then ``GITHUB_TOKEN`` env var, then an
    interactive prompt), builds the mapping of projects to repositories from
    the CLI arguments and the configuration file, and hands everything off to
    ``collect_traffic``.

    Args:
        args (argparse.Namespace):
            Parsed CLI arguments for the ``traffic`` subcommand.
        parser (argparse.ArgumentParser):
            Parser used to report argument-combination errors.
    """
    token = args.token or os.getenv('GITHUB_TOKEN')
    if token is None:
        token = input('Please input your Github Token: ')

    config = _load_config(args.config_file)
    known_projects = config['projects']

    if args.repositories:
        # Explicit repositories require exactly one project name to group them under.
        if not args.projects:
            parser.error('If repositories are given, project name must be provided.')
        elif len(args.projects) > 1:
            parser.error('If repositories are given, only one project name must be provided.')

        projects = {args.projects[0]: args.repositories}
    elif not args.projects:
        # No filter given: collect every project defined in the configuration.
        projects = known_projects
    else:
        projects = {}
        for name in args.projects:
            if name not in known_projects:
                LOGGER.error('Unknown project %s', name)
                return

            projects[name] = known_projects[name]

    output_folder = args.output_folder or config.get('output_folder', '.')

    collect_traffic(
        token=token,
        projects=projects,
        output_folder=output_folder,
    )


def _get_parser():
# Logging
logging_args = argparse.ArgumentParser(add_help=False)
Expand Down Expand Up @@ -151,6 +188,32 @@ def _get_parser():
help='Start from scratch instead of incrementing over existing data.',
)

# Traffic
traffic = action.add_parser(
'traffic', help='Collect github traffic metrics.', parents=[logging_args]
)
traffic.set_defaults(action=_traffic_collection)

traffic.add_argument('-t', '--token', type=str, required=False, help='Github Token to use.')
traffic.add_argument(
'-c',
'--config-file',
type=str,
default='traffic_config.yaml',
help='Path to the configuration file.',
)
traffic.add_argument(
'-o', '--output-folder', type=str, required=False, help='Output folder path.'
)
traffic.add_argument(
'-p',
'--projects',
type=str,
nargs='*',
help='Projects to collect. Defaults to ALL if not given',
)
traffic.add_argument('-r', '--repositories', nargs='*', help='List of repositories to add.')

return parser


Expand Down
43 changes: 42 additions & 1 deletion github_analytics/drive.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,12 @@
PYDRIVE_CREDENTIALS = 'PYDRIVE_CREDENTIALS'

LOGGER = logging.getLogger(__name__)
# URI scheme prefix that marks a path as living on Google Drive.
GDRIVE_LINK = 'gdrive://'


def is_drive_path(path):
    """Tell if the given path is a Google Drive path or not.

    Args:
        path (str):
            Path or URI to inspect.

    Returns:
        bool:
            ``True`` if ``path`` starts with the ``gdrive://`` scheme prefix.
    """
    return path.startswith(GDRIVE_LINK)


def split_drive_path(path):
Expand Down Expand Up @@ -119,3 +120,43 @@ def download_spreadsheet(folder, filename):
drive_file = _find_file(drive, filename, folder)
drive_file.FetchContent(mimetype=XLSX_MIMETYPE)
return drive_file.content


def get_or_create_gdrive_folder(parent_folder: str, folder_name: str) -> str:
    """Check if a folder exists in Google Drive, create it if not, and return its ID.

    Args:
        parent_folder (str):
            ID of the parent Google Drive folder, optionally prefixed with
            ``gdrive://``.
        folder_name (str):
            Name of the folder to check or create.

    Returns:
        str:
            The Google Drive folder ID.
    """
    drive = _get_drive_client()

    # Strip only the leading scheme prefix; ``str.replace`` would also remove
    # any accidental occurrence of 'gdrive://' elsewhere in the string.
    if parent_folder.startswith(GDRIVE_LINK):
        parent_folder = parent_folder[len(GDRIVE_LINK):]

    # Single quotes must be backslash-escaped inside Drive query string values,
    # otherwise a folder name like "John's data" breaks (or injects into) the query.
    escaped_name = folder_name.replace("'", "\\'")
    query = {
        'q': f"title = '{escaped_name}' and mimeType = 'application/vnd.google-apps.folder' "
        f"and '{parent_folder}' in parents and trashed = false"
    }
    folders = drive.ListFile(query).GetList()

    if folders:
        return folders[0]['id']  # Return existing folder ID

    # Create the folder since it does not exist yet.
    folder_metadata = {
        'title': folder_name,
        'mimeType': 'application/vnd.google-apps.folder',
        'parents': [{'id': parent_folder}],
    }
    folder = drive.CreateFile(folder_metadata)
    folder.Upload()

    return folder['id']
197 changes: 197 additions & 0 deletions github_analytics/github/traffic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,197 @@
"""Traffic client for retrieving github information."""

import logging

import pandas as pd
import requests

logging.basicConfig(level=logging.INFO)
LOGGER = logging.getLogger(__name__)

GITHUB_API_URL = 'https://api.github.com'


class TrafficClient:
    """Client to fetch traffic data (referrers, paths, views, clones) for a repository.

    Args:
        token (str):
            GitHub personal access token for authentication.
    """

    def __init__(self, token):
        self.token = token
        self.headers = {
            'Authorization': f'token {token}',
            'Accept': 'application/vnd.github.v3+json',
        }

    def _get_traffic_data(self, repo: str, endpoint: str) -> list:
        """Helper method to fetch traffic data from GitHub's REST API.

        Args:
            repo (str):
                The repository in the format "owner/repo".
            endpoint (str):
                The traffic API endpoint (e.g., "popular/referrers", "popular/paths",
                "views" or "clones").

        Returns:
            list:
                The JSON response containing traffic data.

        Raises:
            RuntimeError:
                If the API request fails.
        """
        url = f'{GITHUB_API_URL}/repos/{repo}/traffic/{endpoint}'
        LOGGER.info('Fetching traffic data from: %s', url)

        # A timeout keeps the scheduled collection job from hanging forever
        # if GitHub does not answer.
        response = requests.get(url, headers=self.headers, timeout=30)

        if response.status_code == 200:
            LOGGER.info('Successfully retrieved %s data for %s.', endpoint, repo)
            return response.json()

        message = f'GitHub API Error ({response.status_code}): {response.json()}'
        LOGGER.error(message)
        raise RuntimeError(message)

    def get_traffic_referrers(self, repo: str) -> pd.DataFrame:
        """Fetches the top referring domains that send traffic to the given repository.

        Args:
            repo (str):
                The repository in the format "owner/repo".

        Returns:
            pd.DataFrame:
                DataFrame containing referrer traffic details with columns:
                - `site`: Source domain.
                - `views`: Number of views.
                - `unique_visitors`: Number of unique visitors.
        """
        LOGGER.info('Fetching traffic referrers for %s.', repo)
        data = self._get_traffic_data(repo, 'popular/referrers')
        # The API records are keyed referrer/count/uniques, so the DataFrame
        # must be built with those names and renamed afterwards; constructing
        # it directly with the display names would yield all-NaN columns.
        df = pd.DataFrame(data, columns=['referrer', 'count', 'uniques'])
        df = df.rename(
            columns={'referrer': 'site', 'count': 'views', 'uniques': 'unique_visitors'},
        )
        LOGGER.info('Retrieved %s referrer records for %s.', len(df), repo)
        return df

    def get_traffic_paths(self, repo: str) -> pd.DataFrame:
        """Fetches the most visited paths in the given repository.

        Args:
            repo (str):
                The repository in the format "owner/repo".

        Returns:
            pd.DataFrame: DataFrame containing popular paths with columns:
                - `content`: The visited path.
                - `title`: Page title.
                - `views`: Number of views.
                - `unique_visitors`: Number of unique visitors.
        """
        LOGGER.info('Fetching traffic paths for %s.', repo)
        data = self._get_traffic_data(repo, 'popular/paths')
        # Built with the raw API keys, then renamed to the GitHub UI labels.
        df = pd.DataFrame(data, columns=['path', 'title', 'count', 'uniques'])
        df = df.rename(
            columns={'path': 'content', 'count': 'views', 'uniques': 'unique_visitors'},
        )
        LOGGER.info('Retrieved %s path records for %s.', len(df), repo)
        return df

    def get_traffic_views(self, repo: str) -> pd.DataFrame:
        """Fetches the number of views for the given repository over time.

        Args:
            repo (str):
                The repository in the format "owner/repo".

        Returns:
            pd.DataFrame:
                DataFrame containing repository views with columns:
                - `timestamp`: Date of views.
                - `views`: Number of views.
                - `unique_visitors`: Number of unique visitors.
        """
        data = self._get_traffic_data(repo, 'views')
        df = pd.DataFrame(data['views'], columns=['timestamp', 'count', 'uniques'])
        df = df.rename(columns={'count': 'views', 'uniques': 'unique_visitors'})
        LOGGER.info('Retrieved %s views for %s.', len(df), repo)
        return df

    def get_traffic_clones(self, repo: str) -> pd.DataFrame:
        """Fetches the number of repository clones over time.

        Args:
            repo (str):
                The repository in the format "owner/repo".

        Returns:
            pd.DataFrame:
                DataFrame containing repository clones with columns:
                - `timestamp`: Date of clones.
                - `clones`: Number of clones.
                - `unique_cloners`: Number of unique cloners.
        """
        data = self._get_traffic_data(repo, 'clones')
        df = pd.DataFrame(data['clones'], columns=['timestamp', 'count', 'uniques'])
        df = df.rename(columns={'count': 'clones', 'uniques': 'unique_cloners'})
        LOGGER.info('Retrieved %s clones for %s.', len(df), repo)
        return df

    def generate_timeframe(self, traffic_data):
        """Generates a timeframe DataFrame with the start and end timestamps from traffic data.

        Args:
            traffic_data (dict[str, pd.DataFrame]):
                Dictionary containing traffic data, including "Traffic Visitors" and
                "Traffic Git Clones".

        Returns:
            pd.DataFrame:
                A DataFrame with a single row containing 'Start Date' and 'End Date'
                (both ``None`` if no timestamps are available).
        """
        start_date = None
        end_date = None
        all_timestamps = []

        if 'Traffic Visitors' in traffic_data and not traffic_data['Traffic Visitors'].empty:
            all_timestamps.extend(traffic_data['Traffic Visitors']['timestamp'].tolist())

        if 'Traffic Git Clones' in traffic_data and not traffic_data['Traffic Git Clones'].empty:
            all_timestamps.extend(traffic_data['Traffic Git Clones']['timestamp'].tolist())

        if all_timestamps:
            start_date = min(all_timestamps)
            end_date = max(all_timestamps)

        return pd.DataFrame({'Start Date': [start_date], 'End Date': [end_date]})

    def get_all_traffic(self, repo: str) -> dict[str, pd.DataFrame]:
        """Fetches all available traffic data for the given repository.

        Args:
            repo (str):
                The repository in the format "owner/repo".

        Returns:
            dict[str, pd.DataFrame]:
                A dictionary containing traffic data:
                - `"Traffic Referring Sites"`: DataFrame with referrer traffic.
                - `"Traffic Popular Content"`: DataFrame with popular paths.
                - `"Traffic Visitors"`: DataFrame with repository views over time.
                - `"Traffic Git Clones"`: DataFrame with repository clones over time.
                - `"Timeframe"`: Single-row DataFrame with the overall start/end dates.
        """
        traffic_data = {
            'Traffic Referring Sites': self.get_traffic_referrers(repo),
            'Traffic Popular Content': self.get_traffic_paths(repo),
            'Traffic Visitors': self.get_traffic_views(repo),
            'Traffic Git Clones': self.get_traffic_clones(repo),
        }
        traffic_data['Timeframe'] = self.generate_timeframe(traffic_data)
        return traffic_data
Loading