Skip to content
Merged
1 change: 1 addition & 0 deletions .github/workflows/daily.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,4 @@ jobs:
github-analytics collect -v -q -t ${{ secrets.GITHUB_TOKEN }} -m -c daily.yaml
env:
PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}

49 changes: 49 additions & 0 deletions .github/workflows/traffic_collection.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Biweekly collection of GitHub traffic metrics via the github-analytics CLI.
# Runs every 14 days and on demand; posts to Slack if the collection job fails.
name: Biweekly Traffic collection

on:
  workflow_dispatch:
    inputs:
      slack_channel:
        description: Slack channel to post the error message to if the builds fail.
        required: false
        default: "sdv-alerts-debug"

  schedule:
    - cron: "0 0 */14 * *" # Runs every 14 days at midnight UTC

jobs:
  collect_traffic:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      # NOTE: this job has no matrix, so the step name no longer references
      # ${{ matrix.python-version }}; the version is pinned below.
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.13'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install .
      - name: Collect Github Traffic Data
        run: |
          github-analytics traffic -v -t ${{ secrets.PERSONAL_ACCESS_TOKEN }} -c traffic_config.yaml
        env:
          PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}

  alert:
    needs: [collect_traffic]
    runs-on: ubuntu-latest
    if: failure()
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.13'
      - name: Install slack dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install invoke
          # A plain (non-editable) install is sufficient in CI.
          python -m pip install .[dev]
      - name: Slack alert if failure
        run: python -m github_analytics.slack_utils -r ${{ github.run_id }} -c ${{ github.event.inputs.slack_channel || 'sdv-alerts' }}
        env:
          SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
65 changes: 64 additions & 1 deletion github_analytics/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

import yaml

from github_analytics.main import collect_projects
from github_analytics.main import collect_projects, collect_traffic

LOGGER = logging.getLogger(__name__)

Expand Down Expand Up @@ -87,6 +87,43 @@ def _collect(args, parser):
)


def _traffic_collection(args, parser):
    """Run the ``traffic`` CLI action.

    Resolves the GitHub token (CLI flag, then ``GITHUB_TOKEN`` env var, then an
    interactive prompt), builds the mapping of projects to repositories from
    the CLI arguments and the configuration file, and hands everything off to
    ``collect_traffic``.

    Args:
        args (argparse.Namespace):
            Parsed CLI arguments for the ``traffic`` subcommand.
        parser (argparse.ArgumentParser):
            Parser used to report argument-combination errors.
    """
    token = args.token or os.getenv('GITHUB_TOKEN')
    if token is None:
        token = input('Please input your Github Token: ')

    config = _load_config(args.config_file)
    known_projects = config['projects']

    if args.repositories:
        # Explicit repositories require exactly one project name to group them under.
        if not args.projects:
            parser.error('If repositories are given, project name must be provided.')
        elif len(args.projects) > 1:
            parser.error('If repositories are given, only one project name must be provided.')

        projects = {args.projects[0]: args.repositories}
    elif not args.projects:
        # No filter given: collect every project defined in the configuration.
        projects = known_projects
    else:
        projects = {}
        for name in args.projects:
            if name not in known_projects:
                LOGGER.error('Unknown project %s', name)
                return

            projects[name] = known_projects[name]

    output_folder = args.output_folder or config.get('output_folder', '.')

    collect_traffic(
        token=token,
        projects=projects,
        output_folder=output_folder,
    )


def _get_parser():
# Logging
logging_args = argparse.ArgumentParser(add_help=False)
Expand Down Expand Up @@ -151,6 +188,32 @@ def _get_parser():
help='Start from scratch instead of incrementing over existing data.',
)

# Traffic
traffic = action.add_parser(
'traffic', help='Collect github traffic metrics.', parents=[logging_args]
)
traffic.set_defaults(action=_traffic_collection)

traffic.add_argument('-t', '--token', type=str, required=False, help='Github Token to use.')
traffic.add_argument(
'-c',
'--config-file',
type=str,
default='traffic_config.yaml',
help='Path to the configuration file.',
)
traffic.add_argument(
'-o', '--output-folder', type=str, required=False, help='Output folder path.'
)
traffic.add_argument(
'-p',
'--projects',
type=str,
nargs='*',
help='Projects to collect. Defaults to ALL if not given',
)
traffic.add_argument('-r', '--repositories', nargs='*', help='List of repositories to add.')

return parser


Expand Down
43 changes: 42 additions & 1 deletion github_analytics/drive.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,12 @@
PYDRIVE_CREDENTIALS = 'PYDRIVE_CREDENTIALS'

LOGGER = logging.getLogger(__name__)
# URI scheme prefix that marks a path as living on Google Drive.
GDRIVE_LINK = 'gdrive://'


def is_drive_path(path):
    """Tell if the given path is a Google Drive path or not.

    Args:
        path (str):
            Path or URI to inspect.

    Returns:
        bool:
            ``True`` if ``path`` starts with the ``gdrive://`` scheme prefix.
    """
    return path.startswith(GDRIVE_LINK)


def split_drive_path(path):
Expand Down Expand Up @@ -119,3 +120,43 @@ def download_spreadsheet(folder, filename):
drive_file = _find_file(drive, filename, folder)
drive_file.FetchContent(mimetype=XLSX_MIMETYPE)
return drive_file.content


def get_or_create_gdrive_folder(parent_folder: str, folder_name: str) -> str:
    """Check if a folder exists in Google Drive, create it if not, and return its ID.

    Args:
        parent_folder (str):
            ID of the parent Google Drive folder, optionally prefixed with
            ``gdrive://``.
        folder_name (str):
            Name of the folder to check or create.

    Returns:
        str:
            The Google Drive folder ID.
    """
    drive = _get_drive_client()

    # Strip only the leading scheme prefix; ``str.replace`` would also remove
    # any accidental occurrence of 'gdrive://' elsewhere in the string.
    if parent_folder.startswith(GDRIVE_LINK):
        parent_folder = parent_folder[len(GDRIVE_LINK):]

    # Single quotes must be backslash-escaped inside Drive query string values,
    # otherwise a folder name like "John's data" breaks (or injects into) the query.
    escaped_name = folder_name.replace("'", "\\'")
    query = {
        'q': f"title = '{escaped_name}' and mimeType = 'application/vnd.google-apps.folder' "
        f"and '{parent_folder}' in parents and trashed = false"
    }
    folders = drive.ListFile(query).GetList()

    if folders:
        return folders[0]['id']  # Return existing folder ID

    # Create the folder since it does not exist yet.
    folder_metadata = {
        'title': folder_name,
        'mimeType': 'application/vnd.google-apps.folder',
        'parents': [{'id': parent_folder}],
    }
    folder = drive.CreateFile(folder_metadata)
    folder.Upload()

    return folder['id']
197 changes: 197 additions & 0 deletions github_analytics/github/traffic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,197 @@
"""Traffic client for retrieving github information."""

import logging

import pandas as pd
import requests

logging.basicConfig(level=logging.INFO)
LOGGER = logging.getLogger(__name__)

GITHUB_API_URL = 'https://api.github.com'


class TrafficClient:
    """Client to fetch traffic data (referrers, paths, views, clones) for a repository.

    Args:
        token (str):
            GitHub personal access token for authentication.
    """

    def __init__(self, token):
        self.token = token
        self.headers = {
            'Authorization': f'token {token}',
            'Accept': 'application/vnd.github.v3+json',
        }

    def _get_traffic_data(self, repo: str, endpoint: str) -> list:
        """Helper method to fetch traffic data from GitHub's REST API.

        Args:
            repo (str):
                The repository in the format "owner/repo".
            endpoint (str):
                The traffic API endpoint (e.g., "popular/referrers", "popular/paths",
                "views" or "clones").

        Returns:
            list:
                The JSON response containing traffic data.

        Raises:
            RuntimeError:
                If the API request fails.
        """
        url = f'{GITHUB_API_URL}/repos/{repo}/traffic/{endpoint}'
        LOGGER.info('Fetching traffic data from: %s', url)

        # A timeout keeps the scheduled collection job from hanging forever
        # if GitHub does not answer.
        response = requests.get(url, headers=self.headers, timeout=30)

        if response.status_code == 200:
            LOGGER.info('Successfully retrieved %s data for %s.', endpoint, repo)
            return response.json()

        message = f'GitHub API Error ({response.status_code}): {response.json()}'
        LOGGER.error(message)
        raise RuntimeError(message)

    def get_traffic_referrers(self, repo: str) -> pd.DataFrame:
        """Fetches the top referring domains that send traffic to the given repository.

        Args:
            repo (str):
                The repository in the format "owner/repo".

        Returns:
            pd.DataFrame:
                DataFrame containing referrer traffic details with columns:
                - `site`: Source domain.
                - `views`: Number of views.
                - `unique_visitors`: Number of unique visitors.
        """
        LOGGER.info('Fetching traffic referrers for %s.', repo)
        data = self._get_traffic_data(repo, 'popular/referrers')
        # The API records are keyed referrer/count/uniques, so the DataFrame
        # must be built with those names and renamed afterwards; constructing
        # it directly with the display names would yield all-NaN columns.
        df = pd.DataFrame(data, columns=['referrer', 'count', 'uniques'])
        df = df.rename(
            columns={'referrer': 'site', 'count': 'views', 'uniques': 'unique_visitors'},
        )
        LOGGER.info('Retrieved %s referrer records for %s.', len(df), repo)
        return df

    def get_traffic_paths(self, repo: str) -> pd.DataFrame:
        """Fetches the most visited paths in the given repository.

        Args:
            repo (str):
                The repository in the format "owner/repo".

        Returns:
            pd.DataFrame: DataFrame containing popular paths with columns:
                - `content`: The visited path.
                - `title`: Page title.
                - `views`: Number of views.
                - `unique_visitors`: Number of unique visitors.
        """
        LOGGER.info('Fetching traffic paths for %s.', repo)
        data = self._get_traffic_data(repo, 'popular/paths')
        # Built with the raw API keys, then renamed to the GitHub UI labels.
        df = pd.DataFrame(data, columns=['path', 'title', 'count', 'uniques'])
        df = df.rename(
            columns={'path': 'content', 'count': 'views', 'uniques': 'unique_visitors'},
        )
        LOGGER.info('Retrieved %s path records for %s.', len(df), repo)
        return df

    def get_traffic_views(self, repo: str) -> pd.DataFrame:
        """Fetches the number of views for the given repository over time.

        Args:
            repo (str):
                The repository in the format "owner/repo".

        Returns:
            pd.DataFrame:
                DataFrame containing repository views with columns:
                - `timestamp`: Date of views.
                - `views`: Number of views.
                - `unique_visitors`: Number of unique visitors.
        """
        data = self._get_traffic_data(repo, 'views')
        df = pd.DataFrame(data['views'], columns=['timestamp', 'count', 'uniques'])
        df = df.rename(columns={'count': 'views', 'uniques': 'unique_visitors'})
        LOGGER.info('Retrieved %s views for %s.', len(df), repo)
        return df

    def get_traffic_clones(self, repo: str) -> pd.DataFrame:
        """Fetches the number of repository clones over time.

        Args:
            repo (str):
                The repository in the format "owner/repo".

        Returns:
            pd.DataFrame:
                DataFrame containing repository clones with columns:
                - `timestamp`: Date of clones.
                - `clones`: Number of clones.
                - `unique_cloners`: Number of unique cloners.
        """
        data = self._get_traffic_data(repo, 'clones')
        df = pd.DataFrame(data['clones'], columns=['timestamp', 'count', 'uniques'])
        df = df.rename(columns={'count': 'clones', 'uniques': 'unique_cloners'})
        LOGGER.info('Retrieved %s clones for %s.', len(df), repo)
        return df

    def generate_timeframe(self, traffic_data):
        """Generates a timeframe DataFrame with the start and end timestamps from traffic data.

        Args:
            traffic_data (dict[str, pd.DataFrame]):
                Dictionary containing traffic data, including "Traffic Visitors" and
                "Traffic Git Clones".

        Returns:
            pd.DataFrame:
                A DataFrame with a single row containing 'Start Date' and 'End Date'
                (both ``None`` if no timestamps are available).
        """
        start_date = None
        end_date = None
        all_timestamps = []

        if 'Traffic Visitors' in traffic_data and not traffic_data['Traffic Visitors'].empty:
            all_timestamps.extend(traffic_data['Traffic Visitors']['timestamp'].tolist())

        if 'Traffic Git Clones' in traffic_data and not traffic_data['Traffic Git Clones'].empty:
            all_timestamps.extend(traffic_data['Traffic Git Clones']['timestamp'].tolist())

        if all_timestamps:
            start_date = min(all_timestamps)
            end_date = max(all_timestamps)

        return pd.DataFrame({'Start Date': [start_date], 'End Date': [end_date]})

    def get_all_traffic(self, repo: str) -> dict[str, pd.DataFrame]:
        """Fetches all available traffic data for the given repository.

        Args:
            repo (str):
                The repository in the format "owner/repo".

        Returns:
            dict[str, pd.DataFrame]:
                A dictionary containing traffic data:
                - `"Traffic Referring Sites"`: DataFrame with referrer traffic.
                - `"Traffic Popular Content"`: DataFrame with popular paths.
                - `"Traffic Visitors"`: DataFrame with repository views over time.
                - `"Traffic Git Clones"`: DataFrame with repository clones over time.
                - `"Timeframe"`: Single-row DataFrame with the overall start/end dates.
        """
        traffic_data = {
            'Traffic Referring Sites': self.get_traffic_referrers(repo),
            'Traffic Popular Content': self.get_traffic_paths(repo),
            'Traffic Visitors': self.get_traffic_views(repo),
            'Traffic Git Clones': self.get_traffic_clones(repo),
        }
        traffic_data['Timeframe'] = self.generate_timeframe(traffic_data)
        return traffic_data
Loading