Merge branch '15-add-daily-workflow-to-export-anaconda-downloads-to-google-drive' into 22-include-pre-releases-downloads-for-pypi-daily-collection-workflow

gsheni · web-flow · commit 719ce0610f82 · 2025-06-17T10:39:46.000-04:00
diff --git a/.github/workflows/daily_collection.yaml b/.github/workflows/daily_collection.yaml
@@ -7,6 +7,16 @@ on:
         description: Slack channel to post the error message to if the builds fail.
         required: false
         default: "sdv-alerts-debug"
+      max_days_pypi:
+        description: 'Maximum number of days to collect, starting from today, for PyPI.'
+        required: false
+        type: number
+        default: 30
+      max_days_anaconda:
+        description: 'Maximum number of days to collect, starting from today, for Anaconda.'
+        required: false
+        type: number
+        default: 90
   schedule:
     - cron:  '0 0 * * *'
 
@@ -32,7 +42,7 @@ jobs:
       run: |
         uv run download-analytics collect-pypi \
           --verbose \
-          --max-days 30 \
+          --max-days ${{ inputs.max_days_pypi || 30 }} \
           --add-metrics \
           --output-folder gdrive://10QHbqyvptmZX4yhu2Y38YJbVHqINRr0n
       env:
@@ -42,7 +52,7 @@ jobs:
       run: |
         uv run download-analytics collect-anaconda \
           --output-folder gdrive://1UnDYovLkL4gletOF5328BG1X59mSHF-Z \
-          --max-days 90 \
+          --max-days ${{ inputs.max_days_anaconda || 90 }} \
           --verbose
       env:
         PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}
diff --git a/download_analytics/anaconda.py b/download_analytics/anaconda.py
@@ -22,6 +22,7 @@
 PREVIOUS_ANACONDA_ORG_VERSION_FILENAME = 'anaconda_org_per_version.csv'
 TIME_COLUMN = 'time'
 PKG_COLUMN = 'pkg_name'
+ANACONDA_BUCKET_PATH = 's3://anaconda-package-data/conda'
 
 
 def _read_anaconda_parquet(URL, pkg_names=None):
@@ -55,7 +56,7 @@ def _anaconda_package_data_by_day(year, month, day, pkg_names=None):
     padded_day = '{:02d}'.format(day)
 
     filename = f'{padded_year}-{padded_month}-{padded_day}.parquet'
-    URL = f's3://anaconda-package-data/conda/hourly/{padded_year}/{padded_month}/{filename}'
+    URL = f'{ANACONDA_BUCKET_PATH}/hourly/{padded_year}/{padded_month}/{filename}'
     return _read_anaconda_parquet(URL, pkg_names=pkg_names)
 
 
@@ -68,7 +69,7 @@ def anaconda_package_data_by_year_month(year, month, pkg_names=None):
     padded_year = '{:04d}'.format(year)
     padded_month = '{:02d}'.format(month)
     filename = f'{padded_year}-{padded_month}.parquet'
-    URL = f's3://anaconda-package-data/conda/monthly/{padded_year}/{filename}'
+    URL = f'{ANACONDA_BUCKET_PATH}/monthly/{padded_year}/{filename}'
     return _read_anaconda_parquet(URL, pkg_names=pkg_names)
 
 
diff --git a/download_analytics/time_utils.py b/download_analytics/time_utils.py
@@ -33,7 +33,7 @@ def get_min_max_dt_in_year(year):
 def drop_duplicates_by_date(df, time_column, group_by_columns):
     """Keep only the latest record for each day within each group.
 
-    For each unique combination of date and group, retains only the row with the
+    For each unique combination of date and group, retain only the row with the
     latest timestamp. This is useful for deduplicating time series data where
     multiple records may exist for the same day.