Skip to content

Commit 719ce06

Browse files
authored
Merge branch '15-add-daily-workflow-to-export-anaconda-downloads-to-google-drive' into 22-include-pre-releases-downloads-for-pypi-daily-collection-workflow
2 parents 17b2df0 + db48947 commit 719ce06

File tree

3 files changed

+16
-5
lines changed

3 files changed

+16
-5
lines changed

.github/workflows/daily_collection.yaml

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,16 @@ on:
77
description: Slack channel to post the error message to if the builds fail.
88
required: false
99
default: "sdv-alerts-debug"
10+
max_days_pypi:
11+
description: 'Maximum number of days to collect, starting from today for PyPI.'
12+
required: false
13+
type: number
14+
default: 30
15+
max_days_anaconda:
16+
description: 'Maximum number of days to collect, starting from today for Anaconda.'
17+
required: false
18+
type: number
19+
default: 90
1020
schedule:
1121
- cron: '0 0 * * *'
1222

@@ -32,7 +42,7 @@ jobs:
3242
run: |
3343
uv run download-analytics collect-pypi \
3444
--verbose \
35-
--max-days 30 \
45+
--max-days ${{ inputs.max_days_pypi || 30 }} \
3646
--add-metrics \
3747
--output-folder gdrive://10QHbqyvptmZX4yhu2Y38YJbVHqINRr0n
3848
env:
@@ -42,7 +52,7 @@ jobs:
4252
run: |
4353
uv run download-analytics collect-anaconda \
4454
--output-folder gdrive://1UnDYovLkL4gletOF5328BG1X59mSHF-Z \
45-
--max-days 90 \
55+
--max-days ${{ inputs.max_days_anaconda || 90 }} \
4656
--verbose
4757
env:
4858
PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}

download_analytics/anaconda.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
PREVIOUS_ANACONDA_ORG_VERSION_FILENAME = 'anaconda_org_per_version.csv'
2323
TIME_COLUMN = 'time'
2424
PKG_COLUMN = 'pkg_name'
25+
ANACONDA_BUCKET_PATH = 's3://anaconda-package-data/conda'
2526

2627

2728
def _read_anaconda_parquet(URL, pkg_names=None):
@@ -55,7 +56,7 @@ def _anaconda_package_data_by_day(year, month, day, pkg_names=None):
5556
padded_day = '{:02d}'.format(day)
5657

5758
filename = f'{padded_year}-{padded_month}-{padded_day}.parquet'
58-
URL = f's3://anaconda-package-data/conda/hourly/{padded_year}/{padded_month}/{filename}'
59+
URL = f'{ANACONDA_BUCKET_PATH}/hourly/{padded_year}/{padded_month}/{filename}'
5960
return _read_anaconda_parquet(URL, pkg_names=pkg_names)
6061

6162

@@ -68,7 +69,7 @@ def anaconda_package_data_by_year_month(year, month, pkg_names=None):
6869
padded_year = '{:04d}'.format(year)
6970
padded_month = '{:02d}'.format(month)
7071
filename = f'{padded_year}-{padded_month}.parquet'
71-
URL = f's3://anaconda-package-data/conda/monthly/{padded_year}/{filename}'
72+
URL = f'{ANACONDA_BUCKET_PATH}/monthly/{padded_year}/{filename}'
7273
return _read_anaconda_parquet(URL, pkg_names=pkg_names)
7374

7475

download_analytics/time_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ def get_min_max_dt_in_year(year):
3333
def drop_duplicates_by_date(df, time_column, group_by_columns):
3434
"""Keep only the latest record for each day within each group.
3535
36-
For each unique combination of date and group, retains only the row with the
36+
For each unique combination of date and group, retain only the row with the
3737
latest timestamp. This is useful for deduplicating time series data where
3838
multiple records may exist for the same day.
3939

0 commit comments

Comments
 (0)