Skip to content

Commit 7232b67

Browse files
Add versioning to uploads (#269)
* Add versioning to uploads (fixes #268)
* Fix typo
1 parent d0b7fa9 commit 7232b67

File tree

3 files changed

+122
-29
lines changed

3 files changed

+122
-29
lines changed

changelog_entry.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
- bump: patch
2+
changes:
3+
added:
4+
- Versioning to dataset uploads.

policyengine_us_data/storage/upload_completed_datasets.py

Lines changed: 14 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -4,43 +4,28 @@
44
CPS_2023,
55
)
66
from policyengine_us_data.storage import STORAGE_FOLDER
7-
from policyengine_us_data.utils.huggingface import upload
7+
from policyengine_us_data.utils.data_upload import upload_data_files
88
from google.cloud import storage
99
import google.auth
1010

1111

1212
def upload_datasets():
13-
credentials, project_id = google.auth.default()
14-
storage_client = storage.Client(
15-
credentials=credentials, project=project_id
16-
)
17-
bucket = storage_client.bucket("policyengine-us-data")
18-
19-
datasets_to_upload = [
20-
EnhancedCPS_2024,
21-
Pooled_3_Year_CPS_2023,
22-
CPS_2023,
13+
dataset_files = [
14+
EnhancedCPS_2024.file_path,
15+
Pooled_3_Year_CPS_2023.file_path,
16+
CPS_2023.file_path,
2317
]
2418

25-
for dataset in datasets_to_upload:
26-
dataset = dataset()
27-
if not dataset.exists:
28-
raise ValueError(
29-
f"Dataset {dataset.name} does not exist at {dataset.file_path}."
30-
)
19+
for file_path in dataset_files:
20+
if not file_path.exists():
21+
raise ValueError(f"File {file_path} does not exist.")
3122

32-
upload(
33-
dataset.file_path,
34-
"policyengine/policyengine-us-data",
35-
dataset.file_path.name,
36-
)
37-
38-
blob = dataset.file_path.name
39-
blob = bucket.blob(blob)
40-
blob.upload_from_filename(dataset.file_path)
41-
print(
42-
f"Uploaded {dataset.file_path.name} to GCS bucket policyengine-us-data."
43-
)
23+
upload_data_files(
24+
files=dataset_files,
25+
hf_repo_name="policyengine/policyengine-us-data",
26+
hf_repo_type="model",
27+
gcs_bucket_name="policyengine-us-data",
28+
)
4429

4530

4631
if __name__ == "__main__":
policyengine_us_data/utils/data_upload.py

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
from typing import List
2+
from huggingface_hub import HfApi, CommitOperationAdd
3+
from huggingface_hub.errors import RevisionNotFoundError
4+
from google.cloud import storage
5+
from pathlib import Path
6+
from importlib import metadata
7+
import google.auth
8+
import logging
9+
10+
11+
def upload_data_files(
    files: List[str],
    gcs_bucket_name: str = "policyengine-us-data",
    hf_repo_name: str = "policyengine/policyengine-us-data",
    hf_repo_type: str = "model",
    version: str = None,
):
    """
    Publish data files to Hugging Face and then to Google Cloud Storage.

    The Hugging Face upload runs first; the GCS upload only starts once it
    has returned without raising.

    Args:
        files: Paths of the files to upload.
        gcs_bucket_name: Name of the destination GCS bucket.
        hf_repo_name: Identifier of the destination Hugging Face repository.
        hf_repo_type: Repository type passed to the Hugging Face upload.
        version: Version label handed to both uploads. When ``None``, the
            installed version of the ``policyengine-us-data`` distribution
            is looked up and used instead.
    """
    # Resolve the version once so both destinations receive the same label.
    release = (
        metadata.version("policyengine-us-data")
        if version is None
        else version
    )

    hf_target = {
        "hf_repo_name": hf_repo_name,
        "hf_repo_type": hf_repo_type,
    }
    upload_files_to_hf(files=files, version=release, **hf_target)

    gcs_target = {"gcs_bucket_name": gcs_bucket_name}
    upload_files_to_gcs(files=files, version=release, **gcs_target)
33+
34+
35+
def upload_files_to_hf(
    files: List[str],
    version: str,
    hf_repo_name: str = "policyengine/policyengine-us-data",
    hf_repo_type: str = "model",
):
    """
    Upload files to Hugging Face repository and tag the commit with the version.

    All files are added in a single commit, which is then tagged with
    ``version``. Each file is stored at the repository root under its
    base name.

    Args:
        files: Paths of the files to upload.
        version: Version label used in the commit message and as the tag name.
        hf_repo_name: Identifier of the destination Hugging Face repository.
        hf_repo_type: Repository type passed to the Hugging Face API.

    Raises:
        ValueError: If any path in ``files`` does not exist. This is checked
            for every file before anything is sent to Hugging Face.
        HfHubHTTPError: If tagging fails for any reason other than the tag
            already existing.
    """
    # Validate every path up front so a bad input fails before any client is
    # built or any commit is attempted.
    resolved_paths = [Path(file_path) for file_path in files]
    for file_path in resolved_paths:
        if not file_path.exists():
            raise ValueError(f"File {file_path} does not exist.")

    # Local import: only needed for the tag-conflict handling below.
    from huggingface_hub.errors import HfHubHTTPError

    api = HfApi()
    hf_operations = [
        CommitOperationAdd(
            path_in_repo=file_path.name,
            path_or_fileobj=str(file_path),
        )
        for file_path in resolved_paths
    ]

    commit_info = api.create_commit(
        repo_id=hf_repo_name,
        operations=hf_operations,
        repo_type=hf_repo_type,
        commit_message=f"Upload data files for version {version}",
    )
    logging.info(f"Uploaded files to Hugging Face repository {hf_repo_name}.")

    # Tag commit with version. The commit above is already pushed at this
    # point, so an existing tag (e.g. a re-run for the same version) must not
    # abort the caller; only a 409 conflict is tolerated, anything else
    # propagates.
    try:
        api.create_tag(
            repo_id=hf_repo_name,
            tag=version,
            revision=commit_info.oid,
            repo_type=hf_repo_type,
        )
    except HfHubHTTPError as err:
        status_code = getattr(
            getattr(err, "response", None), "status_code", None
        )
        if status_code != 409:
            raise
        logging.warning(
            f"Tag {version} already exists in Hugging Face repository "
            f"{hf_repo_name}; leaving the existing tag unchanged."
        )
    else:
        logging.info(
            f"Tagged commit with {version} in Hugging Face repository {hf_repo_name}."
        )
75+
76+
77+
def upload_files_to_gcs(
    files: List[str],
    version: str,
    gcs_bucket_name: str = "policyengine-us-data",
):
    """
    Upload files to Google Cloud Storage and set metadata with the version.

    Each file is stored in the bucket under its base name, with custom
    metadata ``{"version": version}``.

    Args:
        files: Paths of the files to upload. When empty, the function returns
            without authenticating or contacting GCS.
        version: Version label stored in each object's metadata.
        gcs_bucket_name: Name of the destination GCS bucket.
    """
    if not files:
        # Nothing to upload, so skip credential discovery entirely.
        return

    credentials, project_id = google.auth.default()
    storage_client = storage.Client(
        credentials=credentials, project=project_id
    )
    bucket = storage_client.bucket(gcs_bucket_name)

    for file_path in files:
        file_path = Path(file_path)
        blob = bucket.blob(file_path.name)

        # Assign the metadata before uploading so it is sent with the upload
        # request itself. This avoids a second patch() round trip and the
        # window in which the object exists without its version label.
        blob.metadata = {"version": version}
        blob.upload_from_filename(file_path)
        logging.info(
            f"Uploaded {file_path.name} to GCS bucket {gcs_bucket_name} "
            f"with version metadata {version}."
        )

0 commit comments

Comments
 (0)