Skip to content

Commit ea51b4a

Browse files
author
Michael Smit
committed
Added more logging.
1. Added more logging to downloading/caching logic. 2. Added more logging to simulation and the calculate stuff. 3. Fixed a bunch of incorrectly declared and/or handled types.
1 parent 3dff1d5 commit ea51b4a

File tree

6 files changed

+63
-32
lines changed

6 files changed

+63
-32
lines changed

policyengine/outputs/macro/comparison/calculate_economy_comparison.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,9 @@
1111
SingleEconomy,
1212
)
1313
from typing import List, Dict, Optional
14+
import logging
15+
16+
logger = logging.getLogger(__file__)
1417

1518

1619
class BudgetaryImpact(BaseModel):
@@ -809,7 +812,9 @@ def calculate_economy_comparison(
809812
if not simulation.is_comparison:
810813
raise ValueError("Simulation must be a comparison simulation.")
811814

815+
logging.info("Calculating baseline econonmy")
812816
baseline: SingleEconomy = simulation.calculate_single_economy(reform=False)
817+
logging.info("Calculating reform economy")
813818
reform: SingleEconomy = simulation.calculate_single_economy(reform=True)
814819
options = simulation.options
815820
country_id = options.country
@@ -835,6 +840,7 @@ def calculate_economy_comparison(
835840
)
836841

837842
if simulation.options.include_cliffs:
843+
logging.info("Calculating cliff impacts")
838844
cliff_impact = CliffImpact(
839845
baseline=CliffImpactInSimulation(
840846
cliff_gap=baseline.cliff_gap,
@@ -848,6 +854,7 @@ def calculate_economy_comparison(
848854
else:
849855
cliff_impact = None
850856

857+
logging.info("Comparison complete")
851858
return EconomyComparison(
852859
model_version=simulation.model_version,
853860
data_version=simulation.data_version,

policyengine/simulation.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,9 @@
3333
from typing import Callable
3434
import importlib
3535
from policyengine.utils.data_download import download
36+
import logging
37+
38+
logger = logging.getLogger(__file__)
3639

3740
CountryType = Literal["uk", "us"]
3841
ScopeType = Literal["household", "macro"]
@@ -102,10 +105,14 @@ def __init__(self, **options: SimulationOptions):
102105
if not isinstance(self.options.data, dict) and not isinstance(
103106
self.options.data, Dataset
104107
):
108+
logging.debug("Loading data")
105109
self._set_data(self.options.data)
110+
logging.info("Data loaded")
106111
self._initialise_simulations()
112+
logging.info("Simulations initialised")
107113
self.check_data_version()
108114
self._add_output_functions()
115+
logging.info("Output functions loaded")
109116

110117
def _add_output_functions(self):
111118
folder = Path(__file__).parent / "outputs"

policyengine/utils/data/caching_google_storage_client.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -50,9 +50,11 @@ def download(
5050
f"Copying downloaded data for {bucket}, {key} to {target}"
5151
)
5252
atomic_write(target, data)
53+
logger.info(f"Data successfully copied to {target} with version {version}")
5354
if return_version:
5455
return version
5556
return
57+
logger.error(f"Cached data for {bucket}, {key}{', ' + version if version is not None else ''} is not bytes.")
5658
raise Exception("Expected data for blob to be cached as bytes")
5759

5860
# If the crc has changed from what we downloaded last time download it again.
@@ -68,22 +70,22 @@ def sync(
6870
crckey = f"{bucket}.{key}.{version}.crc"
6971

7072
crc = self.client.crc32c(bucket, key, version=version)
73+
id_string = f"{bucket}, {key}{', ' + version if version is not None else ''}"
7174
if crc is None:
72-
raise Exception(f"Unable to find {key} in bucket {bucket}")
75+
raise Exception(f"Unable to find {id_string}")
7376

7477
prev_crc = self.cache.get(crckey, default=None)
75-
logger.debug(f"Previous crc for {bucket}, {key} was {prev_crc}")
78+
logger.debug(f"Previous crc for {id_string} was {prev_crc}")
7679
if prev_crc == crc:
7780
logger.info(
78-
f"Cache exists and crc is unchanged for {bucket}, {key}."
81+
f"Cache exists and crc is unchanged for {id_string} ."
7982
)
8083
return
81-
8284
[content, downloaded_crc] = self.client.download(
8385
bucket, key, version=version
8486
)
85-
logger.debug(
86-
f"Downloaded new version of {bucket}, {key} with crc {downloaded_crc}"
87+
logger.info(
88+
f"Downloaded new version of {id_string} with crc {downloaded_crc}"
8789
)
8890

8991
# atomic transaction to update both the data and the metadata
@@ -94,6 +96,7 @@ def sync(
9496
# Whatever the CRC was before we downloaded, we set the cache CRC
9597
# to the CRC reported by the download itself to avoid race conditions.
9698
self.cache.set(crckey, downloaded_crc)
99+
logger.info("Cache updated for {id_string}")
97100

98101
def clear(self):
99102
self.cache.clear()

policyengine/utils/data/simplified_google_storage_client.py

Lines changed: 32 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -19,27 +19,29 @@ def __init__(self):
1919
self.client = Client()
2020

2121
def get_versioned_blob(
22-
self, bucket: str, key: str, version: Optional[str] = None
22+
self, bucket_name: str, key: str, version: Optional[str] = None
2323
) -> Blob:
2424
"""
2525
Get a versioned blob from the specified bucket and key.
2626
If version is None, returns the latest version of the blob.
2727
"""
28-
bucket = self.client.bucket(bucket)
28+
bucket = self.client.bucket(bucket_name)
2929
if version is None:
3030
return bucket.blob(key)
31-
else:
32-
versions: Iterable[Blob] = bucket.list_blobs(
33-
prefix=key, versions=True
34-
)
35-
for v in versions:
36-
if v.metadata is None:
37-
continue # Skip blobs without metadata
38-
if v.metadata.get("version") == version:
39-
return v
40-
raise ValueError(
41-
f"Could not find version {version} of blob {key} in bucket {bucket.name}"
42-
)
31+
logging.debug("Searching {bucket_name}, {prefix}* for version {version}")
32+
versions: Iterable[Blob] = bucket.list_blobs(
33+
prefix=key, versions=True
34+
)
35+
for v in versions:
36+
if v.metadata is None:
37+
continue # Skip blobs without metadata
38+
if v.metadata.get("version") == version:
39+
logging.info(f"Blob {bucket_name}, {v.path} has version {version}")
40+
return v
41+
logging.info(f"No version {version} found for {bucket_name}, {key}")
42+
raise ValueError(
43+
f"Could not find version {version} of blob {key} in bucket {bucket.name}"
44+
)
4345

4446
def crc32c(
4547
self, bucket_name: str, key: str, version: Optional[str] = None
@@ -51,7 +53,7 @@ def crc32c(
5153
blob = self.get_versioned_blob(bucket_name, key, version)
5254

5355
blob.reload()
54-
logger.debug(f"Crc is {blob.crc32c}")
56+
logger.info(f"Crc for {bucket_name}, {key} is {blob.crc32c}")
5557
return blob.crc32c
5658

5759
def download(
@@ -60,10 +62,11 @@ def download(
6062
"""
6163
get the blob content and associated CRC from google storage.
6264
"""
63-
logger.info(f"Downloading {bucket}, {key}")
65+
logger.debug(f"Downloading {bucket}, {key}{ ', version:' + version if version is not None else ''}")
6466
blob = self.get_versioned_blob(bucket, key, version)
6567

6668
result = blob.download_as_bytes()
69+
logger.info(f"Downloaded {bucket}, {key}{ ', version:' + version if version is not None else ''}")
6770
# According to documentation blob.crc32c is updated as a side effect of
6871
# downloading the content. As a result this should now be the crc of the downloaded
6972
# content (i.e. there is not a race condition where it's getting the CRC from the cloud)
@@ -74,11 +77,21 @@ def _get_latest_version(self, bucket: str, key: str) -> Optional[str]:
7477
Get the latest version of a blob in the specified bucket and key.
7578
If no version is specified, return None.
7679
"""
80+
logger.debug(f"Getting latest version of {bucket}, {key}")
7781
blob = self.client.get_bucket(bucket).get_blob(key)
82+
if blob is None:
83+
logging.warning(f"No blob found in bucket {bucket} with key {key}")
84+
return None
85+
7886
if blob.metadata is None:
7987
logging.warning(
80-
"No metadata found for blob, so it has no version attached."
88+
f"No metadata found for blob {bucket}, {key}, so it has no version attached."
8189
)
8290
return None
83-
else:
84-
return blob.metadata.get("version")
91+
92+
version = blob.metadata.get("version")
93+
if version is None:
94+
logging.warning(f"Blob {bucket}, {key} does not have a version in its metadata")
95+
return None
96+
logging.info(f"Metadata for blob {bucket}, {key} has version: {version}")
97+
return blob.metadata.get("version")

policyengine/utils/data_download.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,14 +12,14 @@ def download(
1212
gcs_bucket: str,
1313
version: Optional[str] = None,
1414
return_version: bool = False,
15-
) -> Tuple[str, Optional[str]]:
15+
) -> Tuple[str, str] | str:
1616
logging.info("Using Google Cloud Storage for download.")
17-
version = download_file_from_gcs(
17+
downloaded_version = download_file_from_gcs(
1818
bucket_name=gcs_bucket,
1919
file_name=filepath,
2020
destination_path=filepath,
2121
version=version,
2222
)
2323
if return_version:
24-
return filepath, version
24+
return filepath, downloaded_version
2525
return filepath

policyengine/utils/google_cloud_bucket.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import asyncio
33
from pathlib import Path
44
from google.cloud.storage import Blob
5-
from typing import Iterable
5+
from typing import Iterable, Optional
66

77
_caching_client: CachingGoogleStorageClient | None = None
88

@@ -24,8 +24,8 @@ def download_file_from_gcs(
2424
bucket_name: str,
2525
file_name: str,
2626
destination_path: str,
27-
version: str = None,
28-
) -> str:
27+
version: Optional[str] = None,
28+
) -> str | None:
2929
"""
3030
Download a file from Google Cloud Storage to a local path.
3131
@@ -38,10 +38,11 @@ def download_file_from_gcs(
3838
version (str): The version of the file that was downloaded, if available.
3939
"""
4040

41-
return _get_client().download(
41+
version = _get_client().download(
4242
bucket_name,
4343
file_name,
4444
Path(destination_path),
4545
version=version,
4646
return_version=True,
4747
)
48+
return version

0 commit comments

Comments
 (0)