Skip to content

Commit 2365d67

Browse files
author
Michael Smit
committed
Added more logging.
1. Added more logging to downloading/caching logic. 2. Added more logging to simulation and the calculate stuff. 3. Fixed a bunch of incorrectly declared and/or handled types.
1 parent 3dff1d5 commit 2365d67

File tree

6 files changed

+79
-34
lines changed

6 files changed

+79
-34
lines changed

policyengine/outputs/macro/comparison/calculate_economy_comparison.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,9 @@
1111
SingleEconomy,
1212
)
1313
from typing import List, Dict, Optional
14+
import logging
15+
16+
logger = logging.getLogger(__file__)
1417

1518

1619
class BudgetaryImpact(BaseModel):
@@ -809,7 +812,9 @@ def calculate_economy_comparison(
809812
if not simulation.is_comparison:
810813
raise ValueError("Simulation must be a comparison simulation.")
811814

815+
logging.info("Calculating baseline econonmy")
812816
baseline: SingleEconomy = simulation.calculate_single_economy(reform=False)
817+
logging.info("Calculating reform economy")
813818
reform: SingleEconomy = simulation.calculate_single_economy(reform=True)
814819
options = simulation.options
815820
country_id = options.country
@@ -835,6 +840,7 @@ def calculate_economy_comparison(
835840
)
836841

837842
if simulation.options.include_cliffs:
843+
logging.info("Calculating cliff impacts")
838844
cliff_impact = CliffImpact(
839845
baseline=CliffImpactInSimulation(
840846
cliff_gap=baseline.cliff_gap,
@@ -848,6 +854,7 @@ def calculate_economy_comparison(
848854
else:
849855
cliff_impact = None
850856

857+
logging.info("Comparison complete")
851858
return EconomyComparison(
852859
model_version=simulation.model_version,
853860
data_version=simulation.data_version,

policyengine/simulation.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,9 @@
3333
from typing import Callable
3434
import importlib
3535
from policyengine.utils.data_download import download
36+
import logging
37+
38+
logger = logging.getLogger(__file__)
3639

3740
CountryType = Literal["uk", "us"]
3841
ScopeType = Literal["household", "macro"]
@@ -102,10 +105,14 @@ def __init__(self, **options: SimulationOptions):
102105
if not isinstance(self.options.data, dict) and not isinstance(
103106
self.options.data, Dataset
104107
):
108+
logging.debug("Loading data")
105109
self._set_data(self.options.data)
110+
logging.info("Data loaded")
106111
self._initialise_simulations()
112+
logging.info("Simulations initialised")
107113
self.check_data_version()
108114
self._add_output_functions()
115+
logging.info("Output functions loaded")
109116

110117
def _add_output_functions(self):
111118
folder = Path(__file__).parent / "outputs"

policyengine/utils/data/caching_google_storage_client.py

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -50,9 +50,15 @@ def download(
5050
f"Copying downloaded data for {bucket}, {key} to {target}"
5151
)
5252
atomic_write(target, data)
53+
logger.info(
54+
f"Data successfully copied to {target} with version {version}"
55+
)
5356
if return_version:
5457
return version
5558
return
59+
logger.error(
60+
f"Cached data for {bucket}, {key}{', ' + version if version is not None else ''} is not bytes."
61+
)
5662
raise Exception("Expected data for blob to be cached as bytes")
5763

5864
# If the crc has changed from what we downloaded last time download it again.
@@ -68,22 +74,22 @@ def sync(
6874
crckey = f"{bucket}.{key}.{version}.crc"
6975

7076
crc = self.client.crc32c(bucket, key, version=version)
77+
id_string = (
78+
f"{bucket}, {key}{', ' + version if version is not None else ''}"
79+
)
7180
if crc is None:
72-
raise Exception(f"Unable to find {key} in bucket {bucket}")
81+
raise Exception(f"Unable to find {id_string}")
7382

7483
prev_crc = self.cache.get(crckey, default=None)
75-
logger.debug(f"Previous crc for {bucket}, {key} was {prev_crc}")
84+
logger.debug(f"Previous crc for {id_string} was {prev_crc}")
7685
if prev_crc == crc:
77-
logger.info(
78-
f"Cache exists and crc is unchanged for {bucket}, {key}."
79-
)
86+
logger.info(f"Cache exists and crc is unchanged for {id_string} .")
8087
return
81-
8288
[content, downloaded_crc] = self.client.download(
8389
bucket, key, version=version
8490
)
85-
logger.debug(
86-
f"Downloaded new version of {bucket}, {key} with crc {downloaded_crc}"
91+
logger.info(
92+
f"Downloaded new version of {id_string} with crc {downloaded_crc}"
8793
)
8894

8995
# atomic transaction to update both the data and the metadata
@@ -94,6 +100,7 @@ def sync(
94100
# Whatever the CRC was before we downloaded, we set the cache CRC
95101
# to the CRC reported by the download itself to avoid race conditions.
96102
self.cache.set(crckey, downloaded_crc)
103+
logger.info("Cache updated for {id_string}")
97104

98105
def clear(self):
99106
self.cache.clear()

policyengine/utils/data/simplified_google_storage_client.py

Lines changed: 42 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -19,27 +19,31 @@ def __init__(self):
1919
self.client = Client()
2020

2121
def get_versioned_blob(
22-
self, bucket: str, key: str, version: Optional[str] = None
22+
self, bucket_name: str, key: str, version: Optional[str] = None
2323
) -> Blob:
2424
"""
2525
Get a versioned blob from the specified bucket and key.
2626
If version is None, returns the latest version of the blob.
2727
"""
28-
bucket = self.client.bucket(bucket)
28+
bucket = self.client.bucket(bucket_name)
2929
if version is None:
3030
return bucket.blob(key)
31-
else:
32-
versions: Iterable[Blob] = bucket.list_blobs(
33-
prefix=key, versions=True
34-
)
35-
for v in versions:
36-
if v.metadata is None:
37-
continue # Skip blobs without metadata
38-
if v.metadata.get("version") == version:
39-
return v
40-
raise ValueError(
41-
f"Could not find version {version} of blob {key} in bucket {bucket.name}"
42-
)
31+
logging.debug(
32+
"Searching {bucket_name}, {prefix}* for version {version}"
33+
)
34+
versions: Iterable[Blob] = bucket.list_blobs(prefix=key, versions=True)
35+
for v in versions:
36+
if v.metadata is None:
37+
continue # Skip blobs without metadata
38+
if v.metadata.get("version") == version:
39+
logging.info(
40+
f"Blob {bucket_name}, {v.path} has version {version}"
41+
)
42+
return v
43+
logging.info(f"No version {version} found for {bucket_name}, {key}")
44+
raise ValueError(
45+
f"Could not find version {version} of blob {key} in bucket {bucket.name}"
46+
)
4347

4448
def crc32c(
4549
self, bucket_name: str, key: str, version: Optional[str] = None
@@ -51,7 +55,7 @@ def crc32c(
5155
blob = self.get_versioned_blob(bucket_name, key, version)
5256

5357
blob.reload()
54-
logger.debug(f"Crc is {blob.crc32c}")
58+
logger.info(f"Crc for {bucket_name}, {key} is {blob.crc32c}")
5559
return blob.crc32c
5660

5761
def download(
@@ -60,10 +64,15 @@ def download(
6064
"""
6165
get the blob content and associated CRC from google storage.
6266
"""
63-
logger.info(f"Downloading {bucket}, {key}")
67+
logger.debug(
68+
f"Downloading {bucket}, {key}{ ', version:' + version if version is not None else ''}"
69+
)
6470
blob = self.get_versioned_blob(bucket, key, version)
6571

6672
result = blob.download_as_bytes()
73+
logger.info(
74+
f"Downloaded {bucket}, {key}{ ', version:' + version if version is not None else ''}"
75+
)
6776
# According to documentation blob.crc32c is updated as a side effect of
6877
# downloading the content. As a result this should now be the crc of the downloaded
6978
# content (i.e. there is not a race condition where it's getting the CRC from the cloud)
@@ -74,11 +83,25 @@ def _get_latest_version(self, bucket: str, key: str) -> Optional[str]:
7483
Get the latest version of a blob in the specified bucket and key.
7584
If no version is specified, return None.
7685
"""
86+
logger.debug(f"Getting latest version of {bucket}, {key}")
7787
blob = self.client.get_bucket(bucket).get_blob(key)
88+
if blob is None:
89+
logging.warning(f"No blob found in bucket {bucket} with key {key}")
90+
return None
91+
7892
if blob.metadata is None:
7993
logging.warning(
80-
"No metadata found for blob, so it has no version attached."
94+
f"No metadata found for blob {bucket}, {key}, so it has no version attached."
95+
)
96+
return None
97+
98+
version = blob.metadata.get("version")
99+
if version is None:
100+
logging.warning(
101+
f"Blob {bucket}, {key} does not have a version in its metadata"
81102
)
82103
return None
83-
else:
84-
return blob.metadata.get("version")
104+
logging.info(
105+
f"Metadata for blob {bucket}, {key} has version: {version}"
106+
)
107+
return blob.metadata.get("version")

policyengine/utils/data_download.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,14 +12,14 @@ def download(
1212
gcs_bucket: str,
1313
version: Optional[str] = None,
1414
return_version: bool = False,
15-
) -> Tuple[str, Optional[str]]:
15+
) -> Tuple[str, str] | str:
1616
logging.info("Using Google Cloud Storage for download.")
17-
version = download_file_from_gcs(
17+
downloaded_version = download_file_from_gcs(
1818
bucket_name=gcs_bucket,
1919
file_name=filepath,
2020
destination_path=filepath,
2121
version=version,
2222
)
2323
if return_version:
24-
return filepath, version
24+
return filepath, downloaded_version
2525
return filepath

policyengine/utils/google_cloud_bucket.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import asyncio
33
from pathlib import Path
44
from google.cloud.storage import Blob
5-
from typing import Iterable
5+
from typing import Iterable, Optional
66

77
_caching_client: CachingGoogleStorageClient | None = None
88

@@ -24,8 +24,8 @@ def download_file_from_gcs(
2424
bucket_name: str,
2525
file_name: str,
2626
destination_path: str,
27-
version: str = None,
28-
) -> str:
27+
version: Optional[str] = None,
28+
) -> str | None:
2929
"""
3030
Download a file from Google Cloud Storage to a local path.
3131
@@ -38,10 +38,11 @@ def download_file_from_gcs(
3838
version (str): The version of the file that was downloaded, if available.
3939
"""
4040

41-
return _get_client().download(
41+
version = _get_client().download(
4242
bucket_name,
4343
file_name,
4444
Path(destination_path),
4545
version=version,
4646
return_version=True,
4747
)
48+
return version

0 commit comments

Comments
 (0)