Skip to content

Commit 86e72fe

Browse files
committed
feat&docs: add comments and print statements to index service
1 parent d98b02e commit 86e72fe

File tree

1 file changed

+52
-32
lines changed

1 file changed

+52
-32
lines changed

backend/src/service.py

Lines changed: 52 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,12 @@
22

33
import pandas as pd
44

5-
from src.constants import AggregationMethod, LocationPolygon, TemporalResolution, IndexType
5+
from src.constants import (
6+
AggregationMethod,
7+
LocationPolygon,
8+
TemporalResolution,
9+
IndexType,
10+
)
611
from src.gee.image_preprocessing import get_preprocessed_imagery
712
from src.gee.sat_index_info import get_sat_index_info
813
from src.gee.ndvi_cache import ndvi_daily_cache
@@ -13,7 +18,7 @@
1318
def initialize_time_series(
1419
time_series: List[Dict[str, Union[int, float]]],
1520
temporal_resolution: TemporalResolution,
16-
aggregation_method: AggregationMethod
21+
aggregation_method: AggregationMethod,
1722
) -> pd.DataFrame:
1823
"""
1924
Initializes a pandas DataFrame from a time series and applies temporal resolution and aggregation.
@@ -31,28 +36,30 @@ def initialize_time_series(
3136
# Return an empty DataFrame with a datetime index and 'value' column in UTC
3237
if temporal_resolution == TemporalResolution.MONTHLY:
3338
empty_index = pd.date_range(
34-
start="1970-01-01", periods=0, freq='MS', tz='UTC')
39+
start="1970-01-01", periods=0, freq="MS", tz="UTC"
40+
)
3541
else:
3642
empty_index = pd.date_range(
37-
start="1970-01-01", periods=0, freq='D', tz='UTC')
43+
start="1970-01-01", periods=0, freq="D", tz="UTC"
44+
)
3845

39-
return pd.DataFrame(index=empty_index, columns=['value'])
46+
return pd.DataFrame(index=empty_index, columns=["value"])
4047

4148
# Convert timestamps to datetime in UTC and create DataFrame
4249
df = pd.DataFrame(time_series)
43-
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s', utc=True)
44-
df.set_index('timestamp', inplace=True)
50+
df["timestamp"] = pd.to_datetime(df["timestamp"], unit="s", utc=True)
51+
df.set_index("timestamp", inplace=True)
4552

4653
# Resample based on temporal resolution and apply aggregation if needed
4754
if temporal_resolution == TemporalResolution.MONTHLY:
4855
if aggregation_method == AggregationMethod.MEAN:
49-
df = df.resample('MS').mean()
56+
df = df.resample("MS").mean()
5057
elif aggregation_method == AggregationMethod.MEDIAN:
51-
df = df.resample('MS').median()
58+
df = df.resample("MS").median()
5259
elif aggregation_method == AggregationMethod.MAX:
53-
df = df.resample('MS').max()
60+
df = df.resample("MS").max()
5461
elif aggregation_method == AggregationMethod.MIN:
55-
df = df.resample('MS').min()
62+
df = df.resample("MS").min()
5663
# If DAILY, do nothing as time series is already in daily format
5764
return df
5865

@@ -61,7 +68,7 @@ def fill_missing_dates(
6168
df: pd.DataFrame,
6269
start: datetime,
6370
end: datetime,
64-
temporal_resolution: TemporalResolution
71+
temporal_resolution: TemporalResolution,
6572
) -> pd.DataFrame:
6673
"""
6774
Fills missing entries in the time series, adding NaN for missing days or months.
@@ -88,18 +95,18 @@ def fill_missing_dates(
8895

8996
# Generate the complete date range based on the temporal resolution
9097
if temporal_resolution == TemporalResolution.DAILY:
91-
date_range = pd.date_range(start=start, end=end, freq='D', tz='UTC')
98+
date_range = pd.date_range(start=start, end=end, freq="D", tz="UTC")
9299
elif temporal_resolution == TemporalResolution.MONTHLY:
93-
date_range = pd.date_range(start=start, end=end, freq='MS', tz='UTC')
100+
date_range = pd.date_range(start=start, end=end, freq="MS", tz="UTC")
94101
# If the input DataFrame is empty, create a new one with NaNs for all dates in the range
95102
if df.empty:
96-
df = pd.DataFrame(index=date_range, columns=['value'])
97-
df['value'] = None
103+
df = pd.DataFrame(index=date_range, columns=["value"])
104+
df["value"] = None
98105
else:
99106
# Reindex to the complete date range, filling missing dates with NaN
100107
df = df.reindex(date_range)
101108

102-
df.columns = ['value']
109+
df.columns = ["value"]
103110
return df
104111

105112

@@ -109,31 +116,40 @@ def sat_index_service(
109116
aggregation_method: AggregationMethod,
110117
start_date: datetime,
111118
end_date: datetime,
112-
index_type: IndexType
119+
index_type: IndexType,
113120
):
114121
# Temporary implementation of GEE Caching strategy
115122
current_cache_end_date = datetime(
116-
2024, 9, 29, tzinfo=timezone.utc)
117-
if start_date < current_cache_end_date and end_date < current_cache_end_date: # current end of cache
123+
2024, 9, 29, tzinfo=timezone.utc
124+
) # current end of cache
125+
126+
# Entire range is within the cache,
127+
# get entire range from cache, process nothing.
128+
if start_date < current_cache_end_date and end_date < current_cache_end_date:
118129
cache_start_date = start_date
119130
cache_end_date = end_date
120131
processing_start_date = None
121132
processing_end_date = None
122133

134+
# Partial overlap with the cache,
135+
# get cached part from cache, process the rest until end of range.
123136
elif start_date < current_cache_end_date and end_date > current_cache_end_date:
124137
cache_start_date = start_date
125138
cache_end_date = current_cache_end_date
126139
processing_start_date = current_cache_end_date + timedelta(days=1)
127140
processing_end_date = end_date
128141

142+
# Entire range is outside the cache,
143+
# get nothing from cache, process entire range.
129144
elif start_date > current_cache_end_date:
130145
cache_start_date = None
131146
cache_end_date = None
132147
processing_start_date = start_date
133148
processing_end_date = end_date
134149

150+
# Get and process uncached range
135151
if processing_start_date:
136-
152+
print(f'Getting {processing_start_date.date()} to {processing_end_date.date()} from GEE.')
137153
masked_images = get_preprocessed_imagery(
138154
LocationPolygon[location.value].value,
139155
processing_start_date,
@@ -143,7 +159,9 @@ def sat_index_service(
143159
masked_images, LocationPolygon[location.value].value, index_type
144160
)
145161

162+
# Get cached range
146163
if cache_start_date:
164+
print(f'Getting {cache_start_date.date()} to {cache_end_date.date()} from cache.')
147165
cached_data_subset = get_cache_subset(cache_start_date, cache_end_date)
148166

149167
if processing_start_date and cache_start_date:
@@ -152,18 +170,20 @@ def sat_index_service(
152170
ndvi_data = cached_data_subset if cache_start_date else NDVI_time_series
153171

154172
index_df = initialize_time_series(
155-
ndvi_data, temporal_resolution, aggregation_method)
173+
ndvi_data, temporal_resolution, aggregation_method
174+
)
156175

157-
filled_df = fill_missing_dates(
158-
index_df, start_date, end_date, temporal_resolution)
176+
filled_df = fill_missing_dates(index_df, start_date, end_date, temporal_resolution)
159177

160178
return convert_df_to_list(filled_df)
161179

162180

163181
def get_cache_subset(start_date: datetime, end_date: datetime) -> list[dict]:
    """
    Return the cached NDVI entries whose timestamp lies within the given range.

    Args:
        start_date: Inclusive lower bound of the range (timezone-aware datetime).
        end_date: Inclusive upper bound of the range (timezone-aware datetime).

    Returns:
        Entries from ``ndvi_daily_cache`` whose epoch ``"timestamp"`` falls in
        ``[start_date, end_date]``, preserving cache order.
    """
    # Hoist the epoch-second conversions out of the loop instead of recomputing
    # them for every cache entry; a chained comparison replaces the original's
    # line-wrapped two-clause boolean.
    start_ts = int(start_date.timestamp())
    end_ts = int(end_date.timestamp())
    return [
        entry
        for entry in ndvi_daily_cache
        if start_ts <= entry["timestamp"] <= end_ts
    ]
169189

@@ -180,17 +200,17 @@ def convert_df_to_list(df: pd.DataFrame) -> List[Dict[str, Union[int, float, Non
180200
"""
181201
# Convert the DataFrame index to epoch timestamps and reset index
182202
df_reset = df.reset_index()
183-
df_reset['timestamp'] = df_reset['index'].astype(int) // 10**9
184-
df_reset = df_reset.rename(columns={'value': 'value'})
203+
df_reset["timestamp"] = df_reset["index"].astype(int) // 10**9
204+
df_reset = df_reset.rename(columns={"value": "value"})
185205

186206
# Convert to list of dictionaries
187-
result = df_reset[['timestamp', 'value']].to_dict(orient='records')
207+
result = df_reset[["timestamp", "value"]].to_dict(orient="records")
188208

189209
# Convert NaN to None (needs to handle empty df as well)
190210
for entry in result:
191-
if entry['value'] is None:
192-
entry['value'] = None
193-
elif math.isnan(entry['value']):
194-
entry['value'] = None
211+
if entry["value"] is None:
212+
entry["value"] = None
213+
elif math.isnan(entry["value"]):
214+
entry["value"] = None
195215

196216
return result

0 commit comments

Comments
 (0)