Skip to content

Commit 4a40813

Browse files
authored
Simplify and modernize GeoPandas to file usage (#25)
- Improve combine_bird_predictions.py - Refactor process_nests.py for accurate date handling - Ensure empty shapefile schemas are supported - Updated .astype, use explicit 'int64' and 'float64' instead of 'int' and 'float'
1 parent 78d9ae0 commit 4a40813

File tree

5 files changed

+175
-126
lines changed

5 files changed

+175
-126
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,4 @@
99
App/Zooniverse/*
1010
lightning_logs
1111
logs/**
12+
*core.*

combine_bird_predictions.py

Lines changed: 54 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,54 +1,75 @@
11
import os
22
import sys
33
import shutil
4-
from zipfile import ZIP_DEFLATED
5-
from zipfile import ZipFile
6-
import geopandas
4+
from zipfile import ZipFile, ZIP_DEFLATED
5+
import geopandas as gpd
76
import pandas as pd
87
import tools
98

109

1110
def combine(paths):
12-
"""Take prediction shapefiles and wrap into a single file"""
13-
shapefiles = []
14-
for x in paths:
15-
shapefiles.append(geopandas.read_file(x))
16-
summary = geopandas.GeoDataFrame(pd.concat(shapefiles, ignore_index=True), crs=shapefiles[0].crs)
17-
return summary
11+
"""Read multiple prediction shapefiles and concatenate into one GeoDataFrame."""
12+
gdfs = []
13+
target_crs = None
14+
for p in paths:
15+
gdf = gpd.read_file(p)
16+
if target_crs is None:
17+
target_crs = gdf.crs
18+
elif gdf.crs != target_crs:
19+
# Reproject to the CRS of the first file
20+
gdf = gdf.to_crs(target_crs)
21+
gdfs.append(gdf)
22+
if not gdfs:
23+
raise ValueError("No input shapefiles provided.")
24+
return gpd.GeoDataFrame(pd.concat(gdfs, ignore_index=True), crs=target_crs)
1825

1926

2027
if __name__ == "__main__":
28+
if len(sys.argv) < 2:
29+
print("Usage: python combine_bird_predictions.py <shp1> <shp2> ...")
30+
sys.exit(1)
31+
2132
working_dir = tools.get_working_dir()
22-
predictions_path = f"{working_dir}/predictions/"
23-
output_path = f"{working_dir}/everwatch-workflow/App/Zooniverse/data"
24-
output_zip = os.path.join(output_path, "PredictedBirds.zip")
33+
output_path = os.path.join(working_dir, "everwatch-workflow", "App", "Zooniverse", "data")
34+
os.makedirs(output_path, exist_ok=True)
35+
36+
output_shp_base = os.path.join(output_path, "PredictedBirds")
37+
output_zip = output_shp_base + ".zip"
2538

39+
# Read and combine
2640
predictions = sys.argv[1:]
27-
# write output to zooniverse app
2841
df = combine(predictions)
29-
df.to_file(os.path.join(output_path, "PredictedBirds.shp"))
30-
31-
# Write output as csv
32-
grouped_df = df.groupby(['Site', 'Date', 'label']).size().reset_index(name='count')
33-
csv_file_path = os.path.join(output_path, "PredictedBirds.csv")
34-
grouped_df.to_csv(csv_file_path, index=False)
35-
36-
# Zip the shapefile for storage efficiency
37-
with ZipFile(output_zip, 'w', ZIP_DEFLATED) as zip:
38-
for ext in ['cpg', 'dbf', 'prj', 'shp', 'shx']:
39-
focal_file = os.path.join(output_path, f"PredictedBirds.{ext}")
40-
file_name = os.path.basename(focal_file)
41-
zip.write(focal_file, arcname=file_name)
42-
os.remove(focal_file)
43-
44-
# Copy PredictedBirds.zip to everglades-forecast-web repo
45-
dest_path = "/blue/ewhite/everglades/everglades-forecast-web/data"
46-
if not os.path.exists(dest_path):
47-
os.makedirs(dest_path)
42+
43+
try:
44+
import pyogrio
45+
df.to_file(f"{output_shp_base}.shp", driver="ESRI Shapefile", engine="pyogrio")
46+
except ImportError:
47+
df.to_file(f"{output_shp_base}.shp", driver="ESRI Shapefile", engine="fiona")
48+
49+
# Write summary CSV
50+
grouped_df = df.groupby(["Site", "Date", "label"]).size().reset_index(name="count")
51+
grouped_df.to_csv(output_shp_base + ".csv", index=False)
52+
53+
# Zip shapefile components
54+
shp_exts = ["cpg", "dbf", "prj", "shp", "shx"]
55+
with ZipFile(output_zip, "w", compression=ZIP_DEFLATED) as zf:
56+
for ext in shp_exts:
57+
f = f"{output_shp_base}.{ext}"
58+
if os.path.exists(f):
59+
zf.write(f, arcname=os.path.basename(f))
60+
# Clean up shapefile parts after zipping
61+
for ext in shp_exts:
62+
f = f"{output_shp_base}.{ext}"
63+
if os.path.exists(f):
64+
os.remove(f)
65+
66+
# Copy PredictedBirds.zip to forecast web repo (ensure permissions)
67+
dest_path = os.path.join(working_dir, "everglades-forecast-web", "data")
68+
os.makedirs(dest_path, exist_ok=True)
4869
dest_file = os.path.join(dest_path, "PredictedBirds.zip")
4970

5071
if os.path.exists(output_zip):
5172
shutil.copy(output_zip, dest_file)
5273
print(f"{output_zip} copied to {dest_file}.")
5374
else:
54-
print("{output_zip} file does not exist.")
75+
print(f"{output_zip} file does not exist.")

combine_birds_site_year.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,12 @@ def combine_files(bird_detection_files, year, site, score_thresh, savedir):
2929
df.crs = eventdf.crs
3030
df = df.assign(bird_id=range(1, len(df) + 1)) # Index bird IDs starting at 1
3131
filename = os.path.join(savedir, f"{site}_{year}_combined.shp")
32-
df.to_file(filename)
32+
33+
try:
34+
import pyogrio
35+
df.to_file(filename, driver="ESRI Shapefile", engine="pyogrio")
36+
except ImportError:
37+
df.to_file(filename, driver="ESRI Shapefile", engine="fiona")
3338

3439
return df
3540

combine_nests.py

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
import glob
21
import os
32
import re
43
import sys
@@ -18,21 +17,20 @@ def get_site(path):
1817

1918
def load_shapefile(x):
2019
shp = geopandas.read_file(x)
21-
# Force correct types
22-
# Empty shape files don't see to maintain provided types
23-
# when written and loaded
20+
# Force correct datatypes
21+
# Empty shapefiles don't seem to maintain provided types when written and loaded
2422
shp = shp.astype({
25-
'nest_id': 'int',
23+
'nest_id': 'int64',
2624
'Site': 'str',
2725
'Year': 'str',
28-
'xmean': 'float',
29-
'ymean': 'float',
26+
'xmean': 'float64',
27+
'ymean': 'float64',
3028
'first_obs': 'str',
3129
'last_obs': 'str',
32-
'num_obs': 'int',
30+
'num_obs': 'int64',
3331
'species': 'str',
34-
'sum_top1': 'float',
35-
'num_top1': 'int',
32+
'sum_top1': 'float64',
33+
'num_top1': 'int64',
3634
'bird_match': 'str'
3735
})
3836
shp["site"] = get_site(x)

process_nests.py

Lines changed: 106 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -5,104 +5,128 @@
55
import tools
66

77

8-
def count_max_consec_detects(nest_data, date_data):
9-
"""Determine the maximum number of consecutive bird detections"""
10-
assert date_data.shape[0] == 1, "date_data should be a Pandas DataFrame with one row"
11-
sorted_dates = pd.Series(date_data.Date[0]).sort_values().reset_index(drop=True)
12-
sorted_nest_dates = pd.Series(nest_data.Date).sort_values().reset_index(drop=True)
13-
sorted_dates_dict = {val: key for key, val in sorted_dates.items()}
14-
sorted_dates_combined_diff = sorted_nest_dates.map(sorted_dates_dict).diff()
15-
all_consec_detects = []
16-
consec_detects = 0
17-
for i in range(1, len(sorted_dates_combined_diff)):
18-
if sorted_dates_combined_diff[i] == 1 and sorted_dates_combined_diff[i - 1] != 1:
19-
# New start to consectutive detection set
20-
consec_detects = 1
21-
if i + 1 == len(sorted_dates_combined_diff):
22-
all_consec_detects.append(consec_detects)
23-
elif sorted_dates_combined_diff[i] == 1 and sorted_dates_combined_diff[i - 1] == 1:
24-
# Increment existing consecutive detection set
25-
consec_detects += 1
26-
if i + 1 == len(sorted_dates_combined_diff):
27-
all_consec_detects.append(consec_detects)
28-
elif sorted_dates_combined_diff[i] != 1 and sorted_dates_combined_diff[i - 1] == 1:
29-
# Store completed consecutive detection set and reset
30-
all_consec_detects.append(consec_detects)
31-
consec_detects = 0
32-
elif sorted_dates_combined_diff[i] != 1 and sorted_dates_combined_diff[i - 1] != 1:
33-
consec_detects == 0
8+
def count_max_consec_detects(nest_data: pd.DataFrame, date_data: pd.DataFrame) -> int:
9+
"""Determine the maximum number of consecutive bird detections."""
10+
assert date_data.shape[0] == 1, "date_data should be a DataFrame with one row"
11+
# Normalize to datetime and build an ordered index of dates observed at the site-year
12+
all_dates = sorted(pd.to_datetime(d) for d in date_data.loc[0, "Date"])
13+
pos = {d: i for i, d in enumerate(all_dates)}
14+
idxs = sorted(pos.get(pd.to_datetime(d)) for d in nest_data["Date"].unique() if pd.to_datetime(d) in pos)
15+
idxs = [i for i in idxs if i is not None]
16+
if not idxs:
17+
return 0
18+
longest = cur = 1
19+
for i in range(1, len(idxs)):
20+
if idxs[i] - idxs[i - 1] == 1:
21+
cur += 1
3422
else:
35-
assert False, "Oops, I shouldn't be here"
36-
if all_consec_detects:
37-
max_consec_detects = max(all_consec_detects)
38-
else:
39-
max_consec_detects = 0
40-
41-
return max_consec_detects
23+
longest = max(longest, cur)
24+
cur = 1
25+
return max(longest, cur)
4226

4327

4428
def process_nests(nest_file, year, site, savedir, min_score=0.3, min_detections=3, min_consec_detects=1):
45-
"""Process nests into a one row per nest table"""
29+
"""Process nests into a one-row-per-nest table and write a shapefile."""
30+
SCHEMA = {
31+
"geometry": "Point",
32+
"properties": {
33+
"nest_id": "int",
34+
"Site": "str",
35+
"Year": "str",
36+
"xmean": "float",
37+
"ymean": "float",
38+
"first_obs": "str",
39+
"last_obs": "str",
40+
"num_obs": "int",
41+
"species": "str",
42+
"sum_top1": "float",
43+
"num_top1": "int",
44+
"bird_match": "str",
45+
},
46+
}
47+
4648
nests_data = geopandas.read_file(nest_file)
47-
date_data = nests_data.groupby(['Site', 'Year']).agg({'Date': lambda x: x.unique().tolist()}).reset_index()
48-
target_inds = nests_data['target_ind'].unique()
49-
nests = []
49+
50+
# Build date_data: single row with all dates for the site-year
51+
date_data = (nests_data.groupby(["Site", "Year"]).agg({
52+
"Date": lambda x: pd.Series(x).unique().tolist()
53+
}).reset_index())
54+
55+
target_inds = nests_data["target_ind"].unique()
56+
nests_rows = []
57+
5058
for target_ind in target_inds:
51-
nest_data = nests_data[(nests_data['target_ind'] == target_ind) & (nests_data['score'] >= min_score)]
59+
nest_data = nests_data[(nests_data["target_ind"] == target_ind) & (nests_data["score"] >= min_score)]
5260
num_consec_detects = count_max_consec_detects(nest_data, date_data)
61+
5362
if len(nest_data) >= min_detections or num_consec_detects >= min_consec_detects:
54-
summed_scores = nest_data.groupby(['Site', 'Year', 'target_ind', 'label']).score.agg(['sum', 'count'])
55-
top_score_data = summed_scores[summed_scores['sum'] == max(summed_scores['sum'])].reset_index()
56-
nest_info = nest_data.groupby(['Site', 'Year', 'target_ind']).agg({
57-
'Date': ['min', 'max', 'count'],
58-
'match_xmin': ['mean'],
59-
'match_ymin': ['mean'],
60-
'match_xmax': ['mean'],
61-
'match_ymax': ['mean']
62-
}).reset_index()
63+
# Aggregate scores per label and pick the top label by summed score
64+
summed_scores = (nest_data.groupby(["Site", "Year", "target_ind",
65+
"label"])["score"].agg(["sum", "count"]).reset_index())
66+
top_idx = summed_scores["sum"].idxmax()
67+
top_score_data = summed_scores.loc[top_idx]
68+
69+
# Summary stats
70+
nest_info = (nest_data.groupby(["Site", "Year", "target_ind"]).agg({
71+
"Date": ["min", "max", "count"],
72+
"match_xmin": ["mean"],
73+
"match_xmax": ["mean"],
74+
"match_ymin": ["mean"],
75+
"match_ymax": ["mean"],
76+
}))
6377
xmean = (nest_info['match_xmin']['mean'][0] + nest_info['match_xmax']['mean']) / 2
6478
ymean = (nest_info['match_ymin']['mean'][0] + nest_info['match_ymax']['mean']) / 2
65-
bird_match = ",".join([str(x) for x in nest_data["bird_id"]])
66-
nests.append([
67-
target_ind, nest_info['Site'][0], nest_info['Year'][0], xmean[0], ymean[0], nest_info['Date']['min'][0],
68-
nest_info['Date']['max'][0], nest_info['Date']['count'][0], top_score_data['label'][0],
69-
top_score_data['sum'][0], top_score_data['count'][0], bird_match
79+
# Flatten date stats
80+
first_obs = nest_info[("Date", "min")].values[0]
81+
last_obs = nest_info[("Date", "max")].values[0]
82+
num_obs = int(nest_info[("Date", "count")].values[0])
83+
84+
bird_match = ",".join(str(x) for x in nest_data["bird_id"])
85+
86+
nests_rows.append([
87+
int(target_ind),
88+
str(top_score_data["Site"]),
89+
str(top_score_data["Year"]),
90+
float(xmean),
91+
float(ymean),
92+
str(first_obs),
93+
str(last_obs),
94+
int(num_obs),
95+
str(top_score_data["label"]),
96+
float(top_score_data["sum"]),
97+
int(top_score_data["count"]),
98+
bird_match,
7099
])
71100

72-
if not os.path.exists(savedir):
73-
os.makedirs(savedir)
101+
os.makedirs(savedir, exist_ok=True)
74102
filename = os.path.join(savedir, f"{site}_{year}_processed_nests.shp")
75103

76-
if nests:
77-
nests = pd.DataFrame(nests,
78-
columns=[
79-
'nest_id', 'Site', 'Year', 'xmean', 'ymean', 'first_obs', 'last_obs', 'num_obs',
80-
'species', 'sum_top1', 'num_top1', 'bird_match'
81-
])
82-
nests_shp = geopandas.GeoDataFrame(nests, geometry=geopandas.points_from_xy(nests.xmean, nests.ymean))
83-
nests_shp.crs = nests_data.crs
84-
nests_shp.to_file(filename)
104+
gdf_tofile = None
105+
if nests_rows:
106+
nests_df = pd.DataFrame(nests_rows, columns=list(SCHEMA["properties"].keys()))
107+
nests_gdf = geopandas.GeoDataFrame(
108+
nests_df,
109+
geometry=geopandas.points_from_xy(nests_df.xmean, nests_df.ymean),
110+
crs=nests_data.crs,
111+
)
112+
gdf_tofile = nests_gdf
85113
else:
86-
schema = {
87-
"geometry": "Polygon",
88-
"properties": {
89-
'nest_id': 'int',
90-
'Site': 'str',
91-
'Year': 'str',
92-
'xmean': 'float',
93-
'ymean': 'float',
94-
'first_obs': 'str',
95-
'last_obs': 'str',
96-
'num_obs': 'int',
97-
'species': 'str',
98-
'sum_top1': 'float',
99-
'num_top1': 'int',
100-
'bird_match': 'str'
101-
}
114+
empty_data = {
115+
k: pd.Series(dtype="int64" if v == "int" else "float64" if v == "float" else "object")
116+
for k, v in SCHEMA["properties"].items()
102117
}
103-
crs = nests_data.crs
104-
empty_nests = geopandas.GeoDataFrame(geometry=[])
105-
empty_nests.to_file(filename, driver='ESRI Shapefile', schema=schema, crs=crs)
118+
empty_gdf = geopandas.GeoDataFrame(
119+
empty_data,
120+
geometry=geopandas.GeoSeries([], dtype="geometry"),
121+
crs=nests_data.crs,
122+
)
123+
gdf_tofile = empty_gdf
124+
125+
try:
126+
import pyogrio
127+
gdf_tofile.to_file(filename, driver="ESRI Shapefile", engine="pyogrio")
128+
except ImportError:
129+
gdf_tofile.to_file(filename, driver="ESRI Shapefile", engine="fiona")
106130

107131

108132
if __name__ == "__main__":

0 commit comments

Comments
 (0)