Skip to content

Commit 4a40813

Browse files
authored
Simplify and modernize GeoPandas to file usage (#25)
- Improve combine_bird_predictions.py - Refactor process_nests.py for accurate date handling - Ensure empty shapefile schemas are supported - Updated .astype, use explicit 'int64' and 'float64' instead of 'int' and 'float'
1 parent 78d9ae0 commit 4a40813

File tree

5 files changed

+175
-126
lines changed

5 files changed

+175
-126
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,4 @@
99
App/Zooniverse/*
1010
lightning_logs
1111
logs/**
12+
*core.*

combine_bird_predictions.py

Lines changed: 54 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,54 +1,75 @@
11
import os
22
import sys
33
import shutil
4-
from zipfile import ZIP_DEFLATED
5-
from zipfile import ZipFile
6-
import geopandas
4+
from zipfile import ZipFile, ZIP_DEFLATED
5+
import geopandas as gpd
76
import pandas as pd
87
import tools
98

109

1110
def combine(paths):
12-
"""Take prediction shapefiles and wrap into a single file"""
13-
shapefiles = []
14-
for x in paths:
15-
shapefiles.append(geopandas.read_file(x))
16-
summary = geopandas.GeoDataFrame(pd.concat(shapefiles, ignore_index=True), crs=shapefiles[0].crs)
17-
return summary
11+
"""Read multiple prediction shapefiles and concatenate into one GeoDataFrame."""
12+
gdfs = []
13+
target_crs = None
14+
for p in paths:
15+
gdf = gpd.read_file(p)
16+
if target_crs is None:
17+
target_crs = gdf.crs
18+
elif gdf.crs != target_crs:
19+
# Reproject to the CRS of the first file
20+
gdf = gdf.to_crs(target_crs)
21+
gdfs.append(gdf)
22+
if not gdfs:
23+
raise ValueError("No input shapefiles provided.")
24+
return gpd.GeoDataFrame(pd.concat(gdfs, ignore_index=True), crs=target_crs)
1825

1926

2027
if __name__ == "__main__":
28+
if len(sys.argv) < 2:
29+
print("Usage: python combine_bird_predictions.py <shp1> <shp2> ...")
30+
sys.exit(1)
31+
2132
working_dir = tools.get_working_dir()
22-
predictions_path = f"{working_dir}/predictions/"
23-
output_path = f"{working_dir}/everwatch-workflow/App/Zooniverse/data"
24-
output_zip = os.path.join(output_path, "PredictedBirds.zip")
33+
output_path = os.path.join(working_dir, "everwatch-workflow", "App", "Zooniverse", "data")
34+
os.makedirs(output_path, exist_ok=True)
35+
36+
output_shp_base = os.path.join(output_path, "PredictedBirds")
37+
output_zip = output_shp_base + ".zip"
2538

39+
# Read and combine
2640
predictions = sys.argv[1:]
27-
# write output to zooniverse app
2841
df = combine(predictions)
29-
df.to_file(os.path.join(output_path, "PredictedBirds.shp"))
30-
31-
# Write output as csv
32-
grouped_df = df.groupby(['Site', 'Date', 'label']).size().reset_index(name='count')
33-
csv_file_path = os.path.join(output_path, "PredictedBirds.csv")
34-
grouped_df.to_csv(csv_file_path, index=False)
35-
36-
# Zip the shapefile for storage efficiency
37-
with ZipFile(output_zip, 'w', ZIP_DEFLATED) as zip:
38-
for ext in ['cpg', 'dbf', 'prj', 'shp', 'shx']:
39-
focal_file = os.path.join(output_path, f"PredictedBirds.{ext}")
40-
file_name = os.path.basename(focal_file)
41-
zip.write(focal_file, arcname=file_name)
42-
os.remove(focal_file)
43-
44-
# Copy PredictedBirds.zip to everglades-forecast-web repo
45-
dest_path = "/blue/ewhite/everglades/everglades-forecast-web/data"
46-
if not os.path.exists(dest_path):
47-
os.makedirs(dest_path)
42+
43+
try:
44+
import pyogrio
45+
df.to_file(f"{output_shp_base}.shp", driver="ESRI Shapefile", engine="pyogrio")
46+
except ImportError:
47+
df.to_file(f"{output_shp_base}.shp", driver="ESRI Shapefile", engine="fiona")
48+
49+
# Write summary CSV
50+
grouped_df = df.groupby(["Site", "Date", "label"]).size().reset_index(name="count")
51+
grouped_df.to_csv(output_shp_base + ".csv", index=False)
52+
53+
# Zip shapefile components
54+
shp_exts = ["cpg", "dbf", "prj", "shp", "shx"]
55+
with ZipFile(output_zip, "w", compression=ZIP_DEFLATED) as zf:
56+
for ext in shp_exts:
57+
f = f"{output_shp_base}.{ext}"
58+
if os.path.exists(f):
59+
zf.write(f, arcname=os.path.basename(f))
60+
# Clean up shapefile parts after zipping
61+
for ext in shp_exts:
62+
f = f"{output_shp_base}.{ext}"
63+
if os.path.exists(f):
64+
os.remove(f)
65+
66+
# Copy PredictedBirds.zip to forecast web repo (ensure permissions)
67+
dest_path = os.path.join(working_dir, "everglades-forecast-web", "data")
68+
os.makedirs(dest_path, exist_ok=True)
4869
dest_file = os.path.join(dest_path, "PredictedBirds.zip")
4970

5071
if os.path.exists(output_zip):
5172
shutil.copy(output_zip, dest_file)
5273
print(f"{output_zip} copied to {dest_file}.")
5374
else:
54-
print("{output_zip} file does not exist.")
75+
print(f"{output_zip} file does not exist.")

combine_birds_site_year.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,12 @@ def combine_files(bird_detection_files, year, site, score_thresh, savedir):
2929
df.crs = eventdf.crs
3030
df = df.assign(bird_id=range(1, len(df) + 1)) # Index bird IDs starting at 1
3131
filename = os.path.join(savedir, f"{site}_{year}_combined.shp")
32-
df.to_file(filename)
32+
33+
try:
34+
import pyogrio
35+
df.to_file(filename, driver="ESRI Shapefile", engine="pyogrio")
36+
except ImportError:
37+
df.to_file(filename, driver="ESRI Shapefile", engine="fiona")
3338

3439
return df
3540

combine_nests.py

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
import glob
21
import os
32
import re
43
import sys
@@ -18,21 +17,20 @@ def get_site(path):
1817

1918
def load_shapefile(x):
2019
shp = geopandas.read_file(x)
21-
# Force correct types
22-
# Empty shape files don't see to maintain provided types
23-
# when written and loaded
20+
# Force correct datatypes
21+
# Empty shapefiles don't seem to maintain provided types when written and loaded
2422
shp = shp.astype({
25-
'nest_id': 'int',
23+
'nest_id': 'int64',
2624
'Site': 'str',
2725
'Year': 'str',
28-
'xmean': 'float',
29-
'ymean': 'float',
26+
'xmean': 'float64',
27+
'ymean': 'float64',
3028
'first_obs': 'str',
3129
'last_obs': 'str',
32-
'num_obs': 'int',
30+
'num_obs': 'int64',
3331
'species': 'str',
34-
'sum_top1': 'float',
35-
'num_top1': 'int',
32+
'sum_top1': 'float64',
33+
'num_top1': 'int64',
3634
'bird_match': 'str'
3735
})
3836
shp["site"] = get_site(x)

process_nests.py

Lines changed: 106 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -5,104 +5,128 @@
55
import tools
66

77

8-
def count_max_consec_detects(nest_data, date_data):
9-
"""Determine the maximum number of consecutive bird detections"""
10-
assert date_data.shape[0] == 1, "date_data should be a Pandas DataFrame with one row"
11-
sorted_dates = pd.Series(date_data.Date[0]).sort_values().reset_index(drop=True)
12-
sorted_nest_dates = pd.Series(nest_data.Date).sort_values().reset_index(drop=True)
13-
sorted_dates_dict = {val: key for key, val in sorted_dates.items()}
14-
sorted_dates_combined_diff = sorted_nest_dates.map(sorted_dates_dict).diff()
15-
all_consec_detects = []
16-
consec_detects = 0
17-
for i in range(1, len(sorted_dates_combined_diff)):
18-
if sorted_dates_combined_diff[i] == 1 and sorted_dates_combined_diff[i - 1] != 1:
19-
# New start to consectutive detection set
20-
consec_detects = 1
21-
if i + 1 == len(sorted_dates_combined_diff):
22-
all_consec_detects.append(consec_detects)
23-
elif sorted_dates_combined_diff[i] == 1 and sorted_dates_combined_diff[i - 1] == 1:
24-
# Increment existing consecutive detection set
25-
consec_detects += 1
26-
if i + 1 == len(sorted_dates_combined_diff):
27-
all_consec_detects.append(consec_detects)
28-
elif sorted_dates_combined_diff[i] != 1 and sorted_dates_combined_diff[i - 1] == 1:
29-
# Store completed consecutive detection set and reset
30-
all_consec_detects.append(consec_detects)
31-
consec_detects = 0
32-
elif sorted_dates_combined_diff[i] != 1 and sorted_dates_combined_diff[i - 1] != 1:
33-
consec_detects == 0
8+
def count_max_consec_detects(nest_data: pd.DataFrame, date_data: pd.DataFrame) -> int:
9+
"""Determine the maximum number of consecutive bird detections."""
10+
assert date_data.shape[0] == 1, "date_data should be a DataFrame with one row"
11+
# Normalize to datetime and build an ordered index of dates observed at the site-year
12+
all_dates = sorted(pd.to_datetime(d) for d in date_data.loc[0, "Date"])
13+
pos = {d: i for i, d in enumerate(all_dates)}
14+
idxs = sorted(pos.get(pd.to_datetime(d)) for d in nest_data["Date"].unique() if pd.to_datetime(d) in pos)
15+
idxs = [i for i in idxs if i is not None]
16+
if not idxs:
17+
return 0
18+
longest = cur = 1
19+
for i in range(1, len(idxs)):
20+
if idxs[i] - idxs[i - 1] == 1:
21+
cur += 1
3422
else:
35-
assert False, "Oops, I shouldn't be here"
36-
if all_consec_detects:
37-
max_consec_detects = max(all_consec_detects)
38-
else:
39-
max_consec_detects = 0
40-
41-
return max_consec_detects
23+
longest = max(longest, cur)
24+
cur = 1
25+
return max(longest, cur)
4226

4327

4428
def process_nests(nest_file, year, site, savedir, min_score=0.3, min_detections=3, min_consec_detects=1):
45-
"""Process nests into a one row per nest table"""
29+
"""Process nests into a one-row-per-nest table and write a shapefile."""
30+
SCHEMA = {
31+
"geometry": "Point",
32+
"properties": {
33+
"nest_id": "int",
34+
"Site": "str",
35+
"Year": "str",
36+
"xmean": "float",
37+
"ymean": "float",
38+
"first_obs": "str",
39+
"last_obs": "str",
40+
"num_obs": "int",
41+
"species": "str",
42+
"sum_top1": "float",
43+
"num_top1": "int",
44+
"bird_match": "str",
45+
},
46+
}
47+
4648
nests_data = geopandas.read_file(nest_file)
47-
date_data = nests_data.groupby(['Site', 'Year']).agg({'Date': lambda x: x.unique().tolist()}).reset_index()
48-
target_inds = nests_data['target_ind'].unique()
49-
nests = []
49+
50+
# Build date_data: single row with all dates for the site-year
51+
date_data = (nests_data.groupby(["Site", "Year"]).agg({
52+
"Date": lambda x: pd.Series(x).unique().tolist()
53+
}).reset_index())
54+
55+
target_inds = nests_data["target_ind"].unique()
56+
nests_rows = []
57+
5058
for target_ind in target_inds:
51-
nest_data = nests_data[(nests_data['target_ind'] == target_ind) & (nests_data['score'] >= min_score)]
59+
nest_data = nests_data[(nests_data["target_ind"] == target_ind) & (nests_data["score"] >= min_score)]
5260
num_consec_detects = count_max_consec_detects(nest_data, date_data)
61+
5362
if len(nest_data) >= min_detections or num_consec_detects >= min_consec_detects:
54-
summed_scores = nest_data.groupby(['Site', 'Year', 'target_ind', 'label']).score.agg(['sum', 'count'])
55-
top_score_data = summed_scores[summed_scores['sum'] == max(summed_scores['sum'])].reset_index()
56-
nest_info = nest_data.groupby(['Site', 'Year', 'target_ind']).agg({
57-
'Date': ['min', 'max', 'count'],
58-
'match_xmin': ['mean'],
59-
'match_ymin': ['mean'],
60-
'match_xmax': ['mean'],
61-
'match_ymax': ['mean']
62-
}).reset_index()
63+
# Aggregate scores per label and pick the top label by summed score
64+
summed_scores = (nest_data.groupby(["Site", "Year", "target_ind",
65+
"label"])["score"].agg(["sum", "count"]).reset_index())
66+
top_idx = summed_scores["sum"].idxmax()
67+
top_score_data = summed_scores.loc[top_idx]
68+
69+
# Summary stats
70+
nest_info = (nest_data.groupby(["Site", "Year", "target_ind"]).agg({
71+
"Date": ["min", "max", "count"],
72+
"match_xmin": ["mean"],
73+
"match_xmax": ["mean"],
74+
"match_ymin": ["mean"],
75+
"match_ymax": ["mean"],
76+
}))
6377
xmean = (nest_info['match_xmin']['mean'][0] + nest_info['match_xmax']['mean']) / 2
6478
ymean = (nest_info['match_ymin']['mean'][0] + nest_info['match_ymax']['mean']) / 2
65-
bird_match = ",".join([str(x) for x in nest_data["bird_id"]])
66-
nests.append([
67-
target_ind, nest_info['Site'][0], nest_info['Year'][0], xmean[0], ymean[0], nest_info['Date']['min'][0],
68-
nest_info['Date']['max'][0], nest_info['Date']['count'][0], top_score_data['label'][0],
69-
top_score_data['sum'][0], top_score_data['count'][0], bird_match
79+
# Flatten date stats
80+
first_obs = nest_info[("Date", "min")].values[0]
81+
last_obs = nest_info[("Date", "max")].values[0]
82+
num_obs = int(nest_info[("Date", "count")].values[0])
83+
84+
bird_match = ",".join(str(x) for x in nest_data["bird_id"])
85+
86+
nests_rows.append([
87+
int(target_ind),
88+
str(top_score_data["Site"]),
89+
str(top_score_data["Year"]),
90+
float(xmean),
91+
float(ymean),
92+
str(first_obs),
93+
str(last_obs),
94+
int(num_obs),
95+
str(top_score_data["label"]),
96+
float(top_score_data["sum"]),
97+
int(top_score_data["count"]),
98+
bird_match,
7099
])
71100

72-
if not os.path.exists(savedir):
73-
os.makedirs(savedir)
101+
os.makedirs(savedir, exist_ok=True)
74102
filename = os.path.join(savedir, f"{site}_{year}_processed_nests.shp")
75103

76-
if nests:
77-
nests = pd.DataFrame(nests,
78-
columns=[
79-
'nest_id', 'Site', 'Year', 'xmean', 'ymean', 'first_obs', 'last_obs', 'num_obs',
80-
'species', 'sum_top1', 'num_top1', 'bird_match'
81-
])
82-
nests_shp = geopandas.GeoDataFrame(nests, geometry=geopandas.points_from_xy(nests.xmean, nests.ymean))
83-
nests_shp.crs = nests_data.crs
84-
nests_shp.to_file(filename)
104+
gdf_tofile = None
105+
if nests_rows:
106+
nests_df = pd.DataFrame(nests_rows, columns=list(SCHEMA["properties"].keys()))
107+
nests_gdf = geopandas.GeoDataFrame(
108+
nests_df,
109+
geometry=geopandas.points_from_xy(nests_df.xmean, nests_df.ymean),
110+
crs=nests_data.crs,
111+
)
112+
gdf_tofile = nests_gdf
85113
else:
86-
schema = {
87-
"geometry": "Polygon",
88-
"properties": {
89-
'nest_id': 'int',
90-
'Site': 'str',
91-
'Year': 'str',
92-
'xmean': 'float',
93-
'ymean': 'float',
94-
'first_obs': 'str',
95-
'last_obs': 'str',
96-
'num_obs': 'int',
97-
'species': 'str',
98-
'sum_top1': 'float',
99-
'num_top1': 'int',
100-
'bird_match': 'str'
101-
}
114+
empty_data = {
115+
k: pd.Series(dtype="int64" if v == "int" else "float64" if v == "float" else "object")
116+
for k, v in SCHEMA["properties"].items()
102117
}
103-
crs = nests_data.crs
104-
empty_nests = geopandas.GeoDataFrame(geometry=[])
105-
empty_nests.to_file(filename, driver='ESRI Shapefile', schema=schema, crs=crs)
118+
empty_gdf = geopandas.GeoDataFrame(
119+
empty_data,
120+
geometry=geopandas.GeoSeries([], dtype="geometry"),
121+
crs=nests_data.crs,
122+
)
123+
gdf_tofile = empty_gdf
124+
125+
try:
126+
import pyogrio
127+
gdf_tofile.to_file(filename, driver="ESRI Shapefile", engine="pyogrio")
128+
except ImportError:
129+
gdf_tofile.to_file(filename, driver="ESRI Shapefile", engine="fiona")
106130

107131

108132
if __name__ == "__main__":

0 commit comments

Comments
 (0)