Skip to content

Commit 85c5b4f

Browse files
committed
refactor(preprocessing.py): use GeoPandas instead of gis-utils for reading/writing shapefiles; except whenreading multiple shapefiles; fixes potential issue with mixed geometry types (e.g. Polygon and MultiPolygon) and possibly other potential issues.
1 parent 036ea3e commit 85c5b4f

File tree

1 file changed

+25
-43
lines changed

1 file changed

+25
-43
lines changed

sfrmaker/preprocessing.py

Lines changed: 25 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,8 @@
2121
get_shapefile_crs,
2222
write_raster,
2323
project,
24+
df2shp,
2425
shp2df,
25-
df2shp,
2626
get_shapefile_crs,
2727
get_authority_crs
2828
)
@@ -347,7 +347,8 @@ def cull_flowlines(NHDPlus_paths,
347347
'pf_file': '{}/PlusFlow{}.dbf'.format(outfolder, version),
348348
'elevslope_file': '{}/elevslope{}.dbf'.format(outfolder, version)
349349
}
350-
df2shp(fl, results['flowlines_file'], crs=4269)
350+
fl = gpd.GeoDataFrame(fl, crs=4269)
351+
fl.to_file(results['flowlines_file'], index=False)
351352
df2shp(pfvaa, results['pfvaa_file'])
352353
df2shp(pf, results['pf_file'])
353354
df2shp(elevslope, results['elevslope_file'])
@@ -694,16 +695,16 @@ def preprocess_nhdplus(flowlines_file, pfvaa_file,
694695
outfile = outfile.format(f'gt{asum_thresh:.0f}km_')
695696
else:
696697
outfile = outfile.format('')
697-
df2shp(flccb.drop('buffpoly', axis=1),
698-
outfolder / outfile,
699-
index=False, crs=dest_crs)
698+
flccb.drop('buffpoly', axis=1).to_file(
699+
outfolder / outfile, index=False
700+
)
700701
else:
701702
if not Path(flowline_elevations_file).exists():
702703
raise ValueError(
703704
"If run_zonal_statistics=False a flowline_elevations_file produced by"
704705
"a previous run of the sfrmaker.preprocessing.preprocess_nhdplus() "
705706
"function is needed.")
706-
flccb = shp2df(flowline_elevations_file)
707+
flccb = gpd.read_file(flowline_elevations_file)
707708
flccb.index = flccb['COMID']
708709
flccb['buffpoly'] = flccb['geometry']
709710
merge_cols = [c for c in flccb.columns if c not in fl.columns]
@@ -945,9 +946,8 @@ def preprocess_nhdplus(flowlines_file, pfvaa_file,
945946
narwidth_crs = get_crs(narwidth_shapefile)
946947
narwidth_bbox = project(flowline_bbox, flowline_crs, narwidth_crs)
947948
sample_NARWidth(flcc, narwidth_shapefile,
948-
waterbodies=waterbody_shapefiles,
949+
waterbody_shapefiles=waterbody_shapefiles,
949950
bbox_filter=narwidth_bbox.bounds,
950-
crs=project_crs,
951951
output_width_units=output_length_units)
952952
logger.log('Sampling widths from NARWidth database')
953953

@@ -969,8 +969,7 @@ def preprocess_nhdplus(flowlines_file, pfvaa_file,
969969
outfile = outfile.format(f'_gt{asum_thresh:.0f}km')
970970
else:
971971
outfile = outfile.format('')
972-
df2shp(flcc.drop('buffpoly', axis=1), outfolder / outfile,
973-
index=False, crs=dest_crs)
972+
flcc.drop('buffpoly', axis=1, errors='ignore').to_file(outfolder / outfile, index=False)
974973
logger.log('Preprocessing Flowlines')
975974

976975
return flcc
@@ -1048,9 +1047,8 @@ def clip_flowlines_to_polygon(flowlines, polygon,
10481047
return flc
10491048

10501049

1051-
def sample_NARWidth(flowlines, narwidth_shapefile, waterbodies,
1050+
def sample_NARWidth(flowlines, narwidth_shapefile, waterbody_shapefiles,
10521051
bbox_filter=None,
1053-
crs=None,
10541052
output_width_units='meters',
10551053
outpath='shps/'):
10561054
"""
@@ -1060,32 +1058,15 @@ def sample_NARWidth(flowlines, narwidth_shapefile, waterbodies,
10601058
10611059
Parameters
10621060
----------
1063-
flowlines : DataFrame
1061+
flowlines : GeoDataFrame
10641062
flowlines dataframe from preprocess_nhdplus().
1065-
Flowlines must be in a projected Coordinate reference system (CRS).
1063+
Flowlines must be in a projected Coordinate reference system (CRS),
1064+
specified by a valid .crs attribute.
10661065
narwidth_shapefile : str
10671066
Path to shapefile from the NARWidth database (Allen and Pavelsky, 2015).
10681067
waterbody_shapefiles : str or list of strings, optional
10691068
Path(s) to NHDPlus NHDWaterbody shapefile(s). Only required if a
10701069
``narwidth_shapefile`` is specified.
1071-
crs : obj
1072-
Coordinate reference system of flowlines.
1073-
A Python int, dict, str, or :class:`pyproj.crs.CRS` instance
1074-
passed to :meth:`pyproj.crs.CRS.from_user_input`
1075-
1076-
Can be any of:
1077-
- PROJ string
1078-
- Dictionary of PROJ parameters
1079-
- PROJ keyword arguments for parameters
1080-
- JSON string with PROJ parameters
1081-
- CRS WKT string
1082-
- An authority string [i.e. 'epsg:4326']
1083-
- An EPSG integer code [i.e. 4326]
1084-
- A tuple of ("auth_name": "auth_code") [i.e ('epsg', '4326')]
1085-
- An object with a `to_wkt` method.
1086-
- A :class:`pyproj.crs.CRS` class
1087-
1088-
By default, None
10891070
filter : tuple
10901071
Bounds (most likely in lat/lon) for filtering NARWidth lines that are read in
10911072
(left, bottom, right, top)
@@ -1102,28 +1083,29 @@ def sample_NARWidth(flowlines, narwidth_shapefile, waterbodies,
11021083
To avoid erroneous overlap between main-stem NARWidth estimates and minor tributaries, flowlines with arbolate sums less than 500 km only receive widths from NARWidth lines that have at least 50% of their length inside of the 1-km buffer. NARWidth values are generally higher than arbolate sum-based estimates, because the NARWidth estimates represent mean flows and include all reaches of the stream, whereas the arbolate sum estimates are based on field measurements taken at narrower than average, well-behaved channel sections near stream gages, under base flow conditions. Therefore, measured channel widths may be biased low compared to actual widths throughout the stream network (Allen and Pavelsky, 2015; Park, 1977).
11031084
"""
11041085

1105-
wb = shp2df(waterbodies)
1086+
wb = shp2df(waterbody_shapefiles)
11061087

11071088
if not os.path.isdir(outpath):
11081089
os.makedirs(outpath)
11091090

1110-
flowline_crs = get_crs(crs=crs)
1111-
if flowline_crs.is_geographic:
1091+
if flowlines.crs.is_geographic:
11121092
msg = ("Flowlines must be in a projected Coordinate Reference System "
11131093
"(CRS; i.e. with units of meters).")
11141094
raise ValueError(msg)
11151095

11161096
# read in narwidth shapefile; to_crs to flowline CRS
11171097
nw = shp2df(narwidth_shapefile, filter=bbox_filter)
11181098
narwidth_crs = get_shapefile_crs(narwidth_shapefile)
1119-
nw['geometry'] = project(nw.geometry, narwidth_crs, flowline_crs)
1099+
nw['geometry'] = project(nw.geometry, narwidth_crs, flowlines.crs)
11201100

11211101
# draw buffers around flowlines
11221102
buffdist = 1000 # m
11231103
buffers = [g.buffer(buffdist) for g in flowlines.geometry]
11241104
flbuff = flowlines.copy()
11251105
flbuff['geometry'] = buffers
1126-
df2shp(flbuff, '{}/flowlines_edited_buffers_{}.shp'.format(outpath, buffdist), crs=flowline_crs)
1106+
assert flbuff.crs == flowlines.crs
1107+
flbuff.drop('buffpoly', axis=1).to_file(
1108+
f'{outpath}/flowlines_edited_buffers_{buffdist}.shp', index=False)
11271109

11281110
# determine which narwidth segments intersect the flowline buffers
11291111
results = intersect_rtree(nw.geometry.tolist(), flbuff.geometry.tolist())
@@ -1187,9 +1169,9 @@ def sample_NARWidth(flowlines, narwidth_shapefile, waterbodies,
11871169
rivers_with_widths = ~flowlines.is_wb & (flowlines.nhd_asum > 500) & ~np.isnan(flowlines.narwd_mean)
11881170
wbs = flowlines.is_wb & ~np.isnan(flowlines.narwd_mean)
11891171
criteria = rivers_with_widths | wbs
1190-
1191-
df2shp(flowlines.loc[criteria, :], '{}/flowlines_w_sampled_narwidth_elevations.shp'.format(outpath),
1192-
crs=flowline_crs)
1172+
flowlines.drop('buffpoly', axis=1, inplace=True, errors='ignore')
1173+
flowlines.loc[criteria, :].to_file(f'{outpath}/flowlines_w_sampled_narwidth_elevations.shp',
1174+
index=False)
11931175

11941176

11951177
def edit_flowlines(flowlines, config_file,
@@ -1232,7 +1214,7 @@ def edit_flowlines(flowlines, config_file,
12321214

12331215
if isinstance(flowlines, str) or isinstance(flowlines, Path):
12341216
logger.log_file_and_date_modified(flowlines)
1235-
df = shp2df(flowlines)
1217+
df = gpd.read_file(flowlines)
12361218
# make a backup
12371219
for ext in '.shp', '.dbf', '.shx', '.prj':
12381220
source = flowlines[:-4] + ext
@@ -1248,7 +1230,7 @@ def edit_flowlines(flowlines, config_file,
12481230
if 'add_flowlines' in cfg:
12491231
add_flowlines_file = os.path.join(config_path,
12501232
cfg['add_flowlines']['filename'])
1251-
df2 = shp2df(add_flowlines_file)
1233+
df2 = gpd.read_file(add_flowlines_file)
12521234

12531235
# resolve case differences in column names
12541236
# conform columns to flowlines
@@ -1291,7 +1273,7 @@ def edit_flowlines(flowlines, config_file,
12911273

12921274
# write out an updated version of the input flowlines file
12931275
if isinstance(flowlines, str) or isinstance(flowlines, Path):
1294-
df2shp(df, flowlines, prj=prj_file)
1276+
df.to_file(flowlines, index=False)
12951277
logger.statement('wrote {}'.format(flowlines))
12961278
return df
12971279

0 commit comments

Comments
 (0)