diff --git a/DataSet.py b/DataSet.py index d183649..18a7343 100644 --- a/DataSet.py +++ b/DataSet.py @@ -91,6 +91,7 @@ def get_data( self, region_type, region_value, state=None, + filter else: x_sql = self.sql + ' where ' + filter + print (x_sql) #Debugging program_data = get_echo_data( x_sql, self.idx_field ) except pd.errors.EmptyDataError: print( "No program records were found." ) @@ -336,7 +337,7 @@ def _set_facility_filter( self, region_type, region_value=None, state=None ): value_type = type(region_value) if ( value_type == list or value_type == tuple ): for region in region_value: - if ( region_type == 'Congressional District' ): + if ( region_type == 'Congressional District' or region_type == 'Census Tract'): id_string += str( region ) + ',' else: id_string += '\'' + str( region ) + '\',' @@ -346,4 +347,5 @@ def _set_facility_filter( self, region_type, region_value=None, state=None ): filter += ' = \'' + region_value + '\'' if ( region_type == 'Congressional District' or region_type == 'County' ): filter += ' and "FAC_STATE" = \'' + state + '\'' + print(filter) #debugging return filter diff --git a/geographies.py b/geographies.py index 4a6d7f0..6e423b5 100644 --- a/geographies.py +++ b/geographies.py @@ -1,12 +1,89 @@ +spatial_tables = { + #table_name = name of table in spatial database + #id_field = field used to identify the data + #match_field = field used to match data with ECHO + #pretty_field = field that has a human-readable ID e.g. county name + + # HUC8 + "Watershed": dict( + table_name="wbdhu8", + id_field="huc8", + match_field="huc8", + pretty_field="NAME" + ), + + "HUC10 Watersheds": dict( + table_name="wbdhu10", + id_field="huc10", + match_field="huc10", + pretty_field="NAME" + ), + + "HUC12 Watersheds": dict( + table_name="wbdhu12", + id_field="huc12", + match_field="huc12", + pretty_field="NAME" + ), + + "Ecoregions": dict( + table_name="eco_level3", + id_field="US_L3NAME" #e.g. 
Atlantic Coastal Pine Barrens + ), + + "County": dict( + table_name="tl_2019_us_county", + id_field="GEOID", # four or five digit code corresponding to two digit state number (e.g. 55) and 2-3 digit county code! + match_field="NAME", #match with state_counties.csv + pretty_field="NAME" # e.g. CEDAR + ), + + "Zip Code": dict( + table_name="tl_2019_us_zcta510", + id_field="zcta5ce10", + match_field="zcta5ce10", + pretty_field="zcta5ce10" + ), + + "EPA Region": dict( + table_name="epa_regions", + id_field="eparegion" # In the form of "Region 1", "Region 2", up to "Region 10" + ), + + "State": dict( + table_name = "tl_2019_us_state", + id_field = "STUSPS", # e.g. MS, IA, AK + match_field="STUSPS", + pretty_field="NAME" # e.g. cedar + ), + + "Congressional District": dict( + table_name = "tl_2019_us_cd116", # Unfortunately, ECHO seems based on 113th Congress + id_field = "GEOID", # this is the combination of the state id and the CD e.g. AR-2 = 0502 + match_field="CD116FP", #match with state_counties.csv + pretty_field="CD116FP" # two digit state-specific district number + ) + , + + "Census Tract": dict( + table_name = "###", # + id_field = "GEOID10", # + match_field="GEOID10", # + pretty_field="GEOID10" # NAMELSAD10 ? + ) +} + region_field = { 'State': { "field": 'FAC_STATE' }, 'Congressional District': { "field": 'FAC_DERIVED_CD113' }, 'County': { "field": 'FAC_COUNTY' }, 'Zip Code': { "field": 'FAC_ZIP' }, + 'Watershed': {"field": 'FAC_DERIVED_HUC'}, + 'Census Tract': {"field": 'FAC_DERIVED_CB2010'}, } # Commenting out these region types until implemented -# 'Watershed': {"field": 'FAC_DERIVED_HUC'}, -# 'Census Block': {"field": 'FAC_DERIVED_CB2010'} +# 'Census Block': {"field": 'FAC_DERIVED_CB2010'} # We don't have this in the spatial database because there are too many CBs - too much data! 
+# 'EPA Region': { "field": 'FAC_EPA_REGION'} # Possibly too large to handle in Colab environment states = ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA", "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", @@ -14,3 +91,61 @@ "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC", "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"] +fips = { + "AK": "02", + "AL": "01", + "AR": "05", + "AS": "60", + "AZ": "04", + "CA": "06", + "CO": "08", + "CT": "09", + "DC": "11", + "DE": "10", + "FL": "12", + "GA": "13", + "GU": "66", + "HI": "15", + "IA": "19", + "ID": "16", + "IL": "17", + "IN": "18", + "KS": "20", + "KY": "21", + "LA": "22", + "MA": "25", + "MD": "24", + "ME": "23", + "MI": "26", + "MN": "27", + "MO": "29", + "MS": "28", + "MT": "30", + "NC": "37", + "ND": "38", + "NE": "31", + "NH": "33", + "NJ": "34", + "NM": "35", + "NV": "32", + "NY": "36", + "OH": "39", + "OK": "40", + "OR": "41", + "PA": "42", + "PR": "72", + "RI": "44", + "SC": "45", + "SD": "46", + "TN": "47", + "TX": "48", + "UT": "49", + "VA": "51", + "VI": "78", + "VT": "50", + "WA": "53", + "WI": "55", + "WV": "54", + "WY": "56" +} + diff --git a/state_cd.csv b/state_cd.csv index ce8bef6..f171887 100644 --- a/state_cd.csv +++ b/state_cd.csv @@ -1,439 +1,439 @@ -FAC_STATE,FAC_DERIVED_CD113 -AK,1 -AL,1 -AL,2 -AL,3 -AL,4 -AL,5 -AL,6 -AL,7 -AR,1 -AR,2 -AR,3 -AR,4 -AS,1 -AZ,1 -AZ,2 -AZ,3 -AZ,4 -AZ,5 -AZ,6 -AZ,7 -AZ,8 -AZ,9 -CA,1 -CA,2 -CA,3 -CA,4 -CA,5 -CA,6 -CA,7 -CA,8 -CA,9 -CA,10 -CA,11 -CA,12 -CA,13 -CA,14 -CA,15 -CA,16 -CA,17 -CA,18 -CA,19 -CA,20 -CA,21 -CA,22 -CA,23 -CA,24 -CA,25 -CA,26 -CA,27 -CA,28 -CA,29 -CA,30 -CA,31 -CA,32 -CA,33 -CA,34 -CA,35 -CA,36 -CA,37 -CA,38 -CA,39 -CA,40 -CA,41 -CA,42 -CA,43 -CA,44 -CA,45 -CA,46 -CA,47 -CA,48 -CA,49 -CA,50 -CA,51 -CA,52 -CA,53 -CO,1 -CO,2 -CO,3 -CO,4 -CO,5 -CO,6 -CO,7 -CT,1 -CT,2 -CT,3 -CT,4 -CT,5 -DC,1 -DE,1 -FL,1 -FL,2 -FL,3 -FL,4 -FL,5 -FL,6 -FL,7 -FL,8 -FL,9 -FL,10 -FL,11 -FL,12 -FL,13 -FL,14 -FL,15 -FL,16 
-FL,17 -FL,18 -FL,19 -FL,21 -FL,22 -FL,23 -FL,24 -FL,25 -FL,26 -FL,27 -GA,1 -GA,2 -GA,3 -GA,4 -GA,5 -GA,6 -GA,7 -GA,8 -GA,9 -GA,10 -GA,11 -GA,12 -GA,13 -GA,14 -GU,1 -HI,1 -HI,2 -IA,1 -IA,2 -IA,3 -IA,4 -ID,1 -ID,2 -IL,1 -IL,2 -IL,3 -IL,4 -IL,5 -IL,6 -IL,7 -IL,8 -IL,9 -IL,10 -IL,11 -IL,12 -IL,13 -IL,14 -IL,15 -IL,16 -IL,17 -IL,18 -IN,1 -IN,2 -IN,3 -IN,4 -IN,5 -IN,6 -IN,7 -IN,8 -IN,9 -KS,1 -KS,2 -KS,3 -KS,4 -KY,1 -KY,2 -KY,3 -KY,4 -KY,5 -KY,6 -LA,1 -LA,2 -LA,3 -LA,4 -LA,5 -LA,6 -MA,1 -MA,2 -MA,3 -MA,4 -MA,5 -MA,6 -MA,7 -MA,8 -MA,9 -MD,1 -MD,2 -MD,3 -MD,4 -MD,5 -MD,6 -MD,7 -MD,8 -ME,1 -ME,2 -MI,1 -MI,2 -MI,3 -MI,4 -MI,5 -MI,6 -MI,7 -MI,8 -MI,9 -MI,10 -MI,11 -MI,12 -MI,13 -MI,14 -MN,1 -MN,2 -MN,3 -MN,4 -MN,5 -MN,6 -MN,7 -MN,8 -MO,1 -MO,2 -MO,3 -MO,4 -MO,5 -MO,6 -MO,7 -MO,8 -MP,1 -MS,1 -MS,2 -MS,3 -MS,4 -MT,1 -NC,1 -NC,2 -NC,3 -NC,4 -NC,5 -NC,6 -NC,7 -NC,8 -NC,9 -NC,10 -NC,11 -NC,12 -NC,13 -ND,1 -NE,1 -NE,2 -NE,3 -NH,1 -NH,2 -NJ,1 -NJ,2 -NJ,3 -NJ,4 -NJ,5 -NJ,6 -NJ,7 -NJ,8 -NJ,9 -NJ,10 -NJ,11 -NJ,12 -NM,1 -NM,2 -NM,3 -NV,1 -NV,2 -NV,3 -NV,4 -NY,1 -NY,2 -NY,3 -NY,4 -NY,5 -NY,6 -NY,7 -NY,8 -NY,9 -NY,10 -NY,11 -NY,12 -NY,13 -NY,14 -NY,15 -NY,16 -NY,17 -NY,18 -NY,19 -NY,20 -NY,21 -NY,22 -NY,23 -NY,24 -NY,25 -NY,26 -NY,27 -OH,1 -OH,2 -OH,3 -OH,4 -OH,5 -OH,6 -OH,7 -OH,8 -OH,9 -OH,10 -OH,12 -OH,13 -OH,14 -OH,16 -OK,1 -OK,2 -OK,3 -OK,4 -OK,5 -OR,1 -OR,2 -OR,3 -OR,4 -OR,5 -PA,1 -PA,2 -PA,3 -PA,4 -PA,5 -PA,6 -PA,7 -PA,8 -PA,9 -PA,10 -PA,11 -PA,12 -PA,13 -PA,14 -PA,15 -PA,16 -PA,17 -PA,18 -PR,1 -RI,1 -RI,2 -SC,1 -SC,2 -SC,3 -SC,4 -SC,5 -SC,6 -SC,7 -SD,1 -TN,1 -TN,2 -TN,3 -TN,4 -TN,5 -TN,6 -TN,7 -TN,8 -TN,9 -TX,1 -TX,2 -TX,3 -TX,4 -TX,5 -TX,6 -TX,7 -TX,8 -TX,9 -TX,10 -TX,11 -TX,12 -TX,13 -TX,14 -TX,15 -TX,16 -TX,17 -TX,18 -TX,19 -TX,20 -TX,21 -TX,22 -TX,23 -TX,24 -TX,25 -TX,26 -TX,27 -TX,28 -TX,29 -TX,30 -TX,31 -TX,32 -TX,33 -TX,34 -TX,35 -TX,36 -UT,1 -UT,2 -UT,3 -UT,4 -VA,1 -VA,2 -VA,3 -VA,4 -VA,5 -VA,6 -VA,7 -VA,8 -VA,9 -VA,10 -VA,11 -VI,1 -VT,1 -WA,1 -WA,2 -WA,3 -WA,4 -WA,5 -WA,6 
-WA,7 -WA,8 -WA,9 -WA,10 -WI,1 -WI,2 -WI,3 -WI,4 -WI,5 -WI,6 -WI,7 -WI,8 -WV,1 -WV,2 -WV,3 -WY,1 +FAC_STATE,FAC_DERIVED_CD113 +AK,01 +AL,01 +AL,02 +AL,03 +AL,04 +AL,05 +AL,06 +AL,07 +AR,01 +AR,02 +AR,03 +AR,04 +AS,01 +AZ,01 +AZ,02 +AZ,03 +AZ,04 +AZ,05 +AZ,06 +AZ,07 +AZ,08 +AZ,09 +CA,01 +CA,02 +CA,03 +CA,04 +CA,05 +CA,06 +CA,07 +CA,08 +CA,09 +CA,10 +CA,11 +CA,12 +CA,13 +CA,14 +CA,15 +CA,16 +CA,17 +CA,18 +CA,19 +CA,20 +CA,21 +CA,22 +CA,23 +CA,24 +CA,25 +CA,26 +CA,27 +CA,28 +CA,29 +CA,30 +CA,31 +CA,32 +CA,33 +CA,34 +CA,35 +CA,36 +CA,37 +CA,38 +CA,39 +CA,40 +CA,41 +CA,42 +CA,43 +CA,44 +CA,45 +CA,46 +CA,47 +CA,48 +CA,49 +CA,50 +CA,51 +CA,52 +CA,53 +CO,01 +CO,02 +CO,03 +CO,04 +CO,05 +CO,06 +CO,07 +CT,01 +CT,02 +CT,03 +CT,04 +CT,05 +DC,01 +DE,01 +FL,01 +FL,02 +FL,03 +FL,04 +FL,05 +FL,06 +FL,07 +FL,08 +FL,09 +FL,10 +FL,11 +FL,12 +FL,13 +FL,14 +FL,15 +FL,16 +FL,17 +FL,18 +FL,19 +FL,21 +FL,22 +FL,23 +FL,24 +FL,25 +FL,26 +FL,27 +GA,01 +GA,02 +GA,03 +GA,04 +GA,05 +GA,06 +GA,07 +GA,08 +GA,09 +GA,10 +GA,11 +GA,12 +GA,13 +GA,14 +GU,01 +HI,01 +HI,02 +IA,01 +IA,02 +IA,03 +IA,04 +ID,01 +ID,02 +IL,01 +IL,02 +IL,03 +IL,04 +IL,05 +IL,06 +IL,07 +IL,08 +IL,09 +IL,10 +IL,11 +IL,12 +IL,13 +IL,14 +IL,15 +IL,16 +IL,17 +IL,18 +IN,01 +IN,02 +IN,03 +IN,04 +IN,05 +IN,06 +IN,07 +IN,08 +IN,09 +KS,01 +KS,02 +KS,03 +KS,04 +KY,01 +KY,02 +KY,03 +KY,04 +KY,05 +KY,06 +LA,01 +LA,02 +LA,03 +LA,04 +LA,05 +LA,06 +MA,01 +MA,02 +MA,03 +MA,04 +MA,05 +MA,06 +MA,07 +MA,08 +MA,09 +MD,01 +MD,02 +MD,03 +MD,04 +MD,05 +MD,06 +MD,07 +MD,08 +ME,01 +ME,02 +MI,01 +MI,02 +MI,03 +MI,04 +MI,05 +MI,06 +MI,07 +MI,08 +MI,09 +MI,10 +MI,11 +MI,12 +MI,13 +MI,14 +MN,01 +MN,02 +MN,03 +MN,04 +MN,05 +MN,06 +MN,07 +MN,08 +MO,01 +MO,02 +MO,03 +MO,04 +MO,05 +MO,06 +MO,07 +MO,08 +MP,01 +MS,01 +MS,02 +MS,03 +MS,04 +MT,01 +NC,01 +NC,02 +NC,03 +NC,04 +NC,05 +NC,06 +NC,07 +NC,08 +NC,09 +NC,10 +NC,11 +NC,12 +NC,13 +ND,01 +NE,01 +NE,02 +NE,03 +NH,01 +NH,02 +NJ,01 +NJ,02 +NJ,03 +NJ,04 +NJ,05 +NJ,06 +NJ,07 +NJ,08 +NJ,09 +NJ,10 +NJ,11 +NJ,12 
+NM,01 +NM,02 +NM,03 +NV,01 +NV,02 +NV,03 +NV,04 +NY,01 +NY,02 +NY,03 +NY,04 +NY,05 +NY,06 +NY,07 +NY,08 +NY,09 +NY,10 +NY,11 +NY,12 +NY,13 +NY,14 +NY,15 +NY,16 +NY,17 +NY,18 +NY,19 +NY,20 +NY,21 +NY,22 +NY,23 +NY,24 +NY,25 +NY,26 +NY,27 +OH,01 +OH,02 +OH,03 +OH,04 +OH,05 +OH,06 +OH,07 +OH,08 +OH,09 +OH,10 +OH,12 +OH,13 +OH,14 +OH,16 +OK,01 +OK,02 +OK,03 +OK,04 +OK,05 +OR,01 +OR,02 +OR,03 +OR,04 +OR,05 +PA,01 +PA,02 +PA,03 +PA,04 +PA,05 +PA,06 +PA,07 +PA,08 +PA,09 +PA,10 +PA,11 +PA,12 +PA,13 +PA,14 +PA,15 +PA,16 +PA,17 +PA,18 +PR,01 +RI,01 +RI,02 +SC,01 +SC,02 +SC,03 +SC,04 +SC,05 +SC,06 +SC,07 +SD,01 +TN,01 +TN,02 +TN,03 +TN,04 +TN,05 +TN,06 +TN,07 +TN,08 +TN,09 +TX,01 +TX,02 +TX,03 +TX,04 +TX,05 +TX,06 +TX,07 +TX,08 +TX,09 +TX,10 +TX,11 +TX,12 +TX,13 +TX,14 +TX,15 +TX,16 +TX,17 +TX,18 +TX,19 +TX,20 +TX,21 +TX,22 +TX,23 +TX,24 +TX,25 +TX,26 +TX,27 +TX,28 +TX,29 +TX,30 +TX,31 +TX,32 +TX,33 +TX,34 +TX,35 +TX,36 +UT,01 +UT,02 +UT,03 +UT,04 +VA,01 +VA,02 +VA,03 +VA,04 +VA,05 +VA,06 +VA,07 +VA,08 +VA,09 +VA,10 +VA,11 +VI,01 +VT,01 +WA,01 +WA,02 +WA,03 +WA,04 +WA,05 +WA,06 +WA,07 +WA,08 +WA,09 +WA,10 +WI,01 +WI,02 +WI,03 +WI,04 +WI,05 +WI,06 +WI,07 +WI,08 +WV,01 +WV,02 +WV,03 +WY,01 \ No newline at end of file diff --git a/utilities.py b/utilities.py index 5554ce6..f559b2d 100644 --- a/utilities.py +++ b/utilities.py @@ -1,643 +1,883 @@ -''' -Provide a number of utility Python functions that can de-clutter -the Jupyter notebooks that use them. 
-''' - -# Import libraries -import pdb -import os -import csv -import datetime -import pandas as pd -import numpy as np -import matplotlib.pyplot as plt -import folium -import urllib -import seaborn as sns -from folium.plugins import FastMarkerCluster -import ipywidgets as widgets -from ipywidgets import interact, interactive, fixed, interact_manual, Layout -from ECHO_modules.get_data import get_echo_data -from ECHO_modules.geographies import region_field, states - -from IPython.display import display - - -# Set up some default parameters for graphing -from matplotlib import cycler -colour = "#00C2AB" # The default colour for the barcharts -colors = cycler('color', - ['#4FBBA9', '#E56D13', '#D43A69', - '#25539f', '#88BB44', '#FFBBBB']) -plt.rc('axes', facecolor='#E6E6E6', edgecolor='none', - axisbelow=True, grid=True, prop_cycle=colors) -plt.rc('grid', color='w', linestyle='solid') -plt.rc('xtick', direction='out', color='gray') -plt.rc('ytick', direction='out', color='gray') -plt.rc('patch', edgecolor='#E6E6E6') -plt.rc('lines', linewidth=2) -font = {'family' : 'DejaVu Sans', - 'weight' : 'normal', - 'size' : 16} -plt.rc('font', **font) -plt.rc('legend', fancybox = True, framealpha=1, shadow=True, borderpad=1) - - -def fix_county_names( in_counties ): - ''' - ECHO_EXPORTER has counties listed both as ALAMEDA and ALAMEDA COUNTY, seemingly - for every county. We drop the 'COUNTY' so they only get listed once. 
- - Parameters - ---------- - in_counties : list of county names (str) - - Returns - ------- - list - The list of counties without duplicates - ''' - - counties = [] - for county in in_counties: - if (county.endswith( ' COUNTY' )): - county = county[:-7] - counties.append( county.strip() ) - counties = np.unique( counties ) - return counties - - -def show_region_type_widget(): - ''' - Create and return a dropdown list of types of regions - - Returns - ------- - widget - The dropdown widget with the list of regions - ''' - - style = {'description_width': 'initial'} - select_region_widget = widgets.Dropdown( - options=region_field.keys(), - style=style, - value='County', - description='Region of interest:', - disabled=False - ) - display( select_region_widget ) - return select_region_widget - - -def show_state_widget(): - ''' - Create and return a dropdown list of states - - Returns - ------- - widget - The dropdown widget with the state list - ''' - - dropdown_state=widgets.Dropdown( - options=states, - description='State:', - disabled=False, - ) - - display( dropdown_state ) - return dropdown_state - - -def show_pick_region_widget( type, state_widget=None ): - ''' - Create and return a dropdown list of regions appropriate - to the input parameters - - Parameters - ---------- - type : str - The type of region - state_widget : widget - The widget in which a state may have been selected - - Returns - ------- - widget - The dropdown widget with region choices - ''' - - region_widget = None - - if ( type != 'Zip Code' ): - if ( state_widget is None ): - print( "You must first choose a state." 
) - return - my_state = state_widget.value - - if ( type == 'Zip Code' ): - region_widget = widgets.Text( - value='98225', - description='Zip Code:', - disabled=False - ) - elif ( type == 'County' ): - df = pd.read_csv( 'ECHO_modules/state_counties.csv' ) - counties = df[df['FAC_STATE'] == my_state]['FAC_COUNTY'] - region_widget=widgets.SelectMultiple( - options=fix_county_names( counties ), - description='County:', - disabled=False - ) - elif ( type == 'Congressional District' ): - df = pd.read_csv( 'ECHO_modules/state_cd.csv' ) - cds = df[df['FAC_STATE'] == my_state]['FAC_DERIVED_CD113'] - region_widget=widgets.SelectMultiple( - options=cds.to_list(), - description='District:', - disabled=False - ) - if ( region_widget is not None ): - display( region_widget ) - return region_widget - - -def get_regions_selected( region_type, region_widget ): - ''' - The region_widget may have multiple selections. - Depending on its region_type, extract the selections - and return them. - - Parameters - ---------- - region_type : string - 'Zip Code', 'Congressional District', 'County' - - region_widget : widget - The widget that will contain the selections. - - Returns - ------- - list - The selections - ''' - - selections = list() - if ( region_type == 'Zip Code' ): - selections = region_widget.value.split(',') - else: - selections = list( region_widget.value ) - - return selections - - -def show_data_set_widget( data_sets ): - ''' - Create and return a dropdown list of data sets with appropriate - flags set in the echo_data. 
- - Parameters - ---------- - data_sets : dict - The data sets, key = name, value = DataSet object - - Returns - ------- - widget - The widget with data set choices - ''' - - data_set_choices = list( data_sets.keys() ) - - data_set_widget=widgets.Dropdown( - options=list(data_set_choices), - description='Data sets:', - disabled=False, - ) - display(data_set_widget) - return data_set_widget - - -def show_fac_widget( fac_series ): - ''' - Create and return a dropdown list of facilities from the - input Series - - Parameters - ---------- - fac_series : Series - The facilities to be shown. It may have duplicates. - - Returns - ------- - widget - The widget with facility names - ''' - - fac_list = fac_series.dropna().unique() - fac_list.sort() - style = {'description_width': 'initial'} - widget=widgets.SelectMultiple( - options=fac_list, - style=style, - layout=Layout(width='70%'), - description='Facility Name:', - disabled=False, - ) - display(widget) - return widget - - -def get_active_facilities( state, region_type, regions_selected ): - ''' - Get a Dataframe with the ECHO_EXPORTER facilities with FAC_ACTIVE_FLAG - set to 'Y' for the region selected. - - Parameters - ---------- - state : str - The state, which could be None - region_type : str - The type of region: 'State', 'Congressional District', etc. 
- regions_selected : list - The selected regions of the specified region_type - - Returns - ------- - Dataframe - The active facilities returned from the database query - ''' - - if ( region_type == 'State' ): - sql = 'select * from "ECHO_EXPORTER" where "FAC_STATE" = \'{}\'' - sql += ' and "FAC_ACTIVE_FLAG" = \'Y\'' - sql = sql.format( state ) - df_active = get_echo_data( sql, 'REGISTRY_ID' ) - elif ( region_type == 'Congressional District'): - cd_str = ",".join( map( lambda x: str(x), regions_selected )) - sql = 'select * from "ECHO_EXPORTER" where "FAC_STATE" = \'{}\'' - sql += ' and "FAC_DERIVED_CD113" in ({})' - sql += ' and "FAC_ACTIVE_FLAG" = \'Y\'' - sql = sql.format( state, cd_str ) - df_active = get_echo_data( sql, 'REGISTRY_ID' ) - elif ( region_type == 'County' ): - # Single items in a list will have a comma at the end that trips up - # the query. Convert the regions_selected list to a string. - regions = "'" + "','".join( regions_selected ) + "'" - - sql = 'select * from "ECHO_EXPORTER" where "FAC_STATE" = \'{}\'' - sql += ' and "FAC_COUNTY" in ({})' - sql += ' and "FAC_ACTIVE_FLAG" = \'Y\'' - sql = sql.format( state, regions ) - df_active = get_echo_data( sql, 'REGISTRY_ID' ) - elif ( region_type == 'Zip Code' ): - sql = 'select * from "ECHO_EXPORTER" where "FAC_ZIP" = \'{}\'' - sql += ' and "FAC_ACTIVE_FLAG" = \'Y\'' - sql = sql.format( regions_selected ) - df_active = get_echo_data( sql, 'REGISTRY_ID' ) - else: - df_active = None - return df_active - - -def marker_text( row, no_text ): - ''' - Create a string with information about the facility or program instance. 
- - Parameters - ---------- - row : Series - Expected to contain FAC_NAME and DFR_URL fields from ECHO_EXPORTER - no_text : Boolean - If True, don't put any text with the markers, which reduces chance of errors - - Returns - ------- - str - The text to attach to the marker - ''' - - text = "" - if ( no_text ): - return text - if ( type( row['FAC_NAME'] == str )) : - try: - text = row["FAC_NAME"] + ' - ' - except TypeError: - print( "A facility was found without a name. ") - if 'DFR_URL' in row: - text += " -
" - return text - - -def check_bounds( row, bounds ): - ''' - See if the FAC_LAT and FAC_LONG of the row are interior to - the minx, miny, maxx, maxy of the bounds. - - Parameters - ---------- - row : Series - Must contain FAC_LAT and FAC_LONG - bounds : Dataframe - Bounding rectangle--minx,miny,maxx,maxy - - Returns - ------- - True if the row's point is in the bounds - ''' - - if ( row['FAC_LONG'] < bounds.minx[0] or row['FAC_LAT'] < bounds.miny[0] \ - or row['FAC_LONG'] > bounds.maxx[0] or row['FAC_LAT'] > bounds.maxy[0]): - return False - return True - - -def mapper(df, bounds=None, no_text=False): - ''' - Display a map of the Dataframe passed in. - Based on https://medium.com/@bobhaffner/folium-markerclusters-and-fastmarkerclusters-1e03b01cb7b1 - - Parameters - ---------- - df : Dataframe - The facilities to map. They must have a FAC_LAT and FAC_LONG field. - bounds : Dataframe - A bounding rectangle--minx, miny, maxx, maxy. Discard points outside. - - Returns - ------- - folium.Map - ''' - - # Initialize the map - m = folium.Map( - location = [df.mean()["FAC_LAT"], df.mean()["FAC_LONG"]] - ) - - # Create the Marker Cluster array - #kwargs={"disableClusteringAtZoom": 10, "showCoverageOnHover": False} - mc = FastMarkerCluster("") - - # Add a clickable marker for each facility - for index, row in df.iterrows(): - if ( bounds is not None ): - if ( not check_bounds( row, bounds )): - continue - mc.add_child(folium.CircleMarker( - location = [row["FAC_LAT"], row["FAC_LONG"]], - popup = marker_text( row, no_text ), - radius = 8, - color = "black", - weight = 1, - fill_color = "orange", - fill_opacity= .4 - )) - - m.add_child(mc) - bounds = m.get_bounds() - m.fit_bounds(bounds) - - # Show the map - return m - -def point_mapper(df, aggcol, quartiles=False, other_fac=None): - ''' - Display a point symbol map of the Dataframe passed in. A point symbol map represents - each facility as a point, with the size of the point scaled to the data value - (e.g. 
inspections, violations) proportionally or through quartiles. - Parameters - ---------- - df : Dataframe - The facilities to map. They must have a FAC_LAT and FAC_LONG field. - This Dataframe should - already be aggregated by facility e.g.: - NPDES_ID violations FAC_LAT FAC_LONG - NY12345 13 43.03 -73.92 - NY54321 2 42.15 -80.12 - ... - aggcol : String - The name of the field in the Dataframe that has been aggregated. This is - used for the legend (pop-up window on the map) - quartiles : Boolean - False (default) returns a proportionally-scaled point symbol map, meaning - that the radius of each point is directly scaled to the value (e.g. 13 violations) - True returns a graduated point symbol map, meaning that the radius of each - point is a function of the splitting the Dataframe into quartiles. - other_fac : Dataframe - Other regulated facilities without violations, inspections, - penalties, etc. - whatever the value being mapped is. This is an optional - variable enabling further context to the map. They must have a FAC_LAT and FAC_LONG field. - Returns - ------- - folium.Map - ''' - if ( df is not None ): - - map_of_facilities = folium.Map() - - if quartiles == True: - df['quantile'] = pd.qcut(df[aggcol], 4, labels=False, duplicates="drop") - scale = {0: 8,1:12, 2: 16, 3: 24} # First quartile = size 8 circles, etc. 
- - # Add a clickable marker for each facility - for index, row in df.iterrows(): - if quartiles == True: - r = scale[row["quantile"]] - else: - r = row[aggcol] - map_of_facilities.add_child(folium.CircleMarker( - location = [row["FAC_LAT"], row["FAC_LONG"]], - popup = aggcol+": "+str(row[aggcol]), - radius = r * 4, # arbitrary scalar - color = "black", - weight = 1, - fill_color = "orange", - fill_opacity= .4 - )) - - if ( other_fac is not None ): - for index, row in other_fac.iterrows(): - map_of_facilities.add_child(folium.CircleMarker( - location = [row["FAC_LAT"], row["FAC_LONG"]], - popup = "other facility", - radius = 4, - color = "black", - weight = 1, - fill_color = "black", - fill_opacity= 1 - )) - - return map_of_facilities - - else: - print( "There are no facilities to map." ) - -def write_dataset( df, base, type, state, regions ): - ''' - Write out a file of the Dataframe passed in. - - Parameters - ---------- - df : Dataframe - The data to write. - base: str - A base string of the file to write - type: str - The region type of the data - state: str - The state, or None - regions: list - The region identifiers, e.g. CD number, County, State, Zip code - ''' - if ( df is not None and len( df ) > 0 ): - if ( not os.path.exists( 'CSVs' )): - os.makedirs( 'CSVs' ) - filename = 'CSVs/' + base[:50] - if ( type != 'Zip Code' ): - filename += '-' + state - filename += '-' + type - if ( regions is not None ): - for region in regions: - filename += '-' + str(region) - filename = urllib.parse.quote_plus(filename, safe='/') - filename += '.csv' - df.to_csv( filename ) - print( "Wrote " + filename ) - else: - print( "There is no data to write." ) - - -def make_filename( base, type, state, region, filetype='csv' ): - ''' - Make a filename from the parameters and return it. - The filename will be in the Output directory relative to - the current working directory, and in a sub-directory - built out of the state and CD. 
- - Parameters - ---------- - base : str - A base string of the file - type : str - The region type - state : str - The state or None - region : str - The region - filetype : str - Optional file suffix. - - Returns - ------- - str - The filename created. - - Examples - -------- - >>> filename = make_filename( 'noncomp_CWA_pg6', *df_type ) - ''' - # If type is 'State', the state name is the region. - dir = 'Output/' - if ( type == 'State' ): - dir += region - filename = base + '_' + region - else: - dir += state - filename = base + '_' + state - if ( region is not None ): - dir += str(region) - filename += '-' + str(region) - x = datetime.datetime.now() - filename += '-' + x.strftime( "%m%d%y") +'.' + filetype - dir += '/' - if ( not os.path.exists( dir )): - os.makedirs( dir ) - return dir + filename - - -def get_top_violators( df_active, flag, noncomp_field, action_field, num_fac=10 ): - ''' - Sort the dataframe and return the rows that have the most number of - non-compliant quarters. - - Parameters - ---------- - df_active : Dataframe - Must have ECHO_EXPORTER fields - flag : str - Identifies the EPA programs of the facility (AIR_FLAG, NPDES_FLAG, etc.) - noncomp_field : str - The field with the non-compliance values, 'S' or 'V'. 
- action_field - The field with the count of quarters with formal actions - num_fac - The number of facilities to include in the returned Dataframe - - Returns - ------- - Dataframe - The top num_fac violators for the EPA program in the region - - Examples - -------- - >>> df_violators = get_top_violators( df_active, 'AIR_FLAG', - 'CAA_3YR_COMPL_QTRS_HISTORY', 'CAA_FORMAL_ACTION_COUNT', 20 ) - ''' - df = df_active.loc[ df_active[flag] == 'Y' ] - if ( len( df ) == 0 ): - return None - df_active = df.copy() - noncomp = df_active[ noncomp_field ] - noncomp_count = noncomp.str.count('S') + noncomp.str.count('V') - df_active['noncomp_count'] = noncomp_count - df_active = df_active[['FAC_NAME', 'noncomp_count', action_field, - 'DFR_URL', 'FAC_LAT', 'FAC_LONG']] - df_active = df_active[df_active['noncomp_count'] > 0] - df_active = df_active.sort_values( by=['noncomp_count', action_field], - ascending=False ) - df_active = df_active.head( num_fac ) - return df_active - -def chart_top_violators( ranked, state, selections, epa_pgm ): - ''' - Draw a horizontal bar chart of the top non-compliant facilities. 
- - Parameters - ---------- - ranked : Dataframe - The facilities to be charted - state : str - The state - selections : list - The selections - epa_pgm : str - The EPA program associated with this list of non-compliant facilities - - Returns - ------- - seaborn.barplot - The graph that is generated - ''' - if ranked is None: - print( 'There is no {} data to graph.'.format( epa_pgm )) - return None - unit = ranked.index - values = ranked['noncomp_count'] - if ( len(values) == 0 ): - return "No {} facilities with non-compliant quarters in {} - {}".format( - epa_pgm, state, str( selections )) - sns.set(style='whitegrid') - fig, ax = plt.subplots(figsize=(10,10)) - try: - g = sns.barplot(x=values, y=unit, order=list(unit), orient="h") - g.set_title('{} facilities with the most non-compliant quarters in {} - {}'.format( - epa_pgm, state, str( selections ))) - ax.set_xlabel("Non-compliant quarters") - ax.set_ylabel("Facility") - ax.set_yticklabels(ranked["FAC_NAME"]) - return ( g ) - except TypeError as te: - print( "TypeError: {}".format( str(te) )) - return None +''' +Provide a number of utility Python functions that can de-clutter +the Jupyter notebooks that use them. 
+''' + +# Import libraries +import pdb +import os +import csv +import datetime +import pandas as pd +import geopandas +import numpy as np +import matplotlib.pyplot as plt +import urllib +import seaborn as sns + +import folium +from folium.plugins import FastMarkerCluster + +import ipywidgets as widgets +from ipywidgets import interact, interactive, fixed, interact_manual, Layout +from IPython.display import display + +from ECHO_modules.get_data import get_echo_data +from ECHO_modules.geographies import region_field, states + +# Set up some default parameters for graphing +from matplotlib import cycler +colour = "#00C2AB" # The default colour for the barcharts +colors = cycler('color', + ['#4FBBA9', '#E56D13', '#D43A69', + '#25539f', '#88BB44', '#FFBBBB']) +plt.rc('axes', facecolor='#E6E6E6', edgecolor='none', + axisbelow=True, grid=True, prop_cycle=colors) +plt.rc('grid', color='w', linestyle='solid') +plt.rc('xtick', direction='out', color='gray') +plt.rc('ytick', direction='out', color='gray') +plt.rc('patch', edgecolor='#E6E6E6') +plt.rc('lines', linewidth=2) +font = {'family' : 'DejaVu Sans', + 'weight' : 'normal', + 'size' : 16} +plt.rc('font', **font) +plt.rc('legend', fancybox = True, framealpha=1, shadow=True, borderpad=1) + +# Styles for States ("other") and selected regions (e.g. Zip Codes) - "this" +style = {'this': {'fillColor': '#0099ff', 'color': '#182799', "weight": 1}, +'other': {'fillColor': '#FFA500', 'color': '#182799', "weight": 1}} + +def fix_county_names( in_counties ): + ''' + ECHO_EXPORTER has counties listed both as ALAMEDA and ALAMEDA COUNTY, seemingly + for every county. We drop the 'COUNTY' so they only get listed once. 
+ + Parameters + ---------- + in_counties : list of county names (str) + + Returns + ------- + list + The list of counties without duplicates + ''' + + counties = [] + for county in in_counties: + if (county.endswith( ' COUNTY' )): + county = county[:-7] + counties.append( county.strip() ) + counties = np.unique( counties ) + return counties + + +def show_region_type_widget(): + ''' + Create and return a dropdown list of types of regions + + Returns + ------- + widget + The dropdown widget with the list of regions + ''' + + style = {'description_width': 'initial'} + select_region_widget = widgets.Dropdown( + options=region_field.keys(), + style=style, + value='County', + description='Region of interest:', + disabled=False + ) + display( select_region_widget ) + return select_region_widget + + +def show_state_widget(): + ''' + Create and return a dropdown list of states + + Returns + ------- + widget + The dropdown widget with the state list + ''' + + dropdown_state=widgets.Dropdown( + options=states, + description='State:', + disabled=False, + ) + + display( dropdown_state ) + return dropdown_state + + +def show_pick_region_widget( type, my_state, input ): + ''' + Create and return a dropdown list of regions appropriate + to the input parameters + + Parameters + ---------- + type : str + The type of region + state : str + "AL", "AR", etc. + + Returns + ------- + widget + The dropdown widget with region choices + ''' + + region_widget = None + """ + Can be deleted b/c Cross-Programs will account for this + if ( type != 'Zip Code' ): + if ( state_widget is None ): + print( "You must first choose a state." 
) + return + + """ + + if ( type == 'Zip Code' ): + region_widget = widgets.Text( + value='98225', + description='Zip Code:', + disabled=False + ) + elif ( type == 'County' ): + df = pd.read_csv( 'ECHO_modules/state_counties.csv' ) + counties = df[df['FAC_STATE'] == my_state]['FAC_COUNTY'] + region_widget=widgets.SelectMultiple( + options=fix_county_names( counties ), + description='County:', + disabled=False + ) + elif ( type == 'Congressional District' ): + df = pd.read_csv( 'ECHO_modules/state_cd.csv', dtype={'FAC_STATE': str, 'FAC_DERIVED_CD113':str}) + #df['FAC_DERIVED_CD113'] = df['FAC_DERIVED_CD113'].astype(str) # To preserve 01, 02, etc. + cds = df[df['FAC_STATE'] == my_state]['FAC_DERIVED_CD113'] + region_widget=widgets.SelectMultiple( + options=cds.to_list(), + description='District:', + disabled=False + ) + elif ( type == 'Watershed' ): + region_widget=widgets.SelectMultiple( + options= input, + description='Watershed:', + disabled=False + ) + elif ( type == 'Census Tract' ): + region_widget=widgets.SelectMultiple( + options= input, + description='Tract:', + disabled=False + ) + if ( region_widget is not None ): + display( region_widget ) + return region_widget + + +def get_regions_selected( region_type, region_widget ): + ''' + The region_widget may have multiple selections. + Depending on its region_type, extract the selections + and return them. + + Parameters + ---------- + region_type : string + 'Zip Code', 'Congressional District', 'County' + + region_widget : widget + The widget that will contain the selections. + + Returns + ------- + list + The selections + ''' + + selections = list() + if ( region_type == 'Zip Code' ): + selections = region_widget.value.split(',') + else: + selections = list( region_widget.value ) + + return selections + + +def show_data_set_widget( data_sets ): + ''' + Create and return a dropdown list of data sets with appropriate + flags set in the echo_data. 
def show_fac_widget( fac_series ):
    '''
    Create and return a multi-select list of facilities from the
    input Series.

    Parameters
    ----------
    fac_series : Series
        The facilities to be shown. It may have duplicates.

    Returns
    -------
    widget
        The widget with facility names
    '''

    fac_list = fac_series.dropna().unique()
    fac_list.sort()
    style = {'description_width': 'initial'}
    widget = widgets.SelectMultiple(
        options=fac_list,
        style=style,
        layout=Layout(width='70%'),
        description='Facility Name:',
        disabled=False,
    )
    display( widget )
    return widget


def get_active_facilities( state, region_type, regions_selected ):
    '''
    Get a Dataframe with the ECHO_EXPORTER facilities with FAC_ACTIVE_FLAG
    set to 'Y' for the region selected.

    Leftover debugging print() calls were removed from this function.

    Parameters
    ----------
    state : str
        The state, which could be None
    region_type : str
        The type of region: 'State', 'Congressional District', etc.
    regions_selected : list
        The selected regions of the specified region_type

    Returns
    -------
    Dataframe
        The active facilities returned from the database query, or
        None if the region_type is not recognized
    '''

    if region_type == 'State':
        sql = 'select * from "ECHO_EXPORTER" where "FAC_STATE" = \'{}\''
        sql += ' and "FAC_ACTIVE_FLAG" = \'Y\''
        sql = sql.format( state )
        df_active = get_echo_data( sql, 'REGISTRY_ID' )
    elif region_type == 'Congressional District':
        cd_str = ",".join( map( lambda x: str(x), regions_selected ))
        sql = 'select * from "ECHO_EXPORTER" where "FAC_STATE" = \'{}\''
        sql += ' and "FAC_DERIVED_CD113" in ({})'
        sql += ' and "FAC_ACTIVE_FLAG" = \'Y\''
        sql = sql.format( state, cd_str )
        df_active = get_echo_data( sql, 'REGISTRY_ID' )
    elif region_type == 'County':
        # Single items in a list will have a comma at the end that trips up
        # the query. Convert the regions_selected list to a string.
        regions = "'" + "','".join( regions_selected ) + "'"
        sql = 'select * from "ECHO_EXPORTER" where "FAC_STATE" = \'{}\''
        sql += ' and "FAC_COUNTY" in ({})'
        sql += ' and "FAC_ACTIVE_FLAG" = \'Y\''
        sql = sql.format( state, regions )
        df_active = get_echo_data( sql, 'REGISTRY_ID' )
    elif region_type == 'Zip Code':
        # UNIQUE CASE - only the first zip code is queried.
        # TODO(review): replace with the whole list of zip codes?
        sql = 'select * from "ECHO_EXPORTER" where "FAC_ZIP" = \'{}\''
        sql += ' and "FAC_ACTIVE_FLAG" = \'Y\''
        sql = sql.format( regions_selected[0] )
        df_active = get_echo_data( sql, 'REGISTRY_ID' )
    elif region_type == 'Watershed':
        regions = "'" + "','".join( regions_selected ) + "'"
        sql = 'select * from "ECHO_EXPORTER" where "FAC_DERIVED_HUC" in ({})'
        sql += ' and "FAC_ACTIVE_FLAG" = \'Y\''
        sql = sql.format( regions )
        df_active = get_echo_data( sql, 'REGISTRY_ID' )
    elif region_type == 'Census Tract':
        # Selections are LIKE patterns, e.g. '48201431501%%%%', matched
        # against the text form of FAC_DERIVED_CB2010.
        regions = "'" + "','".join( regions_selected ) + "'"
        sql = 'select * from "ECHO_EXPORTER" where CAST("FAC_DERIVED_CB2010" AS TEXT) LIKE ANY(ARRAY[{}])'
        sql += ' and "FAC_ACTIVE_FLAG" = \'Y\''
        sql = sql.format( regions )
        df_active = get_echo_data( sql, 'REGISTRY_ID' )
    else:
        df_active = None
    return df_active


def aggregate_by_facility(data, program, df_active):
    '''
    Aggregate a program's records by facility, and also report the active
    facilities regulated under the program that have no records.

    Parameters
    ----------
    data : Dataframe
        The program data
    program : DataSet
        The program object; its name, idx_field, date_field, etc. are used
    df_active : Dataframe
        All facilities active in the selected regions (from previous cells)

    Returns
    -------
    dict or None
        "data" - the data aggregated per facility
        "diff" - full ECHO_EXPORTER rows for active facilities with no
        violations/penalties/inspections in this program
        "aggregator" - name of the field used to aggregate, which may
        differ from the preset.  None is returned (with a message) when
        there is no data for this program and region.
    '''

    diff = None

    def differ(input, program):
        '''
        Helper to sort facilities in this program (input) from the full
        list of facilities regulated under the program.
        '''
        diff = list(
            set(df_active[program.echo_type + "_IDS"]) - set(input[program.idx_field])
        )
        # get rid of NaNs - probably no program IDs
        diff = [x for x in diff if str(x) != 'nan']
        # ^ Not perfect given that some facilities have multiple NPDES_IDs
        # Return the full ECHO_EXPORTER details for facilities without
        # violations, penalties, or inspections
        diff = df_active.loc[df_active[program.echo_type + "_IDS"].isin(diff)]
        return diff

    if program.name == "CWA Violations":
        # Keep only the year from YEARQTR, then total the violation counts
        year = data["YEARQTR"].astype("str").str[0:4:1]
        data["YEARQTR"] = year
        data["sum"] = data["NUME90Q"] + data["NUMCVDT"] + data['NUMSVCD'] + data["NUMPSCH"]
        data = data.groupby([program.idx_field, "FAC_NAME", "FAC_LAT", "FAC_LONG"]).sum()
        data = data.reset_index()
        data = data.loc[data["sum"] > 0]  # only symbolize facilities with violations
        diff = differ(data, program)
        aggregator = "sum"

    # Penalties
    elif (program.name == "CAA Penalties" or program.name == "RCRA Penalties"
          or program.name == "CWA Penalties"):
        data.rename( columns={ program.date_field: 'Date', program.agg_col: 'Amount'}, inplace=True )
        if program.name == "CWA Penalties":
            data['Amount'] = data['Amount'].fillna(0)
            # BUG FIX: fillna() was previously called here without assigning
            # the result, making it a no-op.
            # TODO(review): confirm whether state/local penalties should
            # also be added into 'Amount'.
            data['STATE_LOCAL_PENALTY_AMT'] = data['STATE_LOCAL_PENALTY_AMT'].fillna(0)
        data = data.groupby([program.idx_field, "FAC_NAME", "FAC_LAT", "FAC_LONG"]).agg({'Amount': 'sum'})
        data = data.reset_index()
        data = data.loc[data["Amount"] > 0]  # only symbolize facilities with penalties
        diff = differ(data, program)
        aggregator = "Amount"

    # Air emissions / Inspections / violations - count records per facility
    else:
        data = data.groupby([program.idx_field, "FAC_NAME", "FAC_LAT", "FAC_LONG"]).agg({program.date_field: 'count'})
        data['count'] = data[program.date_field]
        data = data.reset_index()
        data = data.loc[data["count"] > 0]  # only symbolize facilities with records
        diff = differ(data, program)
        aggregator = "count"

    if len(data) > 0:
        return {"data": data, "diff": diff, "aggregator": aggregator}
    else:
        print( "There is no data for this program and region after 2000." )
def marker_text( row, no_text ):
    '''
    Create a string with information about the facility or program instance.

    Parameters
    ----------
    row : Series
        Expected to contain FAC_NAME and DFR_URL fields from ECHO_EXPORTER
    no_text : Boolean
        If True, don't put any text with the markers, which reduces chance of errors

    Returns
    -------
    str
        The text to attach to the marker
    '''

    text = ""
    if no_text:
        return text
    # BUG FIX: the original tested type( row['FAC_NAME'] == str ), i.e. the
    # type of a boolean, which is always truthy.  Test the value's type.
    if isinstance( row['FAC_NAME'], str ):
        try:
            text = row["FAC_NAME"] + ' - '
        except TypeError:
            print( "A facility was found without a name. ")
    if 'DFR_URL' in row:
        # NOTE(review): this appends only a separator; presumably a link
        # built from DFR_URL was meant to go here -- confirm.
        text += " - "
    return text


def check_bounds( row, bounds ):
    '''
    See if the FAC_LAT and FAC_LONG of the row are interior to
    the minx, miny, maxx, maxy of the bounds.

    Parameters
    ----------
    row : Series
        Must contain FAC_LAT and FAC_LONG
    bounds : Dataframe
        Bounding rectangle--minx,miny,maxx,maxy

    Returns
    -------
    True if the row's point is in the bounds
    '''

    if ( row['FAC_LONG'] < bounds.minx[0] or row['FAC_LAT'] < bounds.miny[0] \
         or row['FAC_LONG'] > bounds.maxx[0] or row['FAC_LAT'] > bounds.maxy[0]):
        return False
    return True


def mapper(df, bounds=None, no_text=False):
    '''
    Display a map of the Dataframe passed in.
    Based on https://medium.com/@bobhaffner/folium-markerclusters-and-fastmarkerclusters-1e03b01cb7b1

    Parameters
    ----------
    df : Dataframe
        The facilities to map. They must have a FAC_LAT and FAC_LONG field.
    bounds : Dataframe
        A bounding rectangle--minx, miny, maxx, maxy. Discard points outside.
    no_text : Boolean
        Passed to marker_text(); if True, markers get no popup text.

    Returns
    -------
    folium.Map
    '''

    # Initialize the map, centered on the mean facility position.
    # FIX: take column-wise means rather than df.mean()[...] so that
    # non-numeric columns don't break the whole-frame mean on recent pandas.
    m = folium.Map(
        location = [df["FAC_LAT"].mean(), df["FAC_LONG"].mean()]
    )

    # Create the Marker Cluster array
    #kwargs={"disableClusteringAtZoom": 10, "showCoverageOnHover": False}
    mc = FastMarkerCluster("")

    # Add a clickable marker for each facility
    for index, row in df.iterrows():
        if bounds is not None and not check_bounds( row, bounds ):
            continue
        mc.add_child(folium.CircleMarker(
            location = [row["FAC_LAT"], row["FAC_LONG"]],
            popup = marker_text( row, no_text ),
            radius = 8,
            color = "black",
            weight = 1,
            fill_color = "orange",
            fill_opacity= .4
        ))

    m.add_child(mc)

    # Compute boundaries so the map automatically zooms to its markers
    bounds = m.get_bounds()
    m.fit_bounds(bounds)

    # Show the map
    return m


def point_mapper(df, aggcol, quartiles=False, other_fac=None, basemap=None):
    '''
    Display a point symbol map of the Dataframe passed in. A point symbol map represents
    each facility as a point, with the size of the point scaled to the data value
    (e.g. inspections, violations) proportionally or through quartiles.

    Parameters
    ----------
    df : Dataframe
        The facilities to map. They must have a FAC_LAT and FAC_LONG field.
        This Dataframe should already be aggregated by facility e.g.:
        NPDES_ID  violations  FAC_LAT  FAC_LONG
        NY12345   13          43.03    -73.92
    aggcol : String
        The name of the field in the Dataframe that has been aggregated. This is
        used for the legend (pop-up window on the map)
    quartiles : Boolean
        False (default) returns a proportionally-scaled point symbol map;
        True scales each point by the quartile its value falls in.
    other_fac : Dataframe
        Other regulated facilities without the value being mapped, for
        context. They must have a FAC_LAT and FAC_LONG field.
    basemap : Dataframe
        Should be a spatial dataframe from get_spatial_data that can be mapped

    Returns
    -------
    folium.Map
    '''
    if df is None:
        print( "There are no facilities to map." )
        return

    map_of_facilities = folium.Map()

    if quartiles == True:
        df['quantile'] = pd.qcut(df[aggcol], 4, labels=False, duplicates="drop")
        scale = {0: 8, 1: 12, 2: 16, 3: 24}  # First quartile = size 8 circles, etc.

    # add basemap (selected regions)
    # NOTE(review): relies on a module-level `style` dict -- confirm it is
    # defined where this module is used.
    if basemap is not None:
        b = folium.GeoJson(
            basemap,
            style_function = lambda x: style['this']
        ).add_to(map_of_facilities)

    # Add a clickable marker for each facility with info
    for index, row in df.iterrows():
        if quartiles == True:
            r = scale[row["quantile"]]
        else:
            r = row[aggcol]
        map_of_facilities.add_child(folium.CircleMarker(
            location = [row["FAC_LAT"], row["FAC_LONG"]],
            popup = marker_text( row, False ) + "" + aggcol + ": "+str(row[aggcol]),
            radius = r * 2, # arbitrary scalar
            color = "black",
            weight = 1,
            fill_color = "orange",
            fill_opacity= .4
        ))

    # add other facilities as small black dots
    if other_fac is not None:
        for index, row in other_fac.iterrows():
            map_of_facilities.add_child(folium.CircleMarker(
                location = [row["FAC_LAT"], row["FAC_LONG"]],
                popup = marker_text( row, False ),
                radius = 4,
                color = "black",
                weight = 1,
                fill_color = "black",
                fill_opacity= 1
            ))

    # check and fit bounds
    bounds = map_of_facilities.get_bounds()
    map_of_facilities.fit_bounds(bounds)

    return map_of_facilities


def show_map(regions, states, region_type, spatial_tables):
    '''
    Show the map of just the regions (e.g. zip codes) and the selected
    state(s), using Folium (https://github.com/python-visualization/folium).

    Parameters
    ----------
    regions : geodataframe
        The intersection regions to highlight
    states : geodataframe
        The state(s) providing context
    region_type : str
        e.g. 'Zip Code' - used for layer naming and tooltips
    spatial_tables : dict
        From ECHO_modules/geographies.py; supplies the tooltip field
    '''
    m = folium.Map()

    # Show the state(s)
    s = folium.GeoJson(
        states,
        name = "State",
        style_function = lambda x: style['other']
    ).add_to(m)

    # Show the intersection regions (e.g. Zip Codes)
    i = folium.GeoJson(
        regions,
        name = region_type,
        style_function = lambda x: style['this']
    ).add_to(m)
    # Add tooltip for identifying features
    folium.GeoJsonTooltip(fields=[spatial_tables[region_type]["pretty_field"].lower()]).add_to(i)

    # compute boundaries so that the map automatically zooms in
    bounds = m.get_bounds()
    m.fit_bounds(bounds, padding=0)

    # display the map!
    display(m)
def selector(units):
    '''
    Helper function for `get_spatial_data`.
    Formats one unit or a list of units as a SQL tuple,
    e.g. the list ["AL", "AK", "AR"] becomes the string ("AL", "AK", "AR").
    '''
    if type(units) == list:
        quoted = ['\'' + str(place) + '\'' for place in units]
        return '(' + ', '.join(quoted) + ')'
    return '(\'' + str(units) + '\')'


def get_spatial_data(region_type, states, spatial_tables, fips=None):
    '''
    Return spatial data from the database utilizing an intersection query,
    e.g. return watersheds based on whether they cross the selected state.

    Parameters
    ----------
    region_type : str
        e.g. "Congressional District" (from the region_type widget)
    states : list
        e.g. ["AL"] (from the state dropdown); may hold multiple states
    spatial_tables : dict
        From ECHO_modules/geographies.py
    fips : dict
        State abbreviation -> FIPS code; required for "Census Tract"

    Returns
    -------
    (regions, states)
        Geodataframes of the regions of interest and the intersecting states
    '''

    def sqlizer(query):
        '''
        Inject `query` into the template SQL that returns geojson-formatted
        geo data, then fetch and parse the result.
        '''
        sql = """
        SELECT jsonb_build_object(
            'type', 'FeatureCollection', 'features', jsonb_agg(features.feature)
        )
        FROM (
          SELECT jsonb_build_object(
            'type', 'Feature','id', gid, 'geometry',
            ST_AsGeoJSON(geom)::jsonb,'properties',
            to_jsonb(inputs) - 'gid' - 'geom'
          ) feature
          FROM (
            """+query+"""
          ) inputs
        ) features;
        """
        url = 'http://portal.gss.stonybrook.edu/echoepa/index2.php?query='
        data_location = url + urllib.parse.quote_plus(sql) + '&pg'
        #print(data_location) # Debugging
        #print(sql) # Debugging
        return geopandas.read_file(data_location)

    selection = selector(states)

    # Get the regions of interest (watersheds, zips, etc.) based on their
    # intersection with the state(s)
    if region_type == "Census Tract":
        # Census tracts are not in the spatial database; download the
        # state's tract shapefile from the Census Bureau instead.
        f = fips[states[0]]  # assuming just one state for the time being
        print(f)
        import requests, zipfile, io
        url = "https://www2.census.gov/geo/tiger/TIGER2010/TRACT/2010/tl_2010_"+f+"_tract10.zip"
        r = requests.get(url)
        z = zipfile.ZipFile(io.BytesIO(r.content))
        z.extractall("/content")
        regions = geopandas.read_file("/content/tl_2010_"+f+"_tract10.shp")
        regions.columns = regions.columns.str.lower()  # lowercase columns for consistency
    else:
        #print(selection) # Debugging
        query = """
        SELECT this.*
        FROM """ + spatial_tables[region_type]['table_name'] + """ AS this
        JOIN """ + spatial_tables["State"]['table_name'] + """ AS other
        ON other.""" + spatial_tables["State"]['id_field'] + """ IN """ + selection + """
        AND ST_Within(this.geom,other.geom) """
        regions = sqlizer(query)

    # Get the intersecting geo (i.e. states)
    query = """
    SELECT *
    FROM """ + spatial_tables["State"]['table_name'] + """
    WHERE """ + spatial_tables["State"]['id_field'] + """ IN """ + selection + ""
    states = sqlizer(query)  # reset the intersecting geo to its spatial data

    # Callers send the results to show_map() to display them
    return regions, states


def write_dataset( df, base, type, state, regions ):
    '''
    Write out a file of the Dataframe passed in.

    Parameters
    ----------
    df : Dataframe
        The data to write.
    base: str
        A base string of the file to write
    type: str
        The region type of the data
    state: str
        The state, or None
    regions: list
        The region identifiers, e.g. CD number, County, State, Zip code
    '''
    if df is None or len( df ) == 0:
        print( "There is no data to write." )
        return
    if not os.path.exists( 'CSVs' ):
        os.makedirs( 'CSVs' )
    filename = 'CSVs/' + base[:50]
    if type != 'Zip Code':
        filename += '-' + state
    filename += '-' + type
    if regions is not None:
        for region in regions:
            filename += '-' + str(region)
    # Make the name URL-safe, but keep the directory separator
    filename = urllib.parse.quote_plus(filename, safe='/')
    filename += '.csv'
    df.to_csv( filename )
    print( "Wrote " + filename )


def make_filename( base, type, state, region, filetype='csv' ):
    '''
    Make a filename from the parameters and return it.
    The filename will be in the Output directory relative to
    the current working directory, and in a sub-directory
    built out of the state and CD.

    Parameters
    ----------
    base : str
        A base string of the file
    type : str
        The region type
    state : str
        The state or None
    region : str
        The region
    filetype : str
        Optional file suffix.

    Returns
    -------
    str
        The filename created.

    Examples
    --------
    >>> filename = make_filename( 'noncomp_CWA_pg6', *df_type )
    '''
    dir = 'Output/'
    if type == 'State':
        # If type is 'State', the state name is the region.
        dir += region
        filename = base + '_' + region
    else:
        dir += state
        filename = base + '_' + state
        if region is not None:
            dir += str(region)
            filename += '-' + str(region)
    now = datetime.datetime.now()
    filename += '-' + now.strftime( "%m%d%y") + '.' + filetype
    dir += '/'
    if not os.path.exists( dir ):
        os.makedirs( dir )
    return dir + filename
def get_top_violators( df_active, flag, noncomp_field, action_field, num_fac=10 ):
    '''
    Sort the dataframe and return the rows that have the most number of
    non-compliant quarters.

    Parameters
    ----------
    df_active : Dataframe
        Must have ECHO_EXPORTER fields
    flag : str
        Identifies the EPA programs of the facility (AIR_FLAG, NPDES_FLAG, etc.)
    noncomp_field : str
        The field with the non-compliance values, 'S' or 'V'.
    action_field
        The field with the count of quarters with formal actions
    num_fac
        The number of facilities to include in the returned Dataframe

    Returns
    -------
    Dataframe
        The top num_fac violators for the EPA program in the region, or
        None when no facility carries the program flag

    Examples
    --------
    >>> df_violators = get_top_violators( df_active, 'AIR_FLAG',
        'CAA_3YR_COMPL_QTRS_HISTORY', 'CAA_FORMAL_ACTION_COUNT', 20 )
    '''
    in_program = df_active.loc[ df_active[flag] == 'Y' ]
    if len( in_program ) == 0:
        return None
    ranked = in_program.copy()
    history = ranked[ noncomp_field ]
    # Each 'S' or 'V' in the compliance history marks one non-compliant quarter
    ranked['noncomp_count'] = history.str.count('S') + history.str.count('V')
    ranked = ranked[['FAC_NAME', 'noncomp_count', action_field,
                     'DFR_URL', 'FAC_LAT', 'FAC_LONG']]
    ranked = ranked[ranked['noncomp_count'] > 0]
    ranked = ranked.sort_values( by=['noncomp_count', action_field],
                                 ascending=False )
    return ranked.head( num_fac )


def chart_top_violators( ranked, state, selections, epa_pgm ):
    '''
    Draw a horizontal bar chart of the top non-compliant facilities.

    Parameters
    ----------
    ranked : Dataframe
        The facilities to be charted
    state : str
        The state
    selections : list
        The selections
    epa_pgm : str
        The EPA program associated with this list of non-compliant facilities

    Returns
    -------
    seaborn.barplot
        The graph that is generated
    '''
    if ranked is None:
        print( 'There is no {} data to graph.'.format( epa_pgm ))
        return None
    unit = ranked.index
    values = ranked['noncomp_count']
    if len(values) == 0:
        return "No {} facilities with non-compliant quarters in {} - {}".format(
            epa_pgm, state, str( selections ))

    sns.set(style='whitegrid')
    fig, ax = plt.subplots(figsize=(10,10))
    #cmap = sns.color_palette("rocket", as_cmap=True)
    #barplot_colors = [cmap(c) for c in values]

    try:
        g = sns.barplot(x=values, y=unit, order=list(unit), orient="h", palette="rocket")
        g.set_title('{} facilities with the most non-compliant quarters in {} - {}'.format(
            epa_pgm, state, str( selections )))
        ax.set_xlabel("Non-compliant quarters")
        ax.set_ylabel("Facility")
        # Label the bars with facility names rather than index values
        ax.set_yticklabels(ranked["FAC_NAME"])
        return ( g )
    except TypeError as te:
        print( "TypeError: {}".format( str(te) ))
        return None