diff --git a/dataretrieval/waterdata/__init__.py b/dataretrieval/waterdata/__init__.py index 39b758f7..662bb898 100644 --- a/dataretrieval/waterdata/__init__.py +++ b/dataretrieval/waterdata/__init__.py @@ -11,7 +11,6 @@ # Public API exports from .api import ( - _check_profiles, get_codes, get_continuous, get_daily, @@ -19,6 +18,7 @@ get_latest_continuous, get_latest_daily, get_monitoring_locations, + get_reference_table, get_samples, get_time_series_metadata, ) @@ -37,9 +37,9 @@ "get_latest_continuous", "get_latest_daily", "get_monitoring_locations", + "get_reference_table", "get_samples", "get_time_series_metadata", - "_check_profiles", "CODE_SERVICES", "SERVICES", "PROFILES", diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py index 63f7b819..11553984 100644 --- a/dataretrieval/waterdata/api.py +++ b/dataretrieval/waterdata/api.py @@ -16,11 +16,17 @@ from dataretrieval.utils import BaseMetadata, to_str from dataretrieval.waterdata.types import ( CODE_SERVICES, - PROFILE_LOOKUP, + METADATA_COLLECTIONS, PROFILES, SERVICES, ) -from dataretrieval.waterdata.utils import SAMPLES_URL, get_ogc_data +from dataretrieval.waterdata.utils import ( + SAMPLES_URL, + get_ogc_data, + _construct_api_requests, + _walk_pages, + _check_profiles +) # Set up logger for this module logger = logging.getLogger(__name__) @@ -685,6 +691,8 @@ def get_time_series_metadata( parameter_name: Optional[Union[str, List[str]]] = None, properties: Optional[Union[str, List[str]]] = None, statistic_id: Optional[Union[str, List[str]]] = None, + hydrologic_unit_code: Optional[Union[str, List[str]]] = None, + state_name: Optional[Union[str, List[str]]] = None, last_modified: Optional[Union[str, List[str]]] = None, begin: Optional[Union[str, List[str]]] = None, end: Optional[Union[str, List[str]]] = None, @@ -736,6 +744,17 @@ def get_time_series_metadata( Example codes include 00001 (max), 00002 (min), and 00003 (mean). A complete list of codes and their descriptions can be found at https://help.waterdata.usgs.gov/code/stat_cd_nm_query?stat_nm_cd=%25&fmt=html. + hydrologic_unit_code : string or list of strings, optional + The United States is divided and sub-divided into successively smaller + hydrologic units which are classified into four levels: regions, + sub-regions, accounting units, and cataloging units. The hydrologic + units are arranged within each other, from the smallest (cataloging units) + to the largest (regions). Each hydrologic unit is identified by a unique + hydrologic unit code (HUC) consisting of two to eight digits based on the + four levels of classification in the hydrologic unit system. + state_name : string or list of strings, optional + The name of the state or state equivalent in which the monitoring location + is located. last_modified : string, optional The last time a record was refreshed in our database. This may happen due to regular operational processes and does not necessarily indicate @@ -1388,6 +1407,62 @@ def get_field_measurements( return get_ogc_data(args, output_id, service) +def get_reference_table( + collection: str, + limit: Optional[int] = None, + ) -> Tuple[pd.DataFrame, BaseMetadata]: + """Get metadata reference tables for the USGS Water Data API. + + Reference tables provide the range of allowable values for parameter + arguments in the waterdata module. 
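+    For example, the "parameter-codes" table lists every parameter code
+    accepted by the data retrieval functions, and the
+    "hydrologic-unit-codes" table lists valid hydrologic unit codes.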
+ + Parameters + ---------- + collection : string + One of the following options: "agency-codes", "altitude-datums", + "aquifer-codes", "aquifer-types", "coordinate-accuracy-codes", + "coordinate-datum-codes", "coordinate-method-codes", "counties", + "hydrologic-unit-codes", "medium-codes", "national-aquifer-codes", + "parameter-codes", "reliability-codes", "site-types", "states", + "statistic-codes", "topographic-codes", "time-zone-codes" + limit : numeric, optional + The optional limit parameter is used to control the subset of the + selected features that should be returned in each page. The maximum + allowable limit is 50000. It may be beneficial to set this number lower + if your internet connection is spotty. The default (None) will set the + limit to the maximum allowable limit for the service. + """ + valid_code_services = get_args(METADATA_COLLECTIONS) + if collection not in valid_code_services: + raise ValueError( + f"Invalid code service: '{collection}'. " + f"Valid options are: {valid_code_services}." + ) + + req = _construct_api_requests( + service=collection, + limit=limit, + skip_geometry=True, + ) + # Run API request and iterate through pages if needed + return_list, response = _walk_pages( + geopd=False, req=req + ) + + # Give ID column a more meaningful name + if collection.endswith("s"): + return_list = return_list.rename( + columns={"id": f"{collection[:-1].replace('-', '_')}_id"} + ) + else: + return_list = return_list.rename( + columns={"id": f"{collection.replace('-', '_')}_id"} + ) + + # Create metadata object from response + metadata = BaseMetadata(response) + return return_list, metadata + def get_codes(code_service: CODE_SERVICES) -> pd.DataFrame: """Return codes from a Samples code service. @@ -1641,31 +1716,3 @@ def get_samples( return df, BaseMetadata(response) - -def _check_profiles( - service: SERVICES, - profile: PROFILES, -) -> None: - """Check whether a service profile is valid. - - Parameters - ---------- - service : string - One of the service names from the "services" list. - profile : string - One of the profile names from "results_profiles", - "locations_profiles", "activities_profiles", - "projects_profiles" or "organizations_profiles". - """ - valid_services = get_args(SERVICES) - if service not in valid_services: - raise ValueError( - f"Invalid service: '{service}'. Valid options are: {valid_services}." - ) - - valid_profiles = PROFILE_LOOKUP[service] - if profile not in valid_profiles: - raise ValueError( - f"Invalid profile: '{profile}' for service '{service}'. " - f"Valid options are: {valid_profiles}." 
- ) diff --git a/dataretrieval/waterdata/types.py b/dataretrieval/waterdata/types.py index 65e73394..f5e1496b 100644 --- a/dataretrieval/waterdata/types.py +++ b/dataretrieval/waterdata/types.py @@ -11,6 +11,27 @@ "states", ] +METADATA_COLLECTIONS = Literal[ + "agency-codes", + "altitude-datums", + "aquifer-codes", + "aquifer-types", + "coordinate-accuracy-codes", + "coordinate-datum-codes", + "coordinate-method-codes", + "counties", + "hydrologic-unit-codes", + "medium-codes", + "national-aquifer-codes", + "parameter-codes", + "reliability-codes", + "site-types", + "states", + "statistic-codes", + "topographic-codes", + "time-zone-codes", +] + SERVICES = Literal[ "activities", "locations", diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index 46d58b62..ae245f6f 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -4,7 +4,7 @@ import os import re from datetime import datetime -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union, get_args import pandas as pd import requests @@ -13,6 +13,12 @@ from dataretrieval.utils import BaseMetadata from dataretrieval import __version__ +from dataretrieval.waterdata.types import ( + PROFILE_LOOKUP, + PROFILES, + SERVICES, +) + try: import geopandas as gpd @@ -547,7 +553,7 @@ def _walk_pages( logger.info("Requesting: %s", req.url) if not geopd: - logger.warning( + logger.info( "Geopandas not installed. Geometries will be flattened into pandas DataFrames." ) @@ -648,35 +654,38 @@ def _arrange_cols( pd.DataFrame or gpd.GeoDataFrame The DataFrame with columns rearranged and/or renamed according to the specified properties and output_id. """ + + # Rename id column to output_id + df = df.rename(columns={"id": output_id}) + + # If properties are provided, filter to only those columns + # plus geometry if skip_geometry is False if properties and not all(pd.isna(properties)): - if "id" not in properties: - # If user refers to service-specific output id in properties, - # then rename the "id" column to the output_id (id column is - # automatically included). - if output_id in properties: - df = df.rename(columns={"id": output_id}) - # If output id is not in properties, but user requests the plural - # of the output_id (e.g. "monitoring_locations_id"), then rename - # "id" to plural. This is pretty niche. - else: - plural = output_id.replace("_id", "s_id") - if plural in properties: - df = df.rename(columns={"id": plural}) + # Make sure geometry stays in the dataframe if skip_geometry is False + if 'geometry' in df.columns and 'geometry' not in properties: + properties.append('geometry') + # id is technically a valid column from the service, but these + # functions make the name more specific. 
So, if someone requests + # 'id', give them the output_id column + if 'id' in properties: + properties[properties.index('id')] = output_id df = df.loc[:, [col for col in properties if col in df.columns]] - else: - df = df.rename(columns={"id": output_id}) - + # Move meaningless-to-user, extra id columns to the end # of the dataframe, if they exist - extra_id_cols = set(df.columns).intersection({ + extra_id_col = set(df.columns).intersection({ "latest_continuous_id", "latest_daily_id", "daily_id", "continuous_id", "field_measurement_id" }) - if extra_id_cols: - id_col_order = [col for col in df.columns if col not in extra_id_cols] + list(extra_id_cols) + + # If the arbitrary id column is returned (either due to properties + # being none or NaN), then move it to the end of the dataframe, but + # if part of properties, keep in requested order + if extra_id_col and (properties is None or all(pd.isna(properties))): + id_col_order = [col for col in df.columns if col not in extra_id_col] + list(extra_id_col) df = df.loc[:, id_col_order] return df @@ -821,3 +830,31 @@ def get_ogc_data( return return_list, metadata +def _check_profiles( + service: SERVICES, + profile: PROFILES, +) -> None: + """Check whether a service profile is valid. + + Parameters + ---------- + service : string + One of the service names from the "services" list. + profile : string + One of the profile names from "results_profiles", + "locations_profiles", "activities_profiles", + "projects_profiles" or "organizations_profiles". + """ + valid_services = get_args(SERVICES) + if service not in valid_services: + raise ValueError( + f"Invalid service: '{service}'. Valid options are: {valid_services}." + ) + + valid_profiles = PROFILE_LOOKUP[service] + if profile not in valid_profiles: + raise ValueError( + f"Invalid profile: '{profile}' for service '{service}'. " + f"Valid options are: {valid_profiles}." + ) + diff --git a/demos/WaterData_demo.ipynb b/demos/WaterData_demo.ipynb new file mode 100644 index 00000000..3e60c8bb --- /dev/null +++ b/demos/WaterData_demo.ipynb @@ -0,0 +1,572 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "7d0ca866", + "metadata": {}, + "source": [ + "# Using the `waterdata` module to pull data from the USGS Water Data APIs\n", + "The `waterdata` module will eventually replace the `nwis` module for accessing USGS water data. It leverages the [Water Data APIs](https://api.waterdata.usgs.gov/) to download metadata, daily values, and instantaneous values. \n", + "\n", + "While the specifics of this transition timeline are opaque, it is advised to switch to the new functions as soon as possible to reduce unexpected interruptions in your workflow.\n", + "\n", + "As always, please report any issues you encounter on our [Issues](https://github.com/DOI-USGS/dataretrieval-python/issues) page. If you have questions or need help, please reach out to us at comptools@usgs.gov." + ] + }, + { + "cell_type": "markdown", + "id": "fcccb6e8", + "metadata": {}, + "source": [ + "## Prerequisite: Get your Water Data API key\n", + "We highly suggest signing up for your own API key [here](https://api.waterdata.usgs.gov/signup/) to afford yourself higher rate limits and more reliable access to the data. If you opt not to register for an API key, then the number of requests you can make to the Water Data APIs is considerably lower, and if you share an IP address across users or workflows, you may hit those limits even faster. 
Luckily, registering for an API key is free and easy.\n", + "\n", + "Once you've copied your API key and saved it in a safe place, you can set it as an environment variable in your python script for the current session:\n", + "\n", + "```python\n", + "import os\n", + "os.environ['API_USGS_PAT'] = 'your_api_key_here'\n", + "``` \n", + "Note that the environment variable name is `API_USGS_PAT`, which stands for \"API USGS Personal Access Token\".\n", + "\n", + "If you'd like a more permanent, repository-specific solution, you can use the `python-dotenv` package to read your API key from a `.env` file in your repository root directory, like this:\n", + "\n", + "```python\n", + "!pip install python-dotenv # only run this line once to install the package in your environment\n", + "from dotenv import load_dotenv\n", + "load_dotenv() # this will load the environment variables from the .env file\n", + "```\n", + "Make sure your `.env` file contains the following line:\n", + "```\n", + "API_USGS_PAT=your_api_key_here\n", + "```\n", + "Also, do not commit your `.env` file to version control, as it contains sensitive information. You can add it to your `.gitignore` file to prevent accidental commits." + ] + }, + { + "cell_type": "markdown", + "id": "4a2b3f0f", + "metadata": {}, + "source": [ + "## Lay of the Land\n", + "Now that your API key is configured, it's time to take a 10,000-ft view of the functions in the `waterdata` module.\n", + "\n", + "### Metadata endpoints\n", + "These functions retrieve metadata tables that can be used to refine your data requests.\n", + "\n", + "- `get_reference_table()` - Not sure which parameter code you're looking for, or which hydrologic unit your study area is in? This function will help you find the right input values for the data endpoints to retrieve the information you want.\n", + "- `get_codes()` - Similar to `get_reference_table()`, this function retrieves dataframes containing available input values that correspond to the Samples water quality database.\n", + "\n", + "### Data endpoints\n", + "- `get_daily()` - Daily values for monitoring locations, parameters, stat codes, and more.\n", + "- `get_continuous()` - Instantaneous values for monitoring locations, parameters, statistical codes, and more.\n", + "- `get_monitoring_locations()`- Monitoring location information such as name, monitoring location ID, latitude, longitude, huc code, site types, and more.\n", + "- `get_time_series_metadata()` - Timeseries metadata across monitoring locations, parameter codes, statistical codes, and more. Can be used to answer the question: what types of data are collected at my site(s) of interest and over what time period are/were they collected? \n", + "- `get_latest_continuous()` - Latest instantaneous values for requested monitoring locations, parameter codes, statistical codes, and more.\n", + "- `get_latest_daily()` - Latest daily values for requested monitoring locations, parameter codes, statistical codes, and more.\n", + "- `get_field_measurements()` - Physically measured values (a.k.a discrete) of gage height, discharge, groundwater levels, and more for requested monitoring locations.\n", + "- `get_samples()` - Discrete water quality sample results for monitoring locations, observed properties, and more." + ] + }, + { + "cell_type": "markdown", + "id": "19b5aebf", + "metadata": {}, + "source": [ + "### A few key tips\n", + "- You'll notice that each of the data functions have many unique inputs you can specify. **DO NOT** specify too many! 
Specify *just enough* inputs to return what you need. But do not provide redundant geographical or parameter information as this may slow down your query and lead to errors.\n", + "- Each function returns a Tuple, containing a dataframe and a Metadata class. If you have `geopandas` installed in your environment, the dataframe will be a `GeoDataFrame` with a geometry included. If you do not have `geopandas`, the dataframe will be a `pandas` dataframe with the geometry contained in a coordinates column. The Metadata object contains information about your query, like the query url.\n", + "- If you do not want to return the `geometry` column, use the input `skip_geometry=True`.\n", + "- All of these functions (except `get_samples()`) have a `limit` argument, which signifies the number of rows returned with each \"page\" of data. The Water Data APIs use paging to chunk up large responses and send data most efficiently to the requester. The `waterdata` functions collect the rows of data from each page and combine them into one final dataframe at the end. The default and maximum limit per page is 50,000 rows. In other words, if you request 100,000 rows of data from the database, it will return all the data in 2 pages, and each page counts as a \"request\" using your API key. If you were to change the argument to `limit=10000`, then each page returned would contain 10,000 rows, and it would take 10 requests/pages to return the total 100,000 rows. In general, there is no need to adjust the `limit` argument. However, if you are working with slow internet speeds, adjusting the `limit` argument may reduce chances of failures due to bandwidth." + ] + }, + { + "cell_type": "markdown", + "id": "68591b52", + "metadata": {}, + "source": [ + "## Examples\n", + "Let's get into some examples using the functions listed above. First, we need to load the `waterdata` module and a few other packages and functions to go through the examples." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4ca9bb6a", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib.dates as mdates\n", + "import matplotlib.ticker as mtick\n", + "from IPython.display import display\n", + "from datetime import datetime, timedelta\n", + "from datetime import date\n", + "from dateutil.relativedelta import relativedelta\n", + "from dataretrieval import waterdata" + ] + }, + { + "cell_type": "markdown", + "id": "406762ab", + "metadata": {}, + "source": [ + "#### Reference tables" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1035ebbb", + "metadata": {}, + "outputs": [], + "source": [ + "pcodes,metadata = waterdata.get_reference_table(\"parameter-codes\")\n", + "display(pcodes.head())" + ] + }, + { + "cell_type": "markdown", + "id": "1e0eab77", + "metadata": {}, + "source": [ + "Let's say we want to find all parameter codes relating to streamflow discharge. We can use some string matching to find applicable codes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "665ccb23", + "metadata": {}, + "outputs": [], + "source": [ + "streamflow_pcodes = pcodes[pcodes['parameter_name'].str.contains('streamflow|discharge', case=False, na=False)]\n", + "display(streamflow_pcodes[['parameter_code_id', 'parameter_name']])" + ] + }, + { + "cell_type": "markdown", + "id": "d9487ee4", + "metadata": {}, + "source": [ + "Interesting that there are so many different streamflow-related parameter codes! 
Going on experience, let's use the most common one, `00060`, which is \"Discharge, cubic feet per second\".\n", + "\n", + "### Timeseries metadata\n", + "\n", + "Now that we know which parameter code we want to use, let's find all the stream monitoring locations that have recent discharge data and at least 10 years of daily values in the state of Nebraska. We will use the `waterdata.get_time_series_metadata()` function to suss out which sites fit the bill. This function will return a row for each *timeseries* that matches our inputs. It doesn't contain the daily discharge values themselves, just information *about* that timeseries." + ] + }, + { + "cell_type": "markdown", + "id": "70ee1da9", + "metadata": {}, + "source": [ + "First, let's get our expected date range in order. Note that the `waterdata` functions are capable of taking in bounded and unbounded date and datetime ranges. In this case, we want the start date of the discharge timeseries to be no more recent than 10 years ago, and we want the end date of the timeseries to be from at most a week ago. We can use the notation `{date}/..` to mean that we want all timeseries that end a week ago or more recently. For now, we need to set a lower bound for the beginning of the timeseries though, so we will pick a date that is clearly earlier than when gages were recording data: `1700-01-01/{date}`. This means that we want all timeseries that begin after 1700-01-01 (which is all of them!) and before the upper bound date (in our case, 10 years prior to today)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "57e2c93a", + "metadata": {}, + "outputs": [], + "source": [ + "ten_years_ago =(date.today() - relativedelta(years=10)).strftime(\"%Y-%m-%d\")\n", + "one_week_ago = (datetime.now() - timedelta(days=7)).date().strftime(\"%Y-%m-%d\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a901f5fa", + "metadata": {}, + "outputs": [], + "source": [ + "NE_discharge,_ = waterdata.get_time_series_metadata(\n", + " state_name=\"Nebraska\",\n", + " parameter_code='00060',\n", + " begin=f\"1700-01-01/{ten_years_ago}\",\n", + " end=f\"{one_week_ago}/..\",\n", + " skip_geometry=True\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8809a98d", + "metadata": {}, + "outputs": [], + "source": [ + "display(NE_discharge.sort_values(\"monitoring_location_id\").head())\n", + "print(f\"There are {len(NE_discharge['monitoring_location_id'].unique())} sites with recent discharge data available in the state of Nebraska\")" + ] + }, + { + "cell_type": "markdown", + "id": "8f464470", + "metadata": {}, + "source": [ + "In the dataframe above, we are looking at 5 timeseries returned, ordered by monitoring location. You can also see that the first two rows show two different kinds of discharge for the same monitoring location: a mean daily discharge timeseries (with statistic id 00003, which represents \"mean\") and an instantaneous discharge timeseries (with statistic id 00011, which represents \"points\" or \"instantaneous\" values). Look closely and you may also notice that the `parent_timeseries_id` column for daily mean discharge matches the `time_series_id` for the instantaneous discharge. This is because once instantaneous measurements began at the site, they were used to calculate the daily mean." 
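+    ,
+    "\n",
+    "\n",
+    "If you only want the daily-mean series, a quick filter on the `statistic_id` column narrows things down. A minimal sketch using the `NE_discharge` dataframe from above:\n",
+    "\n",
+    "```python\n",
+    "daily_series = NE_discharge[NE_discharge['statistic_id'] == '00003']\n",
+    "display(daily_series.head())\n",
+    "```"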
+ ] + }, + { + "cell_type": "markdown", + "id": "650ac2e6", + "metadata": {}, + "source": [ + "### Monitoring locations\n", + "Now that we know which sites have recent discharge data, let's find stream sites and plot them on a map. We will use the `waterdata.get_monitoring_locations()` function to grab more metadata about these sites. Even though we have a list of monitoring location IDs from the timeseries function, it's faster to use the `state_name` argument to return all stream sites, and then filter down to the ones we're interested in." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ce4df5fb", + "metadata": {}, + "outputs": [], + "source": [ + "NE_locations,_ = waterdata.get_monitoring_locations(\n", + " state_name=\"Nebraska\",\n", + " site_type_code=\"ST\"\n", + " )\n", + "\n", + "NE_locations_discharge = NE_locations.loc[NE_locations['monitoring_location_id'].isin(NE_discharge['monitoring_location_id'].unique().tolist())]\n", + "display(NE_locations_discharge[[\"monitoring_location_id\", \"monitoring_location_name\", \"hydrologic_unit_code\"]].head())" + ] + }, + { + "cell_type": "markdown", + "id": "f0fe5c4e", + "metadata": {}, + "source": [ + "If you have `geopandas` installed, the function will return a `GeoDataFrame` with a `geometry` column containing the monitoring locations' coordinates. If you don't have `geopandas` installed, it will return a regular `pandas` DataFrame with coordinate columns instead. Let's take a look at the site locations using `gpd.explore()`. Hover over the site points to see all the columns returned from `waterdata.get_monitoring_locations()`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "659b19a5", + "metadata": {}, + "outputs": [], + "source": [ + "NE_locations_discharge.set_crs(crs=\"WGS84\").explore()" + ] + }, + { + "cell_type": "markdown", + "id": "3c7fd0df", + "metadata": {}, + "source": [ + "### Latest daily and instantaneous values\n", + "Now that we know which sites in Nebraska have recent discharge data, and we know where they are located, we can start downloading some actual flow values. Let's start with some of the most \"lightweight\" functions, `waterdata.get_latest_daily()` and `waterdata.get_latest_continuous()`, which will return only the latest value for each monitoring location requested. \n", + "\n", + "Recall from above, we are working with ~100 sites with discharge data. Conveniently, the `waterdata` functions are *usually* pretty good at handling requests of up to ~200 monitoring locations. However, if you have more than 200, you may be better off chopping up your list of sites into a few lists that you loop over. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f8a8cf6b", + "metadata": {}, + "outputs": [], + "source": [ + "latest_dv,_ = waterdata.get_latest_daily(\n", + " monitoring_location_id=NE_locations_discharge['monitoring_location_id'].tolist(),\n", + " parameter_code=\"00060\",\n", + " statistic_id=\"00003\"\n", + ")\n", + "display(latest_dv.head())" + ] + }, + { + "cell_type": "markdown", + "id": "f5a38bde", + "metadata": {}, + "source": [ + "Note that because these measurements are less than a week old, most of them are still tagged as \"Provisional\" in the `approval_status` column. Some may also be missing values in the `value` column. You can often check the `qualifier` column for clues as to why a value is missing, or additional information specific to that measurement. 
Let's map out the monitoring locations again and color the points based on the latest daily value." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f97f172f", + "metadata": {}, + "outputs": [], + "source": [ + "latest_dv['date'] = latest_dv['time'].astype(str)\n", + "\n", + "latest_dv[['geometry', 'monitoring_location_id', 'date', 'value', 'unit_of_measure']].set_crs(crs=\"WGS84\").explore(column='value', tiles='CartoDB dark matter', cmap='YlOrRd', scheme=None, legend=True)" + ] + }, + { + "cell_type": "markdown", + "id": "aa6fa717", + "metadata": {}, + "source": [ + "Let's do the same routine with `waterdata.get_latest_continuous()`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "83c7f5d4", + "metadata": {}, + "outputs": [], + "source": [ + "latest_instantaneous,_ = waterdata.get_latest_continuous(\n", + " monitoring_location_id=NE_locations_discharge['monitoring_location_id'].tolist(),\n", + " parameter_code=\"00060\",\n", + " statistic_id=\"00011\"\n", + ")\n", + "latest_instantaneous['datetime'] = latest_instantaneous['time'].astype(str)\n", + "\n", + "latest_instantaneous[['geometry', 'monitoring_location_id', 'datetime', 'value', 'unit_of_measure']].set_crs(crs=\"WGS84\").explore(column='value', cmap='YlOrRd', scheme=None, legend=True)" + ] + }, + { + "cell_type": "markdown", + "id": "f179dfd0", + "metadata": {}, + "source": [ + "### Daily and continuous values datasets\n", + "While the \"latest\" functions might be helpful for \"realtime\" or \"current\" dashboards or reports, many users desire to work with a complete timeseries of daily summary (min, max, mean) or instantaneous values for their analyses. For these workflows, `waterdata.get_daily()` and `waterdata.get_continuous()` are helpful.\n", + "\n", + "Using our current example, let's say that you want to compare daily and instantaneous discharge values for monitoring locations along the Missouri River in Nebraska." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cf50e007", + "metadata": {}, + "outputs": [], + "source": [ + "missouri_river_sites = NE_locations_discharge.loc[NE_locations_discharge['monitoring_location_name'].str.contains(\"Missouri\")]\n", + "display(missouri_river_sites[[\n", + " 'county_name',\n", + " 'site_type',\n", + " 'monitoring_location_id',\n", + " 'monitoring_location_name',\n", + " 'drainage_area',\n", + " 'altitude'\n", + " ]])" + ] + }, + { + "cell_type": "markdown", + "id": "c5c5881e", + "metadata": {}, + "source": [ + "Currently, users may only request 3 years or less of continuous data in one pull. For this example, let's pull the last 1 year of daily mean values and instantaneous values for these Missouri River sites. We'll skip pulling geometry in the `waterdata.get_daily()` function; the `waterdata.get_continuous()` function does not return geometry at all." 
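+    ,
+    "\n",
+    "\n",
+    "If you ever need more than three years of continuous data, one workaround is to loop over shorter windows and concatenate the results. A minimal sketch (using the `missouri_site_ids` list defined just below, with placeholder years):\n",
+    "\n",
+    "```python\n",
+    "chunks = []\n",
+    "for yr in [2022, 2023, 2024]:  # placeholder years\n",
+    "    df_chunk, _ = waterdata.get_continuous(\n",
+    "        monitoring_location_id=missouri_site_ids,\n",
+    "        parameter_code=\"00060\",\n",
+    "        statistic_id=\"00011\",\n",
+    "        time=f\"{yr}-01-01T00:00:00Z/{yr + 1}-01-01T00:00:00Z\",\n",
+    "    )\n",
+    "    chunks.append(df_chunk)\n",
+    "iv_multi_year = pd.concat(chunks, ignore_index=True)\n",
+    "```"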
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d812f4e", + "metadata": {}, + "outputs": [], + "source": [ + "one_year_ago = (date.today() - relativedelta(years=1)).strftime(\"%Y-%m-%d\")\n", + "missouri_site_ids = missouri_river_sites['monitoring_location_id'].tolist()\n", + "missouri_site_names = missouri_river_sites['monitoring_location_name'].tolist()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7bbda23d", + "metadata": {}, + "outputs": [], + "source": [ + "daily_values,_ = waterdata.get_daily(\n", + " monitoring_location_id=missouri_site_ids,\n", + " parameter_code=\"00060\",\n", + " statistic_id=\"00003\", # mean daily value\n", + " time=f\"{one_year_ago}/..\",\n", + " skip_geometry=True\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "795b9eca", + "metadata": {}, + "outputs": [], + "source": [ + "instantaneous_values,_ = waterdata.get_continuous(\n", + " monitoring_location_id=missouri_site_ids,\n", + " parameter_code=\"00060\",\n", + " statistic_id=\"00011\", # instantaneous value\n", + " time=f\"{one_year_ago}T00:00:00Z/..\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ebc2c70d", + "metadata": {}, + "outputs": [], + "source": [ + "fig, axes = plt.subplots(2, 2, figsize=(14, 8), dpi=150, sharex=False, sharey=True)\n", + "axes = axes.ravel()\n", + "\n", + "# Y-axis formatter (with thousands separators)\n", + "tick_fmt = mtick.StrMethodFormatter('{x:,.0f}')\n", + "\n", + "for ax, site, site_name in zip(axes, missouri_site_ids, missouri_site_names):\n", + " # Filter per site & sort by time\n", + " inst = instantaneous_values.loc[instantaneous_values['monitoring_location_id'] == site, [\"time\", \"value\"]].sort_values(\"time\")\n", + " daily = daily_values.loc[daily_values['monitoring_location_id'] == site, [\"time\", \"value\"]].sort_values(\"time\")\n", + "\n", + " # Instantaneous (line)\n", + " ax.plot(\n", + " inst[\"time\"], inst[\"value\"],\n", + " color=\"#1f77b4\", lw=1.0, label=\"Instantaneous\", zorder=1\n", + " )\n", + "\n", + " # Daily mean (black dots)\n", + " ax.scatter(\n", + " daily[\"time\"], daily[\"value\"],\n", + " c=\"black\", s=2, label=\"Daily mean\", zorder=2\n", + " )\n", + "\n", + " # Axes styling\n", + " ax.set_title(f\"{site}\\n{site_name}\", fontsize=10)\n", + " ax.grid(True, which=\"both\", alpha=0.25)\n", + " ax.yaxis.set_major_formatter(tick_fmt)\n", + "\n", + " # Time ticks\n", + " ax.xaxis.set_major_locator(mdates.MonthLocator())\n", + " ax.xaxis.set_major_formatter(mdates.DateFormatter(\"%b %Y\"))\n", + " ax.xaxis.set_minor_locator(mdates.WeekdayLocator(byweekday=mdates.MO))\n", + "\n", + "# Common axis labels (left y on both left subplots; x labels on bottom row)\n", + "axes[0].set_ylabel(\"Discharge (cubic feet per second)\")\n", + "axes[2].set_ylabel(\"Discharge (cubic feet per second)\")\n", + "axes[2].set_xlabel(\"\")\n", + "axes[3].set_xlabel(\"\")\n", + "\n", + "handles, labels = axes[-1].get_legend_handles_labels()\n", + "fig.legend(handles, labels, loc=\"lower center\", ncol=2, frameon=False)\n", + "fig.suptitle(f\"Missouri River sites - Daily Mean vs Instantaneous Discharge\")\n", + "fig.autofmt_xdate()\n" + ] + }, + { + "cell_type": "markdown", + "id": "d04a8f8a", + "metadata": {}, + "source": [ + "### Field values\n", + "Finally, let's see if there are any discharge field measurements for these sites. These are manually recorded measurements (by a human), often used during calibration checks. 
We will use `waterdata.get_field_measurements()` to check. More commonly, a user would head to this function to gather groundwater level information." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "56bf6166", + "metadata": {}, + "outputs": [], + "source": [ + "field_measurements,_ = waterdata.get_field_measurements(\n", + " monitoring_location_id=missouri_site_ids,\n", + " parameter_code=\"00060\",\n", + " time=f\"{one_year_ago}T00:00:00Z/..\"\n", + ")\n", + "display(field_measurements.head())" + ] + }, + { + "cell_type": "markdown", + "id": "e621e45a", + "metadata": {}, + "source": [ + "Hey! We have some! Let's add these to our plots." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "87a397a4", + "metadata": {}, + "outputs": [], + "source": [ + "fig, axes = plt.subplots(2, 2, figsize=(14, 8), dpi=150, sharex=False, sharey=True)\n", + "axes = axes.ravel()\n", + "\n", + "# Y-axis formatter (with thousands separators)\n", + "tick_fmt = mtick.StrMethodFormatter('{x:,.0f}')\n", + "\n", + "for ax, site, site_name in zip(axes, missouri_site_ids, missouri_site_names):\n", + " # Filter per site & sort by time\n", + " inst = instantaneous_values.loc[instantaneous_values['monitoring_location_id'] == site, [\"time\", \"value\"]].sort_values(\"time\")\n", + " daily = daily_values.loc[daily_values['monitoring_location_id'] == site, [\"time\", \"value\"]].sort_values(\"time\")\n", + " field = field_measurements.loc[field_measurements['monitoring_location_id'] == site, [\"time\", \"value\"]].sort_values(\"time\")\n", + "\n", + " # Instantaneous (line)\n", + " ax.plot(\n", + " inst[\"time\"], inst[\"value\"],\n", + " color=\"#1f77b4\", lw=1.0, label=\"Instantaneous\", zorder=1\n", + " )\n", + "\n", + " # Daily mean (black dots)\n", + " ax.scatter(\n", + " daily[\"time\"], daily[\"value\"],\n", + " c=\"black\", s=2, label=\"Daily mean\", zorder=2\n", + " )\n", + "\n", + " # Field measurements (red dots)\n", + " ax.scatter(\n", + " field[\"time\"], field[\"value\"],\n", + " c=\"red\", s=4, label=\"Field\", zorder=3\n", + " )\n", + "\n", + " # Axes styling\n", + " ax.set_title(f\"{site}\\n{site_name}\", fontsize=10)\n", + " ax.grid(True, which=\"both\", alpha=0.25)\n", + " ax.yaxis.set_major_formatter(tick_fmt)\n", + "\n", + " # Time ticks\n", + " ax.xaxis.set_major_locator(mdates.MonthLocator())\n", + " ax.xaxis.set_major_formatter(mdates.DateFormatter(\"%b %Y\"))\n", + " ax.xaxis.set_minor_locator(mdates.WeekdayLocator(byweekday=mdates.MO))\n", + "\n", + "axes[0].set_ylabel(\"Discharge (cubic feet per second)\")\n", + "axes[2].set_ylabel(\"Discharge (cubic feet per second)\")\n", + "axes[2].set_xlabel(\"\")\n", + "axes[3].set_xlabel(\"\")\n", + "\n", + "handles, labels = axes[-1].get_legend_handles_labels()\n", + "fig.legend(handles, labels, loc=\"lower center\", ncol=3, frameon=False)\n", + "fig.suptitle(f\"Missouri River sites - Daily Mean, Instantaneous, and Field Measurement Discharge\")\n", + "fig.autofmt_xdate()\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dr-test", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.14.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py index 096a50ae..abdd823b 100755 --- 
a/tests/waterdata_test.py +++ b/tests/waterdata_test.py @@ -16,6 +16,7 @@ get_latest_daily, get_field_measurements, get_time_series_metadata, + get_reference_table ) def mock_request(requests_mock, request_url, file_path): @@ -139,11 +140,20 @@ def test_get_daily_properties(): time="2025-01-01/..", properties=["daily_id", "monitoring_location_id", "parameter_code", "time", "value", "geometry"] ) - assert "daily_id" in df.columns - assert "geometry" in df.columns + assert "daily_id" == df.columns[0] + assert "geometry" == df.columns[-1] assert df.shape[1] == 6 assert df.parameter_code.unique().tolist() == ["00060"] +def test_get_daily_properties_id(): + df,_ = get_daily( + monitoring_location_id="USGS-05427718", + parameter_code="00060", + time="2025-01-01/..", + properties=["monitoring_location_id", "id", "parameter_code", "time", "value", "geometry"] + ) + assert "daily_id" == df.columns[1] + def test_get_daily_no_geometry(): df,_ = get_daily( monitoring_location_id="USGS-05427718", @@ -187,7 +197,7 @@ def test_get_latest_continuous(): monitoring_location_id=["USGS-05427718", "USGS-05427719"], parameter_code=["00060", "00065"] ) - assert "latest_continuous_id" in df.columns + assert "latest_continuous_id" == df.columns[-1] assert df.shape[0] <= 4 assert df.statistic_id.unique().tolist() == ["00011"] assert hasattr(md, 'url') @@ -204,6 +214,15 @@ def test_get_latest_daily(): assert hasattr(md, 'url') assert hasattr(md, 'query_time') +def test_get_latest_daily_properties_geometry(): + df, md = get_latest_daily( + monitoring_location_id=["USGS-05427718", "USGS-05427719"], + parameter_code=["00060", "00065"], + properties=['monitoring_location_id', 'parameter_code', 'time', 'value', 'unit_of_measure'] + ) + assert "geometry" in df.columns + assert df.shape[1] == 6 + def test_get_field_measurements(): df, md = get_field_measurements( monitoring_location_id="USGS-05427718", @@ -227,4 +246,14 @@ def test_get_time_series_metadata(): assert hasattr(md, 'url') assert hasattr(md, 'query_time') +def test_get_reference_table(): + df, md = get_reference_table("agency-codes") + assert "agency_code_id" in df.columns + assert df.shape[0] > 0 + assert hasattr(md, 'url') + assert hasattr(md, 'query_time') + +def test_get_reference_table_wrong_name(): + with pytest.raises(ValueError): + get_reference_table("agency-cod")
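+
+def test_get_reference_table_limit():
+    # A rough sketch exercising the optional limit argument: limit only sets
+    # the per-page row count while pages are walked, so the full table should
+    # still be returned even with a small page size.
+    df, _ = get_reference_table("agency-codes", limit=10)
+    assert df.shape[0] > 0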