Skip to content
Merged
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
283 changes: 283 additions & 0 deletions dataretrieval/samples.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,283 @@
"""
Tool for downloading data from the USGS Aquarius Samples database (https://waterqualitydata.us)

See https://api.waterdata.usgs.gov/samples-data/docs#/ for API reference

Not sure about available site types, characteristics, state codes, or other
input parameters? Check out the samples data code service API reference:
https://api.waterdata.usgs.gov/samples-data/codeservice/docs

"""

from __future__ import annotations

import warnings
import requests
from requests.models import PreparedRequest
from typing import List, Optional, Tuple, Union
from io import StringIO
from typing import TYPE_CHECKING

import pandas as pd

from utils import BaseMetadata, query, to_str

if TYPE_CHECKING:
from pandas import DataFrame

# Root of the USGS Samples web services; the service and profile names are
# appended to this to form the request URL.
BASE_URL = "https://api.waterdata.usgs.gov/samples-data/"

# Maps each Samples service name to the list of profiles that may be
# requested from it.
services_dict = {
    "results": [
        "fullphyschem",
        "basicphyschem",
        "fullbio",
        "basicbio",
        "narrow",
        "resultdetectionquantitationlimit",
        "labsampleprep",
        "count",
    ],
    "locations": [
        "site",
        "count",
    ],
    "activities": [
        "sampact",
        "actmetric",
        "actgroup",
        "count",
    ],
    "projects": [
        "project",
        "projectmonitoringlocationweight",
    ],
    "organizations": [
        "organization",
        "count",
    ],
}


def _check_profiles(
    service,
    profile,
):
    """Validate that a profile may be requested from a service.

    Parameters
    ----------
    service : string
        Name of a Samples service; must be one of the keys of
        ``services_dict``.
    profile : string
        Name of a profile; must appear in the list that ``services_dict``
        associates with ``service``.

    Raises
    ------
    TypeError
        If ``service`` is not a key of ``services_dict``, or if ``profile``
        is not listed under that service.
    """
    # Reject an unknown service first so the profile lookup below cannot fail.
    if service not in services_dict:
        valid_services = list(services_dict)
        raise TypeError(
            f"{service} is not a Samples service. "
            f"Valid options are {valid_services}."
        )

    allowed_profiles = services_dict[service]
    if profile in allowed_profiles:
        return

    raise TypeError(
        f"{profile} is not a profile associated with "
        f"the {service} service. Valid options are "
        f"{allowed_profiles}."
    )

def get_USGS_samples(
    ssl_check=True,
    service="results",
    profile="fullphyschem",
    activityMediaName=None,
    activityStartDateLower=None,
    activityStartDateUpper=None,
    activityTypeCode=None,
    characteristicGroup=None,
    characteristc=None,
    characteristicUserSupplied=None,
    boundingBox=None,
    countryFips=None,
    stateFips=None,
    countyFips=None,
    siteTypeCode=None,
    siteTypeName=None,
    usgsPCode=None,
    hydrologicUnit=None,
    monitoringLocationIdentifier=None,
    organizationIdentifier=None,
    pointLocationLatitude=None,
    pointLocationLongitude=None,
    pointLocationWithinMiles=None,
    projectIdentifier=None,
    recordIdentifierUserSupplied=None,
    characteristic=None,
) -> Tuple[pd.DataFrame, BaseMetadata]:
    """Search Samples database for USGS water quality data.

    This is a wrapper function for the Samples database API. All potential
    filters are provided as arguments to the function, but please do not
    populate all possible filters; leave as many as feasible with their default
    value (None). This is important because overcomplicated web service queries
    can bog down the database's ability to return an applicable dataset before
    it times out.

    The web GUI for the Samples database can be found here:
    https://waterdata.usgs.gov/download-samples/#dataProfile=site

    If you would like more details on feasible query parameters (complete with
    examples), please visit the Samples database swagger docs, here:
    https://api.waterdata.usgs.gov/samples-data/docs#/

    Valid values for many of the filters below (sample media, site types,
    characteristics, state/county codes, ...) can be looked up through the
    code service: https://api.waterdata.usgs.gov/samples-data/codeservice/docs

    Parameters
    ----------
    ssl_check : bool, optional
        Check the SSL certificate.
    service : string
        One of the available Samples services: "results", "locations",
        "activities", "projects", or "organizations". Defaults to "results".
    profile : string
        One of the available profiles associated with a service. Options for
        each service are:
        results - "fullphyschem", "basicphyschem",
                    "fullbio", "basicbio", "narrow",
                    "resultdetectionquantitationlimit",
                    "labsampleprep", "count"
        locations - "site", "count"
        activities - "sampact", "actmetric",
                        "actgroup", "count"
        projects - "project", "projectmonitoringlocationweight"
        organizations - "organization", "count"
    activityMediaName : string or list of strings, optional
        Name or code indicating environmental medium in which sample was taken.
        Example: "Water".
    activityStartDateLower : string, optional
        The start date if using a date range. Takes the format YYYY-MM-DD.
        The logic is inclusive, i.e. it will also return results that
        match the date. If left as None, will pull all data on or before
        activityStartDateUpper, if populated.
    activityStartDateUpper : string, optional
        The end date if using a date range. Takes the format YYYY-MM-DD.
        The logic is inclusive, i.e. it will also return results that
        match the date. If left as None, will pull all data after
        activityStartDateLower up to the most recent available results.
    activityTypeCode : string or list of strings, optional
        Text code that describes type of field activity performed.
        Example: "Sample-Routine, regular".
    characteristicGroup : string or list of strings, optional
        Characteristic group is a broad category describing one or more
        results.
        Example: "Organics, PFAS"
    characteristc : string or list of strings, optional
        Deprecated misspelling of ``characteristic``, kept so existing
        callers keep working. Its value is sent to the API as the
        ``characteristic`` filter. Do not combine with ``characteristic``.
    characteristicUserSupplied : string or list of strings, optional
        A user supplied characteristic name describing one or more results.
    boundingBox: list of four floats, optional
        Filters on the associated monitoring location's point location
        by checking if it is located within the specified geographic area.
        The logic is inclusive, i.e. it will include locations that overlap
        with the edge of the bounding box. Values are expressed in decimal
        degrees, NAD83, and longitudes west of Greenwich are negative.
        The format is a list consisting of:
        - Western-most longitude
        - Southern-most latitude
        - Eastern-most longitude
        - Northern-most latitude
        Example: [-92.8,44.2,-88.9,46.0]
    countryFips : string or list of strings, optional
        Example: "US" (United States)
    stateFips : string or list of strings, optional
        Check out the code service for FIPS codes:
        https://api.waterdata.usgs.gov/samples-data/codeservice/docs#/
        Example: "US:15" (United States: Hawaii)
    countyFips : string or list of strings, optional
        Check out the code service for FIPS codes:
        https://api.waterdata.usgs.gov/samples-data/codeservice/docs#/
        Example: "US:15:001" (United States: Hawaii, Hawaii County)
    siteTypeCode : string or list of strings, optional
        An abbreviation for a certain site type.
        Example: "GW" (Groundwater site)
    siteTypeName : string or list of strings, optional
        A full name for a certain site type.
        Example: "Well"
    usgsPCode : string or list of strings, optional
        5-digit number used in the US Geological Survey computerized
        data system, National Water Information System (NWIS), to
        uniquely identify a specific constituent.
        Example: "00060" (Discharge, cubic feet per second)
    hydrologicUnit : string or list of strings, optional
        Max 12-digit number used to describe a hydrologic unit.
        Example: "070900020502"
    monitoringLocationIdentifier : string or list of strings, optional
        A monitoring location identifier has two parts: the agency code
        and the location number, separated by a dash (-).
        Example: "USGS-040851385"
    organizationIdentifier : string or list of strings, optional
        Designator used to uniquely identify a specific organization.
        Currently only accepting the organization "USGS".
    pointLocationLatitude : float, optional
        Latitude for a point/radius query (decimal degrees). Must be used
        with pointLocationLongitude and pointLocationWithinMiles.
    pointLocationLongitude : float, optional
        Longitude for a point/radius query (decimal degrees). Must be used
        with pointLocationLatitude and pointLocationWithinMiles.
    pointLocationWithinMiles : float, optional
        Radius for a point/radius query. Must be used with
        pointLocationLatitude and pointLocationLongitude.
    projectIdentifier : string or list of strings, optional
        Designator used to uniquely identify a data collection project in
        organization context.
    recordIdentifierUserSupplied : string or list of strings, optional
        Internal AQS record identifier that returns 1 entry. Only available
        for the "results" service.
    characteristic : string or list of strings, optional
        Characteristic is a specific category describing one or more results.
        Example: "Suspended Sediment Discharge"

    Returns
    -------
    df : ``pandas.DataFrame``
        Formatted data returned from the API query.
    md : :obj:`dataretrieval.utils.BaseMetadata`
        Custom ``dataretrieval`` metadata object pertaining to the query.

    Raises
    ------
    TypeError
        If ``service``/``profile`` are not a valid pairing, if no filter
        parameter is supplied, or if both ``characteristic`` and the
        deprecated ``characteristc`` are supplied.
    requests.HTTPError
        If the web service responds with an HTTP error status.

    Examples
    --------
    .. code::

        >>> # Get PFAS results within a bounding box
        >>> df, md = dataretrieval.samples.get_USGS_samples(
        ...     boundingBox=[-90.2,42.6,-88.7,43.2],
        ...     characteristicGroup="Organics, PFAS"
        ... )

        >>> # Get all activities for the Commonwealth of Virginia over a date range
        >>> df, md = dataretrieval.samples.get_USGS_samples(
        ...     service="activities",
        ...     profile="sampact",
        ...     activityStartDateLower="2023-10-01",
        ...     activityStartDateUpper="2024-01-01",
        ...     stateFips="US:51")

        >>> # Get all pH samples for two sites in Utah
        >>> df, md = dataretrieval.samples.get_USGS_samples(
        ...     monitoringLocationIdentifier=['USGS-393147111462301', 'USGS-393343111454101'],
        ...     usgsPCode='00400')

    """
    _check_profiles(service, profile)

    # ``characteristc`` is a historical misspelling. It is still accepted so
    # existing callers do not break, but it is forwarded under the correctly
    # spelled name.
    if characteristc is not None:
        if characteristic is not None:
            raise TypeError(
                "Both 'characteristic' and its deprecated alias "
                "'characteristc' were provided. Use only 'characteristic'."
            )
        warnings.warn(
            "The 'characteristc' argument is deprecated; use "
            "'characteristic' instead.",
            DeprecationWarning,
            stacklevel=2,
        )
        characteristic = characteristc

    # Spell the filters out explicitly (rather than scraping ``locals()``) so
    # that helper variables can never leak into the query string.
    filters = {
        "activityMediaName": activityMediaName,
        "activityStartDateLower": activityStartDateLower,
        "activityStartDateUpper": activityStartDateUpper,
        "activityTypeCode": activityTypeCode,
        "characteristicGroup": characteristicGroup,
        "characteristic": characteristic,
        "characteristicUserSupplied": characteristicUserSupplied,
        "boundingBox": boundingBox,
        "countryFips": countryFips,
        "stateFips": stateFips,
        "countyFips": countyFips,
        "siteTypeCode": siteTypeCode,
        "siteTypeName": siteTypeName,
        "usgsPCode": usgsPCode,
        "hydrologicUnit": hydrologicUnit,
        "monitoringLocationIdentifier": monitoringLocationIdentifier,
        "organizationIdentifier": organizationIdentifier,
        "pointLocationLatitude": pointLocationLatitude,
        "pointLocationLongitude": pointLocationLongitude,
        "pointLocationWithinMiles": pointLocationWithinMiles,
        "projectIdentifier": projectIdentifier,
        "recordIdentifierUserSupplied": recordIdentifierUserSupplied,
    }

    # Only filters the caller actually set are sent to the service.
    params = {key: value for key, value in filters.items() if value is not None}

    if not params:
        raise TypeError(
            "No filter parameters provided. You must add at least "
            "one filter parameter beyond a service, profile, and format argument."
        )

    # The response is always requested as CSV so it can be parsed below.
    params["mimeType"] = "text/csv"

    # Convert bounding box to a string
    if "boundingBox" in params:
        params["boundingBox"] = to_str(params["boundingBox"])

    # Build URL with service and profile
    url = BASE_URL + service + "/" + profile

    # Show the fully encoded request URL to the user.
    req = PreparedRequest()
    req.prepare_url(url, params=params)
    print(f"Request: {req.url}")

    # Make a GET request with the filtered parameters
    response = requests.get(url, params=params, verify=ssl_check)

    # Surface HTTP errors instead of trying to parse an error page as CSV.
    response.raise_for_status()

    df = pd.read_csv(StringIO(response.text), delimiter=",")

    return df, BaseMetadata(response)