-
Notifications
You must be signed in to change notification settings - Fork 52
Add samples #173
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add samples #173
Changes from 7 commits
f9df46f
38c673f
72320ea
7c2cdeb
d3b4430
fb94aa0
6027a32
dfef654
746f783
84b715b
f367bad
327484a
198c132
bc5d177
c9cbe67
45f54fa
fa7b5f7
01adcc4
b6753a2
e1b50b4
a4f71e3
3dfff7b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -0,0 +1,283 @@ | ||||||
| """ | ||||||
| Tool for downloading data from the USGS Aquarius Samples database (https://waterqualitydata.us) | ||||||
|
|
||||||
| See https://api.waterdata.usgs.gov/samples-data/docs#/ for API reference | ||||||
|
|
||||||
| Not sure about available site types, characteristics, state codes, or other | ||||||
| input parameters? Check out the samples data code service API reference: | ||||||
| https://api.waterdata.usgs.gov/samples-data/codeservice/docs | ||||||
|
|
||||||
| """ | ||||||
|
|
||||||
| from __future__ import annotations | ||||||
|
|
||||||
| import warnings | ||||||
| import requests | ||||||
| from requests.models import PreparedRequest | ||||||
| from typing import List, Optional, Tuple, Union | ||||||
| from io import StringIO | ||||||
| from typing import TYPE_CHECKING | ||||||
|
|
||||||
| import pandas as pd | ||||||
|
|
||||||
| from utils import BaseMetadata, query, to_str | ||||||
|
|
||||||
| if TYPE_CHECKING: | ||||||
| from pandas import DataFrame | ||||||
|
|
||||||
| BASE_URL = "https://api.waterdata.usgs.gov/samples-data/" | ||||||
|
|
||||||
| services_dict = { | ||||||
| "results" : ["fullphyschem", "basicphyschem", | ||||||
| "fullbio", "basicbio", "narrow", | ||||||
| "resultdetectionquantitationlimit", | ||||||
| "labsampleprep", "count"], | ||||||
| "locations" : ["site", "count"], | ||||||
| "activities" : ["sampact", "actmetric", | ||||||
| "actgroup", "count"], | ||||||
| "projects" : ["project", "projectmonitoringlocationweight"], | ||||||
| "organizations" : ["organization", "count"] | ||||||
| } | ||||||
|
|
||||||
|
|
||||||
| def _check_profiles( | ||||||
| service, | ||||||
| profile | ||||||
| ): | ||||||
| """Check that services are paired correctly with profile in | ||||||
| a service call. | ||||||
|
|
||||||
| Parameters | ||||||
| ---------- | ||||||
| service : string | ||||||
| One of the service names from the "services" list. | ||||||
| profile : string | ||||||
| One of the profile names from "results_profiles", | ||||||
| "locations_profiles", "activities_profiles", | ||||||
| "projects_profiles" or "organizations_profiles". | ||||||
| """ | ||||||
|
|
||||||
| if service not in services_dict.keys(): | ||||||
| raise TypeError( | ||||||
| f"{service} is not a Samples service. " | ||||||
| f"Valid options are {list(services_dict.keys())}." | ||||||
| ) | ||||||
| if profile not in services_dict[service]: | ||||||
| raise TypeError( | ||||||
| f"{profile} is not a profile associated with " | ||||||
| f"the {service} service. Valid options are " | ||||||
| f"{services_dict[service]}." | ||||||
| ) | ||||||
|
|
||||||
| def get_USGS_samples( | ||||||
| ssl_check=True, | ||||||
| service="results", | ||||||
| profile="fullphyschem", | ||||||
| activityMediaName=None, | ||||||
| activityStartDateLower=None, | ||||||
| activityStartDateUpper=None, | ||||||
| activityTypeCode=None, | ||||||
| characteristicGroup=None, | ||||||
| characteristc=None, | ||||||
| characteristicUserSupplied=None, | ||||||
| boundingBox=None, | ||||||
| countryFips=None, | ||||||
| stateFips=None, | ||||||
| countyFips=None, | ||||||
| siteTypeCode=None, | ||||||
| siteTypeName=None, | ||||||
| usgsPCode=None, | ||||||
| hydrologicUnit=None, | ||||||
| monitoringLocationIdentifier=None, | ||||||
| organizationIdentifier=None, | ||||||
| pointLocationLatitude=None, | ||||||
| pointLocationLongitude=None, | ||||||
| pointLocationWithinMiles=None, | ||||||
| projectIdentifier=None, | ||||||
| recordIdentifierUserSupplied=None | ||||||
| ) -> Tuple[pd.DataFrame, BaseMetadata]: | ||||||
| """Search Samples database for USGS water quality data. | ||||||
| This is a wrapper function for the Samples database API. All potential | ||||||
| filters are provided as arguments to the function, but please do not | ||||||
| populate all possible filters; leave as many as feasible with their default | ||||||
| value (None). This is important because overcomplicated web service queries | ||||||
| can bog down the database's ability to return an applicable dataset before | ||||||
| it times out. | ||||||
|
|
||||||
| The web GUI for the Samples database can be found here: | ||||||
| https://waterdata.usgs.gov/download-samples/#dataProfile=site | ||||||
|
|
||||||
| If you would like more details on feasible query parameters (complete with | ||||||
| examples), please visit the Samples database swagger docs, here: | ||||||
| https://api.waterdata.usgs.gov/samples-data/docs#/ | ||||||
|
|
||||||
| Parameters | ||||||
| ---------- | ||||||
| ssl_check : bool, optional | ||||||
| Check the SSL certificate. | ||||||
| service : string | ||||||
| One of the available Samples services: "results", "locations", "activities", | ||||||
| "projects", or "organizations". Defaults to "results". | ||||||
| profile : string | ||||||
| One of the available profiles associated with a service. Options for each | ||||||
| service are: | ||||||
| results - "fullphyschem", "basicphyschem", | ||||||
| "fullbio", "basicbio", "narrow", | ||||||
| "resultdetectionquantitationlimit", | ||||||
| "labsampleprep", "count" | ||||||
| locations - "site", "count" | ||||||
| activities - "sampact", "actmetric", | ||||||
| "actgroup", "count" | ||||||
| projects - "project", "projectmonitoringlocationweight" | ||||||
| organizations - "organization", "count" | ||||||
| activityMediaName : string or list of strings, optional | ||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Kinda crazy to me that the only doc pages on this type of parameter are from dataRetrieval or related packages. The WQP documentation just says something to the effect of "look at the output for more information." Why include it as a query-able parameter if the parameter's possible values aren't communicated to the user? This isn't something to solve in this PR, or even in the package overall, but I feel like this is something seriously lacking in the API documentation we expect users to read. I found the "sample media" endpoint through the codeservice link you provided that looks to provide the actual values for these more obscure parameters. But pinging one endpoint to learn about the parameter values of a different endpoint seems overly complicated to me. Non-rhetorically: are these parameters so volatile that we couldn't provide static documentation somewhere that describes the values they can take?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think you make great points. I believe @ldecicco-USGS has incorporated some of these types of lookup tables to prevent users from passing invalid options to the API, and there are some basic lookup tables in
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yup! There's actually a web service to check available parameters: https://github.com/DOI-USGS/dataRetrieval/blob/main/R/read_USGS_samples.R#L334 The main bit: service_options <- c("characteristicgroup", "states", "counties",
"countries", "sitetype", "samplemedia",
"characteristics", "observedproperty")
check_group_req <- httr2::request("https://api.waterdata.usgs.gov") |>
httr2::req_url_path_append("samples-data",
"codeservice",
service)For instance: Here it is:
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks Laura, I agree: while some of the options may not change very often, I still think it is a smart move to make the options a service rather than a "hard-coded" list....data managers are adding new PFAS compounds every day, for instance. This PR links to that codeservice in the samples function documentation, but I do think Joe's point is valid that we can be nicer to the user in python and avoid sending them to the Swagger doc to find all available options. I bet it's a similar number of lines of code in python! |
||||||
| Name or code indicating environmental medium in which sample was taken. | ||||||
| Example: "Water". | ||||||
| activityStartDateLower : string, optional | ||||||
| The start date if using a date range. Takes the format YYYY-MM-DD. | ||||||
| The logic is inclusive, i.e. it will also return results that | ||||||
| match the date. If left as None, will pull all data on or before | ||||||
| activityStartDateUpper, if populated. | ||||||
|
||||||
| activityStartDateUpper : string, optional | ||||||
| The end date if using a date range. Takes the format YYYY-MM-DD. | ||||||
| The logic is inclusive, i.e. it will also return results that | ||||||
| match the date. If left as None, will pull all data after | ||||||
| activityStartDateLower up to the most recent available results. | ||||||
| activityTypeCode : string or list of strings, optional | ||||||
| Text code that describes type of field activity performed. | ||||||
| Example: "Sample-Routine, regular". | ||||||
| characteristicGroup : string or list of strings, optional | ||||||
| Characteristic group is a broad category describing one or more | ||||||
| results. | ||||||
|
||||||
| of results. | |
| results. |
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| characteristc : string or list of strings, optional | |
| characteristic : string or list of strings, optional |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I really was on a roll with these typos last week. Thanks for catching.
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What is an "organization context?" Is there a good example?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah, this was lifted directly from the Samples API descriptions or dataRetrieval. I'm going to rewrite and let me see if I can add an example.
Uh oh!
There was an error while loading. Please reload this page.