diff --git a/NEWS b/NEWS index ddc2a4f3..c75858c6 100644 --- a/NEWS +++ b/NEWS @@ -9,6 +9,8 @@ is not stable over time. Moved other "id" columns to end of returned data frames encouraged to migrate to the "read_waterdata_metadata" functions. * Added no_paging argument. This will make the request more efficient, but is not recommended because it will silently cut off data after 50,000 rows. +* Removed max_results argument. Was confusing and redundant with the combination +of no_paging and limit. dataRetrieval 2.7.21 =================== diff --git a/R/construct_api_requests.R b/R/construct_api_requests.R index c0b7b079..03711132 100644 --- a/R/construct_api_requests.R +++ b/R/construct_api_requests.R @@ -136,28 +136,9 @@ check_limits <- function(args){ current_api_limit <- 50000 if(is.na(args[["limit"]])){ - if(!is.na(args[["max_results"]])){ - # we can leave limit empty unless we're doing no paging and the max is > limit - if(args[["max_results"]] > current_api_limit){ - args[["limit"]] <- current_api_limit - if(args[["no_paging"]]){ - warning("no_paging option is capped at ", current_api_limit, " max_results") - args[["max_results"]] <- current_api_limit - } - } else { - args[["limit"]] <- args[["max_results"]] - } - - } else { - args[["limit"]] <- current_api_limit - } - } else { - if(!is.na(args[["max_results"]])){ - if(args[["limit"]] > args[["max_results"]]) stop("limit cannot be greater than max_result") - } else if (args[["limit"]] > current_api_limit){ - args[["limit"]] <- current_api_limit - } - } + args[["limit"]] <- current_api_limit + } + return(args) } @@ -283,6 +264,11 @@ switch_arg_id <- function(ls, id_name, service){ #' start_end2 <- c("2021-01-01T12:15:00-0500", "") #' dataRetrieval:::format_api_dates(start_end2) #' +#' time = c("2014-05-01T00:00:00Z", "2014-05-01T12:00:00Z") +#' dataRetrieval:::format_api_dates(time) +#' +#' time = c("2014-05-01T00:00Z", "2014-05-01T12:00Z") +#' dataRetrieval:::format_api_dates(time) format_api_dates <- 
function(datetime, date = FALSE){ if(is.character(datetime)){ @@ -296,19 +282,31 @@ format_api_dates <- function(datetime, date = FALSE){ grepl("/", datetime)){ return(datetime) } else { + datetime1 <- tryCatch({ + lubridate::as_datetime(datetime) + }, + warning = function(w) { + strptime(datetime, format = "%Y-%m-%dT%H:%MZ", tz = "UTC") + }) if(date){ - datetime <- format(lubridate::as_datetime(datetime), "%Y-%m-%d") + datetime <- format(datetime1, "%Y-%m-%d") } else { - datetime <- lubridate::format_ISO8601(lubridate::as_datetime(datetime), usetz = "Z") + datetime <- lubridate::format_ISO8601(datetime1, usetz = "Z") } } } else if (length(datetime) == 2) { + datetime1 <- tryCatch({ + lubridate::as_datetime(datetime) + }, + warning = function(w) { + strptime(datetime, format = "%Y-%m-%dT%H:%MZ", tz = "UTC") + }) + if(date){ - datetime <- paste0(format(lubridate::as_datetime(datetime), "%Y-%m-%d"), collapse = "/") + datetime <- paste0(format(datetime1, "%Y-%m-%d"), collapse = "/") } else { - datetime <- paste0(lubridate::format_ISO8601(lubridate::as_datetime(datetime), - usetz = "Z"), + datetime <- paste0(lubridate::format_ISO8601(datetime1, usetz = "Z"), collapse = "/") } diff --git a/R/get_ogc_data.R b/R/get_ogc_data.R index ba984a11..487f5c3a 100644 --- a/R/get_ogc_data.R +++ b/R/get_ogc_data.R @@ -9,9 +9,7 @@ get_ogc_data <- function(args, output_id, service){ - - args[["service"]] <- service - + args <- switch_arg_id(args, id_name = output_id, service = service) @@ -23,9 +21,7 @@ get_ogc_data <- function(args, id = output_id) convertType <- args[["convertType"]] args[["convertType"]] <- NULL - - max_results <- args[["max_results"]] - args[["max_results"]] <- NULL + args[["service"]] <- service req <- do.call(construct_api_requests, args) @@ -37,9 +33,9 @@ get_ogc_data <- function(args, } if(no_paging){ - return_list <- get_csv(req, max_results) + return_list <- get_csv(req, limit = args[["limit"]]) } else { - return_list <- walk_pages(req, max_results) + 
return_list <- walk_pages(req) } if(is.na(args[["skipGeometry"]])){ @@ -51,10 +47,23 @@ get_ogc_data <- function(args, return_list <- deal_with_empty(return_list, properties, service, skipGeometry, convertType, no_paging) - if(convertType) return_list <- cleanup_cols(return_list, service = service) - return_list <- rejigger_cols(return_list, properties, output_id) + if(convertType){ + return_list <- cleanup_cols(return_list, service) + return_list <- order_results(return_list) + + # Mostly drop the id column except ts-meta, monitoring location: + if(!service %in% c("monitoring-locations", + "time-series-metadata", + "parameter-codes")){ + return_list <- return_list[, names(return_list)[names(return_list)!= output_id]] + } + # Move other id columns: + return_list <- move_id_col(return_list, + output_id) + } + attr(return_list, "request") <- req attr(return_list, "queryTime") <- Sys.time() return_list @@ -73,11 +82,7 @@ order_results <- function(df){ } move_id_col <- function(df, output_id){ - # attributes get dropped - req <- attr(df, "request") - queryTime <- attr(df, "queryTime") - - df <- df[, names(df)[names(df)!= output_id]] + if("time_series_id" %in% names(df)){ df <- df[, c(names(df)[names(df)!= "time_series_id"], "time_series_id")] @@ -87,10 +92,7 @@ move_id_col <- function(df, output_id){ df <- df[, c(names(df)[names(df)!= "field_visit_id"], "field_visit_id")] } - - attr(df, "request") <- req - attr(df, "queryTime") <- queryTime - + return(df) } diff --git a/R/read_waterdata.R b/R/read_waterdata.R index 14a6b992..5de835f5 100644 --- a/R/read_waterdata.R +++ b/R/read_waterdata.R @@ -59,7 +59,6 @@ read_waterdata <- function(service, match.arg(service, pkg.env$api_endpoints) args <- list(...) 
- args[["service"]] <- service output_id <- switch(service, "daily" = "daily_id", @@ -75,19 +74,20 @@ read_waterdata <- function(service, args[["properties"]] <- NA_character_ } + if(!"limit" %in% names(args)){ + args[["limit"]] <- NA_character_ + } + + args[["service"]] <- service + args <- check_limits(args) + data_req <- suppressWarnings(do.call(construct_api_requests, args)) data_req <- data_req |> httr2::req_headers(`Content-Type` = "application/query-cql-json") |> httr2::req_body_raw(CQL) - if("max_results" %in% names(args)){ - max_results <- args[["max_results"]] - } else { - max_results <- NA - } - - return_list <- walk_pages(data_req, max_results) + return_list <- walk_pages(data_req) if(is.null(args[["skipGeometry"]])){ skipGeometry <- FALSE @@ -101,16 +101,15 @@ read_waterdata <- function(service, service, skipGeometry, convertType) + + return_list <- rejigger_cols(return_list, args[["properties"]], output_id) - if(convertType) return_list <- cleanup_cols(return_list) - - # Add other time series services when they come online - if(service %in% c("daily")){ - return_list <- return_list[order(return_list$time, return_list$monitoring_location_id), ] + if(convertType){ + return_list <- cleanup_cols(return_list, service) + return_list <- order_results(return_list) + return_list <- move_id_col(return_list, output_id) } - return_list <- rejigger_cols(return_list, args[["properties"]], output_id) - return(return_list) } diff --git a/R/read_waterdata_continuous.R b/R/read_waterdata_continuous.R index be1fb2e7..a6cbb119 100644 --- a/R/read_waterdata_continuous.R +++ b/R/read_waterdata_continuous.R @@ -36,8 +36,6 @@ #' limit is 50000. It may be beneficial to set this number lower if your internet #' connection is spotty. The default (`NA`) will set the limit to the maximum #' allowable limit for the service. -#' @param max_results The optional maximum number of rows to return. This value -#' must be less than the requested limit. 
#' @param convertType logical, defaults to `TRUE`. If `TRUE`, the function #' will convert the data to dates and qualifier to string vector, and sepcifically #' order the returning data frame by time and monitoring_location_id. @@ -80,7 +78,6 @@ read_waterdata_continuous <- function(monitoring_location_id = NA_character_, last_modified = NA_character_, time = NA_character_, limit = NA, - max_results = NA, convertType = TRUE, no_paging = FALSE){ @@ -97,12 +94,7 @@ read_waterdata_continuous <- function(monitoring_location_id = NA_character_, return_list <- get_ogc_data(args, output_id, service) - - if(convertType){ - return_list <- order_results(return_list) - return_list <- move_id_col(return_list, output_id) - } - + return(return_list) } diff --git a/R/read_waterdata_daily.R b/R/read_waterdata_daily.R index 6869dd59..6e11f640 100644 --- a/R/read_waterdata_daily.R +++ b/R/read_waterdata_daily.R @@ -32,8 +32,6 @@ #' limit is 50000. It may be beneficial to set this number lower if your internet #' connection is spotty. The default (`NA`) will set the limit to the maximum #' allowable limit for the service. -#' @param max_results The optional maximum number of rows to return. This value -#' must be less than the requested limit. #' @param skipGeometry This option can be used to skip response geometries for #' each feature. The returning object will be a data frame with no spatial #' information. 
@@ -72,7 +70,6 @@ #' multi_site <- read_waterdata_daily(monitoring_location_id = c("USGS-01491000", #' "USGS-01645000"), #' parameter_code = c("00060", "00010"), -#' limit = 500, #' time = c("2023-01-01", "2024-01-01")) #' #' dv_data_quick <- read_waterdata_daily(monitoring_location_id = site, @@ -94,7 +91,6 @@ read_waterdata_daily <- function(monitoring_location_id = NA_character_, time = NA_character_, bbox = NA, limit = NA, - max_results = NA, convertType = TRUE, no_paging = FALSE){ @@ -106,11 +102,6 @@ read_waterdata_daily <- function(monitoring_location_id = NA_character_, output_id, service) - if(convertType){ - return_list <- order_results(return_list) - return_list <- move_id_col(return_list, output_id) - } - return(return_list) } diff --git a/R/read_waterdata_field_measurements.R b/R/read_waterdata_field_measurements.R index 4c560538..be427f70 100644 --- a/R/read_waterdata_field_measurements.R +++ b/R/read_waterdata_field_measurements.R @@ -35,8 +35,6 @@ #' limit is 50000. It may be beneficial to set this number lower if your internet #' connection is spotty. The default (`NA`) will set the limit to the maximum #' allowable limit for the service. -#' @param max_results The optional maximum number of rows to return. This value -#' must be less than the requested limit. #' @param skipGeometry This option can be used to skip response geometries for #' each feature. The returning object will be a data frame with no spatial #' information. 
@@ -97,7 +95,6 @@ read_waterdata_field_measurements <- function(monitoring_location_id = NA_charac time = NA_character_, bbox = NA, limit = NA, - max_results = NA, convertType = TRUE, no_paging = FALSE){ @@ -109,11 +106,6 @@ read_waterdata_field_measurements <- function(monitoring_location_id = NA_charac output_id, service) - if(convertType){ - return_list <- order_results(return_list) - return_list <- move_id_col(return_list, output_id) - } - return(return_list) } diff --git a/R/read_waterdata_latest_continuous.R b/R/read_waterdata_latest_continuous.R index 6327e383..8d1f002b 100644 --- a/R/read_waterdata_latest_continuous.R +++ b/R/read_waterdata_latest_continuous.R @@ -34,8 +34,6 @@ #' limit is 50000. It may be beneficial to set this number lower if your internet #' connection is spotty. The default (`NA`) will set the limit to the maximum #' allowable limit for the service. -#' @param max_results The optional maximum number of rows to return. This value -#' must be less than the requested limit. #' @param skipGeometry This option can be used to skip response geometries for #' each feature. The returning object will be a data frame with no spatial #' information. @@ -93,7 +91,6 @@ read_waterdata_latest_continuous <- function(monitoring_location_id = NA_charact time = NA_character_, bbox = NA, limit = NA, - max_results = NA, convertType = TRUE, no_paging = FALSE){ @@ -104,11 +101,6 @@ read_waterdata_latest_continuous <- function(monitoring_location_id = NA_charact return_list <- get_ogc_data(args, output_id, service) - - if(convertType){ - return_list <- order_results(return_list) - return_list <- move_id_col(return_list, output_id) - } return(return_list) } diff --git a/R/read_waterdata_latest_daily.R b/R/read_waterdata_latest_daily.R index 2f72a3b6..715ac2eb 100644 --- a/R/read_waterdata_latest_daily.R +++ b/R/read_waterdata_latest_daily.R @@ -32,8 +32,6 @@ #' limit is 50000. 
It may be beneficial to set this number lower if your internet #' connection is spotty. The default (`NA`) will set the limit to the maximum #' allowable limit for the service. -#' @param max_results The optional maximum number of rows to return. This value -#' must be less than the requested limit. #' @param skipGeometry This option can be used to skip response geometries for #' each feature. The returning object will be a data frame with no spatial #' information. @@ -84,7 +82,6 @@ read_waterdata_latest_daily <- function(monitoring_location_id = NA_character_, time = NA_character_, bbox = NA, limit = NA, - max_results = NA, convertType = TRUE, no_paging = FALSE){ @@ -96,10 +93,6 @@ read_waterdata_latest_daily <- function(monitoring_location_id = NA_character_, output_id, service) - if(convertType){ - return_list <- order_results(return_list) - return_list <- move_id_col(return_list, output_id) - } return(return_list) } diff --git a/R/read_waterdata_metadata.R b/R/read_waterdata_metadata.R index 8a25c2db..7dce4e65 100644 --- a/R/read_waterdata_metadata.R +++ b/R/read_waterdata_metadata.R @@ -15,8 +15,6 @@ #' limit is 50000. It may be beneficial to set this number lower if your internet #' connection is spotty. The default (`NA`) will set the limit to the maximum #' allowable limit for the service. -#' @param max_results The optional maximum number of rows to return. This value -#' must be less than the requested limit. 
#' @examplesIf is_dataRetrieval_user() #' #' \donttest{ @@ -37,8 +35,7 @@ #' time_zone_codes <- read_waterdata_metadata("time-zone-codes") #' } read_waterdata_metadata <- function(collection, - max_results = NA, - limit = NA){ + limit = NA){ match.arg(collection, pkg.env$metadata) @@ -55,10 +52,9 @@ read_waterdata_metadata <- function(collection, data_req <- suppressWarnings(construct_api_requests(service = collection, skipGeometry = TRUE, properties = NA, - limit = limit, - max_results = max_results)) + limit = limit)) - return_list <- walk_pages(data_req, max_results) + return_list <- walk_pages(data_req) return_list <- rejigger_cols(df = return_list, properties = NA, diff --git a/R/read_waterdata_monitoring_location.R b/R/read_waterdata_monitoring_location.R index 50baa84f..3a60f5d8 100644 --- a/R/read_waterdata_monitoring_location.R +++ b/R/read_waterdata_monitoring_location.R @@ -1,4 +1,4 @@ -#' Get USGS Site Data +#' Get USGS Monitoring Location Data #' #' @description `r get_description("monitoring-locations")` #' @@ -58,8 +58,6 @@ #' limit is 50000. It may be beneficial to set this number lower if your internet #' connection is spotty. The default (`NA`) will set the limit to the maximum #' allowable limit for the service. -#' @param max_results The optional maximum number of rows to return. This value -#' must be less than the requested limit. #' @param skipGeometry This option can be used to skip response geometries for #' each feature. The returning object will be a data frame with no spatial #' information. 
@@ -87,10 +85,6 @@ #' #' bbox_vals = c(-94.00, 35.0, -93.5, 35.5) #' multi_site <- read_waterdata_monitoring_location(bbox = bbox_vals) -#' multi_site_n_100 <- read_waterdata_monitoring_location(bbox = bbox_vals, -#' max_results = 100) -#' multi_site_limit_100 <- read_waterdata_monitoring_location(bbox = bbox_vals, -#' limit = 100) #' } read_waterdata_monitoring_location <- function(monitoring_location_id = NA_character_, agency_code = NA_character_, @@ -135,7 +129,6 @@ read_waterdata_monitoring_location <- function(monitoring_location_id = NA_chara properties = NA_character_, bbox = NA, limit = NA, - max_results = NA, skipGeometry = NA){ service <- "monitoring-locations" diff --git a/R/read_waterdata_parameter_codes.R b/R/read_waterdata_parameter_codes.R index d1b9cf98..05f13ca6 100644 --- a/R/read_waterdata_parameter_codes.R +++ b/R/read_waterdata_parameter_codes.R @@ -23,8 +23,6 @@ #' limit is 50000. It may be beneficial to set this number lower if your internet #' connection is spotty. The default (`NA`) will set the limit to the maximum #' allowable limit for the service. -#' @param max_results The optional maximum number of rows to return. This value -#' must be less than the requested limit. 
#' @examplesIf is_dataRetrieval_user() #' #' \donttest{ @@ -57,8 +55,7 @@ read_waterdata_parameter_codes <- function(parameter_code = NA_character_, temperature_basis = NA_character_, epa_equivalence = NA_character_, properties = NA_character_, - limit = NA, - max_results = NA){ + limit = NA){ service <- "parameter-codes" output_id <- "parameter_code" @@ -67,7 +64,7 @@ read_waterdata_parameter_codes <- function(parameter_code = NA_character_, args[["convertType"]] <- FALSE args[["skipGeometry"]] <- TRUE args[["bbox"]] <- NA - args[["no_paging"]] <- FALSE # change if we're ever over 50,000 + args[["no_paging"]] <- FALSE # drops id if TRUE if(all(lengths(args) == 1)){ return_list <- suppressWarnings(get_ogc_data(args = args, @@ -79,8 +76,7 @@ read_waterdata_parameter_codes <- function(parameter_code = NA_character_, It is expected that updates to the API will eliminate this need.") return_list <- read_waterdata_metadata(collection = service, - max_results = max_results, - limit = limit) + limit = limit) args[["convertType"]] <- NULL args[["skipGeometry"]] <- NULL args[["no_paging"]] <- NULL diff --git a/R/read_waterdata_ts_meta.R b/R/read_waterdata_ts_meta.R index 9126445c..699dc9b9 100644 --- a/R/read_waterdata_ts_meta.R +++ b/R/read_waterdata_ts_meta.R @@ -88,7 +88,7 @@ read_waterdata_ts_meta <- function(monitoring_location_id = NA_character_, limit = NA, max_results = NA, bbox = NA, - convertType = FALSE, + convertType = TRUE, no_paging = FALSE){ service = "time-series-metadata" diff --git a/R/rejigger_cols.R b/R/rejigger_cols.R index cf28fc96..d82f5cb6 100644 --- a/R/rejigger_cols.R +++ b/R/rejigger_cols.R @@ -63,7 +63,7 @@ rejigger_cols <- function(df, properties, output_id){ #' service = "daily") #' df3 <- dataRetrieval:::cleanup_cols(df2) #' -cleanup_cols <- function(df, service = "daily"){ +cleanup_cols <- function(df, service){ if("qualifier" %in% names(df)){ if(!all(is.na(df$qualifier))){ diff --git a/R/walk_pages.R b/R/walk_pages.R index b035b436..5e564cd5 
100644 --- a/R/walk_pages.R +++ b/R/walk_pages.R @@ -4,39 +4,28 @@ #' #' @noRd #' @return data.frame with attributes -walk_pages <- function(req, max_results){ +walk_pages <- function(req){ message("Requesting:\n", req$url) - current_api_limit <- 50000 - - if(is.na(max_results) | max_results > current_api_limit){ - resps <- httr2::req_perform_iterative(req, - next_req = next_req_url, - max_reqs = Inf, on_error = "return") - failures <- resps |> - httr2::resps_failures() |> - httr2::resps_requests() - - if(length(failures) > 0){ - stop(resps[[1]][["message"]]) - } - - return_list <- data.frame() - for(resp in resps){ - df1 <- get_resp_data(resp) - return_list <- rbind(return_list, df1) - } - - if(!is.na(max_results) & max_results > current_api_limit){ - return_list <- return_list[1:max_results, ] - } - - ###################################### - } else { - resps <- httr2::req_perform(req) - return_list <- get_resp_data(resps) + + resps <- httr2::req_perform_iterative(req, + next_req = next_req_url, + max_reqs = Inf, on_error = "return") + failures <- resps |> + httr2::resps_failures() |> + httr2::resps_requests() + + if(length(failures) > 0){ + stop(resps[[1]][["message"]]) } + return_list <- data.frame() + for(resp in resps){ + df1 <- get_resp_data(resp) + return_list <- rbind(return_list, df1) + } + + return(return_list) } @@ -114,16 +103,17 @@ next_req_url <- function(resp, req) { } -get_csv <- function(req, max_results){ - - if(is.na(max_results)){ - max_results <- 50000 - } +get_csv <- function(req, limit){ message("Requesting:\n", req$url) skip_geo <- grepl("skipGeometry=true", req$url, ignore.case = TRUE) resp <- httr2::req_perform(req) + header_info <- httr2::resp_headers(resp) + if(Sys.getenv("API_USGS_PAT") != ""){ + message("Remaining requests this hour:", header_info$`x-ratelimit-remaining`, " ") + } + if(httr2::resp_has_body(resp)){ return_list <- httr2::resp_body_string(resp) df <- suppressMessages(readr::read_csv(file = return_list)) @@ -134,7 +124,7 
@@ get_csv <- function(req, max_results){ sf::st_crs(df) <- 4269 } - if(nrow(df) == max_results){ + if(nrow(df) == limit){ warning("Missing data is probable. Use no_paging = FALSE to ensure all requested data is returned.") } diff --git a/man/read_waterdata_continuous.Rd b/man/read_waterdata_continuous.Rd index 0f818683..e4e1a02b 100644 --- a/man/read_waterdata_continuous.Rd +++ b/man/read_waterdata_continuous.Rd @@ -17,7 +17,6 @@ read_waterdata_continuous( last_modified = NA_character_, time = NA_character_, limit = NA, - max_results = NA, convertType = TRUE, no_paging = FALSE ) @@ -81,9 +80,6 @@ limit is 50000. It may be beneficial to set this number lower if your internet connection is spotty. The default (\code{NA}) will set the limit to the maximum allowable limit for the service.} -\item{max_results}{The optional maximum number of rows to return. This value -must be less than the requested limit.} - \item{convertType}{logical, defaults to \code{TRUE}. If \code{TRUE}, the function will convert the data to dates and qualifier to string vector, and sepcifically order the returning data frame by time and monitoring_location_id.} diff --git a/man/read_waterdata_daily.Rd b/man/read_waterdata_daily.Rd index 3c4195c9..72e5cd93 100644 --- a/man/read_waterdata_daily.Rd +++ b/man/read_waterdata_daily.Rd @@ -19,7 +19,6 @@ read_waterdata_daily( time = NA_character_, bbox = NA, limit = NA, - max_results = NA, convertType = TRUE, no_paging = FALSE ) @@ -91,9 +90,6 @@ limit is 50000. It may be beneficial to set this number lower if your internet connection is spotty. The default (\code{NA}) will set the limit to the maximum allowable limit for the service.} -\item{max_results}{The optional maximum number of rows to return. This value -must be less than the requested limit.} - \item{convertType}{logical, defaults to \code{TRUE}. 
If \code{TRUE}, the function will convert the data to dates and qualifier to string vector.} @@ -135,7 +131,6 @@ dv_data_period <- read_waterdata_daily(monitoring_location_id = site, multi_site <- read_waterdata_daily(monitoring_location_id = c("USGS-01491000", "USGS-01645000"), parameter_code = c("00060", "00010"), - limit = 500, time = c("2023-01-01", "2024-01-01")) dv_data_quick <- read_waterdata_daily(monitoring_location_id = site, diff --git a/man/read_waterdata_field_measurements.Rd b/man/read_waterdata_field_measurements.Rd index 96341b9c..99aa852d 100644 --- a/man/read_waterdata_field_measurements.Rd +++ b/man/read_waterdata_field_measurements.Rd @@ -22,7 +22,6 @@ read_waterdata_field_measurements( time = NA_character_, bbox = NA, limit = NA, - max_results = NA, convertType = TRUE, no_paging = FALSE ) @@ -100,9 +99,6 @@ limit is 50000. It may be beneficial to set this number lower if your internet connection is spotty. The default (\code{NA}) will set the limit to the maximum allowable limit for the service.} -\item{max_results}{The optional maximum number of rows to return. This value -must be less than the requested limit.} - \item{convertType}{logical, defaults to \code{TRUE}. If \code{TRUE}, the function will convert the data to dates and qualifier to string vector.} diff --git a/man/read_waterdata_latest_continuous.Rd b/man/read_waterdata_latest_continuous.Rd index eb2490e6..a933e799 100644 --- a/man/read_waterdata_latest_continuous.Rd +++ b/man/read_waterdata_latest_continuous.Rd @@ -19,7 +19,6 @@ read_waterdata_latest_continuous( time = NA_character_, bbox = NA, limit = NA, - max_results = NA, convertType = TRUE, no_paging = FALSE ) @@ -94,9 +93,6 @@ limit is 50000. It may be beneficial to set this number lower if your internet connection is spotty. The default (\code{NA}) will set the limit to the maximum allowable limit for the service.} -\item{max_results}{The optional maximum number of rows to return. 
This value -must be less than the requested limit.} - \item{convertType}{logical, defaults to \code{TRUE}. If \code{TRUE}, the function will convert the data to dates and qualifier to string vector.} diff --git a/man/read_waterdata_latest_daily.Rd b/man/read_waterdata_latest_daily.Rd index 63c35c49..0b47e620 100644 --- a/man/read_waterdata_latest_daily.Rd +++ b/man/read_waterdata_latest_daily.Rd @@ -19,7 +19,6 @@ read_waterdata_latest_daily( time = NA_character_, bbox = NA, limit = NA, - max_results = NA, convertType = TRUE, no_paging = FALSE ) @@ -91,9 +90,6 @@ limit is 50000. It may be beneficial to set this number lower if your internet connection is spotty. The default (\code{NA}) will set the limit to the maximum allowable limit for the service.} -\item{max_results}{The optional maximum number of rows to return. This value -must be less than the requested limit.} - \item{convertType}{logical, defaults to \code{TRUE}. If \code{TRUE}, the function will convert the data to dates and qualifier to string vector.} diff --git a/man/read_waterdata_metadata.Rd b/man/read_waterdata_metadata.Rd index 83071a7c..cdd9f599 100644 --- a/man/read_waterdata_metadata.Rd +++ b/man/read_waterdata_metadata.Rd @@ -4,7 +4,7 @@ \alias{read_waterdata_metadata} \title{Generalized USGS Water Meta Data API retrieval function} \usage{ -read_waterdata_metadata(collection, max_results = NA, limit = NA) +read_waterdata_metadata(collection, limit = NA) } \arguments{ \item{collection}{character, can be any existing collection such @@ -14,9 +14,6 @@ as "parameter-codes", "agency-codes", "altitude-datums", "aquifer-codes", "national-aquifer-codes", "reliability-codes", "site-types", "statistic-codes", "topographic-codes", "time-zone-codes".} -\item{max_results}{The optional maximum number of rows to return. This value -must be less than the requested limit.} - \item{limit}{The optional limit parameter is used to control the subset of the selected features that should be returned in each page. 
The maximum allowable limit is 50000. It may be beneficial to set this number lower if your internet diff --git a/man/read_waterdata_monitoring_location.Rd b/man/read_waterdata_monitoring_location.Rd index e45ad160..0a41c0b0 100644 --- a/man/read_waterdata_monitoring_location.Rd +++ b/man/read_waterdata_monitoring_location.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/read_waterdata_monitoring_location.R \name{read_waterdata_monitoring_location} \alias{read_waterdata_monitoring_location} -\title{Get USGS Site Data} +\title{Get USGS Monitoring Location Data} \usage{ read_waterdata_monitoring_location( monitoring_location_id = NA_character_, @@ -48,7 +48,6 @@ read_waterdata_monitoring_location( properties = NA_character_, bbox = NA, limit = NA, - max_results = NA, skipGeometry = NA ) } @@ -151,9 +150,6 @@ limit is 50000. It may be beneficial to set this number lower if your internet connection is spotty. The default (\code{NA}) will set the limit to the maximum allowable limit for the service.} -\item{max_results}{The optional maximum number of rows to return. This value -must be less than the requested limit.} - \item{skipGeometry}{This option can be used to skip response geometries for each feature. 
The returning object will be a data frame with no spatial information.} @@ -186,10 +182,6 @@ site_info_no_sf <- read_waterdata_monitoring_location(monitoring_location_id = s bbox_vals = c(-94.00, 35.0, -93.5, 35.5) multi_site <- read_waterdata_monitoring_location(bbox = bbox_vals) -multi_site_n_100 <- read_waterdata_monitoring_location(bbox = bbox_vals, - max_results = 100) -multi_site_limit_100 <- read_waterdata_monitoring_location(bbox = bbox_vals, - limit = 100) } \dontshow{\}) # examplesIf} } diff --git a/man/read_waterdata_parameter_codes.Rd b/man/read_waterdata_parameter_codes.Rd index 7fb569c0..b530d0de 100644 --- a/man/read_waterdata_parameter_codes.Rd +++ b/man/read_waterdata_parameter_codes.Rd @@ -17,8 +17,7 @@ read_waterdata_parameter_codes( temperature_basis = NA_character_, epa_equivalence = NA_character_, properties = NA_character_, - limit = NA, - max_results = NA + limit = NA ) } \arguments{ @@ -54,9 +53,6 @@ selected features that should be returned in each page. The maximum allowable limit is 50000. It may be beneficial to set this number lower if your internet connection is spotty. The default (\code{NA}) will set the limit to the maximum allowable limit for the service.} - -\item{max_results}{The optional maximum number of rows to return. This value -must be less than the requested limit.} } \description{ Parameter codes are 5-digit codes and associated descriptions used to identify the constituent measured and the units of measure. Some parameter code definitions include information about the sampling matrix, fraction, and methods used to measure the constituent. Some parameters are fixed-value (fxd) numeric codes having textual meaning (for example: parameter code 00041 is a weather code parameter, code of 60 means rain), but more commonly represent a numeric value for chemical, physical, or biological data. 
diff --git a/man/read_waterdata_ts_meta.Rd b/man/read_waterdata_ts_meta.Rd index 875d1e3c..51b6e287 100644 --- a/man/read_waterdata_ts_meta.Rd +++ b/man/read_waterdata_ts_meta.Rd @@ -28,7 +28,7 @@ read_waterdata_ts_meta( limit = NA, max_results = NA, bbox = NA, - convertType = FALSE, + convertType = TRUE, no_paging = FALSE ) } diff --git a/tests/testthat/tests_general.R b/tests/testthat/tests_general.R index 825b1ed0..843952d1 100644 --- a/tests/testthat/tests_general.R +++ b/tests/testthat/tests_general.R @@ -66,36 +66,13 @@ test_that("General NWIS retrievals working", { testthat::skip_on_cran() testthat::skip_on_ci() - multiSite <- readNWISdata( - sites = c("04025500", "040263491"), service = "iv", - parameterCd = "00060", - startDate = "2020-11-01", endDate = "2020-11-02" - ) - expect_is(multiSite$dateTime, "POSIXct") - - recent_uv <- readNWISdata( - siteNumber = "04025500", parameterCd = "00060", - service = "uv", - startDate = as.Date(Sys.Date() - 10), - endDate = Sys.Date() - ) - expect_equal(grep( - x = attr(recent_uv, "url"), - pattern = "https://waterservices.usgs.gov/nwis/iv/" - ), 1) - - older_uv <- readNWISdata( - siteNumber = "04025500", - parameterCd = "00060", - service = "uv", - startDate = "2016-01-01", - endDate = "2016-01-02" + multiSite <- read_waterdata_continuous( + monitoring_location_id = c("USGS-04025500", "USGS-040263491"), + parameter_code = "00060", + time = c("2020-11-01", "2020-11-02") ) - expect_equal(grep( - x = attr(older_uv, "url"), - pattern = "https://nwis.waterservices.usgs.gov/nwis/iv/" - ), 1) - + + expect_is(multiSite$time, "POSIXct") expect_error(readNWISdata(), "No arguments supplied") expect_error(readNWISdata(siteNumber = NA), "NA's are not allowed in query") @@ -113,24 +90,6 @@ test_that("General NWIS retrievals working", { expect_is(timeseriesInfo$begin, "POSIXct") - gw_data <- readNWISdata( - stateCd = "AL", - service = "gwlevels", - startDate = "2024-05-01", - endDate = "2024-05-30") - - expect_true(nrow(gw_data) > 0) 
- expect_equal(attr(gw_data, "url"), - "https://nwis.waterdata.usgs.gov/nwis/gwlevels?state_cd=AL&begin_date=2024-05-01&end_date=2024-05-30&date_format=YYYY-MM-DD&rdb_inventory_output=file&TZoutput=0&range_selection=date_range&list_of_search_criteria=state_cd&format=rdb") - - gw_data2 <- readNWISdata( - state_cd = "AL", - service = "gwlevels", - startDate = "2024-05-01", - endDate = "2024-05-30") - - expect_equal(nrow(gw_data), nrow(gw_data2)) - # nolint start: line_length_linter url <- httr2::request("https://waterservices.usgs.gov/nwis/dv/?site=09037500&format=rdb&ParameterCd=00060&StatCd=00003&startDT=1985-10-02&endDT=2012-09-06") dv <- importRDB1(url, asDateTime = FALSE) @@ -163,24 +122,23 @@ test_that("General NWIS retrievals working", { # Test list: args <- list( - sites = "05114000", service = "iv", - parameterCd = "00060", - startDate = "2014-05-01T00:00Z", - endDate = "2014-05-01T12:00Z" + monitoring_location_id = "USGS-05114000", + parameter_code = "00060", + time = c("2014-05-01T00:00Z", "2014-05-01T12:00Z") ) - instData <- readNWISdata(args) + instData <- do.call(read_waterdata_continuous, args) args2 <- list( monitoring_location_id = "USGS-05114000", parameter_code = "00060", - time = c("2014-05-01", endDate = "2014-05-01") + time = c("2014-05-01", "2014-05-01") ) daily_USGS <- do.call(read_waterdata_daily, args2) expect_lt(nrow(daily_USGS), nrow(instData)) - ohio <- read_waterdata_monitoring_location(state_name = "Ohio", + ohio <- read_waterdata_monitoring_location(state_name = "Ohio", site_type_code = "ST") bbox <- sf::st_bbox(ohio) what_sites <- read_waterdata_ts_meta(parameter_code = "00665", diff --git a/tests/testthat/tests_imports.R b/tests/testthat/tests_imports.R index ffdae6f9..79e33097 100644 --- a/tests/testthat/tests_imports.R +++ b/tests/testthat/tests_imports.R @@ -143,36 +143,36 @@ test_that("External importWaterML1 test", { expect_true(data.class(data$dateTime) == "POSIXct") expect_true(nrow(data) > 0) - expect_error(readNWISdata( - 
sites = "05114000", - service = "iv", - parameterCd = "00060", - startDate = "2014-05-01T00:00", - endDate = "2014-05-01T12:00", - tz = "blah" - )) - - arg.list <- list( - sites = "05114000", - parameterCd = "00060", - startDate = "2014-05-01T00:00", - endDate = "2014-05-01T12:00" - ) - - chi_iv <- readNWISdata(arg.list, - service = "iv", - tz = "America/Chicago" - ) - - expect_true(all(chi_iv$tz_cd == "America/Chicago")) - expect_equal(chi_iv$dateTime[1], as.POSIXct("2014-05-01T00:00", - format = "%Y-%m-%dT%H:%M", - tz = "America/Chicago" - )) - expect_equal(chi_iv$dateTime[nrow(chi_iv)], as.POSIXct("2014-05-01T12:00", - format = "%Y-%m-%dT%H:%M", - tz = "America/Chicago" - )) + # expect_error(readNWISdata( + # sites = "05114000", + # service = "iv", + # parameterCd = "00060", + # startDate = "2014-05-01T00:00", + # endDate = "2014-05-01T12:00", + # tz = "blah" + # )) +# +# arg.list <- list( +# sites = "05114000", +# parameterCd = "00060", +# startDate = "2014-05-01T00:00", +# endDate = "2014-05-01T12:00" +# ) +# +# chi_iv <- readNWISdata(arg.list, +# service = "iv", +# tz = "America/Chicago" +# ) +# +# expect_true(all(chi_iv$tz_cd == "America/Chicago")) +# expect_equal(chi_iv$dateTime[1], as.POSIXct("2014-05-01T00:00", +# format = "%Y-%m-%dT%H:%M", +# tz = "America/Chicago" +# )) +# expect_equal(chi_iv$dateTime[nrow(chi_iv)], as.POSIXct("2014-05-01T12:00", +# format = "%Y-%m-%dT%H:%M", +# tz = "America/Chicago" +# )) # Time over daylight saving switch: tzURL <- constructNWISURL( diff --git a/tests/testthat/tests_userFriendly_fxns.R b/tests/testthat/tests_userFriendly_fxns.R index 547411a6..94157935 100644 --- a/tests/testthat/tests_userFriendly_fxns.R +++ b/tests/testthat/tests_userFriendly_fxns.R @@ -386,33 +386,36 @@ test_that("Construct USGS urls", { monitoring_location_id = siteNumber, parameter_code = pCode, time = c(startDate, endDate), - statistic_id = c("00003", "00001")) + statistic_id = c("00003", "00001"), + limit = 10000) # nolint start: 
line_length_linter expect_equal(url_daily$url, - "https://api.waterdata.usgs.gov/ogcapi/v0/collections/daily/items?f=json&lang=en-US&time=2024-01-01%2F..&skipGeometry=FALSE") + "https://api.waterdata.usgs.gov/ogcapi/v0/collections/daily/items?f=json&lang=en-US&time=2024-01-01%2F..&limit=10000&skipGeometry=FALSE") - url_works <- dataRetrieval:::walk_pages(url_daily, max_results = 1) + url_works <- dataRetrieval:::walk_pages(url_daily) expect_true(nrow(url_works) > 0) url_ts_meta <- construct_api_requests(monitoring_location_id = siteNumber, parameter_code = pCode, - service = "time-series-metadata") + service = "time-series-metadata", + limit = 10000) expect_equal( url_ts_meta$url, - "https://api.waterdata.usgs.gov/ogcapi/v0/collections/time-series-metadata/items?f=json&lang=en-US&skipGeometry=FALSE" + "https://api.waterdata.usgs.gov/ogcapi/v0/collections/time-series-metadata/items?f=json&lang=en-US&limit=10000&skipGeometry=FALSE" ) - url_works_ts <- dataRetrieval:::walk_pages(url_ts_meta, max_results = 1) + url_works_ts <- dataRetrieval:::walk_pages(url_ts_meta) expect_true(nrow(url_works_ts) > 0) url_ml <- construct_api_requests(id = siteNumber, - service = "monitoring-locations") + service = "monitoring-locations", + limit = 50000) - expect_equal(url_ml$url, "https://api.waterdata.usgs.gov/ogcapi/v0/collections/monitoring-locations/items?f=json&lang=en-US&skipGeometry=FALSE&id=USGS-01594440") + expect_equal(url_ml$url, "https://api.waterdata.usgs.gov/ogcapi/v0/collections/monitoring-locations/items?f=json&lang=en-US&skipGeometry=FALSE&id=USGS-01594440&limit=50000") - url_works_ml <- dataRetrieval:::walk_pages(url_ml, max_results = 1) + url_works_ml <- dataRetrieval:::walk_pages(url_ml) expect_true(nrow(url_works_ml) > 0) url_use <- constructUseURL( diff --git a/tutorials/changes_slides_deck.qmd b/tutorials/changes_slides_deck.qmd index 57b932fa..adfc39a1 100644 --- a/tutorials/changes_slides_deck.qmd +++ b/tutorials/changes_slides_deck.qmd @@ -790,13 +790,11 @@ 
time_zone_codes <- read_waterdata_metadata("time-zone-codes") ## Limit Explanation -* [Limits](https://doi-usgs.github.io/dataRetrieval/articles/read_waterdata_functions.html#limit-vs-max_results) - - - `max_results` lets you define how many rows are returned - - `limit` lets you define how many rows are returned **per page** of data. With a good internet connection, you can probably get away with ignoring this argument. -I would ignore both most of the time. + - Leaving the `limit` argument at its default will return 50,000 rows per page. + + - The `no_paging` argument (`TRUE`/`FALSE`), when `TRUE`, will return only 1 page of data; the `limit` argument then defines the maximum number of rows returned. ## Adding API token to CI jobs: GitLab diff --git a/vignettes/read_waterdata_functions.Rmd b/vignettes/read_waterdata_functions.Rmd index 27d7b13d..21c086f3 100644 --- a/vignettes/read_waterdata_functions.Rmd +++ b/vignettes/read_waterdata_functions.Rmd @@ -716,14 +716,15 @@ Once the pipeline has completed, you can load the `ohio_discharge` data frame in tar_load(ohio_discharge) ``` -## limit vs max_results +## limit and no_paging -A user can specify a `limit` or `max_results`. - -The `max_results` argument defines how many rows are returned (assuming the data has at least `max_results` rows to return). This can be used as a handy way to make sure you aren't requesting a ton of data, perhaps to do some initial coding or troubleshooting. The `limit` argument defines how many rows are returned per page of data, but does NOT affect the overall number of rows returned. With a good internet connection, you can probably get away with ignoring this argument. By default it will be set to the highest value that the services allow. The reason you might want to change this argument is that it might be easier on a spotty internet connection to page through smaller sets of data. +The `no_paging` argument uses an option from the API that only returns one page of data.
Therefore, the total number of rows returned will be capped at the `limit`. If no `limit` is defined, the maximum number of rows returned is 50,000, which is a hard cap set by the API. The vast majority of users should use the default `no_paging = FALSE` to make sure they are getting all the data they asked for. `no_paging = TRUE` can be a bit more efficient since it is getting the data in a native table format (csv). However, it must be used with caution because it might not return all the requested data (`dataRetrieval` will give a warning). Because the data is coming from a csv format, the data may come back in a slightly different order (rows or columns) than with the default `no_paging = FALSE`, which pulls data from a native JSON format. + +Combining `no_paging = TRUE` with `limit` can be used to cap the number of rows returned (fewer rows come back if less data is available). + ## id Each API endpoint natively returns a column named "id". The results of the "id" column can be used as inputs into other endpoints, **HOWEVER** the input in those functions have different names. For example, the "id" column of the monitoring location endpoint is considered the "monitoring_location_id" when used as an input to any of the other functions. @@ -733,12 +734,10 @@ Therefore, `dataRetrieval` functions will rename the "id" column to whatever it ```{r echo=FALSE} df <- dplyr::tibble(Function = c("read_waterdata_monitoring_location", "read_waterdata_ts_meta", - "read_waterdata_daily", - "read_waterdata_latest_continuous"), + "read_waterdata_parameter_codes"), "ID returned" = c("monitoring_location_id", "time_series_id", - "daily_id", - "latest_continuous_id")) + "parameter_code")) knitr::kable(df) ```