Skip to content

Commit ce00d33

Browse files
authored
Implement new tokenizer (#416)
* implement new tokenizer * reimplement as_units based on new tokenizer * remove unsupported bits * run full pillar test * add more tests * treat numbers as prefixes * follow strict_tokenizer while formatting too * implement lookahead to enable numbers in the middle of symbols * try to convert only if parseable * bump version, update NEWS and revdep checks * more tests
1 parent 907dda2 commit ce00d33

File tree

18 files changed

+417
-447
lines changed

18 files changed

+417
-447
lines changed

DESCRIPTION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
Package: units
2-
Version: 0.8-7.3
2+
Version: 0.8-7.4
33
Title: Measurement Units for R Vectors
44
Authors@R: c(person("Edzer", "Pebesma", role = c("aut", "cre"), email = "edzer.pebesma@uni-muenster.de", comment = c(ORCID = "0000-0001-8049-7069")),
55
person("Thomas", "Mailund", role = "aut", email = "mailund@birc.au.dk"),

NEWS.md

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,11 @@
1-
# version devel
1+
# version 1.0-0
2+
3+
* Breaking change: a new tokenizer fixes longstanding issues with parsing
4+
complex unit expressions, but may break existing code that relied on the
5+
previous (buggy) behavior. The major change is that now numbers are
6+
consistently treated as prefixes, so that units like `ml / min / 1.73m^2`
7+
used in physiology are now correctly parsed as `ml / (min * 1.73 * m^2)`.
8+
See `?as_units` for details; #416 addressing #221, #383
29

310
* Vectorize `ud_*()` helpers; #405 addressing #404
411

R/RcppExports.R

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11
# Generated by using Rcpp::compileAttributes() -> do not edit by hand
22
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
33

4+
parse_unit <- function(x, strict = FALSE) {
5+
.Call('_units_parse_unit', PACKAGE = 'units', x, strict)
6+
}
7+
48
ud_exit <- function() {
59
invisible(.Call('_units_ud_exit', PACKAGE = 'units'))
610
}

R/conversion.R

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -233,9 +233,10 @@ set_units.numeric <- function(x, value, ..., mode = units_options("set_units_mod
233233
value <- unitless
234234
else if (mode == "symbols") {
235235
value <- substitute(value)
236-
237236
if(is.numeric(value) && !identical(value, 1) && !identical(value, 1L))
238237
stop("The only valid number defining a unit is '1', signifying a unitless unit")
238+
if (is.name(value) || is.call(value))
239+
value <- format(value)
239240
}
240241

241242
units(x) <- as_units(value, ...)

R/make_units.R

Lines changed: 81 additions & 169 deletions
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@
121121
#' # or
122122
#' drop_units(y)
123123
make_units <- function(bare_expression, check_is_valid = TRUE) {
124-
as_units.call(substitute(bare_expression), check_is_valid = check_is_valid)
124+
as_units(format(substitute(bare_expression)), check_is_valid = check_is_valid)
125125
}
126126

127127
#' @name units
@@ -189,75 +189,76 @@ as_units.difftime <- function(x, value, ...) {
189189

190190
# ----- as_units.character helpers ------
191191

192-
backtick <- function(x) {
193-
# backtick all character runs uninterupted by one of ^()*^/`- or a space
194-
# don't double up backticks
195-
x <- gsub("`?([^() \\*^/`-]+)`?", "`\\1`", x)
196-
gsub("`([0-9]*\\.?[0-9]+)`", "\\1", x) # unbacktick bare numbers
192+
is_udunits_time <- function(s) {
193+
ud_is_parseable(s) && ud_are_convertible(s, "seconds since 1970-01-01")
197194
}
198195

199-
are_exponents_implicit <- function(s) {
200-
s <- trimws(s)
201-
has <- function(chr, regex = FALSE)
202-
grepl(chr, s, fixed = !regex, perl = regex)
203-
!has("^") && !has("*") && !has("/") && has("\\s|\\D.*\\d$", regex = TRUE)
196+
# from package:yasp, paste collapse with serial (oxford) comma
197+
pc_and <- function(..., sep = "") {
198+
x <- paste(..., sep = sep, collapse = NULL)
199+
lx <- length(x)
200+
if(lx == 0L)
201+
""
202+
else if (lx == 1L)
203+
x
204+
else if (lx == 2L)
205+
paste0(x, collapse = " and ")
206+
else
207+
paste0( paste0(x[-lx], collapse = ", "), ", and ", x[lx])
204208
}
205209

206-
is_udunits_time <- function(s) {
207-
ud_is_parseable(s) && ud_are_convertible(s, "seconds since 1970-01-01")
210+
.msg_units_not_recognized <- function(unrecognized_symbols, full_expr) {
211+
212+
if (is.language(full_expr))
213+
full_expr <- deparse(full_expr)
214+
215+
is_are <- if (length(unrecognized_symbols) > 1L) "are" else "is"
216+
217+
paste0("In ", sQuote(full_expr), ", ",
218+
pc_and(sQuote(unrecognized_symbols)), " ", is_are, " not recognized by udunits.\n\n",
219+
"See a table of valid unit symbols and names with valid_udunits().\n",
220+
"Custom user-defined units can be added with install_unit().\n\n",
221+
"See a table of valid unit prefixes with valid_udunits_prefixes().\n",
222+
"Prefixes will automatically work with any user-defined unit.")
208223
}
209224

210225
#' @name units
211226
#' @export
212227
#'
228+
#' @param check_is_valid throw an error if all the unit symbols are not either
229+
#' recognized by udunits2, or a custom
230+
#' user defined via \code{install_unit()}. If \code{FALSE}, no check
231+
#' for validity is performed.
232+
#'
213233
#' @param force_single_symbol Whether to perform no string parsing and force
214234
#' treatment of the string as a single symbol.
215235
#'
216-
#' @param implicit_exponents If the unit string is in product power form (e.g.
217-
#' \code{"km m-2 s-1"}). Defaults to \code{NULL}, in which case a guess is made
218-
#' based on the supplied string. Set to \code{TRUE} or \code{FALSE} if the guess is
219-
#' incorrect.
220-
#'
221236
#' @section Character strings:
222237
#'
223238
#' Generally speaking, there are 3 types of unit strings are accepted in
224239
#' \code{as_units} (and by extension, \code{`units<-`}).
225240
#'
226-
#' The first, and likely most common, is a "standard" format unit
241+
#' The first type, and likely most common, is a "standard" format unit
227242
#' specification where the relationship between unit symbols or names is
228243
#' specified explicitly with arithmetic symbols for division \code{/},
229-
#' multiplication \code{*} and power exponents \code{^}, or other mathematical
230-
#' functions like \code{log()}. In this case, the string is parsed as an R
231-
#' expression via \code{parse(text = )} after backticking all unit symbols and
232-
#' names, and then passed on to \code{as_units.call()}. A heuristic is used to
233-
#' perform backticking, such that any continuous set of characters
234-
#' uninterrupted by one of \code{()\\*^-} are backticked (unless the character
235-
#' sequence consists solely of numbers \code{0-9}), with some care to not
236-
#' double up on pre-existing backticks. This heuristic appears to be quite
237-
#' robust, and works for units would otherwise not be valid R syntax. For
238-
#' example, percent (\code{"\%"}), feet (\code{"'"}), inches (\code{"in"}),
239-
#' and Tesla (\code{"T"}) are all backticked and parsed correctly.
240-
#'
241-
#' Nevertheless, for certain complex unit expressions, this backticking heuristic
242-
#' may give incorrect results. If the string supplied fails to parse as an R
243-
#' expression, then the string is treated as a single symbolic unit and
244-
#' \code{symbolic_unit(chr)} is used as a fallback with a warning. In that
245-
#' case, automatic unit simplification may not work properly when performing
246-
#' operations on unit objects, but unit conversion and other Math operations
247-
#' should still give correct results so long as the unit string supplied
248-
#' returns \code{TRUE} for \code{ud_is_parsable()}.
244+
#' multiplication \code{*} and power exponents \code{^}.
249245
#'
250246
#' The second type of unit string accepted is one with implicit exponents. In
251247
#' this format, \code{/}, \code{*}, and \code{^}, may not be present in the
252248
#' string, and unit symbol or names must be separated by a space. Each unit
253249
#' symbol may optionally be followed by a single number, specifying the power.
254250
#' For example \code{"m2 s-2"} is equivalent to \code{"(m^2)*(s^-2)"}.
255251
#'
256-
#' It must be noted that prepended numbers are supported too, but their
257-
#' interpretation slightly varies depending on whether they are separated from
258-
#' the unit string or not. E.g., \code{"1000 m"} is interpreted as magnitude
259-
#' and unit, but \code{"1000m"} is interpreted as a prefixed unit, and it is
260-
#' equivalent to \code{"km"} to all effects.
252+
#' If the string supplied fails to parse, then the string is treated as a
253+
#' single symbolic unit and \code{symbolic_unit(chr)} is used as a fallback
254+
#' with a warning. In that case, automatic unit simplification may not work
255+
#' properly when performing operations on unit objects, but unit conversion
256+
#' and other Math operations should still give correct results so long as
257+
#' the unit string supplied returns \code{TRUE} for \code{ud_is_parsable()}.
258+
#'
259+
#' It must be noted that prepended numbers are supported too, but are not
260+
#' treated as magnitudes. For example, \code{"1000 m"} is interpreted as
261+
#' a prefixed unit, and it is equivalent to \code{"km"} to all effects.
261262
#'
262263
#' The third type of unit string format accepted is the special case of
263264
#' udunits time duration with a reference origin, for example \code{"hours
@@ -268,157 +269,69 @@ is_udunits_time <- function(s) {
268269
#' otherwise encouraged to use \code{R}'s date and time functionality provided
269270
#' by \code{Date} and \code{POSIXt} classes.
270271
#'
271-
as_units.character <- function(x,
272+
#' @note By default, unit names are automatically substituted with unit names
273+
#' (e.g., kilogram --> kg). To turn off this behavior, set
274+
#' \code{units_options(auto_convert_names_to_symbols = FALSE)}
275+
#'
276+
#' @seealso \code{\link{install_unit}}, \code{\link{valid_udunits}}
277+
as_units.character <- function(x, ...,
272278
check_is_valid = TRUE,
273-
implicit_exponents = NULL,
274-
force_single_symbol = FALSE, ...) {
279+
force_single_symbol = FALSE) {
275280

276281
stopifnot(is.character(x), length(x) == 1)
277282

278-
if (isTRUE(x == "")) return(unitless)
283+
if (any(is.na(x)))
284+
stop("a missing value for units is not allowed")
285+
286+
if (isTRUE(x == "" || x == "1"))
287+
return(.as.units(1, unitless))
279288

280289
if(force_single_symbol || is_udunits_time(x))
281290
return(symbolic_unit(x, check_is_valid = check_is_valid))
282291

283-
if(is.null(implicit_exponents))
284-
implicit_exponents <- are_exponents_implicit(x)
285-
286-
if(implicit_exponents)
287-
x <- convert_implicit_to_explicit_exponents(x)
288-
289-
x <- backtick(x)
290-
o <- try(expr <- parse(text = x)[[1]], silent = TRUE)
291-
292+
o <- try(su <- parse_unit(x, units_options("strict_tokenizer")), silent=TRUE)
292293
if(inherits(o, "try-error")) {
293-
warning("Could not parse expression: ", sQuote(x), # nocov
294-
". Returning as a single symbolic unit()", call. = FALSE) # nocov
295-
return(symbolic_unit(x, check_is_valid = check_is_valid)) # nocov
294+
warning("Could not parse expression: ", sQuote(x), # nocov
295+
". Returning as a single symbolic unit()", call. = FALSE) # nocov
296+
return(symbolic_unit(x, check_is_valid = check_is_valid)) # nocov
296297
}
297298

298-
as_units.call(expr, check_is_valid = check_is_valid)
299-
}
300-
301-
302-
convert_implicit_to_explicit_exponents <- function(x) {
303-
if (length(grep(c("[*/]"), x)) > 0)
304-
stop("If 'implicit_exponents = TRUE', strings cannot contain `*' or `/'")
305-
x <- gsub("\\b([^\\d-]+)([-]?\\d+)\\b", "\\1^(\\2)", x, perl =TRUE)
306-
x <- gsub("\\s+", " * ", trimws(x), perl = TRUE)
307-
x
308-
}
309-
310-
# ----- as_units.call helpers ------
311-
312-
# from package:yasp, paste collapse with serial (oxford) comma
313-
pc_and <- function(..., sep = "") {
314-
x <- paste(..., sep = sep, collapse = NULL)
315-
lx <- length(x)
316-
if(lx == 0L)
317-
""
318-
else if (lx == 1L)
319-
x
320-
else if (lx == 2L)
321-
paste0(x, collapse = " and ")
322-
else
323-
paste0( paste0(x[-lx], collapse = ", "), ", and ", x[lx])
324-
}
325-
326-
#`%not_in%` <- function(x, table) match(x, table, nomatch = 0L) == 0L
327-
328-
.msg_units_not_recognized <- function(unrecognized_symbols, full_expr) {
329-
330-
if (is.language(full_expr))
331-
full_expr <- deparse(full_expr)
332-
333-
is_are <- if (length(unrecognized_symbols) > 1L) "are" else "is"
334-
335-
paste0("In ", sQuote(full_expr), ", ",
336-
pc_and(sQuote(unrecognized_symbols)), " ", is_are, " not recognized by udunits.\n\n",
337-
"See a table of valid unit symbols and names with valid_udunits().\n",
338-
"Custom user-defined units can be added with install_unit().\n\n",
339-
"See a table of valid unit prefixes with valid_udunits_prefixes().\n",
340-
"Prefixes will automatically work with any user-defined unit.")
341-
}
342-
343-
units_eval_env <- new.env(parent = baseenv())
344-
units_eval_env$ln <- function(x) base::log(x)
345-
units_eval_env$lg <- function(x) base::log(x, base = 10)
346-
units_eval_env$lb <- function(x) base::log(x, base = 2)
347-
348-
349-
#' @name units
350-
#' @export
351-
#'
352-
#' @param check_is_valid throw an error if all the unit symbols are not either
353-
#' recognized by udunits2, or a custom
354-
#' user defined via \code{install_unit()}. If \code{FALSE}, no check
355-
#' for validity is performed.
356-
#'
357-
#' @note By default, unit names are automatically substituted with unit names
358-
#' (e.g., kilogram --> kg). To turn off this behavior, set
359-
#' \code{units_options(auto_convert_names_to_symbols = FALSE)}
360-
#'
361-
#' @section Expressions:
362-
#'
363-
#' In \code{as_units()}, each of the symbols in the unit expression is treated
364-
#' individually, such that each symbol must be recognized by the udunits
365-
#' database, \emph{or} be a custom,
366-
#' user-defined unit symbol that was defined by \code{install_unit()}. To
367-
#' see which symbols and names are currently recognized by the udunits
368-
#' database, see \code{valid_udunits()}.
369-
#'
370-
#' @seealso \code{\link{install_unit}}, \code{\link{valid_udunits}}
371-
as_units.call <- function(x, check_is_valid = TRUE, ...) {
372-
373-
if(missing(x) || identical(x, quote(expr =)) ||
374-
identical(x, 1) || identical(x, 1L))
375-
return(.as.units(1, unitless))
376-
377-
if (is.vector(x) && !is.expression(x) && any(is.na(x)))
378-
stop("a missing value for units is not allowed")
379-
380-
stopifnot(is.language(x))
381-
382-
vars <- all.vars(x)
383-
if(!length(vars))
384-
stop(call. = FALSE,
385-
"No symbols found. Please supply bare expressions with this approach.
386-
See ?as_units for usage examples.")
387-
388299
if (check_is_valid) {
300+
vars <- c(su$numerator, su$denominator)
389301
valid <- vapply(vars, ud_is_parseable, logical(1L))
390302
if (!all(valid))
391303
stop(.msg_units_not_recognized(vars[!valid], x), call. = FALSE)
392304
}
393305

394-
names(vars) <- vars
395-
tmp_env <- lapply(vars, symbolic_unit, check_is_valid = FALSE)
396-
397-
if (dont_simplify_here <- is.na(.units.simplify())) {
398-
units_options(simplify = FALSE)
399-
on.exit(units_options(simplify = NA))
306+
if (units_options("auto_convert_names_to_symbols")) {
307+
name_to_symbol <- function(chr)
308+
if (ud_is_parseable(chr) && length(sym <- ud_get_symbol(chr))) sym else chr
309+
su$numerator <- vapply(su$numerator, name_to_symbol, character(1), USE.NAMES=FALSE)
310+
su$denominator <- vapply(su$denominator, name_to_symbol, character(1), USE.NAMES=FALSE)
400311
}
401312

402-
unit <- tryCatch( eval(x, tmp_env, units_eval_env),
403-
error = function(e) stop( paste0( conditionMessage(e), "\n",
404-
"Did you try to supply a value in a context where a bare expression was expected?"
405-
), call. = FALSE ))
406-
407-
# if(as.numeric(unit) %not_in% c(1, 0)) # 0 if log() used.
408-
# stop(call. = FALSE,
409-
#"In ", sQuote(deparse(x)), " the numeric multiplier ", sQuote(as.numeric(unit)), " is invalid.
410-
#Use `install_unit()` to define a new unit that is a multiple of another unit.")
313+
if (is.na(.units.simplify())) {
314+
units_options(simplify = FALSE)
315+
on.exit(units_options(simplify = NA))
316+
}
317+
.simplify_units(1, su)
318+
}
411319

412-
.as.units(as.numeric(unit), units(unit))
320+
#' @name units
321+
#' @export
322+
as_units.call <- function(x, ...) {
323+
as_units(format(x), ...)
413324
}
414325

415326
#' @name units
416327
#' @export
417-
as_units.expression <- as_units.call
328+
as_units.expression <- function(x, ...) {
329+
as_units(as.character(x), ...)
330+
}
418331

419332
#' @name units
420333
#' @export
421-
as_units.name <- as_units.call
334+
as_units.name <- as_units.expression
422335

423336
#' @name units
424337
#' @export
@@ -442,7 +355,6 @@ as_units.Date = function(x, value, ...) {
442355

443356

444357
symbolic_unit <- function(chr, check_is_valid = TRUE) {
445-
446358
stopifnot(is.character(chr), length(chr) == 1)
447359

448360
if (check_is_valid && !ud_is_parseable(chr)) {

0 commit comments

Comments
 (0)