Skip to content

Commit f93a1e6

Browse files
authored
fix: improve data parser (#26)
1 parent: 4e8e913 · commit: f93a1e6

File tree

1 file changed

+23
-13
lines changed

1 file changed

+23
-13
lines changed

R/data_parser.R

Lines changed: 23 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -3,16 +3,21 @@ library(openssl)
33
raw_fields <- function(df, columnSpecs = list()) {
44
validate_columnSpecs(columnSpecs)
55
cols <- colnames(df)
6+
if (nrow(df) > 1000) {
7+
df_sample <- df[sample(nrow(df),1000),]
8+
} else {
9+
df_sample <- df
10+
}
611
props <- lapply(seq_along(cols), function(i) {
7-
infer_prop(cols[i], i, df, columnSpecs)
12+
infer_prop(cols[i], i, df_sample, columnSpecs)
813
})
914
return(props)
1015
}
1116

1217
infer_prop <- function(col, i = NULL, df, columnSpecs = list()) {
1318
s <- df[[col]]
14-
semantic_type <- ifelse((col %in% names(columnSpecs)), columnSpecs[[col]]$semanticType, infer_semantic(s))
15-
analytic_type <- ifelse((col %in% names(columnSpecs)), columnSpecs[[col]]$analyticalType, infer_analytic(s))
19+
semantic_type <- ifelse((col %in% names(columnSpecs)), columnSpecs[[col]]$semanticType, infer_semantic(s, col))
20+
analytic_type <- ifelse((col %in% names(columnSpecs)), columnSpecs[[col]]$analyticalType, infer_analytic(s, col))
1621
prop <- list(
1722
fid = fname_encode(col),
1823
name = col,
@@ -22,28 +27,33 @@ infer_prop <- function(col, i = NULL, df, columnSpecs = list()) {
2227
return(prop)
2328
}
2429

25-
infer_semantic <- function(s) {
30+
is_geo_field <- function(field_name) {
31+
field_name <- tolower(trimws(field_name, which = "both", whitespace = " ."))
32+
return(field_name %in% c("latitude", "longitude", "lat", "long", "lon"))
33+
}
34+
35+
infer_semantic <- function(s, field_name) {
2636
v_cnt <- length(unique(s))
2737
kind <- class(s)
28-
if (any(sapply(c('numeric', 'integer'), inherits, x = s)) & v_cnt > 16) {
38+
if (all(kind %in% c("numeric", "integer", "double", "complex")) || is_geo_field(field_name)) {
2939
return('quantitative')
30-
} else if (any(sapply(c('POSIXct', 'POSIXlt', 'Date'), inherits, x = s))) {
40+
} else if (any(sapply(c('POSIXct', 'POSIXlt', 'POSIXt', 'Date'), inherits, x = s))) {
3141
return('temporal')
32-
} else if (inherits(s, 'ordered')) {
33-
return('ordinal')
3442
} else {
3543
return('nominal')
3644
}
3745
}
3846

39-
infer_analytic <- function(s) {
47+
infer_analytic <- function(s, field_name) {
4048
v_cnt <- length(unique(s))
4149
kind <- class(s)
42-
if ((inherits(s, 'numeric')) | (inherits(s, 'integer') & v_cnt > 16)) {
43-
return('measure')
44-
} else {
45-
return('dimension')
50+
if (is_geo_field(field_name)) {
51+
return("dimension")
52+
}
53+
if (all(kind %in% c("numeric", "integer", "double", "complex"))) {
54+
return("measure")
4655
}
56+
return("dimension")
4757
}
4858

4959
validate_columnSpecs <- function(columnSpecs) {

0 commit comments

Comments
 (0)