Skip to content

Commit bec83e0

Browse files
frankier (Frankie Robertson)
and a co-author authored
Streamline adding a new dataset (#135)
* Add instructions to README for adding a new dataset
* Add scripts to update the dataset metadata
* Add update_doc method to only add a single dataset
* Add HTML documentation generation to update_doc
* Change update_doc to correctly round trip quotes in the metadata CSV
* Sort datasets CSV
* Allow datasets with a .RData extension as well as .rda

---------

Co-authored-by: Frankie Robertson <[email protected]>
1 parent 700576d commit bec83e0

File tree

6 files changed

+189
-112
lines changed

6 files changed

+189
-112
lines changed

README.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,23 @@ mlmRev|guImmun|Immunization in Guatemala|2159|13
7474
mlmRev|guPrenat|Prenatal care in Guatemala|2449|15
7575
mlmRev|star|Student Teacher Achievement Ratio (STAR) project data|26796|18
7676

77+
# How to add datasets from a new package
78+
79+
**Step 1: add the data from the package**
80+
81+
1. In your clone of this repo `mkdir -p data/$PKG`
82+
2. Go to CRAN
83+
3. Download the *source package*
84+
4. Extract one or more of the dataset files from the source package's `data` directory into the new `data/$PKG` directory
85+
86+
**Step 2: add the metadata**
87+
88+
Run the script:
89+
90+
$ scripts/update_doc_one.sh $PKG
91+
92+
Now it's ready for you to submit your pull request.
93+
7794
# Licensing and Intellectual Property
7895

7996
Following Vincent's lead, we have assumed that all of the data sets in this repository can be made available under the GPL-3 license. If you know that one of the datasets released here should not be released publicly or if you know that a data set can only be released under a different license, please contact me so that I can remove the data set from this repository.

doc/datasets.csv

Lines changed: 30 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -506,6 +506,36 @@
506506
"datasets","volcano","Topographic Information on Auckland's Maunga Whau Volcano",87,61
507507
"datasets","warpbreaks","The Number of Breaks in Yarn during Weaving",54,3
508508
"datasets","women","Average Heights and Weights for American Women",15,2
509+
"gamair","aral","aral",488,4
510+
"gamair","aral.bnd","aral.bnd",107,3
511+
"gamair","bird","bird",25100,7
512+
"gamair","blowfly","blowfly",180,3
513+
"gamair","bone","bone",23,4
514+
"gamair","brain","brain",1567,6
515+
"gamair","cairo","cairo",3780,7
516+
"gamair","chicago","chicago",5114,8
517+
"gamair","chl","chl",13840,7
518+
"gamair","co2s","co2s",507,4
519+
"gamair","coast","coast",2091,3
520+
"gamair","engine","engine",19,3
521+
"gamair","gas","gas",60,804
522+
"gamair","harrier","harrier",37,3
523+
"gamair","hubble","hubble",24,4
524+
"gamair","ipo","ipo",156,7
525+
"gamair","mack","mack",634,17
526+
"gamair","mackp","mackp",1162,9
527+
"gamair","med","med",1476,25
528+
"gamair","meh","meh",1476,24
529+
"gamair","mpg","mpg",205,27
530+
"gamair","prostate","prostate",654,530
531+
"gamair","sitka","sitka",1027,6
532+
"gamair","sole","sole",1575,8
533+
"gamair","sperm.comp1","sperm.comp1",15,5
534+
"gamair","sperm.comp2","sperm.comp2",24,11
535+
"gamair","stomata","stomata",24,4
536+
"gamair","swer","swer",2196,10
537+
"gamair","wesdr","wesdr",669,5
538+
"gamair","wine","wine",47,8
509539
"gap","PD","A study of Parkinson's disease and APOE, LRRK2, SNCA makers",825,22
510540
"gap","aldh2","ALDH2 markers and Alcoholism",263,18
511541
"gap","apoeapoc","APOE/APOC1 markers and Alzheimer's",353,8
@@ -732,33 +762,3 @@
732762
"vcd","VonBort","Von Bortkiewicz Horse Kicks Data",280,4
733763
"vcd","WeldonDice","Weldon's Dice Data",11,2
734764
"vcd","WomenQueue","Women in Queues",11,2
735-
"gamair","aral.bnd","aral.bnd",107,3
736-
"gamair","aral","aral",488,4
737-
"gamair","bird","bird",25100,7
738-
"gamair","blowfly","blowfly",180,3
739-
"gamair","bone","bone",23,4
740-
"gamair","brain","brain",1567,6
741-
"gamair","cairo","cairo",3780,7
742-
"gamair","chicago","chicago",5114,8
743-
"gamair","chl","chl",13840,7
744-
"gamair","co2s","co2s",507,4
745-
"gamair","coast","coast",2091,3
746-
"gamair","engine","engine",19,3
747-
"gamair","gas","gas",60,804
748-
"gamair","harrier","harrier",37,3
749-
"gamair","hubble","hubble",24,4
750-
"gamair","ipo","ipo",156,7
751-
"gamair","mack","mack",634,17
752-
"gamair","mackp","mackp",1162,9
753-
"gamair","med","med",1476,25
754-
"gamair","meh","meh",1476,24
755-
"gamair","mpg","mpg",205,27
756-
"gamair","prostate","prostate",654,530
757-
"gamair","sitka","sitka",1027,6
758-
"gamair","sole","sole",1575,8
759-
"gamair","sperm.comp1","sperm.comp1",15,5
760-
"gamair","sperm.comp2","sperm.comp2",24,11
761-
"gamair","stomata","stomata",24,4
762-
"gamair","swer","swer",2196,10
763-
"gamair","wesdr","wesdr",669,5
764-
"gamair","wine","wine",47,8

scripts/update_doc_all.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
#!/bin/sh
# Regenerate doc/packages.csv, doc/datasets.csv and the per-dataset HTML
# documentation for every package under data/. Run from the repository root.
set -e
R --no-save <<END
source("src/update_doc.r")
update_docs(".")
END

scripts/update_doc_one.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
#!/bin/sh
# Add or refresh the doc metadata for a single package.
# Usage (from the repository root): scripts/update_doc_one.sh PKG
set -e
if [ -z "$1" ]; then
    echo "usage: $0 PACKAGE" >&2
    exit 1
fi
# The unquoted here-doc delimiter is deliberate: "$1" must expand so the
# package name is interpolated into the R call below.
R --no-save <<END
source("src/update_doc.r")
update_package_doc(".", "$1")
END

src/dataset.jl

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,11 @@ const Dataset_typedetect_rows = Dict{Tuple{String, String}, Union{Vector,Dict}}(
77
function dataset(package_name::AbstractString, dataset_name::AbstractString)
88
basename = joinpath(@__DIR__, "..", "data", package_name)
99

10+
rdataname = joinpath(basename, string(dataset_name, ".RData"))
11+
if isfile(rdataname)
12+
return load(rdataname)[dataset_name]
13+
end
14+
1015
rdaname = joinpath(basename, string(dataset_name, ".rda"))
1116
if isfile(rdaname)
1217
return load(rdaname)[dataset_name]

src/update_doc.r

Lines changed: 129 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,129 @@
1+
# Install `packages` from CRAN.
#
# When no mirror has been configured (the interactive "@CRAN@" placeholder
# is still in place), fall back to the RStudio mirror over HTTPS. Warnings
# from install.packages (e.g. "package already installed") are suppressed
# so repeated runs stay quiet.
install_packages <- function(packages) {
  r <- getOption("repos")
  if (r["CRAN"] == "@CRAN@") {
    # Only override the CRAN entry; keeping the named-vector shape
    # preserves any additional repositories the user has configured.
    r["CRAN"] <- "https://cran.rstudio.com/"
  }
  suppressWarnings({install.packages(packages, repos = r)})
}
9+
10+
# Bootstrap: make sure R2HTML is installed, then attach it.
# NOTE(review): write_doc below renders help pages via tools::Rd2HTML, not
# R2HTML — confirm this dependency is still required.
install_packages(c("R2HTML"))
library(R2HTML)
12+
13+
# Render the R help page for `dataset` (from installed `package`) as HTML
# at doc/<package>/<dataset>.html, relative to the current directory.
#
# If the help page cannot be located or parsed, emit a warning and skip the
# dataset instead of handing a try-error object to tools::Rd2HTML (which
# would abort the whole metadata update for one missing help page).
write_doc <- function(package, dataset) {
  help.ref <- try(help(eval(dataset), package = eval(package)), silent = TRUE)
  # utils:::.getHelpFile is internal API; it is the documented-nowhere but
  # conventional way to obtain the parsed Rd object for a help reference.
  doc <- try(utils:::.getHelpFile(help.ref), silent = TRUE)
  if (inherits(doc, "try-error")) {
    warning("No help page found for ", package, "::", dataset, call. = FALSE)
    return(invisible(NULL))
  }
  dir.create(paste0("doc/", package), showWarnings = FALSE, recursive = TRUE)
  fn_doc <- paste0("doc/", package, "/", dataset, ".html")
  tools::Rd2HTML(doc, out = fn_doc)
}
20+
21+
# Refresh the documentation metadata for one installed `package`:
#   - append the package's Package/Title row to `package_df`
#   - for every dataset file under <data_dir>/<package>, write its HTML help
#     page and append a Package/Dataset/Title/Rows/Columns row to `dataset_df`
#   - when freshly computed dimensions disagree with those previously stored
#     in `old_dataset_df`, record the discrepancy in `mismatched_dims_df`
# Returns a list(package_df, dataset_df, mismatched_dims_df) with the updated
# frames; R passes arguments by value, so callers must use the return value.
do_package_update <- function(data_dir, package_df, old_dataset_df, dataset_df, mismatched_dims_df, package) {
  # Attach the package so data() and help() below can find its datasets.
  suppressWarnings({library(package, character.only = TRUE)})

  # Get package description (Package name and Title from DESCRIPTION)
  pdesc <- packageDescription(package)
  new_row <- as.data.frame(pdesc[c("Package", "Title")],
                           stringsAsFactors = FALSE)
  package_df <- rbind(package_df, new_row)

  # Table of datasets shipped by the package (Item/Title columns used below).
  pdat <- data(package=package)$results

  datasets <- dir(path = file.path(data_dir, package))

  # Trim filenames to dataset names; abort loudly on any file whose
  # extension is not one of the three supported formats.
  r <- "(.+)\\.(csv\\.gz|rda|RData)$"
  format_recognized <- grepl(r, datasets)
  if (!(all(format_recognized))) {
    stop("Unrecognized formats:\n",
         cat(datasets[!format_recognized], sep = "\n"))
  }
  datanames <- sub(r, "\\1", datasets)

  for (dataname in datanames) {
    # data() takes an unquoted dataset name, hence the eval(parse()) here.
    evaltext = paste0("data(", dataname, ", package=package)")
    eval(parse(text = evaltext))
    ds <- get(dataname)

    # Emit doc/<package>/<dataname>.html from the package's help page.
    write_doc(package, dataname)

    # Get dataset description; require exactly one Title per dataset.
    title <- unique(pdat[, "Title"][pdat[, "Item"] == dataname])
    if (length(title) != 1) {
      stop(package, "/", dataname, " had ", length(title), " descriptions.")
    }

    # Old dims to fall back on (NA when the dataset is new to the CSV).
    old_row = subset(old_dataset_df,
                     Dataset == dataname & Package == package)
    nr <- if (nrow(old_row)) old_row$Rows[[1]] else NA
    nc <- if (nrow(old_row)) old_row$Columns[[1]] else NA

    # Check against new dims when simple; tables, ltraj objects and plain
    # lists are skipped because NROW/NCOL are not meaningful for them here.
    new_nr <- NROW(ds)
    new_nc <- NCOL(ds)
    if (!(any(c("table", "ltraj") %in% class(ds))) &&
        class(ds) != "list" &&
        is.numeric(new_nr) &&
        is.numeric(new_nc)) {

      expected_cols <- c(nc, nc - 1) # row.names sometimes included
      if (!is.numeric(nr) || !is.numeric(nc)) {
        # No usable old dims: adopt the freshly computed ones.
        nr <- new_nr
        nc <- new_nc
      } else if (new_nr != nr || !(new_nc %in% expected_cols)) {
        # Dims changed: keep the old values in dataset_df but log the
        # mismatch so a human can investigate.
        new_row <- data.frame(Package = package,
                              Dataset = dataname,
                              Class = class(ds),
                              OldRows = nr,
                              OldColumns = nc,
                              NewRows = new_nr,
                              NewColumns = new_nc)

        mismatched_dims_df <- rbind(mismatched_dims_df, new_row)

      }
    }

    new_row <- data.frame(Package = package,
                          Dataset = dataname,
                          Title = title,
                          Rows = nr,
                          Columns = nc,
                          stringsAsFactors = FALSE)

    dataset_df <- rbind(dataset_df, new_row)
  }
  return(list(package_df = package_df, dataset_df = dataset_df, mismatched_dims_df = mismatched_dims_df))
}
100+
101+
# Add (or refresh) the doc metadata for a single `package` in the repository
# rooted at `pkg_dir`: read doc/packages.csv and doc/datasets.csv, install
# and process the package, then sort and rewrite both CSVs.
# Returns the data frame of datasets whose dimensions changed.
#
# NOTE(review): the freshly read dataset_df is passed as BOTH the "old"
# reference frame and the accumulating frame, so re-running this for a
# package already present in the CSV looks like it would append duplicate
# rows — confirm that clean() (defined elsewhere in this file) deduplicates.
update_package_doc <- function(pkg_dir, package) {
  data_dir <- file.path(pkg_dir, "data")
  doc_dir <- file.path(pkg_dir, "doc")

  package_fn <- file.path(doc_dir, "packages.csv")
  dataset_fn <- file.path(doc_dir, "datasets.csv")

  package_df <- read.csv(package_fn)
  dataset_df <- read.csv(dataset_fn)

  install_packages(c(package))

  mismatched_dims_df <- data.frame()
  dfs <- do_package_update(data_dir, package_df, dataset_df, dataset_df, mismatched_dims_df, package)
  package_df <- dfs$package_df
  dataset_df <- dfs$dataset_df
  mismatched_dims_df <- dfs$mismatched_dims_df

  # Normalize and sort so the CSVs stay in a stable, reviewable order.
  package_df <- sort_upper_first(clean(package_df), c("Package"))
  dataset_df <- sort_upper_first(clean(dataset_df), c("Package", "Dataset"))

  # `write` here is this file's CSV helper, not base::write.
  write(package_df, package_fn)
  write(dataset_df, dataset_fn)
  return(mismatched_dims_df)
}
126+
1127
update_docs <- function(pkg_dir) {
2128
data_dir <- file.path(pkg_dir, "data")
3129
doc_dir <- file.path(pkg_dir, "doc")
@@ -16,90 +142,11 @@ update_docs <- function(pkg_dir) {
16142
# Install any missing packages
17143
new_packages <- packages[!(packages %in% installed.packages()[, "Package"])]
18144
if (length(new_packages)) {
19-
# Pick a decent mirror if none set
20-
r <- getOption("repos")
21-
if (r["CRAN"] == "@CRAN@") {
22-
r <- "http://cran.rstudio.com/"
23-
}
24-
suppressWarnings({install.packages(new_packages, repos = r)})
145+
install_packages(new_packages)
25146
}
26147

27148
for (package in packages) {
28-
suppressWarnings({library(package, character.only = TRUE)})
29-
30-
# Get package description
31-
pdesc <- packageDescription(package)
32-
new_row <- as.data.frame(pdesc[c("Package", "Title")],
33-
stringsAsFactors = FALSE)
34-
package_df <- rbind(package_df, new_row)
35-
36-
pdat <- data(package=package)$results
37-
38-
datasets <- dir(path = file.path(data_dir, package))
39-
40-
# Trim filenames to dataset names
41-
r <- "(.+)\\.(csv\\.gz|rda)$"
42-
format_recognized <- grepl(r, datasets)
43-
if (!(all(format_recognized))) {
44-
stop("Unrecognized formats:\n",
45-
cat(datasets[!format_recognized], sep = "\n"))
46-
}
47-
datanames <- sub(r, "\\1", datasets)
48-
49-
for (dataname in datanames) {
50-
eval(parse(text = paste0("data(", dataname, ", package=package)")))
51-
ds <- get(dataname)
52-
53-
# TODO: Write rst and html doc per dataset
54-
55-
# Get dataset description
56-
title <- unique(pdat[, "Title"][pdat[, "Item"] == dataname])
57-
if (length(title) != 1) {
58-
stop(package, "/", title, " had ", length(title), " descriptions.")
59-
}
60-
61-
# Old dims to fall back on
62-
old_row = subset(old_dataset_df,
63-
Dataset == dataname & Package == package)
64-
nr <- if (nrow(old_row)) old_row$Rows[[1]] else NA
65-
nc <- if (nrow(old_row)) old_row$Columns[[1]] else NA
66-
67-
# Check against new dims when simple
68-
new_nr <- NROW(ds)
69-
new_nc <- NCOL(ds)
70-
if (!(any(c("table", "ltraj") %in% class(ds))) &&
71-
class(ds) != "list" &&
72-
is.numeric(new_nr) &&
73-
is.numeric(new_nc)) {
74-
75-
expected_cols <- c(nc, nc - 1) # row.names sometimes included
76-
if (!is.numeric(nr) || !is.numeric(nc)) {
77-
nr <- new_nr
78-
nc <- new_nc
79-
} else if (new_nr != nr || !(new_nc %in% expected_cols)) {
80-
81-
new_row <- data.frame(Package = package,
82-
Dataset = dataname,
83-
Class = class(ds),
84-
OldRows = nr,
85-
OldColumns = nc,
86-
NewRows = new_nr,
87-
NewColumns = new_nc)
88-
89-
mismatched_dims_df <- rbind(mismatched_dims_df, new_row)
90-
91-
}
92-
}
93-
94-
new_row <- data.frame(Package = package,
95-
Dataset = dataname,
96-
Title = title,
97-
Rows = nr,
98-
Columns = nc,
99-
stringsAsFactors = FALSE)
100-
101-
dataset_df <- rbind(dataset_df, new_row)
102-
}
149+
do_package_update(data_dir, package_df, old_dataset_df, dataset_df, mismatched_dims_df, package)
103150
}
104151

105152
stopifnot(nrow(dataset_df) > 0)
@@ -114,7 +161,7 @@ update_docs <- function(pkg_dir) {
114161
}
115162

116163
# Serialize a metadata data frame `df` to CSV at path `fn`.
# qmethod = "double" doubles embedded quotes so quoted titles round-trip
# cleanly through read.csv (the "escape" method does not).
# NOTE(review): this deliberately shadows base::write inside this file.
write <- function(df, fn) {
  write.table(df, file = fn, sep = ",", qmethod = "double", row.names = FALSE)
}
119166

120167
clean <- function(df) {

0 commit comments

Comments
 (0)