Skip to content

Commit 379b920

Browse files
committed
Add penguins and penguins_raw datasets
1 parent 5c231d7 commit 379b920

File tree

6 files changed

+1701
-0
lines changed

6 files changed

+1701
-0
lines changed
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# Code adapted from the palmerpenguins package
2+
# by Allison Horst, Alison Hill, and Kristen Gorman
3+
# https://github.com/allisonhorst/palmerpenguins
4+
5+
source("./src/library/datasets/data/penguins_raw.R")
6+
7+
penguins <- penguins_raw[, c("Species", "Island",
8+
"Culmen Length (mm)", "Culmen Depth (mm)",
9+
"Flipper Length (mm)", "Body Mass (g)",
10+
"Sex", "Date Egg")]
11+
colnames(penguins) <- c(
12+
"species", "island", "bill_length_mm", "bill_depth_mm", "flipper_length_mm",
13+
"body_mass_g", "sex", "year"
14+
)
15+
# Keep only the leading run of word characters from each species string
# (presumably the common name, e.g. "Adelie" -- confirm against the raw data).
# NOTE(review): regmatches() returns matches only, so this assignment relies on
# every species value matching the pattern.
penguins$species <- regmatches(penguins$species,
16+
regexpr("^\\w+\\b", penguins$species))
17+
penguins$species <- as.factor(penguins$species)
18+
penguins$island <- as.factor(penguins$island)
19+
penguins$flipper_length_mm <- as.integer(penguins$flipper_length_mm)
20+
penguins$body_mass_g <- as.integer(penguins$body_mass_g)
21+
penguins$sex <- tolower(penguins$sex)
22+
penguins$sex <- as.factor(penguins$sex)
23+
# `year` still holds the values of the "Date Egg" column at this point; pull
# out the first four-digit run and store it as an integer.
# NOTE(review): regmatches() returns matches only, so this assignment relies on
# every value containing a four-digit run.
penguins$year <- regmatches(penguins$year,
24+
regexpr("\\d{4}", penguins$year))
25+
penguins$year <- as.integer(penguins$year)
26+
27+
dump("penguins", "./src/library/datasets/data/penguins.R")
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
# Code adapted from the palmerpenguins package
2+
# by Allison Horst, Alison Hill, and Kristen Gorman
3+
# https://github.com/allisonhorst/palmerpenguins
4+
5+
# penguins raw ------------------------------------------------------------
6+
7+
# Download raw data
8+
# Adelie penguin data from: https://doi.org/10.6073/pasta/abc50eed9138b75f54eaada0841b9b86
9+
uri_adelie <- "https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-pal.219.3&entityid=002f3893385f710df69eeebe893144ff"
10+
11+
# Gentoo penguin data from: https://doi.org/10.6073/pasta/2b1cff60f81640f182433d23e68541ce
12+
uri_gentoo <- "https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-pal.220.3&entityid=e03b43c924f226486f2f0ab6709d2381"
13+
14+
# Chinstrap penguin data from: https://doi.org/10.6073/pasta/409c808f8fc9899d02401bdb04580af7
15+
uri_chinstrap <- "https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-pal.221.2&entityid=fe853aa8f7a59aa84cdd3197619ef462"
16+
17+
# Combining the URIs
18+
uris <- c(uri_adelie, uri_gentoo, uri_chinstrap)
19+
20+
# Download data and combine into one dataframe
21+
penguins_raw_list <- lapply(uris, read.csv)
22+
penguins_raw <- do.call(rbind, penguins_raw_list)
23+
24+
# Adjustments to make penguins_raw identical to palmerpenguins:::penguins_raw
25+
# Column names are still in their dotted form here (presumably as produced by
# read.csv() -- confirm); they are replaced with the spaced names by the
# colnames() assignment further down.
# Coerce the sample number, flipper length and body mass to numeric, and the
# egg date to Date.
penguins_raw$Sample.Number <- as.numeric(penguins_raw$Sample.Number)
26+
penguins_raw$Date.Egg <- as.Date(penguins_raw$Date.Egg)
27+
penguins_raw$Flipper.Length..mm. <- as.numeric(penguins_raw$Flipper.Length..mm.)
28+
penguins_raw$Body.Mass..g. <- as.numeric(penguins_raw$Body.Mass..g.)
29+
# Treat empty strings (and "." for Sex) as missing values.
penguins_raw$Sex <- replace(penguins_raw$Sex, penguins_raw$Sex %in% c("", "."), NA)
30+
penguins_raw$Comments <- replace(penguins_raw$Comments, penguins_raw$Comments == "", NA)
31+
32+
colnames(penguins_raw) <- c(
33+
"studyName", "Sample Number", "Species", "Region", "Island", "Stage",
34+
"Individual ID", "Clutch Completion", "Date Egg", "Culmen Length (mm)",
35+
"Culmen Depth (mm)", "Flipper Length (mm)", "Body Mass (g)", "Sex",
36+
"Delta 15 N (o/oo)", "Delta 13 C (o/oo)", "Comments"
37+
)
38+
39+
# add sample numbers that correspond to test/train set in Gorman et al. (2014)
40+
# these have been provided by Kristen Gorman
41+
ADPE_train_sample_nums <- c(
42+
1, 2, 3, 5, 14, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 31, 32, 33,
43+
34, 41, 42, 43, 45, 46, 47, 49, 50, 52, 56, 57, 61, 62, 63, 64, 66, 67, 71,
44+
73, 74, 76, 78, 81, 84, 85, 88, 89, 91, 92, 93, 94, 95, 96, 98, 99, 102,
45+
104, 105, 107, 108, 112, 113, 115, 116, 117, 118, 119, 120, 123, 124, 125,
46+
128, 129, 130, 133, 136, 138, 142, 143, 144, 145, 147, 148, 149, 150, 151,
47+
152
48+
)
49+
50+
ADPE_test_sample_nums <- c(
51+
6, 13, 15, 28, 35, 36, 37, 38, 44, 51, 53, 54, 55, 58, 59, 60, 65, 68, 72,
52+
75, 77, 79, 80, 82, 83, 86, 87, 90, 97, 100, 101, 103, 106, 109, 110, 111,
53+
114, 126, 127, 134, 135, 137, 141, 146
54+
)
55+
56+
CHPE_train_sample_nums <- c(
57+
3, 5, 6, 7, 8, 9, 13, 15, 16, 19, 22, 29, 30, 32, 33, 34, 35, 37, 41, 42,
58+
43, 45, 47, 48, 50, 52, 53, 54, 55, 56, 57, 58, 61, 63, 67, 68
59+
)
60+
61+
CHPE_test_sample_nums <- c(4, 10, 11, 12, 14, 20, 21, 31, 36, 38, 44, 46, 49,
62+
51, 59, 60, 62, 64)
63+
64+
GEPE_train_sample_nums <- c(
65+
2, 4, 5, 7, 9, 10, 13, 14, 15, 20, 21, 22, 24, 25, 26, 28, 30, 31, 32, 33,
66+
34, 35, 36, 37, 38, 39, 40, 44, 49, 50, 52, 53, 54, 55, 60, 62, 63, 64, 65,
67+
66, 69, 70, 73, 75, 76, 77, 78, 79, 82, 84, 85, 86, 89, 90, 91, 93, 94, 95,
68+
97, 98, 99, 101, 102, 103, 106, 109, 110, 112, 114, 115, 118, 121, 123, 124
69+
)
70+
71+
GEPE_test_sample_nums <- c(
72+
1, 3, 6, 8, 16, 17, 18, 19, 23, 29, 43, 45, 46, 51, 56, 57, 58, 59, 61, 68,
73+
71, 72, 74, 80, 81, 83, 87, 88, 92, 96, 100, 104, 107, 108, 111, 113, 116,
74+
122
75+
)
76+
77+
# get count of each species
78+
n_Adelie <- sum(grepl("Adelie", penguins_raw$Species))
79+
n_Gentoo <- sum(grepl("Gentoo", penguins_raw$Species))
80+
n_Chinstrap <- sum(grepl("Chinstrap", penguins_raw$Species))
81+
82+
# vector of train/test for each species, then together
83+
# Each per-species vector starts as all NA with one slot per row of that
# species. The sample numbers are used directly as positions in the vector, so
# any position not listed in either the train or the test set stays NA.
Adelie_sample <- rep(NA, n_Adelie)
84+
Adelie_sample[ADPE_train_sample_nums] <- "train"
85+
Adelie_sample[ADPE_test_sample_nums] <- "test"
86+
Gentoo_sample <- rep(NA, n_Gentoo)
87+
Gentoo_sample[GEPE_train_sample_nums] <- "train"
88+
Gentoo_sample[GEPE_test_sample_nums] <- "test"
89+
Chinstrap_sample <- rep(NA, n_Chinstrap)
90+
Chinstrap_sample[CHPE_train_sample_nums] <- "train"
91+
Chinstrap_sample[CHPE_test_sample_nums] <- "test"
92+
# Concatenate in the same species order as `uris` (Adelie, Gentoo, Chinstrap),
# which is the order in which the raw data frames were row-bound.
Sample <- c(Adelie_sample, Gentoo_sample, Chinstrap_sample)
93+
94+
# Add sample column to penguins_raw
95+
penguins_raw$Sample <- Sample
96+
97+
dump("penguins_raw", "./src/library/datasets/data/penguins_raw.R")

0 commit comments

Comments
 (0)