-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy path01_meta_simulation.R
More file actions
76 lines (63 loc) · 3.7 KB
/
01_meta_simulation.R
File metadata and controls
76 lines (63 loc) · 3.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
#LIBRARY########################################################################
library(dplyr) #data manipulation
library(magrittr) #data manipulation
library(stringi) #String manipulation
library(stringr) #String manipulation
library(simstudy) #Generate simulated data
library(data.table) #Data manipulation
#SEED###########################################################################
# Fix the random number generator state so the random draws made later in the
# script are repeatable from run to run.
set.seed(951823)
#IMPORT FILES###################################################################
# Countries and the continent each one belongs to.
Ctry_Cont <- read.csv(
  "data-raw/source/Countries_Continents.csv",
  header = TRUE
)
# Countries and their income groups.
Ctry_income <- read.csv(
  "data-raw/source/income_groups.csv",
  header = TRUE
)
#Set the distribution
# Self-defined metadata distribution. The file is read with the "UTF-8-BOM"
# encoding, then converted to a data.table by reference.
Ava_dist <- read.csv(
  "data-raw/source/AvailabilityDist.csv",
  header = TRUE,
  fileEncoding = "UTF-8-BOM"
)
setDT(Ava_dist)
#METADATA GENERATOR#############################################################
##Generate Study name###########################################################
###DO NOT RERUN START###########################################################
#stri_rand_lipsum CANNOT guarantee the same results even with set seed
#Save the result and DO NOT re-run this section
# studyname <- stringi::stri_rand_lipsum(2) %>% # Generates (pseudo)random lorem ipsum text
# stringi::stri_split_regex( "\\p{Punct}", omit_empty = TRUE) %>% # Separate string by punctuation
# base::unlist() %>% # unlist
# base::trimws("l") %>% # remove leading spaces
# str_to_title() %>% # convert first letter to uppercase
# stringr::str_trunc(width = 30, side ='right', ellipsis = '') # Truncated for easier presentation
# studyname <- studyname[str_detect(studyname, " ")] # Remove study name with only one word
# nstudy <- length(studyname)
# ##Generate Study Acronyms
# studyacr <- base::abbreviate(studyname, minlength = 3, use.classes = TRUE,
# dot = FALSE, strict = FALSE,
# method = c("left.kept", "both.sides"), named = TRUE)
# metatemp <- data.frame(STUDY = studyacr, FULLNAME = studyname)
# row.names(metatemp) <- NULL
#
# save(metatemp, file = "source/metatemp.RData")
###DO NOT RERUN END#############################################################
###Load the generated study name################################################
#load("source/metatemp.RData")
##CONTINENT, COUNTRY, INCOMEGROUP###############################################
# `metatemp` (study acronyms and full names) is not created by this script.
# Fail early with a clear message if it has not been loaded into the session.
if (!exists("metatemp")) {
  stop(
    "`metatemp` not found: load the saved study names before running this ",
    "section (see the commented `load()` call above).",
    call. = FALSE
  )
}
# Word bank: each country together with its continent and income group.
Ctry_Cont_Inc <- merge(Ctry_Cont, Ctry_income, by = "Country")
# Draw five countries per continent, then drop the grouping so it does not
# leak into later steps.
Ctry_sample <- Ctry_Cont_Inc %>%
  group_by(Continent) %>%
  sample_n(size = 5) %>%
  ungroup()
# Combining data frames column-wise can recycle the shorter one when the row
# counts are whole multiples of each other, so require an exact match.
stopifnot(nrow(metatemp) == nrow(Ctry_sample))
# Attach one sampled country (with continent and income group) to each study.
simmeta <- cbind(
  metatemp,
  Ctry_sample[, c("Continent", "Country", "IncomeGroup")]
)
##Other metadata################################################################
# Size every simulated column from the table itself rather than a literal, so
# the draws always match the number of studies.
n_study <- nrow(simmeta)
simmeta$ACCESS <- rbinom(n_study, 1, 0.85) #Data can be accessed through Repository
simmeta$STUDYFOLLOW <- rbinom(n_study, 1, 0.7) #Follow up data available
# 18 appears three times in the pool, so it is drawn more often than the others.
simmeta$MINAGE <- sample(
  c(18, 18, 18, 21, 25, 30, 35, 40, 60, 65, 70, 75, 80),
  n_study,
  replace = TRUE
) #Minimum age at recruitment
simmeta$STUDYSIZE <- sample(500:5000, n_study, replace = TRUE) #Sample Size
##Some formatting###############################################################
# Upper-case every column name.
names(simmeta) <- toupper(names(simmeta))
# Sort the studies by their acronym.
simmeta <- arrange(simmeta, STUDY)
# Final column layout: identifiers, access flags, geography, then age and size.
col_order <- c(
  "STUDY", "FULLNAME",
  "ACCESS", "STUDYFOLLOW",
  "CONTINENT", "COUNTRY", "INCOMEGROUP",
  "MINAGE", "STUDYSIZE"
)
# Keep exactly these nine columns, in this order.
simmeta <- simmeta[, col_order]
##Data Availability by categories###############################################
# Simulate one record per study from the definitions in Ava_dist. The count is
# taken from simmeta so the two tables always line up row for row.
Ava <- genData(nrow(simmeta), Ava_dist)
# Append the simulated columns and drop the `id` column
# (presumably the identifier added by genData -- confirm).
simmeta <- cbind(simmeta, Ava) %>%
  select(-id)
#SAVE DATA######################################################################
# Write the finished metadata table to disk.
save(simmeta, file = "data-raw/temp/simmeta.RData")