Skip to content
This repository was archived by the owner on Oct 12, 2023. It is now read-only.

Commit a6e51c9

Browse files
authored
Feature/container (#153)
* force add PATH to current user * checkin docker setup script * Update cluster_setup.sh * install docker and start container on cluster setup * WIP: Run task in container * fix merge conflict * run tasks and merge task from within container * refactor code to proper docker commands and make a single R container per job * refactor command line utils into its own file * refactor job utilities into its own file * move cluster setup script to inst folder * remove unnecessary curl installs * remove starting container from setup script * check in WIP * add apt_install file * make required directories * update cluster setup files as needed * include libxml2 packages in apt installs * working cluster create with cran and github dependencies * update job prep to install apt-get and not each task * use rocker containers instead of r-base * remove unused & commented code * remove unused install function * address several lintr issues * initial test dockerfile * add spacing between commands * temporarily point wget to feature branch * update bioconductor install for non-jobPrep installs * Delete Dockerfile * minor changes to install bioc * resolve merge conflicts * update cluster to correctly install BioC packages using install_bioconductor * fix issue where some packages were not getting installed * add missing BioConductorCommand initializer * remove print lines * initial dockerfile implementations * update docker files * Only install packages if they are required * Remove requirement on bioconductor installer script on start task * remove duplicate environment variable entry * update docs for container support * update version to 0.6.0 * refactor changes updates * remove poorly formatted whitespaces * add full path to pacakges directory * fix docker command line * update file share sample * update azure files cluster name * update mandelbrot sample * update package management sample * update plyr samples * make montecarlo sample more consistent * update montecarlo sample * 
remove plyr example * fix bad environment pointer * fix linter issues * more linter fixes * more linter issues * use latest rAzureBatch version * update resource files example * remove reference to deleted sample * pr feedback * PR docs feedback * Print errors from worker (#154) * Fixed pool package command line lintr test * Package installation tests fixed - too long lines * Fixed json in customize cluster docs * Fix: Typos in customize cluster docs * Cleaning up files * Feature/githubbiopackage (#150) * install github package worked for foreach loop * fix lintr error * tests for github and bioc packages installation * lintr fix * add back lost code due to merge and update docs * The Travis CI build failed for feature/githubbiopackage * remove incorrect parameter for install_github * Updated job prep task to have default command * Use the latest version of rAzureBatch * Updated description + Generate cluster config * Fix: Bioconductor and Github packages installation (#155) * Added multiple package install test and fix obj reading args * Fixed naming for packages install * Replaced validation exclusion for linter * Fixed test validate test * Fixing all interactive tests with skip * Fixed renaming validation * Removed default test - cannot be tested * Removed in validation * Added cluster package install tests (#156)
1 parent 4eb3773 commit a6e51c9

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

50 files changed

+1735
-949
lines changed

.lintr

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
exclusions: list("R/validators.R")
1+
exclusions: list("R/validationUtilities.R")

DESCRIPTION

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
Package: doAzureParallel
22
Type: Package
33
Title: doAzureParallel
4-
Version: 0.5.0
4+
Version: 0.6.0
55
Author: Brian Hoang
66
Maintainer: Brian Hoang <[email protected]>
77
Description: The project is for data experts who use R at scale. The project
@@ -17,7 +17,7 @@ Depends:
1717
foreach (>= 1.4.3),
1818
iterators (>= 1.0.8)
1919
Imports:
20-
rAzureBatch (>= 0.5.1),
20+
rAzureBatch (>= 0.5.3),
2121
jsonlite,
2222
rjson,
2323
xml2,
@@ -27,5 +27,5 @@ Suggests:
2727
caret,
2828
plyr,
2929
lintr
30-
Remotes: Azure/[email protected].1
30+
Remotes: Azure/[email protected].3
3131
RoxygenNote: 6.0.1

R/cluster.R

Lines changed: 67 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -86,9 +86,11 @@ generateClusterConfig <- function(fileName) {
8686
max = 3),
8787
autoscaleFormula = "QUEUE"
8888
),
89+
containerImage = "rocker/tidyverse:latest",
8990
rPackages = list(
9091
cran = vector(),
9192
github = vector(),
93+
bioconductor = vector(),
9294
githubAuthenticationToken = ""
9395
),
9496
commandLine = vector()
@@ -143,6 +145,7 @@ makeCluster <-
143145

144146
installCranCommand <- NULL
145147
installGithubCommand <- NULL
148+
installBioconductorCommand <- NULL
146149

147150
if (!is.null(poolConfig$rPackages) &&
148151
!is.null(poolConfig$rPackages$cran) &&
@@ -158,21 +161,63 @@ makeCluster <-
158161
getPoolPackageInstallationCommand("github", poolConfig$rPackages$github)
159162
}
160163

161-
packages <- NULL
162-
if (!is.null(installCranCommand)) {
163-
packages <- installCranCommand
164+
if (!is.null(poolConfig$rPackages) &&
165+
!is.null(poolConfig$rPackages$bioconductor) &&
166+
length(poolConfig$rPackages$bioconductor) > 0) {
167+
installBioconductorCommand <-
168+
getPoolPackageInstallationCommand("bioconductor", poolConfig$rPackages$bioconductor)
164169
}
165170

166-
if (!is.null(installGithubCommand) && is.null(packages)) {
167-
packages <- installGithubCommand
171+
packages <- c()
172+
if (!is.null(installCranCommand)) {
173+
packages <- c(installCranCommand, packages)
174+
}
175+
if (!is.null(installGithubCommand)) {
176+
packages <- c(installGithubCommand, packages)
177+
}
178+
if (!is.null(installBioconductorCommand)) {
179+
packages <- c(installBioconductorCommand, packages)
168180
}
169-
else if (!is.null(installGithubCommand) && !is.null(packages)) {
170-
packages <- c(installCranCommand, installGithubCommand)
181+
182+
if (length(packages) == 0) {
183+
packages <- NULL
171184
}
172185

173186
commandLine <- NULL
187+
188+
# install docker and create docker container
189+
dockerImage <- "rocker/tidyverse:latest"
190+
if (!is.null(poolConfig$containerImage)) {
191+
dockerImage <- poolConfig$containerImage
192+
}
193+
194+
config$containerImage <- dockerImage
195+
installAndStartContainerCommand <- paste("cluster_setup.sh",
196+
dockerImage,
197+
sep = " ")
198+
199+
containerInstallCommand <- c(
200+
#TODO: Updates branch to point at master!
201+
paste0(
202+
"wget https://raw.githubusercontent.com/Azure/doAzureParallel/",
203+
"feature/container_wip/inst/startup/cluster_setup.sh"),
204+
"chmod u+x cluster_setup.sh",
205+
paste0(
206+
"wget https://raw.githubusercontent.com/Azure/doAzureParallel/",
207+
"feature/container_wip/inst/startup/install_bioconductor.R"),
208+
"chmod u+x install_bioconductor.R",
209+
installAndStartContainerCommand
210+
)
211+
174212
if (!is.null(poolConfig$commandLine)) {
175-
commandLine <- poolConfig$commandLine
213+
commandLine <- c(containerInstallCommand, poolConfig$commandLine)
214+
}
215+
216+
if (!is.null(packages)) {
217+
# install packages
218+
commandLine <-
219+
c(commandLine,
220+
dockerRunCommand(dockerImage, packages, NULL, FALSE, FALSE))
176221
}
177222

178223
environmentSettings <- NULL
@@ -189,17 +234,17 @@ makeCluster <-
189234
}
190235

191236
if (!is.null(poolConfig[["pool"]])) {
192-
validateDeprecatedClusterConfig(clusterSetting)
237+
validation$isValidDeprecatedClusterConfig(clusterSetting)
193238
poolConfig <- poolConfig[["pool"]]
194239
}
195240
else {
196-
validateClusterConfig(clusterSetting)
241+
validation$isValidClusterConfig(clusterSetting)
197242
}
198243

199244
tryCatch({
200-
`Validators`$isValidPoolName(poolConfig$name)
245+
validation$isValidPoolName(poolConfig$name)
201246
},
202-
error = function(e){
247+
error = function(e) {
203248
stop(paste("Invalid pool name: \n",
204249
e))
205250
})
@@ -219,19 +264,19 @@ makeCluster <-
219264
if (grepl("PoolBeingDeleted", response)) {
220265
pool <- rAzureBatch::getPool(poolConfig$name)
221266

222-
cat(
223-
sprintf(
224-
paste("Cluster '%s' already exists and is being deleted.",
225-
"Another cluster with the same name cannot be created",
226-
"until it is deleted. Please wait for the cluster to be deleted",
227-
"or create one with a different name"),
228-
poolConfig$name
267+
cat(sprintf(
268+
paste(
269+
"Cluster '%s' already exists and is being deleted.",
270+
"Another cluster with the same name cannot be created",
271+
"until it is deleted. Please wait for the cluster to be deleted",
272+
"or create one with a different name"
229273
),
230-
fill = TRUE
231-
)
274+
poolConfig$name
275+
),
276+
fill = TRUE)
232277

233278
while (areShallowEqual(rAzureBatch::getPool(poolConfig$name)$state,
234-
"deleting")) {
279+
"deleting")) {
235280
cat(".")
236281
Sys.sleep(10)
237282
}

R/commandLineUtilities.R

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
# Builds the job-preparation command line that installs R packages from the
# given source by invoking the matching install script located in
# $AZ_BATCH_JOB_PREP_WORKING_DIR.
#
# Args:
#   type:     package source; one of "cran", "github" or "bioconductor".
#   packages: vector of package names, appended to the command separated by
#             single spaces.
#
# Returns:
#   A single command string, or NULL when `packages` is NULL or empty
#   (nothing to install).
#
# Raises:
#   An error when `type` is not one of the supported package sources. The
#   source is validated before `packages` is inspected.
getJobPackageInstallationCommand <- function(type, packages) {
  installScripts <- c(
    cran = "install_cran.R",
    github = "install_github.R",
    bioconductor = "install_bioconductor.R"
  )

  if (!isTRUE(type %in% names(installScripts))) {
    stop("Using an incorrect package source")
  }

  # Historically this function returned NULL for an empty package list (as
  # the value of a trailing `if`); keep that contract, but make it explicit.
  if (is.null(packages) || length(packages) == 0) {
    return(NULL)
  }

  script <- paste0("Rscript $AZ_BATCH_JOB_PREP_WORKING_DIR/",
                   installScripts[[type]])
  packageCommands <- paste0(packages, collapse = " ")

  paste0(script, " ", packageCommands)
}
22+
23+
# Builds one installation command per package, to be run while the pool
# (cluster) is being set up.
#
# Args:
#   type:     package source; one of "cran", "github" or "bioconductor".
#   packages: vector of package names.
#
# Returns:
#   A character vector with one command per entry of `packages`, in the same
#   order; an empty character vector when `packages` is NULL or empty.
#
# Raises:
#   An error when `type` is not one of the supported package sources.
getPoolPackageInstallationCommand <- function(type, packages) {
  poolInstallationCommand <- character(length(packages))

  sharedPackagesDirectory <- "/mnt/batch/tasks/shared/R/packages"

  # R snippet that puts the shared package directory first on the library
  # search path; the quoting is for a shell-level `Rscript -e '...'` call.
  libPathsCommand <- paste0('\'.libPaths( c( \\\"',
                            sharedPackagesDirectory,
                            '\\\", .libPaths()));')

  # options(warn=2) turns warnings into errors inside the Rscript call.
  installCommand <-
    paste("Rscript -e \'args <- commandArgs(TRUE)\'",
          "-e \'options(warn=2)\'")

  # At this point install_cran.R and install_github.R are not yet available,
  # so CRAN and GitHub packages are installed with inline Rscript commands.
  if (type == "cran") {
    script <-
      paste(installCommand,
            paste("-e",
                  libPathsCommand,
                  "install.packages(args[1])\' %s")
      )
  }
  else if (type == "github") {
    script <-
      paste(
        installCommand,
        paste(
          "-e",
          libPathsCommand,
          "devtools::install_github(args[1])\' %s"
        )
      )
  }
  else if (type == "bioconductor") {
    script <- "Rscript /mnt/batch/tasks/startup/wd/install_bioconductor.R %s"
  }
  else {
    stop("Using an incorrect package source")
  }

  # seq_along (not 1:length) so that an empty package list produces no
  # commands instead of a bogus "... NA" entry.
  for (i in seq_along(packages)) {
    poolInstallationCommand[i] <- sprintf(script, packages[i])
  }

  poolInstallationCommand
}
69+
70+
# Builds the `docker run` command line used to execute a command inside a
# container.
#
# Args:
#   containerImage: name of the docker image to run.
#   command:        command to execute inside the container. A vector yields
#                   one `docker run` command per element.
#   containerName:  optional container name, passed as `--name`.
#   runAsDaemon:    when TRUE, adds `-d`.
#   includeEnvironmentVariables: when TRUE, forwards the task/job level
#                   AZ_BATCH_* variables and BLOBXFER_SASKEY with `-e`.
#
# Returns:
#   A character vector of `docker run ...` command lines.
dockerRunCommand <-
  function(containerImage,
           command,
           containerName = NULL,
           runAsDaemon = FALSE,
           includeEnvironmentVariables = TRUE) {
    # Options that are always present: remove the container on exit, mount
    # the node root directory at the same path and forward the node-level
    # directory variables.
    dockerOptions <- paste(
      "--rm",
      "-v $AZ_BATCH_NODE_ROOT_DIR:$AZ_BATCH_NODE_ROOT_DIR",
      "-e AZ_BATCH_NODE_ROOT_DIR=$AZ_BATCH_NODE_ROOT_DIR",
      "-e AZ_BATCH_NODE_STARTUP_DIR=$AZ_BATCH_NODE_STARTUP_DIR",
      sep = " "
    )

    # Each optional flag is appended exactly once; the option string must
    # not be repeated, or --rm, -v and -e end up duplicated.
    if (runAsDaemon) {
      dockerOptions <- paste(dockerOptions, "-d", sep = " ")
    }

    if (!is.null(containerName)) {
      dockerOptions <-
        paste(dockerOptions, "--name", containerName, sep = " ")
    }

    if (includeEnvironmentVariables) {
      dockerOptions <-
        paste(
          dockerOptions,
          "-e AZ_BATCH_TASK_ID=$AZ_BATCH_TASK_ID",
          "-e AZ_BATCH_JOB_ID=$AZ_BATCH_JOB_ID",
          "-e AZ_BATCH_TASK_WORKING_DIR=$AZ_BATCH_TASK_WORKING_DIR",
          "-e AZ_BATCH_JOB_PREP_WORKING_DIR=$AZ_BATCH_JOB_PREP_WORKING_DIR",
          "-e BLOBXFER_SASKEY=$BLOBXFER_SASKEY",
          sep = " "
        )
    }

    paste("docker run", dockerOptions, containerImage, command, sep = " ")
  }
110+
111+
# Wraps a set of shell commands into a single `/bin/bash -c "..."` command
# line. The commands are joined with "; ", preceded by `set -e` and
# `set -o pipefail`, and followed by `wait`.
#
# Args:
#   commands: vector (or list) of shell commands; zero-length entries are
#             dropped.
#
# Returns:
#   The wrapped command line, or "" when no commands remain.
linuxWrapCommands <- function(commands = c()) {
  # Drop entries that carry no value (e.g. NULL elements of a list)
  nonEmpty <- commands[lengths(commands) > 0]

  if (length(nonEmpty) == 0) {
    return("")
  }

  joined <- paste0(paste(nonEmpty, collapse = "; "), ";")

  # The absolute path to bash is intentional; the lintr configuration is
  # what governs absolute paths in this package.
  sprintf("/bin/bash -c \"set -e; set -o pipefail; %s wait\"", joined)
}

R/doAzureParallel.R

Lines changed: 29 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,8 @@ registerDoAzureParallel <- function(cluster) {
1111
fun = .doAzureParallel,
1212
data = list(
1313
config = list(cluster$batchAccount, cluster$storageAccount),
14-
poolId = cluster$poolId
14+
poolId = cluster$poolId,
15+
containerImage = cluster$containerImage
1516
),
1617
info = .info
1718
)
@@ -128,6 +129,18 @@ setHttpTraffic <- function(value = FALSE) {
128129
.doAzureParallel <- function(obj, expr, envir, data) {
129130
stopifnot(inherits(obj, "foreach"))
130131

132+
githubPackages <- eval(obj$args$github)
133+
bioconductorPackages <- eval(obj$args$bioconductor)
134+
135+
# Remove special arguments, github and bioconductor, from args list
136+
if (!is.null(obj$args[["github"]])) {
137+
obj$args[["github"]] <- NULL
138+
}
139+
140+
if (!is.null(obj$args[["bioconductor"]])) {
141+
obj$args[["bioconductor"]] <- NULL
142+
}
143+
131144
storageCredentials <- rAzureBatch::getStorageCredentials()
132145

133146
it <- iterators::iter(obj)
@@ -193,6 +206,8 @@ setHttpTraffic <- function(value = FALSE) {
193206
assign("expr", expr, .doAzureBatchGlobals)
194207
assign("exportenv", exportenv, .doAzureBatchGlobals)
195208
assign("packages", obj$packages, .doAzureBatchGlobals)
209+
assign("github", githubPackages, .doAzureBatchGlobals)
210+
assign("bioconductor", bioconductorPackages, .doAzureBatchGlobals)
196211
assign("pkgName", pkgName, .doAzureBatchGlobals)
197212

198213
if (!is.null(obj$options$azure$job)) {
@@ -204,8 +219,8 @@ setHttpTraffic <- function(value = FALSE) {
204219
}
205220

206221
tryCatch({
207-
`Validators`$isValidStorageContainerName(id)
208-
`Validators`$isValidJobName(id)
222+
validation$isValidStorageContainerName(id)
223+
validation$isValidJobName(id)
209224
},
210225
error = function(e){
211226
stop(paste("Invalid job name: \n",
@@ -394,7 +409,10 @@ setHttpTraffic <- function(value = FALSE) {
394409
poolId = data$poolId,
395410
resourceFiles = resourceFiles,
396411
metadata = metadata,
397-
packages = obj$packages
412+
packages = obj$packages,
413+
github = githubPackages,
414+
bioconductor = bioconductorPackages,
415+
containerImage = data$containerImage
398416
)
399417

400418
if (response$status_code == 201) {
@@ -466,7 +484,8 @@ setHttpTraffic <- function(value = FALSE) {
466484
args = argsList[startIndex:endIndex],
467485
envir = .doAzureBatchGlobals,
468486
packages = obj$packages,
469-
outputFiles = obj$options$azure$outputFiles
487+
outputFiles = obj$options$azure$outputFiles,
488+
containerImage = data$containerImage
470489
)
471490

472491
return(taskId)
@@ -489,12 +508,15 @@ setHttpTraffic <- function(value = FALSE) {
489508
packages = obj$packages,
490509
dependsOn = tasks,
491510
cloudCombine = cloudCombine,
492-
outputFiles = obj$options$azure$outputFiles
511+
outputFiles = obj$options$azure$outputFiles,
512+
containerImage = data$containerImage
493513
)
494514
}
495515

496516
if (wait) {
497-
if (!is.null(obj$packages)) {
517+
if (!is.null(obj$packages) ||
518+
!is.null(githubPackages) ||
519+
!is.null(bioconductorPackages)) {
498520
waitForJobPreparation(id, data$poolId)
499521
}
500522

0 commit comments

Comments
 (0)