Retrieve config file from S3 bucket (#10)

nikki-t · web-flow · commit faf7f771698d · 2025-01-22T08:19:34.000-05:00
* Download config file from config S3 bucket

* Copy over sos_read operations to Docker container image

* Fix bug that caused output file to not be sourced

* Add local option for loading in config.R from prediagnostics directory
diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
@@ -56,6 +56,8 @@ jobs:
 
       # Check out GitHub repo
       - uses: actions/checkout@v4
+        with:
+            submodules: 'recursive'
 
       # SNYK IAC scan and report - TODO
       # - name: Run Snyk IAC to test and report
diff --git a/.gitmodules b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "sos_read"]
+	path = sos_read
+	url = https://github.com/SWOT-Confluence/sos_read
diff --git a/Dockerfile b/Dockerfile
@@ -6,7 +6,7 @@ RUN echo "America/New_York" | tee /etc/timezone \
 		build-essential \
 		gcc \
 		gfortran \
-        locales \
+    locales \
 		libcurl4-gnutls-dev \
 		libfontconfig1-dev \
 		libfribidi-dev \
@@ -34,15 +34,22 @@ RUN apt -y install \
 		r-base \
 		r-base-dev \
 	&& /usr/bin/Rscript -e "install.packages('RNetCDF', dependencies=TRUE, repos='http://cran.rstudio.com/')" \
-    && /usr/bin/Rscript -e "install.packages('rjson', dependencies=TRUE, repos='http://cran.rstudio.com/')"\
-	&& /usr/bin/Rscript -e "install.packages('dplyr', dependencies=TRUE, repos='http://cran.rstudio.com/')"
+    && /usr/bin/Rscript -e "install.packages('rjson', dependencies=TRUE, repos='http://cran.rstudio.com/') "\
+	&& /usr/bin/Rscript -e "install.packages('dplyr', dependencies=TRUE, repos='http://cran.rstudio.com/')" \
+	&& /usr/bin/Rscript -e "install.packages('reticulate', dependencies=TRUE, repos='http://cran.rstudio.com/')" \
+	&& /usr/bin/Rscript -e "install.packages('optparse', dependencies=TRUE, repos='http://cran.rstudio.com/')"
 
-# STAGE 2 set up I/O directories, copy geobamdata installer and R script
+# STAGE 2 - Python and python packages for S3 functionality
 FROM stage1 as stage2
-COPY ./prediagnostics/ /app/prediagnostics/
+RUN apt update && apt -y install python3 python3-dev python3-pip python3-venv python3-boto3
 
-# STAGE 3 - Execute algorithm
+# STAGE 3 - Copy the prediagnostics R scripts and the sos_read submodule into the image
 FROM stage2 as stage3
+COPY ./prediagnostics/ /app/prediagnostics/
+COPY ./sos_read /app/prediagnostics/sos_read/
+
+# STAGE 4 - Execute algorithm
+FROM stage3 as stage4
 LABEL version="1.0" \
 	description="Containerized prediagnostics module." \
 	"confluence.contact"="ntebaldi@umass.edu" \
diff --git a/prediagnostics/run_prediagnostics.R b/prediagnostics/run_prediagnostics.R
@@ -1,7 +1,13 @@
-source("/app/prediagnostics/config.R")
+library(optparse)
+library(reticulate)
+
 source("/app/prediagnostics/input.R")
-source("/app/prediagnostics/prediagnostics.R")
 source("/app/prediagnostics/output.R")
+source("/app/prediagnostics/prediagnostics.R")
+
+PYTHON_EXE = "/usr/bin/python3"
+PYTHON_FILE = "/app/prediagnostics/sos_read/sos_read.py"
+TMP_PATH = "/tmp"
 
 start = Sys.time()
 
@@ -10,29 +16,45 @@ input_dir = file.path("/mnt", "data", "input")
 output_dir = file.path("/mnt", "data", "output")
 
 # Command line arguments
-args = commandArgs(trailingOnly=TRUE)
-# we want to specify index and reach json for local run
-if (length(args)>=2){
-    index = strtoi(args[1]) + 1
-    reaches_json = file.path(input_dir, paste(args[2]))
-
-    # we want to specify reach json for aws run
-    if (length(args)>=3){
-        index = strtoi(Sys.getenv("AWS_BATCH_JOB_ARRAY_INDEX")) + 1
-    }
-
-# we want to specify only index for local run
-} else if (length(args)>=1) {
-    index = strtoi(args[1]) + 1
-    reaches_json = file.path(input_dir, 'reaches.json')
-    # we want to run on default settings for aws
-} else{
-    index = strtoi(Sys.getenv("AWS_BATCH_JOB_ARRAY_INDEX")) + 1
-    reaches_json = file.path(input_dir, 'reaches.json')
+option_list <- list(
+  make_option(c("-i", "--index"), type = "integer", default = NULL, help = "Index to run on"),
+  make_option(c("-b", "--config_bucket"), type = "character", default = "", help = "Bucket key to find the sos"),
+  make_option(c("-r", "--reaches_json"), type = "character", default = "reaches.json", help = "Name of reaches.json")
+)
+opt_parser <- OptionParser(option_list = option_list)
+opts <- parse_args(opt_parser)
+# No --index (NULL default) or the -256 sentinel: take the index from AWS_BATCH_JOB_ARRAY_INDEX
+index <- opts$index
+if (is.null(index) || index == -256){
+  index <- strtoi(Sys.getenv("AWS_BATCH_JOB_ARRAY_INDEX"))
 }
+index <- index + 1    # Add 1 to AWS 0-based index
 
-# Run Diagnostics
-output=run_diagnostics(input_dir, reaches_json, index, output_dir)
+config_bucket <- opts$config_bucket
+reaches_json = file.path(input_dir, opts$reaches_json)
+
+# Load Config file from S3
+if (config_bucket != "") {
+  use_python(PYTHON_EXE)
+  source_python(PYTHON_FILE)
+
+  config_filepath = file.path(TMP_PATH, "config.R")
+  download_sos(config_bucket, config_filepath)
+
+  # Run Diagnostics on S3 config file
+  if (file.exists(config_filepath)) {
+    source(config_filepath)
+    output=run_diagnostics(input_dir, reaches_json, index, output_dir)
+  } else {
+    print("Config file could not be downloaded and prediagnostics will not run.")
+  }
+
+ # Run Diagnostics on local config file
+} else {
+  print("Config file will be run on local config: '/app/prediagnostics/config.R'")
+  source("/app/prediagnostics/config.R")
+  output=run_diagnostics(input_dir, reaches_json, index, output_dir)
+}
 
 end = Sys.time()
 print(paste0("Execution time: ", end - start))
diff --git a/sos_read b/sos_read
@@ -0,0 +1 @@
+Subproject commit 652bf6a134a828f2e75937cbd25de0fa26507b2e

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+[submodule "sos_read"]`
	`2`	`+ path = sos_read`
	`3`	`+ url = https://github.com/SWOT-Confluence/sos_read`