Skip to content

Commit 43ea964

Browse files
authored
Reproducibility for DOI 10.5281/zenodo.6853185 (#9)
* Bug fixes and template logic changes * DOI 10.5281/zenodo.6853185
1 parent a1113bb commit 43ea964

File tree

11 files changed

+342
-11
lines changed

11 files changed

+342
-11
lines changed

10-5281_zenodo-6853067/Dockerfile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# Set global arguments
2-
ARG JAMMIES_VER=0.4.3
2+
ARG JAMMIES_VER=0.4.5
33

44
# Get and patch project for working directory
55
FROM python:3.11.2-alpine3.17 as projects
@@ -22,7 +22,7 @@ RUN apk add git
2222

2323
## Install jammies and run
2424
RUN python3 -m pip install "jammies[all]==${JAMMIES_VER}"
25-
RUN jammies patch src
25+
RUN jammies patch src -y
2626

2727
# Setup project specific info
2828
FROM nvidia/cuda:11.8.0-runtime-ubuntu22.04

10-5281_zenodo-6853067/project_metadata.json

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,6 @@
7979
},
8080
"https://github.com/bpaassen/sparfae": {
8181
"name": "#github",
82-
"license": "#gpl3later",
8382

8483
"tags": [
8584
{
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
**/clean
2+
**/env
3+
**/src
4+
**/.dockerignore
5+
**/Dockerfile*
6+
**/README.md
7+
**/instructions.md
8+
**/issues.md

10-5281_zenodo-6853185/.gitignore

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Ignore clean and src directories
2+
/clean
3+
/src
4+
5+
# Ignore environments
6+
/env
7+
8+
# Ignore IDEs
9+
.vscode
10+
11+
# Ignore caches
12+
__pycache__
13+
.jammies.toml

10-5281_zenodo-6853185/Dockerfile

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
# Set global arguments
2+
ARG JAMMIES_VER=0.4.5
3+
4+
# Get and patch project for working directory
5+
FROM python:3.11.2-alpine3.17 as projects
6+
7+
## Set local arguments
8+
ARG JAMMIES_VER
9+
10+
## Keeps Python from generating .pyc files in the container
11+
ENV PYTHONDONTWRITEBYTECODE=1
12+
13+
## Turns off buffering for easier container logging
14+
ENV PYTHONUNBUFFERED=1
15+
16+
## Copy files to directory
17+
COPY . ./
18+
19+
## Add git to alpine to pull necessary repositories
20+
RUN apk update
21+
RUN apk add git
22+
23+
## Install jammies and run
24+
RUN python3 -m pip install "jammies[all]==${JAMMIES_VER}"
25+
RUN jammies patch src -y
26+
27+
# Setup Java runtime via jlink
28+
FROM eclipse-temurin:17.0.8_7-jdk-jammy as java
29+
30+
# Create custom runtime
31+
RUN $JAVA_HOME/bin/jlink \
32+
--add-modules ALL-MODULE-PATH \
33+
--strip-debug \
34+
--no-man-pages \
35+
--no-header-files \
36+
--compress=2 \
37+
--output /javaruntime
38+
39+
# Setup project specific info
40+
FROM python:3.11.4-bookworm
41+
42+
## Keeps Python from generating .pyc files in the container
43+
ENV PYTHONDONTWRITEBYTECODE=1
44+
45+
## Turns off buffering for easier container logging
46+
ENV PYTHONUNBUFFERED=1
47+
48+
## Copy Java runtime over
49+
ENV JAVA_HOME=/opt/java/openjdk
50+
ENV PATH $JAVA_HOME/bin:$PATH
51+
COPY --from=java /javaruntime $JAVA_HOME
52+
53+
## Copy project files from previous stage here
54+
RUN mkdir /src
55+
COPY --from=projects /src /src
56+
WORKDIR /src
57+
58+
## Setup python
59+
RUN python3 -m pip install .
60+
61+
## Setup script run
62+
CMD [ "python3", "./helper_code/models/regression/train_rf_regression_full_cv.py", "--training-data-filepath", "./VLE_datasets/v1/VLE_12k_dataset_v1.csv", "--output-dir", "./results" ]

10-5281_zenodo-6853185/README.md

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
# [Can Population-based Engagement Improve Personalisation? A Novel Dataset and Experiment](https://doi.org/10.5281/zenodo.6853185)
2+
3+
![Not at All Reproducible](https://img.shields.io/badge/Status-Not%20at%20All%20Reproducible-red)
4+
5+
This is a project constructor for the paper [*Can Population-based Engagement Improve Personalisation? A Novel Dataset and Experiment*](https://doi.org/10.5281/zenodo.6853185) by Sahan Bulathwela, Meghana Verma, [María Pérez-Ortiz](https://orcid.org/0000-0003-1302-6093), [Emine Yilmaz](https://orcid.org/0000-0003-4734-4532), [John Shawe-Taylor](https://orcid.org/0000-0002-2030-0073).
6+
7+
### Associated Metadata
8+
9+
#### Tested Systems
10+
11+
![Debian: bullseye (11) | bookworm (12)](https://img.shields.io/badge/Debian-bullseye%20%2811%29%20%7C%20bookworm%20%2812%29-informational)
12+
![Docker NVIDIA: 20.10 | 23.0](https://img.shields.io/badge/Docker%20NVIDIA-20.10%20%7C%2023.0-informational)
13+
14+
#### Languages
15+
![java: 17.0.8](https://img.shields.io/badge/java-17.0.8-informational)
16+
![Python: 3.11.2 | 3.11.4](https://img.shields.io/badge/Python-3.11.2%20%7C%203.11.4-informational)
17+
18+
#### Resources
19+
20+
* [Can Population-based Engagement Improve Personalisation? A Novel Dataset and Experiment](https://doi.org/10.5281/zenodo.6853185) (Public)
21+
* Contains paper under [CC-BY-4.0](https://creativecommons.org/licenses/by/4.0/)
22+
* [GitHub](https://github.com/sahanbull/VLE-Dataset) (Public)
23+
* Contains data under ARR
24+
* Contains materials under ARR
25+
26+
## Project Files
27+
28+
The constructor downloads the following files:
29+
* [Cloned GitHub](https://github.com/ahaim5357/VLE-Dataset) under ARR
30+
31+
## Setup Instructions
32+
33+
### Method 1: Docker
34+
35+
This project contains the necessary files needed to set up a [docker container][docker]. Make sure you have Docker installed before attempting anything below.
36+
37+
To build the docker container, navigate to this directory and run the following command:
38+
39+
```sh
40+
docker build -t <image_name> .
41+
```
42+
43+
`image_name` should be replaced with whatever name you would like to refer to the docker container as. It will take around 30 minutes to an hour to build the image.
44+
45+
From there, you can load into the terminal via:
46+
47+
```sh
48+
docker run --rm -itv <local_directory>:/volume <image_name> sh
49+
```
50+
51+
A `volume` directory will be created within the image which will link to the `local_directory` specified. You can specify the current directory of execution via `${PWD}`.
52+
53+
> We are loading into the terminal instead of into Python to copy any generated figures onto the local machine as they cannot otherwise be easily viewed.
54+
55+
Once in the docker terminal, you can run the Python script via:
56+
57+
```sh
58+
python3 ./helper_code/models/regression/train_rf_regression_full_cv.py --training-data-filepath VLE_datasets/v1/VLE_12k_dataset_v1.csv --output-dir ./results
59+
```
60+
61+
You can look through the terminal output and compare the numbers within the paper. To view the figures on the local machine, you can copy them to the volume via:
62+
63+
```sh
64+
cp -R ./results /volume
65+
```
66+
67+
### Method 2: Local Setup
68+
69+
This project uses the Python package `jammies[all]` to set up and fix any issues in the codebase. For instructions on how to download and generate the project from this directory, see the [`jammies`][jammies] repository.
70+
71+
You will also need a version of [Java][java] to run Spark, as consumed by the codebase. Any version of Java 8+ will work, though this setup guide recommends using the latest LTS, which is 17 as of the writing of this guide.
72+
73+
Spark also takes advantage of [Apache Hadoop][hadoop], but this is not necessary to run the codebase, nor does it affect the outcomes, so it will not be used in this guide.
74+
75+
The following instructions have been reproduced using [Python][python] 3.11.4. This project does not make any guarantees that this will work outside of the specified version. Make sure you have Python, along with gcc for Cython, before attempting anything below.
76+
77+
First, you will need to navigate to the generated `src` directory. You will need to install the required dependencies into the global Python instance or a virtual environment via:
78+
79+
```sh
80+
python3 -m pip install .
81+
```
82+
83+
> `python3` is replaced with `py` on Windows machines. Additionally, the `python3 -m` prefix is unnecessary if `pip` is properly added to the path.
84+
85+
After installing the required dependencies, run the Python script via:
86+
87+
```sh
88+
python3 ./helper_code/models/regression/train_rf_regression_full_cv.py --training-data-filepath VLE_datasets/v1/VLE_12k_dataset_v1.csv --output-dir ./results
89+
```
90+
91+
You can look through the `results` directory and compare the numbers within the paper.
92+
93+
[docker]: https://www.docker.com/
94+
[jammies]: https://github.com/ahaim5357/jammies
95+
[java]: https://adoptium.net/temurin/releases/?version=17
96+
[hadoop]: http://apache.github.io/hadoop/
97+
[python]: https://www.python.org/
98+
99+
## Issues
100+
101+
None of the results generated match anything reported in the paper. The `results.csv` generated reports the RMSE, but not for the 12k results, so while the code may work, no direct correlation can be interpreted from the results in the paper.
102+
103+
As such, no consistent results are reported in the paper.
104+
105+
*[ARR]: All Rights Reserved
106+
*[Cloned GitHub]: Cloned GitHub Repository
107+
*[GitHub]: GitHub Repository
108+
*[CC-BY-4.0]: Creative Commons Attribution 4.0 International
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
## Setup Instructions
2+
3+
### Method 1: Docker
4+
5+
This project contains the necessary files needed to set up a [docker container][docker]. Make sure you have Docker installed before attempting anything below.
6+
7+
To build the docker container, navigate to this directory and run the following command:
8+
9+
```sh
10+
docker build -t <image_name> .
11+
```
12+
13+
`image_name` should be replaced with whatever name you would like to refer to the docker container as. It will take around 30 minutes to an hour to build the image.
14+
15+
From there, you can load into the terminal via:
16+
17+
```sh
18+
docker run --rm -itv <local_directory>:/volume <image_name> sh
19+
```
20+
21+
A `volume` directory will be created within the image which will link to the `local_directory` specified. You can specify the current directory of execution via `${PWD}`.
22+
23+
> We are loading into the terminal instead of into Python to copy any generated figures onto the local machine as they cannot otherwise be easily viewed.
24+
25+
Once in the docker terminal, you can run the Python script via:
26+
27+
```sh
28+
python3 ./helper_code/models/regression/train_rf_regression_full_cv.py --training-data-filepath VLE_datasets/v1/VLE_12k_dataset_v1.csv --output-dir ./results
29+
```
30+
31+
You can look through the terminal output and compare the numbers within the paper. To view the figures on the local machine, you can copy them to the volume via:
32+
33+
```sh
34+
cp -R ./results /volume
35+
```
36+
37+
### Method 2: Local Setup
38+
39+
This project uses the Python package `jammies[all]` to set up and fix any issues in the codebase. For instructions on how to download and generate the project from this directory, see the [`jammies`][jammies] repository.
40+
41+
You will also need a version of [Java][java] to run Spark, as consumed by the codebase. Any version of Java 8+ will work, though this setup guide recommends using the latest LTS, which is 17 as of the writing of this guide.
42+
43+
Spark also takes advantage of [Apache Hadoop][hadoop], but this is not necessary to run the codebase, nor does it affect the outcomes, so it will not be used in this guide.
44+
45+
The following instructions have been reproduced using [Python][python] 3.11.4. This project does not make any guarantees that this will work outside of the specified version. Make sure you have Python, along with gcc for Cython, before attempting anything below.
46+
47+
First, you will need to navigate to the generated `src` directory. You will need to install the required dependencies into the global Python instance or a virtual environment via:
48+
49+
```sh
50+
python3 -m pip install .
51+
```
52+
53+
> `python3` is replaced with `py` on Windows machines. Additionally, the `python3 -m` prefix is unnecessary if `pip` is properly added to the path.
54+
55+
After installing the required dependencies, run the Python script via:
56+
57+
```sh
58+
python3 ./helper_code/models/regression/train_rf_regression_full_cv.py --training-data-filepath VLE_datasets/v1/VLE_12k_dataset_v1.csv --output-dir ./results
59+
```
60+
61+
You can look through the `results` directory and compare the numbers within the paper.
62+
63+
[docker]: https://www.docker.com/
64+
[jammies]: https://github.com/ahaim5357/jammies
65+
[java]: https://adoptium.net/temurin/releases/?version=17
66+
[hadoop]: http://apache.github.io/hadoop/
67+
[python]: https://www.python.org/

10-5281_zenodo-6853185/issues.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
## Issues
2+
3+
None of the results generated match anything reported in the paper. The `results.csv` generated reports the RMSE, but not for the 12k results, so while the code may work, no direct correlation can be interpreted from the results in the paper.
4+
5+
As such, no consistent results are reported in the paper.
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
{
2+
"files": [
3+
{
4+
"type": "git",
5+
"name": "Github",
6+
"repository": "https://github.com/ahaim5357/VLE-Dataset.git",
7+
"commit": "6f992d8f5ac4837cc364bfd7421a983976306e89",
8+
"extra": {
9+
"name": "#github_cloned",
10+
"link": "https://github.com/ahaim5357/VLE-Dataset"
11+
}
12+
}
13+
],
14+
"extra": {
15+
"schema_version": 1,
16+
"status": 1,
17+
"systems": {
18+
"_": [
19+
"debian-11",
20+
"debian-12",
21+
"docker-20.10-nvidia",
22+
"docker-23.0-nvidia"
23+
]
24+
},
25+
"languages": {
26+
"python": [
27+
"3.11.2",
28+
"3.11.4"
29+
],
30+
"java": [
31+
"17.0.8"
32+
]
33+
},
34+
"authors": [
35+
"Sahan Bulathwela",
36+
"Meghana Verma",
37+
"https://orcid.org/0000-0003-1302-6093",
38+
"https://orcid.org/0000-0003-4734-4532",
39+
"https://orcid.org/0000-0002-2030-0073"
40+
],
41+
"groups": [
42+
"conference",
43+
"short_paper",
44+
"edm",
45+
"2022"
46+
],
47+
"links": {
48+
"https://doi.org/10.5281/zenodo.6853185": {
49+
"name": "Can Population-based Engagement Improve Personalisation? A Novel Dataset and Experiment",
50+
51+
"tags": [
52+
{
53+
"value": "paper",
54+
"license": "cc4"
55+
}
56+
]
57+
},
58+
"https://github.com/sahanbull/VLE-Dataset": {
59+
"name": "#github",
60+
61+
"tags": [
62+
"data",
63+
"materials"
64+
]
65+
}
66+
}
67+
}
68+
}

templates/python.Dockerfile

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# Set global arguments
2-
ARG JAMMIES_VER=0.4.3
2+
ARG JAMMIES_VER=0.4.5
33

44
# Get and patch project for working directory
55
FROM python:3.11.2-alpine3.17 as projects
@@ -22,9 +22,9 @@ RUN apk add git
2222

2323
## Install jammies and run
2424
RUN python3 -m pip install "jammies[all]==${JAMMIES_VER}"
25-
RUN jammies patch src
25+
RUN jammies patch src -y
2626

27-
## Setup project specific info
27+
# Setup project specific info
2828
FROM python:3.11.4-bookworm
2929

3030
## Keeps Python from generating .pyc files in the container
@@ -36,9 +36,10 @@ ENV PYTHONUNBUFFERED=1
3636
## Copy project files from previous stage here
3737
RUN mkdir /src
3838
COPY --from=projects /src /src
39+
WORKDIR /src
3940

4041
## Setup python
41-
RUN python3 -m pip install -r /src/requirements.txt
42+
RUN python3 -m pip install -r requirements.txt
4243

4344
## Setup script run
4445
CMD [ "python3", "<file_name>" ]

0 commit comments

Comments
 (0)