Merge pull request #11 from seqeralabs/input_via_env_var

pinin4fjords · web-flow · commit c6093b99606b · 2025-09-25T10:31:55.000+01:00
Allow for env vars to be used for inputs
diff --git a/README.md b/README.md
@@ -42,9 +42,34 @@ All examples follow the same deployment process:
    - Select your compute environment
    - Adjust CPU, GPU, and memory allocations as needed
    - Mount any required data using the **Mount data** option
+   - Configure environment variables if the example supports them (see [Environment Variables](#environment-variables) section)
 5. Review the configuration in the **Summary** section
 6. Click **Add and start** to create and launch the Studio
 
+## Environment Variables
+
+Some examples support environment variable configuration to customize data paths and application settings without modifying the container image. This makes those examples more flexible and reusable across different datasets and configurations.
+
+### Examples with Environment Variables
+
+Only the following examples support environment variable configuration:
+- **CellxGene**: `DATASET_FILE`, `DATASET_TITLE`, `USER_DATA_DIR`, `ANNOTATIONS_DIR` - Configure dataset path, display title, and user-data/annotations directories
+- **Shiny**: `DATA_PATH` - Configure data file path with automatic cloud storage path conversion
+
+### Examples without Environment Variables
+
+These examples work with their default configurations and don't require environment variable setup:
+- **Marimo**: Interactive Python notebook environment
+- **Streamlit**: MultiQC visualization with web-based data loading interface
+- **TTYD**: Web-based terminal with pre-installed bioinformatics tools
+
+### Using Environment Variables in Seqera Studios
+
+When deploying to Seqera Studios, you can configure environment variables in the **Compute and Data** section:
+1. Expand the **Environment variables** section
+2. Add key-value pairs for the variables you want to customize
+3. The application will use these values instead of the defaults
+
 ## Documentation
 
 - [Official documentation on building custom studio environments](https://docs.seqera.io/platform-cloud/studios/custom-envs#custom-containers)
diff --git a/cellxgene/Dockerfile b/cellxgene/Dockerfile
@@ -1,7 +1,7 @@
 # ---------------------------------------------------------------
 # 1) Multi-stage build: Pull the connect-client binary
 # ---------------------------------------------------------------
-ARG CONNECT_CLIENT_VERSION
+ARG CONNECT_CLIENT_VERSION=0.8
 FROM public.cr.seqera.io/platform/connect-client:${CONNECT_CLIENT_VERSION} AS connect
 
 # ---------------------------------------------------------------
@@ -29,8 +29,11 @@ RUN apt-get update && apt-get install -y \
 # Install CellxGene and its dependencies
 RUN pip install cellxgene==1.3.0
 
-# Define CellxGene dataset name
-ENV DATASET_NAME=pbmc3k.h5ad
+# Define CellxGene dataset path and title with defaults
+ENV DATASET_FILE=s3://cellxgene_datasets/pbmc3k.h5ad
+ENV DATASET_TITLE="PBMCs 3k test dataset"
+ENV USER_DATA_DIR=/user-data/cellxgene
+ENV ANNOTATIONS_DIR=/user-data/cellxgene
 
 # Create user-data directory
 RUN mkdir -p /user-data/cellxgene
@@ -47,13 +50,40 @@ RUN /usr/bin/connect-client --install
 ENTRYPOINT ["/usr/bin/connect-client", "--entrypoint"]
 
 # ---------------------------------------------------------------
-# 4) Command: Run CellxGene with a dataset from a data link defined via DATASET_NAME
+# 4) Command: Run CellxGene with cloud storage path translation
 # ---------------------------------------------------------------
 # The port is set by CONNECT_TOOL_PORT environment variable
-CMD /usr/local/bin/cellxgene launch \
-    --host 0.0.0.0 \
-    --port ${CONNECT_TOOL_PORT} \
-    --user-generated-data-dir /user-data/cellxgene \
-    --annotations-dir /user-data/cellxgene \
-    --title "PBMCs 3k test dataset" \
-    /workspace/data/cellxgene_datasets/${DATASET_NAME}
+# Convert cloud storage paths to local Studio paths
+CMD ["/bin/bash", "-c", "bash <<'EOF'\n\
+# Map s3://, gs:// and az:// URIs to the Studio mount point. The part after\n\
+# the scheme (bucket/object) is appended to /workspace/data unchanged, so a\n\
+# bare bucket such as s3://bucket resolves to /workspace/data/bucket.\n\
+# Any other value is treated as a local path and returned as-is.\n\
+convert_path() {\n\
+  local input_path=\"$1\"\n\
+  if [[ \"$input_path\" =~ ^(s3|gs|az):// ]]; then\n\
+    echo \"/workspace/data/${input_path#*://}\"\n\
+  else\n\
+    echo \"$input_path\"\n\
+  fi\n\
+}\n\
+\n\
+# Resolve the dataset through convert_path as well, so DATASET_FILE may be\n\
+# either a cloud URI or an already-local path. Expanding it inline without\n\
+# the scheme check would prefix /workspace/data/ even to local paths and\n\
+# point cellxgene at a file that does not exist.\n\
+DATASET_LOCAL=$(convert_path \"${DATASET_FILE}\")\n\
+\n\
+# Resolve user data and annotations directories the same way\n\
+USERDATA_LOCAL=$(convert_path \"${USER_DATA_DIR}\")\n\
+ANNOTATIONS_LOCAL=$(convert_path \"${ANNOTATIONS_DIR}\")\n\
+\n\
+# Ensure directories exist\n\
+mkdir -p \"${USERDATA_LOCAL}\" \"${ANNOTATIONS_LOCAL}\"\n\
+\n\
+# Launch cellxgene\n\
+/usr/local/bin/cellxgene launch --host 0.0.0.0 --port ${CONNECT_TOOL_PORT} \\\n\
+  --user-generated-data-dir \"${USERDATA_LOCAL}\" \\\n\
+  --annotations-dir \"${ANNOTATIONS_LOCAL}\" \\\n\
+  --title \"${DATASET_TITLE}\" \"${DATASET_LOCAL}\"\n\
+EOF"]
diff --git a/cellxgene/README.md b/cellxgene/README.md
@@ -35,14 +35,17 @@ For specific versions, use the release tag (e.g., `ghcr.io/seqeralabs/custom-stu
 - Support for .h5ad datasets
 - Interactive single-cell data exploration
 - Automatic data mounting via datalinks
+- Configurable dataset path, title, and storage directories via environment variables
+- Cloud storage path support with automatic translation to local Studio paths
 
 > [!NOTE]
 > For common features shared across all examples, see the [main README](../README.md#common-features).
 
 ## Files
 
 - `Dockerfile`: Container definition using multi-stage build
-- `pbmc3k.h5ad`: Example dataset (mounted via datalink)
+- `README.md`: This documentation file
+- `screenshot.png`: Example screenshot of the CellxGene interface
 
 ## Prerequisites
 
@@ -91,21 +94,59 @@ docker run -p 3000:3000 --entrypoint /usr/local/bin/cellxgene -v $(pwd)/data:/wo
 
 The app will be available at http://localhost:3000
 
+## Cloud Storage Path Translation
+
+The container automatically converts cloud storage paths to local Studio paths. Supported providers include:
+
+- **Amazon S3**: `s3://bucket/path/to/dataset.h5ad`
+- **Google Cloud Storage**: `gs://bucket/path/to/dataset.h5ad`  
+- **Azure Blob Storage**: `az://container/path/to/dataset.h5ad`
+
+**Examples:**
+- S3: `s3://my-genomics-data/single-cell/experiment1.h5ad` → `/workspace/data/my-genomics-data/single-cell/experiment1.h5ad`
+- GCS: `gs://research-bucket/datasets/pbmc3k.h5ad` → `/workspace/data/research-bucket/datasets/pbmc3k.h5ad`
+- Azure: `az://data-container/studies/cellxgene.h5ad` → `/workspace/data/data-container/studies/cellxgene.h5ad`
+
+**Requirements:**
+- Mount the cloud storage bucket/container from Data Explorer in Seqera Studios
+- Provide cloud storage paths in the `DATASET_FILE` environment variable
+
+> [!WARNING]
+> **Bucket Mounting Required**: When using cloud storage paths (`s3://`, `gs://`, `az://`), ensure the corresponding buckets are mounted in your Studio via the **Mount data** option. Unmounted buckets will cause the Studio to fail when trying to access the converted paths.
+
 ## Using in Seqera Studios
 
 > [!NOTE]
 > For the common deployment process, see the [main README](../README.md#deploying-to-seqera-studios).
 
 Additional steps specific to this example:
-1. Create a data link called 'cellxgene_datasets' and place your .h5ad file there
+1. In the **Compute and Data** tab, click the **Mount data** button to mount your cloud storage bucket/container
 2. Follow the common deployment process
-3. When mounting data, ensure to mount 'cellxgene_datasets' using the **Mount data** option
+3. Configure environment variables:
+   - `DATASET_FILE`: Cloud storage path to your .h5ad file
+     - Supports S3 (`s3://`), Google Cloud Storage (`gs://`), and Azure Blob Storage (`az://`) paths
+     - Example: `s3://my-genomics-data/single-cell/experiment1.h5ad`
+   - `DATASET_TITLE`: Title to display in the CellxGene interface
+     - Example: `"My Single-Cell Analysis"`
+   - `USER_DATA_DIR`: Path for user-generated data storage
+     - Default: `/user-data/cellxgene` (local directory)
+     - Supports cloud storage paths (automatically converted to local Studio paths)
+     - Example: `s3://my-bucket/user-data/cellxgene`
+   - `ANNOTATIONS_DIR`: Path for annotations storage
+     - Default: `/user-data/cellxgene` (local directory)
+     - Supports cloud storage paths (automatically converted to local Studio paths)
+     - Example: `s3://my-bucket/annotations/cellxgene`
+
+> [!WARNING]
+> **Bucket Mounting**: If using cloud storage paths for `USER_DATA_DIR` or `ANNOTATIONS_DIR`, ensure the corresponding buckets are mounted in your Studio. Unmounted buckets will cause the Studio to fail when trying to access the converted paths.
 
 ## Notes
 
 - The app uses CellxGene 1.3.0 for interactive single-cell data visualization
-- User data and annotations are stored in /user-data/cellxgene
-- The default dataset is pbmc3k.h5ad, but can be changed via the DATASET_NAME environment variable
+- User data and annotations directories can be configured via environment variables
+- Default storage locations: `/user-data/cellxgene` (can be overridden with cloud storage paths)
+- Specify your dataset via the `DATASET_FILE` environment variable (default: `s3://cellxgene_datasets/pbmc3k.h5ad`)
+- Customize the display title via the `DATASET_TITLE` environment variable (default: `PBMCs 3k test dataset`)
 
 > [!NOTE]
 > For common technical notes, see the [main README](../README.md#common-features).
diff --git a/shiny-simple-example/Dockerfile b/shiny-simple-example/Dockerfile
@@ -50,6 +50,9 @@ RUN /usr/bin/connect-client --install
 # Set a working directory for your app
 WORKDIR /app
 
+# Define data path with default
+ENV DATA_PATH=s3://shiny-inputs/data.csv
+
 # Copy your Shiny code and data
 COPY app_plot_demo.R /app/
 COPY data.csv /app/
diff --git a/shiny-simple-example/README.md b/shiny-simple-example/README.md
@@ -31,18 +31,19 @@ For specific versions, use the release tag (e.g., `ghcr.io/seqeralabs/custom-stu
 
 ## Features
 
-- Simple scatter plot visualization
-- Interactive data filtering
+- Advanced data visualization with multiple plot types (scatter, line, bar, box, density)
+- Interactive controls and color themes
 - Compatible with both local Docker testing and Seqera Studios
 - Efficient package management with micromamba
 - Easy data mounting via datalinks
+- Configurable data path via environment variables
+- Cloud storage path support with automatic translation to local Studio paths
 
 ## Files
 
 - `app_plot_demo.R`: The main Shiny application
 - `example_data.csv`: Sample data for the visualization
 - `Dockerfile`: Container definition
-- `run.sh`: Entrypoint script that handles both local and Studios environments
 
 ## Prerequisites
 
@@ -78,23 +79,44 @@ docker run -p 3000:3000 --entrypoint micromamba -v $(pwd)/../data/shiny-inputs:/
 
 The app will be available at http://localhost:3000
 
+## Cloud Storage Path Translation
+
+The application automatically converts cloud storage paths to local Studio paths. Supported providers include:
+
+- **Amazon S3**: `s3://bucket/path/to/data.csv`
+- **Google Cloud Storage**: `gs://bucket/path/to/data.csv`  
+- **Azure Blob Storage**: `az://container/path/to/data.csv`
+
+**Examples:**
+- S3: `s3://my-data-bucket/datasets/experiment.csv` → `/workspace/data/my-data-bucket/datasets/experiment.csv`
+- GCS: `gs://research-data/analysis/results.csv` → `/workspace/data/research-data/analysis/results.csv`
+- Azure: `az://data-container/studies/sample.csv` → `/workspace/data/data-container/studies/sample.csv`
+
+**Requirements:**
+- Mount the cloud storage bucket/container from Data Explorer in Seqera Studios
+- Provide cloud storage paths in the `DATA_PATH` environment variable
+
 ## Using in Seqera Studios
 
 > [!NOTE]
 > For the common deployment process, see the [main README](../README.md#deploying-to-seqera-studios).
 
 Additional steps specific to this example:
-1. Create a data link called 'shiny-inputs' and place your input file called 'data.csv' there
+1. In the **Compute and Data** tab, click the **Mount data** button to mount your cloud storage bucket/container
 2. Follow the common deployment process
-3. When mounting data, ensure to mount 'shiny-inputs' using the **Mount data** option
+3. Configure environment variables:
+   - `DATA_PATH`: Cloud storage path to your CSV file
+     - Supports S3 (`s3://`), Google Cloud Storage (`gs://`), and Azure Blob Storage (`az://`) paths
+     - Example: `s3://my-data-bucket/datasets/experiment.csv`
 
 ## Notes
 
-- The app uses a simple scatter plot to demonstrate Shiny's capabilities
+- The app provides advanced data visualization with multiple plot types and interactive controls
 - The Dockerfile uses micromamba for efficient package management
 - The container is built for linux/amd64 platform compatibility
 - Data files should be in CSV format
-- The example includes a sample dataset for demonstration
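+- Specify your data file via the `DATA_PATH` environment variable (default: `s3://shiny-inputs/data.csv`); if the resolved file does not exist, the app uses built-in random data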
+- Cloud storage paths are automatically converted to local Studio paths
 
 ## References
 
diff --git a/shiny-simple-example/app_plot_demo.R b/shiny-simple-example/app_plot_demo.R
@@ -100,7 +100,20 @@ ui <- fluidPage(
 server <- function(input, output, session) {
     # Read data
     data <- reactive({
-        file_path <- '/workspace/data/shiny-inputs/data.csv'
+        data_path <- Sys.getenv('DATA_PATH', 's3://shiny-inputs/data.csv')
+
+        # Convert cloud storage path to local Studio path. Mounted buckets
+        # appear under /workspace/data/<bucket>/<object>, which is exactly the
+        # URI with its scheme stripped, so no bucket/object split is needed.
+        # This also avoids interpolating the bucket name into a regex and
+        # resolves a bare bucket URI (no object part) to its mount directory.
+        if (grepl('^(s3|gs|az)://', data_path)) {
+            file_path <- paste0('/workspace/data/', sub('^[^:]+://', '', data_path))
+        } else {
+            # Use path as-is for local paths
+            file_path <- data_path
+        }
+        
         if (file.exists(file_path)) {
             read.csv(file_path)
         } else {
@@ -114,9 +127,20 @@ server <- function(input, output, session) {
     
     # Data source message
     output$data_source <- renderText({
-        file_path <- '/workspace/data/shiny-inputs/data.csv'
+        data_path <- Sys.getenv('DATA_PATH', 's3://shiny-inputs/data.csv')
+
+        # Convert cloud storage path to local Studio path: the mount location
+        # is /workspace/data/ followed by the URI with its scheme stripped
+        # (bucket/object), so a bare bucket URI maps to its mount directory
+        # and the bucket name is never used as a regex pattern.
+        if (grepl('^(s3|gs|az)://', data_path)) {
+            file_path <- paste0('/workspace/data/', sub('^[^:]+://', '', data_path))
+        } else {
+            file_path <- data_path
+        }
+        
         if (file.exists(file_path)) {
-            "Using external data file"
+            paste("Using external data file:", data_path)
         } else {
             "Using built-in random data"
         }