diff --git a/.github/docker/Rstudio/Dockerfile b/.github/docker/Rstudio/Dockerfile deleted file mode 100644 index 126e87e..0000000 --- a/.github/docker/Rstudio/Dockerfile +++ /dev/null @@ -1,64 +0,0 @@ -# This Dockerfile is installing R and RStudio Server on Ubuntu 22.04.03 -# purpose of this dockerfile is to create a test environment for the ICRN manager -# this docker is intended to mimic the NCSA ICRN JupyterHub environment -# Use the minimal-notebook as base -ARG JUPYTER_VERSION=latest -FROM jupyter/minimal-notebook:${JUPYTER_VERSION} - -ARG ICRN_MANAGER_PATH=/sw/icrn/jupyter/icrn_ncsa_resources/tools/icrn_manager/ -ARG ICRN_KERNELS_PATH=/sw/icrn/jupyter/icrn_ncsa_resources/kernels/ -# ARG ICRN_TESTS_PATH=/sw/icrn/jupyter/icrn_ncsa_resources/tests/ - -# Switch to root to install additional packages -USER root - -WORKDIR / - -# Install R and its dependencies -RUN apt update -qq && \ - apt install -y --no-install-recommends software-properties-common dirmngr && \ - wget -qO- https://cloud.r-project.org/bin/linux/ubuntu/marutter_pubkey.asc | tee -a /etc/apt/trusted.gpg.d/cran_ubuntu_key.asc && \ - add-apt-repository "deb https://cloud.r-project.org/bin/linux/ubuntu $(lsb_release -cs)-cran40/" && \ - apt install -y r-base - -# Installs Rstudio Server -RUN apt install -y --no-install-recommends gdebi-core && \ - wget https://download2.rstudio.org/server/jammy/amd64/rstudio-server-2024.12.0-467-amd64.deb && \ - gdebi -n rstudio-server-2024.12.0-467-amd64.deb && \ - rm -f rstudio-server-2024.12.0-467-amd64.deb - -# Install tidyverse and jq (required by icrn_manager) -RUN apt update -qq && apt install --yes --no-install-recommends wget ca-certificates gnupg jq && \ - wget -q -O- https://eddelbuettel.github.io/r2u/assets/dirk_eddelbuettel_key.asc | tee -a /etc/apt/trusted.gpg.d/cranapt_key.asc && \ - echo "deb [arch=amd64] https://r2u.stat.illinois.edu/ubuntu jammy main" > /etc/apt/sources.list.d/cranapt.list && \ - apt update -qq && \ - apt install --yes --no-install-recommends r-cran-data.table r-cran-tidyverse && \ - apt clean && \ - rm -rf /var/lib/apt/lists/* - -# Installs the jupyter-rsession-proxy -RUN pip install jupyter-rsession-proxy && \ - chown -R $NB_USER:users /home/$NB_USER/.cache - -# Switch to default working directory -WORKDIR /home/$NB_USER - -RUN mkdir -p $ICRN_KERNELS_PATH && \ - mkdir -p $ICRN_MANAGER_PATH - -# Copy icrn_manager tools to /usr/local/bin for system-wide access -COPY ./icrn_manager /usr/local/bin/icrn_manager -COPY ./update_r_libs.sh /usr/local/bin/update_r_libs.sh - -# Make the icrn_manager tools executable -RUN chmod +x /usr/local/bin/icrn_manager && \ - chmod +x /usr/local/bin/update_r_libs.sh - -# eh? no. this is a shared volume; i'm pretty sure. -# RUN chown -R $NB_USER:users /sw - -# Switch to default NB_USER -USER $NB_USER - -# after switch, run init, so user's homedir is set up with catalog, etc. 
-RUN icrn_manager kernels init $ICRN_KERNELS_PATH \ No newline at end of file diff --git a/.github/docker/Rstudio/cowsay_conda.sh b/.github/docker/Rstudio/cowsay_conda.sh deleted file mode 100644 index ff25e1f..0000000 --- a/.github/docker/Rstudio/cowsay_conda.sh +++ /dev/null @@ -1,6 +0,0 @@ -icrn_manager kernels use none -conda create --solver=libmamba -c r -y -n R_cowsay r-base=4.4.3 -conda activate R_cowsay -Rscript -e 'install.packages("cowsay", repos="http://cran.us.r-project.org")' -conda install -y --solver=libmamba conda-pack -conda pack -n R_cowsay -o ~/conda-packs/R_cowsay.conda.pack.tar.gz \ No newline at end of file diff --git a/.github/workflows/docker-build-indexer.yml b/.github/workflows/docker-build-indexer.yml new file mode 100644 index 0000000..b34ff83 --- /dev/null +++ b/.github/workflows/docker-build-indexer.yml @@ -0,0 +1,69 @@ +name: Kernel Indexer Build and Push + +on: + push: + branches: [ main, develop ] + paths: + - 'kernel-indexer/**' + - 'kernel_indexer' + - '.github/workflows/docker-build-indexer.yml' + pull_request: + branches: [ main, develop ] + paths: + - 'kernel-indexer/**' + - 'kernel_indexer' + - '.github/workflows/docker-build-indexer.yml' + workflow_dispatch: + +env: + REGISTRY: docker.io + IMAGE_NAME: ${{ secrets.DOCKERHUB_REPO }}/icrn-kernel-indexer + +jobs: + build: + runs-on: ubuntu-latest + permissions: + contents: read + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Log in to Docker Hub + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + + - name: Extract metadata + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.IMAGE_NAME }} + tags: | + type=ref,event=branch + type=ref,event=pr + type=sha,prefix={{branch}}- + type=raw,value=latest,enable={{is_default_branch}} + + - name: Build and push Docker image + id: build + uses: docker/build-push-action@v5 + with: + context: . 
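+          # Build context is the repo root (the checkout), so the Dockerfile can COPY the kernel_indexer script and kernel-indexer/entrypoint.sh by their repo-relative paths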
+ file: kernel-indexer/Dockerfile + push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + cache-from: type=gha + cache-to: type=gha,mode=max + + - name: Output image info + run: | + echo "Built and pushed image: ${{ steps.meta.outputs.tags }}" + echo "Image digest: ${{ steps.build.outputs.digest }}" + diff --git a/.github/workflows/docker-build-webserver.yml b/.github/workflows/docker-build-webserver.yml new file mode 100644 index 0000000..28ed729 --- /dev/null +++ b/.github/workflows/docker-build-webserver.yml @@ -0,0 +1,67 @@ +name: Kernel Webserver Build and Push + +on: + push: + branches: [ main, develop ] + paths: + - 'web/**' + - '.github/workflows/docker-build-webserver.yml' + pull_request: + branches: [ main, develop ] + paths: + - 'web/**' + - '.github/workflows/docker-build-webserver.yml' + workflow_dispatch: + +env: + REGISTRY: docker.io + IMAGE_NAME: ${{ secrets.DOCKERHUB_REPO }}/icrn-kernel-webserver + +jobs: + build: + runs-on: ubuntu-latest + permissions: + contents: read + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Log in to Docker Hub + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + + - name: Extract metadata + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.IMAGE_NAME }} + tags: | + type=ref,event=branch + type=ref,event=pr + type=sha,prefix={{branch}}- + type=raw,value=latest,enable={{is_default_branch}} + + - name: Build and push Docker image + id: build + uses: docker/build-push-action@v5 + with: + context: . + file: web/Dockerfile + push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + cache-from: type=gha + cache-to: type=gha,mode=max + + - name: Output image info + run: | + echo "Built and pushed image: ${{ steps.meta.outputs.tags }}" + echo "Image digest: ${{ steps.build.outputs.digest }}" + diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml deleted file mode 100644 index a3db2b1..0000000 --- a/.github/workflows/test.yml +++ /dev/null @@ -1,65 +0,0 @@ -name: Test ICRN Manager - -on: - push: - branches: [ main, develop ] - pull_request: - branches: [ main, develop ] - workflow_dispatch: - -jobs: - test: - runs-on: ubuntu-latest - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Install dependencies - run: | - sudo apt-get update - sudo apt-get install -y jq tar - - - name: Make scripts executable - run: | - chmod +x icrn_manager - chmod +x update_r_libs.sh - chmod +x tests/run_tests.sh - - - name: Run test suite - run: | - ./tests/run_tests.sh - - - name: Upload test results - uses: actions/upload-artifact@v4 - if: always() - with: - name: test-results - path: tests/test_results.log - - test-docker: - runs-on: ubuntu-latest - needs: test - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Build Docker image - run: | - docker build -t icrn-manager-test .github/docker/Rstudio/ - - - name: Run tests in Docker - run: | - docker run --rm \ - -v $(pwd):/workspace \ - -w /workspace \ - icrn-manager-test \ - bash -c "cd /workspace && ./tests/run_tests.sh" - - - name: Upload Docker test results - uses: actions/upload-artifact@v4 - if: always() - with: - name: docker-test-results - path: tests/test_results.log \ No newline at end of file diff --git a/kernel-indexer/.dockerignore 
b/kernel-indexer/.dockerignore new file mode 100644 index 0000000..a15fc41 --- /dev/null +++ b/kernel-indexer/.dockerignore @@ -0,0 +1,27 @@ +# Documentation +DESIGN.md +README.md +*.md + +# Git files +.git +.gitignore + +# Python cache +__pycache__ +*.pyc +*.pyo +*.pyd +.Python + +# IDE files +.vscode +.idea +*.swp +*.swo +*~ + +# OS files +.DS_Store +Thumbs.db + diff --git a/kernel-indexer/DESIGN.md b/kernel-indexer/DESIGN.md new file mode 100644 index 0000000..5d61998 --- /dev/null +++ b/kernel-indexer/DESIGN.md @@ -0,0 +1,414 @@ +# Kernel Indexer Docker Container - Design Document + +## Overview + +This document describes the design and approach for creating a Docker container that runs the kernel indexer script to index kernel repositories and generate JSON files for consumption by the web server. + +## Architecture + +### Components + +1. **Kernel Indexer Container** (this design) + - Runs the `kernel_indexer` bash script + - Indexes kernels from the repository + - Generates `collated_manifests.json` and `package_index.json` + - Designed to run as a Kubernetes CronJob + +2. **Web Server Container** (existing) + - Reads the generated JSON files + - Serves them via REST API + - Runs continuously + +### Data Flow + +``` +Kernel Repository (/sw/icrn/jupyter/icrn_ncsa_resources/Kernels) + ↓ (read-write mount) +Kernel Indexer Container (CronJob) + ↓ + ├─→ Index all kernels + │ └─→ Writes package_manifest.json to each kernel directory + │ (e.g., R/kernel_name/version/package_manifest.json) + └─→ Collate results + └─→ Writes to kernel repo root: + ├─→ collated_manifests.json + └─→ package_index.json + ↓ +Kernel Repository (updated with manifests and collated files) + ↓ (read-only or read-write mounts) + ├─→ Web Server Container + ├─→ Other Service Containers + └─→ Other Endpoints + All read the same files from kernel repository +``` + +## Directory Structure + +``` +kernel-indexer/ +├── Dockerfile # Container definition +├── entrypoint.sh # Main entrypoint script +├── README.md # Usage instructions +├── DESIGN.md # This file +└── .dockerignore # Files to exclude from build +``` + +## Container Design + +### Base Image + +- **Base**: `continuumio/miniconda3` or `condaforge/mambaforge` + - Provides `conda` command required by kernel_indexer + - Includes Python for any future enhancements + - Lightweight compared to full Anaconda + +### Dependencies + +1. **System packages**: + - `jq` - JSON processing (required by kernel_indexer) + - `bash` - Shell interpreter + - `findutils` - For `find` command + - `coreutils` - Standard Unix utilities + +2. **Conda**: + - Already included in base image + - Used by kernel_indexer to query kernel environments + +### File Organization + +1. **kernel_indexer script**: + - Copy from repo root (`../kernel_indexer`) + - Place at `/usr/local/bin/kernel_indexer` + - Make executable + +2. **Entrypoint script**: + - Handles execution logic + - Manages error handling and logging + - Configurable via environment variables + +### Volume Mounts + +The container will need access to: + +1. **Kernel Repository** (read-write): + - Mount: `/sw/icrn/jupyter/icrn_ncsa_resources/Kernels` + - Purpose: + - Source of kernels to index (read) + - Write `package_manifest.json` files into each kernel directory (write) + - Write `collated_manifests.json` and `package_index.json` to repo root (write) + - **Critical**: Must be read-write to allow writing manifests back to kernel directories + +## Execution Flow + +### Entrypoint Script Logic + +1. 
**Validation**: + - Check that `KERNEL_ROOT` directory exists (fail if missing - this is core infrastructure) + - Check that `KERNEL_ROOT` is mounted and accessible + - Verify `KERNEL_ROOT` is writable (needed for writing manifests) + - Verify `kernel_indexer` script is executable + - Check that `jq` and `conda` are available + - **Critical**: Do NOT attempt to create `KERNEL_ROOT` if missing - this indicates a serious infrastructure problem + +2. **Configuration**: + - Read `KERNEL_ROOT` from environment variable (default: `/sw/icrn/jupyter/icrn_ncsa_resources/Kernels`) + - Read `OUTPUT_DIR` from environment variable (default: `$KERNEL_ROOT` - write to kernel repo root) + - Read `LANGUAGE_FILTER` from environment variable (optional, for filtering by language) + - Determine if using separate output directory or kernel repo root + +3. **Execution**: + - Run: `kernel_indexer index --kernel-root $KERNEL_ROOT [--language $LANGUAGE_FILTER]` + - This writes `package_manifest.json` into each kernel directory + - Each manifest is written atomically by kernel_indexer script + - Run: `kernel_indexer collate --kernel-root $KERNEL_ROOT --output-dir $OUTPUT_DIR [--language $LANGUAGE_FILTER]` + - This creates `collated_manifests.json` and `package_index.json` in output directory + - **Atomic Writes for Collated Files**: + - Write to temporary files first: `collated_manifests.json.tmp` and `package_index.json.tmp` + - Validate JSON structure using `jq` + - Atomically rename: `mv collated_manifests.json.tmp collated_manifests.json` + - This ensures other services never read partially-written files + +4. **Error Handling** (Fail-Fast Strategy): + - Exit immediately with non-zero code on any error + - Do NOT retry or re-attempt within the same job run + - Let the cron schedule trigger the next attempt (likely hourly) + - Log errors to stderr for Kubernetes logging + - Exit codes indicate the type of failure for debugging + +5. **Output**: + - Log progress to stdout + - Log summary statistics (kernels indexed, packages found, etc.) + +## Environment Variables + +| Variable | Default | Description | +|----------|---------|-------------| +| `KERNEL_ROOT` | `/sw/icrn/jupyter/icrn_ncsa_resources/Kernels` | Path to kernel repository root (must be read-write) | +| `OUTPUT_DIR` | (same as `KERNEL_ROOT`) | Directory where collated JSON files will be written. Defaults to kernel repo root. | +| `LANGUAGE_FILTER` | (empty) | Optional: Filter by language (R, Python, etc.) 
| +| `LOG_LEVEL` | `INFO` | Logging verbosity (DEBUG, INFO, WARN, ERROR) | +| `ATOMIC_WRITES` | `true` | Use atomic writes for collated files (write to temp, then rename) | + +## Kubernetes Integration + +### CronJob Configuration + +**Option 1: Write to Kernel Repository Root (Recommended)** + +```yaml +apiVersion: batch/v1 +kind: CronJob +metadata: + name: kernel-indexer +spec: + schedule: "0 2 * * *" # Run daily at 2 AM + jobTemplate: + spec: + template: + spec: + containers: + - name: kernel-indexer + image: icrn-kernel-indexer:latest + env: + - name: KERNEL_ROOT + value: "/sw/icrn/jupyter/icrn_ncsa_resources/Kernels" + # OUTPUT_DIR defaults to KERNEL_ROOT, so collated files go to repo root + volumeMounts: + - name: kernel-repo + mountPath: /sw/icrn/jupyter/icrn_ncsa_resources/Kernels + # readOnly: false (default) - needed to write manifests + volumes: + - name: kernel-repo + hostPath: + path: /sw/icrn/jupyter/icrn_ncsa_resources/Kernels + type: Directory + # Note: Directory type requires the path to exist - will fail if missing + # This is intentional as this is core infrastructure that must be present + restartPolicy: Never + # Never retry on failure - fail fast and let cron schedule handle next attempt +``` + +### Volume Strategy + +**Primary Strategy: Kernel Repository as Single Source of Truth** + +Since the indexer writes `package_manifest.json` files back into kernel directories AND creates collated files, the kernel repository becomes the single source of truth: + +1. **Kernel Repository Mount**: + - **Indexer**: Mount as read-write (to write manifests and collated files) + - **Web Server**: Mount as read-only (only needs to read files) + - **Other Services**: Mount as read-only or read-write depending on needs + - All services read from the same location, ensuring consistency + +2. **File Locations**: + - Individual manifests: `$KERNEL_ROOT/R/kernel_name/version/package_manifest.json` + - Collated files: `$KERNEL_ROOT/collated_manifests.json` and `$KERNEL_ROOT/package_index.json` + +3. 
**Data Synchronization**: + - No explicit sync needed - all services mount the same kernel repository + - Files are written atomically (write to temp, then rename) to prevent partial reads + - Web server's hourly auto-reload will pick up new files automatically + + +## Error Handling and Logging + +### Fail-Fast Strategy + +The indexer implements a **fail-fast** approach: +- **No retries**: On any error, the job exits immediately with a non-zero code +- **No re-attempts**: The job does not retry within the same execution +- **Cron-driven recovery**: The next scheduled run (likely hourly) will attempt indexing again +- **Immediate failure**: Validation errors cause immediate exit before any indexing begins +- **Partial failure handling**: If indexing fails partway through, exit immediately (don't attempt collation) + +This approach ensures: +- Problems are surfaced immediately rather than masked by retries +- Resource usage is predictable (no runaway retry loops) +- The cron schedule provides natural backoff and retry mechanism +- Logs clearly show what failed without retry noise + +### Exit Codes + +- `0`: Success - indexing and collation completed successfully +- `1`: General error - check logs for details +- `2`: Missing dependencies - jq or conda not found +- `3`: Kernel root not accessible or missing (infrastructure problem) +- `4`: Indexing failed - one or more kernels failed to index +- `5`: Collation failed - indexing succeeded but collation failed + +### Logging Strategy + +- **stdout**: Progress information, summary statistics +- **stderr**: Errors, warnings +- **Format**: Structured logging with timestamps +- **Kubernetes**: Logs captured automatically via container logs +- **On failure**: Clear error messages indicating what failed and why + +## Performance Considerations + +1. **Indexing Time**: + - Depends on number of kernels + - Each kernel requires conda environment activation + - Consider parallelization for large repositories (future enhancement) + +2. **Resource Requirements**: + - Memory: 2-4 GB (for conda operations) + - CPU: 1-2 cores (mostly I/O bound) + - Disk: Minimal (only script and temp files) + +3. **Caching**: + - Kernel indexer already creates `package_manifest.json` in each kernel directory + - Re-indexing only updates changed kernels (if implemented) + - Current design: Full re-index on each run + +## Security Considerations + +1. **Kernel Repository Write Access**: + - Indexer container needs read-write access to write `package_manifest.json` files + - Other containers (web server, etc.) can use read-only mounts + - Consider file ownership and permissions to prevent unauthorized writes + +2. **Atomic Writes**: + - Use atomic write pattern for collated files (write to `.tmp`, then `mv` to final name) + - Prevents other services from reading partially-written files + - Entrypoint script should implement this if kernel_indexer doesn't + +3. **File Permissions**: + - Ensure indexer can write to kernel directories + - Ensure web server and other services can read the files + - Consider using group permissions or specific user IDs + +4. **Container Security**: + - Run as non-root user if possible (may require permission adjustments) + - Minimal base image reduces attack surface + - Limit container capabilities if possible + +## Testing Strategy + +1. **Local Testing**: + - Build Docker image locally + - Test with sample kernel repository + - Verify JSON output format + +2. 
**Integration Testing**: + - Test with actual kernel repository + - Verify web server can read generated files + - Test error scenarios (missing kernels, invalid paths) + +3. **Kubernetes Testing**: + - Deploy as CronJob in test cluster + - Verify scheduling and execution + - Check logs and output files + + + +## Implementation Checklist + +- [ ] Create Dockerfile +- [ ] Create entrypoint.sh script +- [ ] Add error handling and logging +- [ ] Test locally with sample data +- [ ] Create README.md with usage instructions +- [ ] Add .dockerignore file +- [ ] Test Kubernetes CronJob deployment +- [ ] Document volume mounting strategy +- [ ] Add example Kubernetes manifests + +## Data Sharing and Synchronization + +### Challenge +Multiple containers and endpoints need access to: +1. Individual `package_manifest.json` files (in each kernel directory) +2. Collated files (`collated_manifests.json` and `package_index.json`) + +### Solution: Kernel Repository as Single Source + +**Write Strategy**: +- Indexer writes all files to kernel repository +- Individual manifests: `$KERNEL_ROOT/{R,Python}/kernel_name/version/package_manifest.json` + - Written directly by `kernel_indexer index` command + - Each file is written atomically (kernel_indexer uses `jq` to write JSON) +- Collated files: `$KERNEL_ROOT/collated_manifests.json` and `$KERNEL_ROOT/package_index.json` + - Written by `kernel_indexer collate` command + - Entrypoint script wraps this with additional atomic write protection + +**Read Strategy**: +- All services mount the same kernel repository +- Web server: Read-only mount (configurable path via env vars) +- Other services: Read-only or read-write mounts as needed +- No explicit synchronization needed - all read from same source +- Filesystem-level consistency ensures all services see the same data + +**Atomic Write Implementation** (in entrypoint.sh): + +```bash +# Function to write collated file atomically +atomic_write_collated() { + local output_file=$1 + local temp_file="${output_file}.tmp" + + # kernel_indexer writes to temp file first + # Then we validate and rename atomically + if [ -f "$temp_file" ]; then + # Validate JSON + if jq '.' "$temp_file" >/dev/null 2>&1; then + # Atomic rename (single filesystem operation) + mv "$temp_file" "$output_file" + return 0 + else + echo "ERROR: Invalid JSON in $temp_file" >&2 + rm -f "$temp_file" + return 1 + fi + fi + return 1 +} + +# After collate command: +atomic_write_collated "$OUTPUT_DIR/collated_manifests.json" +atomic_write_collated "$OUTPUT_DIR/package_index.json" +``` + +**Alternative: Direct Atomic Write**: +- Modify entrypoint to redirect kernel_indexer output to temp files +- Validate and rename atomically +- Ensures no partial reads by other services + +**Web Server Configuration**: +- Update web server to read from kernel repository mount +- Option 1: Mount kernel repo to `/app/data` and use default paths +- Option 2: Use environment variables: + - `COLLATED_MANIFESTS_PATH=/sw/icrn/jupyter/icrn_ncsa_resources/Kernels/collated_manifests.json` + - `PACKAGE_INDEX_PATH=/sw/icrn/jupyter/icrn_ncsa_resources/Kernels/package_index.json` + +**Concurrency Considerations**: +- If multiple indexers could run simultaneously, add lock file mechanism +- Use `flock` or similar to ensure only one indexer runs at a time +- Lock file: `$KERNEL_ROOT/.indexing.lock` + +## Questions to Resolve + +1. **Output Location**: + - Should output be written to kernel repo root or separate location? 
+ - **Resolution**: Default to kernel repo root (simpler, single source of truth). Allow override via `OUTPUT_DIR` if needed. + +2. **Language Filtering**: + - Should the cron job index all languages or be configurable? + - **Resolution**: Make it configurable via `LANGUAGE_FILTER` env var, default to all languages + +3. **Indexing Strategy**: + - Full re-index every time or incremental? + - **Resolution**: Start with full re-index (simpler), enhance later if needed + +4. **Failure Handling**: + - What happens if indexing fails partially? + - **Resolution**: Fail fast - exit immediately on any error with non-zero code. No retries within the same job run. The cron schedule (likely hourly) will trigger the next attempt. Partial writes to individual manifests are acceptable (they'll be overwritten on next successful run). Kubernetes `restartPolicy: Never` ensures no automatic retries. + +5. **Web Server Refresh**: + - How does web server know to reload files? + - **Resolution**: Web server already has hourly auto-reload, plus manual refresh endpoint. Files are written atomically so reloads are safe. + +6. **File Locking**: + - Do we need file locking during writes? + - **Resolution**: Atomic writes (temp + rename) should be sufficient. If multiple indexers run concurrently, consider adding lock file mechanism. + diff --git a/kernel-indexer/Dockerfile b/kernel-indexer/Dockerfile new file mode 100644 index 0000000..82f34f8 --- /dev/null +++ b/kernel-indexer/Dockerfile @@ -0,0 +1,31 @@ +FROM continuumio/miniconda3:latest + +# Install system dependencies +RUN apt-get update && \ + apt-get install -y \ + jq \ + bash \ + findutils \ + coreutils \ + && \ + rm -rf /var/lib/apt/lists/* + +# Set working directory +WORKDIR /app + +# Copy kernel_indexer script from repo root +COPY kernel_indexer /usr/local/bin/kernel_indexer + +# Make kernel_indexer executable +RUN chmod +x /usr/local/bin/kernel_indexer + +# Copy entrypoint script from kernel-indexer directory +COPY kernel-indexer/entrypoint.sh /app/entrypoint.sh + +# Fix line endings and make executable +RUN sed -i 's/\r$//' /app/entrypoint.sh && \ + chmod +x /app/entrypoint.sh + +# Set entrypoint +ENTRYPOINT ["/app/entrypoint.sh"] + diff --git a/kernel-indexer/README.md b/kernel-indexer/README.md new file mode 100644 index 0000000..b4b8d55 --- /dev/null +++ b/kernel-indexer/README.md @@ -0,0 +1,221 @@ +# Kernel Indexer Docker Container + +This Docker container runs the kernel indexer script to index kernel repositories and generate JSON files for consumption by the web server and other services. + +## Overview + +The kernel indexer container: +- Indexes all kernels in the repository (creates `package_manifest.json` in each kernel directory) +- Collates results into two JSON files: `collated_manifests.json` and `package_index.json` +- Designed to run as a Kubernetes CronJob +- Implements fail-fast error handling (no retries within the same run) + +## Building the Image + +From the repository root: + +```bash +docker build -t icrn-kernel-indexer:latest -f kernel-indexer/Dockerfile . +``` + +Or from the `kernel-indexer` directory, keeping the repository root as the build context (the Dockerfile copies `kernel_indexer` and `kernel-indexer/entrypoint.sh` by repo-relative paths, so building with `.` as the context from this directory would fail): + +```bash +docker build -t icrn-kernel-indexer:latest -f Dockerfile ..
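+# Optionally tag and push the image by hand; "<dockerhub-repo>" below is a placeholder for +# the Docker Hub namespace that the CI workflow reads from the DOCKERHUB_REPO secret: +# docker tag icrn-kernel-indexer:latest <dockerhub-repo>/icrn-kernel-indexer:latest +# docker push <dockerhub-repo>/icrn-kernel-indexer:latest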
+``` + +## Running Locally + +### Basic Usage + +```bash +docker run --rm \ + -v /sw/icrn/jupyter/icrn_ncsa_resources/Kernels:/sw/icrn/jupyter/icrn_ncsa_resources/Kernels \ + icrn-kernel-indexer:latest +``` + +### With Custom Configuration + +```bash +docker run --rm \ + -v /sw/icrn/jupyter/icrn_ncsa_resources/Kernels:/sw/icrn/jupyter/icrn_ncsa_resources/Kernels \ + -e KERNEL_ROOT=/sw/icrn/jupyter/icrn_ncsa_resources/Kernels \ + -e LANGUAGE_FILTER=Python \ + -e LOG_LEVEL=DEBUG \ + icrn-kernel-indexer:latest +``` + +## Environment Variables + +| Variable | Default | Description | +|----------|---------|-------------| +| `KERNEL_ROOT` | `/sw/icrn/jupyter/icrn_ncsa_resources/Kernels` | Path to kernel repository root (must be read-write) | +| `OUTPUT_DIR` | (same as `KERNEL_ROOT`) | Directory where collated JSON files will be written | +| `LANGUAGE_FILTER` | (empty) | Optional: Filter by language (R, Python, etc.). If omitted, processes all languages | +| `LOG_LEVEL` | `INFO` | Logging verbosity: `DEBUG`, `INFO`, `WARN`, or `ERROR` | +| `ATOMIC_WRITES` | `true` | Use atomic writes for collated files (write to temp, then rename) | + +## Kubernetes Deployment + +### CronJob Example + +```yaml +apiVersion: batch/v1 +kind: CronJob +metadata: + name: kernel-indexer +spec: + schedule: "0 * * * *" # Run hourly + jobTemplate: + spec: + template: + spec: + containers: + - name: kernel-indexer + image: icrn-kernel-indexer:latest + env: + - name: KERNEL_ROOT + value: "/sw/icrn/jupyter/icrn_ncsa_resources/Kernels" + # OUTPUT_DIR defaults to KERNEL_ROOT, so collated files go to repo root + volumeMounts: + - name: kernel-repo + mountPath: /sw/icrn/jupyter/icrn_ncsa_resources/Kernels + # readOnly: false (default) - needed to write manifests + volumes: + - name: kernel-repo + hostPath: + path: /sw/icrn/jupyter/icrn_ncsa_resources/Kernels + type: Directory + # Note: Directory type requires the path to exist - will fail if missing + # This is intentional as this is core infrastructure that must be present + restartPolicy: Never + # Never retry on failure - fail fast and let cron schedule handle next attempt +``` + +### With Language Filter + +```yaml +apiVersion: batch/v1 +kind: CronJob +metadata: + name: kernel-indexer-python +spec: + schedule: "0 2 * * *" # Run daily at 2 AM + jobTemplate: + spec: + template: + spec: + containers: + - name: kernel-indexer + image: icrn-kernel-indexer:latest + env: + - name: KERNEL_ROOT + value: "/sw/icrn/jupyter/icrn_ncsa_resources/Kernels" + - name: LANGUAGE_FILTER + value: "Python" + volumeMounts: + - name: kernel-repo + mountPath: /sw/icrn/jupyter/icrn_ncsa_resources/Kernels + volumes: + - name: kernel-repo + hostPath: + path: /sw/icrn/jupyter/icrn_ncsa_resources/Kernels + type: Directory + restartPolicy: Never +``` + +## Output Files + +The indexer creates the following files: + +1. **Individual Manifests**: `$KERNEL_ROOT/{R,Python}/kernel_name/version/package_manifest.json` + - One file per kernel version + - Contains package list for that specific kernel + +2. 
**Collated Files** (in `OUTPUT_DIR`, default: `KERNEL_ROOT`): + - `collated_manifests.json` - Kernel-centric index (list of all kernels) + - `package_index.json` - Package-centric index (which kernels contain each package) + +## Error Handling + +The container implements a **fail-fast** strategy: +- Exits immediately on any error (no retries) +- Exit codes indicate the type of failure: + - `0`: Success + - `1`: General error + - `2`: Missing dependencies (jq, conda, or kernel_indexer) + - `3`: Kernel root validation failed + - `4`: Indexing phase failed + - `5`: Collation phase failed +- The cron schedule handles retries (next scheduled run) + +## Logging + +Logs are written to stdout and stderr: +- **stdout**: Progress information, summary statistics +- **stderr**: Errors and warnings +- Format: `[TIMESTAMP] [LEVEL] MESSAGE` + +Set `LOG_LEVEL=DEBUG` for verbose output during troubleshooting. + +## Troubleshooting + +### Container Fails to Start + +**Check kernel repository mount:** +```bash +kubectl describe pod +# Look for volume mount errors +``` + +**Verify directory exists on host:** +```bash +ls -ld /sw/icrn/jupyter/icrn_ncsa_resources/Kernels +``` + +### Indexing Fails + +**Check logs:** +```bash +kubectl logs +``` + +**Common issues:** +- Kernel repository not writable: Check permissions +- Missing conda environments: Verify kernel directories contain valid conda environments +- Network issues: If kernels are on network storage, check connectivity + +### Collation Fails + +**Check if indexing completed:** +```bash +# Look for package_manifest.json files in kernel directories +find /sw/icrn/jupyter/icrn_ncsa_resources/Kernels -name package_manifest.json | head -5 +``` + +**Validate JSON files:** +```bash +jq '.' /sw/icrn/jupyter/icrn_ncsa_resources/Kernels/collated_manifests.json +jq '.' /sw/icrn/jupyter/icrn_ncsa_resources/Kernels/package_index.json +``` + +## Integration with Web Server + +The web server container should mount the same kernel repository (read-only) and configure paths: + +```yaml +env: +- name: COLLATED_MANIFESTS_PATH + value: "/sw/icrn/jupyter/icrn_ncsa_resources/Kernels/collated_manifests.json" +- name: PACKAGE_INDEX_PATH + value: "/sw/icrn/jupyter/icrn_ncsa_resources/Kernels/package_index.json" +``` + +Or mount the kernel repo to `/app/data` and use default paths. 
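+ +For local testing, a minimal sketch of running the web server against the same mount (the `icrn-kernel-webserver` image name follows the CI workflow; the listening port `8080` and reliance on these env vars are assumptions about the web server, so adjust them to its actual configuration): + +```bash +docker run --rm -p 8080:8080 \ + -v /sw/icrn/jupyter/icrn_ncsa_resources/Kernels:/sw/icrn/jupyter/icrn_ncsa_resources/Kernels:ro \ + -e COLLATED_MANIFESTS_PATH=/sw/icrn/jupyter/icrn_ncsa_resources/Kernels/collated_manifests.json \ + -e PACKAGE_INDEX_PATH=/sw/icrn/jupyter/icrn_ncsa_resources/Kernels/package_index.json \ + icrn-kernel-webserver:latest +```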
+ +## See Also + +- [DESIGN.md](DESIGN.md) - Detailed design document +- [../kernel_indexer](../kernel_indexer) - The kernel indexer script itself +- [../web/](../web/) - Web server that consumes the generated files + diff --git a/kernel-indexer/entrypoint.sh b/kernel-indexer/entrypoint.sh new file mode 100644 index 0000000..d5e291b --- /dev/null +++ b/kernel-indexer/entrypoint.sh @@ -0,0 +1,232 @@ +#!/bin/bash +set -euo pipefail + +# Exit codes +EXIT_SUCCESS=0 +EXIT_GENERAL_ERROR=1 +EXIT_MISSING_DEPS=2 +EXIT_KERNEL_ROOT_INVALID=3 +EXIT_INDEX_FAILED=4 +EXIT_COLLATE_FAILED=5 + +# Default configuration +DEFAULT_KERNEL_ROOT="/sw/icrn/jupyter/icrn_ncsa_resources/Kernels" + +# Environment variables with defaults +KERNEL_ROOT="${KERNEL_ROOT:-${DEFAULT_KERNEL_ROOT}}" +OUTPUT_DIR="${OUTPUT_DIR:-${KERNEL_ROOT}}" +LANGUAGE_FILTER="${LANGUAGE_FILTER:-}" +LOG_LEVEL="${LOG_LEVEL:-INFO}" +ATOMIC_WRITES="${ATOMIC_WRITES:-true}" + +# Logging function +log() { + local level="$1" + shift + local message="$*" + local timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ") + echo "[${timestamp}] [${level}] ${message}" +} + +log_info() { + if [[ "${LOG_LEVEL}" == "DEBUG" ]] || [[ "${LOG_LEVEL}" == "INFO" ]]; then + log "INFO" "$@" + fi +} + +log_error() { + log "ERROR" "$@" >&2 +} + +log_warn() { + if [[ "${LOG_LEVEL}" != "ERROR" ]]; then + log "WARN" "$@" >&2 + fi +} + +log_debug() { + if [[ "${LOG_LEVEL}" == "DEBUG" ]]; then + log "DEBUG" "$@" + fi +} + +# Validation functions +check_dependencies() { + log_info "Checking dependencies..." + + if ! command -v jq &> /dev/null; then + log_error "jq is not installed or not in PATH" + exit $EXIT_MISSING_DEPS + fi + + if ! command -v conda &> /dev/null; then + log_error "conda is not installed or not in PATH" + exit $EXIT_MISSING_DEPS + fi + + if ! command -v kernel_indexer &> /dev/null; then + log_error "kernel_indexer script is not found or not in PATH" + exit $EXIT_MISSING_DEPS + fi + + if [ ! -x "$(command -v kernel_indexer)" ]; then + log_error "kernel_indexer script is not executable" + exit $EXIT_MISSING_DEPS + fi + + log_info "All dependencies found" +} + +validate_kernel_root() { + log_info "Validating kernel root: ${KERNEL_ROOT}" + + # Check if directory exists (do NOT create it - this is core infrastructure) + if [ ! -d "${KERNEL_ROOT}" ]; then + log_error "Kernel root directory does not exist: ${KERNEL_ROOT}" + log_error "This is core infrastructure - if missing, something is seriously wrong" + exit $EXIT_KERNEL_ROOT_INVALID + fi + + # Check if directory is readable + if [ ! -r "${KERNEL_ROOT}" ]; then + log_error "Kernel root directory is not readable: ${KERNEL_ROOT}" + exit $EXIT_KERNEL_ROOT_INVALID + fi + + # Check if directory is writable (needed for writing manifests) + if [ ! -w "${KERNEL_ROOT}" ]; then + log_error "Kernel root directory is not writable: ${KERNEL_ROOT}" + log_error "Write access is required to create package_manifest.json files" + exit $EXIT_KERNEL_ROOT_INVALID + fi + + log_info "Kernel root validation passed" +} + +validate_output_dir() { + log_info "Validating output directory: ${OUTPUT_DIR}" + + # Check if output directory exists + if [ ! -d "${OUTPUT_DIR}" ]; then + log_error "Output directory does not exist: ${OUTPUT_DIR}" + exit $EXIT_KERNEL_ROOT_INVALID + fi + + # Check if output directory is writable + if [ ! 
-w "${OUTPUT_DIR}" ]; then + log_error "Output directory is not writable: ${OUTPUT_DIR}" + exit $EXIT_KERNEL_ROOT_INVALID + fi + + log_info "Output directory validation passed" +} + +# Validate collated file +validate_collated_file() { + local output_file="$1" + local file_description="$2" + + log_info "Validating ${file_description}: ${output_file}" + + # Check if file exists + if [ ! -f "${output_file}" ]; then + log_error "${file_description} not found: ${output_file}" + return 1 + fi + + # Validate JSON structure + if ! jq '.' "${output_file}" >/dev/null 2>&1; then + log_error "Invalid JSON in ${file_description}: ${output_file}" + return 1 + fi + + # Get file size for logging (portable approach) + local file_size + if command -v stat &> /dev/null; then + file_size=$(stat -f%z "${output_file}" 2>/dev/null || stat -c%s "${output_file}" 2>/dev/null || wc -c < "${output_file}" 2>/dev/null || echo "unknown") + else + file_size=$(wc -c < "${output_file}" 2>/dev/null || echo "unknown") + fi + log_info "${file_description} validated successfully (size: ${file_size} bytes)" + return 0 +} + +# Main execution +main() { + log_info "Starting kernel indexer container" + log_info "KERNEL_ROOT: ${KERNEL_ROOT}" + log_info "OUTPUT_DIR: ${OUTPUT_DIR}" + if [ -n "${LANGUAGE_FILTER}" ]; then + log_info "LANGUAGE_FILTER: ${LANGUAGE_FILTER}" + else + log_info "LANGUAGE_FILTER: (all languages)" + fi + + # Validation phase + check_dependencies + validate_kernel_root + validate_output_dir + + # Build index command + local index_cmd="kernel_indexer index --kernel-root '${KERNEL_ROOT}'" + if [ -n "${LANGUAGE_FILTER}" ]; then + index_cmd="${index_cmd} --language '${LANGUAGE_FILTER}'" + fi + + # Build collate command + local collate_cmd="kernel_indexer collate --kernel-root '${KERNEL_ROOT}' --output-dir '${OUTPUT_DIR}'" + if [ -n "${LANGUAGE_FILTER}" ]; then + collate_cmd="${collate_cmd} --language '${LANGUAGE_FILTER}'" + fi + + # Execute indexing phase + log_info "Starting indexing phase..." + log_debug "Command: ${index_cmd}" + + if eval "${index_cmd}"; then + log_info "Indexing phase completed successfully" + else + local exit_code=$? + log_error "Indexing phase failed with exit code: ${exit_code}" + exit $EXIT_INDEX_FAILED + fi + + # Execute collation phase + log_info "Starting collation phase..." + log_debug "Command: ${collate_cmd}" + + # If atomic writes are enabled, we need to intercept the output + # kernel_indexer writes directly, so we'll validate after + if eval "${collate_cmd}"; then + log_info "Collation command completed" + + # Validate collated files + log_info "Validating collated output files..." + + local collated_manifests="${OUTPUT_DIR}/collated_manifests.json" + local package_index="${OUTPUT_DIR}/package_index.json" + + if ! validate_collated_file "${collated_manifests}" "collated manifests"; then + exit $EXIT_COLLATE_FAILED + fi + + if ! validate_collated_file "${package_index}" "package index"; then + exit $EXIT_COLLATE_FAILED + fi + + log_info "All collated files validated successfully" + + log_info "Collation phase completed successfully" + else + local exit_code=$? + log_error "Collation phase failed with exit code: ${exit_code}" + exit $EXIT_COLLATE_FAILED + fi + + log_info "Kernel indexing completed successfully" + exit $EXIT_SUCCESS +} + +# Run main function +main "$@" +