Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
25371db
add and move generate_sandbox_dockerfile script to scripts dir
Apr 16, 2025
ebae3c6
move generate_enml_project.py to scripts dir
Apr 16, 2025
8104d8e
update function to set project name in Dockerfile to base name, and n…
Apr 16, 2025
99ba784
generate Dockerfile for oncoclear as an example
Apr 16, 2025
398c4e4
update docker parent image to zenmldocker/zenml-sandbox
Apr 16, 2025
bfd8c23
use complete env variable key in template
Apr 16, 2025
4938493
update dockerfile generator script to use uv, handle pyproject.toml, …
Apr 18, 2025
b1ebc93
add tomli to pyproject.toml to parse toml files
Apr 18, 2025
434f7cb
generate updated Dockerfile.sandbox for omnireader
Apr 18, 2025
6d568df
update script to not generate a .env file if it didn't exist, and don't…
Apr 18, 2025
ec8f24e
use uv binary from distroless Docker image instead of installing uv v…
Apr 20, 2025
8ff3b48
generate updated Dockerfile.sandbox files
Apr 20, 2025
ce252c3
change base image name to zenmldocker/zenml-projects:base
Apr 20, 2025
aa062eb
Merge branch 'main' into add-sandbox-dockerfile-generator-script
strickvl Apr 20, 2025
b932f3e
add workflow file
Apr 20, 2025
f22584c
rename sandbox to codespace
Apr 21, 2025
22ebb93
delete generate_zenml_project.py
Apr 21, 2025
ed92bdb
revert base image name to zenmldocker/zenml-sandbox
Apr 22, 2025
5020fa9
bump python version in pyproject.toml and replace tomli with tomllib
Apr 22, 2025
fd25eb0
Use UTC timestamp as Docker image tag in GH action workflow
Apr 24, 2025
7f9911e
split workflow into encapsulated jobs to avoid redundant dockerfile_e…
Apr 24, 2025
480bffd
update run command to use updated script name and path
Apr 24, 2025
761af1d
rename filename: sandbox -> codespace
Apr 24, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 21 additions & 18 deletions omni-reader/Dockerfile.sandbox
Original file line number Diff line number Diff line change
@@ -1,44 +1,47 @@
# Sandbox base image
FROM safoinext/zenml-sandbox:latest
FROM zenmldocker/zenml-sandbox:latest

# Project metadata
LABEL project_name="omni-reader"
LABEL project_version="0.1.0"

# Install project-specific dependencies
# Install polars-lts-cpu instead of polars (version compiled for CPU compatibility)
RUN pip install --no-cache-dir \
"instructor==1.7.7" \
"jiwer==3.0.5" \
"jiter==0.8.2" \
"instructor" \
"jiwer" \
"jiter" \
"importlib-metadata<7.0,>=1.4.0" \
"litellm==1.64.1" \
"litellm" \
"mistralai==1.0.3" \
"numpy<2.0,>=1.9.0" \
"openai==1.69.0" \
"Pillow==11.1.0" \
"polars-lts-cpu==1.26.0" \
"pyarrow>=7.0.0" \
"python-dotenv==1.0.1" \
"python-dotenv" \
"streamlit==1.44.0" \
"pydantic>=2.8.2,<2.9.0" \
"tqdm==4.66.4" \
"zenml>=0.80.0" \
uv
"zenml>=0.80.0"

# Set workspace directory
WORKDIR /workspace

# Clone only the omni-reader directory and reorganize
# Clone only the project directory and reorganize
RUN git clone --depth 1 https://github.com/zenml-io/zenml-projects.git /tmp/zenml-projects && \
cp -r /tmp/zenml-projects/omni-reader/* /workspace/ && \
rm -rf /tmp/zenml-projects

# Create a template .env file for API keys
RUN echo "OPENAI_API_KEY=YOUR_OPENAI_API_KEY_HERE" > .env && \
echo "MISTRAL_API_KEY=YOUR_MISTRAL_API_KEY_HERE" >> .env
RUN echo "OPENAI_API_KEY=YOUR_OPENAI_API_KEY" && \
echo "MISTRAL_API_KEY=YOUR_MISTRAL_API_KEY" > .env

# Create a .vscode directory (mainly to auto-apply the dark theme)
RUN mkdir -p /workspace/.vscode
# Copy settings file
COPY settings.json /workspace/.vscode/settings.json
# Create a .vscode directory and settings.json file
RUN mkdir -p /workspace/.vscode && \
echo '{\n'\
' "workbench.colorTheme": "Default Dark Modern"\n'\
'}' > /workspace/.vscode/settings.json

# Set environment variable to skip CPU checks for Polars as a fallback
ENV POLARS_SKIP_CPU_CHECK=1

# Set environment variables for compatibility and performance
ENV POLARS_SKIP_CPU_CHECK=1
35 changes: 35 additions & 0 deletions oncoclear/Dockerfile.sandbox
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Sandbox base image
FROM zenmldocker/zenml-sandbox:latest

# Project metadata
LABEL project_name="oncoclear"
LABEL project_version="0.1.0"

# Install project-specific dependencies
RUN pip install --no-cache-dir \
    "zenml[server]>=0.50.0" \
    "notebook" \
    "scikit-learn" \
    "pyarrow" \
    "pandas"

# Set workspace directory
WORKDIR /workspace

# Clone only the project directory and reorganize
RUN git clone --depth 1 https://github.com/zenml-io/zenml-projects.git /tmp/zenml-projects && \
    cp -r /tmp/zenml-projects/oncoclear/* /workspace/ && \
    rm -rf /tmp/zenml-projects

# Create a template .env file for API keys
# BUG FIX: every echo must be redirected into .env; previously only the last
# one was, so the first two placeholder lines went to stdout and were lost.
RUN echo "ZENML_PROJECT_SECRET_NAME=YOUR_ZENML_PROJECT_SECRET_NAME" > .env && \
    echo "ZENML_STORE_URL=YOUR_ZENML_STORE_URL" >> .env && \
    echo "ZENML_STORE_API_KEY=YOUR_ZENML_STORE_API_KEY" >> .env

# Create a .vscode directory and settings.json file
RUN mkdir -p /workspace/.vscode && \
    echo '{\n'\
    ' "workbench.colorTheme": "Default Dark Modern"\n'\
    '}' > /workspace/.vscode/settings.json


224 changes: 224 additions & 0 deletions scripts/generate_sandbox_dockerfile.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
#!/usr/bin/env python3

"""Script to generate Dockerfile.sandbox files for ZenML projects.

This ensures consistency across all project Docker images.
"""

import argparse
import os
import re
import sys
from pathlib import Path

DOCKERFILE_TEMPLATE = """# Sandbox base image
FROM zenmldocker/zenml-sandbox:latest

# Project metadata
LABEL project_name="{project_name}"
LABEL project_version="0.1.0"

# Install project-specific dependencies
RUN pip install --no-cache-dir \\
{dependencies}

# Set workspace directory
WORKDIR /workspace

# Clone only the project directory and reorganize
RUN git clone --depth 1 https://github.com/zenml-io/zenml-projects.git /tmp/zenml-projects && \\
cp -r /tmp/zenml-projects/{project_name}/* /workspace/ && \\
rm -rf /tmp/zenml-projects

# Create a template .env file for API keys
RUN echo "{api_vars}" > .env

# Create a .vscode directory and settings.json file
RUN mkdir -p /workspace/.vscode && \\
echo '{{\\n'\\
' "workbench.colorTheme": "Default Dark Modern"\\n'\\
'}}' > /workspace/.vscode/settings.json

{env_vars_block}
"""


def format_env_key(key):
    """Return a ``KEY=placeholder`` line for a template .env file.

    Most keys get a generic ``YOUR_<KEY>`` placeholder; keys listed in the
    override table get a more descriptive one.
    """
    # Keys whose placeholder should not follow the generic YOUR_<KEY> pattern.
    overrides = {
        "GOOGLE_APPLICATION_CREDENTIALS": "PATH_TO_YOUR_GOOGLE_CREDENTIALS_FILE",
    }
    placeholder = overrides.get(key, f"YOUR_{key}")
    return f"{key}={placeholder}"


def parse_requirements(project_dir):
    """Parse requirements.txt file if it exists.

    Blank lines and ``#`` comments are skipped.  A plain ``polars``
    requirement is swapped for ``polars-lts-cpu`` (the CPU-compatible build)
    while preserving its version specifier.

    BUG FIX: the old unconditional ``line.replace("polars", ...)`` corrupted a
    requirement that was *already* ``polars-lts-cpu`` into
    ``polars-lts-cpu-lts-cpu``; only the package-name prefix is rewritten now.

    Args:
        project_dir: Directory expected to contain a requirements.txt.

    Returns:
        List of requirement strings; empty if no requirements.txt exists.
    """
    req_file = Path(project_dir) / "requirements.txt"
    if not req_file.exists():
        print(f"Warning: No requirements.txt found in {project_dir}")
        return []

    dependencies = []
    with open(req_file, "r") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            if line.startswith("polars") and not line.startswith("polars-lts-cpu"):
                # Rewrite only the package name, keeping any specifier/extras.
                line = "polars-lts-cpu" + line[len("polars"):]
            dependencies.append(line)

    return dependencies


def detect_api_keys(project_dir):
    """Attempt to detect required API keys by scanning Python files.

    The lowercased content of every ``*.py`` file under *project_dir* is
    matched against the regexes below; matched keys are rendered as
    ``KEY=placeholder`` lines for the template .env file.

    Args:
        project_dir: Directory whose Python files are scanned recursively.

    Returns:
        Formatted env lines in first-detected order, or a single generic
        ``API_KEY=YOUR_API_KEY_HERE`` line when nothing matches.
    """
    api_patterns = {
        # LLM Provider API Keys
        "HF_TOKEN": r"huggingface|hf_token",
        "OPENAI_API_KEY": r"openai|gpt",
        "ANTHROPIC_API_KEY": r"anthropic|claude",
        "MISTRAL_API_KEY": r"mistral|mistralai",
        "GEMINI_API_KEY": r"gemini|google",
        # ZenML-specific API Keys and Environment Variables
        "ZENML_STORE_API_KEY": r"zenml.*api_key|zenml_store_api_key",
        "ZENML_STORE_URL": r"zenml_store_url|zenml.*url",
        "ZENML_PROJECT_SECRET_NAME": r"zenml.*secret|secret_name",
        "ZENML_HF_USERNAME": r"zenml_hf_username|hf_username",
        "ZENML_HF_SPACE_NAME": r"zenml_hf_space_name|hf_space_name",
        # Monitoring and Logging
        "LANGFUSE_PUBLIC_KEY": r"langfuse.*public",
        "LANGFUSE_SECRET_KEY": r"langfuse.*secret",
        "LANGFUSE_HOST": r"langfuse.*host",
        # Vector Databases
        "PINECONE_API_KEY": r"pinecone",
        "SUPABASE_USER": r"supabase.*user",
        "SUPABASE_PASSWORD": r"supabase.*password",
        "SUPABASE_HOST": r"supabase.*host",
        "SUPABASE_PORT": r"supabase.*port",
        # Cloud Provider Keys
        "AWS_ACCESS_KEY_ID": r"aws.*access|aws_access_key_id",
        "AWS_SECRET_ACCESS_KEY": r"aws.*secret|aws_secret_access_key",
        "AWS_SESSION_TOKEN": r"aws.*session|aws_session_token",
        "AWS_REGION": r"aws.*region|aws_region",
        "GOOGLE_APPLICATION_CREDENTIALS": r"google.*credentials",
        # Other Service-Specific Keys
        "FIFTYONE_LABELSTUDIO_API_KEY": r"fiftyone|labelstudio",
        "NEPTUNE_API_TOKEN": r"neptune",
        "GH_ACCESS_TOKEN": r"gh_access_token|github",
    }

    # A dict keeps insertion order, so the result is duplicate-free AND
    # deterministic (the old ``list(set(...))`` made the generated
    # Dockerfile's key order random from run to run).
    detected_keys = {}

    for py_file in Path(project_dir).glob("**/*.py"):
        with open(py_file, "r", encoding="utf-8", errors="ignore") as f:
            content = f.read().lower()
        for key, pattern in api_patterns.items():
            if key not in detected_keys and re.search(pattern, content):
                detected_keys[key] = True

    if not detected_keys:
        # BUG FIX: this fallback is already a complete KEY=VALUE line; the old
        # code pushed it through format_env_key too, producing
        # "API_KEY=YOUR_API_KEY_HERE=YOUR_API_KEY=YOUR_API_KEY_HERE".
        return ["API_KEY=YOUR_API_KEY_HERE"]

    return [format_env_key(key) for key in detected_keys]


def detect_env_variables(project_dir, dependencies):
    """Detect which environment variables are needed based on dependencies and content."""
    # project_dir is currently unused; kept for future content-based checks.
    needed = []
    lowered = [dep.lower() for dep in dependencies]

    # The generator installs polars-lts-cpu; the skip flag is a fallback so
    # the image still runs if a CPU-feature check trips anyway.
    if any("polars" in dep for dep in lowered):
        needed.append("POLARS_SKIP_CPU_CHECK=1")

    # Disable tokenizer parallelism when transformers/tokenizers are present.
    if any(dep.startswith(("transform", "token")) for dep in lowered):
        needed.append("TOKENIZERS_PARALLELISM=false")

    # These are development convenience variables - could be made optional
    # env_vars.append("PYTHONUNBUFFERED=1")
    # env_vars.append("PYTHONDONTWRITEBYTECODE=1")

    return needed


def generate_dockerfile(project_path, output_dir=None):
    """Generate a Dockerfile.sandbox for the specified project.

    Args:
        project_path: Path (or bare name) of the project; its basename becomes
            the ``project_name`` label and the subdirectory cloned from the
            zenml-projects repo.
        output_dir: Directory scanned for requirements/API keys and where
            Dockerfile.sandbox is written.  Defaults to ``project_path``.

    Returns:
        True if the Dockerfile was written, False if the directory is missing.
    """
    if output_dir is None:
        output_dir = project_path

    base_project_name = os.path.basename(project_path)

    project_dir = Path(output_dir)
    if not project_dir.exists():
        print(f"Error: Project directory {project_dir} not found")
        return False

    # Format pip dependencies: one per line, backslash-continued except last.
    dependencies = parse_requirements(project_dir)
    if dependencies:
        dep_lines = [f'    "{dep}" \\' for dep in dependencies[:-1]]
        dep_lines.append(f'    "{dependencies[-1]}"')
        formatted_deps = "\n".join(dep_lines)
    else:
        formatted_deps = ""

    # Detect API keys and splice them into the template's single
    # `RUN echo "{api_vars}" > .env` line.
    api_vars = detect_api_keys(project_dir)
    if len(api_vars) == 1:
        formatted_api_vars = api_vars[0]
    else:
        # BUG FIX: the old join ('" && \\\n    echo "') produced a chain in
        # which only the LAST echo carried the `> .env` redirect, so every
        # earlier key went to stdout and was lost (visible in previously
        # generated Dockerfiles).  A single $(printf 'K1\nK2...') command
        # substitution lets the template's one redirected echo write all keys.
        formatted_api_vars = "$(printf '" + "\\n".join(api_vars) + "')"

    env_vars = detect_env_variables(project_dir, dependencies)
    env_vars_block = ""
    if env_vars:
        env_vars_block = (
            "\n# Set environment variables for compatibility and performance"
        )
        for var in env_vars:
            env_vars_block += f"\nENV {var}"

    # Generate Dockerfile content
    dockerfile_content = DOCKERFILE_TEMPLATE.format(
        project_name=base_project_name,
        dependencies=formatted_deps,
        api_vars=formatted_api_vars,
        env_vars_block=env_vars_block,
    )

    # Write Dockerfile
    dockerfile_path = project_dir / "Dockerfile.sandbox"
    with open(dockerfile_path, "w") as f:
        f.write(dockerfile_content)

    print(
        f"Generated Dockerfile.sandbox for {base_project_name} at {dockerfile_path}"
    )
    return True


def main():
    """CLI entry point: parse arguments and generate a Dockerfile.sandbox.

    Returns:
        Process exit code: 0 on success, 1 on failure.
    """
    parser = argparse.ArgumentParser(
        description="Generate Dockerfile.sandbox for ZenML projects"
    )
    parser.add_argument("project", help="Project name")
    parser.add_argument(
        "--output-dir", help="Output directory (defaults to project name)"
    )
    args = parser.parse_args()

    # Map generation success/failure onto conventional exit codes.
    if generate_dockerfile(args.project, args.output_dir):
        return 0
    return 1


# Script entry point: exit status reflects generation success (0) or failure (1).
if __name__ == "__main__":
    sys.exit(main())
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

def get_hello_world_str():
return dedent(
f"""\
"""\
import logging

def main():
Expand Down
Loading