# Builds and pushes one codespace image per changed project.
#
# Flow:
#   1. detect-changes   - works out which top-level project directories to
#                         handle and splits them into two lists: projects that
#                         already have a Dockerfile.codespace ("build") and
#                         projects that still need one ("generate").
#   2. generate-dockerfile - opens a PR with a generated Dockerfile.codespace.
#   3. build-and-push   - builds and pushes the image for existing Dockerfiles.
#
# The split is computed in a single non-matrix job on purpose: a matrix job
# exposes only ONE value per job output (the last leg to finish wins), so a
# per-project "dockerfile_exists" output cannot be used to gate other jobs.
name: Build and Push Project Codespace Images

on:
  push:
    branches:
      - main
    paths-ignore:
      - "_assets/**"
      - ".github/**"
      - ".gitignore"
      - ".gitmodules"
      - ".typos.toml"
      - "CODE-OF-CONDUCT.md"
      - "CONTRIBUTING.md"
      - "scripts/**"
      - "LICENSE"
      - "pyproject.toml"
      - "README.md"

  workflow_dispatch:
    inputs:
      project:
        description: "Project to build (leave empty to detect from changed files)"
        required: false
        default: ""

# Least privilege by default; the PR-creating job widens this for itself.
permissions:
  contents: read

jobs:
  detect-changes:
    runs-on: ubuntu-latest
    outputs:
      # All selected projects (kept for backward compatibility).
      matrix: ${{ steps.set-matrix.outputs.matrix }}
      # Projects that already contain a Dockerfile.codespace.
      build: ${{ steps.set-matrix.outputs.build }}
      # Projects that still need a generated Dockerfile.codespace.
      generate: ${{ steps.set-matrix.outputs.generate }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v3
        with:
          # Two commits are enough for the HEAD^..HEAD diff below.
          fetch-depth: 2

      - name: Detect changed projects
        id: set-matrix
        env:
          # Pass event data through the environment instead of interpolating
          # it into the script text: a dispatch input is untrusted and would
          # otherwise be executed as shell code.
          EVENT_NAME: ${{ github.event_name }}
          INPUT_PROJECT: ${{ github.event.inputs.project }}
        run: |
          set -euo pipefail

          if [[ "$EVENT_NAME" == "workflow_dispatch" && -n "$INPUT_PROJECT" ]]; then
            # Manual dispatch with an explicit project: validate it first.
            if [[ ! "$INPUT_PROJECT" =~ ^[A-Za-z0-9][A-Za-z0-9._-]*$ || ! -d "$INPUT_PROJECT" ]]; then
              echo "::error::Unknown project '$INPUT_PROJECT'"
              exit 1
            fi
            CANDIDATES="$INPUT_PROJECT"
          else
            # Otherwise auto-diff HEAD^ -> HEAD for changed top-level dirs.
            # 'NF > 1' drops files that live directly in the repo root.
            CANDIDATES=$(git diff --name-only HEAD^ HEAD \
              | awk -F/ 'NF > 1 {print $1}' \
              | sort -u)
          fi

          build=()
          generate=()
          while IFS= read -r dir; do
            # Skip blanks, hidden/underscore dirs, the tooling dir, and
            # directories deleted by the commit.
            if [[ -z "$dir" || "$dir" == .* || "$dir" == _* || "$dir" == "scripts" || ! -d "$dir" ]]; then
              continue
            fi
            if [[ -f "$dir/Dockerfile.codespace" ]]; then
              build+=("$dir")
            else
              generate+=("$dir")
            fi
          done <<< "$CANDIDATES"

          # jq produces correctly escaped JSON arrays ("[]" when empty).
          to_json() { jq -cn '$ARGS.positional' --args "$@"; }

          {
            echo "matrix=$(to_json "${build[@]}" "${generate[@]}")"
            echo "build=$(to_json "${build[@]}")"
            echo "generate=$(to_json "${generate[@]}")"
          } >> "$GITHUB_OUTPUT"

          echo "Projects to build: $(to_json "${build[@]}")"
          echo "Projects needing a Dockerfile: $(to_json "${generate[@]}")"

  generate-dockerfile:
    needs: detect-changes
    # An empty matrix is an error in GitHub Actions, so skip the job instead.
    if: needs.detect-changes.outputs.generate != '[]'
    runs-on: ubuntu-latest
    permissions:
      # create-pull-request pushes a branch and opens a PR.
      contents: write
      pull-requests: write
    strategy:
      fail-fast: false
      matrix:
        project: ${{ fromJson(needs.detect-changes.outputs.generate) }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v3
        with:
          fetch-depth: 0

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          # The generator imports tomllib, which requires Python 3.11+.
          python-version: "3.11"

      - name: Generate Dockerfile.codespace
        id: generate-dockerfile
        env:
          PROJECT: ${{ matrix.project }}
        run: |
          python scripts/generate_codespace_dockerfile.py "$PROJECT"
          echo "Generated Dockerfile.codespace for $PROJECT"

      - name: Create Pull Request for new Dockerfile
        uses: peter-evans/create-pull-request@v5
        with:
          token: ${{ secrets.GITHUB_TOKEN }}
          commit-message: "Auto-generate Dockerfile.codespace for ${{ matrix.project }}"
          title: "Auto-generate Dockerfile.codespace for ${{ matrix.project }}"
          body: |
            This PR adds a generated Dockerfile.codespace for the ${{ matrix.project }} project.

            Please review the changes and merge if they look good.

            Once merged, the Docker image will be built and pushed automatically.
          branch: "auto-dockerfile-${{ matrix.project }}"
          base: main
          labels: |
            automated-pr
            dockerfile
            codespace

  build-and-push:
    needs: detect-changes
    # An empty matrix is an error in GitHub Actions, so skip the job instead.
    if: needs.detect-changes.outputs.build != '[]'
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        project: ${{ fromJson(needs.detect-changes.outputs.build) }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v3
        with:
          fetch-depth: 0

      # Generate timestamp for image tag
      - name: Generate timestamp
        id: timestamp
        run: echo "timestamp=$(date -u +'%Y%m%d%H%M%S')" >> "$GITHUB_OUTPUT"

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v2

      - name: Login to DockerHub
        uses: docker/login-action@v2
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_PASSWORD }}

      - name: Build and push
        uses: docker/build-push-action@v4
        with:
          # The Dockerfiles COPY files such as .env.example relative to the
          # project directory, so the project must be the build context.
          context: ${{ matrix.project }}
          file: ${{ matrix.project }}/Dockerfile.codespace
          push: true
          tags: zenmldocker/projects-${{ matrix.project }}:${{ steps.timestamp.outputs.timestamp }}
          cache-from: type=gha
          cache-to: type=gha,mode=max
E501, W503, F403, F401 - """ - ) - - -def get_project_toml_str(name: str, author: str = "Author "): - return dedent( - f"""\ - [tool.poetry] - name = "{name}" - version = "1.0.0" - description = "{name}" - authors = ["{author}"] - license = "Apache 2.0" - - [tool.poetry.dependencies] - python = ">=3.7.0,<3.9.0" - - [tool.poetry.dev-dependencies] - black = "^21.9b0" - isort = "^5.9.3" - pytest = "^6.2.5" - - [build-system] - requires = ["poetry-core>=1.0.0"] - build-backend = "poetry.core.masonry.api" - - [tool.isort] - profile = "black" - known_third_party = [] - skip_glob = [] - line_length = 79 - - [tool.black] - line-length = 79 - include = '\.pyi?$' - exclude = ''' - /( - \.git - | \.hg - | \.mypy_cache - | \.tox - | \.venv - | _build - | buck-out - | build - )/ - ''' - """ - ) - - -def write_file(path: str, content: str): - with open(path, "w") as f: - f.write(content) - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("tool_name", type=str, help="Name of the tool") - args = parser.parse_args() - - path = os.path.join(os.getcwd(), args.tool_name) - src_path = os.path.join(path, "src") - if os.path.exists(path): - raise AssertionError(f"{path} already exists!") - - toml_str = get_project_toml_str(args.tool_name) - flake8_str = get_flake8_str() - py_str = get_hello_world_str() - readme_str = get_readme_str(args.tool_name) - - # make dirs - os.mkdir(path) - os.mkdir(src_path) - - # copy .gitignore - shutil.copy( - os.path.join(os.getcwd(), ".gitignore"), - os.path.join(path, ".gitignore"), - ) - - # write files - write_file(os.path.join(path, ".flake8"), flake8_str) - write_file(os.path.join(src_path, "main.py"), py_str) - write_file(os.path.join(path, "pyproject.toml"), toml_str) - write_file(os.path.join(path, "README.md"), readme_str) - - -if __name__ == "__main__": - logging.basicConfig(level="INFO") - main() diff --git a/omni-reader/Dockerfile.codespace b/omni-reader/Dockerfile.codespace new file mode 100644 index 
000000000..aa20f0c0a --- /dev/null +++ b/omni-reader/Dockerfile.codespace @@ -0,0 +1,49 @@ +# Sandbox base image +FROM zenmldocker/zenml-sandbox:latest + +# Install uv from official distroless image +COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ + +# Set uv environment variables for optimization +ENV UV_SYSTEM_PYTHON=1 +ENV UV_COMPILE_BYTECODE=1 + +# Project metadata +LABEL project_name="omni-reader" +LABEL project_version="0.1.0" + +# Install dependencies with uv and cache optimization +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system \ + "instructor" \ + "jiwer" \ + "jiter" \ + "importlib-metadata<7.0,>=1.4.0" \ + "litellm" \ + "mistralai==1.0.3" \ + "numpy<2.0,>=1.9.0" \ + "openai==1.69.0" \ + "Pillow==11.1.0" \ + "polars-lts-cpu==1.26.0" \ + "pyarrow>=7.0.0" \ + "python-dotenv" \ + "streamlit==1.44.0" \ + "pydantic>=2.8.2,<2.9.0" \ + "tqdm==4.66.4" \ + "zenml>=0.80.0" + +# Set workspace directory +WORKDIR /workspace + +# Clone only the project directory and reorganize +RUN git clone --depth 1 https://github.com/zenml-io/zenml-projects.git /tmp/zenml-projects && \ + cp -r /tmp/zenml-projects/omni-reader/* /workspace/ && \ + rm -rf /tmp/zenml-projects + +# VSCode settings +RUN mkdir -p /workspace/.vscode && \ + printf '{\n "workbench.colorTheme": "Default Dark Modern"\n}' > /workspace/.vscode/settings.json + +# Copy .env.example +COPY .env.example /workspace/.env +ENV POLARS_SKIP_CPU_CHECK=1 diff --git a/omni-reader/Dockerfile.sandbox b/omni-reader/Dockerfile.sandbox deleted file mode 100644 index 24301cee7..000000000 --- a/omni-reader/Dockerfile.sandbox +++ /dev/null @@ -1,44 +0,0 @@ -# Sandbox base image -FROM safoinext/zenml-sandbox:latest - -# Install project-specific dependencies -# Install polars-lts-cpu instead of polars (version compiled for CPU compatibility) -RUN pip install --no-cache-dir \ - "instructor==1.7.7" \ - "jiwer==3.0.5" \ - "jiter==0.8.2" \ - "importlib-metadata<7.0,>=1.4.0" \ - "litellm==1.64.1" \ - 
"mistralai==1.0.3" \ - "numpy<2.0,>=1.9.0" \ - "openai==1.69.0" \ - "Pillow==11.1.0" \ - "polars-lts-cpu==1.26.0" \ - "pyarrow>=7.0.0" \ - "python-dotenv==1.0.1" \ - "streamlit==1.44.0" \ - "pydantic>=2.8.2,<2.9.0" \ - "tqdm==4.66.4" \ - "zenml>=0.80.0" \ - uv - -# Set workspace directory -WORKDIR /workspace - -# Clone only the omni-reader directory and reorganize -RUN git clone --depth 1 https://github.com/zenml-io/zenml-projects.git /tmp/zenml-projects && \ - cp -r /tmp/zenml-projects/omni-reader/* /workspace/ && \ - rm -rf /tmp/zenml-projects - -# Create a template .env file for API keys -RUN echo "OPENAI_API_KEY=YOUR_OPENAI_API_KEY_HERE" > .env && \ - echo "MISTRAL_API_KEY=YOUR_MISTRAL_API_KEY_HERE" >> .env - -# Create a .vscode directory (mainly to auto-apply the dark theme) -RUN mkdir -p /workspace/.vscode -# Copy settings file -COPY settings.json /workspace/.vscode/settings.json - -# Set environment variable to skip CPU checks for Polars as a fallback -ENV POLARS_SKIP_CPU_CHECK=1 - diff --git a/oncoclear/Dockerfile.codespace b/oncoclear/Dockerfile.codespace new file mode 100644 index 000000000..ddd573fc4 --- /dev/null +++ b/oncoclear/Dockerfile.codespace @@ -0,0 +1,36 @@ +# Sandbox base image +FROM zenmldocker/zenml-sandbox:latest + +# Install uv from official distroless image +COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ + +# Set uv environment variables for optimization +ENV UV_SYSTEM_PYTHON=1 +ENV UV_COMPILE_BYTECODE=1 + +# Project metadata +LABEL project_name="oncoclear" +LABEL project_version="0.1.0" + +# Install dependencies with uv and cache optimization +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system \ + "zenml[server]>=0.50.0" \ + "notebook" \ + "scikit-learn" \ + "pyarrow" \ + "pandas" + +# Set workspace directory +WORKDIR /workspace + +# Clone only the project directory and reorganize +RUN git clone --depth 1 https://github.com/zenml-io/zenml-projects.git /tmp/zenml-projects && \ + cp -r 
/tmp/zenml-projects/oncoclear/* /workspace/ && \ + rm -rf /tmp/zenml-projects + +# VSCode settings +RUN mkdir -p /workspace/.vscode && \ + printf '{\n "workbench.colorTheme": "Default Dark Modern"\n}' > /workspace/.vscode/settings.json + + diff --git a/pyproject.toml b/pyproject.toml index 083e3f396..2f4073e52 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ description = "Explore MLOps production use-cases with ZenML." authors = ["ZenML CodeMonkey "] [tool.poetry.dependencies] -python = ">=3.7.0,<3.9.0" +python = ">=3.11,<3.13" [tool.poetry.dev-dependencies] pytest = "^6.2.5" diff --git a/scripts/generate_codespace_dockerfile.py b/scripts/generate_codespace_dockerfile.py new file mode 100755 index 000000000..7fe7ea3e9 --- /dev/null +++ b/scripts/generate_codespace_dockerfile.py @@ -0,0 +1,239 @@ +#!/usr/bin/env python3 + +"""Generate Dockerfile.codespace for ZenML projects.""" + +import argparse +import re +import sys +from pathlib import Path + +import tomllib + +# Dockerfile template +DOCKER_TEMPLATE = """# Sandbox base image +FROM zenmldocker/zenml-sandbox:latest + +# Install uv from official distroless image +COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ + +# Set uv environment variables for optimization +ENV UV_SYSTEM_PYTHON=1 +ENV UV_COMPILE_BYTECODE=1 + +# Project metadata +LABEL project_name="{name}" +LABEL project_version="0.1.0" + +{deps} + +# Set workspace directory +WORKDIR /workspace + +# Clone only the project directory and reorganize +RUN git clone --depth 1 https://github.com/zenml-io/zenml-projects.git /tmp/zenml-projects && \\ + cp -r /tmp/zenml-projects/{name}/* /workspace/ && \\ + rm -rf /tmp/zenml-projects + +# VSCode settings +RUN mkdir -p /workspace/.vscode && \\ + printf '{{\\n "workbench.colorTheme": "Default Dark Modern"\\n}}' > /workspace/.vscode/settings.json + +{env_block} +""" + +# Patterns to detect environment variables in code +ENV_PATTERN = re.compile( + 
r"os\.(?:getenv|environ(?:\[|\\.get))\(['\"]([A-Za-z0-9_]+)['\"]\)" +) +DOTENV_PATTERN = re.compile( + r"(?:load_dotenv|dotenv).*?['\"]([A-Za-z0-9_]+)['\"]" +) + + +def replace_polars(dep: str) -> str: + """Replaces 'polars' with 'polars-lts-cpu', a CPU-optimized LTS version for container environments.""" + return ( + dep.replace("polars", "polars-lts-cpu") + if dep.startswith("polars") + else dep + ) + + +def parse_requirements(project_dir: Path) -> list[str]: + """Parse requirements.txt and apply LTS replacement for Polars. + + Replaces 'polars' with 'polars-lts-cpu', a CPU-optimized LTS version for container environments. + """ + req_file = project_dir / "requirements.txt" + if not req_file.exists(): + return [] + deps = [] + for line in req_file.read_text().splitlines(): + line = line.strip() + if line and not line.startswith("#"): + deps.append(replace_polars(line)) + return deps + + +def parse_pyproject(project_dir: Path) -> list[str]: + """Parse pyproject.toml supporting PEP 621, Poetry, and PDM; replace Polars with its LTS CPU version. + + Supports dependencies under [project.dependencies], [tool.poetry.dependencies], and [tool.pdm.dependencies]. 
+ """ + file = project_dir / "pyproject.toml" + if not file.exists(): + return [] + try: + data = tomllib.loads(file.read_bytes()) + # PEP 621 + if deps := data.get("project", {}).get("dependencies"): # type: ignore + raw = deps + # Poetry + elif ( + poetry := data.get("tool", {}) + .get("poetry", {}) + .get("dependencies") + ): # type: ignore + raw = [ + f"{n}=={v}" if isinstance(v, str) else n + for n, v in poetry.items() + if n != "python" + ] + # PDM + elif pdm := data.get("tool", {}).get("pdm", {}).get("dependencies"): # type: ignore + raw = pdm + else: + return [] + return [replace_polars(d) for d in raw] + except Exception as e: + print(f"Warning: pyproject.toml parse error: {e}") + return [] + + +def get_dependencies(project_dir: Path) -> tuple[str, list[str]]: + """Aggregate dependencies from requirements or pyproject and format the install block. + + Includes a warning if no dependencies are found. + """ + deps = parse_requirements(project_dir) or parse_pyproject(project_dir) + if not deps: + print(f"Warning: no dependencies found in {project_dir}") + return "# No dependencies found", [] + # build install commands + lines = [] + lines.append("# Install dependencies with uv and cache optimization") + lines.append("RUN --mount=type=cache,target=/root/.cache/uv \\") + lines.append(" uv pip install --system \\") + + lines += [f' "{d}" \\' for d in deps[:-1]] + [f' "{deps[-1]}"'] + return "\n".join(lines), deps + + +def find_env_keys(project_dir: Path) -> set[str]: + """Detect environment variable keys from .env and Python source files. + + Scans .env for explicit keys and searches code for os.getenv, os.environ, and dotenv references. + Defaults to {'API_KEY'} if none found. 
+ """ + keys = set() + env_file = project_dir / ".env" + if env_file.exists(): + for line in env_file.read_text(encoding="utf-8").splitlines(): + if line and not line.startswith("#") and "=" in line: + keys.add(line.split("=", 1)[0].strip()) + for py in project_dir.rglob("*.py"): + txt = py.read_text(errors="ignore") + keys |= set(ENV_PATTERN.findall(txt)) + keys |= set(DOTENV_PATTERN.findall(txt)) + return keys or {"API_KEY"} + + +def gen_env_block( + project_dir: Path, keys: set[str], installed_deps: list[str] +) -> str: + """Generate Dockerfile commands to set up .env with detected keys and runtime tweaks. + + Looks for any .env* files (like .env.example) and uses that for reference. + Does not create a .env file if one doesn't exist. + Adds Polars ENV only if polars-lts-cpu was installed. + """ + lines = [] + + # Look for any .env* files (.env, .env.example, etc.) + env_files = list(project_dir.glob(".env*")) + + if env_files: + # Use the first .env* file found + env_file = env_files[0] + env_file_name = env_file.name + + # Parse the existing keys from the file + existing = set() + try: + for line in env_file.read_text(encoding="utf-8").splitlines(): + if line and not line.startswith("#") and "=" in line: + existing.add(line.split("=", 1)[0].strip()) + except Exception: + existing = set() + + # Copy the existing .env* file + lines.append(f"# Copy {env_file_name}") + lines.append(f"COPY {env_file_name} /workspace/.env") + + # Add missing keys only if we're copying a template + missing = keys - existing + for k in sorted(missing): + lines.append(f'RUN echo "{k}=YOUR_{k}" >> /workspace/.env') + + # Add Polars ENV only if we actually installed polars-lts-cpu + if any("polars-lts-cpu" in dep for dep in installed_deps): + lines.append("ENV POLARS_SKIP_CPU_CHECK=1") + + return "\n".join(lines) if lines else "" + + +def generate_dockerfile( + project_path: str, + output_dir: str | None = None, +) -> bool: + """Create Dockerfile.codespace using the template, 
dependencies, and environment setup. + + Returns True on success, False otherwise. + """ + out = Path(output_dir or project_path) + if not out.exists(): + print(f"Error: {out} not found") + return False + name = Path(project_path).name + deps_block, installed_deps = get_dependencies(out) + keys = find_env_keys(out) + env_block = gen_env_block(out, keys, installed_deps) + content = DOCKER_TEMPLATE.format( + name=name, deps=deps_block, env_block=env_block + ) + (out / "Dockerfile.codespace").write_text(content) + print(f"Generated Dockerfile.codespace at {out / 'Dockerfile.codespace'}") + return True + + +def main() -> None: + """CLI entry point.""" + parser = argparse.ArgumentParser( + "Generate Dockerfile.codespace for ZenML projects" + ) + parser.add_argument("project", help="Path to the project directory") + parser.add_argument( + "--output-dir", help="Output directory (defaults to project path)" + ) + parser.add_argument( + "--use-uv", + action="store_true", + default=True, + help="Use uv for dependency installation (default: True)", + ) + args = parser.parse_args() + sys.exit(0 if generate_dockerfile(args.project, args.output_dir) else 1) + + +if __name__ == "__main__": + main()