diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..d74f1a6 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,101 @@ +# Git +.git +.gitignore +.gitattributes + +# Python +__pycache__ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST +*.pyc +.pytest_cache/ +.coverage +htmlcov/ +.tox/ +.hypothesis/ +*.mo +*.pot + +# Virtual environments +venv/ +ENV/ +env/ +.venv + +# IDEs +.vscode/ +.idea/ +*.swp +*.swo +*~ +.DS_Store + +# Node.js (for frontend) +node_modules/ +npm-debug.log* +yarn-debug.log* +yarn-error.log* +.pnpm-debug.log* +.next/ +out/ +.nuxt +.cache + +# Documentation (not needed in containers) +*.md +!README.md +docs/ + +# Development +.env +.env.local +.env.*.local +scratch/ +logo/ + +# Output directories +output/ + +# Jupyter +*.ipynb +.ipynb_checkpoints + +# Parquet data (handled separately via volume mounts) +data/ +surf-workload/ + +# Docker +docker-compose*.yml +Dockerfile* +.dockerignore + +# CI/CD +.github/ +.gitlab-ci.yml + +# Backup files +*.backup +*.bak +*.tmp +*.old + +# OS +Thumbs.db diff --git a/.github/workflows/pages.yml b/.github/workflows/pages.yml index e747300..99fcdd5 100644 --- a/.github/workflows/pages.yml +++ b/.github/workflows/pages.yml @@ -1,8 +1,8 @@ -name: Deploy opendc.org +name: Deploy atlarge-research.github.io/opendt on: push: - branches: ["add-static-documentation-site"] + branches: ["master", "refactor-services"] workflow_dispatch: concurrency: diff --git a/.gitignore b/.gitignore index bcf84aa..6cb7f03 100644 --- a/.gitignore +++ b/.gitignore @@ -1,11 +1,23 @@ scratch/ +### OpenDT runtime experiment files ### +experiment.json +.experiment.json.crc +kafka/opendt_sim_inputs/ +opendt_sim_inputs/ +output/ +!output/README.md +!output/.gitkeep + +### VS Code ### +.vscode/ + target/ !.mvn/wrapper/maven-wrapper.jar !**/src/main/**/target/ !**/src/test/**/target/ -### IntelliJ 
IDEA ### +### IDE ### .idea .idea/modules.xml .idea/jarRepositories.xml @@ -17,8 +29,6 @@ target/ out/ !**/src/main/**/out/ !**/src/test/**/out/ - -### Eclipse ### .apt_generated .classpath .factorypath @@ -26,8 +36,7 @@ out/ .settings .springBeans .sts4-cache - -### NetBeans ### +.cursorrules /nbproject/private/ /nbbuild/ /dist/ @@ -37,37 +46,41 @@ build/ !**/src/main/**/build/ !**/src/test/**/build/ -### VS Code ### -.vscode/ - ### Mac OS ### .DS_Store ### Scala ### .bsp/ -### OpenDT traces ### -surf-22/ - -### OpenDT runtime experiment files ### -experiment.json -.experiment.json.crc -kafka/opendt_sim_inputs/ -opendt_sim_inputs/ -output/ - ### Kafka data ### kafka/kafka-data/ reader/ -.venv +# Python virtual environments +.venv/ +venv/ +env/ .env +# uv +.uv/ +uv.lock -####backup files#### -config/*.backup - -####Pycharm#### +# Python artifacts __pycache__/ *.py[cod] *$py.class +*.so +.Python +*.egg-info/ +dist/ +build/ +*.egg + +# Testing +.pytest_cache/ +.coverage +htmlcov/ + +# Backup files +config/*.backup diff --git a/.python-version b/.python-version new file mode 100644 index 0000000..2c07333 --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.11 diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index 732db34..0000000 --- a/Dockerfile +++ /dev/null @@ -1,44 +0,0 @@ -FROM python:3.10-slim - -# Install Java JDK 21 and essentials -RUN apt-get update && apt-get install -y --no-install-recommends \ - openjdk-21-jdk-headless \ - ca-certificates wget unzip \ - && rm -rf /var/lib/apt/lists/* - -# Derive JAVA_HOME from the installed 'java', and provide stable symlinks -RUN set -eux; \ - JH="$(dirname "$(dirname "$(readlink -f "$(command -v java)")")")"; \ - mkdir -p /usr/lib/jvm; \ - ln -sfn "$JH" /usr/lib/jvm/default-java; \ - # Ensure compatibility with scripts expecting this exact path - if [ ! 
-e /usr/lib/jvm/java-21-openjdk-amd64 ]; then \ - ln -s "$JH" /usr/lib/jvm/java-21-openjdk-amd64; \ - fi - -ENV JAVA_HOME=/usr/lib/jvm/default-java -ENV PATH="$JAVA_HOME/bin:$PATH" - -WORKDIR /app - -# Ensure the package layout under ./src is importable at runtime -ENV PYTHONPATH=/app/src - -# Python dependencies -COPY requirements.txt . -RUN pip install --no-cache-dir -r requirements.txt - -# App code -COPY . . - -# Best-effort: if baked in, set runner executable -RUN if [ -f /app/opendt-simulator/bin/OpenDCExperimentRunner/bin/OpenDCExperimentRunner ]; then \ - chmod +x /app/opendt-simulator/bin/OpenDCExperimentRunner/bin/OpenDCExperimentRunner; \ - fi - -# Entrypoint ensures runtime perms and JAVA_HOME PATH before launching -COPY entrypoint.sh /entrypoint.sh -RUN chmod +x /entrypoint.sh - -ENTRYPOINT ["/entrypoint.sh"] -CMD ["python", "-m", "opendt.cli"] \ No newline at end of file diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..8ec9e30 --- /dev/null +++ b/Makefile @@ -0,0 +1,254 @@ +.PHONY: up down clean-volumes restart build logs help run test setup install-dev clean-env experiment experiment-down + +# Default target +.DEFAULT_GOAL := help + +# ============================================================================= +# Configuration Variables +# ============================================================================= + +# Configuration file path (can be overridden: make up config=config/custom.yaml) +config ?= ./config/default.yaml + +# Build flag - set to 'true' to rebuild images without cache +# Usage: make up build=true +build ?= false + +# Virtual environment detection +VENV := .venv +PYTHON := $(VENV)/bin/python +PYTEST := $(VENV)/bin/pytest +UV := $(shell command -v uv 2> /dev/null) + +## up: Stop containers, delete volumes (clean slate), and start fresh (use build=true to rebuild images) +up: clean-volumes + @echo "πŸš€ Starting OpenDT services with clean slate..." + @echo "πŸ“‹ Using config: $(config)" + @if [ ! 
-f "$(config)" ]; then \ + echo "❌ Error: Config file not found: $(config)"; \ + exit 1; \ + fi + @if [ "$(build)" = "true" ]; then \ + echo "πŸ”¨ Rebuilding Docker images (no cache)..."; \ + CONFIG_PATH=$(config) docker compose build --no-cache; \ + echo "βœ… Images rebuilt!"; \ + fi + CONFIG_PATH=$(config) docker compose up -d + @echo "βœ… Services started!" + @echo "" + @echo "Available services:" + @echo " - Dashboard: http://localhost:8000" + @echo " - API Docs: http://localhost:8000/docs" + @echo " - Postgres: localhost:5432" + @echo " - Kafka: localhost:9092" + @echo "" + @echo "View logs: make logs" + +## up-debug: Start services in DEBUG mode (sim-worker writes results to ./output/ instead of Kafka) +up-debug: clean-volumes + @echo "πŸ› Starting OpenDT services in DEBUG MODE..." + @echo "πŸ“‹ Using config: $(config)" + @mkdir -p output + @if [ ! -f "$(config)" ]; then \ + echo "❌ Error: Config file not found: $(config)"; \ + exit 1; \ + fi + @if [ "$(build)" = "true" ]; then \ + echo "πŸ”¨ Rebuilding Docker images (no cache)..."; \ + CONFIG_PATH=$(config) DEBUG_MODE=true docker compose build --no-cache; \ + echo "βœ… Images rebuilt!"; \ + fi + CONFIG_PATH=$(config) DEBUG_MODE=true docker compose up -d + @echo "βœ… Services started in DEBUG mode!" + @echo "" + @echo "πŸ› DEBUG MODE: sim-worker will write results to ./output/" + @echo " Kafka publishing is DISABLED for sim-worker" + @echo "" + @echo "Available services:" + @echo " - Dashboard: http://localhost:8000" + @echo " - Postgres: localhost:5432" + @echo " - Kafka: localhost:9092" + @echo "" + @echo "View logs: make logs-sim-worker" + @echo "View results: ls -la output/" + +## run: Alias for 'up' (accepts config parameter) +run: up + +## experiment: Run an experiment (make experiment name=) +experiment: clean-volumes + @if [ -z "$(name)" ]; then \ + echo "❌ Error: Please provide experiment name: make experiment name=my_experiment"; \ + exit 1; \ + fi + @if [ ! 
-f "config/experiments/$(name).yaml" ]; then \ + echo "❌ Error: Experiment config not found: config/experiments/$(name).yaml"; \ + echo ""; \ + echo "Available experiments:"; \ + ls -1 config/experiments/*.yaml 2>/dev/null | xargs -n 1 basename | sed 's/.yaml//' | sed 's/^/ - /' || echo " (none)"; \ + exit 1; \ + fi + @echo "πŸ§ͺ Starting experiment: $(name)" + @echo "πŸ“‹ Using config: config/experiments/$(name).yaml" + @mkdir -p output/$(name) + EXPERIMENT_NAME=$(name) CONFIG_PATH=./config/experiments/$(name).yaml docker compose up -d + @echo "βœ… Experiment started!" + @echo "" + @echo "Experiment: $(name)" + @echo "Output: output/$(name)/" + @echo "" + @echo "View logs: make logs-sim-worker" + +## experiment-debug: Run an experiment with debug mode enabled (make experiment-debug name=) +experiment-debug: clean-volumes + @if [ -z "$(name)" ]; then \ + echo "❌ Error: Please provide experiment name: make experiment-debug name=my_experiment"; \ + exit 1; \ + fi + @if [ ! -f "config/experiments/$(name).yaml" ]; then \ + echo "❌ Error: Experiment config not found: config/experiments/$(name).yaml"; \ + echo ""; \ + echo "Available experiments:"; \ + ls -1 config/experiments/*.yaml 2>/dev/null | xargs -n 1 basename | sed 's/.yaml//' | sed 's/^/ - /' || echo " (none)"; \ + exit 1; \ + fi + @echo "πŸ§ͺπŸ› Starting experiment with DEBUG mode: $(name)" + @echo "πŸ“‹ Using config: config/experiments/$(name).yaml" + @mkdir -p output/$(name) + EXPERIMENT_NAME=$(name) DEBUG_MODE=true CONFIG_PATH=./config/experiments/$(name).yaml docker compose up -d + @echo "βœ… Experiment started in debug mode!" + @echo "" + @echo "Experiment: $(name)" + @echo "Output: output/$(name)/" + @echo "Debug files: output/$(name)/run_*/" + @echo "" + @echo "View logs: make logs-sim-worker" + +## experiment-down: Stop experiment services +experiment-down: + docker compose down + @echo "βœ… Experiment stopped" + +## down: Stop all containers +down: + @echo "⏹️ Stopping OpenDT services..." 
+ CONFIG_PATH=$(config) docker compose down + @echo "βœ… Services stopped!" + +## clean-volumes: Stop containers and delete persistent volumes (Kafka & Postgres) +clean-volumes: + @echo "🧹 Stopping containers and cleaning persistent volumes..." + CONFIG_PATH=$(config) docker compose down -v + @echo "πŸ—‘οΈ Removing named volumes..." + -docker volume rm opendt-postgres-data 2>/dev/null || true + -docker volume rm opendt-kafka-data 2>/dev/null || true + @echo "βœ… Clean slate ready!" + +## restart: Restart all services (without cleaning volumes) +restart: + @echo "♻️ Restarting OpenDT services..." + CONFIG_PATH=$(config) docker compose restart + @echo "βœ… Services restarted!" + +## build: Rebuild all Docker images +build: + @echo "πŸ”¨ Building Docker images..." + CONFIG_PATH=$(config) docker compose build --no-cache + @echo "βœ… Images built!" + +## rebuild: Clean, rebuild (no cache), and start (alias for make up build=true) +rebuild: + @$(MAKE) up build=true + +## setup: Create virtual environment and install all dependencies +setup: + @echo "πŸ”§ Setting up development environment..." + @if [ -z "$(UV)" ]; then \ + echo "❌ uv not found. Install it with: curl -LsSf https://astral.sh/uv/install.sh | sh"; \ + exit 1; \ + fi + @echo "Creating virtual environment with uv..." + uv venv + @echo "Installing dependencies..." + uv pip install -e libs/common + uv pip install -e "libs/common[test]" + uv pip install -e ".[dev]" + @echo "βœ… Development environment ready!" + @echo "" + @echo "Activate with: source .venv/bin/activate" + +## install-dev: Install dependencies in existing venv (for CI or manual setup) +install-dev: + @echo "πŸ“¦ Installing development dependencies..." + @if [ ! -d "$(VENV)" ]; then \ + echo "❌ Virtual environment not found. Run 'make setup' first."; \ + exit 1; \ + fi + $(PYTHON) -m pip install -e libs/common + $(PYTHON) -m pip install -e "libs/common[test]" + $(PYTHON) -m pip install -e ".[dev]" + @echo "βœ… Dependencies installed!" 
+ +## test: Run tests for shared library +test: + @echo "πŸ§ͺ Running tests..." + @if [ ! -d "$(VENV)" ]; then \ + echo "Virtual environment not found. Running 'make setup'..."; \ + $(MAKE) setup; \ + fi + @if [ ! -f "$(PYTEST)" ]; then \ + echo "pytest not found. Running 'make install-dev'..."; \ + $(MAKE) install-dev; \ + fi + $(PYTEST) libs/common/tests/ -v --tb=short + @echo "βœ… Tests passed!" + +## clean-env: Remove virtual environment +clean-env: + @echo "🧹 Removing virtual environment..." + rm -rf $(VENV) + rm -rf .uv + @echo "βœ… Environment cleaned!" + +## logs: Tail logs for all services +logs: + CONFIG_PATH=$(config) docker compose logs -f + +## logs-dashboard: Tail logs for dashboard service only +logs-dashboard: + CONFIG_PATH=$(config) docker compose logs -f dashboard + +## logs-kafka: Tail logs for Kafka service only +logs-kafka: + CONFIG_PATH=$(config) docker compose logs -f kafka + +## logs-dc-mock: Tail logs for dc-mock service only +logs-dc-mock: + CONFIG_PATH=$(config) docker compose logs -f dc-mock + +## logs-sim-worker: Tail logs for sim-worker service only +logs-sim-worker: + CONFIG_PATH=$(config) docker compose logs -f sim-worker + +## ps: Show running containers +ps: + CONFIG_PATH=$(config) docker compose ps + +## shell-dashboard: Open a shell in the dashboard container +shell-dashboard: + CONFIG_PATH=$(config) docker compose exec dashboard /bin/bash + +## shell-postgres: Open psql in the Postgres container +shell-postgres: + CONFIG_PATH=$(config) docker compose exec postgres psql -U opendt -d opendt + +## kafka-topics: List Kafka topics +kafka-topics: + CONFIG_PATH=$(config) docker compose exec kafka kafka-topics --bootstrap-server localhost:9092 --list + +## help: Show this help message +help: + @echo "OpenDT Makefile Commands" + @echo "========================" + @echo "" + @sed -n 's/^##//p' ${MAKEFILE_LIST} | column -t -s ':' | sed -e 's/^/ /' diff --git a/README.md b/README.md index 1798d3e..eb06839 100644 --- a/README.md +++ 
b/README.md @@ -1,16 +1,195 @@ -# OpenDT - Digital Twin for Datacenters +# OpenDT - Open Digital Twin for Datacenters -Real-time datacenter simulation with Kafka streaming and LLM-powered optimization. +**OpenDT** is a distributed system for real-time datacenter simulation and What-If analysis. It operates in "Shadow Mode" by replaying historical workload data through the OpenDC simulator to compare predicted vs. actual power consumption. -## Quick Start - Docker +[![Python 3.11+](https://img.shields.io/badge/python-3.11+-blue.svg)](https://www.python.org/downloads/) +[![Docker](https://img.shields.io/badge/docker-required-blue.svg)](https://www.docker.com/) -Docker does all the magic: +## Quick Start + +### Prerequisites + +- **Docker & Docker Compose** - Container orchestration +- **Make** - Convenience commands +- **Python 3.11+** - For local development +- **uv** - Python package manager ([install](https://astral.sh/uv/install)) + +### Setup ```bash +# 1. Clone repository +git clone https://github.com/your-org/opendt.git cd opendt -export OPENAI_API_KEY="your-key-here" -docker-compose down -docker-compose up --build + +# 2. Setup environment +make setup + +# 3. Start services +make up + +# 4. Access services +open http://localhost:8000 # Dashboard +``` + +That's it! The system is now running with the SURF workload dataset. 
+ +### Running an Experiment + +```bash +# Run experiment with custom config +make experiment name=baseline + +# View results +ls output/baseline/run_1/ +# - results.parquet (power predictions) +# - power_plot.png (actual vs simulated) +# - opendc/ (simulation archives) +``` + +## Documentation + +### Getting Started + +- **[Architecture Overview](docs/ARCHITECTURE.md)** - System design, services, and data flow +- **[Data Models](docs/DATA_MODELS.md)** - Pydantic models and data schemas + +### Service Documentation + +- **[dc-mock](services/dc-mock/README.md)** - Data producer (workload replay) +- **[sim-worker](services/sim-worker/README.md)** - Simulation engine (OpenDC integration) +- **[dashboard](services/dashboard/README.md)** - Web dashboard and REST API +- **[kafka-init](services/kafka-init/README.md)** - Kafka infrastructure setup + +## Architecture + ``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ dc-mock │────>β”‚ Kafka Bus β”‚ +β”‚ (Producer) β”‚ β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ β”‚ β”‚ Topics: β”‚ β”‚ +β”‚ Reads: β”‚ β”‚ β”‚ β€’ dc.workload (tasks) β”‚ β”‚ +β”‚ - tasks β”‚ β”‚ β”‚ β€’ dc.power (telemetry) β”‚ β”‚ +β”‚ - fragments β”‚ β”‚ β”‚ β€’ dc.topology (real) β”‚ β”‚ +β”‚ - power β”‚ β”‚ β”‚ β€’ sim.topology (simulated) β”‚ β”‚ +β”‚ - topology β”‚ β”‚ β”‚ β€’ sim.results (predictions) β”‚ β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ β”‚ + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β–Όβ”€β”€β”€β”€β”€β”€β” 
β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β–Όβ”€β”€β”€β”€β”€β”€β”€β” + β”‚ sim-worker β”‚ β”‚ dashboard β”‚ + β”‚ (Consumer) β”‚ β”‚ (FastAPI) β”‚ + β”‚ β”‚ β”‚ β”‚ + β”‚ β€’ Windows β”‚ β”‚ β€’ Web UI β”‚ + β”‚ β€’ OpenDC β”‚ β”‚ β€’ REST API β”‚ + β”‚ β€’ Caching │◀────│ β€’ Topology Mgmt β”‚ + β”‚ β€’ Experiments β”‚ β”‚ β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ β”‚ + β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β–Όβ”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ β”‚ PostgreSQL β”‚ + β”‚ β”‚ (TimescaleDB) β”‚ + β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β”Œβ”€β”€β”€β”€β”€β”€β”€β–Όβ”€β”€β”€β”€β”€β”€β”€β” + β”‚ Experiment β”‚ + β”‚ Output: β”‚ + β”‚ β€’ Parquet β”‚ + β”‚ β€’ Plots β”‚ + β”‚ β€’ Archives β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +See [Architecture Overview](docs/ARCHITECTURE.md) for detailed explanation. + +## Available Commands + +### Core Commands + +| Command | Description | +|---------|-------------| +| `make up` | Start with clean slate (deletes volumes) | +| `make up-debug` | Start in debug mode (local file output) | +| `make down` | Stop all services | +| `make logs` | View all logs | +| `make ps` | Show running containers | + +### Experiment Commands + +| Command | Description | +|---------|-------------| +| `make experiment name=X` | Run experiment X | +| `make experiment-debug name=X` | Run experiment X with debug output | +| `make experiment-down` | Stop experiment | + +### Development Commands + +| Command | Description | +|---------|-------------| +| `make setup` | Setup virtual environment | +| `make test` | Run all tests | +| `make shell-dashboard` | Open shell in dashboard container | +| `make kafka-topics` | List Kafka topics | + +### Monitoring Commands + +| Command | Description | +|---------|-------------| +| `make logs-dc-mock` | View dc-mock logs | +| `make logs-sim-worker` | View sim-worker logs | +| `make logs-dashboard` | View 
dashboard logs | + +Run `make help` to see all available commands. + +## Configuration + +### Basic Configuration + +**File**: `config/default.yaml` + +```yaml +workload: "SURF" # Data directory name + +simulation: + speed_factor: 300 # 300x real-time + window_size_minutes: 5 # 5-minute windows + heartbeat_cadence_minutes: 1 + experiment_mode: false + +kafka: + bootstrap_servers: "kafka:29092" +``` + +### Experiment Configuration + +**File**: `config/experiments/my_experiment.yaml` + +```yaml +workload: "SURF" + +simulation: + speed_factor: 300 + window_size_minutes: 15 # Longer windows for experiments + experiment_mode: true # Enable experiment mode +``` + +See [Configuration Guide](docs/CONFIGURATION.md) for advanced options. + +## System Components + +### Services + +- **dc-mock**: Replays historical workload/power data to Kafka +- **sim-worker**: Consumes streams, invokes OpenDC simulator +- **dashboard**: Web dashboard with REST API for system control and visualization + +### Infrastructure + +- **Kafka**: Message broker (KRaft mode, no Zookeeper) +- **PostgreSQL**: Database for persistent storage +- **OpenDC**: Java-based datacenter simulator (bundled) + +### Shared Libraries -Then, open http://localhost:8080 for the orchestrator UI. 
+- **opendt-common**: Pydantic models, configuration, Kafka utilities diff --git a/config/default.yaml b/config/default.yaml new file mode 100644 index 0000000..469e3a4 --- /dev/null +++ b/config/default.yaml @@ -0,0 +1,47 @@ +workload: "SURF" + +simulation: + # 1.0 = Realtime, -1 = Max Speed + speed_factor: 300 + window_size_minutes: 5 + heartbeat_cadence_minutes: 1 + experiment_mode: false + +features: + calibration_enabled: false + +kafka: + bootstrap_servers: "kafka:29092" + topics: + workload: + name: "dc.workload" + config: + # 24 hours + retention.ms: "86400000" + power: + name: "dc.power" + config: + # 1 hour + retention.ms: "3600000" + topology: + name: "dc.topology" + config: + # Compacted topic - keeps latest topology + cleanup.policy: "compact" + # Keep at least 1 hour + min.compaction.lag.ms: "3600000" + system: + name: "sys.config" + config: + cleanup.policy: "compact" + sim_topology: + name: "sim.topology" + config: + # Compacted topic - keeps latest simulated topology + cleanup.policy: "compact" + min.compaction.lag.ms: "0" + results: + name: "sim.results" + config: + # 7 days retention for simulation results + retention.ms: "604800000" diff --git a/config/experiment_template.json b/config/experiment_template.json deleted file mode 100644 index 79b0603..0000000 --- a/config/experiment_template.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "name": "simple", - "exportModels": [ - { - "exportInterval": 300, - "printFrequency": 168, - "filesToExport": [ - "powerSource" - ], - "computeExportConfig": { - "powerSourceExportColumns": ["energy_usage", "power_draw"] - } - } - ] -} diff --git a/config/experiments/experiment_1.yaml b/config/experiments/experiment_1.yaml new file mode 100644 index 0000000..1c93ca9 --- /dev/null +++ b/config/experiments/experiment_1.yaml @@ -0,0 +1,48 @@ +workload: "SURF" + +simulation: + # 1.0 = Realtime, -1 = Max Speed + speed_factor: 300 + window_size_minutes: 15 + heartbeat_cadence_minutes: 1 + # Experiment mode: write results to 
parquet + experiment_mode: true + +features: + calibration_enabled: false + +kafka: + bootstrap_servers: "kafka:29092" + topics: + workload: + name: "dc.workload" + config: + # 24 hours + retention.ms: "86400000" + power: + name: "dc.power" + config: + # 1 hour + retention.ms: "3600000" + topology: + name: "dc.topology" + config: + # Compacted topic - keeps latest topology + cleanup.policy: "compact" + # Keep at least 1 hour + min.compaction.lag.ms: "3600000" + system: + name: "sys.config" + config: + cleanup.policy: "compact" + sim_topology: + name: "sim.topology" + config: + # Compacted topic - keeps latest simulated topology + cleanup.policy: "compact" + min.compaction.lag.ms: "0" + results: + name: "sim.results" + config: + # 7 days retention for simulation results + retention.ms: "604800000" diff --git a/config/slo.json b/config/slo.json deleted file mode 100644 index 981cde8..0000000 --- a/config/slo.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "energy_target": 2.0, - "runtime_target": 2.0 -} diff --git a/config/topology.json b/config/topology.json deleted file mode 100644 index 6211b76..0000000 --- a/config/topology.json +++ /dev/null @@ -1,20 +0,0 @@ -{ - "clusters": [ - { - "hosts": [ - { - "count": 1, - "cpu": { - "coreCount": 64, - "coreSpeed": 1600 - }, - "memory": { - "memorySize": 214748364800 - }, - "name": "H01" - } - ], - "name": "C01" - } - ] -} \ No newline at end of file diff --git a/data/SURF/consumption.parquet b/data/SURF/consumption.parquet new file mode 100644 index 0000000..8467db7 Binary files /dev/null and b/data/SURF/consumption.parquet differ diff --git a/surf-workload/fragments.parquet b/data/SURF/fragments.parquet similarity index 100% rename from surf-workload/fragments.parquet rename to data/SURF/fragments.parquet diff --git a/surf-workload/tasks.parquet b/data/SURF/tasks.parquet similarity index 100% rename from surf-workload/tasks.parquet rename to data/SURF/tasks.parquet diff --git a/data/SURF/topology.json b/data/SURF/topology.json 
new file mode 100644 index 0000000..1c9720c --- /dev/null +++ b/data/SURF/topology.json @@ -0,0 +1,28 @@ +{ + "clusters": [ + { + "name": "A01", + "hosts": [ + { + "name": "A01", + "count": 277, + "cpu": { + "coreCount": 16, + "coreSpeed": 2100 + }, + "memory": { + "memorySize": 128000000 + }, + "cpuPowerModel": { + "modelType": "asymptotic", + "power": 400, + "idlePower": 32, + "maxPower": 180, + "asymUtil": 0.3, + "dvfs": false + } + } + ] + } + ] +} diff --git a/data/SURF/workload.yaml b/data/SURF/workload.yaml new file mode 100644 index 0000000..a1346be --- /dev/null +++ b/data/SURF/workload.yaml @@ -0,0 +1,10 @@ +name: "SURF" +description: "SURF HPC cluster workload trace" + +timestamps: + # Consumption data uses relative timestamps (ms from start) + # This offset is added to the earliest task submission_time to get absolute timestamps + consumption_offset_ms: 0 # No offset by default (align with earliest task) + + # Alternative: specify an absolute base timestamp + # base_timestamp: "2022-10-06T22:00:00" diff --git a/docker-compose.override.yaml b/docker-compose.override.yaml deleted file mode 100644 index 5716c37..0000000 --- a/docker-compose.override.yaml +++ /dev/null @@ -1,9 +0,0 @@ -services: - opendt: - entrypoint: ["python", "-u", "-m", "opendt.cli"] - environment: - - OPENDT_DATA_DIR=/app/data - - OPENDT_SIM_DIR=/app/output/opendt-simulation/raw-output - volumes: - - ./:/app - - ./output/opendt-simulation/raw-output:/app/output/opendt-simulation/raw-output diff --git a/docker-compose.yml b/docker-compose.yml index 42043f7..c56a73f 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,35 +1,225 @@ services: + # ============================================================================ + # INFRASTRUCTURE SERVICES + # ============================================================================ + + postgres: + image: postgres:15-alpine + container_name: opendt-postgres + environment: + POSTGRES_DB: opendt + POSTGRES_USER: opendt + 
POSTGRES_PASSWORD: opendt_dev_password + PGDATA: /var/lib/postgresql/data/pgdata + ports: + - "5432:5432" + volumes: + - postgres_data:/var/lib/postgresql/data + healthcheck: + test: ["CMD-SHELL", "pg_isready -U opendt"] + interval: 5s + timeout: 5s + retries: 5 + networks: + - opendt-network + kafka: - image: confluentinc/cp-kafka:latest + image: confluentinc/cp-kafka:7.5.0 + container_name: opendt-kafka + ports: + - "9092:9092" + - "9093:9093" environment: - CLUSTER_ID: "MkU3OEVBNTcwNTJENDM2Qk" + # KRaft Configuration (No Zookeeper) KAFKA_NODE_ID: 1 - KAFKA_PROCESS_ROLES: "broker,controller" - KAFKA_LISTENERS: "PLAINTEXT://0.0.0.0:29092,CONTROLLER://0.0.0.0:29093" - KAFKA_ADVERTISED_LISTENERS: "PLAINTEXT://kafka:29092" - KAFKA_CONTROLLER_QUORUM_VOTERS: "1@kafka:29093" - KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: "CONTROLLER:PLAINTEXT,PLAINTEXT:PLAINTEXT" - KAFKA_CONTROLLER_LISTENER_NAMES: "CONTROLLER" - KAFKA_INTER_BROKER_LISTENER_NAME: "PLAINTEXT" - ports: - - "9092:29092" + KAFKA_PROCESS_ROLES: 'broker,controller' + KAFKA_CONTROLLER_QUORUM_VOTERS: '1@kafka:9093' + KAFKA_CONTROLLER_LISTENER_NAMES: 'CONTROLLER' + + # Listener Configuration + KAFKA_LISTENERS: 'PLAINTEXT://kafka:29092,PLAINTEXT_HOST://0.0.0.0:9092,CONTROLLER://kafka:9093' + KAFKA_ADVERTISED_LISTENERS: 'PLAINTEXT://kafka:29092,PLAINTEXT_HOST://localhost:9092' + KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: 'CONTROLLER:PLAINTEXT,PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT' + KAFKA_INTER_BROKER_LISTENER_NAME: 'PLAINTEXT' + + # Cluster Configuration + CLUSTER_ID: 'MkU3OEVBNTcwNTJENDM2Qk' + + # Log Configuration + KAFKA_LOG_DIRS: '/var/lib/kafka/data' + KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 + KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 1 + KAFKA_TRANSACTION_STATE_LOG_MIN_ISR: 1 + KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS: 0 + + # Performance & Development Settings + KAFKA_AUTO_CREATE_TOPICS_ENABLE: 'false' + KAFKA_DELETE_TOPIC_ENABLE: 'true' + KAFKA_LOG_RETENTION_HOURS: 168 + KAFKA_LOG_SEGMENT_BYTES: 
1073741824 + # Message size limits (for large task aggregates with fragments) + KAFKA_MESSAGE_MAX_BYTES: 10485760 # 10MB max message size + KAFKA_REPLICA_FETCH_MAX_BYTES: 10485760 # 10MB for replication + volumes: + - kafka_data:/var/lib/kafka/data healthcheck: - test: ["CMD", "kafka-topics", "--bootstrap-server", "localhost:29092", "--list"] + test: ["CMD-SHELL", "kafka-broker-api-versions --bootstrap-server localhost:9092 || exit 1"] interval: 5s timeout: 10s - retries: 5 + retries: 10 + start_period: 30s + networks: + - opendt-network + + # ============================================================================ + # INFRASTRUCTURE INITIALIZATION + # ============================================================================ - opendt: - build: . + kafka-init: + build: + context: . + dockerfile: services/kafka-init/Dockerfile + container_name: opendt-kafka-init + environment: + CONFIG_FILE: /app/config/simulation.yaml + PYTHONUNBUFFERED: 1 + volumes: + - ./services/kafka-init:/app/services/kafka-init + - ./libs/common:/app/libs/common + - ${CONFIG_PATH:-./config/default.yaml}:/app/config/simulation.yaml:ro depends_on: kafka: condition: service_healthy + networks: + - opendt-network + restart: "no" + + # ============================================================================ + # PYTHON MICROSERVICES + # ============================================================================ + + dc-mock: + build: + context: . 
+ dockerfile: services/dc-mock/Dockerfile + container_name: opendt-dc-mock + environment: + KAFKA_BOOTSTRAP_SERVERS: kafka:29092 + DATA_PATH: /app/data + CONFIG_FILE: /app/config/simulation.yaml + PYTHONUNBUFFERED: 1 + volumes: + # Mount source code for hot reload + - ./services/dc-mock:/app/services/dc-mock + - ./libs/common:/app/libs/common + # Mount data files + - ./data:/app/data:ro + # Mount configuration file + - ${CONFIG_PATH:-./config/default.yaml}:/app/config/simulation.yaml:ro + depends_on: + kafka-init: + condition: service_completed_successfully + networks: + - opendt-network + command: python -m dc_mock.main + + sim-worker: + build: + context: . + dockerfile: services/sim-worker/Dockerfile + container_name: opendt-sim-worker environment: KAFKA_BOOTSTRAP_SERVERS: kafka:29092 - OPENAI_API_KEY: ${OPENAI_API_KEY:-} - stop_grace_period: 3s + CONFIG_FILE: /app/config/simulation.yaml + PYTHONUNBUFFERED: 1 + # Debug mode: set to "true" to write debug files alongside main output + DEBUG_MODE: ${DEBUG_MODE:-false} + DEBUG_OUTPUT_DIR: /app/output + # Experiment mode: experiment name (used for output directory) + EXPERIMENT_NAME: ${EXPERIMENT_NAME:-default} + EXPERIMENT_OUTPUT_DIR: /app/output + volumes: + # Mount source code for hot reload + - ./services/sim-worker:/app/services/sim-worker + - ./libs/common:/app/libs/common + # Mount configuration file + - ${CONFIG_PATH:-./config/default.yaml}:/app/config/simulation.yaml:ro + # Mount output directory for debug results + - ./output:/app/output + depends_on: + kafka-init: + condition: service_completed_successfully + networks: + - opendt-network + command: python -m sim_worker.main + + dashboard: + build: + context: . 
+ dockerfile: services/dashboard/Dockerfile + container_name: opendt-dashboard + environment: + DATABASE_URL: postgresql://opendt:opendt_dev_password@postgres:5432/opendt + KAFKA_BOOTSTRAP_SERVERS: kafka:29092 + CONFIG_FILE: /app/config/simulation.yaml + PYTHONUNBUFFERED: 1 ports: - - "8080:8080" - working_dir: /app + - "8000:8000" volumes: - - ./:/app + # Mount source code for hot reload + - ./services/dashboard:/app/services/dashboard + - ./libs/common:/app/libs/common + # Mount configuration file + - ${CONFIG_PATH:-./config/default.yaml}:/app/config/simulation.yaml:ro + depends_on: + postgres: + condition: service_healthy + kafka: + condition: service_healthy + networks: + - opendt-network + command: uvicorn dashboard.main:app --host 0.0.0.0 --port 8000 --reload + + # ============================================================================ + # FRONTEND + # ============================================================================ + + # frontend: + # build: + # context: ./frontend + # dockerfile: Dockerfile + # target: development + # container_name: opendt-frontend + # environment: + # NODE_ENV: development + # NEXT_PUBLIC_API_URL: http://localhost:8000 + # ports: + # - "3000:3000" + # volumes: + # # Mount source code for HMR (Hot Module Replacement) + # - ./frontend:/app + # - /app/node_modules + # - /app/.next + # depends_on: + # - dashboard + # networks: + # - opendt-network + # command: npm run dev + +# ============================================================================== +# NETWORKS +# ============================================================================== + +networks: + opendt-network: + driver: bridge + +# ============================================================================== +# VOLUMES (Persistent Data) +# ============================================================================== + +volumes: + postgres_data: + name: opendt-postgres-data + kafka_data: + name: opendt-kafka-data diff --git a/docs/ARCHITECTURE.md 
b/docs/ARCHITECTURE.md index f893fb4..f1412f2 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -1,145 +1,244 @@ -# High-level architecture +# OpenDT Architecture -OpenDT runs two concurrent threads that communicate via Kafka: +Welcome to the OpenDT documentation! This document provides a comprehensive overview of the system's architecture, design principles, and core concepts. -## 1. Producer Loop (Workload Replay) +## Table of Contents -The producer streams workload traces to Kafka, simulating real-time datacenter activity. +- [System Overview](#system-overview) +- [Architecture Diagram](#architecture-diagram) +- [Services](#services) +- [Data Flow](#data-flow) +- [Kafka Topics](#kafka-topics) +- [Related Documentation](#related-documentation) -### Data Sources -- **tasks.parquet** - - Columns: `id`, `submission_time`, `duration`, `cpu_count`, `cpu_capacity`, `mem_capacity` - - Tasks have timestamps indicating when they were submitted - -- **fragments.parquet** - - Columns: `id` (task_id), `duration`, `cpu_count`, `cpu_usage` - - Fragments have NO submission times in raw data - - Each task is composed of β‰₯1 fragments for fine-grained resource modeling +## System Overview -More info about these fields can be found in the [OpenDC documentation](https://atlarge-research.github.io/opendc/docs/documentation/Input/Workload/#tasks). +**OpenDT** (Open Digital Twin) is a distributed system for datacenter simulation and What-If analysis. It operates in "Shadow Mode" by replaying historical workload data through the OpenDC simulator to compare predicted vs. actual power consumption. -### Key Transformations +### Key Objectives -**Fragment Timestamp Synthesis:** -Fragments inherit their parent task's submission time, offset by cumulative fragment durations. The synthesized timestamp is stored in the `submission_time` key of each fragment message: +1. **Power Consumption Prediction**: Simulate datacenter power usage based on workload patterns +2. 
**What-If Analysis**: Answer questions like "What happens if we upgrade CPU architecture?" without touching live hardware +3. **Infrastructure Optimization**: Identify opportunities for energy efficiency improvements +4. **Real-time Comparison**: Continuously compare simulation predictions against actual telemetry + +### Core Capabilities + +- Event-time windowing with configurable window sizes (default: 5 minutes) +- Cumulative simulation for accurate long-running predictions +- Topology management (real vs. simulated configurations) +- Result caching to avoid redundant simulations +- Multiple operating modes (normal, debug, experiment) +- Dynamic plot generation for power consumption analysis + +## Architecture Diagram ``` -Task starts at T=0 -Fragment 1.submission_time = T=0 + duration[0] -Fragment 2.submission_time = T=0 + duration[0] + duration[1] -Fragment 3.submission_time = T=0 + duration[0] + duration[1] + duration[2] -... +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ OpenDT System β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ dc-mock │────>β”‚ Kafka Bus β”‚ +β”‚ (Producer) β”‚ β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ β”‚ β”‚ Topics: β”‚ β”‚ +β”‚ Reads: β”‚ β”‚ β”‚ β€’ dc.workload (tasks) β”‚ β”‚ +β”‚ - tasks β”‚ β”‚ β”‚ β€’ dc.power (telemetry) β”‚ β”‚ +β”‚ - fragments β”‚ β”‚ β”‚ β€’ dc.topology (real) β”‚ β”‚ +β”‚ - power β”‚ β”‚ β”‚ β€’ sim.topology (simulated) β”‚ β”‚ 
+β”‚ - topology β”‚ β”‚ β”‚ β€’ sim.results (predictions) β”‚ β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β”‚ β€’ sys.config (runtime cfg) β”‚ β”‚ + β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ β”‚ + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β–Όβ”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β–Όβ”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ sim-worker β”‚ β”‚ dashboard β”‚ + β”‚ (Consumer) β”‚ β”‚ (FastAPI) β”‚ + β”‚ β”‚ β”‚ β”‚ + β”‚ β€’ Windows β”‚ β”‚ β€’ Web UI β”‚ + β”‚ β€’ OpenDC β”‚ β”‚ β€’ REST API β”‚ + β”‚ β€’ Caching │◀────│ β€’ Topology Mgmt β”‚ + β”‚ β€’ Experiments β”‚ β”‚ β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ β”‚ + β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β–Όβ”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ β”‚ PostgreSQL β”‚ + β”‚ β”‚ (TimescaleDB) β”‚ + β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β”Œβ”€β”€β”€β”€β”€β”€β”€β–Όβ”€β”€β”€β”€β”€β”€β”€β” + β”‚ Experiment β”‚ + β”‚ Output: β”‚ + β”‚ β€’ Parquet β”‚ + β”‚ β€’ Plots β”‚ + β”‚ β€’ Archives β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ ``` -This creates a sequential execution timeline where fragments "chain" together. +## Services -**Time-Scaled Replay:** -- Original trace: ~7 days of SURF workload data -- Replay speed: 10x accelerated (TIME_SCALE = 0.1) -- Real-time gaps between events are preserved but compressed -- Example: 1-hour workload gap β†’ 6-minute real wait +OpenDT consists of 5 microservices orchestrated via Docker Compose: -### Workload Semantics +### 1. dc-mock (Datacenter Mock) -**Task-Fragment Relationship:** -In the AtLarge traces, tasks are _always_ equally split into fragments. 
For example, we can have a task with `duration=10000ms` split into 5 fragments of `duration=2000ms`. +**Purpose**: Simulates a real datacenter by replaying historical data -**Computing Clock Cycles:** -Fragments specify the total number of clock cycles needed, calculated as: -``` -total_cycles = duration (ms) Γ— cpu_usage (MHz) Γ— 1,000 -``` -Since MHz = million cycles/second and duration is in milliseconds. - -**Simulation Behavior:** -The `duration` field represents the actual measured runtime of the original workload. However, when simulating on different topologies, the execution time varies based on CPU specifications. - -**Example:** -A fragment with `cpu_count=1`, `cpu_usage=1000 MHz`, `duration=5000 ms`: -- Total cycles needed: `5000 Γ— 1000 Γ— 1,000 = 5,000,000,000 cycles` (5 billion) - -When simulated on different hardware: -- **2 CPUs @ 1000 MHz each:** Total throughput = 2,000 MHz β†’ `duration = 5,000,000,000 / (2000 Γ— 1,000,000) = 2500 ms` -- **1 CPU @ 5000 MHz:** Total throughput = 5,000 MHz β†’ `duration = 5,000,000,000 / (5000 Γ— 1,000,000) = 1000 ms` - -This is a simplification of reality but makes reasoning about the simulation behavior clearer. 
- -### Output to Kafka - -**Topic: "tasks"** -```json -{ - "key": {"id": "task_id"}, - "value": { - "submission_time": "2022-10-06T22:00:00Z", - "duration": 27935000, - "cpu_count": 16, - "cpu_capacity": 33600.0, - "mem_capacity": 100000 - } -} -``` +**Location**: [`../services/dc-mock/`](../services/dc-mock/README.md) -**Topic: "fragments"** -```json -{ - "key": {"id": "task_id"}, - "value": { - "submission_time": "2022-10-06T22:00:30Z", - "duration": 30000, - "cpu_usage": 1953.0 - } -} -``` +**Key Features**: +- Reads Parquet files (`tasks`, `fragments`, `consumption`) from `data//` +- Publishes to Kafka with configurable speed factor (e.g., 300x real-time) +- Three independent producers: Workload, Power, Topology +- Heartbeat mechanism for window synchronization + +**Produces To**: +- `dc.workload` - Task submissions + periodic heartbeats +- `dc.power` - Power consumption telemetry +- `dc.topology` - Real datacenter topology snapshots + +--- -### Implementation -- Two parallel threads: one for tasks, one for fragments -- Both synchronized via barrier to start simultaneously -- Pacing via `sleep()` between messages to maintain temporal structure +### 2. 
sim-worker (Simulation Engine) + +**Purpose**: Core simulation worker that bridges Kafka and OpenDC simulator + +**Location**: [`../services/sim-worker/`](../services/sim-worker/README.md) + +**Key Features**: +- Event-time windowing with heartbeat-driven closing +- Cumulative simulation (re-simulates all tasks from beginning) +- Result caching based on topology hash + task count +- Multiple operating modes (normal, debug, experiment) +- Integration with OpenDC binary (Java-based simulator) + +**Consumes From**: +- `dc.workload` - Tasks and heartbeats +- `dc.topology` - Real topology snapshots +- `sim.topology` - Simulated topology updates +- `dc.power` - Actual power (experiment mode) + +**Produces To**: +- `sim.results` - Simulation predictions (normal mode) +- Local files - Results and archives (debug/experiment mode) --- -## 2. Consumer Loop (Digital Twin) +### 3. dashboard (Web Dashboard) -The consumer organizes incoming Kafka messages into time-based windows and feeds them to the OpenDC simulator. +**Purpose**: Web dashboard and REST API for system control and visualization + +**Location**: [`../services/dashboard/`](../services/dashboard/README.md) + +**Key Features**: +- Web UI for real-time visualization +- FastAPI with automatic OpenAPI documentation +- Topology management endpoint (`PUT /api/topology`) +- Health check and status endpoints +- Kafka producer for configuration updates +- Static file serving for dashboard assets + +**Routes**: +- `GET /` - Web dashboard UI +- `GET /health` - Health check (Kafka + config status) +- `GET /docs` - Interactive Swagger UI +- `PUT /api/topology` - Update simulated datacenter topology + +--- -### Windowing Strategy +### 4. 
kafka-init (Infrastructure Initialization) -**Window Size:** 5 minutes of virtual trace time (`REAL_WINDOW_SIZE_SEC = 300s`) +**Purpose**: Creates Kafka topics with proper retention and compaction policies -**Processing:** Windows are created based on `submission_time` timestamps: -- As tasks/fragments arrive, they're assigned to windows based on their timestamp -- A window becomes "ready" when data from the next time window starts arriving -- Windows are processed sequentially (FIFO) +**Location**: [`../services/kafka-init/`](../services/kafka-init/README.md) + +**Key Features**: +- Reads topic configuration from YAML +- Creates topics on Kafka startup +- Applies retention policies and compaction settings +- Fail-fast on errors + +--- + +## Data Flow + +### 1. Data Ingestion -**Example:** ``` -Window 5: 2022-10-06 22:22:00 β†’ 22:27:00 - β”œβ”€ 113 tasks - └─ 1,243 fragments (~11 per task) +data/SURF/ +β”œβ”€β”€ tasks.parquet ─┐ +β”œβ”€β”€ fragments.parquet ────> dc-mock ──> dc.workload (Kafka) +β”œβ”€β”€ consumption.parquet────> dc-mock ──> dc.power (Kafka) +└── topology.json β”€β”˜β”€β”€> dc-mock ──> dc.topology (Kafka) ``` -### Output to Orchestrator +### 2. Simulation Pipeline + +``` +dc.workload ──┐ + β”œβ”€β”€> sim-worker ──> OpenDC ──> results +dc.topology ─── (binary) +sim.topology β”€β”˜ +``` + +### 3. Window Processing + +1. **Task Arrival**: Tasks published to `dc.workload` with submission timestamps +2. **Window Assignment**: Task assigned to window based on rounded submission time +3. **Heartbeat Signal**: Periodic heartbeat messages indicate time progression +4. **Window Closing**: When heartbeat timestamp β‰₯ window end, close window +5. **Simulation**: Invoke OpenDC with cumulative tasks + simulated topology +6. **Caching**: Check if topology + task count match previous simulation +7. **Output**: Publish results or write to files based on operating mode + +### 4. 
Topology Management + +``` +User/Dashboard ──> PUT /api/topology ──> sim.topology (Kafka) ──> sim-worker + β”‚ + β”œβ”€β”€> Update simulated topology + β”œβ”€β”€> Clear result cache + └──> Use for future simulations +``` + +## Kafka Topics + +### Topic Overview + +| Topic | Type | Purpose | Retention | Key | +|-------|------|---------|-----------|-----| +| `dc.workload` | Stream | Task submissions + heartbeats | 24 hours | null | +| `dc.power` | Stream | Actual power telemetry | 1 hour | null | +| `dc.topology` | Compacted | Real datacenter topology | 1h lag | `datacenter` | +| `sim.topology` | Compacted | Simulated topology (What-If) | 0ms lag | `datacenter` | +| `sys.config` | Compacted | Runtime configuration | infinite | setting key | +| `sim.results` | Stream | Simulation predictions | 7 days | null | + +### Compaction Strategy + +**Compacted topics** (`dc.topology`, `sim.topology`, `sys.config`) keep only the latest value per key: +- Ensures consumers always get current state +- Enables efficient state recovery +- Reduces storage for infrequently changing data -Each completed window yields a `batch_data` dictionary containing: -- `tasks_sample`: List of task objects for this window -- `fragments_sample`: List of fragment objects for this window -- `task_count`, `fragment_count`: Counts -- `avg_cpu_usage`: Average CPU usage across fragments -- `window_start`, `window_end`: Time boundaries +**Stream topics** (`dc.workload`, `dc.power`, `sim.results`) retain all messages up to retention period: +- Preserves full event history +- Enables time-travel and replay +- Supports multiple consumers at different offsets -### Simulation Flow +## Related Documentation -1. Orchestrator receives `batch_data` -2. OpenDC simulates workload on current topology -3. 
Returns: `energy_kwh`, `runtime_hours`, `cpu_utilization`, `max_power_draw` +### Service Documentation +- [dc-mock README](../services/dc-mock/README.md) - Datacenter mock producer +- [sim-worker README](../services/sim-worker/README.md) - Simulation engine +- [dashboard README](../services/dashboard/README.md) - Web dashboard and API +- [kafka-init README](../services/kafka-init/README.md) - Kafka initialization -**Real example from logs:** -- Input: 113 tasks, 1,243 fragments -- Output: `energy_kwh=0.84`, `runtime_hours=2.62`, `cpu_utilization=57.3%` +### Concept Documentation +- [Data Models](./DATA_MODELS.md) - Pydantic models and data structures -### Implementation -- Three threads: tasks consumer, fragments consumer, window processor -- Threads share window state via locks/conditions -- Task-fragment matching: Fragments are joined with their parent tasks by `id` -- Windows wait `VIRTUAL_WINDOW_SIZE = 30s` between processing (real time) +### Development Resources +- [Root README](../README.md) - Quick start and setup +- [Makefile Commands](../Makefile) - Available `make` commands +- [Common Library](../libs/common/opendt_common/) - Shared Pydantic models and utilities diff --git a/docs/DATA_MODELS.md b/docs/DATA_MODELS.md new file mode 100644 index 0000000..46757c2 --- /dev/null +++ b/docs/DATA_MODELS.md @@ -0,0 +1,501 @@ +# Data Models + +This document describes all data models used in OpenDT, their structure, validation rules, and usage patterns. 
+ +## Table of Contents + +- [Overview](#overview) +- [Workload Models](#workload-models) +- [Topology Models](#topology-models) +- [Telemetry Models](#telemetry-models) +- [Message Wrappers](#message-wrappers) +- [Simulation Results](#simulation-results) +- [Data Physics](#data-physics) + +## Overview + +All data models in OpenDT are defined using **Pydantic v2** for: +- Runtime type validation +- JSON serialization/deserialization +- Automatic API documentation +- Data integrity guarantees + +**Location**: [`../libs/common/opendt_common/models/`](../libs/common/opendt_common/models/) + +## Workload Models + +### Task + +Represents a workload submission to the datacenter. + +**File**: [`task.py`](../libs/common/opendt_common/models/task.py) + +```python +class Task(BaseModel): + """A workload task submitted to the datacenter.""" + + id: int # Unique task identifier + submission_time: datetime # When task was submitted (ISO 8601) + duration: int # Total duration in milliseconds + cpu_count: int # Number of CPU cores requested + cpu_capacity: float # CPU speed in MHz + mem_capacity: int # Memory capacity in MB + fragments: list[Fragment] # Execution profile fragments +``` + +**Physical Interpretation**: +A task represents a request for compute cycles: +``` +Total Cycles = cpu_count Γ— cpu_capacity Γ— duration Γ— 1000 +``` + +**Example**: +```json +{ + "id": 2132895, + "submission_time": "2022-10-07T00:39:21", + "duration": 12000, + "cpu_count": 16, + "cpu_capacity": 33600.0, + "mem_capacity": 100000, + "fragments": [...] +} +``` + +--- + +### Fragment + +Represents a fine-grained execution profile segment of a task. 
+ +**File**: [`fragment.py`](../libs/common/opendt_common/models/fragment.py) + +```python +class Fragment(BaseModel): + """A time segment of task execution with specific resource usage.""" + + id: int # Fragment identifier + task_id: int # Parent task ID + duration: int # Fragment duration in milliseconds + cpu_count: int # Number of CPUs used in this fragment + cpu_usage: float # CPU utilization for this fragment +``` + +**Purpose**: Fragments describe non-uniform resource usage over time. For example: +- First 1000ms: 100% CPU utilization (cpu_usage = 16.0 for 16 cores) +- Next 2000ms: 50% CPU utilization (cpu_usage = 8.0 for 16 cores) + +**Example**: +```json +{ + "id": 1, + "task_id": 2132895, + "duration": 5000, + "cpu_count": 16, + "cpu_usage": 147.0 +} +``` + +--- + +### WorkloadMessage + +Wrapper for messages on `dc.workload` topic, distinguishing tasks from heartbeats. + +**File**: [`workload_message.py`](../libs/common/opendt_common/models/workload_message.py) + +```python +class WorkloadMessage(BaseModel): + """Wrapper for messages on dc.workload topic.""" + + message_type: Literal["task", "heartbeat"] # Message type discriminator + timestamp: datetime # Simulation timestamp + task: Task | None = None # Task data (only if type="task") +``` + +**Usage**: + +Task message: +```json +{ + "message_type": "task", + "timestamp": "2022-10-07T00:39:21", + "task": { /* Task object */ } +} +``` + +Heartbeat message: +```json +{ + "message_type": "heartbeat", + "timestamp": "2022-10-07T00:45:00", + "task": null +} +``` + +**Purpose**: Heartbeats signal time progression to consumers, enabling deterministic window closing even when no tasks arrive. + +## Topology Models + +### Topology + +Root model representing datacenter infrastructure. 
+ +**File**: [`topology.py`](../libs/common/opendt_common/models/topology.py) + +```python +class Topology(BaseModel): + """Datacenter topology definition.""" + + clusters: list[Cluster] # List of clusters (min 1 required) + + # Helper methods + def total_host_count() -> int + def total_core_count() -> int + def total_memory_bytes() -> int +``` + +**Example**: +```json +{ + "clusters": [ + { + "name": "A01", + "hosts": [/* Host objects */] + } + ] +} +``` + +--- + +### Cluster + +Represents a logical group of hosts. + +```python +class Cluster(BaseModel): + """Cluster of hosts in a datacenter.""" + + name: str # Cluster identifier + hosts: list[Host] # List of host configurations (min 1 required) +``` + +--- + +### Host + +Represents a physical server configuration (possibly replicated). + +```python +class Host(BaseModel): + """Host (physical server) in a datacenter cluster.""" + + name: str # Host identifier/name + count: int # Number of identical hosts + cpu: CPU # CPU specification + memory: Memory # Memory specification + cpuPowerModel: CPUPowerModel # Power consumption model +``` + +**Example**: +```json +{ + "name": "A01-Host", + "count": 277, + "cpu": { "coreCount": 16, "coreSpeed": 2100 }, + "memory": { "memorySize": 128000000 }, + "cpuPowerModel": { /* Power model */ } +} +``` + +--- + +### CPU + +CPU hardware specification. + +```python +class CPU(BaseModel): + """CPU specification for a host.""" + + coreCount: int # Number of CPU cores (> 0) + coreSpeed: float # CPU speed in MHz (> 0) +``` + +--- + +### Memory + +Memory hardware specification. + +```python +class Memory(BaseModel): + """Memory specification for a host.""" + + memorySize: int # Memory size in bytes (> 0) +``` + +--- + +### CPUPowerModel + +Defines how CPU utilization translates to power consumption. 
+ +```python +class CPUPowerModel(BaseModel): + """CPU power consumption model.""" + + modelType: Literal["asymptotic", "linear", "square", "cubic", "sqrt"] + power: float # Nominal power consumption in Watts (> 0) + idlePower: float # Power at 0% utilization in Watts (β‰₯ 0) + maxPower: float # Power at 100% utilization in Watts (> 0) + asymUtil: float = 0.5 # Asymptotic utilization coefficient (0-1) + dvfs: bool = False # Dynamic Voltage/Frequency Scaling enabled +``` + +**Power Model Types**: +- **asymptotic**: Realistic non-linear curve (recommended) +- **linear**: Simple linear interpolation between idle and max +- **square**: Quadratic relationship +- **cubic**: Cubic relationship +- **sqrt**: Square root relationship + +**Example**: +```json +{ + "modelType": "asymptotic", + "power": 400.0, + "idlePower": 32.0, + "maxPower": 180.0, + "asymUtil": 0.3, + "dvfs": false +} +``` + +--- + +### TopologySnapshot + +Timestamped wrapper for topology on `dc.topology` topic. + +```python +class TopologySnapshot(BaseModel): + """Timestamped topology snapshot for Kafka messages.""" + + timestamp: datetime # When snapshot was captured (ISO 8601) + topology: Topology # The datacenter topology +``` + +**Purpose**: Adds temporal context to topology updates, enabling time-travel and audit trails. + +**Example**: +```json +{ + "timestamp": "2022-10-07T09:14:30", + "topology": { /* Topology object */ } +} +``` + +## Telemetry Models + +### Consumption + +Power consumption telemetry from datacenter. 
+ +**File**: [`consumption.py`](../libs/common/opendt_common/models/consumption.py) + +```python +class Consumption(BaseModel): + """Power consumption measurement from datacenter.""" + + power_draw: float # Instantaneous power in Watts + energy_usage: float # Accumulated energy in Joules + timestamp: datetime # Measurement timestamp (ISO 8601) +``` + +**Example**: +```json +{ + "power_draw": 19180.0, + "energy_usage": 575400.0, + "timestamp": "2022-10-08T06:35:30" +} +``` + +**Units**: +- `power_draw`: Watts (W) +- `energy_usage`: Joules (J) - accumulated since last snapshot +- To convert Joules to kWh: `kWh = joules / 3,600,000` + +## Simulation Results + +### SimulationResults + +Output from OpenDC simulator. + +**File**: [`sim_worker/runner/models.py`](../services/sim-worker/sim_worker/runner/models.py) + +```python +class SimulationResults(BaseModel): + """Results from OpenDC simulation.""" + + status: str # "success" or "error" + error: str | None = None # Error message if failed + + # Summary Statistics + energy_kwh: float = 0.0 # Total energy in kilowatt-hours + cpu_utilization: float = 0.0 # Average CPU utilization (0.0-1.0) + max_power_draw: float = 0.0 # Maximum power in Watts + runtime_hours: float = 0.0 # Simulated runtime duration + + # Timeseries Data + power_draw_series: list[TimeseriesData] = [] # Power over time + cpu_utilization_series: list[TimeseriesData] = [] # CPU util over time + + # Metadata + temp_dir: str | None = None # Temporary directory path + opendc_output_dir: str | None = None # OpenDC output directory +``` + +--- + +### TimeseriesData + +Timeseries data point. 
+ +```python +class TimeseriesData(BaseModel): + """Single timeseries data point.""" + + timestamp: int # Milliseconds offset from simulation start + value: float # Measured value +``` + +**Example**: +```json +{ + "timestamp": 150000, + "value": 18750.5 +} +``` + +## Data Physics + +### Task Execution + +A task's resource requirements define the total compute cycles needed: + +```python +total_cycles = cpu_count Γ— cpu_capacity Γ— duration Γ— 1000 +``` + +**Example**: +- 16 cores Γ— 3360 MHz Γ— 12 seconds = 644,352,000 cycles + +### Fragment Profiling + +Fragments break down task execution into segments with varying resource usage: + +1. **Bursty Task**: + - Fragment 1 (0-1s): 100% CPU + - Fragment 2 (1-10s): 20% CPU + - Fragment 3 (10-12s): 80% CPU + +2. **Steady Task**: + - Single fragment (0-12s): 75% CPU + +This allows accurate power modeling for real-world workload patterns. + +### Energy Integration + +Total energy consumed: +``` +Energy (Joules) = ∫ Power(t) dt +Energy (kWh) = Joules / 3,600,000 +``` + +## Validation Rules + +### Field Constraints + +Pydantic enforces validation at runtime: + +- **Positive integers**: `gt=0` (cpu_count, duration, count) +- **Non-negative floats**: `ge=0` (idlePower) +- **Positive floats**: `gt=0` (cpu_capacity, maxPower) +- **Ranges**: `ge=0, le=1` (asymUtil, cpu_utilization) +- **Min length**: `min_length=1` (clusters, hosts, fragments) +- **Datetime**: ISO 8601 format required + +### Example Validation + +```python +from opendt_common.models import Task + +# Valid task +task = Task( + id=1, + submission_time="2022-10-07T00:39:21", + duration=1000, + cpu_count=8, + cpu_capacity=2400.0, + mem_capacity=64000, + fragments=[] +) + +# Invalid task (negative duration) +try: + Task( + id=2, + duration=-100, # ❌ Will raise ValidationError + ... 
+ ) +except ValidationError as e: + print(e) +``` + +## Usage Patterns + +### Serialization + +```python +from opendt_common.models import Task + +# To JSON +task_json = task.model_dump(mode="json") +task_str = task.model_dump_json() + +# From JSON +task = Task(**json_data) +task = Task.model_validate_json(json_string) +``` + +### Kafka Integration + +```python +from opendt_common.utils.kafka import send_message +from opendt_common.models import TopologySnapshot + +snapshot = TopologySnapshot( + timestamp=datetime.now(), + topology=topology +) + +send_message( + producer=producer, + topic="dc.topology", + message=snapshot.model_dump(mode="json"), + key="datacenter" +) +``` + +## Related Documentation + +- [Architecture Overview](./ARCHITECTURE.md) - System design and data flow +- [Dashboard Documentation](../services/dashboard/README.md) - Web UI and REST API using these models +- [Simulation Worker](../services/sim-worker/README.md) - How models are used in simulation + +--- + +For model source code, see [`libs/common/opendt_common/models/`](../libs/common/opendt_common/models/). diff --git a/docs/DEVELOPMENT.md b/docs/DEVELOPMENT.md deleted file mode 100644 index 494fb96..0000000 --- a/docs/DEVELOPMENT.md +++ /dev/null @@ -1,169 +0,0 @@ -# Development Guide - -This document explains all steps needed to get OpenDT running locally for development. - -## Prerequisites - -### Docker -OpenDT uses Docker for running the app in release mode, but also to easily run the Kafka broker service (and in the future, the database service). Make sure Docker is installed by following the instructions [here](https://docs.docker.com/desktop). 
- -You can verify that it is installed by running -```sh -docker --version -``` - -## Quick Start - -### Start Development - -From the root directory, run: - -```bash -docker compose up -d -``` - -This command will: - -- Start the Kafka broker service -- Start the OpenDT Flask Python service - -## Logging -To see the logs of the Kafka service, run: -```sh -docker compose logs -f --tail=1 kafka -``` - -To see _all_ the logs of the OpenDT app, run: -```sh -docker compose logs -f --tail=1 opendt -``` - -To see _just_ the logs of the Python modules, run: -```sh -docker compose logs --tail=1 -f --no-log-prefix opendt | jq -R 'fromjson? | select(. != null and (.logger | startswith("werkzeug") | not))' -``` - -To see _just_ the logs of the Flask web app, run: -```sh -docker compose logs --tail=1 -f --no-log-prefix opendt | jq -R 'fromjson? | select(. != null and (.logger | startswith("werkzeug")))' -``` - -## Repository - -High-level overview of the repository: -``` -OpenDT/ -β”‚ -β”œβ”€β”€ src/opendt/ -β”‚ β”œβ”€β”€ app.py ──────────────────── Flask application factory -β”‚ β”œβ”€β”€ cli.py ──────────────────── Main entry point -β”‚ β”‚ -β”‚ β”œβ”€β”€ api/ -β”‚ β”‚ β”œβ”€β”€ routes.py ───────────── REST endpoints + UI routes -β”‚ β”‚ β”œβ”€β”€ schemas.py ──────────── Pydantic validation models -β”‚ β”‚ └── dependencies.py ─────── Orchestrator singleton -β”‚ β”‚ -β”‚ β”œβ”€β”€ core/ -β”‚ β”‚ β”‚ -β”‚ β”‚ β”œβ”€β”€ orchestrator/ -β”‚ β”‚ β”‚ β”œβ”€β”€ controller.py ──── Main orchestrator (coordination) -β”‚ β”‚ β”‚ β”œβ”€β”€ state.py ───────── System state & buffers -β”‚ β”‚ β”‚ β”œβ”€β”€ topology.py ────── Topology file watcher -β”‚ β”‚ β”‚ └── slo.py ─────────── SLO file watcher -β”‚ β”‚ β”‚ -β”‚ β”‚ β”œβ”€β”€ simulation/ -β”‚ β”‚ β”‚ β”œβ”€β”€ runner.py ──────── OpenDC simulator wrapper -β”‚ β”‚ β”‚ └── adapters.py ────── Parquet data conversion -β”‚ β”‚ β”‚ -β”‚ β”‚ β”œβ”€β”€ optimization/ -β”‚ β”‚ β”‚ β”œβ”€β”€ base.py ────────── Strategy protocol -β”‚ β”‚ β”‚ 
β”œβ”€β”€ llm.py ─────────── OpenAI-based optimizer -β”‚ β”‚ β”‚ β”œβ”€β”€ rule_based.py ──── Heuristic fallback -β”‚ β”‚ β”‚ └── scoring.py ─────── SLO scoring functions -β”‚ β”‚ β”‚ -β”‚ β”‚ └── workers/ -β”‚ β”‚ β”œβ”€β”€ scheduler.py ───── Thread utilities -β”‚ β”‚ └── tasks.py ────────── Background task definitions -β”‚ β”‚ -β”‚ β”œβ”€β”€ adapters/ -β”‚ β”‚ └── ingestion/ -β”‚ β”‚ └── kafka/ -β”‚ β”‚ β”œβ”€β”€ producer.py ─── Streams workload to Kafka -β”‚ β”‚ └── consumer.py ─── Consumes & creates windows -β”‚ β”‚ -β”‚ └── config/ -β”‚ β”œβ”€β”€ settings.py ────────── Environment variables -β”‚ └── loaders.py ─────────── JSON file I/O -β”‚ -β”œβ”€β”€ src/templates/ -β”‚ └── index.html ─────────────── Dashboard UI -β”‚ -β”œβ”€β”€ src/static/ -β”‚ β”œβ”€β”€ style.css -β”‚ └── js/ -β”‚ β”œβ”€β”€ boot.js ────────────── App initialization -β”‚ β”œβ”€β”€ polling.js ─────────── Status polling -β”‚ β”œβ”€β”€ charts.js ──────────── Plotly visualization -β”‚ β”œβ”€β”€ recommendations.js ── LLM recommendation UI -β”‚ └── ui.js ──────────────── Event handlers -β”‚ -β”œβ”€β”€ config/ -β”‚ β”œβ”€β”€ topology.json ──────────── Cluster/host configuration -β”‚ └── slo.json ───────────────── Energy/runtime targets -β”‚ -β”œβ”€β”€ surf-workload/ -β”‚ β”œβ”€β”€ tasks.parquet ──────────── Sample workload tasks -β”‚ └── fragments.parquet ──────── Sample workload fragments -β”‚ -β”œβ”€β”€ tests/ -β”‚ β”œβ”€β”€ api/ -β”‚ β”œβ”€β”€ config/ -β”‚ β”œβ”€β”€ ingestion/ -β”‚ β”œβ”€β”€ optimization/ -β”‚ β”œβ”€β”€ orchestrator/ -β”‚ └── simulation/ -β”‚ -β”œβ”€β”€ docker-compose.yml ─────────── Kafka + OpenDT services -β”œβ”€β”€ Dockerfile ─────────────────── Container build -└── requirements.txt ───────────── Python dependencies -``` - -System flow overview: -``` -Docker Compose -β”‚ -β”œβ”€β”€ Kafka Service -β”‚ └── Topics: tasks, fragments -β”‚ -└── OpenDT Service - β”‚ - └── Flask App (cli.py β†’ app.py) - β”‚ - β”œβ”€β”€ API Routes (/api/*) - β”‚ └── β†’ OpenDTOrchestrator - β”‚ - └── 
OpenDTOrchestrator (core/orchestrator/controller.py) - β”‚ - β”œβ”€β”€ Producer Thread - β”‚ └── TimedKafkaProducer - β”‚ └── Streams surf-workload/*.parquet β†’ Kafka - β”‚ - └── Consumer Thread - └── DigitalTwinConsumer - └── Creates windows from Kafka - β”‚ - └── For each window: - β”‚ - β”œβ”€β”€ 1. Baseline - β”‚ └── OpenDCRunner β†’ simulation results - β”‚ - β”œβ”€β”€ 2. Optimize (loop) - β”‚ β”œβ”€β”€ LLM.optimize() β†’ new topology - β”‚ β”œβ”€β”€ OpenDCRunner β†’ test results - β”‚ └── Score vs SLO targets - β”‚ - └── 3. Update State - └── Store best config - β”‚ - └── UI polls /api/status -``` diff --git a/docs/OPTIMIZATION.md b/docs/OPTIMIZATION.md deleted file mode 100644 index 326f05d..0000000 --- a/docs/OPTIMIZATION.md +++ /dev/null @@ -1,217 +0,0 @@ -# Optimization - -OpenDT uses optimization strategies to propose topology changes that improve energy efficiency and runtime performance against SLO targets. - -## Overview - -After simulating each window's workload on the current topology, the optimizer **proposes modified topologies** to better meet SLO goals. These proposals are tested by re-simulating the same window with different hardware configurations, and the best result is kept. - -## SLO Targets - -Service Level Objectives (SLOs) are defined in `config/slo.json`: - -```json -{ - "energy_target": 2.0, // kWh per window - "runtime_target": 2.0 // simulated hours per window -} -``` - -These targets are **per-window** (5 minutes of workload trace time). The optimizer compares simulation results against these targets to determine if changes are needed. - -### Scoring - -The scoring function evaluates how well a topology performs: - -```python -score = (energy_kwh * 2.0) + (runtime_hours * 1.0) -``` - -- **Lower is better** -- Energy is weighted 2Γ— runtime (energy optimization prioritized) -- Used to compare baseline vs proposed topologies - -## Optimizer Inputs - -The optimizer receives comprehensive information about the current state: - -**1. 
Simulation Results** (from baseline OpenDC run): -```python -{ - 'energy_kwh': 0.84, - 'runtime_hours': 2.62, - 'cpu_utilization': 0.573, - 'max_power_draw': 400.0, - 'status': 'success' -} -``` - -**2. Batch Data** (from consumer window): -```python -{ - 'task_count': 113, - 'fragment_count': 1243, - 'avg_cpu_usage': 2156.3, // MHz - 'tasks_sample': [...], // Full task objects - 'fragments_sample': [...], // Full fragment objects - 'window_start': Timestamp(...), - 'window_end': Timestamp(...) -} -``` - -**3. SLO Targets**: -```python -{ - 'energy_target': 2.0, - 'runtime_target': 2.0 -} -``` - -**4. Current Topology**: -```python -{ - 'clusters': [{ - 'name': 'C01', - 'hosts': [{ - 'name': 'H01', - 'count': 12, // Physical host count - 'cpu': { - 'coreCount': 64, // Cores per CPU - 'coreSpeed': 4000 // MHz - }, - 'memory': { - 'memorySize': 214748364800 // Bytes - } - }] - }] -} -``` - -## Optimization Strategies - -### 1. LLM Optimizer - -Uses OpenAI GPT-3.5 to make intelligent topology recommendations. - -**Input to LLM:** -The system sends a structured prompt including: -```json -{ - "simulation_results": { - "energy_kwh": 0.84, - "runtime_hours": 2.62, - "cpu_utilization": 0.573 - }, - "workload_characteristics": { - "task_count": 113, - "fragment_count": 1243, - "avg_cpu_usage": 2156.3 - }, - "slo_targets": { - "energy_target": 2.0, - "runtime_target": 2.0 - }, - "current_topology": { - "clusters": [{ - "name": "C01", - "hosts": [{ - "name": "H01", - "count": 12, - "cpu": {"coreCount": 64, "coreSpeed": 4000} - }] - }] - } -} -``` - -**Output from LLM:** -The LLM responds with a suggested topology modification (host counts, CPU cores, CPU speeds) that is converted to OpenDC format and tested via simulation. - -**Fallback:** If the LLM call fails (timeout, API error, no API key), the system automatically falls back to rule-based optimization. - -### 2. Rule-Based Optimizer - -A deterministic, heuristic-based strategy used as fallback when LLM is unavailable. 
- -**Decision Logic:** - -| Condition | Action | Modification | -|-----------|--------|--------------| -| Energy β‰₯30% over target | Massive downscale | Reduce host count by 1 | -| Energy β‰₯15% over target | Downscale | Reduce CPU frequency by 10% (min 1800 MHz) | -| Runtime β‰₯25% over target | Scale up | Add 4 CPU cores (max 48) | -| Runtime β‰₯10% over target | Light scale up | Add 2 CPU cores (max 32) | -| Energy ≀-20% AND Runtime ≀-10% under target | Consolidate | Remove 2 CPU cores (min 8) | -| Otherwise | Maintain | No changes | - -**Example:** -If energy is 2.8 kWh against a target of 2.0 kWh (40% over), the rule-based optimizer reduces host count from 12 to 11. - -## Optimization Loop (Per Window) - -For each window, the orchestrator follows this process: - -1. **Baseline Simulation** - - Run OpenDC simulation on current window's workload using current topology - - Calculate baseline score from results - -2. **Optimization Attempt(s)** - - Call optimizer with baseline results, workload data, SLO targets, and current topology - - Optimizer returns a proposed topology modification - - Re-simulate the same window workload using the proposed topology - - Calculate score for the proposed configuration - -3. **Comparison & Selection** - - If proposed score is better than baseline by at least `IMPROVEMENT_DELTA` (0.05), keep the proposed topology - - Otherwise, keep the current topology - -4. **Store Best Configuration** - - Save the best topology and score in system state - - User can view and accept recommendations via the UI - -**Configuration:** -- `MAX_TRIES_PER_WINDOW = 1`: Number of optimization attempts per window (currently 1) -- `WINDOW_TRY_BUDGET_SEC = 30`: Maximum time allowed for optimization per window -- `IMPROVEMENT_DELTA = 0.05`: Minimum score improvement required to accept a change - -## Window-Scoped Optimization - -Optimization is performed **per-window**: only the current window's workload is used for testing topologies. 
Past windows are not re-simulated. - -### Example - -For Window 5 with 113 tasks and 1,243 fragments: - -**Baseline:** -- Topology: 12 hosts Γ— 64 cores @ 4000 MHz -- Simulate Window 5 workload β†’ energy: 0.84 kWh, runtime: 2.62h β†’ Score: 3.70 - -**Optimization:** -- LLM proposes: 10 hosts Γ— 48 cores @ 3500 MHz -- Re-simulate Window 5 workload β†’ energy: 0.72 kWh, runtime: 2.85h β†’ Score: 4.29 - -**Decision:** Keep baseline topology (score 3.70 is better than 4.29) - -The assumption is that a topology performing well on this window will likely work well for future similar workloads. - -## User Interaction - -The optimization loop **proposes** topologies but doesn't automatically apply them. The user can: - -1. **View recommendations** in the web UI dashboard -2. **Accept a recommendation** via the `/api/accept_recommendation` endpoint - -When a topology is accepted: -- It's written to `config/topology.json` -- The file watcher detects the change -- Future windows use the new topology as baseline - -## Configuration - -Key settings in `src/opendt/config/settings.py`: - -```python -IMPROVEMENT_DELTA = 0.05 # Minimum score improvement to accept change -WINDOW_TRY_BUDGET_SEC = 30.0 # Max time for optimization per window -MAX_TRIES_PER_WINDOW = 1 # Number of optimization attempts (currently 1) -``` diff --git a/entrypoint.sh b/entrypoint.sh deleted file mode 100644 index 93eb49e..0000000 --- a/entrypoint.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/bin/sh -set -e - -# Recompute JAVA_HOME safely in case image/base changes -if command -v java >/dev/null 2>&1; then - JH="$(dirname "$(dirname "$(readlink -f "$(command -v java)")")")" - if [ -d "$JH" ]; then - export JAVA_HOME="$JH" - export PATH="$JAVA_HOME/bin:$PATH" - fi -fi - -# Ensure Python can resolve the in-repo package layout -if [ -d /app/src ]; then - case ":$PYTHONPATH:" in - *":/app/src:"*) ;; - *) export PYTHONPATH="${PYTHONPATH:+$PYTHONPATH:}/app/src" ;; - esac -fi - -# Ensure the OpenDC runner is executable 
(covers bind-mount cases) -OPENDC_ROOT="/app/src/opendt/core/simulation/opendc" -RUNNER="${OPENDC_ROOT}/bin/OpenDCExperimentRunner/bin/OpenDCExperimentRunner" - -if [ ! -f "$RUNNER" ]; then - RUNNER="${OPENDC_ROOT}/bin/OpenDCExperimentRunner/OpenDCExperimentRunner" -fi - -if [ ! -f "$RUNNER" ]; then - # Legacy fallback for older volume layouts - LEGACY_ROOT="/app/opendt-simulator/bin/OpenDCExperimentRunner" - if [ -f "${LEGACY_ROOT}/bin/OpenDCExperimentRunner" ]; then - RUNNER="${LEGACY_ROOT}/bin/OpenDCExperimentRunner" - OPENDC_ROOT="${LEGACY_ROOT}" - elif [ -f "${LEGACY_ROOT}/OpenDCExperimentRunner" ]; then - RUNNER="${LEGACY_ROOT}/OpenDCExperimentRunner" - OPENDC_ROOT="${LEGACY_ROOT}" - fi -fi - -if [ -f "$RUNNER" ]; then - chmod +x "$RUNNER" || true - # Make sure its directories are traversable - chmod -R a+rx "$OPENDC_ROOT" || true -fi - -#give perms to run script -#chmod 777 /app/run.sh - -exec "$@" diff --git a/libs/common/opendt_common/__init__.py b/libs/common/opendt_common/__init__.py new file mode 100644 index 0000000..3cc7f32 --- /dev/null +++ b/libs/common/opendt_common/__init__.py @@ -0,0 +1,27 @@ +"""OpenDT Common Library - Shared models and utilities.""" + +__version__ = "0.1.0" + +from opendt_common.config import ( + AppConfig, + DynamicConfigEvent, + FeatureFlags, + SimConfig, + WorkloadContext, + load_config_from_env, +) +from opendt_common.models import Consumption, Fragment, Task, Topology, TopologySnapshot + +__all__ = [ + "Task", + "Fragment", + "Consumption", + "Topology", + "TopologySnapshot", + "AppConfig", + "SimConfig", + "FeatureFlags", + "WorkloadContext", + "DynamicConfigEvent", + "load_config_from_env", +] diff --git a/libs/common/opendt_common/config.py b/libs/common/opendt_common/config.py new file mode 100644 index 0000000..40bfa5e --- /dev/null +++ b/libs/common/opendt_common/config.py @@ -0,0 +1,333 @@ +"""Configuration management for OpenDT services. + +This module provides: +1. 
# === libs/common/opendt_common/config.py ===
"""Configuration management for OpenDT services.

Provides:
1. Static configuration from YAML files (startup)
2. Dynamic configuration updates via Kafka (runtime)
3. Path resolution based on workload names
"""

from pathlib import Path
from typing import Any

import yaml
from pydantic import BaseModel, Field, field_validator


class KafkaTopicConfig(BaseModel):
    """Configuration for a single Kafka topic."""

    name: str = Field(..., description="The actual topic name")
    partitions: int = Field(default=1, description="Number of partitions", gt=0)
    replication_factor: int = Field(default=1, description="Replication factor", gt=0)
    config: dict[str, str] = Field(
        default_factory=dict, description="Key-value pairs for topic properties"
    )


class KafkaConfig(BaseModel):
    """Kafka infrastructure configuration."""

    bootstrap_servers: str = Field(
        default="localhost:9092", description="Kafka bootstrap server addresses"
    )
    topics: dict[str, KafkaTopicConfig] = Field(
        default_factory=dict, description="Topic configurations keyed by logical name"
    )


class SimConfig(BaseModel):
    """Simulation configuration parameters."""

    speed_factor: float = Field(
        default=10.0, description="Simulation speed: 1.0 = realtime, -1 = max speed, >1 = faster"
    )
    window_size_minutes: int = Field(
        default=5, description="Time window size in minutes for aggregation", gt=0
    )
    heartbeat_cadence_minutes: int = Field(
        default=1, description="Cadence in simulation minutes for workload heartbeat messages", gt=0
    )
    experiment_mode: bool = Field(
        default=False,
        description="Enable experiment mode (write results to parquet instead of Kafka)",
    )

    @field_validator("speed_factor")
    @classmethod
    def validate_speed_factor(cls, v: float) -> float:
        """Reject zero/negative speeds; -1 is the 'run at max speed' sentinel."""
        if v != -1 and v <= 0:
            raise ValueError("speed_factor must be positive or -1 (max speed)")
        return v


class FeatureFlags(BaseModel):
    """Feature flags for enabling/disabling functionality."""

    calibration_enabled: bool = Field(default=False, description="Enable power model calibration")


class WorkloadMetadata(BaseModel):
    """Workload-specific metadata and configuration (from workload.yaml)."""

    name: str = Field(..., description="Workload name")
    description: str | None = Field(default=None, description="Workload description")
    consumption_offset_ms: int = Field(
        default=0, description="Offset in ms to add to consumption timestamps"
    )

    @classmethod
    def load(cls, path: Path) -> "WorkloadMetadata":
        """Load workload metadata from a YAML file.

        Falls back to a default instance named after the parent directory
        when the file does not exist.
        """
        if not path.exists():
            return cls(name=path.parent.name)

        with open(path) as f:
            data = yaml.safe_load(f)

        # Only the 'timestamps' sub-section is consumed beyond name/description.
        timestamps = data.get("timestamps", {})
        return cls(
            name=data.get("name", path.parent.name),
            description=data.get("description"),
            consumption_offset_ms=timestamps.get("consumption_offset_ms", 0),
        )


class WorkloadContext(BaseModel):
    """Workload context with resolved file paths under base_path/name/."""

    name: str = Field(..., description="Workload name (e.g., 'SURF')")
    base_path: Path = Field(default=Path("/app/data"), description="Base data directory")
    metadata: WorkloadMetadata | None = Field(None, description="Workload metadata")

    def __init__(self, **data):
        """Initialize and auto-load metadata from workload.yaml when present."""
        super().__init__(**data)
        if self.metadata is None and self.workload_config_file.exists():
            self.metadata = WorkloadMetadata.load(self.workload_config_file)

    @property
    def tasks_file(self) -> Path:
        """Path to tasks.parquet file."""
        return self.base_path / self.name / "tasks.parquet"

    @property
    def fragments_file(self) -> Path:
        """Path to fragments.parquet file."""
        return self.base_path / self.name / "fragments.parquet"

    @property
    def consumption_file(self) -> Path:
        """Path to consumption.parquet file."""
        return self.base_path / self.name / "consumption.parquet"

    @property
    def topology_file(self) -> Path:
        """Path to topology.json file."""
        return self.base_path / self.name / "topology.json"

    @property
    def workload_dir(self) -> Path:
        """Path to workload directory."""
        return self.base_path / self.name

    @property
    def workload_config_file(self) -> Path:
        """Path to workload configuration file (workload.yaml)."""
        return self.base_path / self.name / "workload.yaml"

    @property
    def consumption_offset_ms(self) -> int:
        """Consumption timestamp offset in milliseconds (0 without metadata)."""
        return self.metadata.consumption_offset_ms if self.metadata else 0

    def exists(self) -> bool:
        """Check if the workload directory exists."""
        return self.workload_dir.exists()

    def get_file_status(self) -> dict[str, bool]:
        """Check which workload files exist on disk."""
        return {
            "tasks": self.tasks_file.exists(),
            "fragments": self.fragments_file.exists(),
            "consumption": self.consumption_file.exists(),
            "topology": self.topology_file.exists(),
        }

    class Config:
        # Path fields require this.
        arbitrary_types_allowed = True


class AppConfig(BaseModel):
    """Main application configuration (root of the YAML config file)."""

    workload: str = Field(..., description="Workload name (e.g., 'SURF')")
    # default_factory takes the class directly; wrapping in a lambda was redundant.
    simulation: SimConfig = Field(default_factory=SimConfig)
    features: FeatureFlags = Field(default_factory=FeatureFlags)
    kafka: KafkaConfig = Field(default_factory=KafkaConfig)

    def get_workload_context(self, base_path: Path | None = None) -> WorkloadContext:
        """Get workload context with resolved paths.

        Args:
            base_path: Override the default base path (/app/data)

        Returns:
            WorkloadContext with resolved file paths
        """
        if base_path is None:
            base_path = Path("/app/data")
        return WorkloadContext(name=self.workload, base_path=base_path)

    @classmethod
    def load(cls, path: str | Path) -> "AppConfig":
        """Load configuration from a YAML file.

        Args:
            path: Path to YAML configuration file

        Returns:
            Loaded AppConfig instance

        Raises:
            FileNotFoundError: If config file doesn't exist
            ValueError: If YAML is empty or invalid
        """
        config_path = Path(path)
        if not config_path.exists():
            raise FileNotFoundError(f"Config file not found: {config_path}")

        with open(config_path) as f:
            data = yaml.safe_load(f)

        if not data:
            raise ValueError(f"Empty or invalid YAML in {config_path}")

        return cls(**data)

    def save(self, path: str | Path) -> None:
        """Save configuration to a YAML file, creating parent directories.

        Args:
            path: Path to save configuration file
        """
        config_path = Path(path)
        config_path.parent.mkdir(parents=True, exist_ok=True)

        with open(config_path, "w") as f:
            yaml.safe_dump(self.model_dump(), f, default_flow_style=False, sort_keys=False)

    def to_dict(self) -> dict[str, Any]:
        """Convert config to a plain dictionary."""
        return self.model_dump()


class DynamicConfigEvent(BaseModel):
    """Event model for runtime configuration updates via Kafka.

    Published to topic: sys.config

    Example:
        {
            "setting_key": "simulation.speed_factor",
            "new_value": 5.0,
            "timestamp": "2024-01-01T12:00:00Z",
            "source": "api"
        }
    """

    setting_key: str = Field(
        ..., description="Dot-notation path to setting (e.g., 'simulation.speed_factor')"
    )
    new_value: Any = Field(..., description="New value for the setting")
    timestamp: str | None = Field(None, description="ISO timestamp of when the change was made")
    source: str | None = Field(
        None, description="Source of the configuration change (e.g., 'api', 'admin')"
    )

    def apply_to_config(self, config: AppConfig) -> AppConfig:
        """Apply this configuration change to an AppConfig instance.

        Args:
            config: The configuration to update

        Returns:
            Updated configuration (new instance; the input is not mutated)

        Raises:
            ValueError: If setting_key is invalid
        """
        parts = self.setting_key.split(".")
        if len(parts) < 2:
            raise ValueError(f"Invalid setting_key: {self.setting_key}")

        config_dict = config.to_dict()

        # Walk to the parent of the leaf setting. Guard against paths that
        # descend into non-dict values (e.g. "workload.x"): without the
        # isinstance check, `part not in target` on a string performed a
        # substring test and indexing then raised TypeError instead of the
        # documented ValueError.
        target = config_dict
        for part in parts[:-1]:
            if not isinstance(target, dict) or part not in target:
                raise ValueError(f"Invalid setting path: {self.setting_key}")
            target = target[part]

        final_key = parts[-1]
        if not isinstance(target, dict) or final_key not in target:
            raise ValueError(f"Invalid setting key: {self.setting_key}")

        target[final_key] = self.new_value

        # Re-validate through the model constructor so bad values are rejected.
        return AppConfig(**config_dict)


def load_config_from_env(env_var: str = "CONFIG_FILE") -> AppConfig:
    """Load configuration from the path named by an environment variable.

    Args:
        env_var: Name of environment variable containing config path

    Returns:
        Loaded AppConfig instance

    Raises:
        ValueError: If environment variable not set
        FileNotFoundError: If config file doesn't exist
    """
    import os

    config_path = os.getenv(env_var)
    if not config_path:
        raise ValueError(f"Environment variable {env_var} not set")

    return AppConfig.load(config_path)


if __name__ == "__main__":
    # Smoke-test example usage.
    config = AppConfig.load("config/default.yaml")
    print(f"Loaded config for workload: {config.workload}")

    workload = config.get_workload_context()
    print(f"Tasks file: {workload.tasks_file}")
    print(f"Fragments file: {workload.fragments_file}")
    print(f"File status: {workload.get_file_status()}")

    event = DynamicConfigEvent(
        setting_key="simulation.speed_factor", new_value=20.0, timestamp=None, source="api"
    )
    updated_config = event.apply_to_config(config)
    print(f"Updated speed factor: {updated_config.simulation.speed_factor}")


# === libs/common/opendt_common/models/__init__.py ===
"""Shared Pydantic models for OpenDT."""

from opendt_common.models.consumption import Consumption
from opendt_common.models.fragment import Fragment
from opendt_common.models.task import Task
from opendt_common.models.topology import (
    CPU,
    Cluster,
    CPUPowerModel,
    Host,
    Memory,
    Topology,
    TopologySnapshot,
)
from opendt_common.models.workload_message import WorkloadMessage

# Resolve the forward reference in Task.fragments (list["Fragment"]).
Task.model_rebuild()

__all__ = [
    "Task",
    "Fragment",
    "Consumption",
    "Topology",
    "TopologySnapshot",
    "Cluster",
    "Host",
    "CPU",
    "Memory",
    "CPUPowerModel",
    "WorkloadMessage",
]
# === libs/common/opendt_common/models/consumption.py ===
"""Consumption model from consumption.parquet."""

from datetime import datetime

from pydantic import BaseModel, Field, field_validator


class Consumption(BaseModel):
    """One power/energy measurement from the consumption trace.

    Mirrors a row of consumption.parquet:
    - power_draw: Power consumption in watts
    - energy_usage: Energy consumed in joules
    - timestamp: Absolute timestamp of measurement
    """

    power_draw: float = Field(..., description="Power consumption in watts", ge=0)
    energy_usage: float = Field(..., description="Energy consumed in joules", ge=0)
    timestamp: datetime = Field(..., description="Absolute timestamp of measurement")

    @field_validator("timestamp", mode="before")
    @classmethod
    def parse_timestamp(cls, v: datetime | int | float) -> datetime:
        """Coerce epoch milliseconds into a datetime; pass datetimes through."""
        if not isinstance(v, (int, float)):
            return v
        # Parquet stores epoch ms; fromtimestamp() expects seconds.
        # NOTE(review): yields a naive local-time datetime — confirm UTC intent.
        return datetime.fromtimestamp(v / 1000.0)

    @property
    def energy_usage_kwh(self) -> float:
        """Energy usage in kilowatt-hours (1 kWh = 3,600,000 joules)."""
        return self.energy_usage / 3_600_000.0

    @property
    def power_draw_kw(self) -> float:
        """Power draw in kilowatts (kW)."""
        return self.power_draw / 1000.0

    class Config:
        json_schema_extra = {
            "example": {
                "power_draw": 250.5,
                "energy_usage": 125250.0,
                "timestamp": "2024-01-01T00:00:00Z",
            }
        }


# === libs/common/opendt_common/models/fragment.py ===
"""Fragment model from fragments.parquet."""

from pydantic import BaseModel, Field, field_validator


class Fragment(BaseModel):
    """A workload fragment (child record of a Task).

    Mirrors a row of fragments.parquet:
    - task_id: Task ID (foreign key, aliased from 'id' in parquet)
    - duration: Fragment duration in milliseconds
    - cpu_count: Number of CPU cores
    - cpu_usage: MHz usage per CPU core
    """

    task_id: int = Field(..., alias="id", description="Task ID (foreign key)")
    duration: int = Field(..., description="Fragment duration in milliseconds", ge=0)
    cpu_count: int = Field(..., description="Number of CPU cores", ge=0)
    cpu_usage: float = Field(..., description="MHz usage per CPU core", ge=0)

    @field_validator("task_id", mode="before")
    @classmethod
    def parse_id(cls, v: str | int) -> int:
        """Coerce string IDs (including the 'task-<n>' form) to int."""
        if not isinstance(v, str):
            return v
        # "task-123" -> 123; plain numeric strings pass straight to int().
        return int(v.split("-")[1]) if "task-" in v else int(v)

    @property
    def duration_seconds(self) -> float:
        """Duration converted from milliseconds to seconds."""
        return self.duration / 1000.0

    @property
    def total_cpu_usage_mhz(self) -> float:
        """Aggregate CPU usage across all cores, in MHz."""
        return self.cpu_count * self.cpu_usage

    class Config:
        populate_by_name = True  # Allow both 'id' and 'task_id'
        json_schema_extra = {
            "example": {
                "id": 123,  # Will be mapped to task_id
                "duration": 5000,
                "cpu_count": 4,
                "cpu_usage": 1800.0,
            }
        }


# === libs/common/opendt_common/models/task.py ===
"""Task model from tasks.parquet."""

from datetime import datetime
from typing import TYPE_CHECKING

from pydantic import BaseModel, Field, field_validator

if TYPE_CHECKING:
    # Type-only import to avoid a circular dependency at runtime.
    from .fragment import Fragment


class Task(BaseModel):
    """A computational task from the workload trace (AGGREGATE ROOT).

    Mirrors a row of tasks.parquet:
    - id: Task identifier (parsed from string to int)
    - submission_time: Task submission timestamp (epoch ms)
    - duration: Task duration in milliseconds
    - cpu_count: Number of CPU cores
    - cpu_capacity: MHz per CPU core
    - mem_capacity: Memory in MB
    - fragments: List of child fragments (aggregated by the producer,
      not present in the parquet itself)
    """

    id: int = Field(..., description="Unique task identifier")
    submission_time: datetime = Field(..., description="Task submission timestamp (epoch ms)")
    duration: int = Field(..., description="Task duration in milliseconds", ge=0)
    cpu_count: int = Field(..., description="Number of CPU cores", ge=0)
    cpu_capacity: float = Field(..., description="MHz per CPU core", ge=0)
    mem_capacity: int = Field(..., description="Memory capacity in MB", ge=0)

    # Aggregation field: populated by the producer, empty by default.
    fragments: list["Fragment"] = Field(default_factory=list, description="Child fragments")

    @field_validator("id", mode="before")
    @classmethod
    def parse_id(cls, v: str | int) -> int:
        """Coerce string IDs (including the 'task-<n>' form) to int."""
        if not isinstance(v, str):
            return v
        return int(v.split("-")[1]) if "task-" in v else int(v)

    @field_validator("submission_time", mode="before")
    @classmethod
    def parse_submission_time(cls, v: datetime | int | float) -> datetime:
        """Coerce epoch milliseconds into a datetime; pass datetimes through."""
        if not isinstance(v, (int, float)):
            return v
        # Epoch ms -> seconds for fromtimestamp().
        return datetime.fromtimestamp(v / 1000.0)

    @property
    def duration_seconds(self) -> float:
        """Duration converted from milliseconds to seconds."""
        return self.duration / 1000.0

    @property
    def total_cpu_mhz(self) -> float:
        """Aggregate CPU capacity across all cores, in MHz."""
        return self.cpu_count * self.cpu_capacity

    @property
    def mem_capacity_gb(self) -> float:
        """Memory capacity in GB (MB / 1024)."""
        return self.mem_capacity / 1024.0

    @property
    def fragment_count(self) -> int:
        """Number of aggregated child fragments."""
        return len(self.fragments)

    class Config:
        json_schema_extra = {
            "example": {
                "id": 123,
                "submission_time": "2024-01-01T00:00:00Z",
                "duration": 120500,
                "cpu_count": 4,
                "cpu_capacity": 2400.0,
                "mem_capacity": 4096,
                "fragments": [],
            }
        }
# === libs/common/opendt_common/models/topology.py ===
"""Topology models for datacenter infrastructure.

Hierarchy: Datacenter (root) -> Clusters -> Hosts -> (CPU, Memory,
CPU Power Model).
"""

from datetime import datetime
from typing import Literal

from pydantic import BaseModel, Field


class CPU(BaseModel):
    """CPU specification for a host."""

    coreCount: int = Field(..., description="Number of CPU cores", gt=0)
    coreSpeed: float = Field(..., description="CPU speed in MHz", gt=0)


class Memory(BaseModel):
    """Memory specification for a host."""

    memorySize: int = Field(..., description="Memory size in bytes", gt=0)


class CPUPowerModel(BaseModel):
    """CPU power consumption model.

    Maps CPU utilization to power draw (Watts) for the simulator.
    """

    modelType: Literal["asymptotic", "linear", "square", "cubic", "sqrt"] = Field(
        ..., description="Power model type"
    )
    power: float = Field(..., description="Nominal power consumption in Watts", gt=0)
    idlePower: float = Field(..., description="Power at 0% utilization in Watts", ge=0)
    maxPower: float = Field(..., description="Power at 100% utilization in Watts", gt=0)
    asymUtil: float = Field(
        default=0.5,
        description="Asymptotic utilization coefficient (for asymptotic model)",
        ge=0,
        le=1,
    )
    dvfs: bool = Field(
        default=False,
        description="Dynamic Voltage and Frequency Scaling enabled",
    )


class Host(BaseModel):
    """Host (physical server) type within a datacenter cluster."""

    name: str = Field(..., description="Host identifier/name")
    count: int = Field(..., description="Number of identical hosts", gt=0)
    cpu: CPU = Field(..., description="CPU specification")
    memory: Memory = Field(..., description="Memory specification")
    cpuPowerModel: CPUPowerModel = Field(..., description="CPU power consumption model")


class Cluster(BaseModel):
    """Cluster of host types within a datacenter."""

    name: str = Field(..., description="Cluster identifier/name")
    hosts: list[Host] = Field(..., description="List of host types in this cluster", min_length=1)


class Topology(BaseModel):
    """Datacenter topology definition.

    Captures the hierarchical structure and hardware capabilities of a
    datacenter for simulation purposes.
    """

    clusters: list[Cluster] = Field(
        ..., description="List of clusters in the datacenter", min_length=1
    )

    def _iter_hosts(self):
        """Yield every host type across all clusters."""
        for cluster in self.clusters:
            yield from cluster.hosts

    def total_host_count(self) -> int:
        """Total number of physical hosts across all clusters."""
        return sum(host.count for host in self._iter_hosts())

    def total_core_count(self) -> int:
        """Total number of CPU cores across all clusters."""
        return sum(host.count * host.cpu.coreCount for host in self._iter_hosts())

    def total_memory_bytes(self) -> int:
        """Total memory capacity in bytes across all clusters."""
        return sum(host.count * host.memory.memorySize for host in self._iter_hosts())

    class Config:
        # Allow extra fields for forward compatibility
        extra = "allow"


class TopologySnapshot(BaseModel):
    """Timestamped topology snapshot for Kafka messages.

    Wraps a Topology with the moment it was captured/published.
    """

    timestamp: datetime = Field(
        ..., description="When this topology snapshot was captured (ISO 8601 format)"
    )
    topology: Topology = Field(..., description="The datacenter topology")

    class Config:
        # Second-precision timestamps serialize without a fractional part;
        # anything with microseconds falls back to full isoformat().
        json_encoders = {
            datetime: lambda v: v.strftime("%Y-%m-%dT%H:%M:%S")
            if v.microsecond == 0
            else v.isoformat()
        }


# === libs/common/opendt_common/models/workload_message.py ===
"""Workload message wrapper for the dc.workload topic.

Wraps both task submissions and heartbeat messages.
"""

from datetime import datetime
from typing import Literal

from pydantic import BaseModel, Field

from .task import Task


class WorkloadMessage(BaseModel):
    """Envelope for messages on the dc.workload topic.

    Two message types are supported:
    - 'task': a workload submission carrying task data
    - 'heartbeat': a keepalive so consumers can detect end-of-stream

    Heartbeats let a consumer distinguish "workload finished" from
    "Kafka is merely delayed".
    """

    message_type: Literal["task", "heartbeat"] = Field(
        ..., description="Type of message: 'task' for workload or 'heartbeat' for keepalive"
    )
    timestamp: datetime = Field(..., description="Simulation timestamp of this message")
    task: Task | None = Field(
        None, description="Task data (only present when message_type='task')"
    )

    class Config:
        json_schema_extra = {
            "examples": [
                {
                    "message_type": "task",
                    "timestamp": "2024-01-01T00:05:30",
                    "task": {
                        "id": 123,
                        "submission_time": "2024-01-01T00:05:30",
                        "duration": 120500,
                        "cpu_count": 4,
                        "cpu_capacity": 2400.0,
                        "mem_capacity": 4096,
                        "fragments": [],
                    },
                },
                {"message_type": "heartbeat", "timestamp": "2024-01-01T00:06:00", "task": None},
            ]
        }


# === libs/common/opendt_common/utils/__init__.py ===
"""Shared utilities for OpenDT services."""

from opendt_common.utils.kafka import get_kafka_consumer, get_kafka_producer

__all__ = [
    "get_kafka_producer",
    "get_kafka_consumer",
]
# === libs/common/opendt_common/utils/kafka.py ===
"""Kafka utilities for OpenDT services."""

import json
import logging
import os
from typing import Any

from kafka import KafkaConsumer, KafkaProducer
from kafka.errors import KafkaError

logger = logging.getLogger(__name__)


def get_kafka_bootstrap_servers() -> str:
    """Get Kafka bootstrap servers from environment or use default."""
    return os.getenv("KAFKA_BOOTSTRAP_SERVERS", "localhost:9092")


def get_kafka_producer(bootstrap_servers: str | None = None, **kwargs: Any) -> KafkaProducer:
    """Create a Kafka producer with sensible defaults.

    Args:
        bootstrap_servers: Kafka bootstrap servers (defaults to env var)
        **kwargs: Additional KafkaProducer configuration (overrides defaults)

    Returns:
        Configured KafkaProducer instance
    """
    if bootstrap_servers is None:
        bootstrap_servers = get_kafka_bootstrap_servers()

    default_config = {
        "bootstrap_servers": bootstrap_servers,
        "value_serializer": lambda v: json.dumps(v).encode("utf-8"),
        # BUG FIX: was `if k else None`, which mapped an empty-string key
        # (falsy) to None and silently changed partitioning; only a truly
        # absent key should serialize to None.
        "key_serializer": lambda k: k.encode("utf-8") if k is not None else None,
        "acks": "all",
        "retries": 3,
        # With retries > 0, one in-flight request preserves message ordering.
        "max_in_flight_requests_per_connection": 1,
        # Streaming-friendly settings: send immediately rather than batching
        "linger_ms": 10,  # Wait max 10ms before sending a batch
        "batch_size": 16384,  # 16KB batch size (default)
        # Increase message size limits for large task aggregates with many fragments
        "max_request_size": 10485760,  # 10MB (default is 1MB)
        "buffer_memory": 33554432,  # 32MB buffer (default)
    }

    # Caller-provided kwargs win over defaults.
    config = {**default_config, **kwargs}

    logger.info(f"Creating Kafka producer for {bootstrap_servers}")
    return KafkaProducer(**config)


def get_kafka_consumer(
    topics: list[str], group_id: str, bootstrap_servers: str | None = None, **kwargs: Any
) -> KafkaConsumer:
    """Create a Kafka consumer with sensible defaults.

    Args:
        topics: List of topics to subscribe to
        group_id: Consumer group ID
        bootstrap_servers: Kafka bootstrap servers (defaults to env var)
        **kwargs: Additional KafkaConsumer configuration (overrides defaults)

    Returns:
        Configured KafkaConsumer instance
    """
    if bootstrap_servers is None:
        bootstrap_servers = get_kafka_bootstrap_servers()

    default_config = {
        "bootstrap_servers": bootstrap_servers,
        "group_id": group_id,
        "value_deserializer": lambda m: json.loads(m.decode("utf-8")),
        # BUG FIX: mirror the producer — only a missing key maps to None;
        # empty bytes decode to "" rather than being dropped.
        "key_deserializer": lambda k: k.decode("utf-8") if k is not None else None,
        "auto_offset_reset": "earliest",
        "enable_auto_commit": True,
        "max_poll_records": 500,
    }

    # Caller-provided kwargs win over defaults.
    config = {**default_config, **kwargs}

    logger.info(f"Creating Kafka consumer for topics {topics} in group {group_id}")
    return KafkaConsumer(*topics, **config)


# Mutable counter used to log only the first few sends at INFO level.
# NOTE(review): not thread-safe; fine for a single producer thread — confirm.
_message_count = {"count": 0}


def send_message(
    producer: KafkaProducer, topic: str, message: dict[str, Any], key: str | None = None
) -> None:
    """Send a message to a Kafka topic, blocking until acknowledged.

    Args:
        producer: KafkaProducer instance
        topic: Topic name
        message: Message payload (will be JSON serialized)
        key: Optional message key

    Raises:
        KafkaError: If the broker rejects or times out the send
        Exception: Any other unexpected failure (logged with traceback)
    """
    try:
        future = producer.send(topic, key=key, value=message)
        # Block for 'synchronous' sends so failures surface to the caller.
        record_metadata = future.get(timeout=10)

        # Log first few messages at INFO level for debugging
        _message_count["count"] += 1
        if _message_count["count"] <= 5:
            logger.info(
                f"βœ“ Message {_message_count['count']} sent to {topic} "
                f"partition {record_metadata.partition} offset {record_metadata.offset}"
            )
        else:
            logger.debug(
                f"Message sent to {topic} partition {record_metadata.partition} "
                f"at offset {record_metadata.offset}"
            )
    except KafkaError as e:
        logger.error(f"Failed to send message to {topic}: {e}")
        raise
    except Exception as e:
        logger.error(f"Unexpected error sending message to {topic}: {e}", exc_info=True)
        raise
+include = ["opendt_common*"] + +[tool.setuptools.package-data] +opendt_common = ["py.typed"] + +[tool.black] +line-length = 100 +target-version = ["py311"] + +[tool.ruff] +line-length = 100 +target-version = "py311" + +[tool.ruff.lint] +select = [ + "E", # pycodestyle errors + "W", # pycodestyle warnings + "F", # pyflakes + "I", # isort + "C", # flake8-comprehensions + "B", # flake8-bugbear + "UP", # pyupgrade +] +ignore = [] + +[tool.mypy] +python_version = "3.11" +warn_return_any = true +warn_unused_configs = true +disallow_untyped_defs = true diff --git a/libs/common/pytest.ini b/libs/common/pytest.ini new file mode 100644 index 0000000..34f4d88 --- /dev/null +++ b/libs/common/pytest.ini @@ -0,0 +1,11 @@ +[pytest] +testpaths = tests +python_files = test_*.py +python_classes = Test* +python_functions = test_* +addopts = + -v + --tb=short + --strict-markers +markers = + slow: marks tests as slow diff --git a/libs/common/tests/__init__.py b/libs/common/tests/__init__.py new file mode 100644 index 0000000..62b623d --- /dev/null +++ b/libs/common/tests/__init__.py @@ -0,0 +1 @@ +"""Tests for opendt-common library.""" diff --git a/libs/common/tests/test_models.py b/libs/common/tests/test_models.py new file mode 100644 index 0000000..c90eeda --- /dev/null +++ b/libs/common/tests/test_models.py @@ -0,0 +1,227 @@ +"""Tests for Pydantic models with actual SURF workload data.""" + +from datetime import datetime +from pathlib import Path + +import pandas as pd +import pytest +from opendt_common import Consumption, Fragment, Task + +# Locate test data +DATA_DIR = Path(__file__).parent.parent.parent.parent / "data" / "SURF" +TASKS_FILE = DATA_DIR / "tasks.parquet" +FRAGMENTS_FILE = DATA_DIR / "fragments.parquet" +CONSUMPTION_FILE = DATA_DIR / "consumption.parquet" + + +@pytest.fixture +def tasks_df(): + """Load tasks dataframe.""" + if not TASKS_FILE.exists(): + pytest.skip(f"Test data not found: {TASKS_FILE}") + return pd.read_parquet(TASKS_FILE) + + +@pytest.fixture +def 
fragments_df(): + """Load fragments dataframe.""" + if not FRAGMENTS_FILE.exists(): + pytest.skip(f"Test data not found: {FRAGMENTS_FILE}") + return pd.read_parquet(FRAGMENTS_FILE) + + +@pytest.fixture +def consumption_df(): + """Load consumption dataframe.""" + if not CONSUMPTION_FILE.exists(): + pytest.skip(f"Test data not found: {CONSUMPTION_FILE}") + return pd.read_parquet(CONSUMPTION_FILE) + + +class TestTaskModel: + """Test Task Pydantic model.""" + + def test_parse_first_task(self, tasks_df): + """Test parsing first task row.""" + task_dict = tasks_df.iloc[0].to_dict() + task = Task(**task_dict) + + assert isinstance(task.id, int) + assert isinstance(task.submission_time, datetime) + assert task.duration > 0 + assert task.cpu_count >= 0 + assert task.cpu_capacity >= 0 + assert task.mem_capacity >= 0 + + def test_parse_all_tasks(self, tasks_df): + """Test parsing all tasks.""" + errors = [] + for idx, row in tasks_df.iterrows(): + try: + Task(**row.to_dict()) + except Exception as e: + errors.append(f"Row {idx}: {e}") + + assert len(errors) == 0, f"Failed to parse {len(errors)} tasks:\n" + "\n".join(errors[:5]) + + def test_task_id_parsing(self, tasks_df): + """Test ID parsing from string.""" + task_dict = tasks_df.iloc[0].to_dict() + task_dict['id'] = f"task-{task_dict['id']}" + task = Task(**task_dict) + assert isinstance(task.id, int) + + def test_task_properties(self, tasks_df): + """Test computed properties.""" + task = Task(**tasks_df.iloc[0].to_dict()) + + assert task.duration_seconds == task.duration / 1000.0 + assert task.total_cpu_mhz == task.cpu_count * task.cpu_capacity + assert task.mem_capacity_gb == task.mem_capacity / 1024.0 + + def test_task_with_fragments(self, tasks_df, fragments_df): + """Test task with nested fragments.""" + first_task_id = tasks_df.iloc[0]['id'] + fragments_by_task = fragments_df.groupby('id') + + task_dict = tasks_df.iloc[0].to_dict() + + if first_task_id in fragments_by_task.groups: + task_fragments = 
fragments_by_task.get_group(first_task_id) + task_dict['fragments'] = [Fragment(**row.to_dict()) for _, row in task_fragments.iterrows()] + + task = Task(**task_dict) + assert isinstance(task.fragments, list) + assert task.fragment_count == len(task.fragments) + + +class TestFragmentModel: + """Test Fragment Pydantic model.""" + + def test_parse_first_fragment(self, fragments_df): + """Test parsing first fragment row.""" + fragment_dict = fragments_df.iloc[0].to_dict() + fragment = Fragment(**fragment_dict) + + assert isinstance(fragment.task_id, int) + assert fragment.duration > 0 + assert fragment.cpu_count >= 0 + assert fragment.cpu_usage >= 0 + + def test_parse_all_fragments(self, fragments_df): + """Test parsing all fragments.""" + errors = [] + for idx, row in fragments_df.iterrows(): + try: + Fragment(**row.to_dict()) + except Exception as e: + errors.append(f"Row {idx}: {e}") + + assert len(errors) == 0, f"Failed to parse {len(errors)} fragments:\n" + "\n".join(errors[:5]) + + def test_fragment_id_parsing(self, fragments_df): + """Test ID parsing with alias.""" + fragment_dict = fragments_df.iloc[0].to_dict() + fragment_dict['id'] = f"task-{fragment_dict['id']}" + fragment = Fragment(**fragment_dict) + assert isinstance(fragment.task_id, int) + + def test_fragment_properties(self, fragments_df): + """Test computed properties.""" + fragment = Fragment(**fragments_df.iloc[0].to_dict()) + + assert fragment.duration_seconds == fragment.duration / 1000.0 + assert fragment.total_cpu_usage_mhz == fragment.cpu_count * fragment.cpu_usage + + +class TestConsumptionModel: + """Test Consumption Pydantic model.""" + + def test_parse_first_consumption(self, consumption_df): + """Test parsing first consumption row.""" + cons_dict = consumption_df.iloc[0].to_dict() + consumption = Consumption(**cons_dict) + + assert consumption.power_draw >= 0 + assert consumption.energy_usage >= 0 + assert isinstance(consumption.timestamp, datetime) + + def 
test_parse_all_consumption(self, consumption_df): + """Test parsing all consumption records.""" + errors = [] + for idx, row in consumption_df.iterrows(): + try: + Consumption(**row.to_dict()) + except Exception as e: + errors.append(f"Row {idx}: {e}") + + assert len(errors) == 0, f"Failed to parse {len(errors)} records:\n" + "\n".join(errors[:5]) + + def test_consumption_properties(self, consumption_df): + """Test computed properties.""" + consumption = Consumption(**consumption_df.iloc[0].to_dict()) + + assert consumption.power_draw_kw == consumption.power_draw / 1000.0 + assert consumption.energy_usage_kwh == consumption.energy_usage / 3_600_000.0 + + +class TestAggregation: + """Test task-fragment aggregation logic.""" + + def test_fragments_match_tasks(self, tasks_df, fragments_df): + """Test that fragment IDs match task IDs.""" + task_ids = set(tasks_df['id'].unique()) + fragment_ids = set(fragments_df['id'].unique()) + + assert fragment_ids.issubset(task_ids), "Found fragments with non-existent task IDs" + + def test_full_aggregation(self, tasks_df, fragments_df): + """Test full aggregation process.""" + fragments_by_task = fragments_df.groupby('id') + + tasks = [] + for _, task_row in tasks_df.iterrows(): + task_dict = task_row.to_dict() + task_id = task_dict['id'] + + if task_id in fragments_by_task.groups: + task_fragments = fragments_by_task.get_group(task_id) + task_dict['fragments'] = [Fragment(**row.to_dict()) for _, row in task_fragments.iterrows()] + + tasks.append(Task(**task_dict)) + + assert len(tasks) == len(tasks_df) + total_fragments = sum(t.fragment_count for t in tasks) + assert total_fragments == len(fragments_df) + + +class TestSerializationDeserialization: + """Test JSON serialization/deserialization.""" + + def test_task_json_roundtrip(self, tasks_df): + """Test task JSON serialization.""" + task = Task(**tasks_df.iloc[0].to_dict()) + json_str = task.model_dump_json() + task_dict = task.model_dump() + + assert isinstance(json_str, str) + 
assert isinstance(task_dict, dict) + assert task_dict['id'] == task.id + + def test_fragment_json_roundtrip(self, fragments_df): + """Test fragment JSON serialization.""" + fragment = Fragment(**fragments_df.iloc[0].to_dict()) + json_str = fragment.model_dump_json() + fragment_dict = fragment.model_dump() + + assert isinstance(json_str, str) + assert isinstance(fragment_dict, dict) + + def test_consumption_json_roundtrip(self, consumption_df): + """Test consumption JSON serialization.""" + consumption = Consumption(**consumption_df.iloc[0].to_dict()) + json_str = consumption.model_dump_json() + cons_dict = consumption.model_dump() + + assert isinstance(json_str, str) + assert isinstance(cons_dict, dict) diff --git a/libs/common/tests/test_topology.py b/libs/common/tests/test_topology.py new file mode 100644 index 0000000..8e95fbe --- /dev/null +++ b/libs/common/tests/test_topology.py @@ -0,0 +1,263 @@ +"""Tests for Topology models.""" + +import json +from datetime import datetime +from pathlib import Path + +import pytest + +from opendt_common.models.topology import ( + CPU, + Cluster, + CPUPowerModel, + Host, + Memory, + Topology, + TopologySnapshot, +) + + +@pytest.fixture +def sample_topology_data() -> dict: + """Sample topology data matching SURF workload structure.""" + return { + "clusters": [ + { + "name": "A01", + "hosts": [ + { + "name": "A01", + "count": 277, + "cpu": {"coreCount": 16, "coreSpeed": 2100}, + "memory": {"memorySize": 128000000}, + "cpuPowerModel": { + "modelType": "asymptotic", + "power": 400, + "idlePower": 32, + "maxPower": 180, + "asymUtil": 0.3, + "dvfs": False, + }, + } + ], + } + ] + } + + +def test_cpu_model(): + """Test CPU model validation.""" + cpu = CPU(coreCount=16, coreSpeed=2100.0) + assert cpu.coreCount == 16 + assert cpu.coreSpeed == 2100.0 + + # Test validation + with pytest.raises(Exception): + CPU(coreCount=0, coreSpeed=2100.0) # Invalid: coreCount must be > 0 + + +def test_memory_model(): + """Test Memory model 
validation.""" + memory = Memory(memorySize=128000000) + assert memory.memorySize == 128000000 + + # Test validation + with pytest.raises(Exception): + Memory(memorySize=0) # Invalid: memorySize must be > 0 + + +def test_cpu_power_model(): + """Test CPUPowerModel validation.""" + power_model = CPUPowerModel( + modelType="asymptotic", + power=400.0, + idlePower=32.0, + maxPower=180.0, + asymUtil=0.3, + dvfs=False, + ) + assert power_model.modelType == "asymptotic" + assert power_model.power == 400.0 + assert power_model.idlePower == 32.0 + assert power_model.maxPower == 180.0 + assert power_model.asymUtil == 0.3 + assert power_model.dvfs is False + + +def test_host_model(): + """Test Host model.""" + host = Host( + name="A01", + count=277, + cpu=CPU(coreCount=16, coreSpeed=2100.0), + memory=Memory(memorySize=128000000), + cpuPowerModel=CPUPowerModel( + modelType="asymptotic", + power=400.0, + idlePower=32.0, + maxPower=180.0, + asymUtil=0.3, + dvfs=False, + ), + ) + assert host.name == "A01" + assert host.count == 277 + assert host.cpu.coreCount == 16 + assert host.memory.memorySize == 128000000 + + +def test_cluster_model(): + """Test Cluster model.""" + cluster = Cluster( + name="A01", + hosts=[ + Host( + name="A01", + count=277, + cpu=CPU(coreCount=16, coreSpeed=2100.0), + memory=Memory(memorySize=128000000), + cpuPowerModel=CPUPowerModel( + modelType="asymptotic", + power=400.0, + idlePower=32.0, + maxPower=180.0, + asymUtil=0.3, + dvfs=False, + ), + ) + ], + ) + assert cluster.name == "A01" + assert len(cluster.hosts) == 1 + assert cluster.hosts[0].name == "A01" + + +def test_topology_model(sample_topology_data): + """Test Topology model.""" + topology = Topology(**sample_topology_data) + assert len(topology.clusters) == 1 + assert topology.clusters[0].name == "A01" + + +def test_topology_from_json(sample_topology_data): + """Test creating Topology from JSON data.""" + topology = Topology(**sample_topology_data) + + # Verify structure + assert 
len(topology.clusters) == 1 + cluster = topology.clusters[0] + assert cluster.name == "A01" + assert len(cluster.hosts) == 1 + + host = cluster.hosts[0] + assert host.name == "A01" + assert host.count == 277 + assert host.cpu.coreCount == 16 + assert host.cpu.coreSpeed == 2100 + assert host.memory.memorySize == 128000000 + + +def test_topology_calculations(sample_topology_data): + """Test Topology utility methods.""" + topology = Topology(**sample_topology_data) + + # Test calculations + assert topology.total_host_count() == 277 + assert topology.total_core_count() == 277 * 16 # 4432 + assert topology.total_memory_bytes() == 277 * 128000000 + + +def test_topology_model_dump(sample_topology_data): + """Test Topology serialization.""" + topology = Topology(**sample_topology_data) + + # Serialize back to dict + dumped = topology.model_dump(mode="json") + + # Should be able to reconstruct + topology2 = Topology(**dumped) + assert topology2.total_host_count() == topology.total_host_count() + assert topology2.total_core_count() == topology.total_core_count() + + +def test_topology_from_surf_file(): + """Test loading actual SURF topology file if it exists.""" + surf_topology_path = Path(__file__).parent.parent.parent.parent / "data/SURF/topology.json" + + if not surf_topology_path.exists(): + pytest.skip("SURF topology file not found") + + with open(surf_topology_path) as f: + data = json.load(f) + + topology = Topology(**data) + assert len(topology.clusters) > 0 + assert topology.total_host_count() > 0 + assert topology.total_core_count() > 0 + + +def test_topology_snapshot(sample_topology_data): + """Test TopologySnapshot with timestamp.""" + topology = Topology(**sample_topology_data) + timestamp = datetime(2022, 10, 7, 9, 14, 30) + + snapshot = TopologySnapshot(timestamp=timestamp, topology=topology) + + assert snapshot.timestamp == timestamp + assert snapshot.topology == topology + assert len(snapshot.topology.clusters) == 1 + + +def 
test_topology_snapshot_serialization(sample_topology_data): + """Test TopologySnapshot JSON serialization with proper timestamp format.""" + topology = Topology(**sample_topology_data) + timestamp = datetime(2022, 10, 7, 9, 14, 30) + + snapshot = TopologySnapshot(timestamp=timestamp, topology=topology) + + # Serialize to dict + snapshot_dict = snapshot.model_dump(mode="json") + + assert "timestamp" in snapshot_dict + assert "topology" in snapshot_dict + + # Verify timestamp format (should be ISO 8601) + assert isinstance(snapshot_dict["timestamp"], str) + # Check it matches the expected format + assert snapshot_dict["timestamp"] == "2022-10-07T09:14:30" + + # Verify we can reconstruct from the dict + snapshot2 = TopologySnapshot(**snapshot_dict) + assert snapshot2.topology.total_host_count() == topology.total_host_count() + + +def test_topology_snapshot_with_microseconds(): + """Test TopologySnapshot handles timestamps with microseconds.""" + topology = Topology( + clusters=[ + Cluster( + name="Test", + hosts=[ + Host( + name="H1", + count=1, + cpu=CPU(coreCount=8, coreSpeed=2000), + memory=Memory(memorySize=64000000), + cpuPowerModel=CPUPowerModel( + modelType="linear", + power=200, + idlePower=20, + maxPower=100, + ), + ) + ], + ) + ] + ) + + timestamp_with_micros = datetime(2022, 10, 7, 9, 14, 30, 123456) + snapshot = TopologySnapshot(timestamp=timestamp_with_micros, topology=topology) + + snapshot_dict = snapshot.model_dump(mode="json") + # With microseconds, should use full ISO format + assert "T" in snapshot_dict["timestamp"] diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..4309ac8 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,89 @@ +[project] +name = "opendt" +version = "0.1.0" +description = "OpenDT - Open Digital Twin for Datacenters" +requires-python = ">=3.11" +dependencies = [ + # Core data processing (shared across all services) + "pandas>=2.0.0", + "pyarrow>=13.0.0", + "pyyaml>=6.0.0", + "matplotlib>=3.7.0", + + # Kafka 
(used by most services) + "kafka-python>=2.0.2", + + # Validation + "pydantic>=2.0.0", + "pydantic-settings>=2.0.0", + + # Web framework (for dashboard service) + "fastapi>=0.104.0", + "uvicorn[standard]>=0.24.0", + "jinja2>=3.1.0", + + # Database (for services that need it) + "psycopg2-binary>=2.9.0", + "sqlalchemy>=2.0.0", +] + +[project.optional-dependencies] +dev = [ + "pytest>=7.4.0", + "pytest-cov>=4.1.0", + "black>=23.0.0", + "ruff>=0.1.0", + "mypy>=1.5.0", + "ipython>=8.0.0", +] + +# Service-specific dependencies (optional, if some services have unique needs) +dc-mock = [] # Currently uses only shared dependencies +sim-worker = [] # Currently uses only shared dependencies +api = [ + "httpx>=0.25.0", # For API client usage +] + +[build-system] +requires = ["setuptools>=61.0"] +build-backend = "setuptools.build_meta" + +[tool.setuptools.packages.find] +where = ["."] +include = [] + +[tool.black] +line-length = 100 +target-version = ["py311"] +extend-exclude = "scratch/" + +[tool.ruff] +line-length = 100 +target-version = "py311" +extend-exclude = ["scratch/"] + +[tool.ruff.lint] +select = [ + "E", # pycodestyle errors + "W", # pycodestyle warnings + "F", # pyflakes + "I", # isort + "C", # flake8-comprehensions + "B", # flake8-bugbear + "UP", # pyupgrade +] +ignore = [] + +[tool.pytest.ini_options] +testpaths = ["libs/common/tests"] +python_files = ["test_*.py"] +python_classes = ["Test*"] +python_functions = ["test_*"] +addopts = "-v --tb=short" + +[tool.pyright] +include = ["libs", "services"] +extraPaths = ["libs/common"] +venvPath = "." 
+venv = ".venv" +typeCheckingMode = "basic" diff --git a/pytest.ini b/pytest.ini deleted file mode 100644 index fcccae1..0000000 --- a/pytest.ini +++ /dev/null @@ -1,2 +0,0 @@ -[pytest] -pythonpath = src diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 5034b66..0000000 --- a/requirements.txt +++ /dev/null @@ -1,11 +0,0 @@ -flask==3.0.0 -kafka-python==2.0.2 -pandas==2.1.4 -pyarrow==14.0.1 -langchain-openai==0.1.19 -langchain-core==0.2.38 -pydantic>=2.7,<3 -watchdog -pytest -numpy -python-dotenv==1.0.1 diff --git a/services/dashboard/Dockerfile b/services/dashboard/Dockerfile new file mode 100644 index 0000000..fe809c1 --- /dev/null +++ b/services/dashboard/Dockerfile @@ -0,0 +1,34 @@ +FROM python:3.11-slim + +WORKDIR /app + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + gcc \ + postgresql-client \ + && rm -rf /var/lib/apt/lists/* + +# Copy and install dependencies from root pyproject.toml +COPY pyproject.toml /app/pyproject.toml +RUN pip install --no-cache-dir -e /app + +# Copy and install shared library +COPY libs/common /app/libs/common +RUN pip install --no-cache-dir -e /app/libs/common + +# Copy service code +COPY services/dashboard /app/services/dashboard + +# Create non-root user for security +RUN useradd -m -u 1000 opendt && \ + chown -R opendt:opendt /app + +USER opendt + +# Set working directory to service for proper module resolution +WORKDIR /app/services/dashboard + +# Expose the API port +EXPOSE 8000 + +CMD ["uvicorn", "dashboard.main:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/services/dashboard/README.md b/services/dashboard/README.md new file mode 100644 index 0000000..f72352d --- /dev/null +++ b/services/dashboard/README.md @@ -0,0 +1,296 @@ +# dashboard Service + +The **dashboard** service provides a web-based user interface and REST API for the OpenDT system. It combines real-time visualization with programmatic control through FastAPI endpoints. 
+ + +## Architecture + +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ dashboard β”‚ +β”‚ β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ FastAPI │────────>β”‚ Kafka β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ Producer β”‚ β”‚ +β”‚ β”‚ Routes: β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β”‚ β”‚ - / β”‚ β”‚ +β”‚ β”‚ - /health β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ - /docs │────────>β”‚ PostgreSQL β”‚ β”‚ +β”‚ β”‚ - /api/... β”‚ β”‚ Database β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β”‚ β”‚ +β”‚ Topics Published: β”‚ +β”‚ β€’ sim.topology (Topology updates) β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +## Web Dashboard + +### Accessing the Dashboard + +Once services are running, access the dashboard at: +- **Dashboard**: http://localhost:8000 +- **API Docs**: http://localhost:8000/docs +- **Health Check**: http://localhost:8000/health + +### Dashboard Features + +The web UI provides: +- Real-time metrics display +- System status monitoring +- Interactive controls +- Power consumption charts (via Plotly.js) +- Topology visualization + +**Note**: The dashboard JavaScript polls API endpoints that are not yet fully implemented. Some features may show as unavailable until backend endpoints are added. + +## API Endpoints + +### Root + +**GET /** + +Serves the web dashboard UI. + +**Response**: HTML page with dashboard interface + +--- + +### Health Check + +**GET /health** + +Service health status including Kafka connectivity. 
+ +**Response**: +```json +{ + "status": "healthy", + "kafka": "connected", + "config": "loaded" +} +``` + +--- + +### API Documentation + +**GET /docs** + +Interactive Swagger UI for testing endpoints. + +**GET /redoc** + +Alternative ReDoc documentation interface. + +--- + +### Update Topology + +**PUT /api/topology** + +Updates the simulated datacenter topology for What-If analysis. + +**Request Body** (`application/json`): +```json +{ + "clusters": [ + { + "name": "A01", + "hosts": [ + { + "name": "A01-Host", + "count": 277, + "cpu": { + "coreCount": 16, + "coreSpeed": 2100 + }, + "memory": { + "memorySize": 128000000 + }, + "cpuPowerModel": { + "modelType": "asymptotic", + "power": 400.0, + "idlePower": 32.0, + "maxPower": 180.0, + "asymUtil": 0.3, + "dvfs": false + } + } + ] + } + ] +} +``` + +**Response** (200 OK): +```json +{ + "status": "updated", + "message": "Topology published to sim.topology", + "clusters": 1, + "total_hosts": 277, + "total_cores": 4432, + "topic": "sim.topology" +} +``` + +**Behavior**: +1. Validates topology against `Topology` Pydantic model +2. Publishes to `sim.topology` Kafka topic (compacted) +3. 
`sim-worker` consumes update and: + - Updates simulated topology in memory + - Clears result cache (forces fresh simulations) + - Uses new topology for subsequent windows + +**Example via cURL**: +```bash +curl -X PUT http://localhost:8000/api/topology \ + -H "Content-Type: application/json" \ + -d @data/SURF/topology.json +``` + +## Configuration + +### Environment Variables + +| Variable | Description | Default | +|----------|-------------|---------| +| `CONFIG_FILE` | Path to YAML configuration | `/app/config/simulation.yaml` | +| `DATABASE_URL` | PostgreSQL connection string | `postgresql://opendt:...` | +| `LOG_LEVEL` | Logging level | `INFO` | + +### YAML Configuration + +**File**: `config/default.yaml` + +```yaml +kafka: + bootstrap_servers: "kafka:29092" + topics: + sim_topology: + name: "sim.topology" + config: + cleanup.policy: "compact" + min.compaction.lag.ms: "0" +``` + +## Running + +### Via Docker Compose + +```bash +# Start all services +make up + +# Access dashboard +open http://localhost:8000 + +# View logs +make logs-dashboard +# Or: +docker compose logs -f dashboard +``` + +### Standalone (Development) + +```bash +cd services/dashboard +source ../../.venv/bin/activate + +# Set environment +export CONFIG_FILE=../../config/default.yaml +export DATABASE_URL=postgresql://opendt:opendt_dev_password@localhost:5432/opendt + +# Run with hot reload +uvicorn dashboard.main:app --reload --host 0.0.0.0 --port 8000 +``` + +## Development + +### Project Structure + +``` +dashboard/ +β”œβ”€β”€ __init__.py +β”œβ”€β”€ main.py # FastAPI app + routes +β”œβ”€β”€ static/ # Dashboard assets +β”‚ β”œβ”€β”€ js/ +β”‚ β”‚ β”œβ”€β”€ charts.js +β”‚ β”‚ β”œβ”€β”€ polling.js +β”‚ β”‚ β”œβ”€β”€ ui.js +β”‚ β”‚ └── ... 
+β”‚ └── style.css +└── templates/ + └── index.html # Dashboard HTML +``` + +### Adding API Endpoints + +Define new endpoints in `main.py`: + +```python +@app.get("/api/my-endpoint") +async def my_endpoint(): + """Endpoint description for OpenAPI.""" + return {"result": "data"} +``` + +### Testing + +```bash +# Interactive testing via Swagger UI +open http://localhost:8000/docs + +# Manual testing via cURL +curl http://localhost:8000/health +``` + +## Static Assets + +The dashboard serves static files from `services/dashboard/static/`: +- **JavaScript**: Charts, polling, UI interactions +- **CSS**: Dashboard styling +- **HTML**: Single-page application template + +Files are mounted at `/static` route and referenced in the HTML template. + +## Monitoring + +### Logs + +```bash +# Tail logs +docker compose logs -f dashboard + +# Expected output: +# INFO - Starting OpenDT Dashboard service... +# INFO - Config loaded from /app/config/simulation.yaml +# INFO - Kafka producer initialized +# INFO - Uvicorn running on http://0.0.0.0:8000 +``` + +### Health Endpoint + +```bash +# Check health +curl http://localhost:8000/health + +# Healthy response: +{ + "status": "healthy", + "kafka": "connected", + "config": "loaded" +} +``` + +## Related Documentation + +- [Architecture Overview](../../docs/ARCHITECTURE.md) - System design +- [Data Models](../../docs/DATA_MODELS.md) - Topology schema +- [Simulation Worker](../sim-worker/README.md) - Consumer of topology updates +- [FastAPI Documentation](https://fastapi.tiangolo.com/) - Framework reference + +--- + +For questions or contributions, see the [Contributing Guide](../../CONTRIBUTING.md). 
diff --git a/services/dashboard/dashboard/__init__.py b/services/dashboard/dashboard/__init__.py new file mode 100644 index 0000000..3780e98 --- /dev/null +++ b/services/dashboard/dashboard/__init__.py @@ -0,0 +1,3 @@ +"""OpenDT Dashboard Service.""" + +__version__ = "0.1.0" diff --git a/services/dashboard/dashboard/main.py b/services/dashboard/dashboard/main.py new file mode 100644 index 0000000..0ebe3eb --- /dev/null +++ b/services/dashboard/dashboard/main.py @@ -0,0 +1,227 @@ +"""OpenDT Dashboard - Main FastAPI Application.""" + +import logging +from contextlib import asynccontextmanager +from pathlib import Path +from typing import Annotated + +from fastapi import Body, FastAPI, HTTPException, Request +from fastapi.middleware.cors import CORSMiddleware +from fastapi.responses import HTMLResponse +from fastapi.staticfiles import StaticFiles +from fastapi.templating import Jinja2Templates +from opendt_common import load_config_from_env +from opendt_common.models.topology import CPU, Cluster, CPUPowerModel, Host, Memory, Topology +from opendt_common.utils import get_kafka_producer +from opendt_common.utils.kafka import send_message + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" +) +logger = logging.getLogger(__name__) + +# Setup paths for static files and templates +BASE_DIR = Path(__file__).resolve().parent.parent +STATIC_DIR = BASE_DIR / "static" +TEMPLATES_DIR = BASE_DIR / "templates" + +# Initialize templates +templates = Jinja2Templates(directory=str(TEMPLATES_DIR)) + + +@asynccontextmanager +async def lifespan(app: FastAPI): + """Lifespan context manager for startup and shutdown events.""" + # Startup + logger.info("Starting OpenDT Dashboard service...") + + # Load configuration + try: + app.state.config = load_config_from_env() + logger.info(f"Configuration loaded for workload: {app.state.config.workload}") + except Exception as e: + logger.error(f"Failed to load configuration: {e}") + 
app.state.config = None + + # Initialize Kafka producer (stored in app state for reuse) + try: + app.state.kafka_producer = get_kafka_producer() + logger.info("Kafka producer initialized") + except Exception as e: + logger.error(f"Failed to initialize Kafka producer: {e}") + app.state.kafka_producer = None + + yield + + # Shutdown + logger.info("Shutting down OpenDT Dashboard service...") + if app.state.kafka_producer: + app.state.kafka_producer.close() + logger.info("Kafka producer closed") + + +# Create FastAPI application +app = FastAPI( + title="OpenDT Dashboard", + description="Open Digital Twin - Web Dashboard and API for datacenter simulation", + version="0.1.0", + lifespan=lifespan, + docs_url="/docs", + redoc_url="/redoc", +) + +# Configure CORS +app.add_middleware( + CORSMiddleware, + allow_origins=["http://localhost:8000"], # Dashboard URL + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + +# Mount static files +app.mount("/static", StaticFiles(directory=str(STATIC_DIR)), name="static") + + +# ============================================================================ +# DASHBOARD +# ============================================================================ + + +@app.get("/", response_class=HTMLResponse) +async def dashboard(request: Request): + """Serve the OpenDT dashboard UI.""" + return templates.TemplateResponse("index.html", {"request": request}) + + +# ============================================================================ +# API ENDPOINTS +# ============================================================================ + + +@app.get("/health") +async def health_check(): + """Health check endpoint.""" + kafka_status = "connected" if app.state.kafka_producer else "disconnected" + config_status = "loaded" if app.state.config else "not loaded" + + return { + "status": "healthy", + "kafka": kafka_status, + "config": config_status, + } + + +# ============================================================================ +# 
TOPOLOGY MANAGEMENT +# ============================================================================ + + +# Default topology for Swagger UI (matches SURF data) +DEFAULT_TOPOLOGY = Topology( + clusters=[ + Cluster( + name="A01", + hosts=[ + Host( + name="A01", + count=277, + cpu=CPU(coreCount=16, coreSpeed=2100.0), + memory=Memory(memorySize=128000000), # ~128 MB + cpuPowerModel=CPUPowerModel( + modelType="asymptotic", + power=400.0, + idlePower=32.0, + maxPower=180.0, + asymUtil=0.3, + dvfs=False, + ), + ) + ], + ) + ] +) + +# Example for OpenAPI docs +DEFAULT_TOPOLOGY_EXAMPLE = DEFAULT_TOPOLOGY.model_dump(mode="json") + + +@app.put("/api/topology") +async def update_topology( + topology: Annotated[ + Topology, + Body( + description="Datacenter topology configuration", + openapi_examples={ + "default": { + "summary": "SURF datacenter topology", + "description": "Default SURF topology: 277 hosts, 16 cores each @ 2.1 GHz", + "value": DEFAULT_TOPOLOGY_EXAMPLE, + } + }, + ), + ] = DEFAULT_TOPOLOGY, +): + """Update the simulated datacenter topology. + + This endpoint validates the topology structure and publishes it to Kafka. + The sim-worker will pick it up and use it for future simulations. 
+ + Args: + topology: Datacenter topology configuration with cluster details + + Returns: + Success confirmation with topology details + + Raises: + HTTPException: 500 if Kafka producer is not available + HTTPException: 500 if publishing to Kafka fails + """ + # Check if Kafka producer is available + if not app.state.kafka_producer: + logger.error("Kafka producer not initialized") + raise HTTPException(status_code=500, detail="Kafka producer not available") + + # Check if config is loaded (to get topic name) + if not app.state.config: + logger.error("Configuration not loaded") + raise HTTPException(status_code=500, detail="Configuration not loaded") + + # Topology already validated by Pydantic + logger.info(f"Topology validated: {len(topology.clusters)} cluster(s)") + + # Get sim.topology topic name from config + sim_topology_topic = app.state.config.kafka.topics.get("sim_topology") + if not sim_topology_topic: + logger.error("sim.topology topic not configured") + raise HTTPException(status_code=500, detail="sim.topology topic not configured") + + topic_name = sim_topology_topic.name + + # Publish to sim.topology Kafka topic with compacted key + try: + send_message( + producer=app.state.kafka_producer, + topic=topic_name, + message=topology.model_dump(mode="json"), + key="datacenter", + ) + logger.info(f"Topology published to {topic_name}") + except Exception as e: + logger.error(f"Failed to publish topology to Kafka: {e}") + raise HTTPException(status_code=500, detail=f"Failed to publish topology: {e}") from e + + return { + "status": "updated", + "message": f"Topology published to {topic_name}", + "clusters": len(topology.clusters), + "total_hosts": topology.total_host_count(), + "total_cores": topology.total_core_count(), + "topic": topic_name, + } + + +if __name__ == "__main__": + import uvicorn + + uvicorn.run(app, host="0.0.0.0", port=8000) diff --git a/src/static/js/boot.js b/services/dashboard/static/js/boot.js similarity index 100% rename from 
src/static/js/boot.js rename to services/dashboard/static/js/boot.js diff --git a/src/static/js/charts.js b/services/dashboard/static/js/charts.js similarity index 100% rename from src/static/js/charts.js rename to services/dashboard/static/js/charts.js diff --git a/src/static/js/llm.js b/services/dashboard/static/js/llm.js similarity index 100% rename from src/static/js/llm.js rename to services/dashboard/static/js/llm.js diff --git a/src/static/js/polling.js b/services/dashboard/static/js/polling.js similarity index 100% rename from src/static/js/polling.js rename to services/dashboard/static/js/polling.js diff --git a/src/static/js/recommendations.js b/services/dashboard/static/js/recommendations.js similarity index 100% rename from src/static/js/recommendations.js rename to services/dashboard/static/js/recommendations.js diff --git a/src/static/js/render.js b/services/dashboard/static/js/render.js similarity index 100% rename from src/static/js/render.js rename to services/dashboard/static/js/render.js diff --git a/src/static/js/ui.js b/services/dashboard/static/js/ui.js similarity index 100% rename from src/static/js/ui.js rename to services/dashboard/static/js/ui.js diff --git a/src/static/js/utils.js b/services/dashboard/static/js/utils.js similarity index 100% rename from src/static/js/utils.js rename to services/dashboard/static/js/utils.js diff --git a/src/static/style.css b/services/dashboard/static/style.css similarity index 100% rename from src/static/style.css rename to services/dashboard/static/style.css diff --git a/src/templates/index.html b/services/dashboard/templates/index.html similarity index 99% rename from src/templates/index.html rename to services/dashboard/templates/index.html index 5700bb3..2056579 100644 --- a/src/templates/index.html +++ b/services/dashboard/templates/index.html @@ -6,7 +6,7 @@ - + diff --git a/services/dc-mock/Dockerfile b/services/dc-mock/Dockerfile new file mode 100644 index 0000000..9eb76e1 --- /dev/null +++ 
b/services/dc-mock/Dockerfile @@ -0,0 +1,30 @@ +FROM python:3.11-slim + +WORKDIR /app + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + gcc \ + && rm -rf /var/lib/apt/lists/* + +# Copy and install dependencies from root pyproject.toml +COPY pyproject.toml /app/pyproject.toml +RUN pip install --no-cache-dir -e /app + +# Copy and install shared library +COPY libs/common /app/libs/common +RUN pip install --no-cache-dir -e /app/libs/common + +# Copy service code +COPY services/dc-mock /app/services/dc-mock + +# Create non-root user for security +RUN useradd -m -u 1000 opendt && \ + chown -R opendt:opendt /app + +USER opendt + +# Set working directory to service for proper module resolution +WORKDIR /app/services/dc-mock + +CMD ["python", "-m", "dc_mock.main"] diff --git a/services/dc-mock/README.md b/services/dc-mock/README.md new file mode 100644 index 0000000..acbbaa0 --- /dev/null +++ b/services/dc-mock/README.md @@ -0,0 +1,301 @@ +# dc-mock Service + +The **dc-mock** service simulates a real datacenter by replaying historical workload and power consumption data to Kafka topics. It acts as the data source for the entire OpenDT system. + +## Responsibilities + +1. **Workload Replay**: Stream task submissions from `tasks.parquet` and `fragments.parquet` +2. **Power Telemetry**: Stream power consumption from `consumption.parquet` +3. **Topology Broadcasting**: Periodically publish datacenter topology from `topology.json` +4. **Heartbeat Generation**: Send periodic heartbeat messages for window synchronization + +## Architecture + +``` +data/SURF/ +β”œβ”€β”€ tasks.parquet ─┐ +β”œβ”€β”€ fragments.parquet ───> WorkloadProducer ─> dc.workload +β”œβ”€β”€ consumption.parquet ───> PowerProducer ───> dc.power +└── topology.json ─┘─> TopologyProducer ─> dc.topology +``` + +### Producers + +#### 1.
WorkloadProducer + +**File**: [`dc_mock/producers/workload_producer.py`](./dc_mock/producers/workload_producer.py) + +- Reads `tasks.parquet` and `fragments.parquet` +- Joins tasks with their fragments +- Publishes `WorkloadMessage` objects to `dc.workload` +- Emits **heartbeat messages** every `heartbeat_cadence_minutes` (simulation time) +- Respects `speed_factor` for time progression + +**Message Types**: +```python +# Task message +{ + "message_type": "task", + "timestamp": "2022-10-07T00:39:21", + "task": { + "id": 2132895, + "submission_time": "2022-10-07T00:39:21", + "duration": 12000, + "cpu_count": 16, + "cpu_capacity": 33600.0, + "mem_capacity": 100000, + "fragments": [...] + } +} + +# Heartbeat message +{ + "message_type": "heartbeat", + "timestamp": "2022-10-07T00:45:00", + "task": null +} +``` + +#### 2. PowerProducer + +**File**: [`dc_mock/producers/power_producer.py`](./dc_mock/producers/power_producer.py) + +- Reads `consumption.parquet` +- Publishes `Consumption` objects to `dc.power` +- Provides ground truth for comparing simulation predictions + +**Message Format**: +```python +{ + "power_draw": 19180.0, # Watts + "energy_usage": 575400.0, # Joules + "timestamp": "2022-10-08T06:35:30" +} +``` + +#### 3. TopologyProducer + +**File**: [`dc_mock/producers/topology_producer.py`](./dc_mock/producers/topology_producer.py) + +- Reads `topology.json` +- Publishes `TopologySnapshot` to `dc.topology` every 30 seconds (real-time) +- Uses compacted topic with key `"datacenter"` to keep latest only + +**Message Format**: +```python +{ + "timestamp": "2022-10-07T09:14:30", + "topology": { + "clusters": [...] 
+ } +} +``` + +## Configuration + +### Environment Variables + +| Variable | Description | Default | +|----------|-------------|---------| +| `CONFIG_FILE` | Path to YAML configuration | `/app/config/simulation.yaml` | +| `WORKER_ID` | Unique producer identifier | `dc-mock-1` | + +### YAML Configuration + +**File**: `config/default.yaml` + +```yaml +workload: "SURF" # Maps to data/SURF/ + +simulation: + speed_factor: 300 # 300x real-time + heartbeat_cadence_minutes: 1 # Heartbeat every 1 minute (sim time) + +kafka: + bootstrap_servers: "kafka:29092" + topics: + workload: + name: "dc.workload" + power: + name: "dc.power" + topology: + name: "dc.topology" +``` + +### Speed Factor Behavior + +- `speed_factor: 1.0` - Real-time replay (1 second sim = 1 second real) +- `speed_factor: 300.0` - 300x faster (1 hour sim = 12 seconds real) +- `speed_factor: -1` - Maximum speed (no sleep between messages) + +**Formula**: +```python +sleep_time = (next_timestamp - current_timestamp) / speed_factor +``` + +## Data Format + +### Input Files + +Located in `data/<WORKLOAD>/` (e.g. `data/SURF/`): + +#### `tasks.parquet` + +Required columns: +- `id` (int): Task identifier +- `submission_time` (datetime): When task was submitted +- `duration` (int): Task duration in milliseconds +- `cpu_count` (int): Number of CPUs requested +- `cpu_capacity` (float): CPU speed in MHz +- `mem_capacity` (int): Memory in MB + +#### `fragments.parquet` + +Required columns: +- `id` (int): Fragment identifier +- `task_id` (int): Parent task ID +- `duration` (int): Fragment duration in milliseconds +- `cpu_count` (int): CPUs used +- `cpu_usage` (float): CPU utilization value + +#### `consumption.parquet` + +Required columns: +- `timestamp` (datetime): Measurement time +- `power_draw` (float): Instantaneous power in Watts +- `energy_usage` (float): Accumulated energy in Joules + +#### `topology.json` + +See [Data Models documentation](../../docs/DATA_MODELS.md#topology-models) for schema. 
+ +## Running + +### Via Docker Compose + +```bash +# Start all services +make up + +# View dc-mock logs +make logs-dc-mock + +# Or directly: +docker compose logs -f dc-mock +``` + +### Standalone (Development) + +```bash +cd services/dc-mock +source ../../.venv/bin/activate + +CONFIG_FILE=../../config/default.yaml \ +python -m dc_mock.main +``` + +## Heartbeat Mechanism + +### Purpose + +Heartbeats solve the problem: How does the consumer know when to close a window? + +Without heartbeats: +- Consumer can't distinguish between "no new tasks" and "Kafka delay" +- Windows might close prematurely or stay open indefinitely + +With heartbeats: +- Producer sends heartbeat every N minutes (simulation time) +- Consumer knows time has progressed even without task arrivals +- Windows can be closed deterministically based on heartbeat timestamps + +### Implementation + +```python +next_heartbeat_time = first_task_time.floor('1min') + +for task in sorted_tasks: + # Emit heartbeats for all minutes before this task + while next_heartbeat_time < task.submission_time: + heartbeat = WorkloadMessage( + message_type="heartbeat", + timestamp=next_heartbeat_time, + task=None + ) + publish(heartbeat) + next_heartbeat_time += timedelta(minutes=heartbeat_cadence) + + # Emit the task + task_message = WorkloadMessage( + message_type="task", + timestamp=task.submission_time, + task=task + ) + publish(task_message) +``` + +### Configuration + +```yaml +simulation: + heartbeat_cadence_minutes: 1 # Send heartbeat every 1 minute (sim time) +``` + +**Trade-offs**: +- **Shorter cadence** (e.g., 1 minute): More accurate window closing, more Kafka messages +- **Longer cadence** (e.g., 5 minutes): Fewer messages, less precise window boundaries + +## Monitoring + +### Logs + +```bash +# View producer activity +docker compose logs -f dc-mock + +# Expected output: +# INFO - Starting WorkloadProducer... +# INFO - Loaded 12,345 tasks from tasks.parquet +# INFO - Starting PowerProducer... 
+# INFO - Starting TopologyProducer... +# INFO - Published heartbeat: 2022-10-07T00:01:00 +# INFO - Published task 2132895 to dc.workload +# INFO - Published power telemetry to dc.power: 19180.0 W +``` + +### Kafka Topic Inspection + +```bash +# List topics +make kafka-topics + +# Consume from workload topic +docker exec -it opendt-kafka kafka-console-consumer \ + --bootstrap-server localhost:9092 \ + --topic dc.workload \ + --from-beginning \ + --max-messages 10 + +# Consume from topology topic (compacted) +docker exec -it opendt-kafka kafka-console-consumer \ + --bootstrap-server localhost:9092 \ + --topic dc.topology \ + --from-beginning +``` + +## Testing + +```bash +# Run tests +cd services/dc-mock +pytest + +# Run specific test +pytest tests/test_producers.py::test_workload_producer +``` + +## Related Documentation + +- [Architecture Overview](../../docs/ARCHITECTURE.md) - System design +- [Data Models](../../docs/DATA_MODELS.md) - Message formats +- [Simulation Worker](../sim-worker/README.md) - Consumer of these messages diff --git a/services/dc-mock/dc_mock/__init__.py b/services/dc-mock/dc_mock/__init__.py new file mode 100644 index 0000000..99b577d --- /dev/null +++ b/services/dc-mock/dc_mock/__init__.py @@ -0,0 +1,12 @@ +"""DC-Mock Service - Datacenter Mock Producers.""" + +__version__ = "0.1.0" + +from dc_mock.producers import BaseProducer, PowerProducer, TopologyProducer, WorkloadProducer + +__all__ = [ + "BaseProducer", + "TopologyProducer", + "WorkloadProducer", + "PowerProducer", +] diff --git a/services/dc-mock/dc_mock/main.py b/services/dc-mock/dc_mock/main.py new file mode 100644 index 0000000..1f4e618 --- /dev/null +++ b/services/dc-mock/dc_mock/main.py @@ -0,0 +1,268 @@ +"""DC-Mock Service - Main Entry Point. + +This service: +1. Loads configuration from environment +2. 
Starts three independent threaded producers: + - TopologyProducer: Periodically publishes datacenter topology + - WorkloadProducer: Streams task/workload events in time order + - PowerProducer: Streams power consumption telemetry in time order +3. Respects simulation speed_factor for timing +4. Gracefully handles shutdown +""" + +import logging +import os +import signal +import sys +from pathlib import Path + +from opendt_common import load_config_from_env + +from dc_mock.producers import PowerProducer, TopologyProducer, WorkloadProducer + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" +) +logger = logging.getLogger(__name__) + + +class DCMockOrchestrator: + """Orchestrates multiple threaded producers for DC-Mock service.""" + + def __init__(self): + """Initialize the orchestrator.""" + self.topology_producer: TopologyProducer | None = None + self.workload_producer: WorkloadProducer | None = None + self.power_producer: PowerProducer | None = None + self.shutdown_requested = False + + def setup_signal_handlers(self) -> None: + """Setup signal handlers for graceful shutdown.""" + + def signal_handler(signum, frame): + logger.info(f"Received signal {signum}, initiating graceful shutdown...") + self.shutdown_requested = True + self.stop_all() + + signal.signal(signal.SIGINT, signal_handler) + signal.signal(signal.SIGTERM, signal_handler) + + def start_all( + self, + workload_context, + kafka_bootstrap_servers: str, + speed_factor: float, + topology_topic: str, + workload_topic: str, + power_topic: str, + heartbeat_cadence_minutes: int = 1, + ) -> None: + """Start all producers. 
+ + Args: + workload_context: Workload context with resolved file paths + kafka_bootstrap_servers: Kafka broker addresses + speed_factor: Simulation speed multiplier + topology_topic: Kafka topic for topology events + workload_topic: Kafka topic for workload events + power_topic: Kafka topic for power consumption events + heartbeat_cadence_minutes: Cadence in simulation minutes for heartbeat messages + """ + logger.info("=" * 70) + logger.info("Starting DC-Mock Producers") + logger.info("=" * 70) + + # 1. Start WorkloadProducer first (we need earliest_task_time for PowerProducer) + logger.info("\n[1/3] Initializing WorkloadProducer...") + self.workload_producer = WorkloadProducer( + workload_context=workload_context, + kafka_bootstrap_servers=kafka_bootstrap_servers, + speed_factor=speed_factor, + topic=workload_topic, + heartbeat_cadence_minutes=heartbeat_cadence_minutes, + ) + + # Pre-load tasks to get earliest time (needed for power producer) + _, earliest_task_time = self.workload_producer.load_and_aggregate_tasks() + earliest_task_time_ms = int(earliest_task_time.timestamp() * 1000) + logger.info(f"Earliest task time: {earliest_task_time} ({earliest_task_time_ms}ms)") + + # Start workload producer thread + self.workload_producer.start() + + # 2. Start PowerProducer + logger.info("\n[2/3] Initializing PowerProducer...") + if workload_context.consumption_file.exists(): + self.power_producer = PowerProducer( + workload_context=workload_context, + kafka_bootstrap_servers=kafka_bootstrap_servers, + speed_factor=speed_factor, + topic=power_topic, + earliest_task_time_ms=earliest_task_time_ms, + ) + self.power_producer.start() + else: + logger.warning( + f"Consumption file not found: {workload_context.consumption_file}, " + "skipping power consumption streaming" + ) + + # 3. 
Start TopologyProducer + logger.info("\n[3/3] Initializing TopologyProducer...") + if workload_context.topology_file.exists(): + self.topology_producer = TopologyProducer( + topology_file=workload_context.topology_file, + kafka_bootstrap_servers=kafka_bootstrap_servers, + speed_factor=speed_factor, + topic=topology_topic, + publish_interval_seconds=30.0, + ) + self.topology_producer.start() + else: + logger.warning( + f"Topology file not found: {workload_context.topology_file}, " + "skipping topology publishing" + ) + + logger.info("\n" + "=" * 70) + logger.info("βœ… All producers started") + logger.info("=" * 70) + + def wait_for_completion(self) -> None: + """Wait for all producers to complete or be interrupted.""" + logger.info("\nWaiting for producers to complete...") + + try: + # Wait for workload producer (main event stream) + if self.workload_producer and self.workload_producer._thread: + if self.workload_producer.is_running(): + logger.info("Waiting for WorkloadProducer to finish...") + self.workload_producer._thread.join() + + # Wait for power producer + if self.power_producer and self.power_producer._thread: + if self.power_producer.is_running(): + logger.info("Waiting for PowerProducer to finish...") + self.power_producer._thread.join() + + # Topology producer runs indefinitely, so we stop it explicitly + if self.topology_producer and self.topology_producer.is_running(): + logger.info("Stopping TopologyProducer...") + self.topology_producer.stop() + + logger.info("All producers completed") + + except KeyboardInterrupt: + logger.info("Received interrupt during wait") + self.stop_all() + + def stop_all(self) -> None: + """Stop all running producers.""" + logger.info("\n" + "=" * 70) + logger.info("Stopping all producers...") + logger.info("=" * 70) + + producers = [ + ("TopologyProducer", self.topology_producer), + ("WorkloadProducer", self.workload_producer), + ("PowerProducer", self.power_producer), + ] + + for name, producer in producers: + if producer 
and producer.is_running(): + logger.info(f"Stopping {name}...") + producer.stop(timeout=5.0) + elif producer: + logger.info(f"{name} already stopped") + + logger.info("=" * 70) + logger.info("βœ… All producers stopped") + logger.info("=" * 70) + + def run(self) -> int: + """Run the orchestrator. + + Returns: + Exit code: 0 for success, 1 for error + """ + try: + # Load configuration + logger.info("Loading configuration...") + config = load_config_from_env() + logger.info(f"Loaded configuration for workload: {config.workload}") + logger.info(f"Simulation speed: {config.simulation.speed_factor}x") + + # Get workload context + data_path = Path(os.getenv("DATA_PATH", "/app/data")) + workload_context = config.get_workload_context(base_path=data_path) + + # Verify workload directory exists + if not workload_context.exists(): + logger.error(f"Workload directory not found: {workload_context.workload_dir}") + logger.info("Available workloads:") + for item in data_path.iterdir(): + if item.is_dir(): + logger.info(f" - {item.name}") + return 1 + + # Log file status + file_status = workload_context.get_file_status() + logger.info("Workload files:") + for file_type, exists in file_status.items(): + status = "βœ“" if exists else "βœ—" + logger.info(f" {status} {file_type}") + + # Get Kafka configuration + kafka_bootstrap_servers = config.kafka.bootstrap_servers + logger.info(f"Kafka bootstrap servers: {kafka_bootstrap_servers}") + + # Get topic names + topology_topic = config.kafka.topics["topology"].name + workload_topic = config.kafka.topics["workload"].name + power_topic = config.kafka.topics["power"].name + logger.info( + f"Topics: topology={topology_topic}, workload={workload_topic}, power={power_topic}" + ) + + # Setup signal handlers + self.setup_signal_handlers() + + # Start all producers + self.start_all( + workload_context=workload_context, + kafka_bootstrap_servers=kafka_bootstrap_servers, + speed_factor=config.simulation.speed_factor, + topology_topic=topology_topic, 
+ workload_topic=workload_topic, + power_topic=power_topic, + heartbeat_cadence_minutes=config.simulation.heartbeat_cadence_minutes, + ) + + # Wait for completion + self.wait_for_completion() + + logger.info("βœ… DC-Mock service completed successfully") + return 0 + + except KeyboardInterrupt: + logger.info("Received interrupt signal") + self.stop_all() + return 0 + except Exception as e: + logger.error(f"❌ Error in DC-Mock service: {e}", exc_info=True) + self.stop_all() + return 1 + + +def main() -> int: + """Main entry point. + + Returns: + Exit code: 0 for success, 1 for error + """ + orchestrator = DCMockOrchestrator() + return orchestrator.run() + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/services/dc-mock/dc_mock/producers/__init__.py b/services/dc-mock/dc_mock/producers/__init__.py new file mode 100644 index 0000000..41018a3 --- /dev/null +++ b/services/dc-mock/dc_mock/producers/__init__.py @@ -0,0 +1,13 @@ +"""DC-Mock Producers - Threaded Kafka producers.""" + +from dc_mock.producers.base import BaseProducer +from dc_mock.producers.power_producer import PowerProducer +from dc_mock.producers.topology_producer import TopologyProducer +from dc_mock.producers.workload_producer import WorkloadProducer + +__all__ = [ + "BaseProducer", + "TopologyProducer", + "WorkloadProducer", + "PowerProducer", +] diff --git a/services/dc-mock/dc_mock/producers/base.py b/services/dc-mock/dc_mock/producers/base.py new file mode 100644 index 0000000..2d38e29 --- /dev/null +++ b/services/dc-mock/dc_mock/producers/base.py @@ -0,0 +1,186 @@ +"""Base producer class for DC-Mock threaded producers.""" + +import logging +import threading +from abc import ABC, abstractmethod +from typing import Any + +from kafka import KafkaProducer +from opendt_common.utils import get_kafka_producer +from opendt_common.utils.kafka import send_message + +logger = logging.getLogger(__name__) + + +class BaseProducer(ABC): + """Base class for threaded Kafka producers. 
+ + Provides common functionality for all producers: + - Kafka producer management + - Thread lifecycle management + - Message emission utilities + - Speed factor handling + """ + + def __init__( + self, + kafka_bootstrap_servers: str, + speed_factor: float, + topic: str, + name: str | None = None, + ): + """Initialize the base producer. + + Args: + kafka_bootstrap_servers: Kafka broker addresses + speed_factor: Simulation speed multiplier (1.0 = realtime, -1 = max speed) + topic: Kafka topic name for this producer + name: Optional producer name for logging + """ + self.kafka_bootstrap_servers = kafka_bootstrap_servers + self.speed_factor = speed_factor + self.topic = topic + self.name = name or self.__class__.__name__ + self._stop_event = threading.Event() + self._thread: threading.Thread | None = None + self._producer: KafkaProducer | None = None + + logger.info(f"Initialized {self.name}") + logger.info(f" Topic: {self.topic}") + logger.info(f" Speed factor: {self.speed_factor}x") + + def _get_producer(self) -> KafkaProducer: + """Get or create the Kafka producer. + + Returns: + Kafka producer instance + """ + if self._producer is None: + self._producer = get_kafka_producer(self.kafka_bootstrap_servers) + return self._producer + + def emit_message(self, message: dict[str, Any], key: str | None = None) -> None: + """Emit a message to Kafka. + + Args: + message: Message payload (will be JSON serialized) + key: Optional message key + """ + try: + send_message( + self._get_producer(), + topic=self.topic, + message=message, + key=key, + ) + except Exception as e: + logger.error(f"Failed to emit message to {self.topic}: {e}", exc_info=True) + raise + + def flush(self) -> None: + """Flush buffered messages to Kafka.""" + if self._producer: + self._producer.flush() + + def calculate_sleep_time(self, realtime_seconds: float) -> float: + """Calculate actual sleep time based on speed factor. 
+ + Args: + realtime_seconds: Desired sleep time in realtime seconds + + Returns: + Actual sleep time adjusted for speed factor + """ + if self.speed_factor <= 0: + # Max speed (-1): minimal sleep + return 0.001 + return realtime_seconds / self.speed_factor + + @abstractmethod + def run(self) -> None: + """Run the producer (main logic). + + This method should be implemented by subclasses and will be + executed in a separate thread. + """ + pass + + def start(self) -> None: + """Start the producer in a background thread.""" + if self._thread is not None and self._thread.is_alive(): + logger.warning(f"{self.name} is already running") + return + + logger.info(f"Starting {self.name} in background thread...") + self._thread = threading.Thread(target=self._run_wrapper, daemon=True, name=self.name) + self._thread.start() + logger.info(f"{self.name} started") + + def _run_wrapper(self) -> None: + """Wrapper around run() for exception handling and cleanup.""" + try: + self.run() + except Exception as e: + if not self._stop_event.is_set(): + logger.error(f"Error in {self.name}: {e}", exc_info=True) + finally: + self._cleanup() + + def stop(self, timeout: float = 5.0) -> None: + """Stop the producer and wait for thread to finish. 
+ + Args: + timeout: Maximum time to wait for thread termination in seconds + """ + if self._thread is None or not self._thread.is_alive(): + logger.debug(f"{self.name} is not running") + return + + logger.info(f"Stopping {self.name}...") + self._stop_event.set() + + # Wait for thread to finish + self._thread.join(timeout=timeout) + if self._thread.is_alive(): + logger.warning(f"{self.name} did not terminate within {timeout}s") + else: + logger.info(f"{self.name} stopped") + + def _cleanup(self) -> None: + """Clean up resources (called automatically on exit).""" + if self._producer: + try: + self._producer.flush() + self._producer.close() + logger.info(f"{self.name} Kafka producer closed") + except Exception as e: + logger.error(f"Error closing {self.name} producer: {e}") + finally: + self._producer = None + + def is_running(self) -> bool: + """Check if the producer thread is running. + + Returns: + True if thread is alive, False otherwise + """ + return self._thread is not None and self._thread.is_alive() + + def should_stop(self) -> bool: + """Check if stop has been requested. + + Returns: + True if stop event is set, False otherwise + """ + return self._stop_event.is_set() + + def wait_interruptible(self, seconds: float) -> bool: + """Wait for specified seconds, but can be interrupted by stop event. + + Args: + seconds: Time to wait in seconds + + Returns: + True if interrupted (should stop), False if timed out normally + """ + return self._stop_event.wait(timeout=seconds) diff --git a/services/dc-mock/dc_mock/producers/power_producer.py b/services/dc-mock/dc_mock/producers/power_producer.py new file mode 100644 index 0000000..81c4721 --- /dev/null +++ b/services/dc-mock/dc_mock/producers/power_producer.py @@ -0,0 +1,156 @@ +"""Power consumption producer for DC-Mock service. + +Streams historical power consumption data to Kafka with proper timing. 
+""" + +import logging +import time + +import pandas as pd +from opendt_common import Consumption, WorkloadContext + +from dc_mock.producers.base import BaseProducer + +logger = logging.getLogger(__name__) + + +class PowerProducer(BaseProducer): + """Streams power consumption events to Kafka in time order. + + Handles timestamp conversion from relative to absolute timestamps + based on the earliest task submission time and configured offset. + """ + + def __init__( + self, + workload_context: WorkloadContext, + kafka_bootstrap_servers: str, + speed_factor: float, + topic: str, + earliest_task_time_ms: int, + ): + """Initialize the power producer. + + Args: + workload_context: Workload context with resolved file paths + kafka_bootstrap_servers: Kafka broker addresses + speed_factor: Simulation speed multiplier + topic: Kafka topic name for power consumption events + earliest_task_time_ms: Earliest task submission time in epoch ms + """ + super().__init__( + kafka_bootstrap_servers=kafka_bootstrap_servers, + speed_factor=speed_factor, + topic=topic, + name="PowerProducer", + ) + self.workload_context = workload_context + self.earliest_task_time_ms = earliest_task_time_ms + self.consumption_records: list[Consumption] = [] + + logger.info(f" Consumption file: {workload_context.consumption_file}") + logger.info(f" Earliest task time: {earliest_task_time_ms}ms") + + def load_consumption_data(self) -> list[Consumption]: + """Load consumption data from Parquet file. 
+ + Returns: + List of Consumption records with absolute timestamps + + Raises: + FileNotFoundError: If consumption file doesn't exist + """ + if not self.workload_context.consumption_file.exists(): + logger.warning(f"Consumption file not found: {self.workload_context.consumption_file}") + return [] + + consumption_df = pd.read_parquet(self.workload_context.consumption_file) + logger.info(f"Loaded {len(consumption_df)} consumption records") + + # Convert relative timestamps to absolute timestamps + offset_ms = self.workload_context.consumption_offset_ms + logger.info(f"Converting consumption timestamps with offset: {offset_ms}ms") + + # Formula: absolute_time_ms = earliest_task_time_ms + relative_ms + offset_ms + consumption_df["timestamp"] = ( + self.earliest_task_time_ms + consumption_df["timestamp"] + offset_ms + ) + + # Convert to Pydantic models + consumption_records = [] + for _, row in consumption_df.iterrows(): + try: + consumption_records.append(Consumption(**row.to_dict())) + except Exception as e: + logger.warning(f"Failed to parse consumption record: {e}") + + logger.info(f"Parsed {len(consumption_records)} consumption records") + if consumption_records: + first_time = consumption_records[0].timestamp + last_time = consumption_records[-1].timestamp + logger.info(f"Time range: {first_time} to {last_time}") + + return consumption_records + + def run(self) -> None: + """Run the power producer (streams consumption events).""" + logger.info("PowerProducer running") + + try: + # Load consumption data + self.consumption_records = self.load_consumption_data() + + if not self.consumption_records: + logger.warning("No consumption data to stream") + return + + logger.info(f"Streaming {len(self.consumption_records)} consumption events...") + + # Track simulation time + sim_start_time = self.consumption_records[0].timestamp + real_start_time = time.time() + + for i, consumption in enumerate(self.consumption_records): + if self.should_stop(): + 
logger.info("PowerProducer interrupted") + break + + # Calculate elapsed simulation time + sim_elapsed = (consumption.timestamp - sim_start_time).total_seconds() + + # Calculate required sleep based on speed_factor + if self.speed_factor > 0: + required_real_elapsed = sim_elapsed / self.speed_factor + real_elapsed = time.time() - real_start_time + sleep_time = required_real_elapsed - real_elapsed + + if sleep_time > 0: + # Use interruptible wait + if self.wait_interruptible(sleep_time): + logger.info("PowerProducer interrupted during sleep") + break + # If speed_factor == -1, don't sleep (max speed) + + # Emit consumption event + self.emit_message( + message=consumption.model_dump(mode="json"), + key=None, # No key for consumption + ) + + # Periodic flush and logging + if (i + 1) % 100 == 0: + progress = (i + 1) / len(self.consumption_records) * 100 + logger.info( + f"PowerProducer progress: {i + 1}/" + f"{len(self.consumption_records)} ({progress:.1f}%)" + ) + self.flush() + + # Final flush + logger.info("PowerProducer flushing remaining messages...") + self.flush() + logger.info("PowerProducer finished streaming") + + except Exception as e: + logger.error(f"Fatal error in PowerProducer: {e}", exc_info=True) + raise diff --git a/services/dc-mock/dc_mock/producers/topology_producer.py b/services/dc-mock/dc_mock/producers/topology_producer.py new file mode 100644 index 0000000..99a0932 --- /dev/null +++ b/services/dc-mock/dc_mock/producers/topology_producer.py @@ -0,0 +1,129 @@ +"""Topology producer for DC-Mock service. + +Periodically publishes datacenter topology to Kafka. +""" + +import json +import logging +from datetime import datetime +from pathlib import Path + +from opendt_common import Topology, TopologySnapshot + +from dc_mock.producers.base import BaseProducer + +logger = logging.getLogger(__name__) + + +class TopologyProducer(BaseProducer): + """Periodically publishes datacenter topology to Kafka. 
+ + Re-reads the topology file and publishes it at a fixed interval, + allowing for dynamic topology updates during simulation. + """ + + def __init__( + self, + topology_file: Path, + kafka_bootstrap_servers: str, + speed_factor: float, + topic: str, + publish_interval_seconds: float = 30.0, + ): + """Initialize the topology producer. + + Args: + topology_file: Path to topology.json file + kafka_bootstrap_servers: Kafka broker addresses + speed_factor: Simulation speed multiplier + topic: Kafka topic name for topology events + publish_interval_seconds: Publish interval in realtime seconds (default: 30s) + """ + super().__init__( + kafka_bootstrap_servers=kafka_bootstrap_servers, + speed_factor=speed_factor, + topic=topic, + name="TopologyProducer", + ) + self.topology_file = topology_file + self.publish_interval_seconds = publish_interval_seconds + + logger.info(f" Topology file: {topology_file}") + logger.info(f" Publish interval (realtime): {publish_interval_seconds}s") + + def load_topology(self) -> Topology: + """Load and validate topology from JSON file. 
+ + Returns: + Validated Topology object + + Raises: + FileNotFoundError: If topology file doesn't exist + ValueError: If topology JSON is invalid + """ + if not self.topology_file.exists(): + raise FileNotFoundError(f"Topology file not found: {self.topology_file}") + + with open(self.topology_file) as f: + topology_data = json.load(f) + + try: + topology = Topology(**topology_data) + logger.debug(f"Loaded topology with {len(topology.clusters)} cluster(s)") + logger.debug(f" Total hosts: {topology.total_host_count()}") + logger.debug(f" Total cores: {topology.total_core_count()}") + return topology + except Exception as e: + logger.error(f"Failed to parse topology: {e}") + raise ValueError(f"Invalid topology format: {e}") from e + + def run(self) -> None: + """Run the topology producer (publishes periodically in a loop).""" + logger.info("TopologyProducer running") + + try: + # Load topology once at startup + topology = self.load_topology() + logger.info( + f"Loaded topology: {topology.total_host_count()} hosts, " + f"{topology.total_core_count()} cores" + ) + + # Calculate effective interval based on speed factor + effective_interval = self.calculate_sleep_time(self.publish_interval_seconds) + logger.info(f"Effective publish interval: {effective_interval:.2f}s") + + # Publish immediately on startup + snapshot = TopologySnapshot(timestamp=datetime.now(), topology=topology) + self.emit_message( + message=snapshot.model_dump(mode="json"), + key="datacenter", # Single key for compaction + ) + self.flush() + logger.info("Published initial topology") + + # Then publish periodically + while not self.should_stop(): + # Wait for the interval (with ability to interrupt) + if self.wait_interruptible(effective_interval): + break + + # Re-read topology file (allows for dynamic updates) + try: + topology = self.load_topology() + snapshot = TopologySnapshot(timestamp=datetime.now(), topology=topology) + self.emit_message( + message=snapshot.model_dump(mode="json"), + 
key="datacenter", + ) + self.flush() + logger.debug("Published topology update") + except Exception as e: + logger.error(f"Error publishing topology: {e}", exc_info=True) + # Continue even if one iteration fails + + logger.info("TopologyProducer finished") + + except Exception as e: + logger.error(f"Fatal error in TopologyProducer: {e}", exc_info=True) + raise diff --git a/services/dc-mock/dc_mock/producers/workload_producer.py b/services/dc-mock/dc_mock/producers/workload_producer.py new file mode 100644 index 0000000..c2c3ef0 --- /dev/null +++ b/services/dc-mock/dc_mock/producers/workload_producer.py @@ -0,0 +1,224 @@ +"""Workload producer for DC-Mock service. + +Streams task/workload events to Kafka with proper timing. +""" + +import logging +import time +from datetime import datetime, timedelta + +import pandas as pd +from opendt_common import Fragment, Task, WorkloadContext + +from dc_mock.producers.base import BaseProducer + +logger = logging.getLogger(__name__) + + +class WorkloadProducer(BaseProducer): + """Streams workload (task) events to Kafka in time order. + + Pre-aggregates fragments into their parent tasks before streaming. + """ + + def __init__( + self, + workload_context: WorkloadContext, + kafka_bootstrap_servers: str, + speed_factor: float, + topic: str, + heartbeat_cadence_minutes: int = 1, + ): + """Initialize the workload producer. 
+ + Args: + workload_context: Workload context with resolved file paths + kafka_bootstrap_servers: Kafka broker addresses + speed_factor: Simulation speed multiplier + topic: Kafka topic name for workload events + heartbeat_cadence_minutes: Cadence in simulation minutes for heartbeat messages + """ + super().__init__( + kafka_bootstrap_servers=kafka_bootstrap_servers, + speed_factor=speed_factor, + topic=topic, + name="WorkloadProducer", + ) + self.workload_context = workload_context + self.tasks: list[Task] = [] + self.heartbeat_cadence_minutes = heartbeat_cadence_minutes + + logger.info(f" Tasks file: {workload_context.tasks_file}") + logger.info(f" Fragments file: {workload_context.fragments_file}") + logger.info(f" Heartbeat cadence: {heartbeat_cadence_minutes} simulated minutes") + + def load_and_aggregate_tasks(self) -> tuple[list[Task], datetime]: + """Load tasks and fragments, aggregating fragments into tasks. + + Returns: + Tuple of (tasks, earliest_submission_time) + + Raises: + FileNotFoundError: If required files don't exist + """ + logger.info("Loading task and fragment data...") + + # Load tasks + tasks_df = pd.read_parquet(self.workload_context.tasks_file) + logger.info(f"Loaded {len(tasks_df)} tasks") + + # Get earliest task submission time + earliest_task_time = tasks_df["submission_time"].min().to_pydatetime() + logger.info(f"Earliest task submission time: {earliest_task_time}") + + # Load fragments + fragments_df = pd.read_parquet(self.workload_context.fragments_file) + logger.info(f"Loaded {len(fragments_df)} fragments") + + # Aggregate fragments by task_id + logger.info("Aggregating fragments into tasks...") + fragments_by_task = fragments_df.groupby("id") + + # Create Task objects with nested fragments + tasks = [] + for _, task_row in tasks_df.iterrows(): + task_dict = task_row.to_dict() + task_id = task_dict["id"] + + # Get fragments for this task + if task_id in fragments_by_task.groups: + task_fragments_df = 
fragments_by_task.get_group(task_id) + fragments = [ + Fragment(**frag_row.to_dict()) for _, frag_row in task_fragments_df.iterrows() + ] + task_dict["fragments"] = fragments + else: + task_dict["fragments"] = [] + + try: + tasks.append(Task(**task_dict)) + except Exception as e: + logger.warning(f"Failed to parse task {task_id}: {e}") + + logger.info(f"Created {len(tasks)} task aggregates") + total_fragments = sum(len(t.fragments) for t in tasks) + logger.info(f"Total fragments aggregated: {total_fragments}") + + # Sort tasks by submission time + tasks.sort(key=lambda t: t.submission_time) + + if tasks: + logger.info(f"Time range: {tasks[0].submission_time} to {tasks[-1].submission_time}") + + return tasks, earliest_task_time + + def run(self) -> None: + """Run the workload producer (streams task events).""" + logger.info("WorkloadProducer running") + + try: + # Load and aggregate tasks + self.tasks, earliest_task_time = self.load_and_aggregate_tasks() + + if not self.tasks: + logger.warning("No tasks to stream") + return + + logger.info(f"Streaming {len(self.tasks)} task events with heartbeats...") + + # Track simulation time + sim_start_time = self.tasks[0].submission_time + real_start_time = time.time() + + # Initialize heartbeat tracking + heartbeat_cadence = timedelta(minutes=self.heartbeat_cadence_minutes) + # Round down to the nearest minute for first heartbeat + next_heartbeat_time = sim_start_time.replace(second=0, microsecond=0) + heartbeats_sent = 0 + + for i, task in enumerate(self.tasks): + if self.should_stop(): + logger.info("WorkloadProducer interrupted") + break + + # Emit any pending heartbeats before this task + while next_heartbeat_time <= task.submission_time: + # Calculate elapsed time for heartbeat + heartbeat_elapsed = (next_heartbeat_time - sim_start_time).total_seconds() + + # Sleep until heartbeat time if needed + if self.speed_factor > 0: + required_real_elapsed = heartbeat_elapsed / self.speed_factor + real_elapsed = time.time() - 
real_start_time + sleep_time = required_real_elapsed - real_elapsed + + if sleep_time > 0: + if self.wait_interruptible(sleep_time): + logger.info("WorkloadProducer interrupted during heartbeat sleep") + break + + # Emit heartbeat message + heartbeat_msg = { + "message_type": "heartbeat", + "timestamp": next_heartbeat_time.isoformat(), + } + self.emit_message(message=heartbeat_msg, key="heartbeat") + heartbeats_sent += 1 + + # Move to next heartbeat time + next_heartbeat_time += heartbeat_cadence + + # Calculate elapsed simulation time for this task + sim_elapsed = (task.submission_time - sim_start_time).total_seconds() + + # Calculate required sleep based on speed_factor + if self.speed_factor > 0: + required_real_elapsed = sim_elapsed / self.speed_factor + real_elapsed = time.time() - real_start_time + sleep_time = required_real_elapsed - real_elapsed + + if sleep_time > 0: + # Use interruptible wait + if self.wait_interruptible(sleep_time): + logger.info("WorkloadProducer interrupted during sleep") + break + # If speed_factor == -1, don't sleep (max speed) + + # Emit task event wrapped with message_type + task_msg = { + "message_type": "task", + "timestamp": task.submission_time.isoformat(), + "task": task.model_dump(mode="json"), + } + self.emit_message(message=task_msg, key=str(task.id)) + + # Periodic flush and logging + if (i + 1) % 100 == 0: + progress = (i + 1) / len(self.tasks) * 100 + logger.info( + f"WorkloadProducer progress: {i + 1}/{len(self.tasks)} tasks, " + f"{heartbeats_sent} heartbeats ({progress:.1f}%)" + ) + self.flush() + + # Final flush + logger.info("WorkloadProducer flushing remaining messages...") + self.flush() + logger.info( + f"WorkloadProducer finished streaming: " + f"{len(self.tasks)} tasks, {heartbeats_sent} heartbeats" + ) + + except Exception as e: + logger.error(f"Fatal error in WorkloadProducer: {e}", exc_info=True) + raise + + def get_earliest_task_time_ms(self) -> int | None: + """Get the earliest task submission time in 
epoch milliseconds. + + Returns: + Earliest task time in ms, or None if no tasks loaded + """ + if not self.tasks: + return None + return int(self.tasks[0].submission_time.timestamp() * 1000) diff --git a/services/kafka-init/Dockerfile b/services/kafka-init/Dockerfile new file mode 100644 index 0000000..b3eda21 --- /dev/null +++ b/services/kafka-init/Dockerfile @@ -0,0 +1,27 @@ +FROM python:3.11-slim + +WORKDIR /app + +# Install minimal dependencies +RUN pip install --no-cache-dir \ + kafka-python==2.0.2 \ + pyyaml==6.0.1 \ + pydantic==2.5.0 + +# Copy and install shared library +COPY libs/common /app/libs/common +RUN pip install --no-cache-dir -e /app/libs/common + +# Copy service code +COPY services/kafka-init /app/services/kafka-init + +# Create non-root user for security +RUN useradd -m -u 1000 opendt && \ + chown -R opendt:opendt /app + +USER opendt + +# Set working directory to service for proper module resolution +WORKDIR /app/services/kafka-init + +CMD ["python", "-m", "kafka_init.main"] diff --git a/services/kafka-init/README.md b/services/kafka-init/README.md new file mode 100644 index 0000000..2251f95 --- /dev/null +++ b/services/kafka-init/README.md @@ -0,0 +1,442 @@ +# kafka-init Service + +The **kafka-init** service is an infrastructure initialization container that creates and configures Kafka topics before application services start. It ensures all required topics exist with proper retention, compaction, and partitioning settings. + +## Architecture + +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ kafka-init Container β”‚ +β”‚ β”‚ +β”‚ 1. Read config/default.yaml β”‚ +β”‚ 2. Parse topic definitions β”‚ +β”‚ 3. Connect to Kafka (with retries) β”‚ +β”‚ 4. Create topics if not exist β”‚ +β”‚ 5. Apply topic configurations β”‚ +β”‚ 6. 
Exit (0 = success, 1 = failure) β”‚ +β”‚ β”‚ +β”‚ Blocks other services until complete β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + v + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ Kafka Broker β”‚ + β”‚ Topics Created: β”‚ + β”‚ β€’ dc.workload β”‚ + β”‚ β€’ dc.power β”‚ + β”‚ β€’ dc.topology β”‚ + β”‚ β€’ sim.topology β”‚ + β”‚ β€’ sim.results β”‚ + β”‚ β€’ sys.config β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +## Configuration + +### Environment Variables + +| Variable | Description | Default | +|----------|-------------|---------| +| `CONFIG_FILE` | Path to YAML configuration | `/app/config/simulation.yaml` | +| `PYTHONUNBUFFERED` | Unbuffered Python output | `1` | + +### Topic Configuration + +**File**: `config/default.yaml` + +```yaml +kafka: + bootstrap_servers: "kafka:29092" + topics: + workload: + name: "dc.workload" + config: + retention.ms: "86400000" # 24 hours + + power: + name: "dc.power" + config: + retention.ms: "3600000" # 1 hour + + topology: + name: "dc.topology" + config: + cleanup.policy: "compact" + min.compaction.lag.ms: "3600000" # 1 hour + + sim_topology: + name: "sim.topology" + config: + cleanup.policy: "compact" + min.compaction.lag.ms: "0" # Immediate compaction + + system: + name: "sys.config" + config: + cleanup.policy: "compact" + + results: + name: "sim.results" + config: + retention.ms: "604800000" # 7 days +``` + +### Topic Types + +**Stream Topics** (retention-based): +- `dc.workload` - Task submissions (24h retention) +- `dc.power` - Power telemetry (1h retention) +- `sim.results` - Simulation predictions (7d retention) + +**Compacted Topics** (key-based): +- `dc.topology` - Real datacenter topology +- `sim.topology` - Simulated topology +- `sys.config` - Runtime configuration + +## Implementation + +### Main Flow + +**File**: 
[`kafka_init/main.py`](./kafka_init/main.py) + +```python +def main(): + # 1. Load configuration + config = load_config_from_env() + + # 2. Wait for Kafka to be ready + admin_client = wait_for_kafka(config) + + # 3. Create topics + for topic_key, topic_config in config.kafka.topics.items(): + create_topic_if_not_exists(admin_client, topic_config) + + # 4. Exit successfully + sys.exit(0) +``` + +### Kafka Connection Retry + +```python +def wait_for_kafka(config, max_retries=30, retry_delay=2): + """Wait for Kafka broker to be ready.""" + for attempt in range(max_retries): + try: + admin = KafkaAdminClient( + bootstrap_servers=config.kafka.bootstrap_servers, + client_id="kafka-init" + ) + admin.list_topics() # Test connection + return admin + except KafkaError: + if attempt < max_retries - 1: + time.sleep(retry_delay) + else: + raise +``` + +### Topic Creation + +```python +def create_topic_if_not_exists(admin, topic_config): + """Create topic with specified configuration.""" + topic_name = topic_config.name + + # Check if topic already exists + existing_topics = admin.list_topics() + if topic_name in existing_topics: + logger.info(f"Topic {topic_name} already exists") + return + + # Create topic with config + new_topic = NewTopic( + name=topic_name, + num_partitions=topic_config.get("partitions", 1), + replication_factor=topic_config.get("replication_factor", 1), + topic_configs=topic_config.config or {} + ) + + admin.create_topics([new_topic]) + logger.info(f"βœ… Created topic: {topic_name}") +``` + +## Running + +### Via Docker Compose (Automatic) + +```bash +# Start services (kafka-init runs automatically) +make up + +# kafka-init runs before other services start +# It will exit once topics are created +``` + +### Check Status + +```bash +# View kafka-init logs +docker compose logs kafka-init + +# Expected output: +# INFO - Loading configuration from /app/config/simulation.yaml +# INFO - Connecting to Kafka at kafka:29092 +# INFO - βœ… Created topic: dc.workload 
+# INFO - βœ… Created topic: dc.power +# INFO - βœ… Created topic: dc.topology +# INFO - βœ… Created topic: sim.topology +# INFO - βœ… Created topic: sys.config +# INFO - βœ… Created topic: sim.results +# INFO - All topics created successfully + +# List created topics +make kafka-topics +# Or: +docker exec -it opendt-kafka kafka-topics \ + --bootstrap-server localhost:9092 \ + --list +``` + +### Standalone (Development) + +```bash +cd services/kafka-init +source ../../.venv/bin/activate + +# Set environment +export CONFIG_FILE=../../config/default.yaml + +# Run initialization +python -m kafka_init.main +``` + +### Verify Topic Configuration + +```bash +# Describe specific topic +docker exec -it opendt-kafka kafka-topics \ + --bootstrap-server localhost:9092 \ + --describe \ + --topic dc.topology + +# Output shows: +# Topic: dc.topology +# PartitionCount: 1 +# ReplicationFactor: 1 +# Configs: cleanup.policy=compact,min.compaction.lag.ms=3600000 +``` + +## Docker Compose Integration + +### Dependency Chain + +```yaml +services: + kafka-init: + depends_on: + kafka: + condition: service_healthy + restart: "no" # Don't restart (runs once) + + dc-mock: + depends_on: + kafka-init: + condition: service_completed_successfully + + sim-worker: + depends_on: + kafka-init: + condition: service_completed_successfully +``` + +**Flow**: +1. Kafka starts and waits for health check +2. kafka-init starts and creates topics +3. kafka-init exits with code 0 (success) +4. Application services (dc-mock, sim-worker) start + +## Exit Codes + +| Code | Meaning | Action | +|------|---------|--------| +| 0 | Success - All topics created | Services start | +| 1 | Failure - Topic creation failed | Services blocked, check logs | + +## Troubleshooting + +### Issue: "Connection refused to Kafka" + +**Cause**: Kafka not ready yet + +**Solution**: kafka-init automatically retries for 60 seconds. 
Check Kafka logs: +```bash +docker compose logs kafka | grep "started" +``` + +### Issue: "Topic already exists" + +**Cause**: Normal behavior if topics were created previously + +**Solution**: No action needed. kafka-init skips existing topics. + +### Issue: Services not starting after kafka-init + +**Cause**: kafka-init exited with error (code 1) + +**Solution**: +```bash +# Check kafka-init logs +docker compose logs kafka-init + +# Look for error messages +# Common issues: +# - Invalid topic config (fix config/default.yaml) +# - Kafka permissions (check Kafka ACLs) +# - Network issues (verify Docker network) + +# Fix issue and restart +make down +make up +``` + +### Issue: "Invalid topic configuration" + +**Cause**: Syntax error in `config/default.yaml` + +**Solution**: +```bash +# Validate YAML syntax +python -c "import yaml; yaml.safe_load(open('config/default.yaml'))" + +# Check topic config format: +# - retention.ms must be string: "86400000" +# - cleanup.policy must be "delete" or "compact" +``` + +## Topic Configuration Reference + +### Retention Policies + +**Time-based retention** (stream topics): +```yaml +config: + retention.ms: "86400000" # Keep messages for 24 hours +``` + +**Size-based retention**: +```yaml +config: + retention.bytes: "1073741824" # Keep up to 1GB +``` + +**Compaction** (key-based retention): +```yaml +config: + cleanup.policy: "compact" # Keep latest value per key + min.compaction.lag.ms: "3600000" # Wait 1h before compacting +``` + +### Partitioning + +```yaml +workload: + name: "dc.workload" + partitions: 4 # 4 partitions for parallel consumption + replication_factor: 1 # 1 replica (single-broker cluster) +``` + +**Considerations**: +- More partitions = higher throughput +- Partitions enable parallel consumption +- Replication factor must be ≀ number of brokers + +### Other Settings + +```yaml +config: + # Message size limits + max.message.bytes: "10485760" # 10MB max message + + # Compression + compression.type: "gzip" + + # 
Segment rolling + segment.ms: "86400000" # New segment every 24h +``` + +## Development + +### Adding a New Topic + +1. **Update config**: +```yaml +# In config/default.yaml +kafka: + topics: + my_new_topic: + name: "my.new.topic" + config: + retention.ms: "3600000" +``` + +2. **Update common library** (if needed): +```python +# In libs/common/opendt_common/config.py +class KafkaConfig(BaseModel): + topics: dict[str, TopicConfig] + + class Topics: + my_new_topic: TopicConfig +``` + +3. **Restart services**: +```bash +make down +make up +``` + +4. **Verify creation**: +```bash +make kafka-topics +# Should see "my.new.topic" in list +``` + +### Testing + +```bash +# Test configuration parsing +python -m pytest libs/common/tests/test_config.py + +# Test topic creation (requires Kafka) +docker compose up -d kafka +docker compose run --rm kafka-init +docker compose down +``` + +## Monitoring + +### Logs + +```bash +# View logs +docker compose logs kafka-init + +# Successful run: +# INFO - Connecting to Kafka at kafka:29092 +# INFO - βœ… Created topic: dc.workload +# INFO - βœ… Created topic: dc.power +# INFO - All topics created successfully + +# Failed run: +# ERROR - Failed to connect to Kafka (attempt 30/30): NoBrokersAvailable +# ERROR - Topic creation failed +``` + +### Container Status + +```bash +# Check if kafka-init completed successfully +docker compose ps kafka-init + +# STATUS should show "exited (0)" +``` diff --git a/services/kafka-init/kafka_init/__init__.py b/services/kafka-init/kafka_init/__init__.py new file mode 100644 index 0000000..114f5cd --- /dev/null +++ b/services/kafka-init/kafka_init/__init__.py @@ -0,0 +1 @@ +"""Kafka Infrastructure Initialization Service.""" diff --git a/services/kafka-init/kafka_init/main.py b/services/kafka-init/kafka_init/main.py new file mode 100644 index 0000000..a2e61ab --- /dev/null +++ b/services/kafka-init/kafka_init/main.py @@ -0,0 +1,164 @@ +#!/usr/bin/env python3 +"""Kafka Infrastructure Initialization Script. 
+ +This script: +1. Loads configuration from the environment +2. Creates Kafka topics based on the configuration +3. Applies partitions, replication factor, and topic-specific settings +4. Exits with code 1 if creation fails (Fail Fast) +""" + +import logging +import sys +import time + +from kafka import KafkaAdminClient +from kafka.admin import NewTopic +from kafka.errors import KafkaError, TopicAlreadyExistsError +from opendt_common import load_config_from_env + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" +) +logger = logging.getLogger(__name__) + + +def ensure_topics_exist( + bootstrap_servers: str, + topics: list[NewTopic], + max_retries: int = 10, + retry_delay: float = 2.0, +) -> bool: + """Ensure Kafka topics exist, creating them if necessary. + + Args: + bootstrap_servers: Kafka bootstrap servers + topics: List of NewTopic objects defining topic configurations + max_retries: Maximum number of connection retries + retry_delay: Delay between retries in seconds + + Returns: + True if all topics exist or were created successfully + + Raises: + KafkaError: If topics cannot be created after max retries + """ + if not topics: + logger.info("No topics to create") + return True + + topic_names = [t.name for t in topics] + + for attempt in range(max_retries): + try: + admin_client = KafkaAdminClient( + bootstrap_servers=bootstrap_servers, + client_id="topic-manager", + request_timeout_ms=10000, + ) + + # Get existing topics + existing_topics = admin_client.list_topics() + topics_to_create = [t for t in topics if t.name not in existing_topics] + + if not topics_to_create: + logger.info(f"All topics already exist: {topic_names}") + admin_client.close() + return True + + # Create missing topics with their specific configurations + try: + admin_client.create_topics(topics_to_create, validate_only=False) + created_names = [t.name for t in topics_to_create] + logger.info(f"Created topics: {created_names}") + + # 
Give Kafka a moment to create the topics + time.sleep(1.0) + + # Verify topics were created + existing_topics_after = admin_client.list_topics() + still_missing = [ + t.name for t in topics_to_create if t.name not in existing_topics_after + ] + + if still_missing: + logger.warning(f"Some topics may not be ready yet: {still_missing}") + else: + logger.info(f"βœ“ Successfully ensured all topics exist: {topic_names}") + + admin_client.close() + return True + + except TopicAlreadyExistsError: + logger.info("Topics already exist (race condition)") + admin_client.close() + return True + + except KafkaError as e: + logger.warning(f"Failed to connect to Kafka (attempt {attempt + 1}/{max_retries}): {e}") + if attempt < max_retries - 1: + time.sleep(retry_delay) + else: + raise + + return False + + +def main() -> int: + """Initialize Kafka infrastructure. + + Returns: + Exit code: 0 for success, 1 for failure + """ + try: + # Load configuration from environment + logger.info("Loading configuration...") + config = load_config_from_env() + logger.info(f"Configuration loaded for workload: {config.workload}") + + # Extract Kafka configuration + kafka_config = config.kafka + logger.info(f"Kafka bootstrap servers: {kafka_config.bootstrap_servers}") + logger.info(f"Topics to create: {list(kafka_config.topics.keys())}") + + # Convert topic configurations to NewTopic objects + new_topics: list[NewTopic] = [] + for logical_key, topic_config in kafka_config.topics.items(): + logger.info( + f" - {logical_key}: {topic_config.name} " + f"(partitions={topic_config.partitions}, " + f"replication_factor={topic_config.replication_factor})" + ) + + # Create NewTopic with configuration + new_topic = NewTopic( + name=topic_config.name, + num_partitions=topic_config.partitions, + replication_factor=topic_config.replication_factor, + topic_configs=topic_config.config, # Apply topic-specific configs + ) + new_topics.append(new_topic) + + # Ensure topics exist + logger.info("Creating Kafka 
topics...") + success = ensure_topics_exist( + bootstrap_servers=kafka_config.bootstrap_servers, + topics=new_topics, + max_retries=30, # Kafka may take time to start + retry_delay=2.0, + ) + + if success: + logger.info("βœ… Kafka infrastructure initialization complete") + return 0 + else: + logger.error("❌ Failed to initialize Kafka infrastructure") + return 1 + + except Exception as e: + logger.error(f"❌ Error during Kafka initialization: {e}", exc_info=True) + return 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/services/sim-worker/Dockerfile b/services/sim-worker/Dockerfile new file mode 100644 index 0000000..3e0e0e2 --- /dev/null +++ b/services/sim-worker/Dockerfile @@ -0,0 +1,38 @@ +FROM python:3.11-slim + +WORKDIR /app + +# Install system dependencies including Java for OpenDC +RUN apt-get update && apt-get install -y \ + gcc \ + openjdk-21-jre-headless \ + && rm -rf /var/lib/apt/lists/* + +# Copy and install dependencies from root pyproject.toml +COPY pyproject.toml /app/pyproject.toml +RUN pip install --no-cache-dir -e /app + +# Copy and install shared library +COPY libs/common /app/libs/common +RUN pip install --no-cache-dir -e /app/libs/common + +# Copy service code +COPY services/sim-worker/sim_worker /app/services/sim-worker/sim_worker + +# Copy OpenDC binaries (these will be in the image) +COPY services/sim-worker/opendc /app/opendc + +# Create output directory for OpenDC results +RUN mkdir -p /app/output && chmod 755 /app/output + +# Create non-root user for security +RUN useradd -m -u 1000 opendt && \ + chown -R opendt:opendt /app && \ + chmod +x /app/opendc/bin/OpenDCExperimentRunner/bin/OpenDCExperimentRunner + +USER opendt + +# Set working directory to service for proper module resolution +WORKDIR /app/services/sim-worker + +CMD ["python", "-m", "sim_worker.main"] diff --git a/services/sim-worker/README.md b/services/sim-worker/README.md new file mode 100644 index 0000000..270911a --- /dev/null +++ 
b/services/sim-worker/README.md @@ -0,0 +1,571 @@ +# sim-worker Service + +The **sim-worker** is the core simulation engine of OpenDT. It consumes workload and topology streams from Kafka, aggregates tasks into time windows, invokes the OpenDC simulator, and outputs power consumption predictions. + +## Architecture + +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ sim-worker β”‚ +β”‚ β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ Kafka │────>β”‚ Window Manager β”‚ β”‚ +β”‚ β”‚ Consumer β”‚ β”‚ - Aggregation β”‚ β”‚ +β”‚ β”‚ β”‚ β”‚ - Heartbeats β”‚ β”‚ +β”‚ β”‚ Topics: β”‚ β”‚ - Closing β”‚ β”‚ +β”‚ β”‚ - workload β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β”‚ β”‚ - topology β”‚ β”‚ β”‚ +β”‚ β”‚ - sim.topo β”‚ v β”‚ +β”‚ β”‚ - power β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ OpenDC Runner β”‚ β”‚ +β”‚ β”‚ - Parquet I/O β”‚ β”‚ +β”‚ β”‚ - Subprocess β”‚ β”‚ +β”‚ β”‚ - Parsing β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β”‚ β”‚ β”‚ +β”‚ v β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ Result Cache β”‚ β”‚ +β”‚ β”‚ Experiment Mgr β”‚ β”‚ +β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ +β”‚ β”‚ β”‚ +β”‚ v β”‚ +β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ +β”‚ β”‚ β”‚ β”‚ +β”‚ Kafka Topic Local Files β”‚ +β”‚ sim.results (debug/experiment) β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +## Core Components + +### 1. 
Main Worker (`main.py`) + +**Responsibilities**: +- Kafka consumer event loop +- Message routing (workload, topology, sim.topology, power) +- Window lifecycle management +- Mode selection (normal/debug/experiment) +- Coordination between components + +**Key Classes**: +- `SimulationWorker`: Main orchestrator + +### 2. OpenDC Runner (`runner/opendc_runner.py`) + +**Responsibilities**: +- Convert Pydantic models to OpenDC input formats +- Invoke OpenDC binary via subprocess +- Parse OpenDC output Parquet files +- Return structured results with timeseries + +**Key Classes**: +- `OpenDCRunner`: Simulation invocation and I/O +- `SimulationResults`: Structured output model +- `TimeseriesData`: Power/CPU timeseries points + +**Input Files Created**: +- `experiment.json` - OpenDC experiment configuration +- `topology.json` - Datacenter topology +- `tasks.parquet` - Task definitions (int32 IDs, non-nullable) +- `fragments.parquet` - Task execution profiles + +**Output Files Parsed**: +- `powerSource.parquet` - Power consumption over time +- `host.parquet` - Host-level metrics +- `service.parquet` - Service-level metrics + +### 3. Window Manager (`window_manager.py`) + +**Responsibilities**: +- Event-time windowing based on task submission timestamps +- Contiguous window creation (no gaps) +- Heartbeat-based window closing +- Task accumulation per window +- Topology tracking + +**Key Classes**: +- `TimeWindow`: Individual window with tasks and metadata +- `WindowManager`: Window lifecycle management + +**Windowing Logic**: +``` +First task at 22:13:15 → Create window [22:13:00 - 22:18:00) +Heartbeat at 22:18:00 → Close window 0, create window 1 [22:18:00 - 22:23:00) +Task at 22:31:45 → Close windows 1-2, create window 3 [22:28:00 - 22:33:00) +``` + +### 4. 
Result Cache (`result_cache.py`) + +**Responsibilities**: +- Cache simulation results based on inputs +- Avoid redundant OpenDC invocations +- Invalidate cache on topology changes + +**Caching Strategy**: +```python +cache_key = SHA256(topology_json) + cumulative_task_count + +if cache.can_reuse(topology, task_count): + return cache.get_cached_results() +else: + results = run_simulation(...) + cache.update(topology, task_count, results) +``` + +**Invalidation**: +- When simulated topology updated via API +- Cache cleared manually via `cache.clear()` + +### 5. Experiment Manager (`experiment_manager.py`) + +**Responsibilities** (Experiment Mode Only): +- Record actual power from `dc.power` +- Write simulation results to Parquet +- Archive OpenDC I/O files per window +- Generate power comparison plots + +**Output Structure**: +``` +output/ +└── my_experiment/ + └── run_1/ + β”œβ”€β”€ results.parquet + β”œβ”€β”€ power_plot.png + └── opendc/ + └── window_0000/ + β”œβ”€β”€ input/ + β”‚ β”œβ”€β”€ summary.json + β”‚ β”œβ”€β”€ experiment.json + β”‚ β”œβ”€β”€ topology.json + β”‚ β”œβ”€β”€ tasks.parquet + β”‚ └── fragments.parquet + └── output/ + β”œβ”€β”€ summary.json + β”œβ”€β”€ powerSource.parquet + β”œβ”€β”€ host.parquet + └── service.parquet +``` + +## Simulation Flow + +### 1. Task Ingestion + +```python +# WorkloadMessage received from dc.workload +message = { + "message_type": "task", # or "heartbeat" + "timestamp": "2022-10-07T00:39:21", + "task": { /* Task object */ } +} + +# Route to window manager +window_manager.add_task(task) +``` + +### 2. Window Closing + +```python +# Heartbeat received +heartbeat = { + "message_type": "heartbeat", + "timestamp": "2022-10-07T00:45:00" +} + +# Close all windows ending before heartbeat timestamp +closed_windows = window_manager.close_windows_before(heartbeat.timestamp) + +# Process each closed window +for window in closed_windows: + process_window(window) +``` + +### 3. 
Simulation Invocation + +```python +# Collect cumulative tasks (all from beginning) +all_tasks = [] +for w in windows[0:current_window_id + 1]: + all_tasks.extend(w.tasks) + +# Check cache +if result_cache.can_reuse(simulated_topology, len(all_tasks)): + results = result_cache.get_cached_results() + logger.info("βœ… Using cached results") +else: + # Run OpenDC + results = opendc_runner.run_simulation( + tasks=all_tasks, + topology=simulated_topology, + experiment_name=f"window-{window.window_id}-simulated" + ) + result_cache.update(simulated_topology, len(all_tasks), results) +``` + +### 4. Result Handling + +**Normal Mode**: +```python +# Publish to Kafka +send_message( + producer=producer, + topic="sim.results", + message=results.model_dump(mode="json") +) +``` + +**Debug Mode**: +```python +# Write to local files +output_dir = f"output/run-{worker_id}-{timestamp}/window_{window_id:04d}/" +Path(output_dir).mkdir(parents=True, exist_ok=True) + +with open(output_dir / "results.json", "w") as f: + json.dump(results.model_dump(mode="json"), f, indent=2) +``` + +**Experiment Mode**: +```python +# Write to parquet + generate plot +experiment_manager.write_simulation_results(window, results, all_tasks) +experiment_manager.archive_opendc_files(window, results, all_tasks) +experiment_manager.generate_power_plot() +``` + +## Configuration + +### Environment Variables + +| Variable | Description | Default | +|----------|-------------|---------| +| `CONFIG_FILE` | Path to YAML configuration | `/app/config/simulation.yaml` | +| `WORKER_ID` | Unique worker identifier | `worker-1` | +| `CONSUMER_GROUP` | Kafka consumer group | `sim-workers` | +| `DEBUG_MODE` | Enable debug mode | `false` | +| `EXPERIMENT_NAME` | Experiment identifier | `default` | +| `EXPERIMENT_OUTPUT_DIR` | Experiment output path | `/app/output` | + +### YAML Configuration + +**File**: `config/default.yaml` + +```yaml +simulation: + window_size_minutes: 5 + heartbeat_cadence_minutes: 1 + experiment_mode: 
false # Set true for experiment mode + +kafka: + bootstrap_servers: "kafka:29092" + topics: + workload: + name: "dc.workload" + topology: + name: "dc.topology" + sim_topology: + name: "sim.topology" + power: + name: "dc.power" + results: + name: "sim.results" +``` + +## Operating Modes + +### Normal Mode + +```bash +make up +``` + +- Publishes results to Kafka +- No local files +- Production mode + +### Debug Mode + +```bash +make up-debug +``` + +- Writes JSON files per window +- No Kafka publishing +- Development mode + +### Experiment Mode + +```bash +make experiment name=my_experiment +``` + +- Writes Parquet + plots +- Archives OpenDC I/O +- Research mode + +## OpenDC Integration + +### Binary Location + +``` +services/sim-worker/opendc/bin/OpenDCExperimentRunner/bin/OpenDCExperimentRunner +``` + +### Java Requirements + +- **Version**: OpenJDK 21 +- **JAVA_HOME**: Auto-detected at runtime + - macOS: `/usr/libexec/java_home` + - Linux: `/usr/lib/jvm/java-21-openjdk-amd64` + +### Invocation + +```python +result = subprocess.run( + [opendc_binary, "--experiment-path", experiment_json_path], + env={"JAVA_HOME": detected_java_home}, + capture_output=True, + text=True, + timeout=120 +) +``` + +### Input Schema + +**experiment.json**: +```json +{ + "name": "window-0-simulated", + "topologies": [{"pathToFile": "/tmp/opendc-.../topology.json"}], + "workloads": [{ + "pathToFile": "/tmp/opendc-.../workload", + "type": "ComputeWorkload" + }], + "exportModels": [{ + "exportInterval": 150, + "filesToExport": ["powerSource", "host", "task", "service"], + "computeExportConfig": { + "powerSourceExportColumns": ["energy_usage", "power_draw"] + } + }], + "outputFolder": "/tmp/opendc-.../output" +} +``` + +### Output Parsing + +```python +# Read powerSource.parquet for timeseries +power_table = pq.read_table(output_dir / "powerSource.parquet") +power_df = power_table.to_pandas() + +# Extract energy and power +energy_kwh = power_df["energy_usage"].sum() / 3_600_000 +max_power = 
power_df["power_draw"].max() + +# Build timeseries +power_draw_series = [ + TimeseriesData(timestamp=int(row["timestamp"]), value=float(row["power_draw"])) + for _, row in power_df.iterrows() +] +``` + +## Running + +### Via Docker Compose + +```bash +# Start all services +make up + +# View sim-worker logs +make logs-sim-worker + +# Execute command in container +docker compose exec sim-worker bash +``` + +### Standalone (Development) + +```bash +cd services/sim-worker +source ../../.venv/bin/activate + +# Set environment +export CONFIG_FILE=../../config/default.yaml +export WORKER_ID=dev-worker +export EXPERIMENT_OUTPUT_DIR=../../output + +# Run worker +python -m sim_worker.main +``` + +### Testing + +```bash +cd services/sim-worker +pytest tests/ + +# Run specific test +pytest tests/test_opendc_simple.py -v + +# With detailed logs +pytest tests/ -o log_cli=true -o log_cli_level=DEBUG +``` + +## Monitoring + +### Logs + +```bash +# Tail logs +docker compose logs -f sim-worker + +# Expected output: +# INFO - Initialized SimulationWorker 'worker-1' +# INFO - Subscribed: dc.workload, dc.topology, sim.topology +# INFO - πŸ“¦ Created window 0: [2022-10-07 00:00:00 - 2022-10-07 00:05:00) +# INFO - πŸ”’ Closed window 0 with 42 tasks +# INFO - Running simulation for window 0 with 42 cumulative tasks +# INFO - βœ… Simulation (simulated) for window 0: energy=1.649 kWh +``` + +### Metrics + +```bash +# Check window statistics (from logs) +docker compose logs sim-worker | grep "Stats:" + +# Output: +# INFO - πŸ“Š Stats: 314 tasks processed, 35 windows simulated +``` + +### Kafka Lag + +```bash +# Check consumer lag +docker exec -it opendt-kafka kafka-consumer-groups \ + --bootstrap-server localhost:9092 \ + --describe \ + --group sim-workers +``` + +## Troubleshooting + +### Issue: "OpenDC simulation failed with exit code 1" + +**Cause**: Invalid input data or topology + +**Solution**: +1. Enable debug mode: `make up-debug` +2. 
Check `output/run-*/window_*/tasks.json` for task data +3. Verify topology structure +4. Check OpenDC logs in sim-worker container + +### Issue: "JAVA_HOME is set to an invalid directory" + +**Cause**: Java not installed or wrong path + +**Solution**: +```bash +# Check Java in container +docker compose exec sim-worker java -version + +# Should show: openjdk version "21" + +# If missing, rebuild: +make down +make up build=true +``` + +### Issue: Windows staying open indefinitely + +**Cause**: No heartbeats or heartbeat cadence too long + +**Solution**: +```yaml +# In config file +simulation: + heartbeat_cadence_minutes: 1 # Decrease if needed +``` + +### Issue: Cache not working (redundant simulations) + +**Cause**: Topology changing between windows + +**Solution**: +```bash +# Check logs for cache hits +docker compose logs sim-worker | grep "cached" + +# Verify topology stability +docker exec -it opendt-kafka kafka-console-consumer \ + --bootstrap-server localhost:9092 \ + --topic sim.topology \ + --from-beginning +``` + +## Performance + +### Throughput + +- **Windows/minute**: ~10-20 (depends on OpenDC speed) +- **OpenDC invocation time**: ~2-5 seconds per window +- **Caching improvement**: 95%+ time savings when topology stable + +### Resource Usage + +- **Memory**: ~1-2 GB (includes OpenDC subprocess) +- **CPU**: Medium (OpenDC is CPU-intensive during simulation) +- **Disk**: Variable (experiment mode creates large archives) + +### Optimization Tips + +1. **Enable caching**: Ensure topology doesn't change unnecessarily +2. **Longer windows**: Use 15-minute windows for fewer simulations +3. **Increase heartbeat cadence**: Reduce Kafka message overhead +4. **Parquet compression**: Reduces I/O time + +## Development + +### Adding New Features + +**To modify windowing logic**: +1. Edit `sim_worker/window_manager.py` +2. Update `TimeWindow` or `WindowManager` classes +3. Add tests in `tests/test_window_manager.py` + +**To change OpenDC invocation**: +1. 
Edit `sim_worker/runner/opendc_runner.py` +2. Update input file generation or output parsing +3. Test with `tests/test_opendc_simple.py` + +**To add new operating mode**: +1. Add mode flag to `SimulationWorker.__init__` +2. Implement result handling in `_handle_results` + +### Code Structure + +``` +sim_worker/ +β”œβ”€β”€ __init__.py +β”œβ”€β”€ main.py # Main worker + Kafka integration +β”œβ”€β”€ window_manager.py # Windowing logic +β”œβ”€β”€ result_cache.py # Caching mechanism +β”œβ”€β”€ experiment_manager.py # Experiment mode logic +└── runner/ + β”œβ”€β”€ __init__.py + β”œβ”€β”€ opendc_runner.py # OpenDC invocation + β”œβ”€β”€ models.py # Result models + └── java_home.py # Java detection +``` + +## Related Documentation + +- [Architecture Overview](../../docs/ARCHITECTURE.md) - System design +- [Data Models](../../docs/DATA_MODELS.md) - Input/output schemas +- [dc-mock Service](../dc-mock/README.md) - Data producer +- [dashboard Service](../dashboard/README.md) - Web dashboard and topology management + +--- + +For questions or contributions, see the [Contributing Guide](../../CONTRIBUTING.md). 
diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/LICENSE-OpenDC.txt b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/LICENSE-OpenDC.txt similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/LICENSE-OpenDC.txt rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/LICENSE-OpenDC.txt diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/bin/OpenDCExperimentRunner b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/bin/OpenDCExperimentRunner similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/bin/OpenDCExperimentRunner rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/bin/OpenDCExperimentRunner diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/bin/OpenDCExperimentRunner.bat b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/bin/OpenDCExperimentRunner.bat similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/bin/OpenDCExperimentRunner.bat rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/bin/OpenDCExperimentRunner.bat diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/activation-1.1.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/activation-1.1.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/activation-1.1.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/activation-1.1.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/aircompressor-0.21.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/aircompressor-0.21.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/aircompressor-0.21.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/aircompressor-0.21.jar diff --git 
a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/animal-sniffer-annotations-1.17.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/animal-sniffer-annotations-1.17.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/animal-sniffer-annotations-1.17.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/animal-sniffer-annotations-1.17.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/annotations-23.0.0.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/annotations-23.0.0.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/annotations-23.0.0.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/annotations-23.0.0.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/audience-annotations-0.13.0.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/audience-annotations-0.13.0.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/audience-annotations-0.13.0.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/audience-annotations-0.13.0.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/checker-qual-2.5.2.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/checker-qual-2.5.2.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/checker-qual-2.5.2.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/checker-qual-2.5.2.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/clikt-jvm-3.5.2.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/clikt-jvm-3.5.2.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/clikt-jvm-3.5.2.jar rename to 
services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/clikt-jvm-3.5.2.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/commons-beanutils-1.9.4.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/commons-beanutils-1.9.4.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/commons-beanutils-1.9.4.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/commons-beanutils-1.9.4.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/commons-codec-1.15.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/commons-codec-1.15.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/commons-codec-1.15.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/commons-codec-1.15.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/commons-collections-3.2.2.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/commons-collections-3.2.2.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/commons-collections-3.2.2.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/commons-collections-3.2.2.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/commons-compress-1.21.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/commons-compress-1.21.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/commons-compress-1.21.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/commons-compress-1.21.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/commons-configuration2-2.8.0.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/commons-configuration2-2.8.0.jar similarity index 100% rename from 
src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/commons-configuration2-2.8.0.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/commons-configuration2-2.8.0.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/commons-io-2.8.0.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/commons-io-2.8.0.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/commons-io-2.8.0.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/commons-io-2.8.0.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/commons-lang3-3.12.0.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/commons-lang3-3.12.0.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/commons-lang3-3.12.0.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/commons-lang3-3.12.0.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/commons-logging-1.2.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/commons-logging-1.2.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/commons-logging-1.2.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/commons-logging-1.2.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/commons-math3-3.6.1.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/commons-math3-3.6.1.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/commons-math3-3.6.1.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/commons-math3-3.6.1.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/commons-net-3.9.0.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/commons-net-3.9.0.jar similarity index 100% 
rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/commons-net-3.9.0.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/commons-net-3.9.0.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/commons-pool-1.6.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/commons-pool-1.6.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/commons-pool-1.6.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/commons-pool-1.6.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/commons-text-1.10.0.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/commons-text-1.10.0.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/commons-text-1.10.0.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/commons-text-1.10.0.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/failureaccess-1.0.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/failureaccess-1.0.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/failureaccess-1.0.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/failureaccess-1.0.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/guava-27.0-jre.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/guava-27.0-jre.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/guava-27.0-jre.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/guava-27.0-jre.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/hadoop-common-3.3.6.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/hadoop-common-3.3.6.jar similarity index 100% rename from 
src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/hadoop-common-3.3.6.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/hadoop-common-3.3.6.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/hadoop-mapreduce-client-core-3.3.6.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/hadoop-mapreduce-client-core-3.3.6.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/hadoop-mapreduce-client-core-3.3.6.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/hadoop-mapreduce-client-core-3.3.6.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/hadoop-shaded-guava-1.1.1.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/hadoop-shaded-guava-1.1.1.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/hadoop-shaded-guava-1.1.1.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/hadoop-shaded-guava-1.1.1.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/hadoop-shaded-protobuf_3_7-1.1.1.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/hadoop-shaded-protobuf_3_7-1.1.1.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/hadoop-shaded-protobuf_3_7-1.1.1.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/hadoop-shaded-protobuf_3_7-1.1.1.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/j2objc-annotations-1.1.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/j2objc-annotations-1.1.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/j2objc-annotations-1.1.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/j2objc-annotations-1.1.jar diff --git 
a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/jackson-annotations-2.16.1.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/jackson-annotations-2.16.1.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/jackson-annotations-2.16.1.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/jackson-annotations-2.16.1.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/jackson-core-2.16.1.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/jackson-core-2.16.1.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/jackson-core-2.16.1.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/jackson-core-2.16.1.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/jackson-databind-2.16.1.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/jackson-databind-2.16.1.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/jackson-databind-2.16.1.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/jackson-databind-2.16.1.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/jackson-dataformat-csv-2.16.1.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/jackson-dataformat-csv-2.16.1.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/jackson-dataformat-csv-2.16.1.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/jackson-dataformat-csv-2.16.1.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/jackson-module-kotlin-2.16.1.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/jackson-module-kotlin-2.16.1.jar similarity index 100% rename from 
src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/jackson-module-kotlin-2.16.1.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/jackson-module-kotlin-2.16.1.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/jakarta.activation-api-1.2.1.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/jakarta.activation-api-1.2.1.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/jakarta.activation-api-1.2.1.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/jakarta.activation-api-1.2.1.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/jaxb-api-2.2.2.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/jaxb-api-2.2.2.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/jaxb-api-2.2.2.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/jaxb-api-2.2.2.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/jaxb-impl-2.2.3-1.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/jaxb-impl-2.2.3-1.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/jaxb-impl-2.2.3-1.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/jaxb-impl-2.2.3-1.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/jersey-json-1.20.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/jersey-json-1.20.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/jersey-json-1.20.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/jersey-json-1.20.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/jettison-1.1.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/jettison-1.1.jar similarity index 100% rename 
from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/jettison-1.1.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/jettison-1.1.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/jline-3.23.0.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/jline-3.23.0.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/jline-3.23.0.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/jline-3.23.0.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/jsr305-3.0.2.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/jsr305-3.0.2.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/jsr305-3.0.2.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/jsr305-3.0.2.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/kotlin-logging-jvm-3.0.5.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/kotlin-logging-jvm-3.0.5.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/kotlin-logging-jvm-3.0.5.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/kotlin-logging-jvm-3.0.5.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/kotlin-reflect-1.6.21.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/kotlin-reflect-1.6.21.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/kotlin-reflect-1.6.21.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/kotlin-reflect-1.6.21.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/kotlin-stdlib-1.9.22.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/kotlin-stdlib-1.9.22.jar similarity index 100% rename from 
src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/kotlin-stdlib-1.9.22.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/kotlin-stdlib-1.9.22.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/kotlin-stdlib-jdk7-1.8.10.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/kotlin-stdlib-jdk7-1.8.10.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/kotlin-stdlib-jdk7-1.8.10.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/kotlin-stdlib-jdk7-1.8.10.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/kotlin-stdlib-jdk8-1.8.10.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/kotlin-stdlib-jdk8-1.8.10.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/kotlin-stdlib-jdk8-1.8.10.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/kotlin-stdlib-jdk8-1.8.10.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/kotlinx-coroutines-core-jvm-1.8.1.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/kotlinx-coroutines-core-jvm-1.8.1.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/kotlinx-coroutines-core-jvm-1.8.1.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/kotlinx-coroutines-core-jvm-1.8.1.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/kotlinx-serialization-core-jvm-1.6.0.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/kotlinx-serialization-core-jvm-1.6.0.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/kotlinx-serialization-core-jvm-1.6.0.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/kotlinx-serialization-core-jvm-1.6.0.jar diff --git 
a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/kotlinx-serialization-json-jvm-1.6.0.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/kotlinx-serialization-json-jvm-1.6.0.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/kotlinx-serialization-json-jvm-1.6.0.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/kotlinx-serialization-json-jvm-1.6.0.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/log4j-api-2.23.0.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/log4j-api-2.23.0.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/log4j-api-2.23.0.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/log4j-api-2.23.0.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/log4j-core-2.23.0.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/log4j-core-2.23.0.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/log4j-core-2.23.0.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/log4j-core-2.23.0.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/log4j-slf4j2-impl-2.23.0.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/log4j-slf4j2-impl-2.23.0.jar 
similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/log4j-slf4j2-impl-2.23.0.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/log4j-slf4j2-impl-2.23.0.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/metrics-core-3.2.4.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/metrics-core-3.2.4.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/metrics-core-3.2.4.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/metrics-core-3.2.4.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/microprofile-config-api-3.1.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/microprofile-config-api-3.1.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/microprofile-config-api-3.1.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/microprofile-config-api-3.1.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/opendc-common.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/opendc-common.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/opendc-common.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/opendc-common.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/opendc-compute-api.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/opendc-compute-api.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/opendc-compute-api.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/opendc-compute-api.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/opendc-compute-carbon.jar 
b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/opendc-compute-carbon.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/opendc-compute-carbon.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/opendc-compute-carbon.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/opendc-compute-failure.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/opendc-compute-failure.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/opendc-compute-failure.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/opendc-compute-failure.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/opendc-compute-simulator.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/opendc-compute-simulator.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/opendc-compute-simulator.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/opendc-compute-simulator.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/opendc-compute-topology.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/opendc-compute-topology.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/opendc-compute-topology.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/opendc-compute-topology.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/opendc-compute-workload.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/opendc-compute-workload.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/opendc-compute-workload.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/opendc-compute-workload.jar diff --git 
a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/opendc-experiments-base.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/opendc-experiments-base.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/opendc-experiments-base.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/opendc-experiments-base.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/opendc-simulator-compute.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/opendc-simulator-compute.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/opendc-simulator-compute.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/opendc-simulator-compute.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/opendc-simulator-core.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/opendc-simulator-core.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/opendc-simulator-core.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/opendc-simulator-core.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/opendc-simulator-flow.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/opendc-simulator-flow.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/opendc-simulator-flow.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/opendc-simulator-flow.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/opendc-trace-api.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/opendc-trace-api.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/opendc-trace-api.jar rename to 
services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/opendc-trace-api.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/opendc-trace-parquet.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/opendc-trace-parquet.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/opendc-trace-parquet.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/opendc-trace-parquet.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/parquet-column-1.13.1.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/parquet-column-1.13.1.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/parquet-column-1.13.1.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/parquet-column-1.13.1.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/parquet-common-1.13.1.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/parquet-common-1.13.1.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/parquet-common-1.13.1.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/parquet-common-1.13.1.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/parquet-encoding-1.13.1.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/parquet-encoding-1.13.1.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/parquet-encoding-1.13.1.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/parquet-encoding-1.13.1.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/parquet-format-structures-1.13.1.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/parquet-format-structures-1.13.1.jar similarity index 100% rename from 
src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/parquet-format-structures-1.13.1.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/parquet-format-structures-1.13.1.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/parquet-hadoop-1.13.1.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/parquet-hadoop-1.13.1.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/parquet-hadoop-1.13.1.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/parquet-hadoop-1.13.1.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/parquet-jackson-1.13.1.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/parquet-jackson-1.13.1.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/parquet-jackson-1.13.1.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/parquet-jackson-1.13.1.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/progressbar-0.10.0.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/progressbar-0.10.0.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/progressbar-0.10.0.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/progressbar-0.10.0.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/protobuf-java-2.5.0.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/protobuf-java-2.5.0.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/protobuf-java-2.5.0.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/protobuf-java-2.5.0.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/re2j-1.1.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/re2j-1.1.jar 
similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/re2j-1.1.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/re2j-1.1.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/slf4j-api-2.0.9.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/slf4j-api-2.0.9.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/slf4j-api-2.0.9.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/slf4j-api-2.0.9.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/snappy-java-1.1.8.3.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/snappy-java-1.1.8.3.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/snappy-java-1.1.8.3.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/snappy-java-1.1.8.3.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/stax-api-1.0-2.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/stax-api-1.0-2.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/stax-api-1.0-2.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/stax-api-1.0-2.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/stax2-api-4.2.1.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/stax2-api-4.2.1.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/stax2-api-4.2.1.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/stax2-api-4.2.1.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/woodstox-core-5.4.0.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/woodstox-core-5.4.0.jar similarity index 100% rename from 
src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/woodstox-core-5.4.0.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/woodstox-core-5.4.0.jar diff --git a/src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/zstd-jni-1.5.0-1.jar b/services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/zstd-jni-1.5.0-1.jar similarity index 100% rename from src/opendt/core/simulation/opendc/bin/OpenDCExperimentRunner/lib/zstd-jni-1.5.0-1.jar rename to services/sim-worker/opendc/bin/OpenDCExperimentRunner/lib/zstd-jni-1.5.0-1.jar diff --git a/services/sim-worker/pytest.ini b/services/sim-worker/pytest.ini new file mode 100644 index 0000000..284af56 --- /dev/null +++ b/services/sim-worker/pytest.ini @@ -0,0 +1,28 @@ +[pytest] +# Pytest configuration for sim-worker tests + +# Test discovery patterns +python_files = test_*.py +python_classes = Test* +python_functions = test_* + +# Output options +addopts = + -v + --tb=short + --strict-markers + --disable-warnings + -ra + +# Markers for categorizing tests +markers = + unit: Unit tests that don't require external dependencies + integration: Integration tests that test multiple components together + slow: Tests that take a long time to run + requires_opendc: Tests that require OpenDC binaries to be present + +# Test paths +testpaths = tests + +# Minimum Python version +minversion = 3.11 diff --git a/services/sim-worker/sim_worker/__init__.py b/services/sim-worker/sim_worker/__init__.py new file mode 100644 index 0000000..55ecacd --- /dev/null +++ b/services/sim-worker/sim_worker/__init__.py @@ -0,0 +1,16 @@ +"""Simulation Worker Service Package. 
+ +This package provides the core simulation engine for OpenDT, including: +- OpenDC binary wrapper for running simulations +- Time-based window management for task aggregation +- Kafka integration for consuming workload and topology streams +""" + +from .runner import OpenDCRunner +from .window_manager import TimeWindow, WindowManager + +__all__ = [ + "OpenDCRunner", + "TimeWindow", + "WindowManager", +] diff --git a/services/sim-worker/sim_worker/experiment_manager.py b/services/sim-worker/sim_worker/experiment_manager.py new file mode 100644 index 0000000..cfc9436 --- /dev/null +++ b/services/sim-worker/sim_worker/experiment_manager.py @@ -0,0 +1,372 @@ +"""Experiment Manager for sim-worker. + +Handles all experiment-specific functionality: +- Result persistence to parquet files +- OpenDC I/O file archiving +- Power consumption plotting (actual vs simulated) +""" + +import logging +import shutil +from datetime import datetime, timedelta +from pathlib import Path +from typing import Any + +import matplotlib +import pandas as pd +import pyarrow as pa +import pyarrow.parquet as pq +from opendt_common.models import Task + +from .runner import SimulationResults +from .window_manager import TimeWindow + +matplotlib.use("Agg") # Use non-interactive backend +import matplotlib.pyplot as plt + +logger = logging.getLogger(__name__) + + +class ExperimentManager: + """Manages experiment-specific operations for sim-worker. + + Responsibilities: + - Writing simulation results to parquet files + - Archiving OpenDC input/output files + - Collecting actual power consumption data + - Generating comparison plots (actual vs simulated power) + """ + + def __init__( + self, + experiment_name: str, + experiment_output_dir: str, + run_number: int, + ): + """Initialize the experiment manager. 
+ + Args: + experiment_name: Name of the experiment + experiment_output_dir: Base directory for experiment outputs + run_number: Run number for this experiment instance + """ + self.experiment_name = experiment_name + self.run_number = run_number + + # Setup directory structure + self.experiment_run_dir = ( + Path(experiment_output_dir) / experiment_name / f"run_{run_number}" + ) + self.experiment_run_dir.mkdir(parents=True, exist_ok=True) + + self.results_parquet_path = self.experiment_run_dir / "results.parquet" + self.opendc_dir = self.experiment_run_dir / "opendc" + self.opendc_dir.mkdir(parents=True, exist_ok=True) + self.plot_path = self.experiment_run_dir / "power_plot.png" + + # Data collection + self.actual_power_data: list[dict[str, Any]] = [] + + logger.info("πŸ“ Experiment directories created:") + logger.info(f" Run dir: {self.experiment_run_dir}") + logger.info(f" Results: {self.results_parquet_path}") + logger.info(f" OpenDC I/O: {self.opendc_dir}") + logger.info(f" Plot: {self.plot_path}") + + def record_actual_power(self, timestamp: datetime, power_draw: float) -> None: + """Record actual power consumption from dc.power topic. + + Args: + timestamp: Timestamp of the measurement + power_draw: Power draw in Watts + """ + self.actual_power_data.append( + { + "timestamp": timestamp, + "power_draw": power_draw, + } + ) + logger.debug(f"Recorded actual power: {power_draw:.2f} W at {timestamp}") + + def write_simulation_results( + self, + window: TimeWindow, + simulated_results: SimulationResults, + cumulative_tasks: list[Task], + ) -> None: + """Write simulation results to parquet file. + + Extracts power draw for this specific window from cumulative simulation. 
+ + Args: + window: The window that was simulated + simulated_results: Results from simulated topology simulation + cumulative_tasks: All cumulative tasks (used to get first task time) + """ + if not cumulative_tasks: + logger.warning(f"No tasks to determine timestamp offset for window {window.window_id}") + return + + # Get first task submission time for timestamp conversion + first_task_time = cumulative_tasks[0].submission_time + + # Convert relative timestamps to absolute and filter for this window + power_data = [] + for point in simulated_results.power_draw_series: + # point.timestamp is milliseconds offset from first task + absolute_time = first_task_time + timedelta(milliseconds=point.timestamp) + + # Only include if within this window's time range + if window.window_start <= absolute_time < window.window_end: + power_data.append( + { + "window_id": window.window_id, + "window_start": window.window_start, + "window_end": window.window_end, + "timestamp": absolute_time, + "power_draw": point.value, + } + ) + + if not power_data: + logger.warning( + f"No power draw data in window {window.window_id} time range " + f"[{window.window_start} - {window.window_end})" + ) + return + + # Create DataFrame and append to parquet + df = pd.DataFrame(power_data) + table = pa.Table.from_pandas(df) + + # Append to parquet file + if self.results_parquet_path.exists(): + # Read existing and append + existing_table = pq.read_table(self.results_parquet_path) + combined_table = pa.concat_tables([existing_table, table]) + pq.write_table(combined_table, self.results_parquet_path) + else: + # Write new file + pq.write_table(table, self.results_parquet_path) + + logger.info( + f"πŸ“Š Wrote {len(power_data)} power measurements for window {window.window_id} " + f"to {self.results_parquet_path.name}" + ) + + def archive_opendc_files( + self, + window: TimeWindow, + simulated_results: SimulationResults, + cumulative_tasks: list[Task], + ) -> None: + """Archive OpenDC input/output files 
for this window. + + Creates organized structure: + - input/summary.json: Window metadata + task submission times + - input/experiment.json, topology.json, *.parquet: OpenDC input files + - output/summary.json: Power draw results over time + - output/*.parquet: OpenDC output files + + Args: + window: The window that was simulated + simulated_results: Simulation results containing temp_dir path + cumulative_tasks: All cumulative tasks used in simulation + """ + if not simulated_results.temp_dir: + logger.warning(f"No temp_dir in simulation results for window {window.window_id}") + return + + import json + + # Create window-specific directory structure + window_dir = self.opendc_dir / f"window_{window.window_id:04d}" + input_dir = window_dir / "input" + output_dir_dest = window_dir / "output" + input_dir.mkdir(parents=True, exist_ok=True) + output_dir_dest.mkdir(parents=True, exist_ok=True) + + temp_dir = Path(simulated_results.temp_dir) + + # Create input/summary.json with window metadata + task submission times + input_summary = { + "window_id": window.window_id, + "window_start": window.window_start.isoformat(), + "window_end": window.window_end.isoformat(), + "task_count": len(window.tasks), + "cumulative_task_count": len(cumulative_tasks), + "task_submission_times": [ + task.submission_time.isoformat() for task in cumulative_tasks + ], + } + + input_summary_path = input_dir / "summary.json" + with open(input_summary_path, "w") as f: + json.dump(input_summary, f, indent=2) + + # Copy OpenDC input files + input_files = [ + temp_dir / "experiment.json", + temp_dir / "topology.json", + temp_dir / "workload" / "tasks.parquet", + temp_dir / "workload" / "fragments.parquet", + ] + + for input_file in input_files: + if input_file.exists(): + shutil.copy2(input_file, input_dir / input_file.name) + + # Copy entire OpenDC output directory from temp to archive + output_dir_src = ( + Path(simulated_results.opendc_output_dir) + if simulated_results.opendc_output_dir + else 
None + ) + + if output_dir_src and output_dir_src.exists(): + # Copy all files from OpenDC output directory + for item in output_dir_src.iterdir(): + if item.is_file(): + shutil.copy2(item, output_dir_dest / item.name) + logger.debug(f"Copied {item.name} to {output_dir_dest}") + + logger.debug(f"Copied OpenDC output directory: {output_dir_src} -> {output_dir_dest}") + else: + logger.warning(f"OpenDC output directory not found or not set: {output_dir_src}") + + # Create output/summary.json with power draw results and summary stats + output_summary = { + "window_id": window.window_id, + "window_start": window.window_start.isoformat(), + "window_end": window.window_end.isoformat(), + "summary_statistics": { + "energy_kwh": simulated_results.energy_kwh, + "max_power_draw_w": simulated_results.max_power_draw, + "avg_cpu_utilization": simulated_results.cpu_utilization, + "runtime_hours": simulated_results.runtime_hours, + }, + "power_draw_timeseries": [ + { + "timestamp_ms": data_point.timestamp, + "power_draw_w": data_point.value, + } + for data_point in simulated_results.power_draw_series + ] + if simulated_results.power_draw_series + else [], + } + + output_summary_path = output_dir_dest / "summary.json" + with open(output_summary_path, "w") as f: + json.dump(output_summary, f, indent=2) + + logger.debug(f"πŸ“ Archived OpenDC I/O files for window {window.window_id}") + + def generate_power_plot(self) -> None: + """Generate comparison plot of actual vs predicted power consumption. + + Creates a time-series plot showing: + - Actual power (ground truth from dc.power) + - Simulated power (predicted by OpenDC) + + The plot is saved to power_plot.png and overwritten with each update. 
+ """ + if not self.results_parquet_path.exists(): + logger.warning("No experiment parquet file found, skipping plot generation") + return + + if not self.actual_power_data: + logger.warning("No actual power data available, skipping plot generation") + return + + try: + # Read simulated power data from parquet + simulated_df = pq.read_table(self.results_parquet_path).to_pandas() + + # Convert actual power data to DataFrame + actual_df = pd.DataFrame(self.actual_power_data) + + # Filter actual power to only show up to the latest simulated timestamp + # (simulator may lag behind real-time data collection) + max_simulated_time = simulated_df["timestamp"].max() + actual_df_filtered = actual_df[actual_df["timestamp"] <= max_simulated_time].copy() + + if len(actual_df_filtered) == 0: + logger.warning("No actual power data available up to simulated time range") + return + + logger.debug( + f"Plot time range: {len(actual_df_filtered)}/{len(actual_df)} actual power points " + f"(up to {max_simulated_time})" + ) + + # Convert power from Watts to Kilowatts + simulated_df["power_kw"] = simulated_df["power_draw"] / 1000.0 + actual_df_filtered["power_kw"] = actual_df_filtered["power_draw"] / 1000.0 + + # Create plot + plt.figure(figsize=(12, 6)) + + # Plot actual power (ground truth) - only up to simulated time + plt.plot( + actual_df_filtered["timestamp"], + actual_df_filtered["power_kw"], + label="Actual Power (Ground Truth)", + color="blue", + linewidth=1.5, + alpha=0.8, + ) + + # Plot simulated power + plt.plot( + simulated_df["timestamp"], + simulated_df["power_kw"], + label="Simulated Power (OpenDC)", + color="orange", + linewidth=1.5, + alpha=0.8, + ) + + plt.xlabel("Time") + plt.ylabel("Power (kW)") + plt.title(f"Power Consumption: Actual vs Simulated - {self.experiment_name}") + plt.ylim(0, 32) # 0-100 kW + plt.legend() + plt.grid(True, alpha=0.3) + plt.xticks(rotation=45) + plt.tight_layout() + + # Save plot + plt.savefig(self.plot_path, dpi=150) + plt.close() + + 
logger.info(f"πŸ“ˆ Updated power consumption plot: {self.plot_path.name}") + + except Exception as e: + logger.error(f"Failed to generate power plot: {e}", exc_info=True) + + @staticmethod + def get_next_run_number(experiment_name: str, experiment_output_dir: str) -> int: + """Get the next run number for an experiment. + + Scans output// for existing run_N directories + and returns N+1. + + Args: + experiment_name: Name of the experiment + experiment_output_dir: Base output directory path + + Returns: + Next run number (1 if no existing runs) + """ + experiment_dir = Path(experiment_output_dir) / experiment_name + if not experiment_dir.exists(): + return 1 + + existing_runs = [ + int(d.name.replace("run_", "")) + for d in experiment_dir.iterdir() + if d.is_dir() and d.name.startswith("run_") + ] + + return max(existing_runs, default=0) + 1 diff --git a/services/sim-worker/sim_worker/main.py b/services/sim-worker/sim_worker/main.py new file mode 100644 index 0000000..4f6ad5e --- /dev/null +++ b/services/sim-worker/sim_worker/main.py @@ -0,0 +1,533 @@ +"""Sim-Worker Service - Main Entry Point.""" + +import copy +import logging +import os +import time +from datetime import datetime +from typing import Any + +from opendt_common import load_config_from_env +from opendt_common.models import Task, Topology, TopologySnapshot +from opendt_common.utils import get_kafka_consumer, get_kafka_producer + +from .experiment_manager import ExperimentManager +from .runner import OpenDCRunner, SimulationResults +from .window_manager import TimeWindow, WindowManager + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" +) +logger = logging.getLogger(__name__) + + +class SimulationWorker: + """Consumes workload events from Kafka and runs simulations. + + The worker: + 1. Listens to dc.workload (tasks) and dc.topology (topology updates) + 2. Aggregates tasks into fixed time windows (default: 5 minutes) + 3. 
When a window closes, runs OpenDC simulation with: + - Real topology (from dc.topology) + - Simulated topology (operator-defined, initially same as real) + 4. Publishes simulation results to Kafka + """ + + def __init__( + self, + kafka_bootstrap_servers: str, + worker_id: str, + workload_topic: str, + topology_topic: str, + sim_topology_topic: str, + power_topic: str, + results_topic: str, + window_size_minutes: int, + consumer_group: str = "sim-workers", + debug_mode: bool = False, + debug_output_dir: str = "/app/output", + experiment_mode: bool = False, + experiment_name: str = "default", + experiment_output_dir: str = "/app/output", + ): + """Initialize the simulation worker. + + Args: + kafka_bootstrap_servers: Kafka broker addresses + worker_id: Unique identifier for this worker + workload_topic: Kafka topic name for workload events (dc.workload) + topology_topic: Kafka topic name for topology updates (dc.topology) + sim_topology_topic: Kafka topic name for simulated topology updates (sim.topology) + power_topic: Kafka topic name for power telemetry (dc.power) + results_topic: Kafka topic name for simulation results + window_size_minutes: Size of time windows in minutes + consumer_group: Kafka consumer group ID + debug_mode: If True, write debug files alongside main output + debug_output_dir: Directory to write debug output files + experiment_mode: If True, write results to parquet instead of Kafka + experiment_name: Name of the experiment (used for output directory) + """ + self.worker_id = worker_id + self.kafka_bootstrap_servers = kafka_bootstrap_servers + self.consumer_group = consumer_group + self.workload_topic = workload_topic + self.topology_topic = topology_topic + self.sim_topology_topic = sim_topology_topic + self.power_topic = power_topic + self.results_topic = results_topic + self.debug_mode = debug_mode + self.debug_output_dir = debug_output_dir + self.experiment_mode = experiment_mode + self.experiment_name = experiment_name + + # Initialize 
Kafka consumer + # Subscribe to: workload, real topology, simulated topology, and power (experiment mode) + topics = [workload_topic, topology_topic, sim_topology_topic] + if self.experiment_mode: + topics.append(power_topic) + + self.consumer = get_kafka_consumer( + topics=topics, + group_id=consumer_group, + bootstrap_servers=kafka_bootstrap_servers, + ) + + # Initialize Kafka producer for results + self.producer = get_kafka_producer(kafka_bootstrap_servers) + + # Initialize window manager + self.window_manager = WindowManager(window_size_minutes=window_size_minutes) + + # Initialize OpenDC runner + try: + self.opendc_runner = OpenDCRunner() + except FileNotFoundError as e: + logger.error(f"Failed to initialize OpenDC runner: {e}") + logger.error("Simulation will not be available") + self.opendc_runner = None + + # Topology state + self.real_topology: Topology | None = None + self.simulated_topology: Topology | None = None # Initially same as real + + # Statistics + self.tasks_processed = 0 + self.windows_simulated = 0 + + # Initialize result cache for avoiding redundant simulations + from .result_cache import ResultCache + + self.result_cache = ResultCache() + + # Setup experiment mode + self.experiment_manager: ExperimentManager | None = None + if self.experiment_mode: + run_number = ExperimentManager.get_next_run_number( + experiment_name, experiment_output_dir + ) + self.experiment_manager = ExperimentManager( + experiment_name=experiment_name, + experiment_output_dir=experiment_output_dir, + run_number=run_number, + ) + logger.info("πŸ§ͺ EXPERIMENT MODE ENABLED") + logger.info(f" Experiment: {experiment_name}") + logger.info(f" Run: {run_number}") + + logger.info(f"Initialized SimulationWorker '{worker_id}'") + logger.info(f"Consumer group: {consumer_group}") + logger.info(f"Subscribed: {workload_topic}, {topology_topic}, {sim_topology_topic}") + logger.info(f"Window size: {window_size_minutes} minutes") + + def _process_window(self, window: TimeWindow) -> 
None: + """Process a closed window by running simulation. + + This triggers the simulation run for the closed window. + + Args: + window: The window to process + """ + logger.info( + f"🎯 Window {window.window_id} closed " + f"[{window.window_start} - {window.window_end}], " + f"triggering simulation..." + ) + + if not self.opendc_runner: + logger.warning("OpenDC runner not available, skipping simulation") + return + + if not window.topology: + logger.warning(f"Window {window.window_id} has no topology, skipping simulation") + return + + # Get all tasks from window 0 up to this window (cumulative) + all_tasks = self.window_manager.get_all_tasks_up_to_window(window.window_id) + + # Use simulated topology (initially same as real topology from first message) + topology_to_use = self.simulated_topology if self.simulated_topology else window.topology + + # Check if we can reuse cached results (same topology + same task count) + cached_results = self.result_cache.get_cached_results() + if self.result_cache.can_reuse(topology_to_use, len(all_tasks)) and cached_results: + logger.info( + f"♻️ Reusing cached results for window {window.window_id} " + f"(topology unchanged, {len(all_tasks)} cumulative tasks)" + ) + simulated_results = cached_results + else: + # Run new simulation + logger.info( + f"Running simulation for window {window.window_id} " + f"with {len(all_tasks)} cumulative tasks ({len(window.tasks)} new)" + ) + simulated_results = self._run_simulation( + window_id=window.window_id, + tasks=all_tasks, + topology=topology_to_use, + topology_type="simulated", + ) + # Update cache with new results + self.result_cache.update(topology_to_use, len(all_tasks), simulated_results) + + # Handle results (write to parquet in experiment mode, or publish to Kafka) + self._handle_results(window, simulated_results, all_tasks) + + self.windows_simulated += 1 + + # Log statistics + stats = self.window_manager.get_stats() + logger.info( + f"πŸ“Š Stats: {self.tasks_processed} tasks 
processed, " + f"{self.windows_simulated} windows simulated, " + f"{stats['total_windows']} total windows " + f"({stats['open_windows']} open, {stats['closed_windows']} closed)" + ) + + def _run_simulation( + self, + window_id: int, + tasks: list[Task], + topology: Topology, + topology_type: str, + ) -> SimulationResults: + """Run OpenDC simulation for a window. + + Args: + window_id: Window ID + tasks: List of tasks to simulate + topology: Topology to use + topology_type: "real" or "simulated" + + Returns: + SimulationResults object + """ + if not self.opendc_runner: + return SimulationResults(status="error", error="OpenDC runner not initialized") + + experiment_name = f"window-{window_id}-{topology_type}" + + try: + results = self.opendc_runner.run_simulation( + tasks=tasks, + topology=topology, + experiment_name=experiment_name, + timeout_seconds=120, + ) + + logger.info( + f"βœ… Simulation ({topology_type}) for window {window_id}: " + f"energy={results.energy_kwh:.4f} kWh, " + f"cpu_util={results.cpu_utilization:.3f}, " + f"max_power={results.max_power_draw:.1f} W" + ) + + return results + + except Exception as e: + logger.error(f"Error running simulation ({topology_type}): {e}", exc_info=True) + return SimulationResults(status="error", error=str(e)) + + def _handle_results( + self, + window: TimeWindow, + simulated_results: SimulationResults, + cumulative_tasks: list[Task], + ) -> None: + """Handle simulation results (write to parquet in experiment mode, or publish to Kafka). 
+ + Args: + window: The window that was simulated + simulated_results: Results from simulated topology simulation + cumulative_tasks: All tasks from window 0 up to this window (used in simulation) + """ + # Experiment mode: write results parquet, OpenDC I/O files, and update plot + if self.experiment_mode and self.experiment_manager: + try: + self.experiment_manager.write_simulation_results( + window, simulated_results, cumulative_tasks + ) + self.experiment_manager.archive_opendc_files( + window, simulated_results, cumulative_tasks + ) + self.experiment_manager.generate_power_plot() + except Exception as e: + logger.error(f"Failed to write experiment results: {e}", exc_info=True) + + def _process_workload_message(self, message_data: dict[str, Any]) -> None: + """Process a workload message (task or heartbeat) from Kafka. + + Args: + message_data: Raw message data from Kafka + """ + try: + message_type = message_data.get("message_type") + + if message_type == "task": + # Extract task from nested structure + task = Task(**message_data["task"]) + logger.debug( + f"Received task {task.id} at {task.submission_time} " + f"with {len(task.fragments)} fragments" + ) + + # Add to window manager (does NOT close windows) + self.window_manager.add_task(task) + self.tasks_processed += 1 + + elif message_type == "heartbeat": + # Parse heartbeat timestamp + heartbeat_time = datetime.fromisoformat(message_data["timestamp"]) + logger.debug(f"Received heartbeat at {heartbeat_time}") + + # Check which windows can be closed + closed_windows = self.window_manager.close_windows_before(heartbeat_time) + + # Process each closed window sequentially + for window in closed_windows: + self._process_window(window) + + else: + logger.warning(f"Unknown message_type: {message_type}") + + except Exception as e: + logger.error(f"Error processing workload message: {e}", exc_info=True) + + def _process_power_message(self, message_data: dict[str, Any]) -> None: + """Process a power consumption message 
from Kafka. + + Args: + message_data: Raw message data from Kafka (dc.power) + """ + if not self.experiment_manager: + return + + try: + timestamp_str = message_data.get("timestamp") + power_draw = message_data.get("power_draw") + + if timestamp_str and power_draw is not None: + timestamp = datetime.fromisoformat(timestamp_str) + self.experiment_manager.record_actual_power(timestamp, power_draw) + + except Exception as e: + logger.error(f"Error processing power message: {e}", exc_info=True) + + def _process_topology_message(self, message_data: dict[str, Any]) -> None: + """Process a topology message from Kafka. + + Args: + message_data: Raw message data from Kafka + """ + try: + # Parse into TopologySnapshot model + topology_snapshot = TopologySnapshot(**message_data) + + logger.debug( + f"πŸ“‘ Received topology snapshot (timestamp: {topology_snapshot.timestamp})" + ) + + # Update real topology + self.real_topology = topology_snapshot.topology + + # Initialize simulated topology if not set + if self.simulated_topology is None: + # Deep copy so we can modify it independently + self.simulated_topology = copy.deepcopy(self.real_topology) + logger.info("Initialized simulated topology from real topology") + + # Update window manager with new topology + self.window_manager.update_topology(topology_snapshot) + + except Exception as e: + logger.error(f"Error processing topology message: {e}", exc_info=True) + + def _process_topology_update_message(self, message_data: dict[str, Any]) -> None: + """Process a simulated topology update message from Kafka. + + This is called when an operator updates the simulated topology via the API. + The cache is cleared to force new simulations with the updated topology. 
+ + Args: + message_data: Raw message data from Kafka (raw Topology, not TopologySnapshot) + """ + try: + # Parse into Topology model (not TopologySnapshot) + topology = Topology(**message_data) + + logger.info( + f"πŸ”„ Received simulated topology update: {len(topology.clusters)} cluster(s)" + ) + + # Update simulated topology + self.simulated_topology = topology + + # Clear result cache since topology changed + self.result_cache.clear() + logger.info("πŸ—‘οΈ Cleared result cache due to topology update") + + # Log update details + total_hosts = sum(host.count for cluster in topology.clusters for host in cluster.hosts) + logger.info(f" Total hosts: {total_hosts}") + + except Exception as e: + logger.error(f"Error processing topology update message: {e}", exc_info=True) + + def process_message(self, message): + """Process a single Kafka message. + + Args: + message: Kafka message + """ + topic = message.topic + value = message.value + + try: + if topic == self.workload_topic: + self._process_workload_message(value) + elif topic == self.topology_topic: + self._process_topology_message(value) + elif topic == self.sim_topology_topic: + self._process_topology_update_message(value) + elif topic == self.power_topic: + self._process_power_message(value) + else: + logger.warning(f"Unknown topic: {topic}") + + except Exception as e: + logger.error(f"Error processing message from {topic}: {e}", exc_info=True) + + def run(self): + """Run the simulation worker (main event loop).""" + logger.info(f"Starting Simulation Worker '{self.worker_id}'") + logger.info("Waiting for messages...") + + try: + for message in self.consumer: + self.process_message(message) + + except KeyboardInterrupt: + logger.info("Received interrupt signal, shutting down...") + + except Exception as e: + logger.error(f"Error in simulation worker: {e}", exc_info=True) + raise + + finally: + logger.info("Closing Kafka connections...") + self.consumer.close() + self.producer.close() + logger.info("Simulation 
worker stopped") + + +def main(): + """Main entry point.""" + # Load configuration from environment + try: + config = load_config_from_env() + logger.info(f"Loaded configuration for workload: {config.workload}") + except Exception as e: + logger.error(f"Failed to load configuration: {e}") + raise + + # Get Kafka configuration from config file + kafka_bootstrap_servers = config.kafka.bootstrap_servers + workload_topic = config.kafka.topics["workload"].name + topology_topic = config.kafka.topics["topology"].name + sim_topology_topic = config.kafka.topics["sim_topology"].name + power_topic = config.kafka.topics["power"].name + results_topic = config.kafka.topics["results"].name + + # Get simulation configuration + window_size_minutes = config.simulation.window_size_minutes + + logger.info(f"Kafka bootstrap servers: {kafka_bootstrap_servers}") + logger.info(f"Workload topic: {workload_topic}") + logger.info(f"Topology topic: {topology_topic}") + logger.info(f"Simulated topology topic: {sim_topology_topic}") + logger.info(f"Power topic: {power_topic}") + logger.info(f"Results topic: {results_topic}") + logger.info(f"Window size: {window_size_minutes} minutes") + + # Get worker configuration from environment + worker_id = os.getenv("WORKER_ID", "worker-1") + consumer_group = os.getenv("CONSUMER_GROUP", "sim-workers") + + # Debug mode configuration + debug_mode = os.getenv("DEBUG_MODE", "false").lower() in ("true", "1", "yes") + debug_output_dir = os.getenv("DEBUG_OUTPUT_DIR", "/app/output") + + # Experiment mode configuration + experiment_mode = config.simulation.experiment_mode + experiment_name = os.getenv("EXPERIMENT_NAME", "default") + experiment_output_dir = os.getenv("EXPERIMENT_OUTPUT_DIR", "/app/output") + + if debug_mode: + logger.info("=" * 60) + logger.info("πŸ› DEBUG MODE ENABLED") + logger.info(f" Debug files will be written to: {debug_output_dir}") + logger.info("=" * 60) + + if experiment_mode: + logger.info("=" * 60) + logger.info("πŸ§ͺ EXPERIMENT MODE 
ENABLED")
+        logger.info(f"  Experiment: {experiment_name}")
+        logger.info(f"  Results will be written to: {experiment_output_dir}")
+        logger.info("=" * 60)
+
+    # Wait for Kafka to be ready
+    max_retries = 30
+    retry_delay = 2
+
+    for attempt in range(max_retries):
+        try:
+            logger.info(f"Attempting to connect to Kafka (attempt {attempt + 1}/{max_retries})")
+            worker = SimulationWorker(
+                kafka_bootstrap_servers=kafka_bootstrap_servers,
+                worker_id=worker_id,
+                workload_topic=workload_topic,
+                topology_topic=topology_topic,
+                sim_topology_topic=sim_topology_topic,
+                power_topic=power_topic,
+                results_topic=results_topic,
+                window_size_minutes=window_size_minutes,
+                consumer_group=consumer_group,
+                experiment_mode=experiment_mode,
+                experiment_name=experiment_name,
+                experiment_output_dir=experiment_output_dir,
+                debug_mode=debug_mode,
+                debug_output_dir=debug_output_dir,
+            )
+            worker.run()
+            break
+        except Exception as e:
+            if attempt < max_retries - 1:
+                logger.warning(f"Connection failed: {e}. Retrying in {retry_delay}s...")
+                time.sleep(retry_delay)
+            else:
+                logger.error("Failed to connect to Kafka after maximum retries")
+                raise
+
+
+if __name__ == "__main__":
+    main()
diff --git a/services/sim-worker/sim_worker/result_cache.py b/services/sim-worker/sim_worker/result_cache.py
new file mode 100644
index 0000000..79b5c4c
--- /dev/null
+++ b/services/sim-worker/sim_worker/result_cache.py
@@ -0,0 +1,99 @@
+"""Result caching for simulation worker.
+
+Caches simulation results based on inputs (topology + task count) to avoid
+redundant OpenDC invocations when inputs haven't changed.
+"""
+
+import hashlib
+import json
+from dataclasses import dataclass
+from typing import Optional
+
+from opendt_common.models import Topology
+
+from .runner import SimulationResults
+
+
+@dataclass
+class SimulationState:
+    """State used to determine if simulation can be cached. 
+ + Attributes: + topology_hash: SHA256 hash of topology JSON + task_count: Number of tasks in the simulation + """ + + topology_hash: str + task_count: int + + +class ResultCache: + """Simple cache for simulation results to avoid redundant runs. + + Caches the last simulation's inputs (topology hash + task count) and outputs + (SimulationResults). If the next window has identical inputs, the cached + results are reused. + """ + + def __init__(self): + """Initialize empty cache.""" + self.last_state: Optional[SimulationState] = None + self.last_results: Optional[SimulationResults] = None + + def _compute_topology_hash(self, topology: Topology) -> str: + """Compute deterministic hash of topology. + + Args: + topology: Topology to hash + + Returns: + SHA256 hash of sorted JSON representation + """ + topology_dict = topology.model_dump(mode="json") + topology_json = json.dumps(topology_dict, sort_keys=True) + return hashlib.sha256(topology_json.encode()).hexdigest() + + def can_reuse(self, topology: Topology, task_count: int) -> bool: + """Check if cached results can be reused for given inputs. + + Args: + topology: Current topology + task_count: Current cumulative task count + + Returns: + True if cache hit (topology and task count match), False otherwise + """ + if self.last_state is None or self.last_results is None: + return False + + current_hash = self._compute_topology_hash(topology) + + return ( + current_hash == self.last_state.topology_hash + and task_count == self.last_state.task_count + ) + + def get_cached_results(self) -> Optional[SimulationResults]: + """Get cached simulation results. + + Returns: + Cached SimulationResults if available, None otherwise + """ + return self.last_results + + def update(self, topology: Topology, task_count: int, results: SimulationResults) -> None: + """Update cache with new simulation state and results. 
+ + Args: + topology: Topology used in simulation + task_count: Number of tasks in simulation + results: Simulation results to cache + """ + topology_hash = self._compute_topology_hash(topology) + self.last_state = SimulationState(topology_hash=topology_hash, task_count=task_count) + self.last_results = results + + def clear(self) -> None: + """Clear the cache (e.g., when topology changes).""" + self.last_state = None + self.last_results = None diff --git a/services/sim-worker/sim_worker/runner/__init__.py b/services/sim-worker/sim_worker/runner/__init__.py new file mode 100644 index 0000000..75619c4 --- /dev/null +++ b/services/sim-worker/sim_worker/runner/__init__.py @@ -0,0 +1,6 @@ +"""OpenDC runner module for sim-worker.""" + +from .models import SimulationResults, TimeseriesData +from .opendc_runner import OpenDCRunner + +__all__ = ["OpenDCRunner", "SimulationResults", "TimeseriesData"] diff --git a/services/sim-worker/sim_worker/runner/java_home.py b/services/sim-worker/sim_worker/runner/java_home.py new file mode 100644 index 0000000..f5a0bed --- /dev/null +++ b/services/sim-worker/sim_worker/runner/java_home.py @@ -0,0 +1,71 @@ +"""Java home detection utilities.""" + +import logging +import os +import subprocess +from pathlib import Path + +logger = logging.getLogger(__name__) + + +def detect_java_home() -> str: + """Detect JAVA_HOME by finding the Java installation directory. 
+ + Returns: + Path to Java home directory + + Raises: + RuntimeError: If Java cannot be found + """ + # First check if JAVA_HOME is already set + if "JAVA_HOME" in os.environ: + java_home = os.environ["JAVA_HOME"] + if Path(java_home).exists(): + return java_home + + # On macOS, use /usr/libexec/java_home + try: + result = subprocess.run( + ["/usr/libexec/java_home"], + capture_output=True, + text=True, + check=True, + ) + java_home = result.stdout.strip() + if java_home and Path(java_home).exists(): + logger.debug(f"Auto-detected JAVA_HOME (macOS): {java_home}") + return java_home + except (subprocess.CalledProcessError, FileNotFoundError): + pass + + # On Linux, try readlink approach + try: + result = subprocess.run( + ["readlink", "-f", "/usr/bin/java"], + capture_output=True, + text=True, + check=True, + ) + java_binary = result.stdout.strip() + # Java home is the parent of the bin directory + java_home = str(Path(java_binary).parent.parent) + if java_home and Path(java_home).exists(): + logger.debug(f"Auto-detected JAVA_HOME (Linux): {java_home}") + return java_home + except (subprocess.CalledProcessError, FileNotFoundError): + pass + + # Try common Linux paths + common_paths = [ + "/usr/lib/jvm/java-21-openjdk-arm64", + "/usr/lib/jvm/java-21-openjdk-amd64", + "/usr/lib/jvm/default-java", + "/usr/lib/jvm/java-21", + ] + + for path in common_paths: + if Path(path).exists(): + logger.info(f"Found JAVA_HOME at: {path}") + return path + + raise RuntimeError("Could not detect JAVA_HOME. 
Please set the JAVA_HOME environment variable.") diff --git a/services/sim-worker/sim_worker/runner/models.py b/services/sim-worker/sim_worker/runner/models.py new file mode 100644 index 0000000..6c82ad2 --- /dev/null +++ b/services/sim-worker/sim_worker/runner/models.py @@ -0,0 +1,56 @@ +"""Pydantic models for OpenDC simulation results.""" + +from __future__ import annotations + +from typing import Literal + +from pydantic import BaseModel, Field + + +class TimeseriesData(BaseModel): + """Timeseries data point from simulation.""" + + timestamp: int = Field(..., description="Simulation timestamp in milliseconds") + value: float = Field(..., description="Metric value at this timestamp") + + +class SimulationResults(BaseModel): + """Results from an OpenDC simulation run. + + Contains both summary statistics and full timeseries data for power and CPU metrics. + """ + + # Summary statistics + energy_kwh: float = Field( + default=0.0, description="Total energy consumed during simulation (kWh)" + ) + cpu_utilization: float = Field( + default=0.0, description="Average CPU utilization across all hosts (0-1)" + ) + max_power_draw: float = Field(default=0.0, description="Maximum power draw observed (Watts)") + runtime_hours: float = Field(default=0.0, description="Simulated runtime duration (hours)") + status: Literal["success", "error"] = Field(default="success", description="Simulation status") + error: str | None = Field(default=None, description="Error message if status is error") + + # Timeseries data + power_draw_series: list[TimeseriesData] = Field( + default_factory=list, + description="Timeseries of power draw measurements (Watts) over simulation time", + ) + cpu_utilization_series: list[TimeseriesData] = Field( + default_factory=list, + description="Timeseries of CPU utilization measurements (0-1) over simulation time", + ) + + # File paths (for experiment mode) + temp_dir: str | None = Field( + default=None, description="Temporary directory containing OpenDC input 
files" + ) + opendc_output_dir: str | None = Field( + default=None, description="Directory where OpenDC wrote its output files" + ) + + class Config: + """Pydantic config.""" + + frozen = False diff --git a/services/sim-worker/sim_worker/runner/opendc_runner.py b/services/sim-worker/sim_worker/runner/opendc_runner.py new file mode 100644 index 0000000..f9b927b --- /dev/null +++ b/services/sim-worker/sim_worker/runner/opendc_runner.py @@ -0,0 +1,388 @@ +"""OpenDC Experiment Runner wrapper for sim-worker.""" + +from __future__ import annotations + +import json +import logging +import os +import subprocess +from pathlib import Path + +import pandas as pd +import pyarrow as pa +import pyarrow.parquet as pq +from opendt_common.models import Task, Topology + +from .java_home import detect_java_home +from .models import SimulationResults + +logger = logging.getLogger(__name__) + + +class OpenDCRunner: + """Wrapper around OpenDC ExperimentRunner binary. + + This runner handles: + - Converting Task/Fragment models to Parquet files + - Creating OpenDC experiment JSON files + - Invoking the OpenDC binary + - Parsing results from output Parquet files + """ + + def __init__(self, opendc_bin_path: Path | None = None) -> None: + """Initialize the OpenDC runner. + + Args: + opendc_bin_path: Path to OpenDCExperimentRunner binary. + Defaults to /app/opendc/bin/OpenDCExperimentRunner/bin/OpenDCExperimentRunner + """ + if opendc_bin_path is None: + opendc_bin_path = Path( + "/app/opendc/bin/OpenDCExperimentRunner/bin/OpenDCExperimentRunner" + ) + + self.opendc_path = opendc_bin_path + + # Verify the binary exists + if not self.opendc_path.exists(): + raise FileNotFoundError( + f"OpenDC runner not found at {self.opendc_path}. " + "Ensure the OpenDC binaries are mounted into the container." 
+ ) + + # Ensure it's executable + if not os.access(self.opendc_path, os.X_OK): + logger.warning( + f"OpenDC runner not executable, attempting to fix permissions: {self.opendc_path}" + ) + try: + os.chmod(self.opendc_path, 0o755) + except Exception as e: + logger.error(f"Failed to make OpenDC runner executable: {e}") + + logger.info(f"βœ… OpenDC runner initialized: {self.opendc_path}") + + def _create_tasks_parquet(self, tasks: list[Task], output_path: Path) -> None: + """Create tasks.parquet file from Task models.""" + if not tasks: + logger.warning("No tasks provided, creating empty tasks.parquet") + schema = pa.schema( + [ + ("id", pa.int64()), + ("submission_time", pa.int64()), + ("duration", pa.int64()), + ("cpu_count", pa.int32()), + ("cpu_capacity", pa.float64()), + ("mem_capacity", pa.int64()), + ] + ) + table = pa.Table.from_pydict({}, schema=schema) + pq.write_table(table, output_path) + return + + # Create explicit schema (OpenDC requires non-nullable columns) + schema = pa.schema( + [ + ("id", pa.int32(), False), # required (not nullable) + ("submission_time", pa.int64(), False), + ("duration", pa.int64(), False), + ("cpu_count", pa.int32(), False), + ("cpu_capacity", pa.float64(), False), + ("mem_capacity", pa.int64(), False), + ] + ) + + data = { + "id": [t.id for t in tasks], + "submission_time": [int(t.submission_time.timestamp() * 1000) for t in tasks], + "duration": [t.duration for t in tasks], + "cpu_count": [t.cpu_count for t in tasks], + "cpu_capacity": [t.cpu_capacity for t in tasks], + "mem_capacity": [t.mem_capacity for t in tasks], + } + + table = pa.Table.from_pydict(data, schema=schema) + pq.write_table(table, output_path) + logger.debug(f"Created tasks.parquet with {len(tasks)} tasks") + + def _create_fragments_parquet(self, tasks: list[Task], output_path: Path) -> None: + """Create fragments.parquet file from Task models.""" + all_fragments = [] + for task in tasks: + all_fragments.extend(task.fragments) + + if not all_fragments: + 
logger.warning("No fragments provided, creating empty fragments.parquet") + schema = pa.schema( + [ + ("id", pa.int64()), + ("duration", pa.int64()), + ("cpu_count", pa.int32()), + ("cpu_usage", pa.float64()), + ] + ) + table = pa.Table.from_pydict({}, schema=schema) + pq.write_table(table, output_path) + return + + # Create explicit schema (OpenDC requires non-nullable columns) + schema = pa.schema( + [ + ("id", pa.int32(), False), # required (not nullable) + ("duration", pa.int64(), False), + ("cpu_count", pa.int32(), False), + ("cpu_usage", pa.float64(), False), + ] + ) + + data = { + "id": [f.task_id for f in all_fragments], + "duration": [f.duration for f in all_fragments], + "cpu_count": [f.cpu_count for f in all_fragments], + "cpu_usage": [f.cpu_usage for f in all_fragments], + } + + table = pa.Table.from_pydict(data, schema=schema) + pq.write_table(table, output_path) + logger.debug(f"Created fragments.parquet with {len(all_fragments)} fragments") + + def _create_topology_json(self, topology: Topology, output_path: Path) -> None: + """Create topology.json file from Topology model.""" + topology_dict = topology.model_dump(mode="json") + + with open(output_path, "w") as f: + json.dump(topology_dict, f, indent=2) + + logger.debug(f"Created topology.json at {output_path}") + + def _create_experiment_json( + self, + experiment_name: str, + workload_path: Path, + topology_path: Path, + output_path: Path, + opendc_output_folder: str, + ) -> None: + """Create experiment.json file for OpenDC.""" + experiment = { + "name": experiment_name, + "topologies": [{"pathToFile": str(topology_path)}], + "workloads": [{"pathToFile": str(workload_path), "type": "ComputeWorkload"}], + "outputFolder": opendc_output_folder, + "exportModels": [ + { + "exportInterval": 150, + "filesToExport": ["powerSource", "host", "task", "service"], + "computeExportConfig": { + "powerSourceExportColumns": ["energy_usage", "power_draw"] + }, + } + ], + } + + with open(output_path, "w") as f: + 
            json.dump(experiment, f, indent=2)

        logger.debug(f"Created experiment.json at {output_path} (output: {opendc_output_folder})")

    def run_simulation(
        self,
        tasks: list[Task],
        topology: Topology,
        experiment_name: str = "sim-worker-window",
        timeout_seconds: int = 120,
    ) -> SimulationResults:
        """Run OpenDC simulation with given tasks and topology.

        Args:
            tasks: List of Task models (with fragments)
            topology: Topology model
            experiment_name: Name for this simulation run
            timeout_seconds: Maximum time to wait for simulation

        Returns:
            SimulationResults object with summary stats and timeseries data
        """
        logger.info(f"Starting OpenDC simulation: {experiment_name}")
        logger.debug(f"Tasks: {len(tasks)}, Fragments: {sum(len(t.fragments) for t in tasks)}")

        # Create temp directory for this simulation.
        # NOTE(review): name is derived from experiment_name + pid, and exist_ok=True
        # means a re-run in the same process reuses (and may mix with) stale output.
        # The directory is not removed here — presumably the caller cleans it up via
        # results.temp_dir; confirm.
        tmp_dir = Path("/tmp") / f"opendc-{experiment_name}-{os.getpid()}"
        tmp_dir.mkdir(parents=True, exist_ok=True)

        workload_dir = tmp_dir / "workload"
        workload_dir.mkdir(exist_ok=True)

        topology_file = tmp_dir / "topology.json"
        experiment_file = tmp_dir / "experiment.json"

        try:
            # Create input files
            self._create_tasks_parquet(tasks, workload_dir / "tasks.parquet")
            self._create_fragments_parquet(tasks, workload_dir / "fragments.parquet")
            self._create_topology_json(topology, topology_file)

            # Configure OpenDC to write to temp directory
            opendc_output_folder = str(tmp_dir / "output")
            self._create_experiment_json(
                experiment_name, workload_dir, topology_file, experiment_file, opendc_output_folder
            )

            # Run OpenDC
            result = self._execute_opendc(experiment_file, timeout_seconds)

            if result.returncode != 0:
                logger.error(f"OpenDC simulation failed with exit code {result.returncode}")
                logger.error(f"stdout: {result.stdout[:500] if result.stdout else '(empty)'}")
                logger.error(f"stderr: {result.stderr[:500] if result.stderr else '(empty)'}")

                # Prefer stderr, then stdout, then a synthetic message.
                error_msg = result.stderr if result.stderr else result.stdout
                if not error_msg:
                    error_msg = f"OpenDC exited with code {result.returncode}"

                return SimulationResults(status="error", error=error_msg)

            # Parse results
            results = self._parse_results(experiment_name, opendc_output_folder)
            results.temp_dir = str(tmp_dir)

            # Store the actual OpenDC output directory path (in temp directory)
            opendc_output_dir = (
                Path(opendc_output_folder) / experiment_name / "raw-output" / "0" / "seed=0"
            )
            results.opendc_output_dir = str(opendc_output_dir)

            logger.info(f"Simulation complete: {experiment_name}")
            return results

        except Exception as e:
            # Broad catch: any failure in file creation, execution, or parsing is
            # reported as an error result rather than propagated to the caller.
            logger.error(f"Error running OpenDC simulation: {e}", exc_info=True)
            return SimulationResults(status="error", error=str(e))

    def _execute_opendc(
        self, experiment_file: Path, timeout: int
    ) -> subprocess.CompletedProcess[str]:
        """Execute the OpenDC binary.

        Raises:
            TimeoutError: If OpenDC does not finish within `timeout` seconds.
        """
        # Set up environment with JAVA_HOME (OpenDC runner is a JVM launcher script).
        env = os.environ.copy()
        if "JAVA_HOME" not in env:
            env["JAVA_HOME"] = detect_java_home()

        logger.debug(f"Using JAVA_HOME: {env['JAVA_HOME']}")

        # Build command
        command = [str(self.opendc_path), "--experiment-path", str(experiment_file)]
        logger.debug(f"Command: {' '.join(command)}")

        # Execute
        try:
            result = subprocess.run(
                command,
                capture_output=True,
                text=True,
                timeout=timeout,
                env=env,
            )
        except subprocess.TimeoutExpired as e:
            logger.error(f"OpenDC timed out after {timeout}s")
            raise TimeoutError(f"OpenDC simulation timed out after {timeout}s") from e

        logger.debug(f"Exit code: {result.returncode}")
        if result.stdout:
            logger.debug(f"stdout: {result.stdout}")
        if result.stderr:
            logger.debug(f"stderr: {result.stderr}")

        return result

    def _parse_results(
        self, experiment_name: str, opendc_output_folder: str = "/app/output"
    ) -> SimulationResults:
        """Parse OpenDC output files to extract metrics and timeseries data."""
        # OpenDC writes: <folder>/<experiment>/raw-output/0/seed=0/*.parquet
        output_dir = Path(opendc_output_folder) / experiment_name / "raw-output" / "0" / "seed=0"

        logger.debug(f"Looking for output in: {output_dir}")

        if not output_dir.exists():
            logger.warning(f"Output directory not found: {output_dir}")
            return SimulationResults(status="error", error="Output directory not found")

        try:
            from .models import TimeseriesData

            # Read power data (timeseries + summary)
            power_file = output_dir / "powerSource.parquet"
            power_series = []
            energy_kwh = 0.0
            max_power = 0.0

            if power_file.exists():
                power_df = pd.read_parquet(power_file)

                # Extract timeseries
                if "timestamp" in power_df.columns and "power_draw" in power_df.columns:
                    for _, row in power_df.iterrows():
                        power_series.append(
                            TimeseriesData(
                                timestamp=int(row["timestamp"]),
                                value=float(row["power_draw"]),
                            )
                        )

                    # Calculate summary stats
                    # NOTE(review): "energy_usage" column is accessed without a presence
                    # check; a missing column raises KeyError and is swallowed by the
                    # outer except into an error result — confirm intended.
                    # J (energy_usage) -> kWh via / 3_600_000.
                    energy_kwh = power_df["energy_usage"].sum() / 3_600_000
                    max_power = float(power_df["power_draw"].max())
            else:
                logger.warning("powerSource.parquet not found")

            # Read host data for CPU utilization (timeseries + summary)
            # host_file = output_dir / "host.parquet"
            # cpu_series = []
            cpu_util = 0.0

            # if host_file.exists():
            #     host_df = pd.read_parquet(host_file)

            #     if "cpu_utilization" in host_df.columns:
            #         # Extract timeseries
            #         if "timestamp" in host_df.columns:
            #             for _, row in host_df.iterrows():
            #                 cpu_series.append(
            #                     TimeseriesData(
            #                         timestamp=int(row["timestamp"]),
            #                         value=float(row["cpu_utilization"]),
            #                     )
            #                 )

            #         # Calculate summary stat (mean across all hosts and time)
            #         cpu_util = float(host_df["cpu_utilization"].mean())
            # else:
            #     logger.warning("host.parquet not found")

            # Read service data for runtime
            service_file = output_dir / "service.parquet"
            runtime_hours = 0.0

            if service_file.exists():
                service_df = pd.read_parquet(service_file)
                if "timestamp" in service_df.columns and len(service_df) > 0:
                    # Runtime = span between first and last service timestamps (ms).
                    runtime_ms = service_df["timestamp"].max() - service_df["timestamp"].min()
                    runtime_hours = float(runtime_ms) / (1000 * 3600)
            else:
                logger.warning("service.parquet not found")

            logger.debug(f"Parsed {len(power_series)} power points")

            return SimulationResults(
                energy_kwh=round(float(energy_kwh), 4),
                cpu_utilization=round(float(cpu_util), 3),
                max_power_draw=round(float(max_power), 1),
                runtime_hours=round(float(runtime_hours), 2),
                status="success",
                power_draw_series=power_series,
                cpu_utilization_series=[],
            )

        except Exception as e:
            logger.error(f"Error parsing OpenDC results: {e}", exc_info=True)
            return SimulationResults(status="error", error=str(e))
diff --git a/services/sim-worker/sim_worker/window_manager.py b/services/sim-worker/sim_worker/window_manager.py
new file mode 100644
index 0000000..d0da394
--- /dev/null
+++ b/services/sim-worker/sim_worker/window_manager.py
@@ -0,0 +1,318 @@
"""Time window manager for aggregating tasks into simulation windows."""

from __future__ import annotations

import logging
from dataclasses import dataclass, field
from datetime import datetime, timedelta

from opendt_common.models import Task, Topology, TopologySnapshot

logger = logging.getLogger(__name__)


@dataclass
class TimeWindow:
    """Represents a fixed time window for task aggregation.

    Windows are closed when:
    - A task arrives with submission_time >= window_end
    - The window is explicitly closed
    """

    window_id: int            # sequential index, 0-based from the first task's window
    window_start: datetime    # inclusive lower bound
    window_end: datetime      # exclusive upper bound (window_start + window size)
    tasks: list[Task] = field(default_factory=list)
    topology: Topology | None = None
    is_closed: bool = False

    def add_task(self, task: Task) -> bool:
        """Add a task to this window.

        Args:
            task: Task to add

        Returns:
            True if task was added, False if task is outside window bounds
        """
        # Reject additions once the window has been finalized.
        if self.is_closed:
            logger.warning(f"Attempted to add task to closed window {self.window_id}")
            return False

        # Task predates this window — the caller routed it to the wrong window.
        if task.submission_time < self.window_start:
            logger.warning(
                f"Task {task.id} submission time {task.submission_time} "
                f"is before window start {self.window_start}"
            )
            return False

        # Task belongs to a later window; not an error, so no warning here.
        if task.submission_time >= self.window_end:
            return False

        self.tasks.append(task)
        return True

    def update_topology(self, topology: Topology) -> None:
        """Update the topology for this window.

        Args:
            topology: New topology to use
        """
        self.topology = topology
        logger.debug(f"Updated topology for window {self.window_id}")

    def close(self) -> None:
        """Mark this window as closed and ready for processing."""
        # Idempotent: re-closing an already-closed window does nothing (no log).
        if not self.is_closed:
            self.is_closed = True
            logger.info(
                f"πŸ”’ Closed window {self.window_id} "
                f"[{self.window_start} - {self.window_end}] "
                f"with {len(self.tasks)} tasks"
            )

    def __repr__(self) -> str:
        return (
            f"TimeWindow(id={self.window_id}, "
            f"start={self.window_start}, end={self.window_end}, "
            f"tasks={len(self.tasks)}, closed={self.is_closed})"
        )


class WindowManager:
    """Manages time-based windowing of tasks for simulation.

    The manager:
    - Creates windows based on event time (task submission_time)
    - Aggregates tasks into appropriate windows
    - Tracks topology updates per window
    - Closes windows when new tasks arrive after window boundary
    - Maintains history of all windows for cumulative simulation
    """

    def __init__(self, window_size_minutes: int = 5) -> None:
        """Initialize the window manager.

        Args:
            window_size_minutes: Size of each window in minutes
        """
        self.window_size_minutes = window_size_minutes
        self.window_size = timedelta(minutes=window_size_minutes)

        # State
        self.windows: dict[int, TimeWindow] = {}          # window_id -> window (kept forever)
        self.first_task_time: datetime | None = None      # minute-rounded anchor for window 0
        self.latest_topology: Topology | None = None      # applied to newly created windows

        logger.info(f"Initialized WindowManager with {window_size_minutes}-minute windows")

    def _round_down_to_minute(self, dt: datetime) -> datetime:
        """Round datetime down to the start of the minute.

        Args:
            dt: Datetime to round

        Returns:
            Datetime rounded down to start of minute (seconds/microseconds = 0)
        """
        return dt.replace(second=0, microsecond=0)

    def _create_window(self, window_id: int, start_time: datetime) -> TimeWindow:
        """Create a new time window.

        Args:
            window_id: ID for the window
            start_time: Start time of the window (should be rounded to minute)

        Returns:
            New TimeWindow instance
        """
        window_start = start_time
        window_end = window_start + self.window_size

        # New windows inherit the most recently seen topology (may be None).
        window = TimeWindow(
            window_id=window_id,
            window_start=window_start,
            window_end=window_end,
            topology=self.latest_topology,
        )

        self.windows[window_id] = window
        logger.info(f"πŸ“¦ Created window {window_id}: [{window_start} - {window_end})")

        return window

    def _get_or_create_window_for_time(self, timestamp: datetime) -> TimeWindow:
        """Get or create the appropriate window for a given timestamp.

        Args:
            timestamp: Timestamp to find window for

        Returns:
            TimeWindow that should contain this timestamp
        """
        # Round down to minute
        rounded_time = self._round_down_to_minute(timestamp)

        # If this is the first task, initialize the first window
        if self.first_task_time is None:
            self.first_task_time = rounded_time
            return self._create_window(window_id=0, start_time=rounded_time)

        # Check if timestamp belongs to an existing window
        # NOTE(review): linear scan over all windows ever created; fine for small
        # counts, O(n) per task otherwise.
        for window in self.windows.values():
            if window.window_start <= timestamp < window.window_end:
                return window

        # Calculate which window this timestamp belongs to
        time_since_first = timestamp - self.first_task_time
        window_index = int(time_since_first.total_seconds() // (self.window_size_minutes * 60))
        window_start = self.first_task_time + timedelta(
            minutes=window_index * self.window_size_minutes
        )

        # Create the window if it doesn't exist
        return self._create_window(window_id=window_index, start_time=window_start)

    def add_task(self, task: Task) -> None:
        """Add a task to the appropriate window.

        Does NOT close windows - that's done via heartbeat messages.

        Args:
            task: Task to add
        """
        # Get or create the appropriate window
        window = self._get_or_create_window_for_time(task.submission_time)

        # Add task to the target window
        if window.add_task(task):
            logger.debug(
                f"Added task {task.id} (submitted {task.submission_time}) "
                f"to window {window.window_id}"
            )
        else:
            logger.warning(
                f"Failed to add task {task.id} to window {window.window_id}. "
                f"Task time: {task.submission_time}, "
                f"Window: [{window.window_start} - {window.window_end})"
            )

    def update_topology(self, topology_snapshot: TopologySnapshot) -> None:
        """Update the current topology.

        This topology will be used for all new windows and updated in any open windows.

        Args:
            topology_snapshot: New topology snapshot from Kafka
        """
        self.latest_topology = topology_snapshot.topology
        logger.debug(f"πŸ“‘ Updated topology (snapshot time: {topology_snapshot.timestamp})")

        # Update topology in all open windows
        for window in self.windows.values():
            if not window.is_closed:
                window.update_topology(self.latest_topology)

    def close_windows_before(self, timestamp: datetime) -> list[TimeWindow]:
        """Close all open windows that end before the given timestamp.

        Creates intermediate empty windows if needed (e.g., if a heartbeat arrives
        much later than the last task, all intermediate windows are created and closed).

        Returns closed windows in chronological order (window_id 0, 1, 2, ...).

        Args:
            timestamp: Timestamp to check against

        Returns:
            List of closed windows in chronological order
        """
        # If no windows exist yet, nothing to close
        if not self.windows or self.first_task_time is None:
            return []

        # Calculate the latest window ID that should exist before this timestamp
        time_since_first = timestamp - self.first_task_time
        latest_window_id = int(time_since_first.total_seconds() // (self.window_size_minutes * 60))

        # Create any missing intermediate windows
        # (e.g., if last window is 1 but heartbeat is at window 6, create windows 2-6)
        existing_window_ids = set(self.windows.keys())
        max_existing_id = max(existing_window_ids) if existing_window_ids else -1

        for window_id in range(max_existing_id + 1, latest_window_id + 1):
            window_start = self.first_task_time + timedelta(
                minutes=window_id * self.window_size_minutes
            )
            # Only create if the window ends before or at the timestamp
            # (the window containing `timestamp` itself is deliberately NOT created).
            window_end = window_start + self.window_size
            if window_end <= timestamp:
                self._create_window(window_id=window_id, start_time=window_start)

        # Now close all windows that end before the timestamp
        closed_windows = []
        for window_id in sorted(self.windows.keys()):
            window = self.windows[window_id]

            if not window.is_closed and window.window_end <= timestamp:
                window.close()
                closed_windows.append(window)

        return closed_windows

    def get_all_tasks_up_to_window(self, window_id: int) -> list[Task]:
        """Get all tasks from window 0 up to and including the specified window.

        This is used for cumulative simulation runs.

        Args:
            window_id: Window ID to aggregate up to

        Returns:
            List of all tasks from windows 0..window_id
        """
        all_tasks = []
        for wid in range(window_id + 1):
            # Gaps are allowed: missing window ids are simply skipped.
            if wid in self.windows:
                all_tasks.extend(self.windows[wid].tasks)

        logger.debug(f"Aggregated {len(all_tasks)} tasks from windows 0..{window_id}")
        return all_tasks

    def get_window(self, window_id: int) -> TimeWindow | None:
        """Get a specific window by ID.

        Args:
            window_id: Window ID

        Returns:
            TimeWindow or None if not found
        """
        return self.windows.get(window_id)

    def get_closed_windows(self) -> list[TimeWindow]:
        """Get all closed windows.

        Returns:
            List of closed windows, sorted by window_id
        """
        closed = [w for w in self.windows.values() if w.is_closed]
        return sorted(closed, key=lambda w: w.window_id)

    def get_stats(self) -> dict[str, int]:
        """Get statistics about the window manager state.

        Returns:
            Dictionary with statistics
        """
        open_windows = [w for w in self.windows.values() if not w.is_closed]
        closed_windows = [w for w in self.windows.values() if w.is_closed]

        return {
            "total_windows": len(self.windows),
            "open_windows": len(open_windows),
            "closed_windows": len(closed_windows),
            "total_tasks": sum(len(w.tasks) for w in self.windows.values()),
        }
diff --git a/services/sim-worker/tests/__init__.py b/services/sim-worker/tests/__init__.py
new file mode 100644
index 0000000..78b09bf
--- /dev/null
+++ b/services/sim-worker/tests/__init__.py
@@ -0,0 +1 @@
"""Test suite for sim-worker service."""
diff --git a/services/sim-worker/tests/conftest.py b/services/sim-worker/tests/conftest.py
new file mode 100644
index 0000000..5b5cb39
--- /dev/null
+++ b/services/sim-worker/tests/conftest.py
@@ -0,0 +1,95 @@
"""Pytest configuration and fixtures for sim-worker tests."""

from datetime import UTC, datetime
from pathlib import Path

import pytest
from opendt_common.models import CPU, Cluster, CPUPowerModel, Fragment, Host, Memory, Task, Topology


@pytest.fixture
def opendc_bin_path() -> Path:
    """Get the path to the OpenDC binary.

    Returns:
        Path to OpenDCExperimentRunner binary

    Raises:
        pytest.skip: If OpenDC binary is not found
    """
    # Try to find OpenDC binary in the service directory
    # (local checkout layout first, then the container path).
    possible_paths = [
        Path(__file__).parent.parent
        / "opendc"
        / "bin"
        / "OpenDCExperimentRunner"
        / "bin"
        / "OpenDCExperimentRunner",
        Path("/app/opendc/bin/OpenDCExperimentRunner/bin/OpenDCExperimentRunner"),
    ]

    for path in possible_paths:
        if path.exists():
            return path

    # Skip (not fail) integration tests when the binary is absent.
    pytest.skip("OpenDC binary not found. Please ensure OpenDC binaries are available for testing.")


@pytest.fixture
def simple_task() -> Task:
    """Create a single simple task for testing."""
    base_time = datetime(2022, 10, 7, 0, 0, 0, tzinfo=UTC)

    return Task(
        id=1,
        submission_time=base_time,
        duration=5000,  # 5 seconds
        cpu_count=4,
        cpu_capacity=2400.0,
        mem_capacity=8000,
        fragments=[
            Fragment(
                id=1,
                duration=5000,
                cpu_count=4,
                cpu_usage=50.0,
            )
        ],
    )


@pytest.fixture
def simple_topology() -> Topology:
    """Create a minimal topology for testing."""
    # One cluster, one 8-core host with an asymptotic CPU power model.
    return Topology(
        clusters=[
            Cluster(
                name="test-cluster",
                hosts=[
                    Host(
                        count=1,
                        name="test-host",
                        memory=Memory(memorySize=32000),
                        cpu=CPU(coreCount=8, coreSpeed=2400),
                        cpuPowerModel=CPUPowerModel(
                            modelType="asymptotic",
                            power=200.0,
                            idlePower=50.0,
                            maxPower=250.0,
                            asymUtil=0.5,
                        ),
                    )
                ],
            )
        ]
    )


@pytest.fixture
def base_time() -> datetime:
    """Get a base timestamp for testing.

    Returns:
        A fixed datetime for reproducible tests
    """
    # NOTE(review): naive (no tzinfo), unlike the UTC-aware times used elsewhere
    # in these tests — confirm this is intentional.
    return datetime(2024, 1, 1, 0, 0, 0)
diff --git a/services/sim-worker/tests/test_contiguous_windows.py b/services/sim-worker/tests/test_contiguous_windows.py
new file mode 100644
index 0000000..6a73484
--- /dev/null
+++ b/services/sim-worker/tests/test_contiguous_windows.py
@@ -0,0 +1,184 @@
"""Test contiguous window behavior."""

from datetime import UTC, datetime

import pytest
from opendt_common.models import CPU, Cluster, CPUPowerModel, Fragment, Host, Memory, Task, Topology
from sim_worker.window_manager import WindowManager


@pytest.fixture
def sample_topology() -> Topology:
    """Create a sample topology for testing."""
    return Topology(
        clusters=[
            Cluster(
                name="test-cluster",
                hosts=[
                    Host(
                        count=1,
                        name="test-host",
                        memory=Memory(memorySize=32000),
                        cpu=CPU(coreCount=8, coreSpeed=2400),
                        cpuPowerModel=CPUPowerModel(
                            modelType="asymptotic",
                            power=200.0,
                            idlePower=50.0,
                            maxPower=250.0,
                            asymUtil=0.5,
                        ),
                    )
                ],
            )
        ]
    )


def test_contiguous_windows_basic(sample_topology):
    """Test that windows are created contiguously.

    If first task at 22:00:03, window should be 22:00-22:05.
    If next task at 22:31, windows should be created for:
    - 22:00-22:05 (has task)
    - 22:05-22:10 (empty, closed)
    - 22:10-22:15 (empty, closed)
    - 22:15-22:20 (empty, closed)
    - 22:20-22:25 (empty, closed)
    - 22:25-22:30 (empty, closed)
    - 22:30-22:35 (has task, open)
    """
    wm = WindowManager(window_size_minutes=5)
    closed_windows = []

    def on_window_closed(window):
        closed_windows.append(window)

    # NOTE(review): the WindowManager added in this same change set defines no
    # register_window_closed_callback method, and its add_task() docstring says it
    # does NOT close windows (heartbeats do). These tests therefore target a
    # different/newer WindowManager API — confirm which side is current.
    wm.register_window_closed_callback(on_window_closed)

    # First task at 22:00:03
    task1 = Task(
        id=1,
        submission_time=datetime(2024, 1, 1, 22, 0, 3, tzinfo=UTC),
        duration=5000,
        cpu_count=4,
        cpu_capacity=2400.0,
        mem_capacity=8000,
        fragments=[Fragment(id=1, duration=5000, cpu_count=4, cpu_usage=50.0)],
    )

    wm.add_task(task1)

    # Should have created window 0 (22:00-22:05)
    assert 0 in wm.windows
    assert wm.windows[0].window_start == datetime(2024, 1, 1, 22, 0, tzinfo=UTC)
    assert wm.windows[0].window_end == datetime(2024, 1, 1, 22, 5, tzinfo=UTC)
    assert len(wm.windows[0].tasks) == 1
    assert not wm.windows[0].is_closed

    # Second task at 22:31 - should create and close windows 0-5, create window 6
    task2 = Task(
        id=2,
        submission_time=datetime(2024, 1, 1, 22, 31, 0, tzinfo=UTC),
        duration=5000,
        cpu_count=4,
        cpu_capacity=2400.0,
        mem_capacity=8000,
        fragments=[Fragment(id=2, duration=5000, cpu_count=4, cpu_usage=50.0)],
    )

    wm.add_task(task2)

    # Should have created windows 0-6
    assert len(wm.windows) == 7, f"Expected 7 windows, got {len(wm.windows)}"

    # Window 0 (22:00-22:05) - closed, has 1 task
    assert wm.windows[0].is_closed
    assert len(wm.windows[0].tasks) == 1

    # Windows 1-5 (22:05-22:30) - closed, empty
    for i in range(1, 6):
        assert i in wm.windows, f"Window {i} should exist"
        assert wm.windows[i].is_closed, f"Window {i} should be closed"
        assert len(wm.windows[i].tasks) == 0, f"Window {i} should be empty"

    # Window 6 (22:30-22:35) - open, has 1 task
    assert 6 in wm.windows
    assert not wm.windows[6].is_closed
    assert len(wm.windows[6].tasks) == 1

    # Check window time ranges
    for i in range(7):
        expected_start = datetime(2024, 1, 1, 22, 0, tzinfo=UTC).replace(minute=i * 5)
        expected_end = expected_start.replace(minute=(i + 1) * 5)
        assert wm.windows[i].window_start == expected_start
        assert wm.windows[i].window_end == expected_end

    # Check that 6 windows were closed via callback (windows 0-5)
    # Note: order might vary (intermediate windows closed during creation, window 0 closed by add_task)
    assert len(closed_windows) == 6
    closed_window_ids = sorted([w.window_id for w in closed_windows])
    assert closed_window_ids == [0, 1, 2, 3, 4, 5]

    print("βœ… Contiguous windows test passed!")
    print(f" Created {len(wm.windows)} windows total")
    print(f" Closed {len(closed_windows)} windows")
    print(" Windows 0-5 closed, window 6 open")


def test_empty_windows_are_simulated(sample_topology):
    """Test that empty windows still trigger simulation callbacks."""
    wm = WindowManager(window_size_minutes=5)
    closed_windows = []

    def on_window_closed(window):
        closed_windows.append(window)

    # NOTE(review): see test_contiguous_windows_basic — this callback API is not
    # present on the WindowManager added in this change set.
    wm.register_window_closed_callback(on_window_closed)

    # Task at 22:00
    task1 = Task(
        id=1,
        submission_time=datetime(2024, 1, 1, 22, 0, 0, tzinfo=UTC),
        duration=5000,
        cpu_count=4,
        cpu_capacity=2400.0,
        mem_capacity=8000,
        fragments=[Fragment(id=1, duration=5000, cpu_count=4, cpu_usage=50.0)],
    )
    wm.add_task(task1)

    # Task at 22:15 - should close windows 0, 1, 2 and create window 3
    task2 = Task(
        id=2,
        submission_time=datetime(2024, 1, 1, 22, 15, 0, tzinfo=UTC),
        duration=5000,
        cpu_count=4,
        cpu_capacity=2400.0,
        mem_capacity=8000,
        fragments=[Fragment(id=2, duration=5000, cpu_count=4, cpu_usage=50.0)],
    )
    wm.add_task(task2)

    # Should have windows 0-3
    assert len(wm.windows) == 4

    # Window 0 has task, windows 1-2 are empty
    assert len(wm.windows[0].tasks) == 1
    assert len(wm.windows[1].tasks) == 0
    assert len(wm.windows[2].tasks) == 0
    assert len(wm.windows[3].tasks) == 1

    # All except window 3 should be closed
    assert wm.windows[0].is_closed
    assert wm.windows[1].is_closed
    assert wm.windows[2].is_closed
    assert not wm.windows[3].is_closed

    # Callbacks should have been triggered for windows 0, 1, 2
    assert len(closed_windows) == 3

    print("βœ… Empty windows simulation test passed!")
    print(f" Window 0: {len(wm.windows[0].tasks)} task(s)")
    print(f" Window 1: {len(wm.windows[1].tasks)} task(s) (empty)")
    print(f" Window 2: {len(wm.windows[2].tasks)} task(s) (empty)")
    print(f" Window 3: {len(wm.windows[3].tasks)} task(s)")
diff --git a/services/sim-worker/tests/test_opendc_simple.py b/services/sim-worker/tests/test_opendc_simple.py
new file mode 100644
index 0000000..e100cb8
--- /dev/null
+++ b/services/sim-worker/tests/test_opendc_simple.py
@@ -0,0 +1,71 @@
"""Simple unit test to verify OpenDC invocation and execution time."""

import time

from sim_worker.runner import OpenDCRunner, SimulationResults


def test_opendc_invocation_speed(opendc_bin_path, simple_task, simple_topology):
    """Test that OpenDC is invoked correctly and completes within 5 seconds.

    This is a simple integration test that verifies:
    1. The OpenDC binary can be invoked
    2. It processes a simple workload
    3. It completes within 5 seconds
    """
    runner = OpenDCRunner(opendc_bin_path)

    # NOTE(review): wall-clock timing — may be flaky on loaded CI machines.
    start_time = time.time()
    result = runner.run_simulation(
        tasks=[simple_task],
        topology=simple_topology,
        experiment_name="speed-test",
        timeout_seconds=5,
    )
    elapsed = time.time() - start_time

    # Verify it completed within 5 seconds
    assert elapsed < 5.0, f"OpenDC took {elapsed:.2f}s, expected < 5.0s"

    # Verify we got a SimulationResults object
    # (an "error" status still counts as a successful invocation here).
    assert isinstance(result, SimulationResults)
    assert result.status in ["success", "error"]

    print(f"βœ… OpenDC completed in {elapsed:.3f}s")
    print(f" Status: {result.status}")
    if result.status == "error":
        print(f" Error: {result.error or 'Unknown'}")


def test_opendc_with_valid_workload(opendc_bin_path, simple_task, simple_topology):
    """Test that OpenDC successfully processes a valid workload."""
    runner = OpenDCRunner(opendc_bin_path)

    result = runner.run_simulation(
        tasks=[simple_task],
        topology=simple_topology,
        experiment_name="valid-workload-test",
        timeout_seconds=5,
    )

    # Check result structure
    assert isinstance(result, SimulationResults)
    assert result.status in ["success", "error"]

    # If successful, verify we got metrics and timeseries
    if result.status == "success":
        assert result.energy_kwh >= 0
        assert result.max_power_draw >= 0
        assert isinstance(result.power_draw_series, list)
        assert isinstance(result.cpu_utilization_series, list)

        print("βœ… Simulation successful:")
        print(f" Energy: {result.energy_kwh} kWh")
        print(f" Max Power: {result.max_power_draw} W")
        print(f" CPU Util: {result.cpu_utilization}")
        print(f" Power timeseries points: {len(result.power_draw_series)}")
        print(f" CPU timeseries points: {len(result.cpu_utilization_series)}")
    else:
        # Print error for debugging
        print("⚠️ Simulation returned error status:")
        print(f" Error: {result.error or 'Unknown'}")
diff --git a/src/opendt/__init__.py b/src/opendt/__init__.py
deleted file mode 100644
index d420ce8..0000000 --- a/src/opendt/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -"""OpenDT package initialization.""" -from __future__ import annotations - -from pathlib import Path - -from dotenv import load_dotenv - -# Load environment variables early so downstream modules can rely on them. -load_dotenv() # fall back to default discovery (useful for tooling) -load_dotenv(Path(__file__).resolve().parents[1] / ".env", override=False) - -from .app import create_app - -__all__ = ["create_app"] diff --git a/src/opendt/adapters/__init__.py b/src/opendt/adapters/__init__.py deleted file mode 100644 index e971391..0000000 --- a/src/opendt/adapters/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Infrastructure adapters for external systems.""" diff --git a/src/opendt/adapters/ingestion/__init__.py b/src/opendt/adapters/ingestion/__init__.py deleted file mode 100644 index fbc4614..0000000 --- a/src/opendt/adapters/ingestion/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Kafka and streaming ingestion adapters.""" diff --git a/src/opendt/adapters/ingestion/kafka/__init__.py b/src/opendt/adapters/ingestion/kafka/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/opendt/adapters/ingestion/kafka/consumer.py b/src/opendt/adapters/ingestion/kafka/consumer.py deleted file mode 100644 index 529a333..0000000 --- a/src/opendt/adapters/ingestion/kafka/consumer.py +++ /dev/null @@ -1,187 +0,0 @@ -"""Kafka consumer that assembles workload windows.""" -from __future__ import annotations - -import json -import logging -import threading -import time -from collections import deque -from typing import Deque, Dict - -import pandas as pd -from kafka import KafkaConsumer - -from ....config.settings import REAL_WINDOW_SIZE_SEC, VIRTUAL_WINDOW_SIZE - -logger = logging.getLogger(__name__) - - -def kafka_serializer(message: bytes) -> Dict: - return json.loads(message.decode()) - - -class DigitalTwinConsumer: - """Consumes tasks and fragments from Kafka and creates processing 
windows.""" - - def __init__(self, bootstrap_servers: str, kafka_group_id: str) -> None: - self.bootstrap_servers = bootstrap_servers - self.tasks_buffer: Deque = deque(maxlen=2000) - self.tasks_df = pd.DataFrame() - self.fragments_buffer: Deque = deque(maxlen=10000) - self.stop_consuming = threading.Event() - self.kafka_group_id = kafka_group_id - self.tasks_lock = threading.Lock() - self.fragments_lock = threading.Lock() - - self.windows_lock = threading.Condition() - self.windows: Deque = deque(maxlen=50) - - def process_windows(self): - logger.info("πŸ“₯ Starting Kafka consumers...") - - tasks_thread = threading.Thread(target=self.consume_tasks, daemon=True) - fragments_thread = threading.Thread(target=self.consume_fragments, daemon=True) - - tasks_thread.start() - fragments_thread.start() - - time.sleep(5) - - window_count = 0 - first_wait_for_win = True - while not self.stop_consuming.is_set(): - batch_data = self.create_batch(window_count + 1) - if batch_data: - window_count += 1 - first_wait_for_win = True - - if batch_data['task_count'] > 0 or batch_data['fragment_count'] > 0: - yield batch_data - - wait_time = VIRTUAL_WINDOW_SIZE if first_wait_for_win else 0.5 - if not self.stop_consuming.wait(wait_time): - first_wait_for_win = False - continue - break - - def __add_to_window(self, data: Dict, list_name: str): - sub_time = pd.to_datetime(data["submission_time"]) - - window = None - for i in range(0, len(self.windows)): - curr_window = self.windows[i] - if sub_time >= curr_window["start"] and sub_time <= curr_window["end"]: - window = curr_window - break - if sub_time >= curr_window["end"]: - curr_window["ready"] = True - else: - logger.error(f"Anomaly found for {list_name}!") - - if not window: - window = { - "start": sub_time, - "end": sub_time + pd.Timedelta(seconds=REAL_WINDOW_SIZE_SEC), - "tasks": [], - "fragments": [], - "ready": False, - } - - self.windows.append(window) - - window[list_name].append(data) - return window - - def consume_tasks(self) 
-> None: - try: - consumer = KafkaConsumer( - 'tasks', - bootstrap_servers=self.bootstrap_servers, - value_deserializer=kafka_serializer, - key_deserializer=kafka_serializer, - ) - - for message in consumer: - if self.stop_consuming.is_set(): - break - - task_data = message.key | message.value - - with self.windows_lock: - self.__add_to_window(task_data, "tasks") - - except Exception as exc: # pragma: no cover - defensive logging path - logger.error("Task consumer error: %s", exc) - - def consume_fragments(self) -> None: - try: - consumer = KafkaConsumer( - 'fragments', - bootstrap_servers=self.bootstrap_servers, - value_deserializer=kafka_serializer, - key_deserializer=kafka_serializer, - ) - - for message in consumer: - if self.stop_consuming.is_set(): - break - - fragment_data = message.key | message.value - - with self.windows_lock: - self.__add_to_window(fragment_data, "fragments") - - except Exception as exc: # pragma: no cover - defensive logging path - logger.error("Fragment consumer error: %s", exc) - - def create_batch(self, window_number: int): - with self.windows_lock: - if len(self.windows) == 0 or not self.windows[0]["ready"]: - return None - - window = self.windows.popleft() - - self.tasks_df = pd.concat([self.tasks_df, pd.DataFrame(window["tasks"])], ignore_index=True) - frags_df = pd.DataFrame(window["fragments"]) - - curr_tasks_df = pd.DataFrame() - avg_cpu_usage = 0.0 - if not self.tasks_df.empty and not frags_df.empty: - frags = len(frags_df['id'].unique()) - logger.info(f"{frags} tasks should be!") - - frags_df["submission_time"] = pd.to_datetime(frags_df["submission_time"]) - - window_start = window["start"] - window_end = window["end"] - logger.info(f"wstart: {window_start}, wend: {window_end}") - if (window_end - window_start).total_seconds() > REAL_WINDOW_SIZE_SEC: - logger.error("Window is larger than expected, wsize in seconds = %s", (window_end - window_start).total_seconds()) - - curr_tasks_df = 
self.tasks_df[self.tasks_df["id"].isin(frags_df["id"])] - - assert len(curr_tasks_df) == len(frags_df["id"].unique()) - - avg_cpu_usage = frags_df['cpu_usage'].mean() - - task_count = len(curr_tasks_df) - fragment_count = len(frags_df) - - batch_data = { - 'task_count': task_count, - 'fragment_count': fragment_count, - 'avg_cpu_usage': avg_cpu_usage, - 'timestamp': time.time(), - 'window_number': window_number, - 'window_info': f"Window {window_number}: {task_count} tasks, {fragment_count} fragments", - "window_start": window["start"], - "window_end": window["end"], - 'tasks_sample': curr_tasks_df.to_dict(orient='records'), - 'fragments_sample': frags_df.to_dict(orient='records'), - } - - logger.info("πŸ“Š Window %s: %s tasks, %s fragments", window_number, task_count, fragment_count) - return batch_data - - def stop(self) -> None: - self.stop_consuming.set() diff --git a/src/opendt/adapters/ingestion/kafka/producer.py b/src/opendt/adapters/ingestion/kafka/producer.py deleted file mode 100644 index 791273d..0000000 --- a/src/opendt/adapters/ingestion/kafka/producer.py +++ /dev/null @@ -1,151 +0,0 @@ -"""Kafka producer streaming telemetry traces with real-time pacing.""" -from __future__ import annotations - -import json -import logging -import threading -from datetime import datetime -from time import sleep -from typing import Optional - -import pandas as pd -from kafka import KafkaProducer - -from ....config.settings import TIME_SCALE - -logger = logging.getLogger(__name__) - - -class TimedKafkaProducer: - """Streams parquet data to Kafka with paced windows.""" - - def __init__(self, bootstrap_servers: str) -> None: - self.bootstrap_servers = bootstrap_servers - self.producer: Optional[KafkaProducer] = None - self.stop_streaming = threading.Event() - self.start_time: Optional[datetime] = None - self.start_streaming_barrier = threading.Barrier(parties=2) - - def connect(self) -> None: - self.producer = KafkaProducer( - bootstrap_servers=self.bootstrap_servers, - 
key_serializer=lambda k: json.dumps(k).encode(), - value_serializer=lambda v: json.dumps(v).encode(), - ) - logger.info("πŸ“‘ Connected to Kafka: %s", self.bootstrap_servers) - - def tasks_streaming_thread(self, tasks: pd.DataFrame, start_time: pd.Timestamp) -> None: - self.start_streaming_barrier.wait() - logger.info("Started streaming tasks") - - last_submission_time = start_time - for index, row in tasks.iterrows(): - if self.stop_streaming.is_set(): - return - - key = {'id': int(row['id'])} - value = { - 'submission_time': row['submission_time'].isoformat(), - 'duration': int(row['duration']), - 'cpu_count': int(row['cpu_count']), - 'cpu_capacity': float(row['cpu_capacity']), - 'mem_capacity': int(row['mem_capacity']), - } - - submission_time = row["submission_time"] - if submission_time > last_submission_time: - sleep_time = (submission_time - last_submission_time).total_seconds() - sleep_time_virt = sleep_time * TIME_SCALE - sleep(sleep_time_virt) - - self.producer.send("tasks", key=key, value=value) - - if index % 20 == 0: - self.producer.flush() - - last_submission_time = submission_time - - def fragments_streaming_thread(self, frags: pd.DataFrame, start_time: pd.Timestamp) -> None: - self.start_streaming_barrier.wait() - logger.info("Started streaming fragments") - - last_submission_time = start_time - - for index, row in frags.iterrows(): - if self.stop_streaming.is_set(): - return - - key = {'id': int(row['id'])} - value = { - 'duration': int(row['duration']), - 'cpu_usage': float(row['cpu_usage']), - 'submission_time': row['submission_time'].isoformat(), - } - - submission_time = row["submission_time"] - if submission_time > last_submission_time: - sleep_time = (submission_time - last_submission_time).total_seconds() - sleep_time_virt = sleep_time * TIME_SCALE - sleep(sleep_time_virt) - - self.producer.send("fragments", key=key, value=value) - - if index % 100 == 0: - self.producer.flush() - - last_submission_time = submission_time - - def 
stream_parquet_data_timed(self, tasks_file: str, fragments_file: str): - if not self.producer: - self.connect() - - logger.info("πŸ“‚ Loading parquet files...") - tasks_df = pd.read_parquet(tasks_file) - fragments_df = pd.read_parquet(fragments_file) - - tasks_df['submission_time'] = pd.to_datetime(tasks_df['submission_time']) - - fragments_df["frag_nr"] = fragments_df.groupby("id").cumcount() + 1 - - fragments_df = fragments_df.join( - tasks_df.set_index("id")["submission_time"], - on="id", - how="left", - ) - - cum_dur = fragments_df.groupby("id")["duration"].cumsum() - - fragments_df["submission_time"] = fragments_df["submission_time"] + pd.to_timedelta(cum_dur, unit="ms") - fragments_df = fragments_df.drop(columns=["frag_nr"]) - - logger.info("πŸ“Š Loaded %s tasks, %s fragments", len(tasks_df), len(fragments_df)) - - tasks_df = tasks_df.sort_values('submission_time') - fragments_df = fragments_df.sort_values('submission_time') - - start_time = tasks_df['submission_time'].min() - end_time = fragments_df['submission_time'].max() - total_duration = (end_time - start_time).total_seconds() - - logger.info("⏰ Trace time span: %s to %s (%s hours)", start_time, end_time, total_duration / 3600) - - tasks_thread = threading.Thread(target=self.tasks_streaming_thread, args=(tasks_df, start_time,)) - frags_thread = threading.Thread(target=self.fragments_streaming_thread, args=(fragments_df, start_time,)) - - tasks_thread.start() - frags_thread.start() - - logger.info("Started producer threads") - - tasks_thread.join() - frags_thread.join() - - logger.info("βœ… All data streamed") - return { - 'total_tasks': len(tasks_df), - 'total_fragments': len(fragments_df), - } - - def stop(self) -> None: - self.stop_streaming.set() - logger.info("πŸ›‘ Producer stop requested") diff --git a/src/opendt/adapters/ingestion/models.py b/src/opendt/adapters/ingestion/models.py deleted file mode 100644 index 544a8fa..0000000 --- a/src/opendt/adapters/ingestion/models.py +++ /dev/null @@ -1,22 
+0,0 @@ -"""Canonical telemetry message schemas.""" -from __future__ import annotations - -from dataclasses import dataclass - - -@dataclass -class TaskMessage: - id: int - submission_time: str - duration: int - cpu_count: int - cpu_capacity: float - mem_capacity: int - - -@dataclass -class FragmentMessage: - id: int - duration: int - cpu_usage: float - submission_time: str diff --git a/src/opendt/api/__init__.py b/src/opendt/api/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/opendt/api/dependencies.py b/src/opendt/api/dependencies.py deleted file mode 100644 index 78c3088..0000000 --- a/src/opendt/api/dependencies.py +++ /dev/null @@ -1,10 +0,0 @@ -"""Dependency wiring for API routes.""" -from __future__ import annotations - -from ..core.orchestrator.controller import OpenDTOrchestrator - -_orchestrator = OpenDTOrchestrator() - - -def get_orchestrator() -> OpenDTOrchestrator: - return _orchestrator diff --git a/src/opendt/api/routes.py b/src/opendt/api/routes.py deleted file mode 100644 index 1c39cbb..0000000 --- a/src/opendt/api/routes.py +++ /dev/null @@ -1,138 +0,0 @@ -"""Flask blueprints exposing the OpenDT API and dashboard.""" -from __future__ import annotations - -import logging -import threading -from copy import deepcopy - -from flask import Blueprint, jsonify, render_template, request -from pydantic import ValidationError - -from .dependencies import get_orchestrator -from .schemas import SLORequest - -logger = logging.getLogger(__name__) - -api_bp = Blueprint("api", __name__, url_prefix="/api") -ui_bp = Blueprint("ui", __name__) - - -@ui_bp.route("/") -def dashboard(): - orchestrator = get_orchestrator() - return render_template("index.html", state=orchestrator.state) - - -@api_bp.route("/set_slo", methods=["POST"]) -@api_bp.route("/submit_slo", methods=["POST"]) -def set_slo(): - orchestrator = get_orchestrator() - try: - data = request.get_json(force=True, silent=False) or {} - slo_request = 
SLORequest.model_validate(data) - except ValidationError as exc: - return jsonify({'error': str(exc)}), 400 - - requested = { - 'energy_target': float(slo_request.energy_target), - 'runtime_target': float(slo_request.runtime_target), - } - - result = orchestrator.update_slo_file(requested) - if result == "error": - return jsonify({'error': 'Failed to update SLO file'}), 500 - - status = 'success' if result == "applied" else 'unchanged' - - return jsonify({ - 'status': status, - 'energy_target': orchestrator.slo_targets['energy_target'], - 'runtime_target': orchestrator.slo_targets['runtime_target'], - }) - - -@api_bp.route('/status') -def api_status(): - orchestrator = get_orchestrator() - return jsonify(orchestrator.state) - - -@api_bp.route('/start', methods=['POST']) -def api_start(): - orchestrator = get_orchestrator() - if orchestrator.state['status'] in ['stopped', 'error']: - threading.Thread(target=orchestrator.start_system, daemon=True).start() - return jsonify({'message': 'System is starting...'}) - return jsonify({'message': f'System is {orchestrator.state["status"]}'}) - - -@api_bp.route('/stop', methods=['POST']) -def api_stop(): - orchestrator = get_orchestrator() - if orchestrator.state['status'] in ['running', 'starting']: - orchestrator.stop_system() - return jsonify({'message': 'System stopped'}) - return jsonify({'message': 'System already stopped'}) - - -@api_bp.route('/topology') -def api_topology(): - orchestrator = get_orchestrator() - return jsonify({ - 'current_topology': orchestrator.state.get('current_topology'), - 'best_config': orchestrator.state.get('best_config'), - 'topology_updates': orchestrator.state.get('topology_updates', 0), - }) - - -@api_bp.route('/accept_recommendation', methods=['POST']) -def api_accept_recommendation(): - orchestrator = get_orchestrator() - try: - payload = request.get_json(silent=True) or {} - proposed = payload.get('topology') if isinstance(payload, dict) else None - - best_config = 
orchestrator.state.get('best_config') or {} - - if proposed is not None: - recommended_topology = proposed - else: - if 'config' not in best_config: - return jsonify({'error': 'No recommendation available'}), 400 - recommended_topology = best_config['config'] - - success = orchestrator.update_topology_file(recommended_topology) - - if success: - merged = deepcopy(best_config) if isinstance(best_config, dict) else {} - merged['config'] = deepcopy(recommended_topology) - orchestrator.state['best_config'] = merged - return jsonify({ - 'message': 'Topology updated successfully with LLM recommendation', - 'topology_updates': orchestrator.state.get('topology_updates', 0), - 'applied_config': recommended_topology, - }) - return jsonify({'error': 'Failed to update topology file'}), 500 - - except Exception as exc: # pragma: no cover - defensive logging path - logger.error("Error accepting recommendation: %s", exc) - return jsonify({'error': str(exc)}), 500 - - -@api_bp.route('/reset_topology', methods=['POST']) -def api_reset_topology(): - orchestrator = get_orchestrator() - try: - orchestrator.load_initial_topology() - return jsonify({'message': 'Topology reset to initial configuration'}) - except Exception as exc: # pragma: no cover - defensive logging path - return jsonify({'error': str(exc)}), 500 - - -@api_bp.route("/sim/timeseries") -def api_sim_timeseries(): - orchestrator = get_orchestrator() - res = orchestrator.simulation_timeseries() - logger.info("Sending opendc results of all time: %s", res) - res['timestamps'] = deepcopy(res['timestamps']) - return jsonify(res) diff --git a/src/opendt/api/schemas.py b/src/opendt/api/schemas.py deleted file mode 100644 index c2dfe3b..0000000 --- a/src/opendt/api/schemas.py +++ /dev/null @@ -1,9 +0,0 @@ -"""Request/response schemas for the API layer.""" -from __future__ import annotations - -from pydantic import BaseModel, Field - - -class SLORequest(BaseModel): - energy_target: float = Field(gt=0) - runtime_target: float = 
Field(gt=0) diff --git a/src/opendt/app.py b/src/opendt/app.py deleted file mode 100644 index 40d953f..0000000 --- a/src/opendt/app.py +++ /dev/null @@ -1,25 +0,0 @@ -"""Application factory for the OpenDT service.""" -from __future__ import annotations - -from pathlib import Path - -from flask import Flask - -from .api.routes import api_bp, ui_bp -from .logging import configure_logging # noqa: F401 # ensures logging is configured - - -def create_app() -> Flask: - package_root = Path(__file__).resolve().parents[1] - template_folder = package_root / "templates" - static_folder = package_root / "static" - - app = Flask( - __name__, - template_folder=str(template_folder), - static_folder=str(static_folder), - ) - - app.register_blueprint(api_bp) - app.register_blueprint(ui_bp) - return app diff --git a/src/opendt/cli.py b/src/opendt/cli.py deleted file mode 100644 index 1482f6e..0000000 --- a/src/opendt/cli.py +++ /dev/null @@ -1,13 +0,0 @@ -"""Command-line entry points for OpenDT.""" -from __future__ import annotations - -from .app import create_app - - -def main() -> None: - app = create_app() - app.run(host='0.0.0.0', port=8080, debug=False, threaded=True) - - -if __name__ == "__main__": - main() diff --git a/src/opendt/config/__init__.py b/src/opendt/config/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/opendt/config/loaders.py b/src/opendt/config/loaders.py deleted file mode 100644 index 6d254eb..0000000 --- a/src/opendt/config/loaders.py +++ /dev/null @@ -1,71 +0,0 @@ -"""Topology and environment loading helpers.""" -from __future__ import annotations - -import json -import logging -import os -from pathlib import Path -from typing import Any - -logger = logging.getLogger(__name__) - - -def _read_json(path: str) -> dict[str, Any] | None: - if not os.path.exists(path): - return None - with open(path, "r", encoding="utf-8") as handle: - return json.load(handle) - - -def read_topology(path: str) -> dict[str, Any] | None: - if not 
os.path.exists(path): - logger.warning("⚠️ Topology not found, a default will be used at runtime") - return None - return _read_json(path) - - -def _write_json(path: str, payload: dict[str, Any]) -> None: - os.makedirs(os.path.dirname(path), exist_ok=True) - with open(path, "w", encoding="utf-8") as handle: - json.dump(payload, handle, indent=2) - - -def write_topology(path: str, topology: dict[str, Any]) -> None: - _write_json(path, topology) - - -def _backup_json(path: str) -> None: - if not os.path.exists(path): - return - backup_path = f"{path}.backup" - data = _read_json(path) - if data is None: - return - _write_json(backup_path, data) - - -def backup_topology(path: str) -> None: - _backup_json(path) - - -def topology_mtime(path: str) -> float: - return Path(path).stat().st_mtime if os.path.exists(path) else 0.0 - - -def read_slo(path: str) -> dict[str, Any] | None: - if not os.path.exists(path): - logger.info("⚠️ SLO configuration not found; defaults will be used until one is created") - return None - return _read_json(path) - - -def write_slo(path: str, slo: dict[str, Any]) -> None: - _write_json(path, slo) - - -def backup_slo(path: str) -> None: - _backup_json(path) - - -def slo_mtime(path: str) -> float: - return Path(path).stat().st_mtime if os.path.exists(path) else 0.0 diff --git a/src/opendt/config/settings.py b/src/opendt/config/settings.py deleted file mode 100644 index 8b9f39e..0000000 --- a/src/opendt/config/settings.py +++ /dev/null @@ -1,57 +0,0 @@ -"""Runtime settings and orchestrator thresholds.""" -from __future__ import annotations - -import os -from dataclasses import dataclass -from typing import Any, Dict - - -IMPROVEMENT_DELTA: float = 0.05 -WINDOW_TRY_BUDGET_SEC: float = 30.0 -MAX_TRIES_PER_WINDOW: int = 1 -NO_IMPROVEMENT_STOP_AFTER: int = 3 - -TIME_SCALE: float = 1 / 10 -REAL_WINDOW_SIZE_SEC: int = 5 * 60 -VIRTUAL_WINDOW_SIZE: float = REAL_WINDOW_SIZE_SEC * TIME_SCALE - - -@dataclass(frozen=True) -class SLOTargets: - energy_target: 
float = 10.0 - runtime_target: float = 2.0 - - def to_dict(self) -> dict[str, float]: - return { - "energy_target": float(self.energy_target), - "runtime_target": float(self.runtime_target), - } - - @classmethod - def from_dict(cls, data: Dict[str, Any] | None) -> "SLOTargets": - """Coerce arbitrary payloads into a validated ``SLOTargets`` instance.""" - - data = data or {} - defaults = cls() - - def _coerce(key: str, fallback: float) -> float: - try: - value = data.get(key, fallback) - if value is None: - return fallback - return float(value) - except (TypeError, ValueError): - return fallback - - return cls( - energy_target=_coerce("energy_target", defaults.energy_target), - runtime_target=_coerce("runtime_target", defaults.runtime_target), - ) - - -def kafka_bootstrap_servers() -> str: - return os.environ.get("KAFKA_BOOTSTRAP_SERVERS", "kafka:29092") - - -def openai_api_key() -> str | None: - return os.environ.get("OPENAI_API_KEY") diff --git a/src/opendt/core/__init__.py b/src/opendt/core/__init__.py deleted file mode 100644 index 2f74b10..0000000 --- a/src/opendt/core/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -"""Core domain modules for the OpenDT service.""" - -from .orchestrator.controller import OpenDTOrchestrator - -__all__ = ["OpenDTOrchestrator"] diff --git a/src/opendt/core/optimization/__init__.py b/src/opendt/core/optimization/__init__.py deleted file mode 100644 index 1b40f48..0000000 --- a/src/opendt/core/optimization/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Optimization strategies for topology tuning.""" diff --git a/src/opendt/core/optimization/base.py b/src/opendt/core/optimization/base.py deleted file mode 100644 index ae11ae6..0000000 --- a/src/opendt/core/optimization/base.py +++ /dev/null @@ -1,9 +0,0 @@ -"""Base protocol for optimization strategies.""" -from __future__ import annotations - -from typing import Protocol - - -class OptimizationStrategy(Protocol): - def optimize(self, simulation_results, batch_data, slo_targets, 
current_topology=None): - ... diff --git a/src/opendt/core/optimization/llm.py b/src/opendt/core/optimization/llm.py deleted file mode 100644 index 0c4025d..0000000 --- a/src/opendt/core/optimization/llm.py +++ /dev/null @@ -1,269 +0,0 @@ -"""LLM-powered optimization strategy with rule-based fallback.""" -from __future__ import annotations - -import copy -import json -import logging -from typing import Any, Dict - -from .rule_based import rule_based_optimization -from .scoring import performance_score - -logger = logging.getLogger(__name__) - - -class LLM: - """LLM-based topology optimizer with better error handling and topology updates.""" - - def __init__(self, openai_key: str | None) -> None: - self.openai_key = openai_key - self.has_llm = bool(openai_key) - self.best_config: Dict[str, Any] | None = None - self.best_score = float('inf') - logger.info("LLM Optimizer initialized. API Key present: %s", self.has_llm) - - def calculate_performance_score(self, sim_results: Dict[str, Any]) -> float: - return performance_score(sim_results) - - def update_best_configuration(self, sim_results: Dict[str, Any], topology_data: Dict[str, Any]) -> bool: - score = self.calculate_performance_score(sim_results) - - if score < self.best_score: - self.best_score = score - self.best_config = copy.deepcopy(topology_data) - logger.info( - "πŸ† New best configuration! 
Score: %.2f (Energy: %.2f kWh)", - score, - sim_results.get('energy_kwh', 0), - ) - return True - return False - - def optimize( - self, - simulation_results: Dict[str, Any], - batch_data: Dict[str, Any], - slo_targets: Dict[str, Any], - current_topology: Dict[str, Any] | None = None, - ) -> Dict[str, Any]: - if not self.has_llm: - return rule_based_optimization( - simulation_results, - batch_data, - slo_targets, - current_topology, - reason="No OpenAI API key", - best_config=self.best_config, - best_score=self.best_score if self.best_config else None, - ) - - try: - return self.llm_optimization(simulation_results, batch_data, slo_targets, current_topology) - except Exception as exc: # pragma: no cover - defensive logging path - logger.error("LLM optimization failed: %s", exc) - return rule_based_optimization( - simulation_results, - batch_data, - slo_targets, - current_topology, - reason=f"LLM Error: {str(exc)}", - best_config=self.best_config, - best_score=self.best_score if self.best_config else None, - ) - - def rule_based_optimization( - self, - sim_results: Dict[str, Any], - batch_data: Dict[str, Any], - slo_targets: Dict[str, Any], - current_topology: Dict[str, Any] | None = None, - reason: str = "", - ) -> Dict[str, Any]: - result = rule_based_optimization( - sim_results, - batch_data, - slo_targets, - current_topology, - reason, - best_config=self.best_config, - best_score=self.best_score if self.best_config else None, - ) - if current_topology: - self.update_best_configuration(sim_results, current_topology) - result['best_config'] = self.best_config - result['best_score'] = self.best_score if self.best_config else None - return result - - def llm_optimization( - self, - sim_results: Dict[str, Any], - batch_data: Dict[str, Any], - slo_targets: Dict[str, Any], - current_topology: Dict[str, Any] | None = None, - ) -> Dict[str, Any]: - from langchain_core.output_parsers import JsonOutputParser - from langchain_openai import ChatOpenAI - from pydantic import 
BaseModel, Field - from typing import List - - class TopologyRecommendation(BaseModel): - cluster_name: List[str] = Field(description="List of cluster names") - host_name: List[str] = Field(description="List of host names") - coreCount: List[int] = Field(description="List of core counts") - coreSpeed: List[int] = Field(description="List of core speeds in MHz") - count: List[int] = Field(description="List of host counts") - - if current_topology: - self.update_best_configuration(sim_results, current_topology) - - llm = ChatOpenAI( - api_key=self.openai_key, - model="gpt-3.5-turbo", - temperature=0.3, - timeout=15, - ) - - parser = JsonOutputParser(pydantic_object=TopologyRecommendation) - - prompt = f"""You are an expert datacenter practitioner. - Based on these simulation results, provide specific recommendations to optimize energy utilization and performance (execution time). - - You will be provided data from OpenDC simulator which simulates datacenter energy usage and runtime. - You need to recommend next core count and core speed for simulation for each cluster and host. 
- - You need to recommend similar configuration which helps to achieve objectives: - - Lesser runtime - - Less energy consumption - - Try to achieve both objectives(SLOs) at the same time as much as possible: - {slo_targets} - - SIMULATION RESULTS: - - Energy Usage: {sim_results.get('energy_kwh', 'N/A')} kWh - - Runtime: {sim_results.get('runtime_hours', 'N/A')} hours - - CPU Utilization: {sim_results.get('cpu_utilization', 'N/A')} - - Task Count: {batch_data.get('task_count', 'N/A')} - - Fragment Count: {batch_data.get('fragment_count', 'N/A')} - - Average CPU Usage: {batch_data.get('avg_cpu_usage', 'N/A')} - - Current topology: {json.dumps(current_topology, indent=2) if current_topology else 'Not provided'} - - {parser.get_format_instructions()} - - Example: - {{ - "cluster_name": ["C01", "C01"], - "host_name": ["H01", "H02"], - "coreCount": [32, 16], - "coreSpeed": [3200, 2100], - "count": [2, 3] - }} - """ - - logger.info("πŸ€– Calling OpenAI for topology optimization...") - response = llm.invoke(prompt) - logger.info("Received response from OpenAI") - - raw_content = self._extract_text_content(response) - parsed = parser.parse(raw_content) - - if current_topology: - new_topology = copy.deepcopy(current_topology) - else: - new_topology = {'clusters': []} - - new_topology = self.convert_llm_to_topology(parsed, current_topology) - - logger.info("Generated new topology from LLM recommendation") - - recommendations = parsed.dict() if hasattr(parsed, "dict") else parsed - - return { - 'type': 'llm', - 'reason': 'LLM recommendation applied', - 'new_topology': new_topology, - 'best_config': self.best_config, - 'best_score': self.best_score if self.best_config else None, - 'recommendations': recommendations, - } - - def convert_llm_to_topology(self, llm_result, current_topology): - """Convert LLM output (pydantic OR dict) to OpenDC topology format.""" - - def field(obj, name, default=None): - if hasattr(obj, name): - return getattr(obj, name, default) - if 
isinstance(obj, dict): - return obj.get(name, default) - return default - - new_topology = copy.deepcopy(current_topology) if current_topology else {'clusters': []} - - try: - clusters = field(llm_result, "cluster_name", []) or [] - hosts = field(llm_result, "host_name", []) or [] - counts = field(llm_result, "count", []) or [] - cores = field(llm_result, "coreCount", []) or [] - clocks = field(llm_result, "coreSpeed", []) or [] - - total = min(len(clusters), len(hosts)) - for index in range(total): - cluster_name = clusters[index] - host_name = hosts[index] - count = counts[index] if index < len(counts) else 1 - core_count = cores[index] if index < len(cores) else 16 - core_speed = clocks[index] if index < len(clocks) else 2400 - - cluster = next((c for c in new_topology["clusters"] if c["name"] == cluster_name), None) - if not cluster: - cluster = {"name": cluster_name, "hosts": []} - new_topology["clusters"].append(cluster) - - host = next((h for h in cluster["hosts"] if h["name"] == host_name), None) - if not host: - host = { - "name": host_name, - "count": int(count), - "cpu": {"coreCount": int(core_count), "coreSpeed": int(core_speed)}, - "memory": {"memorySize": 34359738368}, - } - cluster["hosts"].append(host) - else: - host["count"] = int(count) - host["cpu"]["coreCount"] = int(core_count) - host["cpu"]["coreSpeed"] = int(core_speed) - - logger.info("βœ… Successfully converted LLM output to topology format") - except Exception as exc: # pragma: no cover - defensive logging path - logger.error("Error converting LLM output to topology: %s", exc) - - return new_topology - - @staticmethod - def _extract_text_content(message: Any) -> str: - """Normalize LangChain/OpenAI message payloads into plain text.""" - - if message is None: - return "" - - content = getattr(message, "content", message) - - if isinstance(content, str): - return content.strip() - - if isinstance(content, list): - parts: list[str] = [] - for item in content: - if isinstance(item, str): - 
parts.append(item) - elif isinstance(item, dict): - text = item.get("text") - if text: - parts.append(str(text)) - else: - text = getattr(item, "text", None) - if text: - parts.append(str(text)) - return "".join(parts).strip() - - return str(content).strip() diff --git a/src/opendt/core/optimization/rule_based.py b/src/opendt/core/optimization/rule_based.py deleted file mode 100644 index 3106b58..0000000 --- a/src/opendt/core/optimization/rule_based.py +++ /dev/null @@ -1,113 +0,0 @@ -"""Rule-based optimization strategy used as a fallback and baseline.""" -from __future__ import annotations - -import copy -from typing import Any, Dict - -def rule_based_optimization( - sim_results: Dict[str, Any], - batch_data: Dict[str, Any], - slo_targets: Dict[str, Any], - current_topology: Dict[str, Any] | None = None, - reason: str = "", - best_config: Dict[str, Any] | None = None, - best_score: float | None = None, -) -> Dict[str, Any]: - energy = sim_results.get('energy_kwh', 2.0) - cpu_util = sim_results.get('cpu_utilization', 0.0) - runtime_hours = sim_results.get('runtime_hours', 2) - task_count = batch_data.get('task_count', 10) - - try: - energy = float(energy) - except (TypeError, ValueError): - energy = 0.0 - - try: - runtime_hours = float(runtime_hours) - except (TypeError, ValueError): - runtime_hours = 0.0 - - recommendations: list[str] = [] - new_topology = None - action = "maintain" - - if current_topology: - new_topology = copy.deepcopy(current_topology) - - energy_target = slo_targets.get('energy_target', 10.0) or 10.0 - runtime_target = slo_targets.get('runtime_target', 2.0) or 2.0 - - def relative_delta(actual: float, target: float) -> float: - if target <= 0: - return 0.0 if actual <= 0 else 1.0 - return (actual - target) / target - - energy_delta = relative_delta(energy, float(energy_target)) - runtime_delta = relative_delta(runtime_hours, float(runtime_target)) - - if energy_delta >= 0.3: - recommendations.append( - f"πŸ”₯ CRITICAL: Energy usage is 
{energy_delta:.0%} above the target ({energy:.2f} kWh vs {energy_target:.2f} kWh)." - ) - action = "massive downscale" - for cluster in new_topology.get('clusters', []): - for host in cluster.get('hosts', []): - if host.get('count', 1) > 1: - host['count'] = max(1, host['count'] - 1) - elif energy_delta >= 0.15: - recommendations.append( - f"⚠️ HIGH: Trim CPU frequency to curb energy usage ({energy:.2f} kWh vs target {energy_target:.2f} kWh)." - ) - action = "downscale" - for cluster in new_topology.get('clusters', []): - for host in cluster.get('hosts', []): - current_speed = host.get('cpu', {}).get('coreSpeed', 2400) - if current_speed > 1800: - host['cpu']['coreSpeed'] = max(1800, int(current_speed * 0.9)) - elif runtime_delta >= 0.25: - recommendations.append( - f"πŸ“ˆ SCALE UP: Runtime exceeds the SLO by {runtime_delta:.0%} ({runtime_hours:.2f}h vs {runtime_target:.2f}h)." - ) - action = "upscale" - for cluster in new_topology.get('clusters', []): - for host in cluster.get('hosts', []): - current_cores = host.get('cpu', {}).get('coreCount', 16) - if current_cores < 48: - host['cpu']['coreCount'] = min(48, current_cores + 4) - elif runtime_delta >= 0.1: - recommendations.append( - f"βš™οΈ Moderate runtime pressure detected ({runtime_hours:.2f}h vs target {runtime_target:.2f}h). Consider a light scale-up." - ) - action = "light upscale" - for cluster in new_topology.get('clusters', []): - for host in cluster.get('hosts', []): - current_cores = host.get('cpu', {}).get('coreCount', 16) - if current_cores < 32: - host['cpu']['coreCount'] = min(32, current_cores + 2) - elif energy_delta <= -0.2 and runtime_delta <= -0.1: - recommendations.append( - "πŸ“‰ CONSOLIDATE: Metrics are comfortably under SLO; consider shedding idle capacity." 
- ) - action = "slightly downscale" - for cluster in new_topology.get('clusters', []): - for host in cluster.get('hosts', []): - current_cores = host.get('cpu', {}).get('coreCount', 16) - if current_cores > 8: - host['cpu']['coreCount'] = max(8, current_cores - 2) - else: - recommendations.append("βœ… OPTIMAL: Current configuration meets configured SLOs") - - return { - 'type': 'rule_based', - 'reason': reason, - 'energy_kwh': energy, - 'cpu_utilization': cpu_util, - 'task_count': task_count, - 'recommendations': recommendations, - 'action_taken': action, - 'action_type': [action], - 'new_topology': new_topology, - 'best_config': best_config, - 'best_score': best_score, - } diff --git a/src/opendt/core/optimization/scoring.py b/src/opendt/core/optimization/scoring.py deleted file mode 100644 index 36dc975..0000000 --- a/src/opendt/core/optimization/scoring.py +++ /dev/null @@ -1,8 +0,0 @@ -"""Scoring helpers shared across optimization strategies.""" -from __future__ import annotations - - -def performance_score(sim_results: dict[str, float]) -> float: - energy = sim_results.get('energy_kwh', 5.0) - performance = sim_results.get('runtime_hours', 1.0) - return (energy * 2.0) + (performance * 1.0) diff --git a/src/opendt/core/orchestrator/__init__.py b/src/opendt/core/orchestrator/__init__.py deleted file mode 100644 index ec2f81b..0000000 --- a/src/opendt/core/orchestrator/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -"""Top-level orchestrator runtime components.""" - -from .controller import OpenDTOrchestrator - -__all__ = ["OpenDTOrchestrator"] diff --git a/src/opendt/core/orchestrator/controller.py b/src/opendt/core/orchestrator/controller.py deleted file mode 100644 index d62623f..0000000 --- a/src/opendt/core/orchestrator/controller.py +++ /dev/null @@ -1,388 +0,0 @@ -"""Core OpenDT orchestrator implementation.""" -from __future__ import annotations - -import logging -import os -import shutil -import threading -import time -from copy import deepcopy -from pathlib 
import Path -from typing import Any - -from ...config import loaders -from ...config.settings import ( - IMPROVEMENT_DELTA, - MAX_TRIES_PER_WINDOW, - WINDOW_TRY_BUDGET_SEC, - SLOTargets, - kafka_bootstrap_servers, - openai_api_key, -) -from ...adapters.ingestion.kafka.consumer import DigitalTwinConsumer -from ...adapters.ingestion.kafka.producer import TimedKafkaProducer -from ..optimization.llm import LLM -from .state import SimulationResultsBuffer, default_state_dict -from ..simulation.runner import OpenDCRunner -from .topology import topology_hash, watch_topology_file -from .slo import slo_hash, watch_slo_file - -logger = logging.getLogger(__name__) - - -class OpenDTOrchestrator: - def __init__(self) -> None: - self.kafka_servers = kafka_bootstrap_servers() - self.openai_key = openai_api_key() - self.slo_targets: dict[str, float] = SLOTargets().to_dict() - - self.state = default_state_dict() - self.state["slo_targets"] = dict(self.slo_targets) - - self.open_dc_buffer = SimulationResultsBuffer() - - self.producer: TimedKafkaProducer | None = None - self.consumer: DigitalTwinConsumer | None = None - self.opendc_runner = OpenDCRunner() - self.optimizer = LLM(self.openai_key) - - self.stop_event = threading.Event() - self.producer_thread: threading.Thread | None = None - self.consumer_thread: threading.Thread | None = None - - self.topology_path = "/app/config/topology.json" - self.slo_template_path = Path(__file__).resolve().parents[4] / "config" / "slo.json" - self.slo_path = os.environ.get("OPENDT_SLO_PATH", "/app/config/slo.json") - self.last_topology_hash: str | None = None - self.last_slo_hash: str | None = None - - self._ensure_slo_file() - self.load_initial_topology() - self.load_initial_slo() - self.start_topology_watcher() - self.start_slo_watcher() - - def load_initial_topology(self) -> None: - topo = loaders.read_topology(self.topology_path) - if topo is not None: - self.state["current_topology"] = topo - self.last_topology_hash = topology_hash(topo) - 
logger.info("πŸ“„ Loaded initial topology configuration") - - def _ensure_slo_file(self) -> None: - target = Path(self.slo_path) - template = self.slo_template_path - try: - target.parent.mkdir(parents=True, exist_ok=True) - if not target.exists() and template.exists(): - shutil.copyfile(template, target) - except Exception as exc: # pragma: no cover - defensive logging path - logger.warning("Unable to seed SLO configuration: %s", exc) - - def start_topology_watcher(self) -> None: - def _on_change(new_topology: dict[str, Any]) -> None: - self.state["current_topology"] = new_topology - self.state["topology_updates"] = (self.state.get("topology_updates") or 0) + 1 - - self._watch_thread = threading.Thread( - target=watch_topology_file, - args=(self.topology_path, self.stop_event, _on_change), - daemon=True, - ) - self._watch_thread.start() - - def load_initial_slo(self) -> None: - slo = loaders.read_slo(self.slo_path) - if slo is None: - self.last_slo_hash = slo_hash(self.slo_targets) - logger.info("βš™οΈ Using default SLO configuration: %s", self.slo_targets) - return - - normalized = self._normalize_slo_targets(slo) - self.slo_targets = normalized - self.state["slo_targets"] = dict(normalized) - self.last_slo_hash = slo_hash(normalized) - logger.info("πŸ“„ Loaded initial SLO configuration") - - def start_slo_watcher(self) -> None: - def _on_change(new_slo: dict[str, Any]) -> None: - normalized = self._normalize_slo_targets(new_slo) - new_hash = slo_hash(normalized) - if new_hash == self.last_slo_hash: - return - self.slo_targets = normalized - self.state["slo_targets"] = dict(normalized) - self.last_slo_hash = new_hash - - self._slo_watch_thread = threading.Thread( - target=watch_slo_file, - args=(self.slo_path, self.stop_event, _on_change), - daemon=True, - ) - self._slo_watch_thread.start() - - def _normalize_slo_targets(self, payload: dict[str, Any] | None) -> dict[str, float]: - return SLOTargets.from_dict(payload).to_dict() - - def _topo_hash(self, topo: 
dict[str, Any] | None) -> str: - return topology_hash(topo) - - def update_topology_file(self, new_topology: dict[str, Any]) -> bool: - if not new_topology: - return False - new_hash = self._topo_hash(new_topology) - if self.last_topology_hash == new_hash: - logger.info("↩️ Skipping apply: topology identical to current (no-op)") - return False - try: - loaders.backup_topology(self.topology_path) - loaders.write_topology(self.topology_path, new_topology) - self.state["current_topology"] = new_topology - self.state["topology_updates"] += 1 - self.last_topology_hash = new_hash - logger.info( - "βœ… Applied new topology (update #%s)", - self.state["topology_updates"], - ) - return True - except Exception as exc: # pragma: no cover - defensive logging path - logger.error("Failed to update topology file: %s", exc) - return False - - def update_slo_file(self, new_slo: dict[str, Any]) -> str: - normalized = self._normalize_slo_targets(new_slo) - new_hash = slo_hash(normalized) - target_exists = Path(self.slo_path).exists() - if target_exists and self.last_slo_hash == new_hash: - logger.info("↩️ Skipping apply: SLO configuration identical to current (no-op)") - return "noop" - - try: - loaders.backup_slo(self.slo_path) - loaders.write_slo(self.slo_path, normalized) - self.slo_targets = dict(normalized) - self.state["slo_targets"] = dict(normalized) - self.last_slo_hash = new_hash - logger.info("βœ… Applied new SLO configuration: %s", normalized) - return "applied" - except Exception as exc: # pragma: no cover - defensive logging path - logger.error("Failed to update SLO file: %s", exc) - return "error" - - def start_system(self) -> None: - logger.info("πŸš€ Starting OpenDT Digital Twin System") - self.state["status"] = "starting" - self.stop_event.clear() - - try: - self.producer = TimedKafkaProducer(self.kafka_servers) - self.consumer = DigitalTwinConsumer(self.kafka_servers, "OpenDT_telemetry") - - self.consumer_thread = threading.Thread(target=self.run_consumer, 
daemon=False) - self.consumer_thread.start() - - time.sleep(5) - - self.producer_thread = threading.Thread(target=self.run_producer, daemon=False) - self.producer_thread.start() - - self.state["status"] = "running" - logger.info("βœ… System started successfully") - except Exception as exc: # pragma: no cover - defensive logging path - logger.error("Failed to start system: %s", exc) - self.state["status"] = "error" - - def stop_system(self) -> None: - logger.info("πŸ›‘ Stopping OpenDT Digital Twin System") - self.state["status"] = "stopping" - self.stop_event.set() - - if self.producer: - self.producer.stop() - - if self.consumer: - self.consumer.stop() - - if self.producer_thread and self.producer_thread.is_alive(): - self.producer_thread.join(timeout=5) - - if self.consumer_thread and self.consumer_thread.is_alive(): - self.consumer_thread.join(timeout=5) - - self.state["status"] = "stopped" - logger.info("βœ… System stopped") - - def run_producer(self) -> None: - try: - logger.info("πŸ“‘ Starting timed telemetry producer...") - stats = self.producer.stream_parquet_data_timed( - tasks_file="/../app/surf-workload/tasks.parquet", - fragments_file="/../app/surf-workload/fragments.parquet", - ) - self.state["total_tasks"] = stats["total_tasks"] - self.state["total_fragments"] = stats["total_fragments"] - logger.info("βœ… Producer finished: %s", stats) - except Exception as exc: # pragma: no cover - defensive logging path - logger.error("Producer error: %s", exc) - - def _score(self, sim_results: dict[str, Any]) -> float: - defaults = SLOTargets() - energy_target = float(self.slo_targets.get("energy_target", defaults.energy_target)) - runtime_target = float(self.slo_targets.get("runtime_target", defaults.runtime_target)) - - metrics = [ - (sim_results.get("energy_kwh"), energy_target, 0.6), - (sim_results.get("runtime_hours"), runtime_target, 0.4), - ] - - weighted_delta = 0.0 - total_weight = 0.0 - - for actual, target, weight in metrics: - if actual is None: - 
continue - - try: - actual_value = float(actual) - except (TypeError, ValueError): - continue - - total_weight += weight - - if target <= 0: - delta = 0.0 if actual_value <= 0 else 1.0 - else: - delta = (actual_value - target) / target - - # Clamp extreme values to keep the score stable - delta = max(-1.0, min(delta, 5.0)) - weighted_delta += delta * weight - - if total_weight == 0: - return 0.0 - - return round(weighted_delta / total_weight, 4) - - def _append_simulation_result(self, result: dict[str, Any], timestamp: str) -> None: - with self.open_dc_buffer.lock: - self.open_dc_buffer.results.append(result) - self.open_dc_buffer.timestamps.append(timestamp) - - def _simulation_timestamps(self) -> list[str]: - with self.open_dc_buffer.lock: - return list(self.open_dc_buffer.timestamps) - - def _simulation_results(self) -> list[dict[str, Any]]: - with self.open_dc_buffer.lock: - return list(self.open_dc_buffer.results) - - def run_consumer(self) -> None: - try: - logger.info("πŸ“₯ Starting digital twin consumer...") - for cycle, batch_data in enumerate(self.consumer.process_windows(), start=1): - if self.stop_event.is_set(): - break - - self.state["cycle_count"] = cycle - self.state["current_window"] = batch_data.get("window_info", "Processing...") - logger.info("πŸ”„ Processing cycle %s", cycle) - - baseline = self.run_simulation(batch_data) - - timestamp = batch_data["window_end"].strftime("%Y-%m-%dT%H:%M:%SZ") - self._append_simulation_result(baseline, timestamp) - - self.state["last_simulation"] = baseline - baseline_score = self._score(baseline) - - self.state.update( - { - "window_baseline_score": round(baseline_score, 3), - "window_best_score": round(baseline_score, 3), - "window_trials": 0, - "window_accepted": False, - } - ) - - best_topology = self.state.get("current_topology") - best_score = baseline_score - seen = {self._topo_hash(best_topology) if best_topology else ""} - tries = 0 - deadline = time.monotonic() + WINDOW_TRY_BUDGET_SEC - - while ( - 
time.monotonic() < deadline - and tries < MAX_TRIES_PER_WINDOW - and not self.stop_event.is_set() - ): - tries += 1 - opt = self.optimizer.optimize( - baseline, - batch_data, - self.slo_targets, - current_topology=best_topology, - ) - - proposed = opt.get("new_topology") - if not proposed: - continue - topo_hash = self._topo_hash(proposed) - if topo_hash in seen: - continue - seen.add(topo_hash) - - probe = self.opendc_runner.run_simulation( - tasks_data=batch_data.get("tasks_sample", []), - fragments_data=batch_data.get("fragments_sample", []), - topology_data=proposed, - expName=f"window_{cycle}_try_{tries}", - ) - - self.state["last_optimization"] = opt - self.state["last_optimization"]["energy_kwh"] = probe.get("energy_kwh", None) - self.state["last_optimization"]["runtime_hours"] = probe.get("runtime_hours", None) - self.state["last_optimization"]["cpu_utilization"] = probe.get("cpu_utilization", None) - self.state["last_optimization"]["max_power_draw"] = probe.get("max_power_draw", None) - - self.state["cycle_count_opt"] += 1 - score = self._score(probe) - if score < best_score - IMPROVEMENT_DELTA: - best_topology, best_score = proposed, score - self.state["window_best_score"] = round(best_score, 3) - - self.state["window_trials"] = tries - - self.state["best_config"] = { - "config": best_topology, - "score": round(best_score, 3), - } - - if self.stop_event.wait(0.1): - break - except Exception as exc: # pragma: no cover - defensive logging path - logger.exception("Consumer error: %s", exc) - - def run_simulation(self, batch_data: dict[str, Any], expName: str = "simple") -> dict[str, Any]: - logger.info("πŸ”„ Running OpenDC simulation...") - tasks_data = batch_data.get("tasks_sample", []) - fragments_data = batch_data.get("fragments_sample", []) - topology_data = self.state.get("current_topology") - results = self.opendc_runner.run_simulation( - tasks_data=tasks_data, - fragments_data=fragments_data, - topology_data=topology_data, - expName=expName, - ) - 
logger.info("πŸ“Š Simulation Results: %s", results) - return results - - def simulation_timeseries(self) -> dict[str, Any]: - results = self._simulation_results() - timestamps = self._simulation_timestamps() - cpu_usages = [res.get("cpu_utilization") for res in results] - power_usages = [res.get("energy_kwh") for res in results] - return { - "cpu_usages": cpu_usages, - "power_usages": power_usages, - "timestamps": deepcopy(timestamps), - } diff --git a/src/opendt/core/orchestrator/events.py b/src/opendt/core/orchestrator/events.py deleted file mode 100644 index 1783945..0000000 --- a/src/opendt/core/orchestrator/events.py +++ /dev/null @@ -1,17 +0,0 @@ -"""Lightweight observer helpers for orchestrator events.""" -from __future__ import annotations - -from collections import defaultdict -from typing import Any, Callable, DefaultDict, List - - -class EventBus: - def __init__(self) -> None: - self._subscribers: DefaultDict[str, List[Callable[[Any], None]]] = defaultdict(list) - - def subscribe(self, event: str, handler: Callable[[Any], None]) -> None: - self._subscribers[event].append(handler) - - def publish(self, event: str, payload: Any) -> None: - for handler in self._subscribers.get(event, []): - handler(payload) diff --git a/src/opendt/core/orchestrator/slo.py b/src/opendt/core/orchestrator/slo.py deleted file mode 100644 index 2eb545a..0000000 --- a/src/opendt/core/orchestrator/slo.py +++ /dev/null @@ -1,47 +0,0 @@ -"""SLO hashing and watcher helpers mirroring topology utilities.""" -from __future__ import annotations - -import hashlib -import json -import logging -import os -import threading -import time -from typing import Any, Callable - -from ...config import loaders - -logger = logging.getLogger(__name__) - - -def slo_hash(slo: dict[str, Any] | None) -> str: - if not slo: - return "" - try: - canonical = json.dumps(slo, sort_keys=True, separators=(",", ":")) - except Exception: - canonical = str(slo) - return 
hashlib.sha256(canonical.encode("utf-8")).hexdigest() - - -def watch_slo_file( - path: str, - stop_event: threading.Event, - on_change: Callable[[dict[str, Any]], None], -) -> None: - last_mtime = 0.0 - while not stop_event.is_set(): - try: - if os.path.exists(path): - mtime = loaders.slo_mtime(path) - if mtime != last_mtime: - slo = loaders.read_slo(path) - if slo is not None: - on_change(slo) - last_mtime = mtime - logger.info("πŸ” SLO file changed; dashboard state updated") - else: - last_mtime = mtime - except Exception as exc: # pragma: no cover - defensive logging path - logger.warning("SLO watcher error: %s", exc) - time.sleep(0.5) diff --git a/src/opendt/core/orchestrator/state.py b/src/opendt/core/orchestrator/state.py deleted file mode 100644 index 0673e70..0000000 --- a/src/opendt/core/orchestrator/state.py +++ /dev/null @@ -1,46 +0,0 @@ -"""State helpers for the OpenDT orchestrator.""" -from __future__ import annotations - -from dataclasses import dataclass, field -from threading import Event, Lock -from typing import Any, Dict, List - - -@dataclass -class OrchestratorState: - status: str = "stopped" - cycle_count: int = 0 - cycle_count_opt: int = 0 - last_simulation: Dict[str, Any] | None = None - last_optimization: Dict[str, Any] | None = None - total_tasks: int = 0 - total_fragments: int = 0 - current_window: str | None = None - current_topology: Dict[str, Any] | None = None - best_config: Dict[str, Any] | None = None - topology_updates: int = 0 - slo_targets: Dict[str, Any] = field(default_factory=dict) - window_baseline_score: float | None = None - window_best_score: float | None = None - window_trials: int = 0 - window_accepted: bool = False - window_time_used_sec: float = 0.0 - - def as_dict(self) -> Dict[str, Any]: - return self.__dict__ - - -@dataclass -class SimulationResultsBuffer: - results: List[Dict[str, Any]] = field(default_factory=list) - timestamps: List[str] = field(default_factory=list) - lock: Lock = field(default_factory=Lock) - - 
-@dataclass -class OrchestratorRuntime: - stop_event: Event = field(default_factory=Event) - - -def default_state_dict() -> Dict[str, Any]: - return OrchestratorState().as_dict() diff --git a/src/opendt/core/orchestrator/topology.py b/src/opendt/core/orchestrator/topology.py deleted file mode 100644 index a629b2f..0000000 --- a/src/opendt/core/orchestrator/topology.py +++ /dev/null @@ -1,45 +0,0 @@ -"""Topology hashing and file-watcher utilities.""" -from __future__ import annotations - -import hashlib -import json -import logging -import os -import threading -import time -from typing import Any, Callable - -from ...config import loaders - -logger = logging.getLogger(__name__) - - -def topology_hash(topo: dict[str, Any] | None) -> str: - if topo is None: - return "" - try: - canonical = json.dumps(topo, sort_keys=True, separators=(",", ":")) - except Exception: - canonical = str(topo) - return hashlib.sha256(canonical.encode("utf-8")).hexdigest() - - -def watch_topology_file( - path: str, - stop_event: threading.Event, - on_change: Callable[[dict[str, Any]], None], -) -> None: - last_mtime = 0.0 - while not stop_event.is_set(): - try: - if os.path.exists(path): - mtime = loaders.topology_mtime(path) - if mtime != last_mtime: - topo = loaders.read_topology(path) - if topo is not None: - on_change(topo) - last_mtime = mtime - logger.info("πŸ” Topology file changed; dashboard state updated") - except Exception as exc: # pragma: no cover - defensive logging path - logger.warning("Topology watcher error: %s", exc) - time.sleep(0.5) diff --git a/src/opendt/core/simulation/__init__.py b/src/opendt/core/simulation/__init__.py deleted file mode 100644 index 463b6f7..0000000 --- a/src/opendt/core/simulation/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Simulation adapters for orchestrator workloads.""" diff --git a/src/opendt/core/simulation/adapters.py b/src/opendt/core/simulation/adapters.py deleted file mode 100644 index 43d012f..0000000 --- 
a/src/opendt/core/simulation/adapters.py +++ /dev/null @@ -1,62 +0,0 @@ -"""Helpers translating streaming data into OpenDC artifacts.""" -from __future__ import annotations - -from pathlib import Path -from typing import Iterable, Mapping - -import pandas as pd -import pyarrow as pa - - -def ensure_workload_dir(base: str = "/tmp/opendt_workload") -> Path: - path = Path(base) - path.mkdir(parents=True, exist_ok=True) - return path - - -def tasks_to_table(tasks_data: Iterable[Mapping[str, object]]) -> pa.Table: - tasks_df = pd.DataFrame([ - { - "id": task.get("id", 0), - "submission_time": int(pd.to_datetime(task.get("submission_time", "2024-01-01")).value) // 1_000_000, - "duration": task.get("duration", 30000), - "cpu_count": task.get("cpu_count", 1), - "cpu_capacity": task.get("cpu_capacity", 2400.0), - "mem_capacity": task.get("mem_capacity", 1024 ** 3), - } - for task in tasks_data - ]) - - schema = pa.schema( - [ - pa.field("id", pa.int32(), False), - pa.field("submission_time", pa.int64(), False), - pa.field("duration", pa.int64(), False), - pa.field("cpu_count", pa.int32(), False), - pa.field("cpu_capacity", pa.float64(), False), - pa.field("mem_capacity", pa.int64(), False), - ] - ) - return pa.Table.from_pandas(tasks_df, schema=schema, preserve_index=False) - - -def fragments_to_table(fragments_data: Iterable[Mapping[str, object]]) -> pa.Table: - frags_df = pd.DataFrame([ - { - "id": frag.get("id", 0), - "duration": frag.get("duration", 10000), - "cpu_count": 1, - "cpu_usage": frag.get("cpu_usage", 0.5), - } - for frag in fragments_data - ]) - - schema = pa.schema( - [ - pa.field("id", pa.int32(), False), - pa.field("duration", pa.int64(), False), - pa.field("cpu_count", pa.int32(), False), - pa.field("cpu_usage", pa.float64(), False), - ] - ) - return pa.Table.from_pandas(frags_df, schema=schema, preserve_index=False) diff --git a/src/opendt/core/simulation/opendc/__init__.py b/src/opendt/core/simulation/opendc/__init__.py deleted file mode 100644 index 
6afc4b3..0000000 --- a/src/opendt/core/simulation/opendc/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -"""Bundled OpenDC simulator binaries.""" - -from pathlib import Path - -__all__ = ["SIMULATOR_ROOT"] - -SIMULATOR_ROOT = Path(__file__).resolve().parent -"""Path to the packaged OpenDC simulator assets.""" diff --git a/src/opendt/core/simulation/runner.py b/src/opendt/core/simulation/runner.py deleted file mode 100644 index 4717f8f..0000000 --- a/src/opendt/core/simulation/runner.py +++ /dev/null @@ -1,264 +0,0 @@ -"""Wrapper around the OpenDC experiment runner binary.""" -from __future__ import annotations - -import json -import logging -import os -import subprocess -from pathlib import Path -from typing import Any, Iterable, Mapping - -import pandas as pd -import pyarrow.parquet as pq - -from .adapters import ensure_workload_dir, fragments_to_table, tasks_to_table -from .opendc import SIMULATOR_ROOT - -logger = logging.getLogger(__name__) - - -class OpenDCRunner: - """OpenDC ExperimentRunner with comprehensive path detection and diagnostics.""" - - def __init__(self) -> None: - package_root = SIMULATOR_ROOT - possible_paths = [ - package_root / "bin" / "OpenDCExperimentRunner" / "bin" / "OpenDCExperimentRunner", - package_root / "bin" / "OpenDCExperimentRunner" / "OpenDCExperimentRunner", - Path("/app/opendt-simulator/bin/OpenDCExperimentRunner/bin/OpenDCExperimentRunner"), - Path("/app/opendt-simulator/bin/OpenDCExperimentRunner/OpenDCExperimentRunner"), - Path("/app/opendc/bin/OpenDCExperimentRunner/bin/OpenDCExperimentRunner"), - Path("/app/opendc/bin/OpenDCExperimentRunner/OpenDCExperimentRunner"), - Path("./opendt-simulator/bin/OpenDCExperimentRunner/bin/OpenDCExperimentRunner"), - ] - - self.opendc_path: str | None = None - self._force_shell: bool = False - - logger.info("πŸ” Searching for OpenDC runner...") - for candidate in possible_paths: - logger.info("Checking: %s", candidate) - logger.info(" - Exists: %s", candidate.exists()) - if not 
candidate.exists(): - continue - - if candidate.is_file(): - if os.access(candidate, os.X_OK): - self.opendc_path = str(candidate) - logger.info("βœ… Found executable OpenDC runner: %s", candidate) - break - logger.warning("⚠️ OpenDC found but not executable, fixing perms: %s", candidate) - try: - os.chmod(candidate, 0o755) - if os.access(candidate, os.X_OK): - self.opendc_path = str(candidate) - logger.info("βœ… Fixed permissions for OpenDC runner: %s", candidate) - break - except Exception as exc: # pragma: no cover - defensive logging path - logger.error("❌ Failed to chmod OpenDC runner: %s", exc) - - # Persist the candidate for a shell-based fallback in environments - # where chmod is a no-op (e.g., CIFS/NTFS bind mounts on Docker Desktop). - if self.opendc_path is None: - self.opendc_path = str(candidate) - self._force_shell = True - logger.info( - "πŸ” Falling back to /bin/sh execution for OpenDC runner: %s", - candidate, - ) - break - - logger.info("πŸ“ Directory structure:") - for base in [package_root, Path("/app/opendt-simulator"), Path("/app/opendc")]: - if Path(base).exists(): - logger.info("Contents of %s:", base) - try: - for item in Path(base).rglob("*OpenDC*"): - if item.is_file(): - size = item.stat().st_size - perms = oct(item.stat().st_mode)[-3:] - execb = os.access(str(item), os.X_OK) - logger.info(" πŸ“„ %s [%s bytes, %s, exec: %s]", item, size, perms, execb) - except Exception as exc: # pragma: no cover - defensive logging path - logger.error("Error listing %s: %s", base, exc) - - self.base_experiment = { - "name": "opendt-simulation", - "exportModels": [ - { - "exportInterval": 150, - "filesToExport": ["powerSource", "host", "task", "service"], - "computeExportConfig": { - "powerSourceExportColumns": ["energy_usage", "power_draw"] - }, - } - ], - } - - def create_workload( - self, - tasks_data: Iterable[Mapping[str, Any]] | None, - fragments_data: Iterable[Mapping[str, Any]] | None, - ) -> str: - workload_dir = ensure_workload_dir() - - if 
tasks_data: - tasks_table = tasks_to_table(tasks_data) - pq.write_table(tasks_table, workload_dir / "tasks.parquet") - logger.info("πŸ“„ Created tasks.parquet with %s tasks", tasks_table.num_rows) - - if fragments_data: - frags_table = fragments_to_table(fragments_data) - pq.write_table(frags_table, workload_dir / "fragments.parquet") - logger.info("πŸ“„ Created fragments.parquet with %s fragments", frags_table.num_rows) - - return str(workload_dir) - - def run_simulation( - self, - tasks_data: Iterable[Mapping[str, Any]] | None, - fragments_data: Iterable[Mapping[str, Any]] | None, - topology_data: Mapping[str, Any] | None, - expName: str = "simple", - ) -> dict[str, Any]: - if not self.opendc_path: - raise FileNotFoundError( - "OpenDC runner executable was not found. Ensure the simulator binaries " - "are available and accessible before invoking the simulation." - ) - - workload_path = self.create_workload(tasks_data, fragments_data) - - topology_file = Path("/tmp/topology.json") - topology_file.write_text(json.dumps(topology_data, indent=2)) - logger.info("πŸ“„ Created topology: %s", topology_file) - - experiment = dict(self.base_experiment) - experiment["name"] = expName - experiment.update( - { - "topologies": [{"pathToFile": str(topology_file)}], - "workloads": [{"pathToFile": workload_path, "type": "ComputeWorkload"}], - } - ) - experiment_file = Path("/tmp/experiment.json") - experiment_file.write_text(json.dumps(experiment, indent=2)) - logger.info("πŸ“„ Created experiment: %s", experiment_file) - - logger.info("πŸš€ Running OpenDC simulation: %s", self.opendc_path) - env = os.environ.copy() - env.setdefault("JAVA_HOME", "/usr/lib/jvm/java-21-openjdk-amd64") - - command = [self.opendc_path, "--experiment-path", str(experiment_file)] - - def _run(cmd: list[str]) -> subprocess.CompletedProcess[str]: - return subprocess.run( - cmd, - capture_output=True, - text=True, - timeout=120, - env=env, - ) - - if not os.access(self.opendc_path, os.X_OK): - 
logger.warning( - "⚠️ OpenDC runner lacks execute bit (%s). Will invoke through /bin/sh.", - self.opendc_path, - ) - self._force_shell = True - - try: - if self._force_shell: - shell_command = ["/bin/sh", *command] - logger.info("πŸͺ„ Launching OpenDC runner via /bin/sh: %s", shell_command) - result = _run(shell_command) - else: - result = _run(command) - except subprocess.TimeoutExpired as exc: - logger.error("OpenDC simulation timed out: %s", exc) - raise TimeoutError("OpenDC simulation timed out after 120 seconds") from exc - except PermissionError as exc: - logger.warning( - "🚫 Permission denied executing OpenDC directly (%s). Retrying through /bin/sh.", - exc, - ) - self._force_shell = True - shell_command = ["/bin/sh", *command] - result = _run(shell_command) - except Exception as exc: # pragma: no cover - defensive logging path - logger.error("OpenDC execution failed: %s", exc) - raise RuntimeError(f"OpenDC execution failed: {exc}") from exc - logger.info("OpenDC return code: %s", result.returncode) - if result.stdout: - logger.info("OpenDC stdout: %s", result.stdout) - if result.stderr: - logger.info("OpenDC stderr: %s", result.stderr) - - if result.returncode != 0: - raise RuntimeError( - "OpenDC simulation failed with exit code " - f"{result.returncode}. 
stdout: {result.stdout!r} stderr: {result.stderr!r}" - ) - - logger.info("βœ… OpenDC simulation completed successfully") - return self.parse_opendc_results() - - def parse_opendc_results(self) -> dict[str, Any]: - try: - output_dirs = [ - Path("output/opendt-simulation/raw-output/0/seed=0"), - Path("./output/simple/raw-output/0/seed=0"), - Path("/tmp/output"), - Path(os.environ.get("OPENDT_SIM_DIR") or "/app/output/opendt-simulation/raw-output"), - ] - - power_df = host_df = service_df = None - for odir in output_dirs: - if not odir.exists(): - continue - pfile = odir / "powerSource.parquet" - hfile = odir / "host.parquet" - sfile = odir / "service.parquet" - if pfile.exists(): - power_df = pd.read_parquet(pfile) - if hfile.exists(): - host_df = pd.read_parquet(hfile) - if sfile.exists(): - service_df = pd.read_parquet(sfile) - if power_df is not None or host_df is not None: - break - - if power_df is not None and len(power_df) > 0: - energy_kwh = power_df["energy_usage"].sum() / 3_600_000 - max_power = float(power_df["power_draw"].max()) - else: - energy_kwh, max_power = 0.0, 0.0 - - if host_df is not None and len(host_df) > 0 and "cpu_utilization" in host_df.columns: - cpu_util = float(host_df["cpu_utilization"].mean()) - else: - cpu_util = 0.0 - - if service_df is not None and len(service_df) > 0 and "timestamp" in service_df.columns: - runtime_ms = service_df["timestamp"].max() - service_df["timestamp"].min() - runtime_hours = float(runtime_ms) / (1000 * 3600) - else: - runtime_hours = 0.0 - - return { - "energy_kwh": round(float(energy_kwh), 4), - "cpu_utilization": round(float(cpu_util), 3), - "max_power_draw": round(float(max_power), 1), - "runtime_hours": round(float(runtime_hours), 2), - "status": "success", - } - except Exception as exc: # pragma: no cover - defensive logging path - logger.error("Failed to parse OpenDC results: %s", exc) - return { - "energy_kwh": 0.0, - "cpu_utilization": 0.0, - "max_power_draw": 0.0, - "runtime_hours": 0.0, - "status": 
"error", - } - diff --git a/src/opendt/core/workers/__init__.py b/src/opendt/core/workers/__init__.py deleted file mode 100644 index d634a6b..0000000 --- a/src/opendt/core/workers/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Threading utilities for orchestrator background work.""" diff --git a/src/opendt/core/workers/scheduler.py b/src/opendt/core/workers/scheduler.py deleted file mode 100644 index 4e47b24..0000000 --- a/src/opendt/core/workers/scheduler.py +++ /dev/null @@ -1,11 +0,0 @@ -"""Utilities for spawning background worker threads.""" -from __future__ import annotations - -import threading -from typing import Callable - - -def start_thread(target: Callable[[], None], *, daemon: bool = False) -> threading.Thread: - thread = threading.Thread(target=target, daemon=daemon) - thread.start() - return thread diff --git a/src/opendt/core/workers/tasks.py b/src/opendt/core/workers/tasks.py deleted file mode 100644 index 2b3bbb8..0000000 --- a/src/opendt/core/workers/tasks.py +++ /dev/null @@ -1,8 +0,0 @@ -"""Task helpers for background orchestration jobs.""" -from __future__ import annotations - -from typing import Callable - - -def run_task(task: Callable[[], None]) -> None: - task() diff --git a/src/opendt/logging.py b/src/opendt/logging.py deleted file mode 100644 index fd21e58..0000000 --- a/src/opendt/logging.py +++ /dev/null @@ -1,45 +0,0 @@ -"""Application-wide logging configuration with JSON output.""" -from __future__ import annotations - -import json -import logging -import os -from datetime import datetime, timezone - - -class JSONLogFormatter(logging.Formatter): - """Serialize log records as structured JSON for easy filtering.""" - - def format(self, record: logging.LogRecord) -> str: - payload = { - "timestamp": datetime.fromtimestamp(record.created, tz=timezone.utc).isoformat(), - "level": record.levelname, - "logger": record.name, - "message": record.getMessage(), - } - if record.exc_info: - payload["exc_info"] = self.formatException(record.exc_info) 
- for key, value in record.__dict__.items(): - if key.startswith("_"): - continue - if key in payload or key in {"args", "created", "exc_info", "exc_text", "filename", "funcName", - "levelname", "levelno", "lineno", "message", "module", "msecs", - "msg", "name", "pathname", "process", "processName", "relativeCreated", - "stack_info", "thread", "threadName"}: - continue - payload[key] = value - return json.dumps(payload, ensure_ascii=False) - - -def configure_logging() -> None: - level = os.environ.get("OPENDT_LOG_LEVEL", "INFO").upper() - handler = logging.StreamHandler() - handler.setFormatter(JSONLogFormatter()) - - root = logging.getLogger() - root.handlers.clear() - root.addHandler(handler) - root.setLevel(level) - - -configure_logging() diff --git a/tests/api/test_routes.py b/tests/api/test_routes.py deleted file mode 100644 index dace26e..0000000 --- a/tests/api/test_routes.py +++ /dev/null @@ -1,150 +0,0 @@ -"""API route integration tests.""" -from __future__ import annotations - -from copy import deepcopy -import json -from pathlib import Path - -import pytest - -from opendt.app import create_app -from opendt.api.dependencies import get_orchestrator - - -@pytest.fixture -def client(): - app = create_app() - return app.test_client() - - -def test_set_slo_endpoint_updates_targets(client): - orchestrator = get_orchestrator() - slo_path = Path(orchestrator.slo_path) - slo_path.parent.mkdir(parents=True, exist_ok=True) - backup_path = Path(str(slo_path) + ".backup") - - original = slo_path.read_text() if slo_path.exists() else None - backup_original = backup_path.read_text() if backup_path.exists() else None - - try: - response = client.post("/api/set_slo", json={"energy_target": 5, "runtime_target": 1}) - assert response.status_code == 200 - data = response.get_json() - assert data["status"] == "success" - assert data["energy_target"] == 5 - assert data["runtime_target"] == 1 - assert orchestrator.slo_targets["energy_target"] == 5 - assert 
orchestrator.slo_targets["runtime_target"] == 1 - - persisted = json.loads(slo_path.read_text()) - assert persisted["energy_target"] == 5 - assert persisted["runtime_target"] == 1 - finally: - if original is not None: - slo_path.write_text(original) - elif slo_path.exists(): - slo_path.unlink() - - if backup_original is not None: - backup_path.write_text(backup_original) - elif backup_path.exists(): - backup_path.unlink() - - -def test_accept_recommendation_requires_existing_topology(client): - orchestrator = get_orchestrator() - previous = orchestrator.state.get("best_config") - try: - orchestrator.state["best_config"] = None - response = client.post("/api/accept_recommendation") - assert response.status_code == 400 - assert response.get_json()["error"] == "No recommendation available" - finally: - orchestrator.state["best_config"] = previous - - -def test_accept_recommendation_applies_best_config(client, monkeypatch): - orchestrator = get_orchestrator() - previous = orchestrator.state.get("best_config") - previous_updates = orchestrator.state.get("topology_updates", 0) - - staged = {"clusters": [{"name": "A", "hosts": []}]} - applied = {} - - def fake_update_topology(new_topology): - applied["topology"] = deepcopy(new_topology) - orchestrator.state["current_topology"] = new_topology - orchestrator.state["topology_updates"] = orchestrator.state.get("topology_updates", 0) + 1 - return True - - monkeypatch.setattr(orchestrator, "update_topology_file", fake_update_topology) - - try: - orchestrator.state["best_config"] = {"config": staged, "score": 1.0} - response = client.post("/api/accept_recommendation") - assert response.status_code == 200 - data = response.get_json() - assert data["applied_config"] == staged - assert applied["topology"] == staged - assert orchestrator.state["best_config"]["config"] == staged - finally: - orchestrator.state["best_config"] = previous - orchestrator.state["topology_updates"] = previous_updates - - -def 
test_accept_recommendation_allows_custom_payload(client, monkeypatch): - orchestrator = get_orchestrator() - previous = orchestrator.state.get("best_config") - previous_updates = orchestrator.state.get("topology_updates", 0) - - target = { - "clusters": [ - { - "name": "B", - "hosts": [ - { - "name": "H1", - "count": 2, - "cpu": {"coreCount": 16, "coreSpeed": 2400}, - "memory": {"memorySize": 34359738368}, - } - ], - } - ] - } - - applied = {} - - def fake_update_topology(new_topology): - applied["topology"] = deepcopy(new_topology) - orchestrator.state["topology_updates"] = orchestrator.state.get("topology_updates", 0) + 1 - return True - - monkeypatch.setattr(orchestrator, "update_topology_file", fake_update_topology) - - try: - orchestrator.state["best_config"] = {"config": {"clusters": []}, "score": 3.2} - response = client.post("/api/accept_recommendation", json={"topology": target}) - assert response.status_code == 200 - data = response.get_json() - assert data["applied_config"] == target - assert applied["topology"] == target - assert orchestrator.state["best_config"]["config"] == target - finally: - orchestrator.state["best_config"] = previous - orchestrator.state["topology_updates"] = previous_updates - - -def test_accept_recommendation_reports_failure(client, monkeypatch): - orchestrator = get_orchestrator() - previous = orchestrator.state.get("best_config") - - monkeypatch.setattr(orchestrator, "update_topology_file", lambda *_: False) - - try: - orchestrator.state["best_config"] = {"config": {"clusters": []}} - response = client.post("/api/accept_recommendation") - assert response.status_code == 500 - assert response.get_json()["error"] == "Failed to update topology file" - finally: - orchestrator.state["best_config"] = previous diff --git a/tests/config/test_settings.py b/tests/config/test_settings.py deleted file mode 100644 index 82c351a..0000000 --- a/tests/config/test_settings.py +++ /dev/null @@ -1,11 +0,0 @@ -"""Unit tests for configuration 
helpers.""" - -from opendt.config import settings - - -def test_openai_api_key_reads_environment(monkeypatch): - """Ensure the OpenAI API key is sourced from environment variables.""" - - monkeypatch.setenv("OPENAI_API_KEY", "env-key") - - assert settings.openai_api_key() == "env-key" diff --git a/tests/conftest.py b/tests/conftest.py deleted file mode 100644 index 0a205fa..0000000 --- a/tests/conftest.py +++ /dev/null @@ -1,58 +0,0 @@ -"""Shared pytest fixtures and Kafka stubs used across the test suite.""" - -import sys -from pathlib import Path -import types - -import pytest - - -ROOT = Path(__file__).resolve().parents[1] -SRC = ROOT / "src" -if str(SRC) not in sys.path: - sys.path.insert(0, str(SRC)) - - -fake_kafka = types.ModuleType("kafka") - -fake_dotenv = types.ModuleType("dotenv") -fake_dotenv.load_dotenv = lambda *args, **kwargs: None - - -class _KafkaProducer: - """Minimal KafkaProducer stub used to satisfy orchestrator imports.""" - def __init__(self, *args, **kwargs): - pass - - def send(self, *args, **kwargs): - pass - - def flush(self): - pass - - -class _KafkaConsumer: - """Minimal KafkaConsumer stub returning an empty iterator.""" - def __init__(self, *args, **kwargs): - pass - - def __iter__(self): - return iter(()) - - -fake_kafka.KafkaProducer = _KafkaProducer -fake_kafka.KafkaConsumer = _KafkaConsumer - -sys.modules.setdefault("kafka", fake_kafka) -sys.modules.setdefault("dotenv", fake_dotenv) - -from opendt.api.dependencies import get_orchestrator - - -@pytest.fixture(scope="session", autouse=True) -def stop_background_threads(): - """Ensure the global orchestrator threads do not interfere with tests.""" - orchestrator = get_orchestrator() - orchestrator.stop_event.set() - yield - orchestrator.stop_event.set() diff --git a/tests/ingestion/test_consumer.py b/tests/ingestion/test_consumer.py deleted file mode 100644 index 3d32f2e..0000000 --- a/tests/ingestion/test_consumer.py +++ /dev/null @@ -1,34 +0,0 @@ -"""End-to-end tests for assembling 
DigitalTwinConsumer telemetry windows.""" - -from datetime import datetime, timedelta - -import pandas as pd - -from opendt.adapters.ingestion.kafka.consumer import DigitalTwinConsumer - - -def _iso(ts: datetime) -> str: - """Return the ISO-8601 string representation of ``ts``.""" - return ts.isoformat() - - -def test_create_batch_compiles_window_data(): - """Create a window with tasks/fragments and ensure ``create_batch`` aggregates it.""" - consumer = DigitalTwinConsumer(bootstrap_servers="localhost:9092", kafka_group_id="test") - now = datetime.utcnow() - task = {"id": 1, "submission_time": _iso(now), "duration": 100, "cpu_count": 2, "cpu_capacity": 2.4, "mem_capacity": 1024} - fragment = {"id": 1, "submission_time": _iso(now + timedelta(seconds=10)), "duration": 50, "cpu_usage": 0.5} - - with consumer.windows_lock: - window = consumer._DigitalTwinConsumer__add_to_window(task, "tasks") - consumer._DigitalTwinConsumer__add_to_window(fragment, "fragments") - window["ready"] = True - - batch = consumer.create_batch(window_number=1) - - assert batch["task_count"] == 1 - assert batch["fragment_count"] == 1 - assert isinstance(batch["avg_cpu_usage"], float) - assert batch["tasks_sample"][0]["id"] == 1 - assert batch["fragments_sample"][0]["id"] == 1 - assert "window_info" in batch diff --git a/tests/ingestion/test_producer.py b/tests/ingestion/test_producer.py deleted file mode 100644 index a37e474..0000000 --- a/tests/ingestion/test_producer.py +++ /dev/null @@ -1,61 +0,0 @@ -"""Behavioral tests for the TimedKafkaProducer streaming helpers.""" - -from datetime import datetime - -import pandas as pd - -from opendt.adapters.ingestion.kafka.producer import TimedKafkaProducer - - -class DummyProducer: - """Kafka producer test double that records outbound messages.""" - def __init__(self): - self.messages = [] - - def send(self, topic, key=None, value=None): - self.messages.append((topic, key, value)) - - def flush(self): - pass - - -def 
test_tasks_streaming_thread_sends_in_order(monkeypatch): - """Ensure tasks are emitted sequentially with the correct key and payload.""" - producer = TimedKafkaProducer(bootstrap_servers="localhost:9092") - producer.start_streaming_barrier = type("B", (), {"wait": staticmethod(lambda: None)}) - producer.producer = DummyProducer() - - now = datetime.utcnow() - tasks = pd.DataFrame([ - {"id": 1, "submission_time": now, "duration": 100, "cpu_count": 2, "cpu_capacity": 2.4, "mem_capacity": 1024}, - {"id": 2, "submission_time": now, "duration": 200, "cpu_count": 4, "cpu_capacity": 3.0, "mem_capacity": 2048}, - ]) - - monkeypatch.setattr("opendt.adapters.ingestion.kafka.producer.sleep", lambda _: None) - producer.tasks_streaming_thread(tasks, now) - - assert len(producer.producer.messages) == 2 - first = producer.producer.messages[0] - assert first[0] == "tasks" - assert first[1]["id"] == 1 - assert first[2]["duration"] == 100 - - -def test_fragments_streaming_thread_sends(monkeypatch): - """Verify fragment payloads are flushed to the expected Kafka topic.""" - producer = TimedKafkaProducer(bootstrap_servers="localhost:9092") - producer.start_streaming_barrier = type("B", (), {"wait": staticmethod(lambda: None)}) - producer.producer = DummyProducer() - - now = datetime.utcnow() - frags = pd.DataFrame([ - {"id": 1, "submission_time": now, "duration": 50, "cpu_usage": 0.5}, - {"id": 1, "submission_time": now, "duration": 60, "cpu_usage": 0.6}, - ]) - - monkeypatch.setattr("opendt.adapters.ingestion.kafka.producer.sleep", lambda _: None) - producer.fragments_streaming_thread(frags, now) - - assert len(producer.producer.messages) == 2 - topics = {topic for topic, *_ in producer.producer.messages} - assert topics == {"fragments"} diff --git a/tests/optimization/test_llm.py b/tests/optimization/test_llm.py deleted file mode 100644 index 60f1418..0000000 --- a/tests/optimization/test_llm.py +++ /dev/null @@ -1,305 +0,0 @@ -"""Unit tests for the LLM optimization strategies and 
topology translation.""" - -import json -import os -import sys -import types - -import pytest - -from opendt.core.optimization.llm import LLM - - -@pytest.fixture(scope="module") -def openai_key(): - """Return the configured OpenAI API key so tests exercise the real credential.""" - - key = os.environ.get("OPENAI_API_KEY") - if not key: - raise RuntimeError("OPENAI_API_KEY must be set for LLM integration tests") - return key - - -def sample_topology(): - """Return a minimal topology fixture for optimizer exercises.""" - return { - "clusters": [ - { - "name": "C01", - "hosts": [ - { - "name": "H01", - "count": 2, - "cpu": {"coreCount": 16, "coreSpeed": 2400}, - "memory": {"memorySize": 34359738368}, - } - ], - } - ] - } - - -def test_rule_based_optimization_downscales_energy(monkeypatch): - """The rule-based optimizer should downscale when consumption exceeds the SLO.""" - optimizer = LLM(openai_key=None) - topology = sample_topology() - sim_results = {"energy_kwh": 15.0, "runtime_hours": 1.5, "cpu_utilization": 0.7} - batch = {"task_count": 5} - slo = {"energy_target": 10.0, "runtime_target": 2.0} - - result = optimizer.rule_based_optimization(sim_results, batch, slo, current_topology=topology) - - assert result["type"] == "rule_based" - assert result["action_taken"] in {"downscale", "massive downscale"} - assert result["new_topology"]["clusters"][0]["hosts"][0]["count"] < topology["clusters"][0]["hosts"][0]["count"] - - -def test_rule_based_tracks_best_configuration(): - """Persist the best configuration score after an optimization run.""" - optimizer = LLM(openai_key=None) - topology = sample_topology() - sim_results = {"energy_kwh": 5.0, "runtime_hours": 1.0} - - optimizer.rule_based_optimization(sim_results, {"task_count": 2}, {"energy_target": 10.0, "runtime_target": 2.0}, current_topology=topology) - assert optimizer.best_config is not None - assert optimizer.best_score < float("inf") - - -def test_rule_based_returns_best_config_snapshot(): - """Rule-based 
results should expose the cached best configuration snapshot.""" - - optimizer = LLM(openai_key=None) - topology = sample_topology() - - result = optimizer.rule_based_optimization( - {"energy_kwh": 6.0, "runtime_hours": 1.1}, - {"task_count": 3}, - {"energy_target": 10.0, "runtime_target": 2.0}, - current_topology=topology, - ) - - assert result["best_config"] is not None - assert result["best_config"] is not topology - assert result["best_config"]["clusters"][0]["hosts"][0]["cpu"]["coreCount"] == 16 - assert result["best_score"] == optimizer.best_score - assert result["best_score"] == pytest.approx(optimizer.best_score) - - -def test_convert_llm_to_topology_adds_hosts(openai_key): - """Ensure LLM recommendations are merged as new hosts into the topology.""" - optimizer = LLM(openai_key=openai_key) - topology = sample_topology() - rec = type("Obj", (), { - "cluster_name": ["C01", "C02"], - "host_name": ["H02", "H99"], - "count": [1, 2], - "coreCount": [12, 20], - "coreSpeed": [2200, 2600], - })() - - new_topology = optimizer.convert_llm_to_topology(rec, topology) - - cluster_names = {c["name"] for c in new_topology["clusters"]} - assert "C02" in cluster_names - host_counts = [h["count"] for c in new_topology["clusters"] for h in c["hosts"] if h["name"] in {"H02", "H99"}] - assert host_counts == [1, 2] - - -def test_extract_text_content_handles_structured_payloads(openai_key): - """The helper should flatten LangChain message payloads into a JSON string.""" - optimizer = LLM(openai_key=openai_key) - - message = type( - "Message", - (), - { - "content": [ - {"type": "text", "text": "{"}, - "\"foo\"", - type("Chunk", (), {"text": ": \"bar\"}"})(), - ] - }, - )() - - flattened = optimizer._extract_text_content(message) - assert flattened == '{"foo": "bar"}' - - -def test_llm_optimization_parses_ai_message_list(monkeypatch, openai_key): - """LLM.optimize should parse structured AIMessage content returned by LangChain.""" - - optimizer = LLM(openai_key=openai_key) - 
topology = sample_topology() - sim_results = {"energy_kwh": 9.0, "runtime_hours": 1.2, "cpu_utilization": 0.5} - batch = {"task_count": 4, "fragment_count": 12, "avg_cpu_usage": 0.4} - slo = {"energy_target": 10.0, "runtime_target": 2.0} - - payload = { - "cluster_name": ["C01"], - "host_name": ["H02"], - "count": [1], - "coreCount": [24], - "coreSpeed": [2500], - } - - class DummyChatModule: - class ChatOpenAI: # noqa: D401 - simple stub for tests - def __init__(self, *_, api_key=None, **__): # pragma: no cover - trivial - assert api_key == openai_key - - def invoke(self, prompt): # pragma: no cover - simple deterministic stub - assert "SIMULATION RESULTS" in prompt - return type( - "AIMessage", - (), - { - "content": [ - {"type": "text", "text": json.dumps(payload)}, - ] - }, - )() - - monkeypatch.setitem(sys.modules, "langchain_openai", DummyChatModule) - monkeypatch.setitem(sys.modules, "langchain_core", types.ModuleType("langchain_core")) - parser_module = types.ModuleType("langchain_core.output_parsers") - - class DummyParser: - def __init__(self, pydantic_object): # pragma: no cover - simple stub - self._model = pydantic_object - - def get_format_instructions(self): # pragma: no cover - simple stub - return "Return a JSON object with topology recommendations." 
- - def parse(self, content): # pragma: no cover - simple stub - data = json.loads(content) - return self._model(**data) - - parser_module.JsonOutputParser = DummyParser - monkeypatch.setitem(sys.modules, "langchain_core.output_parsers", parser_module) - - result = optimizer.llm_optimization(sim_results, batch, slo, current_topology=topology) - - assert result["type"] == "llm" - assert result["recommendations"]["host_name"] == payload["host_name"] - assert any(host["name"] == "H02" for c in result["new_topology"]["clusters"] for host in c["hosts"]) - - -def test_optimize_without_key_returns_rule_based(): - """When no API key is configured the optimizer must fall back to the rule-based engine.""" - - optimizer = LLM(openai_key=None) - topology = sample_topology() - - outcome = optimizer.optimize( - simulation_results={"energy_kwh": 12.0, "runtime_hours": 1.8}, - batch_data={"task_count": 6}, - slo_targets={"energy_target": 10.0, "runtime_target": 2.0}, - current_topology=topology, - ) - - assert outcome["type"] == "rule_based" - assert "No OpenAI API key" in outcome["reason"] - assert optimizer.best_config is None - - -def test_optimize_falls_back_when_llm_errors(monkeypatch, openai_key): - """If the LLM call fails the optimizer should gracefully fall back to the rule-based plan.""" - - optimizer = LLM(openai_key=openai_key) - - def explode(*_args, **_kwargs): - raise RuntimeError("boom") - - monkeypatch.setattr(optimizer, "llm_optimization", explode) - - fallback = optimizer.optimize( - simulation_results={"energy_kwh": 8.0, "runtime_hours": 1.1}, - batch_data={"task_count": 3}, - slo_targets={"energy_target": 10.0, "runtime_target": 2.0}, - current_topology=sample_topology(), - ) - - assert fallback["type"] == "rule_based" - assert fallback["reason"].startswith("LLM Error: boom") - - -def test_convert_llm_to_topology_updates_existing_host(openai_key): - """Existing topology entries should be updated rather than duplicated.""" - - optimizer = 
LLM(openai_key=openai_key) - topology = sample_topology() - - recommendations = { - "cluster_name": ["C01"], - "host_name": ["H01"], - "count": [4], - "coreCount": [28], - "coreSpeed": [2600], - } - - updated = optimizer.convert_llm_to_topology(recommendations, topology) - host = updated["clusters"][0]["hosts"][0] - - assert host["count"] == 4 - assert host["cpu"]["coreCount"] == 28 - assert host["cpu"]["coreSpeed"] == 2600 - - -def test_convert_llm_to_topology_uses_defaults_for_missing_fields(openai_key): - """Missing optional recommendation fields should fall back to sensible defaults.""" - - optimizer = LLM(openai_key=openai_key) - original = {"clusters": []} - rec = { - "cluster_name": ["C07"], - "host_name": ["H11"], - "coreCount": [20], - # intentionally omit count and coreSpeed to trigger defaults - } - - updated = optimizer.convert_llm_to_topology(rec, original) - - assert original == {"clusters": []} # ensure we didn't mutate the input - host = updated["clusters"][0]["hosts"][0] - assert host["count"] == 1 - assert host["cpu"]["coreCount"] == 20 - assert host["cpu"]["coreSpeed"] == 2400 - - -def test_update_best_configuration_only_on_improvement(openai_key): - """Only better performance scores should replace the stored best configuration.""" - - optimizer = LLM(openai_key=openai_key) - initial = sample_topology() - worse = {"clusters": [{"name": "C99", "hosts": []}]} - - optimizer.update_best_configuration({"energy_kwh": 12.0, "runtime_hours": 2.5}, worse) - best_before = optimizer.best_config - - optimizer.update_best_configuration({"energy_kwh": 6.0, "runtime_hours": 1.0}, initial) - - assert optimizer.best_config is not best_before - assert optimizer.best_config["clusters"][0]["name"] == "C01" - assert optimizer.best_config["clusters"][0]["hosts"][0]["cpu"]["coreCount"] == 16 - assert optimizer.best_score == pytest.approx(13.0) - - -def test_extract_text_content_handles_non_list_payloads(openai_key): - """Extractor should gracefully process raw strings 
and mapping-based payloads.""" - - optimizer = LLM(openai_key=openai_key) - - assert optimizer._extract_text_content(" hello \n") == "hello" - - message = type("Message", (), {"content": {"text": "ignored", "other": 1}})() - assert optimizer._extract_text_content(message) == "{'text': 'ignored', 'other': 1}" - - -def test_extract_text_content_handles_empty_message(openai_key): - """Empty or None messages should result in an empty string.""" - - optimizer = LLM(openai_key=openai_key) - - assert optimizer._extract_text_content(None) == "" - blank_message = type("Message", (), {"content": []})() - assert optimizer._extract_text_content(blank_message) == "" diff --git a/tests/orchestrator/test_controller.py b/tests/orchestrator/test_controller.py deleted file mode 100644 index bb31c16..0000000 --- a/tests/orchestrator/test_controller.py +++ /dev/null @@ -1,109 +0,0 @@ -"""Integration tests for the main orchestrator module.""" - -import json -from pathlib import Path - -import pytest - -from opendt.core.orchestrator.controller import OpenDTOrchestrator - - -@pytest.fixture -def orchestrator(monkeypatch): - """Instantiate an orchestrator without background watchers for isolated tests.""" - monkeypatch.setattr(OpenDTOrchestrator, "start_topology_watcher", lambda self: None) - monkeypatch.setattr(OpenDTOrchestrator, "start_slo_watcher", lambda self: None) - monkeypatch.setattr(OpenDTOrchestrator, "_ensure_slo_file", lambda self: None) - orch = OpenDTOrchestrator() - orch.stop_event.set() - return orch - - -def test_score_prefers_lower_values(orchestrator): - """Lower energy/runtime metrics should produce a higher score.""" - orchestrator.slo_targets = {"energy_target": 10.0, "runtime_target": 2.0} - better = orchestrator._score({"energy_kwh": 9.0, "runtime_hours": 1.8}) - worse = orchestrator._score({"energy_kwh": 15.0, "runtime_hours": 2.5}) - assert better < worse - assert better < 0 # below target should produce a negative (better-than-SLO) delta - - -def 
test_topology_update_creates_backup(orchestrator, tmp_path): - """Updating the topology should persist a backup and increment counters.""" - topology_path = tmp_path / "topology.json" - initial = {"clusters": [{"name": "A", "hosts": []}]} - topology_path.write_text(json.dumps(initial)) - - orch = orchestrator - orch.topology_path = str(topology_path) - orch.state["current_topology"] = initial - orch.state["topology_updates"] = 0 - orch.last_topology_hash = orch._topo_hash(initial) - - new_topology = {"clusters": [{"name": "A", "hosts": [{"name": "H1", "count": 1, "cpu": {"coreCount": 8, "coreSpeed": 2200}, "memory": {"memorySize": 1024}}]}]} - updated = orch.update_topology_file(new_topology) - - assert updated is True - written = json.loads(topology_path.read_text()) - assert written == new_topology - backup = json.loads(Path(str(topology_path) + ".backup").read_text()) - assert backup == initial - assert orch.state["topology_updates"] == 1 - - -def test_update_slo_file_persists_and_backs_up(orchestrator, tmp_path): - """SLO updates should be written to disk and previous values backed up.""" - slo_path = tmp_path / "slo.json" - - orch = orchestrator - orch.slo_path = str(slo_path) - orch.last_slo_hash = None - - first = orch.update_slo_file({"energy_target": 8.0, "runtime_target": 1.5}) - assert first == "applied" - written = json.loads(slo_path.read_text()) - assert written["energy_target"] == pytest.approx(8.0) - assert orch.slo_targets["runtime_target"] == pytest.approx(1.5) - - second = orch.update_slo_file({"energy_target": 7.5, "runtime_target": 1.25}) - assert second == "applied" - backup = json.loads((tmp_path / "slo.json.backup").read_text()) - assert backup["energy_target"] == pytest.approx(8.0) - - noop = orch.update_slo_file({"energy_target": 7.5, "runtime_target": 1.25}) - assert noop == "noop" - - -def test_run_simulation_passes_window_data(orchestrator): - """Simulations must receive the sampled window data and topology snapshot.""" - captured = {} 
- - def fake_run_simulation(*, tasks_data, fragments_data, topology_data, expName="simple"): - captured["tasks"] = tasks_data - captured["fragments"] = fragments_data - captured["topology"] = topology_data - captured["expName"] = expName - return {"energy_kwh": 1.0} - - orch = orchestrator - orch.opendc_runner = type("R", (), {"run_simulation": staticmethod(fake_run_simulation)}) - orch.state["current_topology"] = {"clusters": []} - - batch = { - "tasks_sample": [{"id": 1}], - "fragments_sample": [{"id": 1}], - } - result = orch.run_simulation(batch, expName="window-1") - - assert result == {"energy_kwh": 1.0} - assert captured["tasks"] == batch["tasks_sample"] - assert captured["fragments"] == batch["fragments_sample"] - assert captured["topology"] == orch.state["current_topology"] - assert captured["expName"] == "window-1" - - -def test_topo_hash_stable(orchestrator): - """Equivalent topologies should hash identically regardless of key order.""" - topo_a = {"clusters": [{"name": "A", "hosts": [{"name": "h", "count": 1}]}]} - topo_b = {"clusters": [{"hosts": [{"count": 1, "name": "h"}], "name": "A"}]} - assert orchestrator._topo_hash(topo_a) == orchestrator._topo_hash(topo_b) diff --git a/tests/simulation/test_adapters.py b/tests/simulation/test_adapters.py deleted file mode 100644 index 051cf83..0000000 --- a/tests/simulation/test_adapters.py +++ /dev/null @@ -1,77 +0,0 @@ -"""Unit coverage for the simulation data adapter utilities.""" - -from __future__ import annotations - -import math - -import pandas as pd - -from opendt.core.simulation import adapters - - -def test_ensure_workload_dir_creates_directory(tmp_path): - target = tmp_path / "nested" / "workload" - - path = adapters.ensure_workload_dir(str(target)) - - assert path == target - assert path.exists() - assert path.is_dir() - - -def test_tasks_to_table_applies_defaults(): - table = adapters.tasks_to_table( - [ - { - "id": 7, - "duration": 42, - "cpu_count": 4, - "cpu_capacity": 3.2, - # purposely omit 
submission_time and mem_capacity - } - ] - ) - - df = table.to_pandas() - - assert list(df.columns) == [ - "id", - "submission_time", - "duration", - "cpu_count", - "cpu_capacity", - "mem_capacity", - ] - assert df.loc[0, "id"] == 7 - assert df.loc[0, "duration"] == 42 - assert df.loc[0, "cpu_count"] == 4 - assert math.isclose(df.loc[0, "cpu_capacity"], 3.2) - # defaults to start of 2024 converted to milliseconds epoch - assert df.loc[0, "submission_time"] == pd.Timestamp("2024-01-01").value // 1_000_000 - assert df.loc[0, "mem_capacity"] == 1024 ** 3 - - -def test_fragments_to_table_populates_usage(): - table = adapters.fragments_to_table( - [ - { - "id": 3, - "duration": 10_000, - "cpu_usage": 0.75, - }, - { - "id": 4, - }, - ] - ) - - df = table.to_pandas() - - assert list(df.columns) == ["id", "duration", "cpu_count", "cpu_usage"] - assert df.loc[0, "id"] == 3 - assert df.loc[0, "duration"] == 10_000 - assert df.loc[0, "cpu_count"] == 1 - assert math.isclose(df.loc[0, "cpu_usage"], 0.75) - # Missing values should fall back to defaults - assert df.loc[1, "duration"] == 10_000 - assert math.isclose(df.loc[1, "cpu_usage"], 0.5) diff --git a/tests/simulation/test_runner.py b/tests/simulation/test_runner.py deleted file mode 100644 index 5f89781..0000000 --- a/tests/simulation/test_runner.py +++ /dev/null @@ -1,240 +0,0 @@ -"""Regression tests for the OpenDC runner workload generation utilities.""" - -from __future__ import annotations - -import os -from pathlib import Path -from typing import Any - -import pandas as pd -import pytest - -from opendt.core.simulation.runner import OpenDCRunner - - -def test_create_workload_writes_parquet(tmp_path, monkeypatch): - """Workload creation should persist tasks/fragments parquet datasets.""" - runner = OpenDCRunner() - monkeypatch.setenv("OPENDT_SIM_DIR", str(tmp_path)) - - tasks = [ - { - "id": 1, - "submission_time": "2024-01-01T00:00:00Z", - "duration": 1000, - "cpu_count": 2, - "cpu_capacity": 2.4, - "mem_capacity": 
1024, - } - ] - fragments = [{"id": 1, "duration": 500, "cpu_usage": 0.5}] - - workload_dir = runner.create_workload(tasks, fragments) - - tasks_file = Path(workload_dir) / "tasks.parquet" - frags_file = Path(workload_dir) / "fragments.parquet" - - assert tasks_file.exists() - assert frags_file.exists() - - tasks_df = pd.read_parquet(tasks_file) - frags_df = pd.read_parquet(frags_file) - - assert len(tasks_df) == 1 - assert len(frags_df) == 1 - - -def test_create_workload_handles_missing_payload(tmp_path, monkeypatch): - """The helper should create the workload directory even without payload data.""" - runner = OpenDCRunner() - monkeypatch.setenv("OPENDT_SIM_DIR", str(tmp_path)) - - target_dir = tmp_path / "empty" - - def _ensure_dir(): - target_dir.mkdir(parents=True, exist_ok=True) - return target_dir - - monkeypatch.setattr("opendt.core.simulation.runner.ensure_workload_dir", _ensure_dir) - - workload_dir = Path(runner.create_workload(None, None)) - - assert workload_dir.exists() - assert list(workload_dir.iterdir()) == [] - - -def test_run_simulation_without_runner_raises(): - """Running a simulation without the binary should raise a helpful exception.""" - runner = OpenDCRunner() - runner.opendc_path = None - - with pytest.raises(FileNotFoundError): - runner.run_simulation(tasks_data=None, fragments_data=None, topology_data={}) - - -def test_run_simulation_falls_back_to_shell_for_non_executable(tmp_path, monkeypatch): - """When the runner lacks execute permissions, we should invoke it through /bin/sh.""" - - runner = OpenDCRunner() - - non_exec_path = tmp_path / "OpenDCExperimentRunner" - non_exec_path.write_text("echo noop") - non_exec_path.chmod(0o644) - - runner.opendc_path = str(non_exec_path) - - commands: list[list[str]] = [] - - def fake_run(args, capture_output, text, timeout, env): # type: ignore[override] - commands.append(list(args)) - - class Result: - returncode = 0 - stdout = "noop" - stderr = "" - - return Result() - - 
monkeypatch.setattr("opendt.core.simulation.runner.subprocess.run", fake_run) - monkeypatch.setattr( - runner, - "parse_opendc_results", - lambda: {"status": "ok"}, - ) - - result = runner.run_simulation(tasks_data=None, fragments_data=None, topology_data={}) - - assert result == {"status": "ok"} - assert commands, "Expected subprocess.run to be invoked" - assert commands[0][0] == "/bin/sh" - assert Path(commands[0][1]) == non_exec_path - - -def test_run_simulation_surfaces_process_failures(tmp_path, monkeypatch): - """Non-zero subprocess exits should bubble up with stdout/stderr context.""" - - runner = OpenDCRunner() - - binary_path = tmp_path / "OpenDCExperimentRunner" - binary_path.write_text("#!/bin/sh\nexit 2") - binary_path.chmod(0o755) - runner.opendc_path = str(binary_path) - - def fake_run(*_args, **_kwargs): # type: ignore[override] - class Result: - returncode = 2 - stdout = "boom" - stderr = "stacktrace" - - return Result() - - monkeypatch.setattr("opendt.core.simulation.runner.subprocess.run", fake_run) - - with pytest.raises(RuntimeError) as exc: - runner.run_simulation(tasks_data=None, fragments_data=None, topology_data={}) - - assert "exit code 2" in str(exc.value) - assert "boom" in str(exc.value) - assert "stacktrace" in str(exc.value) - - -def test_run_simulation_invokes_runner_with_expected_arguments(tmp_path, monkeypatch): - """Successful subprocess launches should forward the topology and experiment path.""" - - runner = OpenDCRunner() - - binary_path = tmp_path / "OpenDCExperimentRunner" - binary_path.write_text("#!/bin/sh\nexit 0") - binary_path.chmod(0o755) - runner.opendc_path = str(binary_path) - - workload_dir = tmp_path / "workload" - workload_dir.mkdir() - - monkeypatch.setattr("opendt.core.simulation.runner.ensure_workload_dir", lambda: workload_dir) - - recorded: dict[str, Any] = {} - - def fake_run(args, capture_output, text, timeout, env): # type: ignore[override] - recorded["args"] = args - recorded["capture_output"] = 
capture_output - recorded["text"] = text - recorded["timeout"] = timeout - recorded["env"] = env - - class Result: - returncode = 0 - stdout = "all good" - stderr = "" - - return Result() - - monkeypatch.setattr("opendt.core.simulation.runner.subprocess.run", fake_run) - monkeypatch.setattr( - runner, - "parse_opendc_results", - lambda: {"status": "success", "energy_kwh": 0.0}, - ) - - outcome = runner.run_simulation( - tasks_data=[{"id": 1}], - fragments_data=[{"id": 1}], - topology_data={"nodes": []}, - expName="unit-test", - ) - - assert outcome == {"status": "success", "energy_kwh": 0.0} - assert recorded["args"][0] == runner.opendc_path - assert "--experiment-path" in recorded["args"] - assert recorded["capture_output"] is True - assert recorded["text"] is True - assert recorded["timeout"] == 120 - expected_java = os.environ.get("JAVA_HOME", "/usr/lib/jvm/java-21-openjdk-amd64") - assert recorded["env"].get("JAVA_HOME") == expected_java - - -def test_parse_results_aggregates_metrics(tmp_path, monkeypatch): - """Parsing should summarise energy, utilisation and runtime from parquet output.""" - - runner = OpenDCRunner() - monkeypatch.setenv("OPENDT_SIM_DIR", str(tmp_path)) - - power = pd.DataFrame( - { - "energy_usage": [3_600_000, 1_800_000], - "power_draw": [100.0, 150.5], - } - ) - host = pd.DataFrame({"cpu_utilization": [0.4, 0.6]}) - service = pd.DataFrame({"timestamp": [1_000, 9_001_000]}) - - power.to_parquet(tmp_path / "powerSource.parquet") - host.to_parquet(tmp_path / "host.parquet") - service.to_parquet(tmp_path / "service.parquet") - - result = runner.parse_opendc_results() - - assert result == { - "energy_kwh": pytest.approx(1.5, rel=1e-3), - "cpu_utilization": pytest.approx(0.5, rel=1e-3), - "max_power_draw": pytest.approx(150.5, rel=1e-3), - "runtime_hours": pytest.approx(2.5, rel=1e-3), - "status": "success", - } - - -def test_parse_results_handles_missing_files(tmp_path, monkeypatch): - """If no parquet outputs exist we still return a success 
payload with zeroes.""" - - runner = OpenDCRunner() - monkeypatch.setenv("OPENDT_SIM_DIR", str(tmp_path)) - - result = runner.parse_opendc_results() - - assert result == { - "energy_kwh": 0.0, - "cpu_utilization": 0.0, - "max_power_draw": 0.0, - "runtime_hours": 0.0, - "status": "success", - }