Skip to content
Closed

TLS #53

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 62 additions & 2 deletions cmd/server/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,13 @@ import (

"github.com/opensandbox/opensandbox/internal/api"
"github.com/opensandbox/opensandbox/internal/auth"
"github.com/opensandbox/opensandbox/internal/certmanager"
"github.com/opensandbox/opensandbox/internal/cloudflare"
"github.com/opensandbox/opensandbox/internal/compute"
"github.com/opensandbox/opensandbox/internal/config"
"github.com/opensandbox/opensandbox/internal/controlplane"
"github.com/opensandbox/opensandbox/internal/db"
"github.com/opensandbox/opensandbox/internal/dns"
"github.com/opensandbox/opensandbox/internal/ecr"
"github.com/opensandbox/opensandbox/internal/proxy"
"github.com/opensandbox/opensandbox/internal/sandbox"
Expand Down Expand Up @@ -139,6 +141,49 @@ func main() {
log.Println("opensandbox: Redis worker registry started")
}

// Initialize cert manager for wildcard TLS (if Route53 + ACME configured).
// MUST run before autoscaler — workers need the cert in S3 before they boot.
var dnsClient *dns.Route53Client
if cfg.Route53HostedZoneID != "" && cfg.ACMEEmail != "" && cfg.S3Bucket != "" {
certBucket := cfg.CertS3Bucket
if certBucket == "" {
certBucket = cfg.S3Bucket
}
cm, err := certmanager.NewCertManager(certmanager.Config{
Domain: cfg.SandboxDomain,
HostedZoneID: cfg.Route53HostedZoneID,
S3Bucket: certBucket,
S3Prefix: cfg.CertS3Prefix,
S3Region: cfg.S3Region,
AccessKeyID: cfg.S3AccessKeyID,
SecretAccessKey: cfg.S3SecretAccessKey,
ACMEEmail: cfg.ACMEEmail,
})
if err != nil {
log.Printf("opensandbox: cert manager init failed: %v (TLS wildcard disabled)", err)
} else {
if err := cm.ObtainOrRenew(ctx); err != nil {
log.Printf("opensandbox: initial cert obtain failed: %v (will retry)", err)
}
cm.StartRenewalLoop(ctx)
defer cm.Stop()
log.Printf("opensandbox: cert manager started (domain=*.%s, renewal every 12h)", cfg.SandboxDomain)
}

// Create Route53 client for DNS cleanup (reused below)
var dnsErr error
dnsClient, dnsErr = dns.NewRoute53Client(dns.Route53Config{
HostedZoneID: cfg.Route53HostedZoneID,
Region: cfg.S3Region,
AccessKeyID: cfg.S3AccessKeyID,
SecretAccessKey: cfg.S3SecretAccessKey,
})
if dnsErr != nil {
log.Printf("opensandbox: Route53 client for DNS cleanup failed: %v", dnsErr)
dnsClient = nil
}
}

// Initialize EC2 compute pool + autoscaler (server mode with AWS configured)
if cfg.Mode == "server" && cfg.EC2AMI != "" && redisRegistry != nil {
ec2Pool, err := compute.NewEC2Pool(compute.EC2PoolConfig{
Expand All @@ -150,8 +195,10 @@ func main() {
SubnetID: cfg.EC2SubnetID,
SecurityGroupID: cfg.EC2SecurityGroupID,
KeyName: cfg.EC2KeyName,
IAMInstanceProfile: cfg.EC2IAMInstanceProfile,
SecretsARN: cfg.SecretsARN,
IAMInstanceProfile: cfg.EC2IAMInstanceProfile,
SecretsARN: cfg.SecretsARN,
Route53HostedZoneID: cfg.Route53HostedZoneID,
SandboxDomain: cfg.SandboxDomain,
})
if err != nil {
log.Fatalf("opensandbox: failed to create EC2 pool: %v", err)
Expand All @@ -177,6 +224,19 @@ func main() {
log.Printf("opensandbox: control plane subdomain proxy configured (*.%s)", cfg.SandboxDomain)
}

// Start DNS cleaner to remove stale A records for dead workers.
// Handles crash/termination cases where the worker didn't run its cleanup script.
if dnsClient != nil && redisRegistry != nil && cfg.SandboxDomain != "" {
dnsCleaner := controlplane.NewDNSCleaner(controlplane.DNSCleanerConfig{
DNSClient: dnsClient,
Registry: redisRegistry,
Domain: cfg.SandboxDomain,
})
dnsCleaner.Start()
defer dnsCleaner.Stop()
log.Printf("opensandbox: DNS cleaner started (removes stale worker A records every 5m)")
}

// Initialize Cloudflare client for custom hostnames (if configured)
if cfg.CFAPIToken != "" && cfg.CFZoneID != "" {
opts.CFClient = cloudflare.NewClient(cfg.CFAPIToken, cfg.CFZoneID)
Expand Down
57 changes: 57 additions & 0 deletions cmd/worker/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"time"

"github.com/opensandbox/opensandbox/internal/auth"
"github.com/opensandbox/opensandbox/internal/certmanager"
"github.com/opensandbox/opensandbox/internal/config"
"github.com/opensandbox/opensandbox/internal/db"
fc "github.com/opensandbox/opensandbox/internal/firecracker"
Expand Down Expand Up @@ -372,6 +373,54 @@ func main() {
// Start HTTP server for direct SDK access
httpServer := worker.NewHTTPServer(mgr, ptyMgr, execMgr, jwtIssuer, sandboxDBMgr, sbProxy, sbRouter, cfg.SandboxDomain)
httpAddr := fmt.Sprintf(":%d", cfg.Port)

// Initialize cert fetcher for TLS (if S3 cert bucket is configured)
certBucket := cfg.CertS3Bucket
if certBucket == "" {
certBucket = cfg.S3Bucket // default to same bucket
}
var certFetcher *certmanager.CertFetcher
if certBucket != "" && cfg.CertS3Prefix != "" {
var fetchErr error
certFetcher, fetchErr = certmanager.NewCertFetcher(certmanager.FetcherConfig{
S3Bucket: certBucket,
S3Prefix: cfg.CertS3Prefix,
S3Region: cfg.S3Region,
AccessKeyID: cfg.S3AccessKeyID,
SecretAccessKey: cfg.S3SecretAccessKey,
LocalCertDir: filepath.Join(cfg.DataDir, "tls"),
})
if fetchErr != nil {
log.Printf("opensandbox-worker: cert fetcher init failed: %v (TLS disabled)", fetchErr)
} else {
if err := certFetcher.FetchAndStore(ctx); err != nil {
log.Printf("opensandbox-worker: initial cert fetch failed: %v (TLS disabled, will retry)", err)
certFetcher = nil
} else {
certFetcher.StartRefreshLoop(ctx)
log.Println("opensandbox-worker: TLS cert loaded from S3, refresh loop started")
}
}
}

// Wire cert fetcher into health endpoint
if certFetcher != nil {
httpServer.SetCertFetcher(certFetcher)
}

// Serve HTTPS on :443 if cert is available, always serve HTTP for VPC-internal traffic
var tlsServer *worker.HTTPServer
if certFetcher != nil {
tlsServer = worker.NewHTTPServer(mgr, ptyMgr, execMgr, jwtIssuer, sandboxDBMgr, sbProxy, sbRouter, cfg.SandboxDomain)
tlsServer.SetCertFetcher(certFetcher)
log.Println("opensandbox-worker: starting HTTPS server on :443 (Let's Encrypt wildcard)")
go func() {
if err := tlsServer.StartTLSWithCert(":443", certFetcher.GetCertificate); err != nil {
log.Printf("HTTPS server error: %v", err)
}
}()
}

log.Printf("opensandbox-worker: starting HTTP server on %s", httpAddr)
go func() {
if err := httpServer.Start(httpAddr); err != nil {
Expand Down Expand Up @@ -433,6 +482,14 @@ func main() {

// 1. Stop accepting new work
grpcServer.Stop()
if tlsServer != nil {
if err := tlsServer.Close(); err != nil {
log.Printf("error closing HTTPS server: %v", err)
}
}
if certFetcher != nil {
certFetcher.Stop()
}
if err := httpServer.Close(); err != nil {
log.Printf("error closing HTTP server: %v", err)
}
Expand Down
141 changes: 103 additions & 38 deletions deploy/ec2/setup-instance.sh
Original file line number Diff line number Diff line change
Expand Up @@ -64,41 +64,15 @@ sudo apt-get install -y podman uidmap slirp4netns
# -------------------------------------------------------------------
# System tools for ext4 image creation
# -------------------------------------------------------------------
echo "==> Installing ext4 and rootfs tools..."
sudo apt-get install -y e2fsprogs
echo "==> Installing ext4, rootfs, and DNS tools..."
sudo apt-get install -y e2fsprogs dnsutils

# -------------------------------------------------------------------
# Redis
# -------------------------------------------------------------------
echo "==> Installing Redis..."
sudo apt-get install -y redis-server

# -------------------------------------------------------------------
# Caddy (custom build with Route53 DNS module for wildcard certs)
# -------------------------------------------------------------------
echo "==> Installing Go (needed for xcaddy)..."
GO_VERSION="1.23.6"
curl -sL "https://go.dev/dl/go${GO_VERSION}.linux-${GOARCH}.tar.gz" | sudo tar -C /usr/local -xzf -
export PATH=$PATH:/usr/local/go/bin:$HOME/go/bin

echo "==> Building Caddy with Route53 DNS module..."
go install github.com/caddyserver/xcaddy/cmd/xcaddy@latest
xcaddy build --with github.com/caddy-dns/route53 --output /tmp/caddy-custom
sudo mv /tmp/caddy-custom /usr/local/bin/caddy
sudo chmod +x /usr/local/bin/caddy

echo "==> Verifying Caddy has Route53 module..."
caddy list-modules | grep route53 || { echo "ERROR: Caddy missing route53 module"; exit 1; }

echo "==> Installing Caddy config..."
sudo mkdir -p /etc/caddy
sudo cp /tmp/deploy-ec2/Caddyfile /etc/caddy/Caddyfile 2>/dev/null || \
echo " NOTE: Copy deploy/ec2/Caddyfile to /etc/caddy/Caddyfile manually"

echo "==> Installing Caddy systemd unit..."
sudo cp /tmp/deploy-ec2/caddy.service /etc/systemd/system/caddy.service 2>/dev/null || \
echo " NOTE: Copy deploy/ec2/caddy.service to /etc/systemd/system/ manually"

# -------------------------------------------------------------------
# NVMe instance storage (XFS with reflink for instant rootfs copies)
# -------------------------------------------------------------------
Expand Down Expand Up @@ -220,37 +194,130 @@ echo "==> Installing identity service..."
sudo tee /usr/local/bin/opensandbox-worker-identity.sh > /dev/null << 'IDENT'
#!/usr/bin/env bash
# Worker identity bootstrap: derives a stable worker ID from EC2 instance
# metadata, registers a Route53 A record for direct-DNS TLS, and writes the
# env files consumed by opensandbox-worker.service and the shutdown cleanup.
set -euo pipefail

ROUTE53_HOSTED_ZONE_ID="${OPENSANDBOX_ROUTE53_HOSTED_ZONE_ID:-}"
WORKER_DOMAIN="${OPENSANDBOX_SANDBOX_DOMAIN:-workers.opencomputer.dev}"

# Query EC2 instance metadata (IMDSv2)
IMDS_TOKEN=$(curl -s -X PUT "http://169.254.169.254/latest/api/token" \
  -H "X-aws-ec2-metadata-token-ttl-seconds: 300")
INSTANCE_ID=$(curl -s -H "X-aws-ec2-metadata-token: $IMDS_TOKEN" \
  http://169.254.169.254/latest/meta-data/instance-id)
PRIVATE_IP=$(curl -s -H "X-aws-ec2-metadata-token: $IMDS_TOKEN" \
  http://169.254.169.254/latest/meta-data/local-ipv4)
# BUGFIX: use -f so an HTTP 404 (instance has no public IP) fails the curl
# and we fall through to "" — without it, curl exits 0 and PUBLIC_IP would
# capture the 404 error body as if it were an address.
PUBLIC_IP=$(curl -sf -H "X-aws-ec2-metadata-token: $IMDS_TOKEN" \
  http://169.254.169.254/latest/meta-data/public-ipv4 || echo "")

SHORT_ID=$(echo "$INSTANCE_ID" | sed 's/^i-//' | cut -c1-8)
WORKER_ID="w-use2-${SHORT_ID}"
WORKER_HOSTNAME="${WORKER_ID}.${WORKER_DOMAIN}"
IP="${PUBLIC_IP:-$PRIVATE_IP}"

mkdir -p /etc/opensandbox

# Default address: raw-IP HTTP; upgraded to https://hostname only after a
# successful Route53 registration.
HTTP_ADDR="http://${IP}:8080"

# Register A record in Route53 (direct DNS, no proxy).
# BUGFIX: use if/else rather than `cmd && { … } || { … }` — with the latter,
# any failing command inside the success block would also trigger the
# failure block and silently clobber HTTP_ADDR back to the raw IP.
if [ -n "$ROUTE53_HOSTED_ZONE_ID" ] && [ -n "$IP" ]; then
  echo "opensandbox-identity: registering DNS ${WORKER_HOSTNAME} -> ${IP} (Route53)"

  if aws route53 change-resource-record-sets \
    --hosted-zone-id "$ROUTE53_HOSTED_ZONE_ID" \
    --change-batch "{
      \"Changes\": [{
        \"Action\": \"UPSERT\",
        \"ResourceRecordSet\": {
          \"Name\": \"${WORKER_HOSTNAME}\",
          \"Type\": \"A\",
          \"TTL\": 60,
          \"ResourceRecords\": [{\"Value\": \"${IP}\"}]
        }
      }]
    }" > /dev/null 2>&1; then
    echo "opensandbox-identity: DNS registered"
    HTTP_ADDR="https://${WORKER_HOSTNAME}"

    # Verify DNS resolves before proceeding (prevents routing to unresolvable hostname)
    echo "opensandbox-identity: verifying DNS resolution for ${WORKER_HOSTNAME}..."
    for i in $(seq 1 30); do
      RESOLVED=$(dig +short "${WORKER_HOSTNAME}" 2>/dev/null || true)
      if [ -n "$RESOLVED" ]; then
        echo "opensandbox-identity: DNS verified (${WORKER_HOSTNAME} -> ${RESOLVED})"
        break
      fi
      if [ "$i" -eq 30 ]; then
        echo "opensandbox-identity: WARNING: DNS not resolving after 60s, proceeding anyway"
        break  # BUGFIX: don't sleep another 2s after the final attempt
      fi
      sleep 2
    done
  else
    echo "opensandbox-identity: WARNING: DNS registration failed, using raw IP"
  fi

  # Save state for cleanup on shutdown (written even if registration failed,
  # matching prior behavior — the DELETE is a no-op guarded by || in cleanup)
  cat > /etc/opensandbox/worker-dns.env << EOF
ROUTE53_HOSTED_ZONE_ID=${ROUTE53_HOSTED_ZONE_ID}
WORKER_HOSTNAME=${WORKER_HOSTNAME}
WORKER_IP=${IP}
EOF
else
  echo "opensandbox-identity: no Route53 config, using raw IP (no TLS)"
  cat > /etc/opensandbox/worker-dns.env << EOF
ROUTE53_HOSTED_ZONE_ID=
WORKER_HOSTNAME=${WORKER_HOSTNAME}
WORKER_IP=${IP}
EOF
fi

cat > /etc/opensandbox/worker-identity.env << EOF
OPENSANDBOX_WORKER_ID=${WORKER_ID}
OPENSANDBOX_HTTP_ADDR=${HTTP_ADDR}
OPENSANDBOX_GRPC_ADVERTISE=${PRIVATE_IP}:9090
EOF

echo "opensandbox-identity: ${WORKER_ID} private=${PRIVATE_IP} public=${PUBLIC_IP:-none} addr=${HTTP_ADDR}"
IDENT
sudo chmod +x /usr/local/bin/opensandbox-worker-identity.sh

# DNS cleanup script — deletes the Route53 A record on shutdown
sudo tee /usr/local/bin/opensandbox-worker-dns-cleanup.sh > /dev/null << 'CLEANUP'
#!/usr/bin/env bash
# Removes this worker's Route53 A record at shutdown, using the state saved
# by opensandbox-worker-identity.sh at boot. Best-effort: never blocks or
# fails the shutdown path.
set -euo pipefail

# No state file => identity never ran; nothing to clean up.
source /etc/opensandbox/worker-dns.env 2>/dev/null || exit 0

# BUGFIX: use ${VAR:-} defaults — under `set -u`, a state file missing any
# of these keys would abort with an unbound-variable error before the guard
# could exit 0.
[ -z "${ROUTE53_HOSTED_ZONE_ID:-}" ] && exit 0
[ -z "${WORKER_HOSTNAME:-}" ] && exit 0
[ -z "${WORKER_IP:-}" ] && exit 0

echo "opensandbox-dns-cleanup: removing ${WORKER_HOSTNAME}"
# Route53 DELETE must match the existing record exactly (name, type, TTL,
# value) — these mirror the UPSERT made by the identity script.
aws route53 change-resource-record-sets \
  --hosted-zone-id "$ROUTE53_HOSTED_ZONE_ID" \
  --change-batch "{
    \"Changes\": [{
      \"Action\": \"DELETE\",
      \"ResourceRecordSet\": {
        \"Name\": \"${WORKER_HOSTNAME}\",
        \"Type\": \"A\",
        \"TTL\": 60,
        \"ResourceRecords\": [{\"Value\": \"${WORKER_IP}\"}]
      }
    }]
  }" > /dev/null 2>&1 || echo "opensandbox-dns-cleanup: WARNING: failed to remove DNS record"
CLEANUP
sudo chmod +x /usr/local/bin/opensandbox-worker-dns-cleanup.sh

# Systemd unit: runs the identity/DNS-registration script once at boot and
# the DNS cleanup script on shutdown. (Both Description lines below are
# preserved from the diff view — old and new variants.)
sudo tee /etc/systemd/system/opensandbox-identity.service > /dev/null << 'SVC'
[Unit]
Description=OpenSandbox Worker Identity (from EC2 IMDS)
Description=OpenSandbox Worker Identity + Route53 DNS
# Ordering: wait for networking (IMDS + Route53 need it) and run before the
# worker so /etc/opensandbox/worker-identity.env exists when it starts.
After=network-online.target
Wants=network-online.target
Before=opensandbox-worker.service

[Service]
# oneshot + RemainAfterExit keeps the unit "active" after ExecStart, so
# ExecStop fires at shutdown and removes the worker's A record.
Type=oneshot
RemainAfterExit=yes
# Leading '-' makes a missing route53.env non-fatal (Route53 is optional).
EnvironmentFile=-/etc/opensandbox/route53.env
ExecStart=/usr/local/bin/opensandbox-worker-identity.sh
ExecStop=/usr/local/bin/opensandbox-worker-dns-cleanup.sh

[Install]
WantedBy=multi-user.target
Expand Down Expand Up @@ -305,14 +372,12 @@ sudo systemctl daemon-reload
sudo systemctl enable opensandbox-nvme
sudo systemctl enable opensandbox-identity
sudo systemctl enable opensandbox-worker
sudo systemctl enable caddy 2>/dev/null || true

# -------------------------------------------------------------------
# Cleanup
# -------------------------------------------------------------------
echo "==> Cleaning up build tools..."
echo "==> Cleaning up..."
sudo apt-get clean
sudo rm -rf /usr/local/go $HOME/go

echo ""
echo "============================================"
Expand Down
Loading
Loading