diff --git a/.github/scripts/setup-arm64-runner.sh b/.github/scripts/setup-arm64-runner.sh new file mode 100755 index 0000000000..30e0ff4775 --- /dev/null +++ b/.github/scripts/setup-arm64-runner.sh @@ -0,0 +1,104 @@ +#!/usr/bin/env bash +# +# Setup script for an ARM64 self-hosted GitHub Actions runner. +# Run this on a fresh ARM64 Ubuntu 22.04/24.04 machine with KVM support. +# +# Prerequisites: +# - ARM64 Linux host (Graviton, Ampere, etc.) +# - KVM enabled (/dev/kvm accessible) +# - At least 8GB RAM (for hugepage allocation) +# - Root access +# +# Usage: +# sudo ./setup-arm64-runner.sh +# +# After running this script, register the machine as a GitHub Actions +# self-hosted runner with the label: infra-tests-arm64 +# https://github.com/e2b-dev/infra/settings/actions/runners/new + +set -euo pipefail + +PS4='[\D{%Y-%m-%d %H:%M:%S}] ' +set -x + +if [ "$(id -u)" -ne 0 ]; then + echo "ERROR: This script must be run as root" >&2 + exit 1 +fi + +ARCH=$(dpkg --print-architecture) +if [ "$ARCH" != "arm64" ]; then + echo "ERROR: This script is for ARM64 hosts (detected: $ARCH)" >&2 + exit 1 +fi + +echo "=== Setting up ARM64 GitHub Actions runner ===" + +# KVM check +if [ ! -e /dev/kvm ]; then + echo "ERROR: /dev/kvm not found. KVM support is required." 
>&2 + exit 1 +fi + +# Install base dependencies +apt-get update +apt-get install -y --no-install-recommends \ + build-essential \ + curl \ + git \ + jq \ + nbd-client \ + nbd-server + +# Enable unprivileged userfaultfd +echo 1 > /proc/sys/vm/unprivileged_userfaultfd + +# Hugepages +mkdir -p /mnt/hugepages +mount -t hugetlbfs none /mnt/hugepages 2>/dev/null || true +echo 2000 > /proc/sys/vm/nr_hugepages + +grep -qF 'hugetlbfs /mnt/hugepages' /etc/fstab || \ + echo "hugetlbfs /mnt/hugepages hugetlbfs defaults 0 0" >> /etc/fstab + +# Sysctl — write once (idempotent) +cat <<'EOF' > /etc/sysctl.d/99-e2b.conf +vm.unprivileged_userfaultfd=1 +vm.nr_hugepages=2000 +net.core.somaxconn=65535 +net.core.netdev_max_backlog=65535 +net.ipv4.tcp_max_syn_backlog=65535 +vm.max_map_count=1048576 +EOF +sysctl --system + +# NBD +modprobe nbd nbds_max=256 +echo "nbd" > /etc/modules-load.d/e2b.conf +echo "options nbd nbds_max=256" > /etc/modprobe.d/e2b-nbd.conf + +# Disable inotify for NBD devices +cat <<'EOF' > /etc/udev/rules.d/97-nbd-device.rules +ACTION=="add|change", KERNEL=="nbd*", OPTIONS:="nowatch" +EOF +udevadm control --reload-rules +udevadm trigger + +# File descriptor limits +cat <<'EOF' > /etc/security/limits.d/99-e2b.conf +* soft nofile 1048576 +* hard nofile 1048576 +EOF + +echo "" +echo "=== ARM64 runner setup complete ===" +echo "" +echo "Verify:" +echo " uname -m → aarch64" +echo " ls /dev/kvm → exists" +echo " cat /proc/meminfo | grep HugePages_Total" +echo " lsmod | grep nbd" +echo "" +echo "Next: register this machine as a GitHub Actions self-hosted runner" +echo " Label: infra-tests-arm64" +echo " https://github.com/e2b-dev/infra/settings/actions/runners/new" diff --git a/.github/workflows/pr-tests-arm64.yml b/.github/workflows/pr-tests-arm64.yml new file mode 100644 index 0000000000..90988079c8 --- /dev/null +++ b/.github/workflows/pr-tests-arm64.yml @@ -0,0 +1,115 @@ +name: ARM64 tests on PRs + +on: [workflow_call] + +permissions: + contents: read + +jobs: + 
cross-compile: + name: Cross-compile all packages for ARM64 + runs-on: ubuntu-24.04 + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Setup Go + uses: ./.github/actions/go-setup-cache + + - name: Install ARM64 cross-compiler + run: sudo apt-get update && sudo apt-get install -y gcc-aarch64-linux-gnu + + - name: Build and vet packages (pure Go) + run: | + for pkg in api client-proxy envd shared db docker-reverse-proxy; do + echo "::group::packages/$pkg" + pushd "packages/$pkg" > /dev/null + GOARCH=arm64 go build ./... + GOARCH=arm64 go vet ./... + popd > /dev/null + echo "::endgroup::" + done + + - name: Build and vet orchestrator (CGO) + run: | + CGO_ENABLED=1 CC=aarch64-linux-gnu-gcc GOARCH=arm64 go build ./... + CGO_ENABLED=1 CC=aarch64-linux-gnu-gcc GOARCH=arm64 go vet ./... + working-directory: packages/orchestrator + + arm64-unit-tests: + name: ARM64 tests for ${{ matrix.package }} + runs-on: ubuntu-24.04-arm + timeout-minutes: 30 + strategy: + matrix: + include: + - package: packages/api + test_path: ./... + sudo: false + - package: packages/client-proxy + test_path: ./... + sudo: false + - package: packages/db + test_path: ./... + sudo: false + - package: packages/docker-reverse-proxy + test_path: ./... + sudo: false + - package: packages/envd + test_path: ./... + sudo: true + - package: packages/orchestrator + test_path: ./... + sudo: true + - package: packages/shared + test_path: ./pkg/... 
+ sudo: false + fail-fast: false + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Setup Go + uses: ./.github/actions/go-setup-cache + with: + cache-dependency-paths: | + go.work + ${{ matrix.package }}/go.mod + ${{ matrix.package }}/go.sum + + - name: Setup envd tests + run: | + sudo apt-get update && sudo apt-get install -y bindfs + if: matrix.package == 'packages/envd' + + - name: Setup orchestrator tests + run: | + # Enable unprivileged uffd (Ubuntu defaults to 0) + echo 1 | sudo tee /proc/sys/vm/unprivileged_userfaultfd + + # Enable hugepages (256 × 2MB = 512MB). + # Tests that need more hugepages than available will skip gracefully. + sudo mkdir -p /mnt/hugepages + sudo mount -t hugetlbfs none /mnt/hugepages + echo 256 | sudo tee /proc/sys/vm/nr_hugepages + + # Install extra kernel modules (nbd is not in base modules on GitHub-hosted runners) + sudo apt-get update + sudo apt-get install -y linux-modules-extra-$(uname -r) + sudo modprobe nbd nbds_max=256 + + # Disable inotify watching of change events for NBD devices + echo 'ACTION=="add|change", KERNEL=="nbd*", OPTIONS:="nowatch"' | sudo tee /etc/udev/rules.d/97-nbd-device.rules + sudo udevadm control --reload-rules + sudo udevadm trigger + if: matrix.package == 'packages/orchestrator' + + - name: Run tests that require sudo + working-directory: ${{ matrix.package }} + run: sudo -E `which go` test -race -v ${{ matrix.test_path }} + if: matrix.sudo == true + + - name: Run tests + working-directory: ${{ matrix.package }} + run: go test -race -v ${{ matrix.test_path }} + if: matrix.sudo == false diff --git a/.github/workflows/pull-request.yml b/.github/workflows/pull-request.yml index a3af82a46a..62bd96248c 100644 --- a/.github/workflows/pull-request.yml +++ b/.github/workflows/pull-request.yml @@ -28,6 +28,8 @@ jobs: uses: ./.github/workflows/out-of-order-migrations.yml unit-tests: uses: ./.github/workflows/pr-tests.yml + arm64-tests: + uses: ./.github/workflows/pr-tests-arm64.yml 
integration-tests: needs: [out-of-order-migrations] uses: ./.github/workflows/integration_tests.yml diff --git a/packages/api/Makefile b/packages/api/Makefile index 5c580ee5cd..0ba964f404 100644 --- a/packages/api/Makefile +++ b/packages/api/Makefile @@ -6,6 +6,13 @@ PREFIX := $(strip $(subst ",,$(PREFIX))) HOSTNAME := $(shell hostname 2> /dev/null || hostnamectl hostname 2> /dev/null) $(if $(HOSTNAME),,$(error Failed to determine hostname: both 'hostname' and 'hostnamectl' failed)) +# Architecture for builds. Defaults to local arch; override for cross-compilation +# (e.g., BUILD_ARCH=amd64 make build-and-upload from an ARM64 host). +BUILD_ARCH ?= $(shell go env GOARCH) +# Docker platform string. Override for multi-arch builds: +# BUILD_PLATFORM=linux/amd64,linux/arm64 make build-and-upload +BUILD_PLATFORM ?= linux/$(BUILD_ARCH) + expectedMigration := $(shell ./../../scripts/get-latest-migration.sh) ifeq ($(PROVIDER),aws) diff --git a/packages/client-proxy/Makefile b/packages/client-proxy/Makefile index 383ea0fc9f..36e0e3d4b1 100644 --- a/packages/client-proxy/Makefile +++ b/packages/client-proxy/Makefile @@ -6,6 +6,13 @@ PREFIX := $(strip $(subst ",,$(PREFIX))) HOSTNAME := $(shell hostname 2> /dev/null || hostnamectl hostname 2> /dev/null) $(if $(HOSTNAME),,$(error Failed to determine hostname: both 'hostname' and 'hostnamectl' failed)) +# Architecture for builds. Defaults to local arch; override for cross-compilation +# (e.g., BUILD_ARCH=amd64 make build-and-upload from an ARM64 host). +BUILD_ARCH ?= $(shell go env GOARCH) +# Docker platform string. 
Override for multi-arch builds: +# BUILD_PLATFORM=linux/amd64,linux/arm64 make build-and-upload +BUILD_PLATFORM ?= linux/$(BUILD_ARCH) + ifeq ($(PROVIDER),aws) IMAGE_REGISTRY := $(AWS_ACCOUNT_ID).dkr.ecr.$(AWS_REGION).amazonaws.com/$(PREFIX)core/client-proxy else @@ -16,7 +23,7 @@ endif build: # Allow for passing commit sha directly for docker builds $(eval COMMIT_SHA ?= $(shell git rev-parse --short HEAD)) - CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -o bin/client-proxy -ldflags "-X=main.commitSHA=$(COMMIT_SHA)" . + CGO_ENABLED=0 GOOS=linux GOARCH=$(BUILD_ARCH) go build -o bin/client-proxy -ldflags "-X=main.commitSHA=$(COMMIT_SHA)" . .PHONY: build-debug build-debug: @@ -26,7 +33,7 @@ build-debug: .PHONY: build-and-upload build-and-upload: $(eval COMMIT_SHA := $(shell git rev-parse --short HEAD)) - @docker buildx build --platform linux/amd64 --tag $(IMAGE_REGISTRY) --push --build-arg COMMIT_SHA="$(COMMIT_SHA)" -f ./Dockerfile .. + @docker buildx build --platform $(BUILD_PLATFORM) --tag $(IMAGE_REGISTRY) --push --build-arg COMMIT_SHA="$(COMMIT_SHA)" -f ./Dockerfile .. .PHONY: run run: diff --git a/packages/db/pkg/testutils/db.go b/packages/db/pkg/testutils/db.go index ecdf856488..edde569887 100644 --- a/packages/db/pkg/testutils/db.go +++ b/packages/db/pkg/testutils/db.go @@ -5,6 +5,7 @@ import ( "os/exec" "path/filepath" "strings" + "sync" "testing" "time" @@ -106,6 +107,12 @@ func SetupDatabase(t *testing.T) *Database { } } +// gooseMu serializes goose operations across parallel tests. +// goose.OpenDBWithDriver calls goose.SetDialect which writes to package-level +// globals (dialect, store) without synchronization. Concurrent test goroutines +// race on these globals, triggering the race detector on ARM64. 
+var gooseMu sync.Mutex + // runDatabaseMigrations executes all required database migrations func runDatabaseMigrations(t *testing.T, connStr string) { t.Helper() @@ -115,6 +122,9 @@ func runDatabaseMigrations(t *testing.T, connStr string) { require.NoError(t, err, "Failed to find git root") repoRoot := strings.TrimSpace(string(output)) + gooseMu.Lock() + defer gooseMu.Unlock() + db, err := goose.OpenDBWithDriver("pgx", connStr) require.NoError(t, err) t.Cleanup(func() { diff --git a/packages/envd/Makefile b/packages/envd/Makefile index f073d37ae7..54396a564b 100644 --- a/packages/envd/Makefile +++ b/packages/envd/Makefile @@ -7,6 +7,13 @@ LDFLAGS=-ldflags "-X=main.commitSHA=$(BUILD)" AWS_BUCKET_PREFIX ?= $(PREFIX)$(AWS_ACCOUNT_ID)- GCP_BUCKET_PREFIX ?= $(GCP_PROJECT_ID)- +# Architecture for builds. Defaults to local arch; override for cross-compilation +# (e.g., BUILD_ARCH=amd64 make build from an ARM64 host). +BUILD_ARCH ?= $(shell go env GOARCH) +# Docker platform string. Override for multi-arch builds: +# BUILD_PLATFORM=linux/amd64,linux/arm64 make start-docker +BUILD_PLATFORM ?= linux/$(BUILD_ARCH) + .PHONY: init init: brew install protobuf @@ -20,17 +27,17 @@ else endif build: - CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -a -o bin/envd ${LDFLAGS} + CGO_ENABLED=0 GOOS=linux GOARCH=$(BUILD_ARCH) go build -a -o bin/envd ${LDFLAGS} build-debug: CGO_ENABLED=1 go build -race -gcflags=all="-N -l" -o bin/debug/envd ${LDFLAGS} start-docker: make build - DOCKER_BUILDKIT=1 docker build --platform linux/amd64 -t envd-debug . -f debug.Dockerfile + DOCKER_BUILDKIT=1 docker build --platform $(BUILD_PLATFORM) -t envd-debug . 
-f debug.Dockerfile docker run \ --name envd \ - --platform linux/amd64 \ + --platform $(BUILD_PLATFORM) \ -p 49983:49983 \ -p 2345:2345 \ -p 9999:9999 \ diff --git a/packages/envd/internal/services/legacy/conversion_test.go b/packages/envd/internal/services/legacy/conversion_test.go index 43f5fe92a4..14acf51486 100644 --- a/packages/envd/internal/services/legacy/conversion_test.go +++ b/packages/envd/internal/services/legacy/conversion_test.go @@ -2,6 +2,7 @@ package legacy import ( "bytes" + "context" "io" "net/http" "net/http/httptest" @@ -22,12 +23,19 @@ import ( func TestFilesystemClient_FieldFormatter(t *testing.T) { t.Parallel() fsh := filesystemconnectmocks.NewMockFilesystemHandler(t) - fsh.EXPECT().Move(mock.Anything, mock.Anything).Return(connect.NewResponse(&filesystem.MoveResponse{ - Entry: &filesystem.EntryInfo{ - Name: "test-name", - Owner: "new-extra-field", + // Use RunAndReturn to create a fresh response per call. Using Return() + // shares one Response across parallel subtests, causing a data race on + // the lazily-initialized header/trailer maps inside connect.Response. + fsh.EXPECT().Move(mock.Anything, mock.Anything).RunAndReturn( + func(_ context.Context, _ *connect.Request[filesystem.MoveRequest]) (*connect.Response[filesystem.MoveResponse], error) { + return connect.NewResponse(&filesystem.MoveResponse{ + Entry: &filesystem.EntryInfo{ + Name: "test-name", + Owner: "new-extra-field", + }, + }), nil }, - }), nil) + ) _, handler := filesystemconnect.NewFilesystemHandler(fsh, connect.WithInterceptors( diff --git a/packages/orchestrator/Makefile b/packages/orchestrator/Makefile index 94da7d669b..ab0531c2d6 100644 --- a/packages/orchestrator/Makefile +++ b/packages/orchestrator/Makefile @@ -7,6 +7,13 @@ GCP_BUCKET_PREFIX ?= $(GCP_PROJECT_ID)- HOSTNAME := $(shell hostname 2> /dev/null || hostnamectl hostname 2> /dev/null) $(if $(HOSTNAME),,$(error Failed to determine hostname: both 'hostname' and 'hostnamectl' failed)) +# Architecture for builds. 
Defaults to local arch; override for cross-compilation +# (e.g., BUILD_ARCH=amd64 make build from an ARM64 host). +BUILD_ARCH ?= $(shell go env GOARCH) +# Docker platform string. Override for multi-arch builds: +# BUILD_PLATFORM=linux/amd64,linux/arm64 make build +BUILD_PLATFORM ?= linux/$(BUILD_ARCH) + .PHONY: init init: brew install protobuf @@ -18,18 +25,18 @@ generate: .PHONY: build build: $(eval COMMIT_SHA := $(shell git rev-parse --short HEAD)) - @docker build --platform linux/amd64 --output=bin --build-arg COMMIT_SHA="$(COMMIT_SHA)" -f ./Dockerfile .. + @docker build --platform $(BUILD_PLATFORM) --output=bin --build-arg COMMIT_SHA="$(COMMIT_SHA)" -f ./Dockerfile .. .PHONY: build-local -build-local: +build-local: fetch-busybox # Allow for passing commit sha directly for docker builds $(eval COMMIT_SHA ?= $(shell git rev-parse --short HEAD)) - CGO_ENABLED=1 GOOS=linux GOARCH=amd64 go build -o bin/orchestrator -ldflags "-X=main.commitSHA=$(COMMIT_SHA)" . - CGO_ENABLED=1 GOOS=linux GOARCH=amd64 go build -o bin/clean-nfs-cache -ldflags "-X=main.commitSHA=$(COMMIT_SHA)" ./cmd/clean-nfs-cache + CGO_ENABLED=1 GOOS=linux GOARCH=$(BUILD_ARCH) go build -o bin/orchestrator -ldflags "-X=main.commitSHA=$(COMMIT_SHA)" . + CGO_ENABLED=1 GOOS=linux GOARCH=$(BUILD_ARCH) go build -o bin/clean-nfs-cache -ldflags "-X=main.commitSHA=$(COMMIT_SHA)" ./cmd/clean-nfs-cache .PHONY: build-debug -build-debug: - CGO_ENABLED=1 GOOS=linux GOARCH=amd64 go build -race -gcflags=all="-N -l" -o bin/orchestrator . +build-debug: fetch-busybox + CGO_ENABLED=1 GOOS=linux GOARCH=$(BUILD_ARCH) go build -race -gcflags=all="-N -l" -o bin/orchestrator . .PHONY: run-debug run-debug: @@ -105,7 +112,7 @@ test-docker: @cp -r ../shared .shared/ @rm -rf .clickhouse/ @cp -r ../clickhouse .clickhouse/ - @docker build --platform linux/amd64 -f test.Dockerfile --no-cache-filter runner --progress=plain -t orchestrator-test . 
+ @docker build --platform $(BUILD_PLATFORM) -f test.Dockerfile --no-cache-filter runner --progress=plain -t orchestrator-test . @rm -rf .shared/ @rm -rf .clickhouse/ @echo "Done" @@ -125,6 +132,34 @@ build-template: -kernel $(KERNEL_VERSION) \ -firecracker $(FIRECRACKER_VERSION) +.PHONY: fetch-busybox +fetch-busybox: + @ARCH=$(BUILD_ARCH); \ + BUSYBOX_TARGET=./pkg/template/build/core/systeminit/busybox_1.36.1-2; \ + if [ "$$ARCH" != "arm64" ]; then \ + echo "✓ Using bundled amd64 busybox"; \ + elif file "$$BUSYBOX_TARGET" 2>/dev/null | grep -q 'aarch64\|ARM aarch64'; then \ + echo "✓ Busybox is already arm64"; \ + elif command -v busybox >/dev/null 2>&1 && file "$$(command -v busybox)" 2>/dev/null | grep -q 'aarch64\|ARM aarch64' && file "$$(command -v busybox)" 2>/dev/null | grep -q 'statically linked'; then \ + cp "$$(command -v busybox)" "$$BUSYBOX_TARGET" && \ + echo "✓ Copied host busybox (arm64, static) to embedded path"; \ + elif command -v apt-get >/dev/null 2>&1 && command -v dpkg-deb >/dev/null 2>&1; then \ + echo "Fetching arm64 busybox via apt..."; \ + TMPDIR=$$(mktemp -d); \ + apt-get download busybox-static 2>/dev/null && \ + dpkg-deb -x busybox-static_*.deb "$$TMPDIR" && \ + cp "$$TMPDIR/bin/busybox" "$$BUSYBOX_TARGET" && \ + rm -rf "$$TMPDIR" busybox-static_*.deb && \ + echo "✓ Replaced embedded busybox with arm64 binary (from busybox-static package)" || \ + { rm -rf "$$TMPDIR" busybox-static_*.deb; echo "⚠ apt-get download failed"; exit 1; }; \ + else \ + echo "⚠ ARM64 busybox required but no method available to fetch it."; \ + echo " Options:"; \ + echo " 1. Install busybox-static: apt install busybox-static, then re-run"; \ + echo " 2. 
Manually place an arm64 busybox binary at: $$BUSYBOX_TARGET"; \ + exit 1; \ + fi + .PHONY: migrate migrate: ./scripts/upload-envs.sh /mnt/disks/fc-envs/v1 $(TEMPLATE_BUCKET_NAME) diff --git a/packages/orchestrator/README.md b/packages/orchestrator/README.md index 725a2f4b86..c18c44c899 100644 --- a/packages/orchestrator/README.md +++ b/packages/orchestrator/README.md @@ -219,10 +219,67 @@ Flags: --- +## Architecture (ARM64) Support + +The orchestrator supports both `amd64` (x86_64) and `arm64` (aarch64) architectures. Architecture is detected automatically via `runtime.GOARCH` at compile time. + +### Architecture naming convention + +This project uses **Go/Docker/Debian naming** (`amd64`/`arm64`) for architecture directories in binary paths and GCS buckets: + +| Convention | x86_64 name | ARM64 name | Used by | +|------------|-------------|------------|---------| +| **Go/Docker/Debian** | `amd64` | `arm64` | This repo, Docker, `dpkg --print-architecture` | +| Linux/GNU | `x86_64` | `aarch64` | `uname -m`, kernel Makefiles | + +Binary paths follow the `{version}/{arch}/` layout: + +``` +# Firecracker (GCS bucket or FIRECRACKER_VERSIONS_DIR) +fc-versions/v1.12.1_717921c/amd64/firecracker +fc-versions/v1.12.1_717921c/arm64/firecracker + +# Kernels (GCS bucket or HOST_KERNELS_DIR) +kernels/vmlinux-6.1.102/amd64/vmlinux.bin +kernels/vmlinux-6.1.102/arm64/vmlinux.bin +``` + +> **Note:** The [fc-kernels](https://github.com/e2b-dev/fc-kernels) repo currently uses `x86_64` instead of `amd64` for its directory names. This will be aligned in a follow-up change. 
+ +### ARM64-specific behavior + +- **SMT** is disabled (Firecracker only allows SMT to be enabled on x86_64 hosts and rejects it on ARM64) +- **CPU detection** uses fallback values since `gopsutil` doesn't populate Family/Model on ARM64 +- **OCI platform** is set to the target architecture instead of hardcoded `amd64` +- **Busybox binary** must be swapped before building: `make fetch-busybox` (run automatically by `make build-local` and `make build-debug`) + +### Cross-architecture deployment + +`TARGET_ARCH` is a **runtime** environment variable that overrides the architecture used for path resolution and OCI image pulls. When unset, it defaults to the host architecture (`runtime.GOARCH`). + +```bash +# Run orchestrator targeting amd64 paths from an arm64 host +TARGET_ARCH=amd64 ./bin/orchestrator + +# Or in .env file (read at runtime) +echo "TARGET_ARCH=amd64" >> .env.local +``` + +`TARGET_ARCH` affects: +- Firecracker and kernel binary path resolution (`{version}/{arch}/...`) +- OCI image platform for container pulls + +It does **not** affect: +- Makefile compilation — use the Makefile's `BUILD_ARCH` variable for cross-compilation: `BUILD_ARCH=amd64 make build-local` +- Hardware-dependent runtime behavior (SMT detection, CPU info) which always uses the actual host architecture + +--- + ## Environment Variables Automatically set in local mode. 
Set before running to override: +- `TARGET_ARCH` - Target architecture override (`amd64` or `arm64`; default: host architecture) - `HOST_ENVD_PATH` - Envd binary path (default: `../envd/bin/envd`) - `HOST_KERNELS_DIR` - Kernel versions dir (local: `{storage}/kernels`, prod: `/fc-kernels`) - `FIRECRACKER_VERSIONS_DIR` - Firecracker versions dir (local: `{storage}/fc-versions`, prod: `/fc-versions`) diff --git a/packages/orchestrator/cmd/clean-nfs-cache/cleaner/clean.go b/packages/orchestrator/cmd/clean-nfs-cache/cleaner/clean.go index 6365c52eac..afc7377815 100644 --- a/packages/orchestrator/cmd/clean-nfs-cache/cleaner/clean.go +++ b/packages/orchestrator/cmd/clean-nfs-cache/cleaner/clean.go @@ -99,7 +99,7 @@ type Candidate struct { } type statReq struct { - df *os.File + dirPath string name string response chan *statReq f *File diff --git a/packages/orchestrator/cmd/clean-nfs-cache/cleaner/scan.go b/packages/orchestrator/cmd/clean-nfs-cache/cleaner/scan.go index e2b61fea50..42ad6dd39d 100644 --- a/packages/orchestrator/cmd/clean-nfs-cache/cleaner/scan.go +++ b/packages/orchestrator/cmd/clean-nfs-cache/cleaner/scan.go @@ -60,7 +60,7 @@ func (c *Cleaner) Statter(ctx context.Context, done *sync.WaitGroup) { case <-ctx.Done(): return case req := <-c.statRequestCh: - f, err := c.statInDir(req.df, req.name) + f, err := c.statInDir(req.dirPath, req.name) req.f = f req.err = err req.response <- req @@ -201,13 +201,16 @@ func (c *Cleaner) scanDir(ctx context.Context, path []*Dir) (out *Dir, err error } } - // submit all stat requests + // Submit stat requests using the directory path (not the *os.File). + // The file descriptor df is closed when scanDir returns (defer above), + // but Statter goroutines may still be processing requests concurrently. + // Passing the path avoids a race between df.Close() and df.Fd(). 
responseCh := make(chan *statReq, len(filenames)) for _, name := range filenames { select { case <-ctx.Done(): return nil, ctx.Err() - case c.statRequestCh <- &statReq{df: df, name: name, response: responseCh}: + case c.statRequestCh <- &statReq{dirPath: absPath, name: name, response: responseCh}: // submitted } } diff --git a/packages/orchestrator/cmd/clean-nfs-cache/cleaner/stat_linux.go b/packages/orchestrator/cmd/clean-nfs-cache/cleaner/stat_linux.go index 2337a69375..6ab9f8914f 100644 --- a/packages/orchestrator/cmd/clean-nfs-cache/cleaner/stat_linux.go +++ b/packages/orchestrator/cmd/clean-nfs-cache/cleaner/stat_linux.go @@ -4,7 +4,7 @@ package cleaner import ( "fmt" - "os" + "path/filepath" "golang.org/x/sys/unix" ) @@ -30,11 +30,11 @@ func (c *Cleaner) stat(fullPath string) (*Candidate, error) { }, nil } -func (c *Cleaner) statInDir(df *os.File, filename string) (*File, error) { +func (c *Cleaner) statInDir(dirPath string, filename string) (*File, error) { c.StatxC.Add(1) c.StatxInDirC.Add(1) var statx unix.Statx_t - err := unix.Statx(int(df.Fd()), filename, + err := unix.Statx(unix.AT_FDCWD, filepath.Join(dirPath, filename), unix.AT_STATX_DONT_SYNC|unix.AT_SYMLINK_NOFOLLOW|unix.AT_NO_AUTOMOUNT, unix.STATX_ATIME|unix.STATX_SIZE, &statx, diff --git a/packages/orchestrator/cmd/clean-nfs-cache/cleaner/stat_osx.go b/packages/orchestrator/cmd/clean-nfs-cache/cleaner/stat_osx.go index dc69da012a..2b793f4397 100644 --- a/packages/orchestrator/cmd/clean-nfs-cache/cleaner/stat_osx.go +++ b/packages/orchestrator/cmd/clean-nfs-cache/cleaner/stat_osx.go @@ -31,10 +31,10 @@ func (c *Cleaner) stat(path string) (*Candidate, error) { }, nil } -func (c *Cleaner) statInDir(df *os.File, filename string) (*File, error) { +func (c *Cleaner) statInDir(dirPath string, filename string) (*File, error) { c.StatxInDirC.Add(1) - // performance on OS X doeas not matter, so just use the full stat - cand, err := c.stat(filepath.Join(df.Name(), filename)) + // performance on OS X does not 
matter, so just use the full stat + cand, err := c.stat(filepath.Join(dirPath, filename)) if err != nil { return nil, err } diff --git a/packages/orchestrator/cmd/create-build/main.go b/packages/orchestrator/cmd/create-build/main.go index fac3b2578b..660ece4499 100644 --- a/packages/orchestrator/cmd/create-build/main.go +++ b/packages/orchestrator/cmd/create-build/main.go @@ -412,63 +412,141 @@ func printLocalFileSizes(basePath, buildID string) { } func setupKernel(ctx context.Context, dir, version string) error { - dstPath := filepath.Join(dir, version, "vmlinux.bin") + arch := utils.TargetArch() + dstPath := filepath.Join(dir, version, arch, "vmlinux.bin") + if err := os.MkdirAll(filepath.Dir(dstPath), 0o755); err != nil { return fmt.Errorf("mkdir kernel dir: %w", err) } if _, err := os.Stat(dstPath); err == nil { - fmt.Printf("✓ Kernel %s exists\n", version) + fmt.Printf("✓ Kernel %s (%s) exists\n", version, arch) return nil } - kernelURL, _ := url.JoinPath("https://storage.googleapis.com/e2b-prod-public-builds/kernels/", version, "vmlinux.bin") - fmt.Printf("⬇ Downloading kernel %s...\n", version) + // Try arch-specific URL first: {version}/{arch}/vmlinux.bin + archURL, err := url.JoinPath("https://storage.googleapis.com/e2b-prod-public-builds/kernels/", version, arch, "vmlinux.bin") + if err != nil { + return fmt.Errorf("invalid kernel URL: %w", err) + } + + fmt.Printf("⬇ Downloading kernel %s (%s)...\n", version, arch) + + if err := download(ctx, archURL, dstPath, 0o644); err == nil { + return nil + } else if !errors.Is(err, errNotFound) { + return fmt.Errorf("failed to download kernel: %w", err) + } + + // Legacy URLs are x86_64-only; only fall back for amd64. 
+ if arch != "amd64" { + return fmt.Errorf("kernel %s not found for %s (no legacy fallback for non-amd64)", version, arch) + } + + legacyURL, err := url.JoinPath("https://storage.googleapis.com/e2b-prod-public-builds/kernels/", version, "vmlinux.bin") + if err != nil { + return fmt.Errorf("invalid kernel legacy URL: %w", err) + } + + fmt.Printf(" %s path not found, trying legacy URL...\n", arch) - return download(ctx, kernelURL, dstPath, 0o644) + return download(ctx, legacyURL, dstPath, 0o644) } func setupFC(ctx context.Context, dir, version string) error { - dstPath := filepath.Join(dir, version, "firecracker") + arch := utils.TargetArch() + dstPath := filepath.Join(dir, version, arch, "firecracker") + if err := os.MkdirAll(filepath.Dir(dstPath), 0o755); err != nil { return fmt.Errorf("mkdir firecracker dir: %w", err) } if _, err := os.Stat(dstPath); err == nil { - fmt.Printf("✓ Firecracker %s exists\n", version) + fmt.Printf("✓ Firecracker %s (%s) exists\n", version, arch) return nil } - fcURL := fmt.Sprintf("https://github.com/e2b-dev/fc-versions/releases/download/%s/firecracker", version) - fmt.Printf("⬇ Downloading Firecracker %s...\n", version) + // Download from GCS bucket with {version}/{arch}/firecracker path + fcURL, err := url.JoinPath("https://storage.googleapis.com/e2b-prod-public-builds/fc-versions/", version, arch, "firecracker") + if err != nil { + return fmt.Errorf("invalid Firecracker URL: %w", err) + } + + fmt.Printf("⬇ Downloading Firecracker %s (%s)...\n", version, arch) + + if err := download(ctx, fcURL, dstPath, 0o755); err == nil { + return nil + } else if !errors.Is(err, errNotFound) { + return fmt.Errorf("failed to download Firecracker: %w", err) + } + + // Legacy URLs are x86_64-only; only fall back for amd64. 
+ if arch != "amd64" { + return fmt.Errorf("firecracker %s not found for %s (no legacy fallback for non-amd64)", version, arch) + } + + legacyURL, err := url.JoinPath("https://storage.googleapis.com/e2b-prod-public-builds/fc-versions/", version, "firecracker") + if err != nil { + return fmt.Errorf("invalid Firecracker legacy URL: %w", err) + } + + fmt.Printf(" %s path not found, trying legacy URL...\n", arch) - return download(ctx, fcURL, dstPath, 0o755) + return download(ctx, legacyURL, dstPath, 0o755) } -func download(ctx context.Context, url, path string, perm os.FileMode) error { - req, _ := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) +var errNotFound = errors.New("not found") + +func download(ctx context.Context, rawURL, path string, perm os.FileMode) error { + req, err := http.NewRequestWithContext(ctx, http.MethodGet, rawURL, nil) + if err != nil { + return fmt.Errorf("invalid download URL %s: %w", rawURL, err) + } + resp, err := (&http.Client{Timeout: 5 * time.Minute}).Do(req) if err != nil { return err } defer resp.Body.Close() + if resp.StatusCode == http.StatusNotFound { + return fmt.Errorf("%w: %s", errNotFound, rawURL) + } if resp.StatusCode != http.StatusOK { - return fmt.Errorf("HTTP %d: %s", resp.StatusCode, url) + return fmt.Errorf("HTTP %d: %s", resp.StatusCode, rawURL) } - f, err := os.OpenFile(path, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, perm) + // Write to a temporary file and rename atomically to avoid partial files + // on network errors or disk-full conditions. 
+ tmpPath := path + ".tmp" + + f, err := os.OpenFile(tmpPath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, perm) if err != nil { return err } - defer f.Close() - _, err = io.Copy(f, resp.Body) - if err == nil { - fmt.Printf("✓ Downloaded %s\n", filepath.Base(path)) + if _, err := io.Copy(f, resp.Body); err != nil { + f.Close() + os.Remove(tmpPath) + + return err + } + + if err := f.Close(); err != nil { + os.Remove(tmpPath) + + return err } - return err + if err := os.Rename(tmpPath, path); err != nil { + os.Remove(tmpPath) + + return err + } + + fmt.Printf("✓ Downloaded %s\n", filepath.Base(path)) + + return nil } diff --git a/packages/orchestrator/cmd/smoketest/smoke_test.go b/packages/orchestrator/cmd/smoketest/smoke_test.go index 94fd697a10..dc4c8c35ae 100644 --- a/packages/orchestrator/cmd/smoketest/smoke_test.go +++ b/packages/orchestrator/cmd/smoketest/smoke_test.go @@ -8,6 +8,7 @@ import ( "os" "os/exec" "path/filepath" + "runtime" "testing" "time" @@ -282,7 +283,7 @@ func findOrBuildEnvd(t *testing.T) string { cmd := exec.CommandContext(t.Context(), "go", "build", "-o", binPath, ".") //nolint:gosec // trusted input cmd.Dir = envdDir - cmd.Env = append(os.Environ(), "CGO_ENABLED=0", "GOOS=linux", "GOARCH=amd64") + cmd.Env = append(os.Environ(), "CGO_ENABLED=0", "GOOS=linux", "GOARCH="+runtime.GOARCH) out, err := cmd.CombinedOutput() if err != nil { t.Skipf("failed to build envd: %v\n%s", err, out) diff --git a/packages/orchestrator/pkg/sandbox/fc/client.go b/packages/orchestrator/pkg/sandbox/fc/client.go index 1af2db0ec8..2b67174205 100644 --- a/packages/orchestrator/pkg/sandbox/fc/client.go +++ b/packages/orchestrator/pkg/sandbox/fc/client.go @@ -3,6 +3,7 @@ package fc import ( "context" "fmt" + "runtime" "github.com/bits-and-blooms/bitset" "github.com/firecracker-microvm/firecracker-go-sdk" @@ -326,7 +327,15 @@ func (c *apiClient) setMachineConfig( memoryMB int64, hugePages bool, ) error { - smt := true + // SMT (Simultaneous Multi-Threading / Hyper-Threading) 
must be disabled on + // ARM64 because ARM processors use a different core topology (big.LITTLE, + // efficiency/performance cores) rather than hardware threads per core. + // Firecracker validates this against the host CPU and rejects SMT=true on ARM. + // See: https://github.com/firecracker-microvm/firecracker/blob/main/docs/cpu_templates/cpu-features.md + // We use runtime.GOARCH (not TARGET_ARCH) because the orchestrator binary + // always runs on the same architecture as Firecracker. + const archARM64 = "arm64" + smt := runtime.GOARCH != archARM64 trackDirtyPages := false machineConfig := &models.MachineConfiguration{ VcpuCount: &vCPUCount, diff --git a/packages/orchestrator/pkg/sandbox/fc/config.go b/packages/orchestrator/pkg/sandbox/fc/config.go index 23cca45da5..f6f509aafd 100644 --- a/packages/orchestrator/pkg/sandbox/fc/config.go +++ b/packages/orchestrator/pkg/sandbox/fc/config.go @@ -1,9 +1,11 @@ package fc import ( + "os" "path/filepath" "github.com/e2b-dev/infra/packages/orchestrator/pkg/cfg" + "github.com/e2b-dev/infra/packages/shared/pkg/utils" ) const ( @@ -31,10 +33,25 @@ func (t Config) SandboxKernelDir() string { } func (t Config) HostKernelPath(config cfg.BuilderConfig) string { + // Prefer arch-prefixed path ({version}/{arch}/vmlinux.bin) for multi-arch support. + // Fall back to legacy flat path ({version}/vmlinux.bin) for existing production nodes. + archPath := filepath.Join(config.HostKernelsDir, t.KernelVersion, utils.TargetArch(), SandboxKernelFile) + if _, err := os.Stat(archPath); err == nil { + return archPath + } + return filepath.Join(config.HostKernelsDir, t.KernelVersion, SandboxKernelFile) } func (t Config) FirecrackerPath(config cfg.BuilderConfig) string { + // Prefer arch-prefixed path ({version}/{arch}/firecracker) for multi-arch support. + // Fall back to legacy flat path ({version}/firecracker) for existing production nodes + // that haven't migrated to the arch-prefixed layout yet. 
+ archPath := filepath.Join(config.FirecrackerVersionsDir, t.FirecrackerVersion, utils.TargetArch(), FirecrackerBinaryName) + if _, err := os.Stat(archPath); err == nil { + return archPath + } + return filepath.Join(config.FirecrackerVersionsDir, t.FirecrackerVersion, FirecrackerBinaryName) } diff --git a/packages/orchestrator/pkg/sandbox/fc/config_test.go b/packages/orchestrator/pkg/sandbox/fc/config_test.go new file mode 100644 index 0000000000..0dff191793 --- /dev/null +++ b/packages/orchestrator/pkg/sandbox/fc/config_test.go @@ -0,0 +1,113 @@ +package fc + +import ( + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/e2b-dev/infra/packages/orchestrator/pkg/cfg" + "github.com/e2b-dev/infra/packages/shared/pkg/utils" +) + +func TestFirecrackerPath_ArchPrefixed(t *testing.T) { + t.Parallel() + dir := t.TempDir() + arch := utils.TargetArch() + + // Create the arch-prefixed binary + archDir := filepath.Join(dir, "v1.12.0", arch) + require.NoError(t, os.MkdirAll(archDir, 0o755)) + require.NoError(t, os.WriteFile(filepath.Join(archDir, "firecracker"), []byte("binary"), 0o755)) + + config := cfg.BuilderConfig{FirecrackerVersionsDir: dir} + fc := Config{FirecrackerVersion: "v1.12.0"} + + result := fc.FirecrackerPath(config) + + assert.Equal(t, filepath.Join(dir, "v1.12.0", arch, "firecracker"), result) +} + +func TestFirecrackerPath_LegacyFallback(t *testing.T) { + t.Parallel() + dir := t.TempDir() + + // Only create the legacy flat binary (no arch subdirectory) + require.NoError(t, os.MkdirAll(filepath.Join(dir, "v1.12.0"), 0o755)) + require.NoError(t, os.WriteFile(filepath.Join(dir, "v1.12.0", "firecracker"), []byte("binary"), 0o755)) + + config := cfg.BuilderConfig{FirecrackerVersionsDir: dir} + fc := Config{FirecrackerVersion: "v1.12.0"} + + result := fc.FirecrackerPath(config) + + assert.Equal(t, filepath.Join(dir, "v1.12.0", "firecracker"), result) +} + +func 
TestFirecrackerPath_NeitherExists(t *testing.T) { + t.Parallel() + dir := t.TempDir() + + // No binary at all — should return legacy flat path + config := cfg.BuilderConfig{FirecrackerVersionsDir: dir} + fc := Config{FirecrackerVersion: "v1.12.0"} + + result := fc.FirecrackerPath(config) + + assert.Equal(t, filepath.Join(dir, "v1.12.0", "firecracker"), result) +} + +func TestHostKernelPath_ArchPrefixed(t *testing.T) { + t.Parallel() + dir := t.TempDir() + arch := utils.TargetArch() + + // Create the arch-prefixed kernel + archDir := filepath.Join(dir, "vmlinux-6.1.102", arch) + require.NoError(t, os.MkdirAll(archDir, 0o755)) + require.NoError(t, os.WriteFile(filepath.Join(archDir, "vmlinux.bin"), []byte("kernel"), 0o644)) + + config := cfg.BuilderConfig{HostKernelsDir: dir} + fc := Config{KernelVersion: "vmlinux-6.1.102"} + + result := fc.HostKernelPath(config) + + assert.Equal(t, filepath.Join(dir, "vmlinux-6.1.102", arch, "vmlinux.bin"), result) +} + +func TestHostKernelPath_LegacyFallback(t *testing.T) { + t.Parallel() + dir := t.TempDir() + + // Only create the legacy flat kernel + require.NoError(t, os.MkdirAll(filepath.Join(dir, "vmlinux-6.1.102"), 0o755)) + require.NoError(t, os.WriteFile(filepath.Join(dir, "vmlinux-6.1.102", "vmlinux.bin"), []byte("kernel"), 0o644)) + + config := cfg.BuilderConfig{HostKernelsDir: dir} + fc := Config{KernelVersion: "vmlinux-6.1.102"} + + result := fc.HostKernelPath(config) + + assert.Equal(t, filepath.Join(dir, "vmlinux-6.1.102", "vmlinux.bin"), result) +} + +func TestHostKernelPath_PrefersArchOverLegacy(t *testing.T) { + t.Parallel() + dir := t.TempDir() + arch := utils.TargetArch() + + // Create BOTH arch-prefixed and legacy flat kernels + require.NoError(t, os.MkdirAll(filepath.Join(dir, "vmlinux-6.1.102", arch), 0o755)) + require.NoError(t, os.WriteFile(filepath.Join(dir, "vmlinux-6.1.102", arch, "vmlinux.bin"), []byte("arch-kernel"), 0o644)) + require.NoError(t, os.WriteFile(filepath.Join(dir, "vmlinux-6.1.102", 
"vmlinux.bin"), []byte("legacy-kernel"), 0o644)) + + config := cfg.BuilderConfig{HostKernelsDir: dir} + fc := Config{KernelVersion: "vmlinux-6.1.102"} + + result := fc.HostKernelPath(config) + + // Should prefer the arch-prefixed path + assert.Equal(t, filepath.Join(dir, "vmlinux-6.1.102", arch, "vmlinux.bin"), result) +} diff --git a/packages/orchestrator/pkg/sandbox/fc/script_builder.go b/packages/orchestrator/pkg/sandbox/fc/script_builder.go index 71bbf497f6..c1773d7fd3 100644 --- a/packages/orchestrator/pkg/sandbox/fc/script_builder.go +++ b/packages/orchestrator/pkg/sandbox/fc/script_builder.go @@ -4,6 +4,7 @@ import ( "bytes" "fmt" "path/filepath" + "runtime" txtTemplate "text/template" "github.com/e2b-dev/infra/packages/orchestrator/pkg/cfg" @@ -25,6 +26,7 @@ type startScriptArgs struct { NamespaceID string FirecrackerPath string FirecrackerSocket string + ExtraArgs string } // StartScriptResult contains the generated script and computed paths @@ -47,7 +49,7 @@ ln -s {{ .HostRootfsPath }} {{ .DeprecatedSandboxRootfsDir }}/{{ .SandboxRootfsF mount -t tmpfs tmpfs {{ .SandboxDir }}/{{ .SandboxKernelDir }} -o X-mount.mkdir && ln -s {{ .HostKernelPath }} {{ .SandboxDir }}/{{ .SandboxKernelDir }}/{{ .SandboxKernelFile }} && -ip netns exec {{ .NamespaceID }} {{ .FirecrackerPath }} --api-sock {{ .FirecrackerSocket }}` +ip netns exec {{ .NamespaceID }} {{ .FirecrackerPath }} --api-sock {{ .FirecrackerSocket }}{{ .ExtraArgs }}` const startScriptV2 = `mount --make-rprivate / && mount -t tmpfs tmpfs {{ .SandboxDir }} -o X-mount.mkdir && @@ -57,7 +59,7 @@ ln -s {{ .HostRootfsPath }} {{ .SandboxDir }}/{{ .SandboxRootfsFile }} && mkdir -p {{ .SandboxDir }}/{{ .SandboxKernelDir }} && ln -s {{ .HostKernelPath }} {{ .SandboxDir }}/{{ .SandboxKernelDir }}/{{ .SandboxKernelFile }} && -ip netns exec {{ .NamespaceID }} {{ .FirecrackerPath }} --api-sock {{ .FirecrackerSocket }}` +ip netns exec {{ .NamespaceID }} {{ .FirecrackerPath }} --api-sock {{ .FirecrackerSocket }}{{ 
.ExtraArgs }}` // StartScriptBuilder handles the creation and execution of firecracker start scripts type StartScriptBuilder struct { @@ -85,6 +87,15 @@ func (sb *StartScriptBuilder) buildArgs( rootfsPaths RootfsPaths, namespaceID string, ) startScriptArgs { + // On ARM64, disable seccomp to allow userfaultfd syscall for snapshot restore. + // The upstream Firecracker seccomp filter for aarch64 does not include the + // userfaultfd syscall (nr 282), causing snapshot loading to fail with + // "Failed to UFFD object: System error". + var extraArgs string + if runtime.GOARCH == "arm64" { + extraArgs = " --no-seccomp" + } + return startScriptArgs{ // General SandboxDir: sb.builderConfig.SandboxDir, @@ -103,6 +114,7 @@ func (sb *StartScriptBuilder) buildArgs( NamespaceID: namespaceID, FirecrackerPath: versions.FirecrackerPath(sb.builderConfig), FirecrackerSocket: files.SandboxFirecrackerSocketPath(), + ExtraArgs: extraArgs, } } diff --git a/packages/orchestrator/pkg/sandbox/nbd/path_direct.go b/packages/orchestrator/pkg/sandbox/nbd/path_direct.go index 9a5d362edf..bc934e1228 100644 --- a/packages/orchestrator/pkg/sandbox/nbd/path_direct.go +++ b/packages/orchestrator/pkg/sandbox/nbd/path_direct.go @@ -78,7 +78,7 @@ func (d *DirectPathMount) Open(ctx context.Context) (retDeviceIndex uint32, err telemetry.ReportEvent(ctx, "got backend size") - deviceIndex := uint32(math.MaxUint32) + var deviceIndex uint32 for { deviceIndex, err = d.devicePool.GetDevice(ctx) @@ -119,14 +119,19 @@ func (d *DirectPathMount) Open(ctx context.Context) (retDeviceIndex uint32, err server.Close() dispatch := NewDispatch(serverc, d.Backend) + // Capture loop variables for the goroutine closure to avoid a data + // race: deviceIndex is reassigned on each retry iteration of the + // outer for-loop while the goroutine may still read it. 
+ devIdx := deviceIndex + sockIdx := i // Start reading commands on the socket and dispatching them to our provider d.handlersWg.Go(func() { handleErr := dispatch.Handle(ctx) // The error is expected to happen if the nbd (socket connection) is closed logger.L().Info(ctx, "closing handler for NBD commands", zap.Error(handleErr), - zap.Uint32("device_index", deviceIndex), - zap.Int("socket_index", i), + zap.Uint32("device_index", devIdx), + zap.Int("socket_index", sockIdx), ) }) diff --git a/packages/orchestrator/pkg/sandbox/uffd/testutils/page_mmap.go b/packages/orchestrator/pkg/sandbox/uffd/testutils/page_mmap.go index 929a396702..ac17b0788f 100644 --- a/packages/orchestrator/pkg/sandbox/uffd/testutils/page_mmap.go +++ b/packages/orchestrator/pkg/sandbox/uffd/testutils/page_mmap.go @@ -1,6 +1,7 @@ package testutils import ( + "errors" "fmt" "math" "syscall" @@ -20,7 +21,16 @@ func NewPageMmap(t *testing.T, size, pagesize uint64) ([]byte, uintptr, error) { } if pagesize == header.HugepageSize { - return newMmap(t, size, header.HugepageSize, unix.MAP_HUGETLB|unix.MAP_HUGE_2MB) + b, addr, err := newMmap(t, size, header.HugepageSize, unix.MAP_HUGETLB|unix.MAP_HUGE_2MB) + // Hugepage allocation can fail with ENOMEM on CI runners that don't + // have enough (or any) hugepages pre-allocated in /proc/sys/vm/nr_hugepages. + // Skip gracefully rather than failing the test. 
+ if err != nil && errors.Is(err, syscall.ENOMEM) { + pages := int(math.Ceil(float64(size) / float64(header.HugepageSize))) + t.Skipf("skipping: hugepage mmap failed (need %d hugepages): %v", pages, err) + } + + return b, addr, err } return nil, 0, fmt.Errorf("unsupported page size: %d", pagesize) diff --git a/packages/orchestrator/pkg/service/machineinfo/main.go b/packages/orchestrator/pkg/service/machineinfo/main.go index 27d280da8b..f221902f87 100644 --- a/packages/orchestrator/pkg/service/machineinfo/main.go +++ b/packages/orchestrator/pkg/service/machineinfo/main.go @@ -22,13 +22,27 @@ func Detect() (MachineInfo, error) { } if len(info) > 0 { - if info[0].Family == "" || info[0].Model == "" { + family := info[0].Family + model := info[0].Model + + // On ARM64, gopsutil doesn't populate Family/Model from /proc/cpuinfo. + // Provide fallback values so callers don't get an error. + if runtime.GOARCH == "arm64" { + if family == "" { + family = "arm64" + } + if model == "" { + model = "0" + } + } + + if family == "" || model == "" { return MachineInfo{}, fmt.Errorf("unable to detect CPU platform from CPU info: %+v", info[0]) } return MachineInfo{ - Family: info[0].Family, - Model: info[0].Model, + Family: family, + Model: model, ModelName: info[0].ModelName, Flags: info[0].Flags, Arch: runtime.GOARCH, diff --git a/packages/orchestrator/pkg/template/build/core/oci/oci.go b/packages/orchestrator/pkg/template/build/core/oci/oci.go index 34f8aa81b5..dc57314c62 100644 --- a/packages/orchestrator/pkg/template/build/core/oci/oci.go +++ b/packages/orchestrator/pkg/template/build/core/oci/oci.go @@ -56,9 +56,12 @@ func (e *ImageTooLargeError) Error() string { ) } -var DefaultPlatform = containerregistry.Platform{ - OS: "linux", - Architecture: "amd64", +// DefaultPlatform returns the OCI platform for image pulls, respecting TARGET_ARCH. 
+func DefaultPlatform() containerregistry.Platform { + return containerregistry.Platform{ + OS: "linux", + Architecture: utils.TargetArch(), + } } // wrapImagePullError converts technical Docker registry errors into user-friendly messages. @@ -96,7 +99,7 @@ func GetPublicImage(ctx context.Context, dockerhubRepository dockerhub.RemoteRep return nil, fmt.Errorf("invalid image reference '%s': %w", tag, err) } - platform := DefaultPlatform + platform := DefaultPlatform() // When no auth provider is provided and the image is from the default registry // use docker remote repository proxy with cached images @@ -149,7 +152,7 @@ func GetImage(ctx context.Context, artifactRegistry artifactsregistry.ArtifactsR childCtx, childSpan := tracer.Start(ctx, "pull-docker-image") defer childSpan.End() - platform := DefaultPlatform + platform := DefaultPlatform() img, err := artifactRegistry.GetImage(childCtx, templateId, buildId, platform) if err != nil { @@ -469,7 +472,7 @@ func verifyImagePlatform(img containerregistry.Image, platform containerregistry return fmt.Errorf("error getting image config file: %w", err) } if config.Architecture != platform.Architecture { - return fmt.Errorf("image is not %s", platform.Architecture) + return fmt.Errorf("image architecture %q does not match expected %q", config.Architecture, platform.Architecture) } return nil diff --git a/packages/orchestrator/pkg/template/build/core/oci/oci_test.go b/packages/orchestrator/pkg/template/build/core/oci/oci_test.go index aa0fa421cd..5b774f8682 100644 --- a/packages/orchestrator/pkg/template/build/core/oci/oci_test.go +++ b/packages/orchestrator/pkg/template/build/core/oci/oci_test.go @@ -26,6 +26,7 @@ import ( "github.com/e2b-dev/infra/packages/shared/pkg/dockerhub" templatemanager "github.com/e2b-dev/infra/packages/shared/pkg/grpc/template-manager" "github.com/e2b-dev/infra/packages/shared/pkg/logger" + "github.com/e2b-dev/infra/packages/shared/pkg/utils" ) func createFileTar(t *testing.T, fileName string) 
*bytes.Buffer { @@ -213,7 +214,7 @@ func TestGetPublicImageWithGeneralAuth(t *testing.T) { // Set the config to include the proper platform configFile, err := testImage.ConfigFile() require.NoError(t, err) - configFile.Architecture = "amd64" + configFile.Architecture = utils.TargetArch() configFile.OS = "linux" testImage, err = mutate.ConfigFile(testImage, configFile) require.NoError(t, err) diff --git a/packages/shared/pkg/storage/gcp_multipart_test.go b/packages/shared/pkg/storage/gcp_multipart_test.go index c0daaa6eef..7fe4d397ce 100644 --- a/packages/shared/pkg/storage/gcp_multipart_test.go +++ b/packages/shared/pkg/storage/gcp_multipart_test.go @@ -172,7 +172,7 @@ func TestMultipartUploader_UploadFileInParallel_Success(t *testing.T) { var uploadID string var initiateCount, uploadPartCount, completeCount int32 - receivedParts := make(map[int]string) + receivedParts := sync.Map{} handler := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { switch { @@ -194,7 +194,7 @@ func TestMultipartUploader_UploadFileInParallel_Success(t *testing.T) { // Upload part partNum := atomic.AddInt32(&uploadPartCount, 1) body, _ := io.ReadAll(r.Body) - receivedParts[int(partNum)] = string(body) + receivedParts.Store(int(partNum), string(body)) w.Header().Set("ETag", fmt.Sprintf(`"etag%d"`, partNum)) w.WriteHeader(http.StatusOK) @@ -217,7 +217,9 @@ func TestMultipartUploader_UploadFileInParallel_Success(t *testing.T) { // Verify all parts were uploaded and content matches var reconstructed strings.Builder for i := 1; i <= int(atomic.LoadInt32(&uploadPartCount)); i++ { - reconstructed.WriteString(receivedParts[i]) + if part, ok := receivedParts.Load(i); ok { + reconstructed.WriteString(part.(string)) + } } require.Equal(t, testContent, reconstructed.String()) } @@ -655,6 +657,7 @@ func TestMultipartUploader_BoundaryConditions_ExactChunkSize(t *testing.T) { require.NoError(t, err) var partSizes []int + var partSizesMu sync.Mutex handler := http.HandlerFunc(func(w 
http.ResponseWriter, r *http.Request) { switch { @@ -670,7 +673,9 @@ func TestMultipartUploader_BoundaryConditions_ExactChunkSize(t *testing.T) { case strings.Contains(r.URL.RawQuery, "partNumber"): body, _ := io.ReadAll(r.Body) + partSizesMu.Lock() partSizes = append(partSizes, len(body)) + partSizesMu.Unlock() partNum := strings.Split(strings.Split(r.URL.RawQuery, "partNumber=")[1], "&")[0] w.Header().Set("ETag", fmt.Sprintf(`"boundary-etag-%s"`, partNum)) @@ -904,10 +909,13 @@ func TestRetryableClient_ActualRetryBehavior(t *testing.T) { var requestCount int32 var retryDelays []time.Duration var retryTimes []time.Time + var retryMu sync.Mutex server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { count := atomic.AddInt32(&requestCount, 1) + retryMu.Lock() retryTimes = append(retryTimes, time.Now()) + retryMu.Unlock() if count < 3 { w.WriteHeader(http.StatusInternalServerError) diff --git a/packages/shared/pkg/utils/env.go b/packages/shared/pkg/utils/env.go index 82305689d7..24864dfc0f 100644 --- a/packages/shared/pkg/utils/env.go +++ b/packages/shared/pkg/utils/env.go @@ -3,9 +3,35 @@ package utils import ( "fmt" "os" + "runtime" "strings" ) +// archAliases normalizes common architecture names to Go convention. +var archAliases = map[string]string{ + "amd64": "amd64", + "x86_64": "amd64", + "arm64": "arm64", + "aarch64": "arm64", +} + +// TargetArch returns the target architecture for binary paths and OCI platform. +// If TARGET_ARCH is set, it is normalized to Go convention ("amd64" or "arm64"); +// otherwise defaults to the host architecture (runtime.GOARCH). 
+func TargetArch() string { + if arch := os.Getenv("TARGET_ARCH"); arch != "" { + if normalized, ok := archAliases[arch]; ok { + return normalized + } + + fmt.Fprintf(os.Stderr, "WARNING: unrecognized TARGET_ARCH=%q, falling back to %s\n", arch, runtime.GOARCH) + + return runtime.GOARCH + } + + return runtime.GOARCH +} + // RequiredEnv returns the value of the environment variable for key if it is set, non-empty and not only whitespace. // It panics otherwise. // diff --git a/packages/shared/pkg/utils/env_test.go b/packages/shared/pkg/utils/env_test.go new file mode 100644 index 0000000000..9684ab83eb --- /dev/null +++ b/packages/shared/pkg/utils/env_test.go @@ -0,0 +1,66 @@ +package utils + +import ( + "runtime" + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestTargetArch_DefaultsToHostArch(t *testing.T) { + t.Setenv("TARGET_ARCH", "") + + result := TargetArch() + + assert.Equal(t, runtime.GOARCH, result) +} + +func TestTargetArch_RespectsValidOverride(t *testing.T) { + tests := []struct { + name string + arch string + expected string + }{ + {name: "amd64", arch: "amd64", expected: "amd64"}, + {name: "arm64", arch: "arm64", expected: "arm64"}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Setenv("TARGET_ARCH", tt.arch) + + result := TargetArch() + + assert.Equal(t, tt.expected, result) + }) + } +} + +func TestTargetArch_NormalizesAliases(t *testing.T) { + tests := []struct { + name string + arch string + expected string + }{ + {name: "x86_64 → amd64", arch: "x86_64", expected: "amd64"}, + {name: "aarch64 → arm64", arch: "aarch64", expected: "arm64"}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Setenv("TARGET_ARCH", tt.arch) + + result := TargetArch() + + assert.Equal(t, tt.expected, result) + }) + } +} + +func TestTargetArch_FallsBackOnUnknown(t *testing.T) { + t.Setenv("TARGET_ARCH", "mips") + + result := TargetArch() + + assert.Equal(t, runtime.GOARCH, result) +} diff --git 
a/packages/shared/pkg/utils/errorcollector_test.go b/packages/shared/pkg/utils/errorcollector_test.go index ff40d05368..bb1b37bab8 100644 --- a/packages/shared/pkg/utils/errorcollector_test.go +++ b/packages/shared/pkg/utils/errorcollector_test.go @@ -3,6 +3,7 @@ package utils import ( "context" "errors" + "sync/atomic" "testing" "github.com/stretchr/testify/assert" @@ -49,23 +50,31 @@ func TestErrorCollector(t *testing.T) { ec := NewErrorCollector(1) - // Block the collector's only slot + // Block the collector's only slot. + // ctx1 and ctx2 must be distinct variables: the closure passed to ec.Go + // captures the context variable by reference. If we reused a single "ctx" + // variable, the first closure's <-ctx.Done() would race with the main + // goroutine's reassignment of ctx on the second WithCancel call. started := make(chan struct{}) - ctx, cancel1 := context.WithCancel(t.Context()) - ec.Go(ctx, func() error { + ctx1, cancel1 := context.WithCancel(t.Context()) + ec.Go(ctx1, func() error { close(started) - <-ctx.Done() + <-ctx1.Done() return nil }) <-started - // This Go call should block on the semaphore - var wasCalled bool - ctx, cancel2 := context.WithCancel(t.Context()) - ec.Go(ctx, func() error { - wasCalled = true + // This Go call should block on the semaphore. + // wasCalled must be atomic: the goroutine spawned by ec.Go may write it + // concurrently with the main goroutine's read in assert.False below. + // A plain bool causes a data race that the -race detector catches on ARM64 + // (weaker memory model) even though it appears safe on x86. 
+ var wasCalled atomic.Bool + ctx2, cancel2 := context.WithCancel(t.Context()) + ec.Go(ctx2, func() error { + wasCalled.Store(true) return nil }) @@ -78,6 +87,6 @@ func TestErrorCollector(t *testing.T) { err := ec.Wait() require.ErrorIs(t, err, context.Canceled) - assert.False(t, wasCalled) + assert.False(t, wasCalled.Load()) }) } diff --git a/packages/shared/scripts/package-lock.json b/packages/shared/scripts/package-lock.json index 570b6efbf4..9498773161 100644 --- a/packages/shared/scripts/package-lock.json +++ b/packages/shared/scripts/package-lock.json @@ -102,15 +102,15 @@ } }, "node_modules/brace-expansion": { - "version": "5.0.2", - "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-5.0.2.tgz", - "integrity": "sha512-Pdk8c9poy+YhOgVWw1JNN22/HcivgKWwpxKq04M/jTmHyCZn12WPJebZxdjSa5TmBqISrUSgNYU3eRORljfCCw==", + "version": "5.0.5", + "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-5.0.5.tgz", + "integrity": "sha512-VZznLgtwhn+Mact9tfiwx64fA9erHH/MCXEUfB/0bX/6Fz6ny5EGTXYltMocqg4xFAQZtnO3DHWWXi8RiuN7cQ==", "license": "MIT", "dependencies": { "balanced-match": "^4.0.2" }, "engines": { - "node": "20 || >=22" + "node": "18 || 20 || >=22" } }, "node_modules/chalk": {