diff --git a/.fallowrc.json b/.fallowrc.json index a1206627f..ce94eba6e 100644 --- a/.fallowrc.json +++ b/.fallowrc.json @@ -25,6 +25,7 @@ ], "ignorePatterns": [ "examples/test-app/**", + "scripts/perf/**", "ios-runner/AgentDeviceRunner/AgentDeviceRunnerUITests.xctestplan", "scripts/write-xcuitest-cache-metadata.mjs" ], diff --git a/.github/workflows/perf-nightly.yml b/.github/workflows/perf-nightly.yml new file mode 100644 index 000000000..d317d33be --- /dev/null +++ b/.github/workflows/perf-nightly.yml @@ -0,0 +1,127 @@ +name: Perf Nightly + +# End-to-end command perf benchmark (scripts/perf). Scheduled + manual only — perf timing on +# shared CI runners is noisy, so treat this as a trend/regression signal, not absolute numbers. +# Reuses the same build artifacts as the device suites: the cached iOS XCUITest runner +# (setup-apple-replay, ios-runner-prebuilt cache) and the Android replay host, and runs the CLI +# from source via --experimental-strip-types (no dist build), matching the replay workflows. 
+ +on: + schedule: + - cron: "0 4 * * *" + workflow_dispatch: + inputs: + rounds: + description: "Measured rounds per command (samples)" + required: false + default: "5" + +permissions: + contents: read + +concurrency: + group: ci-${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +env: + AGENT_DEVICE_PERF_CLI: "--experimental-strip-types src/bin.ts" + PERF_ROUNDS: ${{ github.event.inputs.rounds || '5' }} + +jobs: + perf-ios: + name: iOS Command Perf + runs-on: macos-26 + timeout-minutes: 80 + env: + IOS_RUNTIME_VERSION: "26.2" + AGENT_DEVICE_IOS_RUNNER_DERIVED_PATH: ${{ github.workspace }}/.tmp/ios-runner-derived + steps: + - name: Checkout + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - name: Setup toolchain + uses: ./.github/actions/setup-node-pnpm + + - name: Setup Apple replay + id: apple-replay + uses: ./.github/actions/setup-apple-replay + with: + derived-path: ${{ env.AGENT_DEVICE_IOS_RUNNER_DERIVED_PATH }} + cache-key-prefix: ios-runner-prebuilt + cache-key-suffix: -ios-${{ env.IOS_RUNTIME_VERSION }} + build-command: sh ./scripts/build-xcuitest-apple.sh + xcuitest-platform: ios + xcuitest-destination: generic/platform=iOS Simulator + clean-derived: "1" + + - name: Boot iOS test simulator + uses: ./.github/actions/boot-ios-test-simulator + with: + runtime-version: ${{ env.IOS_RUNTIME_VERSION }} + preferred-device-name: iPhone 17 Pro + + - name: Run iOS command perf benchmark + run: | + pnpm clean:daemon + node --experimental-strip-types scripts/perf/run.ts \ + --platform ios \ + --device "iPhone 17 Pro" \ + --n "$PERF_ROUNDS" --warmup 1 \ + --out-dir "$GITHUB_WORKSPACE/perf-results" + + - name: Upload iOS perf report + if: always() + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + with: + name: perf-ios + path: perf-results/ + if-no-files-found: warn + + perf-android: + name: Android Command Perf + runs-on: ubuntu-latest + timeout-minutes: 80 + steps: + - name: Checkout + 
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - name: Setup toolchain + uses: ./.github/actions/setup-node-pnpm + + - name: Setup Android replay host + id: android-replay-host + uses: ./.github/actions/setup-android-replay-host + + - name: Package npm-bundled Android helpers + run: | + pnpm package:android-snapshot-helper:npm + pnpm package:android-multitouch-helper:npm + + - name: Run Android command perf benchmark + uses: reactivecircus/android-emulator-runner@b530d96654c385303d652368551fb075bc2f0b6b # v2.35.0 + with: + api-level: 36 + arch: x86_64 + profile: pixel_7 + target: google_apis_playstore + emulator-options: -no-window -gpu swiftshader_indirect -no-snapshot -noaudio -no-boot-anim -no-metrics + script: | + set -e + # Disable animations up front so accessibility dumps don't time out (the harness + # also runs `settings animations off`, this is belt-and-suspenders). + adb -s emulator-5554 shell settings put global window_animation_scale 0 || true + adb -s emulator-5554 shell settings put global transition_animation_scale 0 || true + adb -s emulator-5554 shell settings put global animator_duration_scale 0 || true + node --experimental-strip-types scripts/perf/run.ts \ + --platform android \ + --serial emulator-5554 \ + --n "$PERF_ROUNDS" --warmup 1 \ + --out-dir "$GITHUB_WORKSPACE/perf-results" + + - name: Upload Android perf report + if: always() + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + with: + name: perf-android + path: perf-results/ + if-no-files-found: warn diff --git a/.gitignore b/.gitignore index 1dc2da634..0f683df1c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ node_modules/ +scripts/perf/.results/ .pnpm-store/ .fallow/ dist/ diff --git a/package.json b/package.json index ed3bd7131..47428205b 100644 --- a/package.json +++ b/package.json @@ -98,6 +98,9 @@ "ad": "node bin/agent-device.mjs", "size": "node scripts/size-report.mjs", "size:markdown": "node 
scripts/size-report.mjs --json .tmp/size-report.json --markdown .tmp/size-report.md", + "perf": "node --experimental-strip-types scripts/perf/run.ts", + "perf:ios": "node --experimental-strip-types scripts/perf/run.ts --platform ios", + "perf:android": "node --experimental-strip-types scripts/perf/run.ts --platform android", "lint": "oxlint . --deny-warnings", "format": "oxfmt --write src test skills package.json tsconfig.json tsconfig.lib.json rslib.config.ts vitest.config.ts .github/actions/setup-node-pnpm/action.yml .oxlintrc.json .oxfmtrc.json '!test/skillgym/.skillgym-results/**'", "fallow": "fallow --summary", diff --git a/scripts/perf/cli.ts b/scripts/perf/cli.ts new file mode 100644 index 000000000..486cc5eb8 --- /dev/null +++ b/scripts/perf/cli.ts @@ -0,0 +1,99 @@ +import { performance } from 'node:perf_hooks'; +import { runCmdSync } from '../../src/utils/exec.ts'; +import { resolveCliArgv, REPO_ROOT } from './config.ts'; +import type { BatchStepSpec } from './scenario.ts'; +import type { CliResult } from './types.ts'; + +const MAX_BUFFER = 64 * 1024 * 1024; +const CLI_ARGV = resolveCliArgv(); + +function tryParseJson(stdout: string): unknown { + const trimmed = stdout.trim(); + if (!trimmed) return undefined; + try { + return JSON.parse(trimmed); + } catch { + // Whole-output parse failed (extra text around the JSON); retry on the span from the first '{' to the last '}'. + const start = trimmed.indexOf('{'); + const end = trimmed.lastIndexOf('}'); + if (start >= 0 && end > start) { + try { + return JSON.parse(trimmed.slice(start, end + 1)); + } catch { + return undefined; + } + } + return undefined; + } +} + +function jsonOk(json: unknown): boolean { + return !(json !== null && typeof json === 'object' && (json as { ok?: unknown }).ok === false); +} + +// Invoke the CLI once (argv prefix comes from resolveCliArgv; `--json` is always appended). `args` includes the command + positionals + dash-flags; +// `baseFlags` carries the isolation + device flags shared by every call. 
+export function invokeCli(args: string[], baseFlags: string[]): CliResult { + const full = [...CLI_ARGV, ...args, ...baseFlags, '--json']; + const t0 = performance.now(); + let stdout = ''; + let stderr = ''; + let exitCode = -1; + try { + // allowFailure so non-zero exits are recorded as samples instead of thrown; maxBuffer + // raised because snapshot payloads exceed Node's ~1MB default. + const r = runCmdSync(process.execPath, full, { + cwd: REPO_ROOT, + maxBuffer: MAX_BUFFER, + allowFailure: true, + }); + stdout = r.stdout; + stderr = r.stderr; + exitCode = r.exitCode; + } catch (error) { + // Spawn-level failures (missing executable, timeout) — record as a failed sample. + stderr = error instanceof Error ? error.message : String(error); + } + const wallClockMs = performance.now() - t0; + const json = tryParseJson(stdout); + return { exitCode, wallClockMs, stdout, stderr, json, ok: exitCode === 0 && jsonOk(json) }; +} + +// Wrap a single command in its own `batch` invocation to read per-step durationMs. +export function invokeBatchStep(spec: BatchStepSpec, baseFlags: string[]): CliResult { + const result = invokeCli(['batch', '--steps', JSON.stringify([spec])], baseFlags); + // Defensive: today's stop-only batch surfaces a failed step as a top-level non-zero/ok:false + // (already caught by invokeCli). But if a future on-error mode keeps the batch ok while a step + // fails, don't silently count that step as a success — downgrade ok from the step's own ok. + const stepOk = firstBatchResult(result.json)?.ok; + if (result.ok && stepOk === false) { + return { ...result, ok: false }; + } + return result; +} + +function firstBatchResult(json: unknown): Record<string, unknown> | undefined { + const data = (json as { data?: { results?: unknown[] } } | undefined)?.data; + const first = data?.results?.[0]; + return first && typeof first === 'object' ? 
(first as Record<string, unknown>) : undefined; +} + +export function readBatchStepDurationMs(result: CliResult): number | undefined { + const v = firstBatchResult(result.json)?.durationMs; + return typeof v === 'number' ? v : undefined; +} + +export function readBatchStepError(result: CliResult): { code?: string; message?: string } { + const err = (result.json as { error?: { code?: string; message?: string } } | undefined)?.error; + return { code: err?.code, message: err?.message }; +} + +// Proxy for a11y-tree size: snapshot node count (falls back to distinct @eN refs); undefined when the step has no object data (null included). +export function countElements(result: CliResult): number | undefined { + const stepData = firstBatchResult(result.json)?.data; + if (stepData === undefined || stepData === null || typeof stepData !== 'object') return undefined; + const nodes = (stepData as { nodes?: unknown }).nodes; + if (Array.isArray(nodes)) return nodes.length; + const matches = JSON.stringify(stepData).match(/@e\d+/g); + return matches ? new Set(matches).size : 0; +} diff --git a/scripts/perf/config.ts b/scripts/perf/config.ts new file mode 100644 index 000000000..10c3e8463 --- /dev/null +++ b/scripts/perf/config.ts @@ -0,0 +1,94 @@ +import path from 'node:path'; +import { fileURLToPath } from 'node:url'; +import type { Platform } from './types.ts'; + +const HERE = path.dirname(fileURLToPath(import.meta.url)); +export const REPO_ROOT = path.resolve(HERE, '..', '..'); +const CLI_BIN = path.join(REPO_ROOT, 'bin', 'agent-device.mjs'); +const DEFAULT_OUT_DIR = path.join(HERE, '.results'); + +export type PerfConfig = { + platform: Platform; + rounds: number; // measured rounds (samples per command) + warmup: number; // leading rounds dropped from stats + keepArtifacts: boolean; // keep temp state dir + leave device booted + outDir: string; + udid?: string; // iOS device override (UDID) + device?: string; // device override by name (e.g. "iPhone 17 Pro"); preferred over udid + serial?: string; // Android device override +}; + +// How to invoke the CLI. 
Defaults to the built dist binary (bin/agent-device.mjs). +// Set AGENT_DEVICE_PERF_CLI to run from source instead, e.g. on CI: +// AGENT_DEVICE_PERF_CLI="--experimental-strip-types src/bin.ts" +// (matches the device workflows, which run from source and skip the dist build). +export function resolveCliArgv(): string[] { + const override = process.env.AGENT_DEVICE_PERF_CLI?.trim(); + if (override) return override.split(/\s+/); + return [CLI_BIN]; +} + +export function usesSourceCli(): boolean { + return Boolean(process.env.AGENT_DEVICE_PERF_CLI?.trim()); +} + +function readValue(argv: string[], i: number, flag: string): string { + const v = argv[i + 1]; + if (v === undefined) throw new Error(`Missing value for ${flag}`); + return v; +} + +function readIntValue(argv: string[], i: number, flag: string, min: number): number { + const raw = readValue(argv, i, flag); + const n = raw.trim() === '' ? Number.NaN : Number(raw); + if (!Number.isInteger(n) || n < min) { + throw new Error(`${flag} must be an integer >= ${min} (got ${JSON.stringify(raw)})`); + } + return n; +} + +export function parseConfig(argv: string[]): PerfConfig { + const cfg: PerfConfig = { + platform: 'ios', + rounds: 5, + warmup: 1, + keepArtifacts: false, + outDir: DEFAULT_OUT_DIR, + }; + for (let i = 0; i < argv.length; i++) { + const a = argv[i]; + switch (a) { + case '--platform': { + const v = readValue(argv, i++, a); + if (v !== 'ios' && v !== 'android') throw new Error(`Unknown platform: ${v}`); + cfg.platform = v; + break; + } + case '--n': + case '--rounds': + cfg.rounds = readIntValue(argv, i++, a, 1); + break; + case '--warmup': + cfg.warmup = readIntValue(argv, i++, a, 0); + break; + case '--keep-artifacts': + cfg.keepArtifacts = true; + break; + case '--out-dir': + cfg.outDir = path.resolve(readValue(argv, i++, a)); + break; + case '--udid': + cfg.udid = readValue(argv, i++, a); + break; + case '--device': + cfg.device = readValue(argv, i++, a); + break; + case '--serial': + cfg.serial = readValue(argv, i++, a); + break; 
+ default: + throw new Error(`Unknown flag: ${a}`); + } + } + return cfg; +} diff --git a/scripts/perf/harness.ts b/scripts/perf/harness.ts new file mode 100644 index 000000000..a70253ab3 --- /dev/null +++ b/scripts/perf/harness.ts @@ -0,0 +1,195 @@ +import fs from 'node:fs'; +import os from 'node:os'; +import path from 'node:path'; +import { + countElements, + invokeBatchStep, + invokeCli, + readBatchStepDurationMs, + readBatchStepError, +} from './cli.ts'; +import type { PerfConfig } from './config.ts'; +import { resolveProfile, type ResolvedProfile } from './platform-profiles.ts'; +import { buildSettingsTour, type ScenarioStep } from './scenario.ts'; +import { summarize } from './stats.ts'; +import type { CliResult, Measurement, Sample } from './types.ts'; + +export type IsolationContext = { + stateDir: string; + artifactsDir: string; + baseFlags: string[]; + profile: ResolvedProfile; +}; + +function log(msg: string): void { + process.stderr.write(`[perf] ${msg}\n`); +} + +export function setupIsolation(cfg: PerfConfig): IsolationContext { + const profile = resolveProfile(cfg); + const stateDir = fs.mkdtempSync(path.join(os.tmpdir(), 'agent-device-perf-')); + const artifactsDir = path.join(stateDir, 'artifacts'); + fs.mkdirSync(artifactsDir, { recursive: true }); + const baseFlags = ['--state-dir', stateDir, '--session', 'perf', ...profile.platformFlags]; + log(`state-dir: ${stateDir}`); + log(`device: ${profile.deviceName} (${profile.udid ?? 
profile.serial ?? '?'})`); + return { stateDir, artifactsDir, baseFlags, profile }; +} + +export function teardownIsolation(ctx: IsolationContext, cfg: PerfConfig): void { + log('teardown: closing session'); + try { + const args = ['close']; + if (!cfg.keepArtifacts) args.push('--shutdown'); + invokeCli(args, ctx.baseFlags); + } catch { + /* best-effort */ + } + if (cfg.keepArtifacts) { + log(`keep-artifacts: leaving ${ctx.stateDir} and device in place`); + return; + } + try { + fs.rmSync(ctx.stateDir, { recursive: true, force: true }); + log('teardown: removed temp state dir'); + } catch { + /* best-effort */ + } +} + +function sampleError(r: CliResult): Pick<Sample, 'errorCode' | 'errorMessage'> { + const err = readBatchStepError(r); + return { + errorCode: err.code ?? `exit:${r.exitCode}`, + errorMessage: (err.message ?? r.stderr.trim().split('\n').pop() ?? '').slice(0, 200), + }; +} + +// Base sample (timing + ok + error note on failure) shared by every measured invocation. +function toSample(r: CliResult, round: number): Sample { + const sample: Sample = { round, wallClockMs: r.wallClockMs, ok: r.ok }; + if (!r.ok) Object.assign(sample, sampleError(r)); + return sample; +} + +// The first interaction after open/relaunch pays the iOS XCUITest runner startup (~10s+ cold) +// and a per-relaunch first-AX-query settle cost. Run an untimed throwaway interaction so that +// cost is never attributed to a measured command. +function warmRunner(ctx: IsolationContext): void { + invokeCli(['snapshot', '-i'], ctx.baseFlags); +} + +function runStep(step: ScenarioStep, ctx: IsolationContext, round: number): Sample { + // Untimed reset to root for steps whose precondition is a clean, top-of-list root. + if (step.freshRoot) { + invokeCli(['open', ctx.profile.appTarget, '--relaunch'], ctx.baseFlags); + warmRunner(ctx); + } + const r = + step.execMode === 'standalone' + ? 
invokeCli(step.args, ctx.baseFlags) + : invokeBatchStep(step.step, ctx.baseFlags); + const sample = toSample(r, round); + if (step.execMode === 'batch') { + sample.daemonDurationMs = readBatchStepDurationMs(r); + if (step.isSnapshot) sample.elementCount = countElements(r); + } + return sample; +} + +function buildMeasurement( + step: Pick<ScenarioStep, 'command' | 'label' | 'execMode'>, + platform: ResolvedProfile['platform'], + samples: Sample[], + warmupDropped: number, +): Measurement { + const ok = samples.filter((s) => s.ok); + const failures = samples.length - ok.length; + const notes: string[] = []; + if (failures > 0) { + const codes = [...new Set(samples.filter((s) => !s.ok).map((s) => s.errorCode))].join(', '); + notes.push(`${failures}/${samples.length} samples failed: ${codes}`); + } + const num = (xs: (number | undefined)[]) => xs.filter((n): n is number => typeof n === 'number'); + return { + command: step.command, + label: step.label, + platform, + execMode: step.execMode, + samples, + warmupDropped, + wallClock: summarize(ok.map((s) => s.wallClockMs)), + daemonDuration: summarize(num(ok.map((s) => s.daemonDurationMs))), + elementCount: summarize(num(ok.map((s) => s.elementCount))), + failures, + notes, + }; +} + +// Boot the device once and time it. Runs WITHOUT --session so no session lock policy +// applies and the device selectors are honored (selectors are rejected on locked sessions). +function bootOnce(ctx: IsolationContext): Measurement { + log('booting device (no session lock; sampled once)'); + const bootFlags = ['--state-dir', ctx.stateDir, ...ctx.profile.platformFlags]; + const r = invokeCli(['boot', ...ctx.profile.selectorFlags], bootFlags); + const sample = toSample(r, 0); + return buildMeasurement( + { command: 'boot', label: 'boot device', execMode: 'standalone' }, + ctx.profile.platform, + [sample], + 0, + ); +} + +// Establish the session by opening Settings WITH device selectors (open is the only +// interaction command allowed to carry selectors on a fresh session). 
Locks the session +// to our device so every later call targets it via --session alone. +function establishSession(ctx: IsolationContext): Measurement { + log('establishing session (open with device selectors)'); + const r = invokeCli(['open', ctx.profile.appTarget, ...ctx.profile.selectorFlags], ctx.baseFlags); + const sample = toSample(r, 0); + return buildMeasurement( + { command: 'open', label: 'open (establish + cold)', execMode: 'standalone' }, + ctx.profile.platform, + [sample], + 0, + ); +} + +export function runScenario(ctx: IsolationContext, cfg: PerfConfig): Measurement[] { + const steps = buildSettingsTour(ctx.profile, { artifactsDir: ctx.artifactsDir }); + const acc = new Map<string, Sample[]>(); + for (const step of steps) acc.set(step.label, []); + + const boot = bootOnce(ctx); + const establish = establishSession(ctx); + // Absorb the one-time runner startup before any round so it isn't charged to a measurement. + warmRunner(ctx); + + // Android accessibility dumps time out while the UI is animating; disable animations + // up front (untimed) so snapshot/get/is/fill can read an idle hierarchy. + if (ctx.profile.platform === 'android') { + log('disabling animations (android)'); + invokeCli(['settings', 'animations', 'off'], ctx.baseFlags); + } + + const totalRounds = cfg.warmup + cfg.rounds; + for (let round = 0; round < totalRounds; round++) { + const measured = round >= cfg.warmup; + log(`round ${round + 1}/${totalRounds}${measured ? '' : ' (warmup, dropped)'}`); + for (const step of steps) { + const sample = runStep(step, ctx, round); + if (measured) acc.get(step.label)!.push(sample); + // After the round's reset-open relaunch, warm the runner (untimed) so the first measured + // read (snapshot -i) doesn't pay the post-relaunch first-AX-query cost. 
+ if (step.command === 'open' && step.execMode === 'standalone') { + warmRunner(ctx); + } + } + } + + const tourMeasurements = steps.map((step) => + buildMeasurement(step, ctx.profile.platform, acc.get(step.label)!, cfg.warmup), + ); + return [boot, establish, ...tourMeasurements]; +} diff --git a/scripts/perf/platform-profiles.ts b/scripts/perf/platform-profiles.ts new file mode 100644 index 000000000..3d02ff87d --- /dev/null +++ b/scripts/perf/platform-profiles.ts @@ -0,0 +1,77 @@ +import type { PerfConfig } from './config.ts'; +import type { Platform } from './types.ts'; + +// Local-convenience defaults for ad-hoc runs; CI always overrides them (--device / --serial). +// The iOS UDID is a specific local "iPhone 17" sim; the Android serial is a dedicated emulator +// port. Pass --udid/--device/--serial to target your own device. +const DEFAULT_IOS_UDID = 'D74E0B66-57EB-4EC1-92DC-DA0A30581FE7'; +const DEFAULT_ANDROID_SERIAL = 'emulator-5556'; + +export type ProfileSelectors = { + // A row on the Settings root that pushes a large sub-screen (big a11y tree). + deepScreen: string; + // The Settings search field (for press/focus; auto-picks a match). + searchField: string; + // A selector that uniquely targets the EDITABLE search field (for fill). + searchFieldEditable: string; + // iOS exposes an editable search field at the Settings root (fill works without focusing + // first; focusing then filling can hang). Android only reveals the editable after tapping + // the search card, so it must press the search entry before fill/type. + searchEditableAtRoot: boolean; + // A label reliably visible on the Settings root, for get/is (selector form). + anchorLabel: string; + // Plain text of the anchor, for wait text / find (not a selector). 
+ anchorText: string; +}; + +export type ResolvedProfile = { + platform: Platform; + deviceName: string; + udid?: string; + serial?: string; + platformFlags: string[]; // --platform; applied to every call (only conflicts if it mismatches a locked session) + selectorFlags: string[]; // device selectors — ONLY on the session-establishing open / sessionless boot + appTarget: string; // `open` target for Settings + selectors: ProfileSelectors; +}; + +export function resolveProfile(cfg: PerfConfig): ResolvedProfile { + if (cfg.platform === 'ios') { + // Prefer targeting by device name (CI boots a named simulator); fall back to a UDID. + const useName = cfg.device !== undefined; + const udid = useName ? undefined : (cfg.udid ?? DEFAULT_IOS_UDID); + return { + platform: 'ios', + deviceName: cfg.device ?? 'iPhone 17', + udid, + platformFlags: ['--platform', 'ios'], + selectorFlags: useName ? ['--device', cfg.device!] : ['--udid', udid!], + appTarget: 'settings', + selectors: { + deepScreen: 'label="General"', + searchField: 'label="Search"', + searchFieldEditable: 'label="Search" editable', + searchEditableAtRoot: true, + anchorLabel: 'label="General"', + anchorText: 'General', + }, + }; + } + const serial = cfg.serial ?? DEFAULT_ANDROID_SERIAL; + return { + platform: 'android', + deviceName: cfg.serial ? 
`android (${serial})` : 'Pixel_9_Pro_XL_API_37', + serial, + platformFlags: ['--platform', 'android'], + selectorFlags: ['--serial', serial, '--android-device-allowlist', serial], + appTarget: 'com.android.settings', + selectors: { + deepScreen: 'text="Network & internet"', + searchField: 'text="Search Settings"', + searchFieldEditable: 'editable', + searchEditableAtRoot: false, + anchorLabel: 'label="Network & internet"', + anchorText: 'Network & internet', + }, + }; +} diff --git a/scripts/perf/report.ts b/scripts/perf/report.ts new file mode 100644 index 000000000..3d863a5ff --- /dev/null +++ b/scripts/perf/report.ts @@ -0,0 +1,67 @@ +import fs from 'node:fs'; +import path from 'node:path'; +import type { Measurement, RunResult, Stat } from './types.ts'; + +function ms(n: number | undefined): string { + return typeof n === 'number' && Number.isFinite(n) ? n.toFixed(0) : '–'; +} + +function wallCells(s: Stat | null): string { + if (!s) return '– | – | – | –'; + return `${ms(s.min)} | ${ms(s.median)} | ${ms(s.p95)} | ${ms(s.max)}`; +} + +function stampName(platform: string, startedAt: string): string { + return `perf-${platform}-${startedAt.replace(/[:.]/g, '-')}`; +} + +function measurementRow(m: Measurement): string { + const daemon = m.daemonDuration ? ms(m.daemonDuration.median) : '–'; + const elements = m.elementCount ? ms(m.elementCount.median) : '–'; + const n = m.wallClock?.n ?? 0; + return `| ${m.label} | ${m.command} | ${m.execMode} | ${n} | ${wallCells(m.wallClock)} | ${daemon} | ${elements} | ${m.notes.join('; ')} |`; +} + +function toMarkdown(run: RunResult): string { + const lines: string[] = []; + lines.push(`# agent-device command perf — ${run.platform}`); + lines.push(''); + lines.push(`- **Device**: ${run.device.name} (${run.device.udid ?? run.device.serial ?? 
'?'})`); + lines.push(`- **agent-device**: ${run.agentDeviceVersion}`); + lines.push(`- **Rounds**: ${run.config.rounds} (warmup ${run.config.warmup} dropped)`); + lines.push(`- **Started**: ${run.startedAt}`); + lines.push(`- **Finished**: ${run.finishedAt}`); + lines.push(''); + lines.push('All times in milliseconds. `wall-clock` includes process spawn + socket overhead;'); + lines.push('`daemon` is the batch step round-trip (spawn overhead ≈ wall-median − daemon-median).'); + lines.push('`elements` = node count in the snapshot payload (tree-size proxy).'); + lines.push('An untimed warmup interaction runs after each open/relaunch, so measured commands'); + lines.push('do not pay the one-time iOS-runner startup or post-relaunch first-AX-query cost.'); + lines.push(''); + lines.push('| command | cli | mode | n | wall min | wall median | wall p95 | wall max | daemon median | elements | notes |'); + lines.push('|---|---|---|---|---|---|---|---|---|---|---|'); + for (const m of run.measurements) lines.push(measurementRow(m)); + lines.push(''); + + const failed = run.measurements.filter((m) => m.failures > 0); + if (failed.length > 0) { + lines.push('## Failures'); + lines.push(''); + for (const m of failed) { + const sample = m.samples.find((s) => !s.ok); + lines.push(`- **${m.label}** — ${m.notes.join('; ')}${sample?.errorMessage ? 
` — ${sample.errorMessage}` : ''}`); + } + lines.push(''); + } + return lines.join('\n'); +} + +export function writeReports(run: RunResult, outDir: string): { jsonPath: string; mdPath: string } { + fs.mkdirSync(outDir, { recursive: true }); + const base = stampName(run.platform, run.startedAt); + const jsonPath = path.join(outDir, `${base}.json`); + const mdPath = path.join(outDir, `${base}.md`); + fs.writeFileSync(jsonPath, JSON.stringify(run, null, 2)); + fs.writeFileSync(mdPath, toMarkdown(run)); + return { jsonPath, mdPath }; +} diff --git a/scripts/perf/run.ts b/scripts/perf/run.ts new file mode 100644 index 000000000..9a0010536 --- /dev/null +++ b/scripts/perf/run.ts @@ -0,0 +1,67 @@ +import fs from 'node:fs'; +import path from 'node:path'; +import { parseConfig, REPO_ROOT, usesSourceCli } from './config.ts'; +import { runScenario, setupIsolation, teardownIsolation, type IsolationContext } from './harness.ts'; +import { writeReports } from './report.ts'; +import type { RunResult } from './types.ts'; + +function readVersion(): string { + try { + const pkg = JSON.parse(fs.readFileSync(path.join(REPO_ROOT, 'package.json'), 'utf8')); + return typeof pkg.version === 'string' ? pkg.version : 'unknown'; + } catch { + return 'unknown'; + } +} + +function main(): void { + const cfg = parseConfig(process.argv.slice(2)); + // The dist binary needs a build; running from source (AGENT_DEVICE_PERF_CLI) does not. 
+ if (!usesSourceCli() && !fs.existsSync(path.join(REPO_ROOT, 'dist', 'src'))) { + process.stderr.write('[perf] dist/ is missing — run `pnpm build` first.\n'); + process.exit(1); + } + + const startedAt = new Date().toISOString(); + let ctx: IsolationContext | null = null; + let exitCode = 0; + + const cleanup = (): void => { + if (ctx) { + teardownIsolation(ctx, cfg); + ctx = null; + } + }; + process.on('SIGINT', () => { + cleanup(); + process.exit(130); + }); + process.on('SIGTERM', () => { + cleanup(); + process.exit(143); + }); + + try { + ctx = setupIsolation(cfg); + const measurements = runScenario(ctx, cfg); + const run: RunResult = { + startedAt, + finishedAt: new Date().toISOString(), + platform: cfg.platform, + device: { udid: ctx.profile.udid, serial: ctx.profile.serial, name: ctx.profile.deviceName }, + config: { rounds: cfg.rounds, warmup: cfg.warmup, keepArtifacts: cfg.keepArtifacts }, + agentDeviceVersion: readVersion(), + measurements, + }; + const { jsonPath, mdPath } = writeReports(run, cfg.outDir); + process.stderr.write(`\n[perf] report: ${mdPath}\n[perf] json: ${jsonPath}\n`); + } catch (e) { + process.stderr.write(`[perf] error: ${e instanceof Error ? (e.stack ?? e.message) : String(e)}\n`); + exitCode = 1; + } finally { + cleanup(); + } + process.exit(exitCode); +} + +main(); diff --git a/scripts/perf/scenario.ts b/scripts/perf/scenario.ts new file mode 100644 index 000000000..8c63ac08b --- /dev/null +++ b/scripts/perf/scenario.ts @@ -0,0 +1,102 @@ +import path from 'node:path'; +import type { ResolvedProfile } from './platform-profiles.ts'; + +// A legacy-form batch step: maps through the exact documented CLI grammar. +// `flags` uses internal CliFlags field names (e.g. snapshotInteractiveOnly). 
+export type BatchStepSpec = { + command: string; + positionals?: string[]; + flags?: Record<string, unknown>; +}; + +type ScenarioStepBase = { + label: string; + command: string; + // When set, the harness runs an untimed `open --relaunch` (reset to root, top of list) + // before timing this step. Used for steps whose precondition is a clean root, since + // earlier commands (find/is, search) leave the list scrolled or in a different surface. + freshRoot?: boolean; +}; + +// Discriminated on execMode so the invoker gets the right payload without `!`/`?? []`: +// standalone carries full CLI args; batch carries one legacy batch step. +export type ScenarioStep = + | (ScenarioStepBase & { execMode: 'standalone'; args: string[] }) + | (ScenarioStepBase & { execMode: 'batch'; step: BatchStepSpec; isSnapshot?: boolean }); + +export type StepContext = { artifactsDir: string }; + +function std(label: string, command: string, args: string[]): ScenarioStep { + return { label, command, execMode: 'standalone', args }; +} + +function bat( + label: string, + command: string, + step: BatchStepSpec, + opts: { isSnapshot?: boolean; freshRoot?: boolean } = {}, +): ScenarioStep { + return { label, command, execMode: 'batch' as const, step, ...opts }; +} + +// One ordered pass over Settings. The harness repeats this N (+warmup) times; +// the leading `open --relaunch` resets the app to its root each round, so every +// round starts from a known state while commands run in their natural order. +export function buildSettingsTour(p: ResolvedProfile, ctx: StepContext): ScenarioStep[] { + const s = p.selectors; + const shot = path.join(ctx.artifactsDir, 'shot.png'); + const rec = path.join(ctx.artifactsDir, 'rec.mp4'); + const trace = path.join(ctx.artifactsDir, 'trace.log'); + + // Text entry differs per platform: iOS fills the root search field directly (focusing it + // first can hang); Android must open the search screen before an editable field exists. 
+ const textEntry: ScenarioStep[] = p.selectors.searchEditableAtRoot + ? [ + // iOS: editable search field exists at root; fill it directly (freshRoot resets scroll). + bat('fill search', 'fill', { command: 'fill', positionals: [s.searchFieldEditable, 'general'] }, { freshRoot: true }), + bat('type', 'type', { command: 'type', positionals: ['wifi'] }), + ] + : [ + // Android: tap the search entry first to reveal the editable, then type/fill it. + bat('press search field', 'press', { command: 'press', positionals: [s.searchField] }, { freshRoot: true }), + bat('type', 'type', { command: 'type', positionals: ['wifi'] }), + bat('fill search', 'fill', { command: 'fill', positionals: [s.searchFieldEditable, 'general'] }), + ]; + + return [ + // --- reset to root via relaunch --- + std('open (relaunch → root)', 'open', ['open', p.appTarget, '--relaunch']), + + // --- reads on the root tree (snapshots first; anchor label is visible here) --- + bat('snapshot -i (root)', 'snapshot', { command: 'snapshot', flags: { snapshotInteractiveOnly: true } }, { isSnapshot: true }), + bat('snapshot (root)', 'snapshot', { command: 'snapshot' }, { isSnapshot: true }), + + // --- navigate into a sub-screen from a fresh root (freshRoot resets scroll so the + // deep-screen row is in view), read it, then return --- + bat('press → deep screen', 'press', { command: 'press', positionals: [s.deepScreen] }, { freshRoot: true }), + bat('snapshot (deep)', 'snapshot', { command: 'snapshot' }, { isSnapshot: true }), + bat('snapshot -i (deep)', 'snapshot', { command: 'snapshot', flags: { snapshotInteractiveOnly: true } }, { isSnapshot: true }), + bat('back', 'back', { command: 'back' }), + + // --- targeted reads against the visible anchor (freshRoot so the anchor is on screen) --- + bat('wait text', 'wait', { command: 'wait', positionals: ['text', s.anchorText, '3000'] }, { freshRoot: true }), + bat('find', 'find', { command: 'find', positionals: [s.anchorText] }), + bat('get text', 'get', { 
command: 'get', positionals: ['text', s.anchorLabel] }), + bat('is visible', 'is', { command: 'is', positionals: ['visible', s.anchorLabel] }), + + // --- text entry (platform-specific order; see textEntry above) then scroll results --- + ...textEntry, + bat('scroll down', 'scroll', { command: 'scroll', positionals: ['down'] }), + + // --- artifact-producing commands; record brackets the rest so the clip has >1s of + // footage (an instant start→stop makes simctl recordVideo fail to finalize) --- + std('record start', 'record', ['record', 'start', rec, '--hide-touches']), + bat('screenshot', 'screenshot', { command: 'screenshot', positionals: [shot] }), + bat('logs mark', 'logs', { command: 'logs', positionals: ['mark', 'perf-mark'] }), + bat('logs clear', 'logs', { command: 'logs', positionals: ['clear'] }), + std('trace start', 'trace', ['trace', 'start', trace]), + std('trace stop', 'trace', ['trace', 'stop']), + bat('perf', 'perf', { command: 'perf' }), + std('record stop', 'record', ['record', 'stop']), + ]; +} diff --git a/scripts/perf/stats.ts b/scripts/perf/stats.ts new file mode 100644 index 000000000..5a41ac4ea --- /dev/null +++ b/scripts/perf/stats.ts @@ -0,0 +1,22 @@ +import type { Stat } from './types.ts'; + +// Nearest-rank percentile; `sorted` must already be ascending (summarize sorts a copy), NaN when empty. 
/**
 * Nearest-rank percentile of an ascending-sorted array. The array is read in place, not
 * copied or re-sorted, so the caller must pass values that are already sorted.
 *
 * @param sorted Values in ascending order.
 * @param p Percentile in the 0–100 range; values outside it resolve to the first/last element.
 * @returns The selected element, or `NaN` when `sorted` is empty or `p` is `NaN`.
 */
function percentile(sorted: readonly number[], p: number): number {
  // A NaN percentile would otherwise propagate into the index and read `undefined`.
  if (sorted.length === 0 || Number.isNaN(p)) return Number.NaN;
  const rank = Math.ceil((p / 100) * sorted.length);
  // Nearest-rank is 1-based; clamp so p=0 maps to the first element and p>=100 to the last.
  const idx = Math.min(sorted.length - 1, Math.max(0, rank - 1));
  return sorted[idx] ?? Number.NaN;
}

/**
 * Summarizes the finite values of `values` as count, min, median, p95 and max.
 * Non-finite entries (`NaN`, `±Infinity`) are ignored and the input array is not mutated.
 *
 * @returns The summary, or `null` when no finite value remains.
 */
export function summarize(values: readonly number[]): Stat | null {
  // `filter` yields a fresh array, so the in-place sort never touches the caller's data.
  const sorted = values.filter((v) => Number.isFinite(v)).sort((a, b) => a - b);
  const min = sorted[0];
  const max = sorted[sorted.length - 1];
  if (min === undefined || max === undefined) return null;
  return {
    n: sorted.length,
    min,
    median: percentile(sorted, 50),
    p95: percentile(sorted, 95),
    max,
  };
}

// Shared data shapes for the e2e perf benchmark harness.

export type Platform = 'ios' | 'android';

export type ExecMode = 'batch' | 'standalone';

/** Outcome of one CLI invocation as observed by the harness. */
export type CliResult = {
  exitCode: number;
  wallClockMs: number; // measured by the harness around the child process
  stdout: string;
  stderr: string;
  json: unknown; // parsed --json payload (or undefined when not parseable)
  ok: boolean; // exit 0 AND (json.ok !== false)
};

/** One timed execution of a step within a round. */
export type Sample = {
  round: number;
  wallClockMs: number;
  daemonDurationMs?: number; // from batch results[0].durationMs (batch mode only)
  elementCount?: number; // for snapshot rows: parsed @eN count, a tree-size proxy
  ok: boolean;
  errorCode?: string;
  errorMessage?: string;
};

/** Summary statistics over `n` values, as produced by `summarize`. */
export type Stat = { n: number; min: number; median: number; p95: number; max: number };

/** Aggregated samples and statistics for one labelled command. */
export type Measurement = {
  command: string;
  label: string;
  platform: Platform;
  execMode: ExecMode;
  samples: Sample[]; // kept samples only (warmup rounds dropped)
  warmupDropped: number;
  wallClock: Stat | null;
  daemonDuration: Stat | null; // null for standalone or when no ok samples
  elementCount: Stat | null; // null unless snapshot row
  failures: number;
  notes: string[];
};

/** Complete record of one benchmark run. */
export type RunResult = {
  startedAt: string;
  finishedAt: string;
  platform: Platform;
  device: { udid?: string; serial?: string; name: string };
  config: { rounds: number; warmup: number; keepArtifacts: boolean };
  agentDeviceVersion: string;
  measurements: Measurement[];
};
/** The only part of the abort handle these helpers read: whether the command was aborted. */
type CommandAbort = { readonly didAbort: boolean };

/**
 * Chooses the error to reject with when a spawned child emits `error`.
 *
 * @returns A command-canceled error if the command was aborted, otherwise a spawn error
 *   built from `err`.
 */
function spawnRejectionError(
  abort: CommandAbort,
  executable: string,
  cmd: string,
  args: string[],
  err: Error,
): AppError {
  if (abort.didAbort) {
    return createCommandCanceledError(executable, cmd, args);
  }
  return createSpawnError(executable, cmd, args, err);
}

/**
 * Decides whether a spawned child's `close` event is a failure.
 *
 * @returns A command-canceled error if the command was aborted; an exit error when the exit
 *   code is non-zero and `allowFailure` is not set; otherwise `null`, meaning the command
 *   resolves successfully.
 */
function commandCloseFailure(
  abort: CommandAbort,
  executable: string,
  cmd: string,
  args: string[],
  exitCode: number,
  allowFailure: boolean | undefined,
  stdout: string,
  stderr: string,
): AppError | null {
  // Cancellation wins over any exit status.
  if (abort.didAbort) {
    return createCommandCanceledError(executable, cmd, args);
  }
  const exitedCleanly = exitCode === 0;
  if (exitedCleanly || allowFailure) {
    return null;
  }
  return createExitError(executable, cmd, args, exitCode, stdout, stderr);
}