Skip to content

Commit 0f79a24

Browse files
authored
feat: benchmark tools (#280)
1 parent c9d05c9 commit 0f79a24

24 files changed

+1700
-17
lines changed

bin.ts

Lines changed: 13 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,7 @@ if (!satisfies(process.version, NODE_VERSION_RANGE)) {
1818
}
1919

2020
import { runMCPInstall, runMCPRemove } from './src/mcp';
21-
import type { CloudRegion, WizardOptions } from './src/utils/types';
21+
import type { CloudRegion } from './src/utils/types';
2222
import { runWizard } from './src/run';
2323
import { isNonInteractiveEnvironment } from './src/utils/environment';
2424
import clack from './src/utils/clack';
@@ -79,6 +79,11 @@ yargs(hideBin(process.argv))
7979
'PostHog personal API key (phx_xxx) for authentication\nenv: POSTHOG_WIZARD_API_KEY',
8080
type: 'string',
8181
},
82+
'project-id': {
83+
describe:
84+
'PostHog project ID to use (optional; when not set, uses default from API key or OAuth)\nenv: POSTHOG_WIZARD_PROJECT_ID',
85+
type: 'string',
86+
},
8287
})
8388
.command(
8489
['$0'],
@@ -115,6 +120,12 @@ yargs(hideBin(process.argv))
115120
'Show menu for manual integration selection instead of auto-detecting\nenv: POSTHOG_WIZARD_MENU',
116121
type: 'boolean',
117122
},
123+
benchmark: {
124+
default: false,
125+
describe:
126+
'Run in benchmark mode with per-phase token tracking\nenv: POSTHOG_WIZARD_BENCHMARK',
127+
type: 'boolean',
128+
},
118129
});
119130
},
120131
(argv) => {
@@ -155,7 +166,7 @@ yargs(hideBin(process.argv))
155166
process.exit(1);
156167
}
157168

158-
void runWizard(options as unknown as WizardOptions);
169+
void runWizard(options as unknown as Parameters<typeof runWizard>[0]);
159170
},
160171
)
161172
.command('mcp <command>', 'MCP server management commands', (yargs) => {

src/lib/__tests__/agent-interface.test.ts

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -32,6 +32,7 @@ describe('runAgent', () => {
3232
localMcp: false,
3333
ci: false,
3434
menu: false,
35+
benchmark: false,
3536
};
3637

3738
const defaultAgentConfig = {

src/lib/agent-interface.ts

Lines changed: 21 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@
55

66
import path from 'path';
77
import clack from '../utils/clack';
8-
import { debug, logToFile, initLogFile, LOG_FILE_PATH } from '../utils/debug';
8+
import { debug, logToFile, initLogFile, getLogFilePath } from '../utils/debug';
99
import type { WizardOptions } from '../utils/types';
1010
import { analytics } from '../utils/analytics';
1111
import {
@@ -50,6 +50,8 @@ export const AgentSignals = {
5050
ERROR_RESOURCE_MISSING: '[ERROR-RESOURCE-MISSING]',
5151
/** Signal emitted when the agent provides a remark about its run */
5252
WIZARD_REMARK: '[WIZARD-REMARK]',
53+
/** Signal prefix for benchmark logging */
54+
BENCHMARK: '[BENCHMARK]',
5355
} as const;
5456

5557
export type AgentSignal = (typeof AgentSignals)[keyof typeof AgentSignals];
@@ -393,7 +395,7 @@ export async function initializeAgent(
393395
});
394396
}
395397

396-
clack.log.step(`Verbose logs: ${LOG_FILE_PATH}`);
398+
clack.log.step(`Verbose logs: ${getLogFilePath()}`);
397399
clack.log.success("Agent initialized. Let's get cooking!");
398400
return agentRunConfig;
399401
} catch (error) {
@@ -421,6 +423,10 @@ export async function runAgent(
421423
successMessage?: string;
422424
errorMessage?: string;
423425
},
426+
middleware?: {
427+
onMessage(message: any): void;
428+
finalize(resultMessage: any, totalDurationMs: number): any;
429+
},
424430
): Promise<{ error?: AgentErrorType; message?: string }> {
425431
const {
426432
estimatedDurationMinutes = 8,
@@ -446,6 +452,7 @@ export async function runAgent(
446452
const collectedText: string[] = [];
447453
// Track if we received a successful result (before any cleanup errors)
448454
let receivedSuccessResult = false;
455+
let lastResultMessage: any = null;
449456

450457
// Workaround for SDK bug: stdin closes before canUseTool responses can be sent.
451458
// The fix is to use an async generator for the prompt that stays open until
@@ -505,6 +512,11 @@ export async function runAgent(
505512
duration_ms: durationMs,
506513
duration_seconds: durationSeconds,
507514
});
515+
try {
516+
middleware?.finalize(lastResultMessage, durationMs);
517+
} catch (e) {
518+
logToFile(`${AgentSignals.BENCHMARK} Middleware finalize error:`, e);
519+
}
508520
spinner.stop(successMessage);
509521
return {};
510522
};
@@ -604,12 +616,19 @@ export async function runAgent(
604616
receivedSuccessResult,
605617
);
606618

619+
try {
620+
middleware?.onMessage(message);
621+
} catch (e) {
622+
logToFile(`${AgentSignals.BENCHMARK} Middleware onMessage error:`, e);
623+
}
624+
607625
// Signal completion when result received
608626
if (message.type === 'result') {
609627
// Track successful results before any potential cleanup errors
610628
// The SDK may emit a second error result during cleanup due to a race condition
611629
if (message.subtype === 'success' && !message.is_error) {
612630
receivedSuccessResult = true;
631+
lastResultMessage = message;
613632
}
614633
signalDone!();
615634
}

src/lib/agent-runner.ts

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -35,6 +35,7 @@ import {
3535
} from '../steps';
3636
import { checkAnthropicStatusWithPrompt } from '../utils/anthropic-status';
3737
import { enableDebugLogs } from '../utils/debug';
38+
import { createBenchmarkPipeline } from './middleware/benchmark';
3839

3940
/**
4041
* Universal agent-powered wizard runner.
@@ -159,7 +160,6 @@ export async function runAgentWizard(
159160
analytics.setTag(key, value);
160161
});
161162

162-
// Build integration prompt
163163
const integrationPrompt = buildIntegrationPrompt(
164164
config,
165165
{
@@ -197,6 +197,10 @@ export async function runAgentWizard(
197197
options,
198198
);
199199

200+
const middleware = options.benchmark
201+
? createBenchmarkPipeline(spinner, options)
202+
: undefined;
203+
200204
const agentResult = await runAgent(
201205
agent,
202206
integrationPrompt,
@@ -208,6 +212,7 @@ export async function runAgentWizard(
208212
successMessage: config.ui.successMessage,
209213
errorMessage: 'Integration failed',
210214
},
215+
middleware,
211216
);
212217

213218
// Handle error cases detected in agent output
@@ -355,7 +360,6 @@ ${chalk.dim(`How did this work for you? Drop us a line: wizard@posthog.com`)}`;
355360

356361
/**
357362
* Build the integration prompt for the agent.
358-
* Uses shared base prompt with optional framework-specific addendum.
359363
*/
360364
function buildIntegrationPrompt(
361365
config: FrameworkConfig,

src/lib/middleware/benchmark.ts

Lines changed: 105 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,105 @@
1+
/**
2+
* Benchmark tracking for wizard runs.
3+
*
4+
* Usage:
5+
* const pipeline = createBenchmarkPipeline(spinner, options);
6+
* pipeline.onMessage(message);
7+
* pipeline.finalize(resultMessage, durationMs);
8+
*/
9+
10+
import chalk from 'chalk';
11+
import clack from '../../utils/clack';
12+
import { logToFile, getLogFilePath, configureLogFile } from '../../utils/debug';
13+
import { MiddlewarePipeline } from './pipeline';
14+
import { PhaseDetector } from './phase-detector';
15+
import { loadBenchmarkConfig } from './config';
16+
import { createPluginsFromConfig } from './benchmarks';
17+
import type { BenchmarkConfig } from './config';
18+
import type { WizardOptions } from '../../utils/types';
19+
import { AgentSignals } from '../agent-interface';
20+
21+
// ── Types ──────────────────────────────────────────────────────────────
22+
23+
// Record for a single named benchmark step: token usage counters, per-model
// usage, cost in USD, wall/API durations in milliseconds and turn count.
// Fields marked `?` (context token counts, compaction stats, and the
// `cache_creation` breakdown) may be absent.
// NOTE(review): the snake_case `usage` keys presumably mirror the agent SDK's
// usage payload — confirm against the plugin that fills this in.
export interface StepUsage {
24+
name: string;
25+
usage: {
26+
input_tokens: number;
27+
output_tokens: number;
28+
cache_creation_input_tokens: number;
29+
cache_read_input_tokens: number;
30+
cache_creation?: {
31+
ephemeral_5m_input_tokens: number;
32+
ephemeral_1h_input_tokens: number;
33+
};
34+
};
35+
modelUsage: Record<string, unknown>;
36+
totalCostUsd: number;
37+
durationMs: number;
38+
durationApiMs: number;
39+
numTurns: number;
40+
contextTokensIn?: number;
41+
contextTokensOut?: number;
42+
compactions?: number;
43+
compactionPreTokens?: number[];
44+
}
45+
46+
// Top-level benchmark record: a timestamp string, the list of per-step
// `StepUsage` entries, and a `totals` object of aggregate counters.
// NOTE(review): presumably this is the shape written to the benchmark output
// file, and `totals` is the sum over `steps` — neither is shown here; confirm.
export interface BenchmarkData {
47+
timestamp: string;
48+
steps: StepUsage[];
49+
totals: {
50+
totalCostUsd: number;
51+
durationMs: number;
52+
inputTokens: number;
53+
outputTokens: number;
54+
numTurns: number;
55+
totalCompactions: number;
56+
totalCacheReadTokens: number;
57+
totalCacheCreation5mTokens: number;
58+
totalCacheCreation1hTokens: number;
59+
};
60+
}
61+
62+
// ── Factory ────────────────────────────────────────────────────────────
63+
64+
/**
65+
* Create a middleware pipeline configured for benchmarking.
66+
* Uses `configOverride` when it is provided; otherwise the config comes from
* `loadBenchmarkConfig(options.installDir)` (per the original note: loads
* .benchmark-config.json from the install dir, falls back to defaults).
*
* Side effects, in order:
*  - reconfigures the log file via `configureLogFile` using
*    `config.output.logPath` / `config.output.logEnabled`;
*  - unless `config.output.suppressWizardLogs` is set, prints the log file
*    path and the benchmark output path through `clack.log.info`;
*  - writes a "[BENCHMARK] Tracking enabled" line via `logToFile`.
*
* @param spinner - clack spinner instance, handed to the plugins.
* @param options - wizard options; only `installDir` is read here.
* @param configOverride - optional config that bypasses loading from disk.
* @returns a `MiddlewarePipeline` built from the configured plugins, with a
*   fresh `PhaseDetector` and `autoDetectPhases` enabled.
67+
*/
68+
export function createBenchmarkPipeline(
69+
spinner: ReturnType<typeof clack.spinner>,
70+
options: WizardOptions,
71+
configOverride?: BenchmarkConfig,
72+
): MiddlewarePipeline {
73+
// An explicit override wins; `??` only falls through on null/undefined.
const config = configOverride ?? loadBenchmarkConfig(options.installDir);
74+
75+
// Must run before getLogFilePath() below so the printed path reflects the
// benchmark config.
configureLogFile({
76+
path: config.output.logPath,
77+
enabled: config.output.logEnabled,
78+
});
79+
80+
// NOTE(review): the meaning of `phased: false` is defined by
// createPluginsFromConfig, which is not visible here — confirm.
const plugins = createPluginsFromConfig(config, {
81+
spinner,
82+
phased: false,
83+
outputPath: config.output.benchmarkPath,
84+
});
85+
86+
// Console output is optional; the file log entry further down is not gated
// by this flag.
if (!config.output.suppressWizardLogs) {
87+
clack.log.info(
88+
`${chalk.cyan(AgentSignals.BENCHMARK)} Verbose logs: ${getLogFilePath()}`,
89+
);
90+
clack.log.info(
91+
`${chalk.cyan(
92+
AgentSignals.BENCHMARK,
93+
)} Benchmark data will be written to: ${config.output.benchmarkPath}`,
94+
);
95+
}
96+
97+
logToFile(
98+
`${AgentSignals.BENCHMARK} Tracking enabled, starting with setup phase`,
99+
);
100+
101+
return new MiddlewarePipeline(plugins, {
102+
phaseDetector: new PhaseDetector(),
103+
autoDetectPhases: true,
104+
});
105+
}

0 commit comments

Comments
 (0)