Commit eda3abf

Port Bedrock improvements from 2025.08 (#8931)
Addresses #8913. Brings forward these improvements:

- configurable max output tokens per provider (applies to all providers)
- ability to toggle followup generation on/off (applies to all providers)
- lower max output tokens for Bedrock Claude models
- cache breakpoints for Bedrock Claude

I've also made two additional logging improvements not included in 2025.08:

- all Anthropic usage stats logged in one entry after each request finishes (suggestion from @wch)
- we now log at the `[info]` level more often (at least twice every time you ask Assistant something). Formerly, it was very possible to see literally nothing in the Assistant logs at the default `[info]` level, which was disconcerting.

I would like to refactor the Bedrock bits so there are fewer model-specific switches in the general Vercel handling. That can be tracked in #8776.

### QA Notes

- Be sure to set the log level to Trace to see all the new logging.
- Turning on followup generation requires an IDE restart.
- If you want to test these changes in Desktop, you'll need the following guide: https://connect.posit.it/positron-wiki/positron-assistant.html#aws-bedrock-in-positron-desktop
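
As a rough illustration of the two new settings (the model ID substrings and token values below are only examples, not shipped defaults), a user could add something like this to `settings.json`:

```json
{
  "positron.assistant.maxOutputTokens": {
    "claude-sonnet-4": 4096,
    "claude-3-7-sonnet": 2048
  },
  "positron.assistant.followups.enable": false
}
```

Each key is matched as a substring of the model ID, so one entry can cover several variants of the same model; values below the schema minimum of 512 are raised to 512 when the provider is constructed.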
1 parent 3f5a43f commit eda3abf

File tree

8 files changed: +291, -532 lines


extensions/positron-assistant/package-lock.json

Lines changed: 61 additions & 485 deletions
Some generated files are not rendered by default.

extensions/positron-assistant/package.json

Lines changed: 15 additions & 1 deletion
@@ -202,6 +202,20 @@
       "type": "boolean",
       "default": false,
       "description": "%configuration.showTokenUsage.description%"
+    },
+    "positron.assistant.maxOutputTokens": {
+      "type": "object",
+      "default": {},
+      "description": "%configuration.maxOutputTokens.description%",
+      "additionalProperties": {
+        "type": "number",
+        "minimum": 512
+      }
+    },
+    "positron.assistant.followups.enable": {
+      "type": "boolean",
+      "default": true,
+      "description": "%configuration.followups.enable.description%"
     }
   }
 }
@@ -504,7 +518,7 @@
     "postinstall": "ts-node scripts/post-install.ts"
   },
   "devDependencies": {
-    "@ai-sdk/amazon-bedrock": "^1.1.6",
+    "@ai-sdk/amazon-bedrock": "2.2.12",
     "@ai-sdk/anthropic": "^1.0.5",
     "@ai-sdk/azure": "^1.1.9",
     "@ai-sdk/google": "^1.1.17",

extensions/positron-assistant/package.nls.json

Lines changed: 3 additions & 1 deletion
@@ -23,5 +23,7 @@
   "configuration.inlineCompletionExcludes.description": "A list of [glob patterns](https://aka.ms/vscode-glob-patterns) to exclude from inline completions.",
   "configuration.gitIntegration.description": "Enable Positron Assistant git integration.",
   "configuration.getTableSummary.description": "Enable Positron Assistant get table summary tool.",
-  "configuration.showTokenUsage.description": "Show token usage in the chat view for supported providers for the session and each message and response including prompts. Check with your provider for detailed usage."
+  "configuration.showTokenUsage.description": "Show token usage in the chat view for supported providers for the session and each message and response including prompts. Check with your provider for detailed usage.",
+  "configuration.maxOutputTokens.description": "Override the maximum output tokens for specific language models. The key selects a model ID (partial match supported); the value is the maximum output tokens for that model.",
+  "configuration.followups.enable.description": "Enable suggested followup prompts after chat responses."
 }

extensions/positron-assistant/src/anthropic.ts

Lines changed: 5 additions & 3 deletions
@@ -100,6 +100,7 @@ export class AnthropicLanguageModel implements positron.ai.LanguageModelChatProv

         // Log request information - the request ID is only available upon connection.
         stream.on('connect', () => {
+            log.info(`[anthropic] Start request ${stream.request_id} to ${this._config.model}: ${anthropicMessages.length} messages`);
             if (log.logLevel <= vscode.LogLevel.Trace) {
                 log.trace(`[anthropic] SEND messages.stream [${stream.request_id}]: ${JSON.stringify(body, null, 2)}`);
             } else {
@@ -136,6 +137,7 @@ export class AnthropicLanguageModel implements positron.ai.LanguageModelChatProv
             await stream.done();
         } catch (error) {
             if (error instanceof Anthropic.APIError) {
+                log.warn(`[anthropic] Error in messages.stream [${stream.request_id}]: ${error.message}`);
                 let data: any;
                 try {
                     data = JSON.parse(error.message);
@@ -146,6 +148,7 @@ export class AnthropicLanguageModel implements positron.ai.LanguageModelChatProv
                     throw new Error(`Anthropic's API is temporarily overloaded.`);
                 }
             } else if (error instanceof Anthropic.AnthropicError) {
+                log.warn(`[anthropic] Error in messages.stream [${stream.request_id}]: ${error.message}`);
                 // This can happen if the API key was not persisted correctly.
                 if (error.message.startsWith('Could not resolve authentication method')) {
                     throw new Error('Something went wrong when storing the Anthropic API key. ' +
@@ -161,9 +164,8 @@ export class AnthropicLanguageModel implements positron.ai.LanguageModelChatProv
             log.trace(`[anthropic] RECV messages.stream [${stream.request_id}]: ${JSON.stringify(message, null, 2)}`);
         } else {
             log.debug(
-                `[anthropic] RECV messages.stream [${stream.request_id}]: ` +
-                `usage: ${JSON.stringify(message.usage)}`
-            );
+                `[anthropic] RECV messages.stream [${stream.request_id}]`);
+            log.info(`[anthropic] Finished request ${stream.request_id}; usage: ${JSON.stringify(message.usage)}`);
         }

         // Record token usage

extensions/positron-assistant/src/completion.ts

Lines changed: 4 additions & 5 deletions
@@ -511,7 +511,7 @@ class OpenRouterCompletion extends FimPromptCompletion {
 }

 class AWSCompletion extends FimPromptCompletion {
-    protected model;
+    protected model: ai.LanguageModelV1;

     static source: positron.ai.LanguageModelSource = {
         type: positron.PositronLanguageModelType.Completion,
@@ -529,11 +529,10 @@ class AWSCompletion extends FimPromptCompletion {
     constructor(_config: ModelConfig) {
         super(_config);

+        // Cast to ai.LanguageModelV1 to satisfy base class type
         this.model = createAmazonBedrock({
-            bedrockOptions: {
-                credentials: fromNodeProviderChain(),
-            }
-        })(this._config.model);
+            credentialProvider: fromNodeProviderChain(),
+        })(this._config.model) as unknown as ai.LanguageModelV1;
     }
 }

extensions/positron-assistant/src/models.ts

Lines changed: 137 additions & 28 deletions
@@ -15,12 +15,12 @@ import { createOpenAI } from '@ai-sdk/openai';
 import { createMistral } from '@ai-sdk/mistral';
 import { createOllama } from 'ollama-ai-provider';
 import { createOpenRouter } from '@openrouter/ai-sdk-provider';
-import { processMessages, toAIMessage } from './utils';
+import { markBedrockCacheBreakpoint, processMessages, toAIMessage } from './utils';
 import { createAmazonBedrock } from '@ai-sdk/amazon-bedrock';
 import { fromNodeProviderChain } from '@aws-sdk/credential-providers';
 import { AnthropicLanguageModel } from './anthropic';
 import { DEFAULT_MAX_TOKEN_OUTPUT } from './constants.js';
-import { recordRequestTokenUsage, recordTokenUsage } from './extension.js';
+import { log, recordRequestTokenUsage, recordTokenUsage } from './extension.js';

 /**
  * Models used by chat participants and for vscode.lm.* API functionality.
@@ -195,7 +195,7 @@ abstract class AILanguageModel implements positron.ai.LanguageModelChatProvider
     public readonly name;
     public readonly provider;
     public readonly identifier;
-    public readonly maxOutputTokens;
+    public readonly maxOutputTokens: number;
     protected abstract model: ai.LanguageModelV1;

     capabilities = {
@@ -211,7 +211,26 @@ abstract class AILanguageModel implements positron.ai.LanguageModelChatProvider
         this.identifier = _config.id;
         this.name = _config.name;
         this.provider = _config.provider;
+        const maxOutputTokens = vscode.workspace.getConfiguration('positron.assistant').get('maxOutputTokens', {} as Record<string, number>);
         this.maxOutputTokens = _config.maxOutputTokens ?? DEFAULT_MAX_TOKEN_OUTPUT;
+
+        // Override maxOutputTokens if specified in the configuration
+        for (const [key, value] of Object.entries(maxOutputTokens)) {
+            if (_config.model.indexOf(key) !== -1 && value) {
+                let maxOutputTokens = value;
+                if (typeof maxOutputTokens !== 'number') {
+                    log.warn(`Invalid maxOutputTokens '${maxOutputTokens}' for ${key} (${_config.model}); ignoring`);
+                    continue;
+                }
+                if (maxOutputTokens < 512) {
+                    log.warn(`Specified maxOutputTokens '${maxOutputTokens}' for ${key} (${_config.model}) is too low; using 512 instead`);
+                    maxOutputTokens = 512;
+                }
+                log.debug(`Setting maxOutputTokens for ${key} (${_config.model}) to ${maxOutputTokens}`);
+                this.maxOutputTokens = maxOutputTokens;
+                break;
+            }
+        }
     }

     get providerName(): string {
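
To make the override lookup above concrete, here is a small standalone sketch of the same matching and clamping behavior; the override map is a hypothetical user setting, and the model ID is one of the Bedrock IDs listed later in this file:

```typescript
// Illustration only: mirrors the substring match and 512-token floor used in the constructor above.
const overrides: Record<string, number> = { 'claude-sonnet-4': 4096 };
const modelId = 'us.anthropic.claude-sonnet-4-20250514-v1:0';

let maxOutputTokens: number | undefined;
for (const [key, value] of Object.entries(overrides)) {
    if (modelId.indexOf(key) !== -1 && typeof value === 'number') {
        maxOutputTokens = Math.max(value, 512); // values under 512 are bumped up to 512
        break;
    }
}
console.log(maxOutputTokens); // 4096
```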
@@ -261,9 +280,44 @@ abstract class AILanguageModel implements positron.ai.LanguageModelChatProvider
         const processedMessages = processMessages(messages);
         // Only Anthropic currently supports experimental_content in tool
         // results.
-        const toolResultExperimentalContent = this.provider === 'anthropic';
-        // Convert messages to the Vercel AI format.
-        const aiMessages = toAIMessage(processedMessages, toolResultExperimentalContent);
+        const toolResultExperimentalContent = this.provider === 'anthropic' ||
+            this.model.modelId.startsWith('us.anthropic');
+
+        // Only select Bedrock models support cache breakpoints; specifically,
+        // the Claude 3.5 Sonnet models don't support them.
+        //
+        // Consider: it'd be more verbose but we should consider including this information
+        // in the hardcoded model metadata in the model config.
+        const bedrockCacheBreakpoint = this.provider === 'bedrock' &&
+            !this.model.modelId.startsWith('us.anthropic.claude-3-5')
+
+        const aiMessages: ai.CoreMessage[] = [];
+
+        // The system message we will send to the model.
+        let systemMessage: string | undefined = modelOptions.system;
+
+        if (bedrockCacheBreakpoint && systemMessage) {
+            // Add the system prompt as the first message if we have a system
+            // prompt and cache breakpoints are enabled.
+            //
+            // This must be done in order to set a cache breakpoint for the
+            // system message. In general we prefer to send the system message
+            // using the 'system' option in streamText; see the
+            // CoreSystemMessage documentation for a detailed explanation.
+            const aiSystemMessage: ai.CoreSystemMessage = {
+                role: 'system',
+                content: systemMessage,
+            };
+            markBedrockCacheBreakpoint(aiSystemMessage);
+            aiMessages.push(aiSystemMessage);
+
+            // Consume the system message so it doesn't get sent a second time
+            systemMessage = undefined;
+        }
+
+        // Convert all other messages to the Vercel AI format.
+        aiMessages.push(...toAIMessage(processedMessages, toolResultExperimentalContent,
+            bedrockCacheBreakpoint));

         if (options.tools && options.tools.length > 0) {
             tools = options.tools.reduce((acc: Record<string, ai.Tool>, tool: vscode.LanguageModelChatTool) => {
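
`markBedrockCacheBreakpoint` is imported from `./utils` and its body is not part of this diff. As a minimal sketch, assuming the `@ai-sdk/amazon-bedrock` 2.x convention of attaching cache points through a message's provider options (the real helper may differ in details), it could look roughly like this:

```typescript
import type { CoreMessage } from 'ai';

// Hypothetical sketch; the actual implementation lives in ./utils.
export function markBedrockCacheBreakpoint(message: CoreMessage): void {
    // Bedrock prompt caching reads cache points from providerOptions.bedrock.cachePoint.
    message.providerOptions = {
        ...message.providerOptions,
        bedrock: { cachePoint: { type: 'default' } },
    };
}
```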
@@ -275,43 +329,72 @@ abstract class AILanguageModel implements positron.ai.LanguageModelChatProvider
             }, {});
         }

+        const modelTools = this._config.toolCalls ? tools : undefined;
+        const requestId = (options.modelOptions as any)?.requestId;
+
+        log.info(`[vercel] Start request ${requestId} to ${this._config.name}: ${aiMessages.length} messages`);
+        log.debug(`[${this._config.name}] SEND ${aiMessages.length} messages, ${modelTools ? Object.keys(modelTools).length : 0} tools`);
+        if (modelTools) {
+            log.trace(`tools: ${modelTools ? Object.keys(modelTools).join(', ') : '(none)'}`);
+        }
+        if (systemMessage) {
+            log.trace(`system: ${systemMessage.length > 100 ? `${systemMessage.substring(0, 100)}...` : systemMessage} (${systemMessage.length} chars)`);
+        }
+        log.trace(`messages: ${JSON.stringify(aiMessages, null, 2)}`);
         const result = ai.streamText({
             model: this.model,
-            system: modelOptions.system ?? undefined,
+            system: systemMessage,
             messages: aiMessages,
             maxSteps: modelOptions.maxSteps ?? 50,
-            tools: this._config.toolCalls ? tools : undefined,
+            tools: modelTools,
             abortSignal: signal,
             maxTokens: modelOptions.maxTokens ?? this.maxOutputTokens,
         });

+        let accumulatedTextDeltas: string[] = [];
+
+        const flushAccumulatedTextDeltas = () => {
+            if (accumulatedTextDeltas.length > 0) {
+                const combinedText = accumulatedTextDeltas.join('');
+                log.trace(`[${this._config.name}] RECV text-delta (${accumulatedTextDeltas.length} parts): ${combinedText}`);
+                accumulatedTextDeltas = [];
+            }
+        };
+
         for await (const part of result.fullStream) {
             if (token.isCancellationRequested) {
                 break;
             }

             if (part.type === 'reasoning') {
+                flushAccumulatedTextDeltas();
+                log.trace(`[${this._config.name}] RECV reasoning: ${part.textDelta}`);
                 progress.report({
                     index: 0,
                     part: new vscode.LanguageModelTextPart(part.textDelta)
                 });
             }

             if (part.type === 'text-delta') {
+                accumulatedTextDeltas.push(part.textDelta);
                 progress.report({
                     index: 0,
                     part: new vscode.LanguageModelTextPart(part.textDelta)
                 });
             }

             if (part.type === 'tool-call') {
+                flushAccumulatedTextDeltas();
+                log.trace(`[${this._config.name}] RECV tool-call: ${part.toolCallId} (${part.toolName}) with args: ${JSON.stringify(part.args)}`);
                 progress.report({
                     index: 0,
                     part: new vscode.LanguageModelToolCallPart(part.toolCallId, part.toolName, part.args)
                 });
             }

             if (part.type === 'error') {
+                flushAccumulatedTextDeltas();
+                log.warn(`[${this._config.name}] RECV error: ${JSON.stringify(part.error)}`);
                 // TODO: Deal with various LLM providers' different error response formats
                 if (typeof part.error === 'string') {
                     throw new Error(part.error);
@@ -324,19 +407,47 @@ abstract class AILanguageModel implements positron.ai.LanguageModelChatProvider
             }
         }

-        if (this._context) {
-            // ai-sdk provides token usage in the result but it's not clear how it is calculated
-            const usage = await result.usage;
-            const outputCount = usage.completionTokens;
-            const inputCount = usage.promptTokens;
-            const requestId = (options.modelOptions as any)?.requestId;
+        // Flush any remaining accumulated text deltas
+        flushAccumulatedTextDeltas();
+
+        // Log all the warnings from the response
+        result.warnings.then((warnings) => {
+            if (warnings) {
+                for (const warning of warnings) {
+                    log.warn(`[${this.model}] (${this.identifier}) warn: ${warning}`);
+                }
+            }
+        });
+
+        // ai-sdk provides token usage in the result but it's not clear how it is calculated
+        const usage = await result.usage;
+        const outputCount = usage.completionTokens;
+        const inputCount = usage.promptTokens;
+
+        if (requestId) {
+            recordRequestTokenUsage(requestId, this.provider, inputCount, outputCount);
+        }

+        if (this._context) {
            recordTokenUsage(this._context, this.provider, inputCount, outputCount);
+        }

-        if (requestId) {
-            recordRequestTokenUsage(requestId, this.provider, inputCount, outputCount);
-        }
+        const other = await result.providerMetadata;

+        log.info(`[vercel]: End request ${requestId}; usage: ${inputCount} input tokens, ${outputCount} output tokens`);
+
+        // Log Bedrock usage if available
+        if (other && other.bedrock && other.bedrock.usage) {
+            // Get the Bedrock usage object; it typically contains
+            // `cacheReadInputTokens` and `cacheWriteInputTokens`
+            const usage = other.bedrock.usage as Record<string, any>;
+
+            // Add the input and output tokens to the usage object
+            usage.inputTokens = inputCount;
+            usage.outputTokens = outputCount;
+
+            // Log the Bedrock usage
+            log.debug(`[${this._config.name}]: Bedrock usage: ${JSON.stringify(other.bedrock.usage, null, 2)}`);
         }
     }
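
For reference, the object logged here ends up with roughly this shape, based on the fields named in the comments above plus the two counts merged in; the cache fields are only present when Bedrock reports them:

```typescript
// Assumed shape of the logged Bedrock usage entry (illustrative, not a published type).
interface LoggedBedrockUsage {
    cacheReadInputTokens?: number;  // input tokens served from the prompt cache
    cacheWriteInputTokens?: number; // input tokens written to the prompt cache
    inputTokens: number;            // copied from result.usage.promptTokens
    outputTokens: number;           // copied from result.usage.completionTokens
}
```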

@@ -563,7 +674,7 @@ class VertexLanguageModel extends AILanguageModel implements positron.ai.Languag
 }

 export class AWSLanguageModel extends AILanguageModel implements positron.ai.LanguageModelChatProvider {
-    protected model;
+    protected model: ai.LanguageModelV1;

     static source: positron.ai.LanguageModelSource = {
         type: positron.PositronLanguageModelType.Chat,
@@ -583,13 +694,11 @@ export class AWSLanguageModel extends AILanguageModel implements positron.ai.Lan
         super(_config, _context);

         this.model = createAmazonBedrock({
-            bedrockOptions: {
-                // AWS_ACCESS_KEY_ID, AWS_SESSION_TOKEN, and AWS_SECRET_ACCESS_KEY must be set
-                // sets the AWS region where the models are available
-                region: process.env.AWS_REGION ?? 'us-east-1',
-                credentials: fromNodeProviderChain(),
-            }
-        })(this._config.model);
+            // AWS_ACCESS_KEY_ID, AWS_SESSION_TOKEN, and AWS_SECRET_ACCESS_KEY must be set
+            // sets the AWS region where the models are available
+            region: process.env.AWS_REGION ?? 'us-east-1',
+            credentialProvider: fromNodeProviderChain(),
+        })(this._config.model) as ai.LanguageModelV1;
     }

     get providerName(): string {
@@ -712,17 +821,17 @@ export const availableModels = new Map<string, { name: string; identifier: strin
     {
         name: 'Claude 4 Sonnet Bedrock',
         identifier: 'us.anthropic.claude-sonnet-4-20250514-v1:0',
-        maxOutputTokens: 64_000, // reference: https://docs.anthropic.com/en/docs/about-claude/models/all-models#model-comparison-table
+        maxOutputTokens: 8_192, // use more conservative value for Bedrock (up to 64K tokens available)
     },
     {
         name: 'Claude 4 Opus Bedrock',
         identifier: 'us.anthropic.claude-opus-4-20250514-v1:0',
-        maxOutputTokens: 32_000, // reference: https://docs.anthropic.com/en/docs/about-claude/models/all-models#model-comparison-table
+        maxOutputTokens: 8_192, // use more conservative value for Bedrock (up to 32K tokens available)
     },
     {
         name: 'Claude 3.7 Sonnet v1 Bedrock',
         identifier: 'us.anthropic.claude-3-7-sonnet-20250219-v1:0',
-        maxOutputTokens: 64_000, // reference: https://docs.anthropic.com/en/docs/about-claude/models/all-models#model-comparison-table
+        maxOutputTokens: 8_192, // use more conservative value for Bedrock (up to 64K tokens available)
     },
     {
         name: 'Claude 3.5 Sonnet v2 Bedrock',
