Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
118 changes: 87 additions & 31 deletions src/services/mcp/McpHub.ts
Original file line number Diff line number Diff line change
Expand Up @@ -903,6 +903,48 @@ export class McpHub {
(conn) => conn.server.name === serverName && (conn.server.source === "global" || !conn.server.source),
)
}
/**
* Wait until a server connection is fully ready (client connected and status updated).
* Guards against race conditions where a tool/resource call arrives before initial connect completes.
*/
private async waitForServerReady(
serverName: string,
source?: "global" | "project",
timeoutMs: number = 10000,
): Promise<ConnectedMcpConnection> {
// Immediately error if there's no connected-type connection to wait on (preserves legacy error for tests/UX)
const initial = this.findConnection(serverName, source)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this intentional? The connection check happens outside the loop, which could lead to a race condition if the connection is removed between the initial check and entering the loop. Consider moving the findConnection call inside the loop for safer operation.

const notFoundMsg = `No connection found for server: ${serverName}${source ? ` with source ${source}` : ""}`
if (!initial || initial.type !== "connected") {
throw new Error(notFoundMsg)
}

// If we have a connected-type connection but it's still "connecting", wait briefly for readiness.
const start = Date.now()
while (Date.now() - start < timeoutMs) {
const connection = this.findConnection(serverName, source)
if (connection && connection.type === "connected" && connection.server.status === "connected") {
return connection
}
await delay(50)
}
const conn = this.findConnection(serverName, source)
const status = conn?.type === "connected" ? conn.server.status : "disconnected"
throw new Error(`MCP server "${serverName}" is not ready (status: ${status}). Try again in a moment.`)
}

/**
* Detect transient connection-closed errors from MCP transport to allow a single retry.
*/
private isTransientClosedError(error: unknown): boolean {
const code = (error as any)?.code
const msg = error instanceof Error ? error.message : String(error)
if (typeof code === "number" && code === -32000) return true
if (typeof code === "string" && /-?32000/.test(code)) return true
return /connection closed|closed before|socket hang up|ECONNRESET|EPIPE|ERR_STREAM_PREMATURE_CLOSE/i.test(
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This regex pattern is quite broad and might catch unrelated errors. Could we be more specific about which error patterns should trigger a retry? For example, we might want to exclude certain EPIPE errors that indicate permanent failures rather than transient issues.

msg || "",
)
}

private async fetchToolsList(serverName: string, source?: "global" | "project"): Promise<McpTool[]> {
try {
Expand Down Expand Up @@ -1559,22 +1601,31 @@ export class McpHub {
}

async readResource(serverName: string, uri: string, source?: "global" | "project"): Promise<McpResourceResponse> {
const connection = this.findConnection(serverName, source)
if (!connection || connection.type !== "connected") {
throw new Error(`No connection found for server: ${serverName}${source ? ` with source ${source}` : ""}`)
}
let connection = await this.waitForServerReady(serverName, source)
if (connection.server.disabled) {
throw new Error(`Server "${serverName}" is disabled`)
}
return await connection.client.request(
{
method: "resources/read",
params: {
uri,

const doRequest = async () =>
connection.client.request(
{
method: "resources/read",
params: { uri },
},
},
ReadResourceResultSchema,
)
ReadResourceResultSchema,
)

try {
return await doRequest()
} catch (error) {
// Handle initial transient "Connection closed" on first invocation after install/start
if (this.isTransientClosedError(error)) {
await delay(200)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we make this retry delay configurable or at least define it as a constant? The magic number 200ms appears in both callTool and readResource. Something like: private static readonly TRANSIENT_ERROR_RETRY_DELAY_MS = 200;

connection = await this.waitForServerReady(serverName, source)
return await doRequest()
}
throw error
}
}

async callTool(
Expand All @@ -1583,12 +1634,7 @@ export class McpHub {
toolArguments?: Record<string, unknown>,
source?: "global" | "project",
): Promise<McpToolCallResponse> {
const connection = this.findConnection(serverName, source)
if (!connection || connection.type !== "connected") {
throw new Error(
`No connection found for server: ${serverName}${source ? ` with source ${source}` : ""}. Please make sure to use MCP servers available under 'Connected MCP Servers'.`,
)
}
let connection = await this.waitForServerReady(serverName, source)
if (connection.server.disabled) {
throw new Error(`Server "${serverName}" is disabled and cannot be used`)
}
Expand All @@ -1599,23 +1645,33 @@ export class McpHub {
timeout = (parsedConfig.timeout ?? 60) * 1000
} catch (error) {
console.error("Failed to parse server config for timeout:", error)
// Default to 60 seconds if parsing fails
timeout = 60 * 1000
}

return await connection.client.request(
{
method: "tools/call",
params: {
name: toolName,
arguments: toolArguments,
const doRequest = async () =>
connection.client.request(
{
method: "tools/call",
params: {
name: toolName,
arguments: toolArguments,
},
},
},
CallToolResultSchema,
{
timeout,
},
)
CallToolResultSchema,
{ timeout },
)

try {
return await doRequest()
} catch (error) {
// Handle initial transient "Connection closed" that can happen on the very first call
if (this.isTransientClosedError(error)) {
await delay(200)
connection = await this.waitForServerReady(serverName, source)
return await doRequest()
}
throw error
}
}

/**
Expand Down
180 changes: 180 additions & 0 deletions src/services/mcp/__tests__/McpHub.retry.spec.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
import type { ClineProvider } from "../../../core/webview/ClineProvider"
import { McpHub } from "../McpHub"
import { vi, describe, it, expect, beforeEach } from "vitest"
import fs from "fs/promises"

// Minimal VSCode and FS mocks to keep constructor side-effects inert
vi.mock("vscode", () => ({
workspace: {
createFileSystemWatcher: vi.fn().mockReturnValue({
onDidChange: vi.fn(),
onDidCreate: vi.fn(),
onDidDelete: vi.fn(),
dispose: vi.fn(),
}),
onDidChangeWorkspaceFolders: vi.fn(),
workspaceFolders: [],
},
window: {
showErrorMessage: vi.fn(),
showInformationMessage: vi.fn(),
showWarningMessage: vi.fn(),
},
Disposable: {
from: vi.fn(),
},
}))

vi.mock("fs/promises", () => ({
default: {
access: vi.fn().mockResolvedValue(undefined),
writeFile: vi.fn().mockResolvedValue(undefined),
readFile: vi.fn().mockResolvedValue("{}"),
unlink: vi.fn().mockResolvedValue(undefined),
rename: vi.fn().mockResolvedValue(undefined),
lstat: vi.fn().mockResolvedValue({ isDirectory: () => true }),
mkdir: vi.fn().mockResolvedValue(undefined),
},
access: vi.fn().mockResolvedValue(undefined),
writeFile: vi.fn().mockResolvedValue(undefined),
readFile: vi.fn().mockResolvedValue("{}"),
unlink: vi.fn().mockResolvedValue(undefined),
rename: vi.fn().mockResolvedValue(undefined),
lstat: vi.fn().mockResolvedValue({ isDirectory: () => true }),
mkdir: vi.fn().mockResolvedValue(undefined),
}))

describe("McpHub transient-retry and readiness", () => {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great test coverage for the happy path! Consider adding edge case tests: What happens when waitForServerReady times out? What happens on the second retry failure (should not retry a third time)? Error handling when connection is removed during the wait period. These would help ensure the retry logic is robust under various failure scenarios.

let mcpHub: McpHub
let mockProvider: Partial<ClineProvider>

beforeEach(() => {
vi.clearAllMocks()

// Prevent constructor from kicking off async initialization that races our injected connections
vi.spyOn(McpHub.prototype as any, "initializeGlobalMcpServers").mockResolvedValue(undefined as any)
vi.spyOn(McpHub.prototype as any, "initializeProjectMcpServers").mockResolvedValue(undefined as any)

mockProvider = {
ensureSettingsDirectoryExists: vi.fn().mockResolvedValue("/mock/settings/path"),
ensureMcpServersDirectoryExists: vi.fn().mockResolvedValue("/mock/settings/path"),
postMessageToWebview: vi.fn().mockResolvedValue(undefined),
getState: vi.fn().mockResolvedValue({ mcpEnabled: true }),
context: {
subscriptions: [],
workspaceState: {} as any,
globalState: {} as any,
secrets: {} as any,
extensionUri: { fsPath: "/test/path" } as any,
extensionPath: "/test/path",
asAbsolutePath: (p: string) => p,
globalStorageUri: { fsPath: "/test/global" } as any,
globalStoragePath: "/test/global",
extension: {
packageJSON: { version: "1.0.0" },
} as any,
} as any,
}

// Keep global config empty so constructor does not attempt to connect anything
vi.mocked(fs.readFile).mockResolvedValue("{}")

mcpHub = new McpHub(mockProvider as ClineProvider)
})

it("retries once on transient -32000 'Connection closed' for tools/call", async () => {
// Prepare a connected connection
const firstError: any = new Error("Connection closed")
firstError.code = -32000

const request = vi
.fn()
// First call: throw transient
.mockRejectedValueOnce(firstError)
// Second call: succeed
.mockResolvedValueOnce({ content: [{ type: "text", text: "ok" }] })

// Inject a connected connection directly
;(mcpHub as any).connections = [
{
type: "connected",
server: {
name: "retry-server",
config: JSON.stringify({ type: "stdio", command: "node", timeout: 60 }),
status: "connected",
source: "global",
errorHistory: [],
},
client: { request } as any,
transport: {} as any,
},
]

const result = await mcpHub.callTool("retry-server", "do_something", { a: 1 })
expect(result).toBeTruthy()
expect(request).toHaveBeenCalledTimes(2)
})

it("retries once on transient -32000 'Connection closed' for resources/read", async () => {
const firstError: any = new Error("Connection closed before response")
firstError.code = -32000

const request = vi
.fn()
.mockRejectedValueOnce(firstError)
.mockResolvedValueOnce({ contents: [{ type: "text", text: "resource" }] })

;(mcpHub as any).connections = [
{
type: "connected",
server: {
name: "retry-server",
config: JSON.stringify({ type: "stdio", command: "node", timeout: 60 }),
status: "connected",
source: "global",
errorHistory: [],
},
client: { request } as any,
transport: {} as any,
},
]

const result = await mcpHub.readResource("retry-server", "file:///tmp/x.txt")
expect(result).toBeTruthy()
expect(request).toHaveBeenCalledTimes(2)
})

it("waits for server to become connected before first tool call", async () => {
// Start 'connecting' and then flip to 'connected' shortly after
const request = vi.fn().mockResolvedValue({ content: [{ type: "text", text: "ok" }] })

const connection: any = {
type: "connected",
server: {
name: "slow-server",
config: JSON.stringify({ type: "stdio", command: "node", timeout: 60 }),
status: "connecting",
source: "global",
errorHistory: [],
},
client: { request },
transport: {},
}

;(mcpHub as any).connections = [connection]

// Flip status to 'connected' after 100ms
setTimeout(() => {
connection.server.status = "connected"
}, 100)

const start = Date.now()
const result = await mcpHub.callTool("slow-server", "ping", {})
const elapsed = Date.now() - start

expect(result).toBeTruthy()
expect(request).toHaveBeenCalledTimes(1)
// Should have waited at least ~100ms before making the request
expect(elapsed).toBeGreaterThanOrEqual(90)
})
})
Loading