RooCodeInc · mrubens · Mar 18, 2025 · Mar 6, 2025 · Mar 6, 2025 · Mar 7, 2025
@@ -298,6 +298,7 @@
 		"pretty-bytes": "^6.1.1",
 		"puppeteer-chromium-resolver": "^23.0.0",
 		"puppeteer-core": "^23.4.0",
+		"say": "^0.16.0",
 		"serialize-error": "^11.0.3",
 		"simple-git": "^3.27.0",
 		"sound-play": "^1.1.0",

@@ -30,6 +30,7 @@ import { McpHub } from "../../services/mcp/McpHub"
 import { McpServerManager } from "../../services/mcp/McpServerManager"
 import { fileExistsAtPath } from "../../utils/fs"
 import { playSound, setSoundEnabled, setSoundVolume } from "../../utils/sound"
+import { playTts, setTtsEnabled } from "../../utils/tts"
 import { singleCompletionHandler } from "../../utils/single-completion-handler"
 import { searchCommits } from "../../utils/git"
 import { getDiffStrategy } from "../diff/DiffStrategy"
@@ -394,6 +395,11 @@ export class ClineProvider implements vscode.WebviewViewProvider {
 			setSoundEnabled(soundEnabled ?? false)
 		})
 
+		// Initialize tts enabled state
+		this.getState().then(({ ttsEnabled }) => {
+			setTtsEnabled(ttsEnabled ?? false)
+		})
+
 		webviewView.webview.options = {
 			// Allow scripts in the webview
 			enableScripts: true,
@@ -1204,6 +1210,17 @@ export class ClineProvider implements vscode.WebviewViewProvider {
 						setSoundVolume(soundVolume)
 						await this.postStateToWebview()
 						break
+					case "ttsEnabled":
+						const ttsEnabled = message.bool ?? true
+						await this.updateGlobalState("ttsEnabled", ttsEnabled)
+						setTtsEnabled(ttsEnabled) // Add this line to update the tts utility
+						await this.postStateToWebview()
+						break
+					case "playTts":
+						if (message.text) {
+							playTts(message.text)
+						}
+						break
 					case "diffEnabled":
 						const diffEnabled = message.bool ?? true
 						await this.updateGlobalState("diffEnabled", diffEnabled)
@@ -2125,6 +2142,7 @@ export class ClineProvider implements vscode.WebviewViewProvider {
 			alwaysAllowMcp,
 			alwaysAllowModeSwitch,
 			soundEnabled,
+			ttsEnabled,
 			diffEnabled,
 			enableCheckpoints,
 			checkpointStorage,
@@ -2176,6 +2194,7 @@ export class ClineProvider implements vscode.WebviewViewProvider {
 				.filter((item: HistoryItem) => item.ts && item.task)
 				.sort((a: HistoryItem, b: HistoryItem) => b.ts - a.ts),
 			soundEnabled: soundEnabled ?? false,
+			ttsEnabled: ttsEnabled ?? false,
 			diffEnabled: diffEnabled ?? true,
 			enableCheckpoints: enableCheckpoints ?? true,
 			checkpointStorage: checkpointStorage ?? "task",
@@ -2326,6 +2345,7 @@ export class ClineProvider implements vscode.WebviewViewProvider {
 			taskHistory: stateValues.taskHistory,
 			allowedCommands: stateValues.allowedCommands,
 			soundEnabled: stateValues.soundEnabled ?? false,
+			ttsEnabled: stateValues.ttsEnabled ?? false,
 			diffEnabled: stateValues.diffEnabled ?? true,
 			enableCheckpoints: stateValues.enableCheckpoints ?? false,
 			checkpointStorage: stateValues.checkpointStorage ?? "task",

@@ -7,6 +7,7 @@ import { ClineProvider } from "../ClineProvider"
 import { ExtensionMessage, ExtensionState } from "../../../shared/ExtensionMessage"
 import { GlobalStateKey, SecretKey } from "../../../shared/globalState"
 import { setSoundEnabled } from "../../../utils/sound"
+import { setTtsEnabled } from "../../../utils/tts"
 import { defaultModeSlug } from "../../../shared/modes"
 import { experimentDefault } from "../../../shared/experiments"
 import { Cline } from "../../Cline"
@@ -193,6 +194,11 @@ jest.mock("../../../utils/sound", () => ({
 	setSoundEnabled: jest.fn(),
 }))
 
+// Mock tts utility: stubs setTtsEnabled so tests can assert how the provider calls it. NOTE(review): playTts is not part of this mock; confirm no test needs to send a "playTts" message.
+jest.mock("../../../utils/tts", () => ({
+	setTtsEnabled: jest.fn(),
+}))
+
 // Mock ESM modules
 jest.mock("p-wait-for", () => ({
 	__esModule: true,
@@ -423,6 +429,7 @@ describe("ClineProvider", () => {
 			alwaysAllowMcp: false,
 			uriScheme: "vscode",
 			soundEnabled: false,
+			ttsEnabled: false,
 			diffEnabled: false,
 			enableCheckpoints: false,
 			checkpointStorage: "task",
@@ -517,6 +524,7 @@ describe("ClineProvider", () => {
 		expect(state).toHaveProperty("alwaysAllowBrowser")
 		expect(state).toHaveProperty("taskHistory")
 		expect(state).toHaveProperty("soundEnabled")
+		expect(state).toHaveProperty("ttsEnabled")
 		expect(state).toHaveProperty("diffEnabled")
 		expect(state).toHaveProperty("writeDelayMs")
 	})
@@ -588,6 +596,18 @@ describe("ClineProvider", () => {
 		expect(setSoundEnabled).toHaveBeenCalledWith(false)
 		expect(mockContext.globalState.update).toHaveBeenCalledWith("soundEnabled", false)
 		expect(mockPostMessage).toHaveBeenCalled()
+
+		// Simulate setting tts to enabled
+		await messageHandler({ type: "ttsEnabled", bool: true })
+		expect(setTtsEnabled).toHaveBeenCalledWith(true)
+		expect(mockContext.globalState.update).toHaveBeenCalledWith("ttsEnabled", true)
+		expect(mockPostMessage).toHaveBeenCalled()
+
+		// Simulate setting tts to disabled
+		await messageHandler({ type: "ttsEnabled", bool: false })
+		expect(setTtsEnabled).toHaveBeenCalledWith(false)
+		expect(mockContext.globalState.update).toHaveBeenCalledWith("ttsEnabled", false)
+		expect(mockPostMessage).toHaveBeenCalled()
 	})
 
 	test("requestDelaySeconds defaults to 5 seconds", async () => {

@@ -115,6 +115,7 @@ export interface ExtensionState {
 	currentTaskItem?: HistoryItem
 	allowedCommands?: string[]
 	soundEnabled?: boolean
+	ttsEnabled?: boolean
 	soundVolume?: number
 	diffEnabled?: boolean
 	enableCheckpoints: boolean

@@ -49,7 +49,9 @@ export interface WebviewMessage {
 		| "alwaysAllowMcp"
 		| "alwaysAllowModeSwitch"
 		| "playSound"
+		| "playTts"
 		| "soundEnabled"
+		| "ttsEnabled"
 		| "soundVolume"
 		| "diffEnabled"
 		| "enableCheckpoints"

@@ -57,6 +57,7 @@ export const GLOBAL_STATE_KEYS = [
 	"openRouterUseMiddleOutTransform",
 	"allowedCommands",
 	"soundEnabled",
+	"ttsEnabled",
 	"soundVolume",
 	"diffEnabled",
 	"enableCheckpoints",

@@ -0,0 +1,66 @@
+import * as vscode from "vscode"
+
+let isTtsEnabled = false // Gate checked by playTts and processQueue; assigned only in setTtsEnabled
+let isSpeaking = false // True while processQueue is waiting for a say.speak call to finish
+const utteranceQueue: string[] = [] // Pending messages in FIFO order: playTts pushes, processQueue shifts
+
+/**
+ * Turn text-to-speech on or off for this module.
+ *
+ * Only the in-memory flag is updated: an utterance that is already being
+ * spoken is not interrupted, and messages still waiting in the queue are not
+ * removed.
+ * NOTE(review): because the queue is left intact, messages queued before
+ * disabling appear to be spoken first once TTS is re-enabled and playTts is
+ * called again. Confirm this is intended.
+ *
+ * @param enabled true to allow speech, false to suppress it
+ */
+export const setTtsEnabled = (enabled: boolean): void => {
+	isTtsEnabled = enabled
+}
+
+/**
+ * Speak the utterance at the head of the queue, then continue with the rest.
+ *
+ * Returns without doing anything when TTS is disabled, when another utterance
+ * is still being spoken, or when the queue is empty. Otherwise it removes the
+ * oldest queued message, speaks it through the `say` package and, once the
+ * speak callback fires, calls itself again. The promise of the call that
+ * starts speaking therefore settles only after the queue has drained or one
+ * of the early-return conditions stops the chain.
+ *
+ * A failure (loading `say` or speaking) is reported with
+ * vscode.window.showErrorMessage; the failed utterance is dropped and the
+ * remaining messages are still attempted.
+ */
+const processQueue = async (): Promise<void> => {
+	if (!isTtsEnabled || isSpeaking || utteranceQueue.length === 0) { // disabled, busy, or nothing queued
+		return
+	}
+
+	try {
+		isSpeaking = true // calls that arrive while speaking now return at the guard above
+		const nextUtterance = utteranceQueue.shift()! // non-null assertion is backed by the length check above
+		const say = require("say") // loaded inside the try, so a failure to load it is handled by the catch below
+
+		// say.speak signals completion through a callback; adapt it to a promise so it can be awaited.
+		await new Promise<void>((resolve, reject) => {
+			say.speak(nextUtterance, null, null, (err: Error) => { // the two nulls are presumably the voice and speed arguments; verify against the say README
+				if (err) {
+					reject(err)
+				} else {
+					resolve()
+				}
+			})
+		})
+
+		isSpeaking = false // release the guard first, otherwise the recursive call would return immediately
+		// Continue with whatever was queued while this utterance was being spoken
+		await processQueue()
+	} catch (error: any) {
+		isSpeaking = false // release the guard on failure as well so the queue does not stall
+		vscode.window.showErrorMessage(error.message) // assumes the thrown value has a message property (catch variable is typed any)
+		// The failed utterance was already shifted off the queue, so this moves on to the next one
+		await processQueue()
+	}
+}
+
+/**
+ * Queue a message to be spoken and start processing the queue.
+ *
+ * Does nothing when TTS is disabled. When another utterance is already being
+ * spoken, the message is only appended to the queue and this call returns
+ * right away; the processQueue call that is currently speaking picks it up
+ * afterwards. Errors thrown while queueing or processing are shown with
+ * vscode.window.showErrorMessage instead of being rethrown.
+ *
+ * @param message text to speak
+ * @returns a promise that resolves once processQueue has returned
+ */
+export const playTts = async (message: string): Promise<void> => {
+	if (!isTtsEnabled) { // checked before queueing, so nothing is queued while TTS is disabled
+		return
+	}
+
+	try {
+		utteranceQueue.push(message)
+		await processQueue() // returns immediately if an utterance is already in progress
+	} catch (error: any) {
+		vscode.window.showErrorMessage(error.message) // assumes the thrown value has a message property (catch variable is typed any)
+	}
+}
@@ -86,6 +86,7 @@ const ChatView = ({ isHidden, showAnnouncement, hideAnnouncement, showHistoryVie
 	const disableAutoScrollRef = useRef(false)
 	const [showScrollToBottom, setShowScrollToBottom] = useState(false)
 	const [isAtBottom, setIsAtBottom] = useState(false)
+	const lastTtsRef = useRef<string>("")
 
 	const [wasStreaming, setWasStreaming] = useState<boolean>(false)
 	const [showCheckpointWarning, setShowCheckpointWarning] = useState<boolean>(false)
@@ -99,6 +100,10 @@ const ChatView = ({ isHidden, showAnnouncement, hideAnnouncement, showHistoryVie
 		vscode.postMessage({ type: "playSound", audioType })
 	}
 
+	function playTts(text: string) {
+		vscode.postMessage({ type: "playTts", text }) // the webview only forwards the text; the provider's "playTts" message handler does the speaking
+	}
+
 	useDeepCompareEffect(() => {
 		// if last message is an ask, show user ask UI
 		// if user finished a task, then start a new task with a new conversation history since in this moment that the extension is waiting for user response, the user could close the extension and the conversation history would be lost.
@@ -659,6 +664,25 @@ const ChatView = ({ isHidden, showAnnouncement, hideAnnouncement, showHistoryVie
 	)
 
 	useEffect(() => {
+		// skip input message
+		if (lastMessage && messages.length > 1) {
+			let text = lastMessage?.text || ""
+
+			if (
+				lastMessage.type === "say" && // is a say message
+				!lastMessage.partial && // not a partial message
+				!text.startsWith("{") && // not a json object
+				text !== lastTtsRef.current // not the same as last TTS message
+			) {
+				try {
+					playTts(text)
+					lastTtsRef.current = text
+				} catch (error) {
+					console.error("Failed to execute text-to-speech:", error)
+				}
+			}
+		}
+
 		// Only execute when isStreaming changes from true to false
 		if (wasStreaming && !isStreaming && lastMessage) {
 			// Play appropriate sound based on lastMessage content
@@ -691,7 +715,7 @@ const ChatView = ({ isHidden, showAnnouncement, hideAnnouncement, showHistoryVie
 		}
 		// Update previous value
 		setWasStreaming(isStreaming)
-	}, [isStreaming, lastMessage, wasStreaming, isAutoApproved])
+	}, [isStreaming, lastMessage, wasStreaming, isAutoApproved, messages.length])
 
 	const isBrowserSessionMessage = (message: ClineMessage): boolean => {
 		// which of visible messages are browser session messages, see above

@@ -7,12 +7,14 @@ import { SectionHeader } from "./SectionHeader"
 import { Section } from "./Section"
 
 type NotificationSettingsProps = HTMLAttributes<HTMLDivElement> & {
+	ttsEnabled?: boolean
 	soundEnabled?: boolean
 	soundVolume?: number
-	setCachedStateField: SetCachedStateField<"soundEnabled" | "soundVolume">
+	setCachedStateField: SetCachedStateField<"ttsEnabled" | "soundEnabled" | "soundVolume">
 }
 
 export const NotificationSettings = ({
+	ttsEnabled,
 	soundEnabled,
 	soundVolume,
 	setCachedStateField,
@@ -28,6 +30,16 @@ export const NotificationSettings = ({
 			</SectionHeader>
 
 			<Section>
+				<div>
+					<VSCodeCheckbox
+						checked={ttsEnabled}
+						onChange={(e: any) => setCachedStateField("ttsEnabled", e.target.checked)}>
+						<span className="font-medium">Enable text-to-speech</span>
+					</VSCodeCheckbox>
+					<p className="text-vscode-descriptionForeground text-sm mt-0">
+						When enabled, Roo will read aloud its responses using text-to-speech.
+					</p>
+				</div>
 				<div>
 					<VSCodeCheckbox
 						checked={soundEnabled}

@@ -76,6 +76,7 @@ const SettingsView = forwardRef<SettingsViewRef, SettingsViewProps>(({ onDone },
 		requestDelaySeconds,
 		screenshotQuality,
 		soundEnabled,
+		ttsEnabled,
 		soundVolume,
 		terminalOutputLineLimit,
 		writeDelayMs,
@@ -149,6 +150,7 @@ const SettingsView = forwardRef<SettingsViewRef, SettingsViewProps>(({ onDone },
 			vscode.postMessage({ type: "allowedCommands", commands: allowedCommands ?? [] })
 			vscode.postMessage({ type: "browserToolEnabled", bool: browserToolEnabled })
 			vscode.postMessage({ type: "soundEnabled", bool: soundEnabled })
+			vscode.postMessage({ type: "ttsEnabled", bool: ttsEnabled })
 			vscode.postMessage({ type: "soundVolume", value: soundVolume })
 			vscode.postMessage({ type: "diffEnabled", bool: diffEnabled })
 			vscode.postMessage({ type: "enableCheckpoints", bool: enableCheckpoints })
@@ -370,6 +372,7 @@ const SettingsView = forwardRef<SettingsViewRef, SettingsViewProps>(({ onDone },
 
 				<div ref={notificationsRef}>
 					<NotificationSettings
+						ttsEnabled={ttsEnabled}
 						soundEnabled={soundEnabled}
 						soundVolume={soundVolume}
 						setCachedStateField={setCachedStateField}

@@ -32,6 +32,7 @@ export interface ExtensionStateContextType extends ExtensionState {
 	setAllowedCommands: (value: string[]) => void
 	setSoundEnabled: (value: boolean) => void
 	setSoundVolume: (value: number) => void
+	setTtsEnabled: (value: boolean) => void
 	setDiffEnabled: (value: boolean) => void
 	setEnableCheckpoints: (value: boolean) => void
 	setBrowserViewportSize: (value: string) => void
@@ -105,6 +106,7 @@ export const ExtensionStateContextProvider: React.FC<{ children: React.ReactNode
 		allowedCommands: [],
 		soundEnabled: false,
 		soundVolume: 0.5,
+		ttsEnabled: false,
 		diffEnabled: false,
 		enableCheckpoints: true,
 		checkpointStorage: "task",
@@ -242,6 +244,7 @@ export const ExtensionStateContextProvider: React.FC<{ children: React.ReactNode
 		setAllowedCommands: (value) => setState((prevState) => ({ ...prevState, allowedCommands: value })),
 		setSoundEnabled: (value) => setState((prevState) => ({ ...prevState, soundEnabled: value })),
 		setSoundVolume: (value) => setState((prevState) => ({ ...prevState, soundVolume: value })),
+		setTtsEnabled: (value) => setState((prevState) => ({ ...prevState, ttsEnabled: value })),
 		setDiffEnabled: (value) => setState((prevState) => ({ ...prevState, diffEnabled: value })),
 		setEnableCheckpoints: (value) => setState((prevState) => ({ ...prevState, enableCheckpoints: value })),
 		setBrowserViewportSize: (value: string) =>