Skip to content

Commit 0d4a743

Browse files
committed
Add text-to-speech functionality
1 parent d2c2029 commit 0d4a743

File tree

12 files changed

+172
-2
lines changed

12 files changed

+172
-2
lines changed

package-lock.json

Lines changed: 17 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -298,6 +298,7 @@
298298
"pretty-bytes": "^6.1.1",
299299
"puppeteer-chromium-resolver": "^23.0.0",
300300
"puppeteer-core": "^23.4.0",
301+
"say": "^0.16.0",
301302
"serialize-error": "^11.0.3",
302303
"simple-git": "^3.27.0",
303304
"sound-play": "^1.1.0",

src/core/webview/ClineProvider.ts

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ import { McpHub } from "../../services/mcp/McpHub"
3030
import { McpServerManager } from "../../services/mcp/McpServerManager"
3131
import { fileExistsAtPath } from "../../utils/fs"
3232
import { playSound, setSoundEnabled, setSoundVolume } from "../../utils/sound"
33+
import { playTts, setTtsEnabled } from "../../utils/tts"
3334
import { singleCompletionHandler } from "../../utils/single-completion-handler"
3435
import { searchCommits } from "../../utils/git"
3536
import { getDiffStrategy } from "../diff/DiffStrategy"
@@ -394,6 +395,11 @@ export class ClineProvider implements vscode.WebviewViewProvider {
394395
setSoundEnabled(soundEnabled ?? false)
395396
})
396397

398+
// Initialize tts enabled state
399+
this.getState().then(({ ttsEnabled }) => {
400+
setTtsEnabled(ttsEnabled ?? false)
401+
})
402+
397403
webviewView.webview.options = {
398404
// Allow scripts in the webview
399405
enableScripts: true,
@@ -1204,6 +1210,17 @@ export class ClineProvider implements vscode.WebviewViewProvider {
12041210
setSoundVolume(soundVolume)
12051211
await this.postStateToWebview()
12061212
break
1213+
case "ttsEnabled":
1214+
const ttsEnabled = message.bool ?? true
1215+
await this.updateGlobalState("ttsEnabled", ttsEnabled)
1216+
setTtsEnabled(ttsEnabled) // Add this line to update the tts utility
1217+
await this.postStateToWebview()
1218+
break
1219+
case "playTts":
1220+
if (message.text) {
1221+
playTts(message.text)
1222+
}
1223+
break
12071224
case "diffEnabled":
12081225
const diffEnabled = message.bool ?? true
12091226
await this.updateGlobalState("diffEnabled", diffEnabled)
@@ -2125,6 +2142,7 @@ export class ClineProvider implements vscode.WebviewViewProvider {
21252142
alwaysAllowMcp,
21262143
alwaysAllowModeSwitch,
21272144
soundEnabled,
2145+
ttsEnabled,
21282146
diffEnabled,
21292147
enableCheckpoints,
21302148
checkpointStorage,
@@ -2176,6 +2194,7 @@ export class ClineProvider implements vscode.WebviewViewProvider {
21762194
.filter((item: HistoryItem) => item.ts && item.task)
21772195
.sort((a: HistoryItem, b: HistoryItem) => b.ts - a.ts),
21782196
soundEnabled: soundEnabled ?? false,
2197+
ttsEnabled: ttsEnabled ?? false,
21792198
diffEnabled: diffEnabled ?? true,
21802199
enableCheckpoints: enableCheckpoints ?? true,
21812200
checkpointStorage: checkpointStorage ?? "task",
@@ -2326,6 +2345,7 @@ export class ClineProvider implements vscode.WebviewViewProvider {
23262345
taskHistory: stateValues.taskHistory,
23272346
allowedCommands: stateValues.allowedCommands,
23282347
soundEnabled: stateValues.soundEnabled ?? false,
2348+
ttsEnabled: stateValues.ttsEnabled ?? false,
23292349
diffEnabled: stateValues.diffEnabled ?? true,
23302350
enableCheckpoints: stateValues.enableCheckpoints ?? false,
23312351
checkpointStorage: stateValues.checkpointStorage ?? "task",

src/core/webview/__tests__/ClineProvider.test.ts

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import { ClineProvider } from "../ClineProvider"
77
import { ExtensionMessage, ExtensionState } from "../../../shared/ExtensionMessage"
88
import { GlobalStateKey, SecretKey } from "../../../shared/globalState"
99
import { setSoundEnabled } from "../../../utils/sound"
10+
import { setTtsEnabled } from "../../../utils/tts"
1011
import { defaultModeSlug } from "../../../shared/modes"
1112
import { experimentDefault } from "../../../shared/experiments"
1213
import { Cline } from "../../Cline"
@@ -193,6 +194,11 @@ jest.mock("../../../utils/sound", () => ({
193194
setSoundEnabled: jest.fn(),
194195
}))
195196

197+
// Mock tts utility. ClineProvider imports both playTts and setTtsEnabled, so
// both must exist on the mock; otherwise handling a "playTts" message would
// call an undefined function.
jest.mock("../../../utils/tts", () => ({
	playTts: jest.fn(),
	setTtsEnabled: jest.fn(),
}))
201+
196202
// Mock ESM modules
197203
jest.mock("p-wait-for", () => ({
198204
__esModule: true,
@@ -423,6 +429,7 @@ describe("ClineProvider", () => {
423429
alwaysAllowMcp: false,
424430
uriScheme: "vscode",
425431
soundEnabled: false,
432+
ttsEnabled: false,
426433
diffEnabled: false,
427434
enableCheckpoints: false,
428435
checkpointStorage: "task",
@@ -517,6 +524,7 @@ describe("ClineProvider", () => {
517524
expect(state).toHaveProperty("alwaysAllowBrowser")
518525
expect(state).toHaveProperty("taskHistory")
519526
expect(state).toHaveProperty("soundEnabled")
527+
expect(state).toHaveProperty("ttsEnabled")
520528
expect(state).toHaveProperty("diffEnabled")
521529
expect(state).toHaveProperty("writeDelayMs")
522530
})
@@ -588,6 +596,18 @@ describe("ClineProvider", () => {
588596
expect(setSoundEnabled).toHaveBeenCalledWith(false)
589597
expect(mockContext.globalState.update).toHaveBeenCalledWith("soundEnabled", false)
590598
expect(mockPostMessage).toHaveBeenCalled()
599+
600+
// Simulate setting tts to enabled
601+
await messageHandler({ type: "ttsEnabled", bool: true })
602+
expect(setTtsEnabled).toHaveBeenCalledWith(true)
603+
expect(mockContext.globalState.update).toHaveBeenCalledWith("ttsEnabled", true)
604+
expect(mockPostMessage).toHaveBeenCalled()
605+
606+
// Simulate setting tts to disabled
607+
await messageHandler({ type: "ttsEnabled", bool: false })
608+
expect(setTtsEnabled).toHaveBeenCalledWith(false)
609+
expect(mockContext.globalState.update).toHaveBeenCalledWith("ttsEnabled", false)
610+
expect(mockPostMessage).toHaveBeenCalled()
591611
})
592612

593613
test("requestDelaySeconds defaults to 5 seconds", async () => {

src/shared/ExtensionMessage.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,7 @@ export interface ExtensionState {
115115
currentTaskItem?: HistoryItem
116116
allowedCommands?: string[]
117117
soundEnabled?: boolean
118+
ttsEnabled?: boolean
118119
soundVolume?: number
119120
diffEnabled?: boolean
120121
enableCheckpoints: boolean

src/shared/WebviewMessage.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,9 @@ export interface WebviewMessage {
4949
| "alwaysAllowMcp"
5050
| "alwaysAllowModeSwitch"
5151
| "playSound"
52+
| "playTts"
5253
| "soundEnabled"
54+
| "ttsEnabled"
5355
| "soundVolume"
5456
| "diffEnabled"
5557
| "enableCheckpoints"

src/shared/globalState.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ export const GLOBAL_STATE_KEYS = [
5757
"openRouterUseMiddleOutTransform",
5858
"allowedCommands",
5959
"soundEnabled",
60+
"ttsEnabled",
6061
"soundVolume",
6162
"diffEnabled",
6263
"enableCheckpoints",

src/utils/tts.ts

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
import * as vscode from "vscode"
2+
3+
// Whether speech is currently allowed; written by setTtsEnabled.
let isTtsEnabled = false
4+
// True while processQueue is speaking an utterance; used to avoid overlapping speech.
let isSpeaking = false
5+
// Pending utterances in arrival order: playTts appends, processQueue removes from the front.
const utteranceQueue: string[] = []
6+
7+
/**
 * Enable or disable text-to-speech.
 *
 * Disabling also discards every utterance still waiting in the queue, so
 * stale messages are not read aloud when speech is turned back on later.
 * An utterance that is already being spoken is not interrupted here.
 *
 * @param enabled - `true` to allow speech, `false` to silence it.
 */
export const setTtsEnabled = (enabled: boolean): void => {
	isTtsEnabled = enabled

	if (!enabled) {
		// Truncate in place: the queue is a shared `const` array.
		utteranceQueue.length = 0
	}
}
14+
15+
/**
16+
* Process the next item in the utterance queue
17+
*/
18+
const processQueue = async (): Promise<void> => {
19+
if (!isTtsEnabled || isSpeaking || utteranceQueue.length === 0) {
20+
return
21+
}
22+
23+
try {
24+
isSpeaking = true
25+
const nextUtterance = utteranceQueue.shift()!
26+
const say = require("say")
27+
28+
// Wrap say.speak in a promise to handle completion
29+
await new Promise<void>((resolve, reject) => {
30+
say.speak(nextUtterance, null, null, (err: Error) => {
31+
if (err) {
32+
reject(err)
33+
} else {
34+
resolve()
35+
}
36+
})
37+
})
38+
39+
isSpeaking = false
40+
// Process next item in queue if any
41+
await processQueue()
42+
} catch (error: any) {
43+
isSpeaking = false
44+
vscode.window.showErrorMessage(error.message)
45+
// Try to continue with next item despite error
46+
await processQueue()
47+
}
48+
}
49+
50+
/**
51+
* Queue a tts message to be spoken
52+
* @param message string
53+
* @return void
54+
*/
55+
export const playTts = async (message: string): Promise<void> => {
56+
if (!isTtsEnabled) {
57+
return
58+
}
59+
60+
try {
61+
utteranceQueue.push(message)
62+
await processQueue()
63+
} catch (error: any) {
64+
vscode.window.showErrorMessage(error.message)
65+
}
66+
}

webview-ui/src/components/chat/ChatView.tsx

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ const ChatView = ({ isHidden, showAnnouncement, hideAnnouncement, showHistoryVie
8686
const disableAutoScrollRef = useRef(false)
8787
const [showScrollToBottom, setShowScrollToBottom] = useState(false)
8888
const [isAtBottom, setIsAtBottom] = useState(false)
89+
const lastTtsRef = useRef<string>("")
8990

9091
const [wasStreaming, setWasStreaming] = useState<boolean>(false)
9192
const [showCheckpointWarning, setShowCheckpointWarning] = useState<boolean>(false)
@@ -99,6 +100,10 @@ const ChatView = ({ isHidden, showAnnouncement, hideAnnouncement, showHistoryVie
99100
vscode.postMessage({ type: "playSound", audioType })
100101
}
101102

103+
// Ask the extension host to speak `text` by posting a "playTts" message.
function playTts(text: string) {
	const request = { type: "playTts" as const, text }
	vscode.postMessage(request)
}
106+
102107
useDeepCompareEffect(() => {
103108
// if last message is an ask, show user ask UI
104109
// if user finished a task, then start a new task with a new conversation history since in this moment that the extension is waiting for user response, the user could close the extension and the conversation history would be lost.
@@ -659,6 +664,25 @@ const ChatView = ({ isHidden, showAnnouncement, hideAnnouncement, showHistoryVie
659664
)
660665

661666
useEffect(() => {
667+
// skip input message
668+
if (lastMessage && messages.length > 1) {
669+
let text = lastMessage?.text || ""
670+
671+
if (
672+
lastMessage.type === "say" && // is a say message
673+
!lastMessage.partial && // not a partial message
674+
!text.startsWith("{") && // not a json object
675+
text !== lastTtsRef.current // not the same as last TTS message
676+
) {
677+
try {
678+
playTts(text)
679+
lastTtsRef.current = text
680+
} catch (error) {
681+
console.error("Failed to execute text-to-speech:", error)
682+
}
683+
}
684+
}
685+
662686
// Only execute when isStreaming changes from true to false
663687
if (wasStreaming && !isStreaming && lastMessage) {
664688
// Play appropriate sound based on lastMessage content
@@ -691,7 +715,7 @@ const ChatView = ({ isHidden, showAnnouncement, hideAnnouncement, showHistoryVie
691715
}
692716
// Update previous value
693717
setWasStreaming(isStreaming)
694-
}, [isStreaming, lastMessage, wasStreaming, isAutoApproved])
718+
}, [isStreaming, lastMessage, wasStreaming, isAutoApproved, messages.length])
695719

696720
const isBrowserSessionMessage = (message: ClineMessage): boolean => {
697721
// which of visible messages are browser session messages, see above

webview-ui/src/components/settings/NotificationSettings.tsx

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,14 @@ import { SectionHeader } from "./SectionHeader"
77
import { Section } from "./Section"
88

99
type NotificationSettingsProps = HTMLAttributes<HTMLDivElement> & {
10+
ttsEnabled?: boolean
1011
soundEnabled?: boolean
1112
soundVolume?: number
12-
setCachedStateField: SetCachedStateField<"soundEnabled" | "soundVolume">
13+
setCachedStateField: SetCachedStateField<"ttsEnabled" | "soundEnabled" | "soundVolume">
1314
}
1415

1516
export const NotificationSettings = ({
17+
ttsEnabled,
1618
soundEnabled,
1719
soundVolume,
1820
setCachedStateField,
@@ -28,6 +30,16 @@ export const NotificationSettings = ({
2830
</SectionHeader>
2931

3032
<Section>
33+
<div>
34+
<VSCodeCheckbox
35+
checked={ttsEnabled}
36+
onChange={(e: any) => setCachedStateField("ttsEnabled", e.target.checked)}>
37+
<span className="font-medium">Enable text-to-speech</span>
38+
</VSCodeCheckbox>
39+
<p className="text-vscode-descriptionForeground text-sm mt-0">
40+
When enabled, Roo will read aloud its responses using text-to-speech.
41+
</p>
42+
</div>
3143
<div>
3244
<VSCodeCheckbox
3345
checked={soundEnabled}

0 commit comments

Comments
 (0)