Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
0d4a743
Add text-to-speech functionality
heyseth Mar 6, 2025
4cd5545
Merge branch 'RooVetGit:main' into feature/textToSpeech
heyseth Mar 6, 2025
8d98ce6
Merge branch 'RooVetGit:main' into feature/textToSpeech
heyseth Mar 7, 2025
1a47e9d
Merge branch 'RooVetGit:main' into feature/textToSpeech
heyseth Mar 8, 2025
88cf106
Add speed config option to text-to-speech
heyseth Mar 8, 2025
8f19387
Fix test case for tts speed slider
heyseth Mar 8, 2025
a734d51
Fix test case for tts speed slider (really)
heyseth Mar 8, 2025
be9e57e
Disabled error message logging in tts.ts
heyseth Mar 9, 2025
409d67c
Merge branch 'RooVetGit:main' into feature/textToSpeech
heyseth Mar 9, 2025
da8a98c
Merge branch 'RooVetGit:main' into feature/textToSpeech
heyseth Mar 10, 2025
0b716f2
Merge branch 'RooVetGit:main' into feature/textToSpeech
heyseth Mar 11, 2025
2223762
Merge branch 'RooVetGit:main' into feature/textToSpeech
heyseth Mar 16, 2025
1b6b830
ignore markdown and mermaid diagrams in TTS
heyseth Mar 17, 2025
b4eed3f
Merge branch 'feature/textToSpeech' of https://github.com/heyseth/Roo…
heyseth Mar 17, 2025
552022d
add ttsEnabled and ttsSpeed to GlobalStateKey
heyseth Mar 17, 2025
5f32cb9
Merge remote-tracking branch 'upstream/main' into feature/textToSpeech
heyseth Mar 17, 2025
1d7de4b
fix failing webview test for save button
heyseth Mar 17, 2025
63d6d64
Merge remote-tracking branch 'origin/main' into feature/textToSpeech
mrubens Mar 18, 2025
d867e0e
Translations
mrubens Mar 18, 2025
730548e
Fix tests
mrubens Mar 18, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,7 @@
"pretty-bytes": "^6.1.1",
"puppeteer-chromium-resolver": "^23.0.0",
"puppeteer-core": "^23.4.0",
"say": "^0.16.0",
"serialize-error": "^11.0.3",
"simple-git": "^3.27.0",
"sound-play": "^1.1.0",
Expand Down
20 changes: 20 additions & 0 deletions src/core/webview/ClineProvider.ts
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ import { McpHub } from "../../services/mcp/McpHub"
import { McpServerManager } from "../../services/mcp/McpServerManager"
import { fileExistsAtPath } from "../../utils/fs"
import { playSound, setSoundEnabled, setSoundVolume } from "../../utils/sound"
import { playTts, setTtsEnabled } from "../../utils/tts"
import { singleCompletionHandler } from "../../utils/single-completion-handler"
import { searchCommits } from "../../utils/git"
import { getDiffStrategy } from "../diff/DiffStrategy"
Expand Down Expand Up @@ -394,6 +395,11 @@ export class ClineProvider implements vscode.WebviewViewProvider {
setSoundEnabled(soundEnabled ?? false)
})

// Initialize tts enabled state
this.getState().then(({ ttsEnabled }) => {
setTtsEnabled(ttsEnabled ?? false)
})

webviewView.webview.options = {
// Allow scripts in the webview
enableScripts: true,
Expand Down Expand Up @@ -1204,6 +1210,17 @@ export class ClineProvider implements vscode.WebviewViewProvider {
setSoundVolume(soundVolume)
await this.postStateToWebview()
break
case "ttsEnabled":
const ttsEnabled = message.bool ?? true
await this.updateGlobalState("ttsEnabled", ttsEnabled)
setTtsEnabled(ttsEnabled) // Add this line to update the tts utility
await this.postStateToWebview()
break
case "playTts":
if (message.text) {
playTts(message.text)
}
break
case "diffEnabled":
const diffEnabled = message.bool ?? true
await this.updateGlobalState("diffEnabled", diffEnabled)
Expand Down Expand Up @@ -2125,6 +2142,7 @@ export class ClineProvider implements vscode.WebviewViewProvider {
alwaysAllowMcp,
alwaysAllowModeSwitch,
soundEnabled,
ttsEnabled,
diffEnabled,
enableCheckpoints,
checkpointStorage,
Expand Down Expand Up @@ -2176,6 +2194,7 @@ export class ClineProvider implements vscode.WebviewViewProvider {
.filter((item: HistoryItem) => item.ts && item.task)
.sort((a: HistoryItem, b: HistoryItem) => b.ts - a.ts),
soundEnabled: soundEnabled ?? false,
ttsEnabled: ttsEnabled ?? false,
diffEnabled: diffEnabled ?? true,
enableCheckpoints: enableCheckpoints ?? true,
checkpointStorage: checkpointStorage ?? "task",
Expand Down Expand Up @@ -2326,6 +2345,7 @@ export class ClineProvider implements vscode.WebviewViewProvider {
taskHistory: stateValues.taskHistory,
allowedCommands: stateValues.allowedCommands,
soundEnabled: stateValues.soundEnabled ?? false,
ttsEnabled: stateValues.ttsEnabled ?? false,
diffEnabled: stateValues.diffEnabled ?? true,
enableCheckpoints: stateValues.enableCheckpoints ?? false,
checkpointStorage: stateValues.checkpointStorage ?? "task",
Expand Down
20 changes: 20 additions & 0 deletions src/core/webview/__tests__/ClineProvider.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import { ClineProvider } from "../ClineProvider"
import { ExtensionMessage, ExtensionState } from "../../../shared/ExtensionMessage"
import { GlobalStateKey, SecretKey } from "../../../shared/globalState"
import { setSoundEnabled } from "../../../utils/sound"
import { setTtsEnabled } from "../../../utils/tts"
import { defaultModeSlug } from "../../../shared/modes"
import { experimentDefault } from "../../../shared/experiments"
import { Cline } from "../../Cline"
Expand Down Expand Up @@ -193,6 +194,11 @@ jest.mock("../../../utils/sound", () => ({
setSoundEnabled: jest.fn(),
}))

// Mock tts utility
jest.mock("../../../utils/tts", () => ({
setTtsEnabled: jest.fn(),
}))

// Mock ESM modules
jest.mock("p-wait-for", () => ({
__esModule: true,
Expand Down Expand Up @@ -423,6 +429,7 @@ describe("ClineProvider", () => {
alwaysAllowMcp: false,
uriScheme: "vscode",
soundEnabled: false,
ttsEnabled: false,
diffEnabled: false,
enableCheckpoints: false,
checkpointStorage: "task",
Expand Down Expand Up @@ -517,6 +524,7 @@ describe("ClineProvider", () => {
expect(state).toHaveProperty("alwaysAllowBrowser")
expect(state).toHaveProperty("taskHistory")
expect(state).toHaveProperty("soundEnabled")
expect(state).toHaveProperty("ttsEnabled")
expect(state).toHaveProperty("diffEnabled")
expect(state).toHaveProperty("writeDelayMs")
})
Expand Down Expand Up @@ -588,6 +596,18 @@ describe("ClineProvider", () => {
expect(setSoundEnabled).toHaveBeenCalledWith(false)
expect(mockContext.globalState.update).toHaveBeenCalledWith("soundEnabled", false)
expect(mockPostMessage).toHaveBeenCalled()

// Simulate setting tts to enabled
await messageHandler({ type: "ttsEnabled", bool: true })
expect(setTtsEnabled).toHaveBeenCalledWith(true)
expect(mockContext.globalState.update).toHaveBeenCalledWith("ttsEnabled", true)
expect(mockPostMessage).toHaveBeenCalled()

// Simulate setting tts to disabled
await messageHandler({ type: "ttsEnabled", bool: false })
expect(setTtsEnabled).toHaveBeenCalledWith(false)
expect(mockContext.globalState.update).toHaveBeenCalledWith("ttsEnabled", false)
expect(mockPostMessage).toHaveBeenCalled()
})

test("requestDelaySeconds defaults to 5 seconds", async () => {
Expand Down
1 change: 1 addition & 0 deletions src/shared/ExtensionMessage.ts
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ export interface ExtensionState {
currentTaskItem?: HistoryItem
allowedCommands?: string[]
soundEnabled?: boolean
ttsEnabled?: boolean
soundVolume?: number
diffEnabled?: boolean
enableCheckpoints: boolean
Expand Down
2 changes: 2 additions & 0 deletions src/shared/WebviewMessage.ts
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,9 @@ export interface WebviewMessage {
| "alwaysAllowMcp"
| "alwaysAllowModeSwitch"
| "playSound"
| "playTts"
| "soundEnabled"
| "ttsEnabled"
| "soundVolume"
| "diffEnabled"
| "enableCheckpoints"
Expand Down
1 change: 1 addition & 0 deletions src/shared/globalState.ts
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ export const GLOBAL_STATE_KEYS = [
"openRouterUseMiddleOutTransform",
"allowedCommands",
"soundEnabled",
"ttsEnabled",
"soundVolume",
"diffEnabled",
"enableCheckpoints",
Expand Down
66 changes: 66 additions & 0 deletions src/utils/tts.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import * as vscode from "vscode"

let isTtsEnabled = false
let isSpeaking = false
const utteranceQueue: string[] = []

/**
 * Switch text-to-speech on or off for this module.
 *
 * The flag is consulted by `playTts` before queueing a message and by
 * `processQueue` before speaking a queued one.
 *
 * @param enabled - `true` to allow speech, `false` to suppress it.
 */
export function setTtsEnabled(enabled: boolean): void {
	isTtsEnabled = enabled
}

/**
 * Drain the utterance queue, speaking one message at a time.
 *
 * Returns immediately when TTS is disabled, when a drain is already in
 * progress (`isSpeaking` is set), or when the queue is empty. Otherwise it
 * speaks queued utterances in FIFO order until the queue is empty or TTS is
 * switched off. A failure on one utterance is reported through
 * `vscode.window.showErrorMessage` and does not stop the remaining ones.
 *
 * @returns A promise that resolves once this drain has finished. It does not
 * reject for speech errors; those are surfaced to the user instead.
 */
const processQueue = async (): Promise<void> => {
	if (!isTtsEnabled || isSpeaking || utteranceQueue.length === 0) {
		return
	}

	isSpeaking = true
	try {
		// Loop rather than recurse: a long queue would otherwise nest one
		// pending promise per utterance. The enabled flag is re-checked before
		// every utterance so disabling TTS stops the drain after the current one.
		while (isTtsEnabled) {
			const nextUtterance = utteranceQueue.shift()
			if (nextUtterance === undefined) {
				break
			}

			try {
				// Loaded on use (not at module load); a load failure is then
				// reported like any other speech error for this utterance.
				const say = require("say")

				// say.speak is callback-based; adapt it so completion can be awaited.
				await new Promise<void>((resolve, reject) => {
					say.speak(nextUtterance, null, null, (err?: Error | null) => {
						if (err) {
							reject(err)
						} else {
							resolve()
						}
					})
				})
			} catch (error: unknown) {
				// Anything can be thrown/rejected, so narrow before reading `.message`.
				const detail = error instanceof Error ? error.message : String(error)
				vscode.window.showErrorMessage(detail)
			}
		}
	} finally {
		// Always release the guard so a later call can start a new drain.
		isSpeaking = false
	}
}

/**
 * Queue a message to be spoken and kick off queue processing.
 *
 * Does nothing while TTS is disabled. Blank or whitespace-only messages are
 * ignored, since there is nothing to speak. Any error raised while queueing or
 * processing is shown via `vscode.window.showErrorMessage` rather than thrown.
 *
 * @param message - Text to speak.
 * @returns A promise that resolves once `processQueue` has returned for this call.
 */
export const playTts = async (message: string): Promise<void> => {
	if (!isTtsEnabled) {
		return
	}

	// Skip empty input instead of handing a blank utterance to the speech engine.
	if (message.trim().length === 0) {
		return
	}

	try {
		utteranceQueue.push(message)
		await processQueue()
	} catch (error: unknown) {
		// Narrow before reading `.message`; thrown values are not guaranteed to be Errors.
		const detail = error instanceof Error ? error.message : String(error)
		vscode.window.showErrorMessage(detail)
	}
}
26 changes: 25 additions & 1 deletion webview-ui/src/components/chat/ChatView.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ const ChatView = ({ isHidden, showAnnouncement, hideAnnouncement, showHistoryVie
const disableAutoScrollRef = useRef(false)
const [showScrollToBottom, setShowScrollToBottom] = useState(false)
const [isAtBottom, setIsAtBottom] = useState(false)
const lastTtsRef = useRef<string>("")

const [wasStreaming, setWasStreaming] = useState<boolean>(false)
const [showCheckpointWarning, setShowCheckpointWarning] = useState<boolean>(false)
Expand All @@ -99,6 +100,10 @@ const ChatView = ({ isHidden, showAnnouncement, hideAnnouncement, showHistoryVie
vscode.postMessage({ type: "playSound", audioType })
}

// Forward `text` out of the webview as a "playTts" message via `vscode.postMessage`;
// the speaking itself is not done here.
function playTts(text: string) {
vscode.postMessage({ type: "playTts", text })
}

useDeepCompareEffect(() => {
// if last message is an ask, show user ask UI
// if user finished a task, then start a new task with a new conversation history since in this moment that the extension is waiting for user response, the user could close the extension and the conversation history would be lost.
Expand Down Expand Up @@ -659,6 +664,25 @@ const ChatView = ({ isHidden, showAnnouncement, hideAnnouncement, showHistoryVie
)

useEffect(() => {
// skip input message
if (lastMessage && messages.length > 1) {
let text = lastMessage?.text || ""

if (
lastMessage.type === "say" && // is a say message
!lastMessage.partial && // not a partial message
!text.startsWith("{") && // not a json object
text !== lastTtsRef.current // not the same as last TTS message
) {
try {
playTts(text)
lastTtsRef.current = text
} catch (error) {
console.error("Failed to execute text-to-speech:", error)
}
}
}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you mind explaining the logic here? Thank you!

Copy link
Contributor Author

@heyseth heyseth Mar 8, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hello, essentially I only want to read out the messages which the user would expect Roo to read, i.e., messages which appear in the chat interface. The first item in messages is the user input, which we don't need to read aloud. We also don't need to read aloud incomplete messages or JSON objects. The reason that I had it check whether the message type is `say` is that I didn't want Roo reading aloud `ask` messages such as this:

image

Maybe this last behavior should be a toggleable option though?

The code also stores a reference to the last spoken message to prevent duplicate responses from being read.

Copy link
Collaborator

@mrubens mrubens Mar 9, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just pulled the branch down and it did seem to read my messages back to me - is that unintended? It does also still seem to read mermaid and json.

Pretty cool experience overall though!

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@heyseth sorry I accidentally resolved this conversation somehow. Any ideas on my last question here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mrubens sorry for the late response! I'm writing some fixes now that should prevent the mermaid diagrams/json and user input messages from being read aloud


// Only execute when isStreaming changes from true to false
if (wasStreaming && !isStreaming && lastMessage) {
// Play appropriate sound based on lastMessage content
Expand Down Expand Up @@ -691,7 +715,7 @@ const ChatView = ({ isHidden, showAnnouncement, hideAnnouncement, showHistoryVie
}
// Update previous value
setWasStreaming(isStreaming)
}, [isStreaming, lastMessage, wasStreaming, isAutoApproved])
}, [isStreaming, lastMessage, wasStreaming, isAutoApproved, messages.length])

const isBrowserSessionMessage = (message: ClineMessage): boolean => {
// which of visible messages are browser session messages, see above
Expand Down
14 changes: 13 additions & 1 deletion webview-ui/src/components/settings/NotificationSettings.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,14 @@ import { SectionHeader } from "./SectionHeader"
import { Section } from "./Section"

type NotificationSettingsProps = HTMLAttributes<HTMLDivElement> & {
ttsEnabled?: boolean
soundEnabled?: boolean
soundVolume?: number
setCachedStateField: SetCachedStateField<"soundEnabled" | "soundVolume">
setCachedStateField: SetCachedStateField<"ttsEnabled" | "soundEnabled" | "soundVolume">
}

export const NotificationSettings = ({
ttsEnabled,
soundEnabled,
soundVolume,
setCachedStateField,
Expand All @@ -28,6 +30,16 @@ export const NotificationSettings = ({
</SectionHeader>

<Section>
<div>
<VSCodeCheckbox
checked={ttsEnabled}
onChange={(e: any) => setCachedStateField("ttsEnabled", e.target.checked)}>
<span className="font-medium">Enable text-to-speech</span>
</VSCodeCheckbox>
<p className="text-vscode-descriptionForeground text-sm mt-0">
When enabled, Roo will read aloud its responses using text-to-speech.
</p>
</div>
<div>
<VSCodeCheckbox
checked={soundEnabled}
Expand Down
3 changes: 3 additions & 0 deletions webview-ui/src/components/settings/SettingsView.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ const SettingsView = forwardRef<SettingsViewRef, SettingsViewProps>(({ onDone },
requestDelaySeconds,
screenshotQuality,
soundEnabled,
ttsEnabled,
soundVolume,
terminalOutputLineLimit,
writeDelayMs,
Expand Down Expand Up @@ -149,6 +150,7 @@ const SettingsView = forwardRef<SettingsViewRef, SettingsViewProps>(({ onDone },
vscode.postMessage({ type: "allowedCommands", commands: allowedCommands ?? [] })
vscode.postMessage({ type: "browserToolEnabled", bool: browserToolEnabled })
vscode.postMessage({ type: "soundEnabled", bool: soundEnabled })
vscode.postMessage({ type: "ttsEnabled", bool: ttsEnabled })
vscode.postMessage({ type: "soundVolume", value: soundVolume })
vscode.postMessage({ type: "diffEnabled", bool: diffEnabled })
vscode.postMessage({ type: "enableCheckpoints", bool: enableCheckpoints })
Expand Down Expand Up @@ -370,6 +372,7 @@ const SettingsView = forwardRef<SettingsViewRef, SettingsViewProps>(({ onDone },

<div ref={notificationsRef}>
<NotificationSettings
ttsEnabled={ttsEnabled}
soundEnabled={soundEnabled}
soundVolume={soundVolume}
setCachedStateField={setCachedStateField}
Expand Down
3 changes: 3 additions & 0 deletions webview-ui/src/context/ExtensionStateContext.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ export interface ExtensionStateContextType extends ExtensionState {
setAllowedCommands: (value: string[]) => void
setSoundEnabled: (value: boolean) => void
setSoundVolume: (value: number) => void
setTtsEnabled: (value: boolean) => void
setDiffEnabled: (value: boolean) => void
setEnableCheckpoints: (value: boolean) => void
setBrowserViewportSize: (value: string) => void
Expand Down Expand Up @@ -105,6 +106,7 @@ export const ExtensionStateContextProvider: React.FC<{ children: React.ReactNode
allowedCommands: [],
soundEnabled: false,
soundVolume: 0.5,
ttsEnabled: false,
diffEnabled: false,
enableCheckpoints: true,
checkpointStorage: "task",
Expand Down Expand Up @@ -242,6 +244,7 @@ export const ExtensionStateContextProvider: React.FC<{ children: React.ReactNode
setAllowedCommands: (value) => setState((prevState) => ({ ...prevState, allowedCommands: value })),
setSoundEnabled: (value) => setState((prevState) => ({ ...prevState, soundEnabled: value })),
setSoundVolume: (value) => setState((prevState) => ({ ...prevState, soundVolume: value })),
setTtsEnabled: (value) => setState((prevState) => ({ ...prevState, ttsEnabled: value })),
setDiffEnabled: (value) => setState((prevState) => ({ ...prevState, diffEnabled: value })),
setEnableCheckpoints: (value) => setState((prevState) => ({ ...prevState, enableCheckpoints: value })),
setBrowserViewportSize: (value: string) =>
Expand Down
Loading