fix: integrate Gemini grounding sources into assistant message

roomote · roomote · commit a9174a8bb45c · 2025-07-29T17:19:38.000Z
- Modified streaming logic to append grounding sources to the last text chunk instead of yielding as separate message - Added tracking of content yielding to ensure sources only appear when content exists - Added comprehensive test coverage for grounding functionality including edge cases - Fixes issue where grounding sources appeared as separate message bubbles Fixes #6372
diff --git a/src/api/providers/__tests__/gemini.spec.ts b/src/api/providers/__tests__/gemini.spec.ts
@@ -7,6 +7,15 @@ import { type ModelInfo, geminiDefaultModelId } from "@roo-code/types"
 import { t } from "i18next"
 import { GeminiHandler } from "../gemini"
 
+// Mock the translation function
+vitest.mock("i18next", () => ({
+	t: vitest.fn((key: string) => {
+		if (key === "common:errors.gemini.sources") return "Sources:"
+		if (key === "common:errors.gemini.generate_complete_prompt") return "Gemini completion error: {{error}}"
+		return key
+	}),
+}))
+
 const GEMINI_20_FLASH_THINKING_NAME = "gemini-2.0-flash-thinking-exp-1219"
 
 describe("GeminiHandler", () => {
@@ -102,6 +111,155 @@ describe("GeminiHandler", () => {
 				}
 			}).rejects.toThrow()
 		})
+
+		it("should integrate grounding sources into the assistant message", async () => {
+			// Setup the mock implementation to return an async generator with grounding metadata
+			;(handler["client"].models.generateContentStream as any).mockResolvedValue({
+				[Symbol.asyncIterator]: async function* () {
+					yield {
+						candidates: [
+							{
+								content: {
+									parts: [{ text: "Here is some information about AI." }],
+								},
+								groundingMetadata: {
+									groundingChunks: [
+										{ web: { uri: "https://example.com/ai-info" } },
+										{ web: { uri: "https://example.com/ai-research" } },
+									],
+								},
+							},
+						],
+					}
+					yield { usageMetadata: { promptTokenCount: 10, candidatesTokenCount: 15 } }
+				},
+			})
+
+			const stream = handler.createMessage(systemPrompt, mockMessages)
+			const chunks = []
+
+			for await (const chunk of stream) {
+				chunks.push(chunk)
+			}
+
+			// Should have 3 chunks: main content, sources, and usage info
+			expect(chunks.length).toBe(3)
+			expect(chunks[0]).toEqual({ type: "text", text: "Here is some information about AI." })
+			expect(chunks[1]).toEqual({
+				type: "text",
+				text: "\n\nSources: [1](https://example.com/ai-info), [2](https://example.com/ai-research)",
+			})
+			expect(chunks[2]).toEqual({ type: "usage", inputTokens: 10, outputTokens: 15 })
+		})
+
+		it("should handle grounding metadata without web sources", async () => {
+			// Setup the mock implementation with grounding metadata but no web sources
+			;(handler["client"].models.generateContentStream as any).mockResolvedValue({
+				[Symbol.asyncIterator]: async function* () {
+					yield {
+						candidates: [
+							{
+								content: {
+									parts: [{ text: "Response without web sources." }],
+								},
+								groundingMetadata: {
+									groundingChunks: [{ someOtherSource: { data: "non-web-source" } }],
+								},
+							},
+						],
+					}
+					yield { usageMetadata: { promptTokenCount: 5, candidatesTokenCount: 8 } }
+				},
+			})
+
+			const stream = handler.createMessage(systemPrompt, mockMessages)
+			const chunks = []
+
+			for await (const chunk of stream) {
+				chunks.push(chunk)
+			}
+
+			// Should have 2 chunks: main content and usage info (no sources since no web URIs)
+			expect(chunks.length).toBe(2)
+			expect(chunks[0]).toEqual({ type: "text", text: "Response without web sources." })
+			expect(chunks[1]).toEqual({ type: "usage", inputTokens: 5, outputTokens: 8 })
+		})
+
+		it("should not yield sources when no content is generated", async () => {
+			// Setup the mock implementation with grounding metadata but no content
+			;(handler["client"].models.generateContentStream as any).mockResolvedValue({
+				[Symbol.asyncIterator]: async function* () {
+					yield {
+						candidates: [
+							{
+								groundingMetadata: {
+									groundingChunks: [{ web: { uri: "https://example.com/source" } }],
+								},
+							},
+						],
+					}
+					yield { usageMetadata: { promptTokenCount: 5, candidatesTokenCount: 0 } }
+				},
+			})
+
+			const stream = handler.createMessage(systemPrompt, mockMessages)
+			const chunks = []
+
+			for await (const chunk of stream) {
+				chunks.push(chunk)
+			}
+
+			// Should only have usage info, no sources since no content was yielded
+			expect(chunks.length).toBe(1)
+			expect(chunks[0]).toEqual({ type: "usage", inputTokens: 5, outputTokens: 0 })
+		})
+
+		it("should handle multiple text chunks with grounding sources", async () => {
+			// Setup the mock implementation with multiple text chunks and grounding
+			;(handler["client"].models.generateContentStream as any).mockResolvedValue({
+				[Symbol.asyncIterator]: async function* () {
+					yield {
+						candidates: [
+							{
+								content: {
+									parts: [{ text: "First part of response" }],
+								},
+							},
+						],
+					}
+					yield {
+						candidates: [
+							{
+								content: {
+									parts: [{ text: " and second part." }],
+								},
+								groundingMetadata: {
+									groundingChunks: [{ web: { uri: "https://example.com/source1" } }],
+								},
+							},
+						],
+					}
+					yield { usageMetadata: { promptTokenCount: 12, candidatesTokenCount: 18 } }
+				},
+			})
+
+			const stream = handler.createMessage(systemPrompt, mockMessages)
+			const chunks = []
+
+			for await (const chunk of stream) {
+				chunks.push(chunk)
+			}
+
+			// Should have 4 chunks: two text chunks, sources, and usage info
+			expect(chunks.length).toBe(4)
+			expect(chunks[0]).toEqual({ type: "text", text: "First part of response" })
+			expect(chunks[1]).toEqual({ type: "text", text: " and second part." })
+			expect(chunks[2]).toEqual({
+				type: "text",
+				text: "\n\nSources: [1](https://example.com/source1)",
+			})
+			expect(chunks[3]).toEqual({ type: "usage", inputTokens: 12, outputTokens: 18 })
+		})
 	})
 
 	describe("completePrompt", () => {
@@ -143,6 +301,38 @@ describe("GeminiHandler", () => {
 			const result = await handler.completePrompt("Test prompt")
 			expect(result).toBe("")
 		})
+
+		it("should integrate grounding sources in completePrompt", async () => {
+			// Mock the response with grounding metadata
+			;(handler["client"].models.generateContent as any).mockResolvedValue({
+				text: "AI is a fascinating field of study.",
+				candidates: [
+					{
+						groundingMetadata: {
+							groundingChunks: [
+								{ web: { uri: "https://example.com/ai-study" } },
+								{ web: { uri: "https://example.com/ai-research" } },
+							],
+						},
+					},
+				],
+			})
+
+			const result = await handler.completePrompt("Tell me about AI")
+			expect(result).toBe(
+				"AI is a fascinating field of study.\n\nSources: [1](https://example.com/ai-study), [2](https://example.com/ai-research)",
+			)
+		})
+
+		it("should handle completePrompt without grounding sources", async () => {
+			// Mock the response without grounding metadata
+			;(handler["client"].models.generateContent as any).mockResolvedValue({
+				text: "Simple response without sources.",
+			})
+
+			const result = await handler.completePrompt("Simple question")
+			expect(result).toBe("Simple response without sources.")
+		})
 	})
 
 	describe("getModel", () => {
diff --git a/src/api/providers/gemini.ts b/src/api/providers/gemini.ts
@@ -94,6 +94,8 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl
 
 			let lastUsageMetadata: GenerateContentResponseUsageMetadata | undefined
 			let pendingGroundingMetadata: GroundingMetadata | undefined
+			let lastTextChunk: string | null = null
+			let hasYieldedContent = false
 
 			for await (const chunk of result) {
 				// Process candidates and their parts to separate thoughts from content
@@ -114,6 +116,8 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl
 							} else {
 								// This is regular content
 								if (part.text) {
+									lastTextChunk = part.text
+									hasYieldedContent = true
 									yield { type: "text", text: part.text }
 								}
 							}
@@ -123,6 +127,8 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl
 
 				// Fallback to the original text property if no candidates structure
 				else if (chunk.text) {
+					lastTextChunk = chunk.text
+					hasYieldedContent = true
 					yield { type: "text", text: chunk.text }
 				}
 
@@ -131,10 +137,12 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl
 				}
 			}
 
-			if (pendingGroundingMetadata) {
+			// If we have grounding metadata and content was yielded, append sources to the last text chunk
+			if (pendingGroundingMetadata && hasYieldedContent) {
 				const citations = this.extractCitationsOnly(pendingGroundingMetadata)
 				if (citations) {
-					yield { type: "text", text: `\n\n${t("common:errors.gemini.sources")} ${citations}` }
+					const sourcesText = `\n\n${t("common:errors.gemini.sources")} ${citations}`
+					yield { type: "text", text: sourcesText }
 				}
 			}
 

Original file line number	Diff line number	Diff line change
`@@ -94,6 +94,8 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl`
`94`	`94`
`95`	`95`	`let lastUsageMetadata: GenerateContentResponseUsageMetadata \| undefined`
`96`	`96`	`let pendingGroundingMetadata: GroundingMetadata \| undefined`
	`97`	`+ let lastTextChunk: string \| null = null`
	`98`	`+ let hasYieldedContent = false`
`97`	`99`
`98`	`100`	`for await (const chunk of result) {`
`99`	`101`	`// Process candidates and their parts to separate thoughts from content`
`@@ -114,6 +116,8 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl`
`114`	`116`	`} else {`
`115`	`117`	`// This is regular content`
`116`	`118`	`if (part.text) {`
	`119`	`+ lastTextChunk = part.text`
	`120`	`+ hasYieldedContent = true`
`117`	`121`	`yield { type: "text", text: part.text }`
`118`	`122`	`}`
`119`	`123`	`}`
`@@ -123,6 +127,8 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl`
`123`	`127`
`124`	`128`	`// Fallback to the original text property if no candidates structure`
`125`	`129`	`else if (chunk.text) {`
	`130`	`+ lastTextChunk = chunk.text`
	`131`	`+ hasYieldedContent = true`
`126`	`132`	`yield { type: "text", text: chunk.text }`
`127`	`133`	`}`
`128`	`134`
`@@ -131,10 +137,12 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl`
`131`	`137`	`}`
`132`	`138`	`}`
`133`	`139`
`134`		`- if (pendingGroundingMetadata) {`
	`140`	`+ // If we have grounding metadata and content was yielded, append sources to the last text chunk`
	`141`	`+ if (pendingGroundingMetadata && hasYieldedContent) {`
`135`	`142`	`const citations = this.extractCitationsOnly(pendingGroundingMetadata)`
`136`	`143`	`if (citations) {`
`137`		- yield { type: "text", text: `\n\n${t("common:errors.gemini.sources")} ${citations}` }
	`144`	+ const sourcesText = `\n\n${t("common:errors.gemini.sources")} ${citations}`
	`145`	`+ yield { type: "text", text: sourcesText }`
`138`	`146`	`}`
`139`	`147`	`}`
`140`	`148`