
Commit ec674d2

feat(transform, chat, gemini, media): enable video processing for Gemini
This commit introduces support for video content by updating the Gemini transformer and generalizing media handling in the UI. Key changes include:

- Extending gemini-format.ts to process video content blocks.
- Adding tests to gemini-format.spec.ts to validate video and mixed media handling.
- Refactoring the chat UI to use a generic MediaThumbnails component.
- Introducing a getMimeType utility for identifying media types from data URIs (see the sketch below).

Closes: #6144
1 parent 2411c8f commit ec674d2
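The getMimeType utility mentioned in the commit message lives in a file that is not included in this excerpt, so the snippet below is only a minimal sketch of how a data-URI media-type check might look; the function name and signature are assumptions, not taken from the diff.

// Minimal sketch of a data-URI MIME-type helper (assumed shape; the actual
// getMimeType added by this commit is not shown in this excerpt).
export function getMimeType(dataUri: string): string | undefined {
  // A data URI looks like "data:video/mp4;base64,AAAA"; the MIME type is the
  // token between "data:" and the first ";" or ",".
  const match = dataUri.match(/^data:([^;,]+)[;,]/)
  return match ? match[1] : undefined
}

// Example: telling pasted videos apart from images before rendering a thumbnail.
const mime = getMimeType("data:video/mp4;base64,AAAA")
const isVideo = mime?.startsWith("video/") ?? false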


8 files changed: +218 additions, -55 deletions


src/api/transform/__tests__/gemini-format.spec.ts

Lines changed: 64 additions & 0 deletions
@@ -71,13 +71,77 @@ describe("convertAnthropicMessageToGemini", () => {
     expect(result).toEqual({
       role: "user",
       parts: [
+        {
+          inlineData: {
+            data: "base64encodeddata",
+            mimeType: "image/jpeg",
+          },
+        },
         { text: "Check out this image:" },
+      ],
+    })
+  })
+
+  it("should convert a message with a video", () => {
+    const anthropicMessage: Anthropic.Messages.MessageParam = {
+      role: "user",
+      content: [
+        { type: "text", text: "Check out this video:" },
+        {
+          type: "video",
+          source: {
+            type: "base64",
+            media_type: "video/mp4",
+            data: "base64encodedvideodata",
+          },
+        } as any,
+      ],
+    }
+
+    const result = convertAnthropicMessageToGemini(anthropicMessage)
+
+    expect(result).toEqual({
+      role: "user",
+      parts: [
+        {
+          inlineData: {
+            data: "base64encodedvideodata",
+            mimeType: "video/mp4",
+          },
+        },
+        { text: "Check out this video:" },
+      ],
+    })
+  })
+
+  it("should handle text after inlineData", () => {
+    const anthropicMessage: Anthropic.Messages.MessageParam = {
+      role: "user",
+      content: [
+        {
+          type: "image",
+          source: {
+            type: "base64",
+            media_type: "image/jpeg",
+            data: "base64encodeddata",
+          },
+        },
+        { type: "text", text: "Check out this image:" },
+      ],
+    }
+
+    const result = convertAnthropicMessageToGemini(anthropicMessage)
+
+    expect(result).toEqual({
+      role: "user",
+      parts: [
         {
           inlineData: {
             data: "base64encodeddata",
             mimeType: "image/jpeg",
           },
         },
+        { text: "Check out this image:" },
       ],
     })
   })

src/api/transform/gemini-format.ts

Lines changed: 30 additions & 2 deletions
@@ -1,12 +1,24 @@
 import { Anthropic } from "@anthropic-ai/sdk"
 import { Content, Part } from "@google/genai"

-export function convertAnthropicContentToGemini(content: string | Anthropic.ContentBlockParam[]): Part[] {
+// Extended type to support video content blocks that aren't in the standard Anthropic SDK
+interface VideoContentBlock {
+  type: "video"
+  source: {
+    type: "base64"
+    data: string
+    media_type: string
+  }
+}
+
+type ExtendedContentBlockParam = Anthropic.ContentBlockParam | VideoContentBlock
+
+export function convertAnthropicContentToGemini(content: string | ExtendedContentBlockParam[]): Part[] {
   if (typeof content === "string") {
     return [{ text: content }]
   }

-  return content.flatMap((block): Part | Part[] => {
+  const parts = content.flatMap((block): Part | Part[] => {
     switch (block.type) {
       case "text":
         return { text: block.text }
@@ -15,6 +27,11 @@ export function convertAnthropicContentToGemini(content: string | Anthropic.Cont
           throw new Error("Unsupported image source type")
         }

+        return { inlineData: { data: block.source.data, mimeType: block.source.media_type } }
+      case "video":
+        if (block.source.type !== "base64") {
+          throw new Error("Unsupported video source type")
+        }
         return { inlineData: { data: block.source.data, mimeType: block.source.media_type } }
       case "tool_use":
         return {
@@ -68,6 +85,17 @@ export function convertAnthropicContentToGemini(content: string | Anthropic.Cont
         throw new Error(`Unsupported content block type: ${block.type}`)
     }
   })
+
+  // Sort parts to ensure inlineData comes before text
+  return parts.sort((a, b) => {
+    if ("inlineData" in a && "text" in b) {
+      return -1
+    }
+    if ("text" in a && "inlineData" in b) {
+      return 1
+    }
+    return 0
+  })
 }

 export function convertAnthropicMessageToGemini(message: Anthropic.Messages.MessageParam): Content {
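Taken together with the tests above, the transformer now accepts base64 video blocks and reorders parts so inlineData precedes text. A quick usage sketch based on that behavior (the relative import path is an assumption for illustration):

import { convertAnthropicMessageToGemini } from "../gemini-format"

const content = convertAnthropicMessageToGemini({
  role: "user",
  content: [
    { type: "text", text: "Describe this clip:" },
    {
      type: "video",
      source: { type: "base64", media_type: "video/mp4", data: "<base64>" },
    } as any, // "video" is not a stock Anthropic SDK block type, hence the cast
  ],
})

// With the sort step added in this commit, the inlineData part comes first:
// content.parts === [
//   { inlineData: { data: "<base64>", mimeType: "video/mp4" } },
//   { text: "Describe this clip:" },
// ]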

webview-ui/src/components/chat/ChatRow.tsx

Lines changed: 3 additions & 2 deletions
@@ -1069,8 +1069,8 @@ export const ChatRowContent = ({
         sendingDisabled={false}
         selectApiConfigDisabled={true}
         placeholderText={t("chat:editMessage.placeholder")}
-        selectedImages={editImages}
-        setSelectedImages={setEditImages}
+        selectedMedia={editImages}
+        setSelectedMedia={setEditImages}
         onSend={handleSaveEdit}
         onSelectImages={handleSelectImages}
         shouldDisableImages={false}
@@ -1079,6 +1079,7 @@ export const ChatRowContent = ({
         modeShortcutText=""
         isEditMode={true}
         onCancel={handleCancelEdit}
+        acceptedFileTypes={[]}
       />
     </div>
   ) : (

webview-ui/src/components/chat/ChatTextArea.tsx

Lines changed: 29 additions & 30 deletions
@@ -21,9 +21,9 @@ import {
 import { convertToMentionPath } from "@/utils/path-mentions"
 import { SelectDropdown, DropdownOptionType, Button, StandardTooltip } from "@/components/ui"

-import Thumbnails from "../common/Thumbnails"
 import ModeSelector from "./ModeSelector"
 import { MAX_IMAGES_PER_MESSAGE } from "./ChatView"
+import MediaThumbnails from "../common/MediaThumbnails"
 import ContextMenu from "./ContextMenu"
 import { VolumeX, Pin, Check, Image, WandSparkles, SendHorizontal } from "lucide-react"
 import { IndexingStatusBadge } from "./IndexingStatusBadge"
@@ -37,15 +37,16 @@ interface ChatTextAreaProps {
   sendingDisabled: boolean
   selectApiConfigDisabled: boolean
   placeholderText: string
-  selectedImages: string[]
-  setSelectedImages: React.Dispatch<React.SetStateAction<string[]>>
+  selectedMedia: string[]
+  setSelectedMedia: React.Dispatch<React.SetStateAction<string[]>>
   onSend: () => void
   onSelectImages: () => void
   shouldDisableImages: boolean
   onHeightChange?: (height: number) => void
   mode: Mode
   setMode: (value: Mode) => void
   modeShortcutText: string
+  acceptedFileTypes: string[]
   // Edit mode props
   isEditMode?: boolean
   onCancel?: () => void
@@ -59,8 +60,8 @@ const ChatTextArea = forwardRef<HTMLTextAreaElement, ChatTextAreaProps>(
       sendingDisabled,
       selectApiConfigDisabled,
       placeholderText,
-      selectedImages,
-      setSelectedImages,
+      selectedMedia,
+      setSelectedMedia,
       onSend,
       onSelectImages,
       shouldDisableImages,
@@ -70,6 +71,7 @@ const ChatTextArea = forwardRef<HTMLTextAreaElement, ChatTextAreaProps>(
       modeShortcutText,
       isEditMode = false,
      onCancel,
+      acceptedFileTypes,
    },
    ref,
  ) => {
@@ -598,17 +600,15 @@ const ChatTextArea = forwardRef<HTMLTextAreaElement, ChatTextAreaProps>(
        return
      }

-      const acceptedTypes = ["png", "jpeg", "webp"]
-
-      const imageItems = Array.from(items).filter((item) => {
+      const mediaItems = Array.from(items).filter((item) => {
        const [type, subtype] = item.type.split("/")
-        return type === "image" && acceptedTypes.includes(subtype)
+        return (type === "image" || type === "video") && acceptedFileTypes.includes(subtype)
      })

-      if (!shouldDisableImages && imageItems.length > 0) {
+      if (!shouldDisableImages && mediaItems.length > 0) {
        e.preventDefault()

-        const imagePromises = imageItems.map((item) => {
+        const mediaPromises = mediaItems.map((item) => {
          return new Promise<string | null>((resolve) => {
            const blob = item.getAsFile()

@@ -633,17 +633,17 @@ const ChatTextArea = forwardRef<HTMLTextAreaElement, ChatTextAreaProps>(
          })
        })

-        const imageDataArray = await Promise.all(imagePromises)
-        const dataUrls = imageDataArray.filter((dataUrl): dataUrl is string => dataUrl !== null)
+        const mediaDataArray = await Promise.all(mediaPromises)
+        const dataUrls = mediaDataArray.filter((dataUrl): dataUrl is string => dataUrl !== null)

        if (dataUrls.length > 0) {
-          setSelectedImages((prevImages) => [...prevImages, ...dataUrls].slice(0, MAX_IMAGES_PER_MESSAGE))
+          setSelectedMedia((prevItems) => [...prevItems, ...dataUrls].slice(0, MAX_IMAGES_PER_MESSAGE))
        } else {
          console.warn(t("chat:noValidImages"))
        }
      }
    },
-    [shouldDisableImages, setSelectedImages, cursorPosition, setInputValue, inputValue, t],
+    [shouldDisableImages, setSelectedMedia, cursorPosition, setInputValue, inputValue, t, acceptedFileTypes],
  )

  const handleMenuMouseDown = useCallback(() => {
@@ -732,15 +732,13 @@ const ChatTextArea = forwardRef<HTMLTextAreaElement, ChatTextAreaProps>(
      const files = Array.from(e.dataTransfer.files)

      if (files.length > 0) {
-        const acceptedTypes = ["png", "jpeg", "webp"]
-
-        const imageFiles = files.filter((file) => {
+        const mediaFiles = files.filter((file) => {
          const [type, subtype] = file.type.split("/")
-          return type === "image" && acceptedTypes.includes(subtype)
+          return (type === "image" || type === "video") && acceptedFileTypes.includes(subtype)
        })

-        if (!shouldDisableImages && imageFiles.length > 0) {
-          const imagePromises = imageFiles.map((file) => {
+        if (!shouldDisableImages && mediaFiles.length > 0) {
+          const mediaPromises = mediaFiles.map((file) => {
            return new Promise<string | null>((resolve) => {
              const reader = new FileReader()

@@ -758,12 +756,12 @@ const ChatTextArea = forwardRef<HTMLTextAreaElement, ChatTextAreaProps>(
            })
          })

-          const imageDataArray = await Promise.all(imagePromises)
-          const dataUrls = imageDataArray.filter((dataUrl): dataUrl is string => dataUrl !== null)
+          const mediaDataArray = await Promise.all(mediaPromises)
+          const dataUrls = mediaDataArray.filter((dataUrl): dataUrl is string => dataUrl !== null)

          if (dataUrls.length > 0) {
-            setSelectedImages((prevImages) =>
-              [...prevImages, ...dataUrls].slice(0, MAX_IMAGES_PER_MESSAGE),
+            setSelectedMedia((prevItems) =>
+              [...prevItems, ...dataUrls].slice(0, MAX_IMAGES_PER_MESSAGE),
            )

            if (typeof vscode !== "undefined") {
@@ -783,8 +781,9 @@ const ChatTextArea = forwardRef<HTMLTextAreaElement, ChatTextAreaProps>(
      setCursorPosition,
      setIntendedCursorPosition,
      shouldDisableImages,
-      setSelectedImages,
+      setSelectedMedia,
      t,
+      acceptedFileTypes,
    ],
  )

@@ -1268,10 +1267,10 @@ const ChatTextArea = forwardRef<HTMLTextAreaElement, ChatTextAreaProps>(
      )}
    </div>

-    {selectedImages.length > 0 && (
-      <Thumbnails
-        images={selectedImages}
-        setImages={setSelectedImages}
+    {selectedMedia.length > 0 && (
+      <MediaThumbnails
+        mediaItems={selectedMedia}
+        setMediaItems={setSelectedMedia}
        style={{
          left: "16px",
          zIndex: 2,
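The net effect in ChatTextArea is that the hard-coded ["png", "jpeg", "webp"] allow-list is gone: pasted and dropped files are accepted when they are image/* or video/* and their subtype appears in the new acceptedFileTypes prop. A distilled sketch of that predicate (the helper name is hypothetical; the component inlines the check):

function isAcceptedMedia(mimeType: string, acceptedFileTypes: string[]): boolean {
  const [type, subtype] = mimeType.split("/")
  // Both images and videos are allowed, but only for subtypes the caller opted into.
  return (type === "image" || type === "video") && acceptedFileTypes.includes(subtype)
}

// Example: a caller accepting png/jpeg/webp stills plus mp4 clips.
const accepted = ["png", "jpeg", "webp", "mp4"]
isAcceptedMedia("video/mp4", accepted) // true
isAcceptedMedia("image/gif", accepted) // false
isAcceptedMedia("video/mp4", []) // false (ChatRow's edit mode passes acceptedFileTypes={[]})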
