feat: support sparse embeddings & video input

hks · hks · commit e98d87f5d12e · 2025-08-12T12:05:43.000+08:00
diff --git a/volcenginesdkarkruntime/resources/multimodal_embeddings.py b/volcenginesdkarkruntime/resources/multimodal_embeddings.py
@@ -13,7 +13,10 @@
 )
 from .._utils._utils import with_sts_token, async_with_sts_token
 from ..types.multimodal_embedding import EmbeddingInputParam
-from ..types.multimodal_embedding import MultimodalEmbeddingResponse
+from ..types.multimodal_embedding import (
+    MultimodalEmbeddingResponse,
+    SparseEmbeddingInput,
+)
 
 __all__ = ["MultimodalEmbeddings", "AsyncMultimodalEmbeddings"]
 
@@ -31,6 +34,7 @@ def create(
         model: str,
         encoding_format: Literal["float", "base64"] = "float",
         dimensions: int | None = None,
+        sparse_embedding: SparseEmbeddingInput | None = None,
         # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
         # The extra values given here take precedence over values defined on the client or passed to this method.
         extra_headers: Headers | None = None,
@@ -45,6 +49,7 @@ def create(
                 "model": model,
                 "encoding_format": encoding_format,
                 "dimensions": dimensions,
+                "sparse_embedding": sparse_embedding,
             },
             options=make_request_options(
                 extra_headers=extra_headers,
@@ -69,6 +74,7 @@ async def create(
         model: str,
         encoding_format: Literal["float", "base64"] = "float",
         dimensions: int | None = None,
+        sparse_embedding: SparseEmbeddingInput | None = None,
         # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
         # The extra values given here take precedence over values defined on the client or passed to this method.
         extra_headers: Headers | None = None,
@@ -83,6 +89,7 @@ async def create(
                 "model": model,
                 "encoding_format": encoding_format,
                 "dimensions": dimensions,
+                "sparse_embedding": sparse_embedding,
             },
             options=make_request_options(
                 extra_headers=extra_headers,
diff --git a/volcenginesdkarkruntime/types/images/__init__.py b/volcenginesdkarkruntime/types/images/__init__.py
@@ -1 +1,3 @@
 from .images import ImagesResponse
+
+__all__ = ["ImagesResponse"]
diff --git a/volcenginesdkarkruntime/types/multimodal_embedding/__init__.py b/volcenginesdkarkruntime/types/multimodal_embedding/__init__.py
@@ -4,12 +4,17 @@
 from .embedding_content_part_text_param import MultimodalEmbeddingContentPartTextParam
 from .embedding_content_part_image_param import MultimodalEmbeddingContentPartImageParam
+from .embedding_content_part_video_param import MultimodalEmbeddingContentPartVideoParam
 from .embedding_input import EmbeddingInputParam
-from .embedding_data import MultimodalEmbedding
+from .embedding_data import MultimodalEmbedding, SparseEmbedding
+from .sparse_embedding_input import SparseEmbeddingInput
 
 __all__ = [
     "MultimodalEmbeddingResponse",
     "MultimodalEmbeddingContentPartTextParam",
     "MultimodalEmbeddingContentPartImageParam",
+    "MultimodalEmbeddingContentPartVideoParam",
     "EmbeddingInputParam",
     "MultimodalEmbedding",
+    "SparseEmbeddingInput",
+    "SparseEmbedding",
 ]
diff --git a/volcenginesdkarkruntime/types/multimodal_embedding/embedding_content_part_video_param.py b/volcenginesdkarkruntime/types/multimodal_embedding/embedding_content_part_video_param.py
@@ -0,0 +1,17 @@
+from __future__ import annotations
+
+from typing_extensions import Literal, Required, TypedDict
+
+__all__ = ["MultimodalEmbeddingContentPartVideoParam", "VideoURL"]
+
+
+class VideoURL(TypedDict, total=False):
+    url: Required[str]
+    """Either a URL of the video or the base64 encoded video data."""
+
+
+class MultimodalEmbeddingContentPartVideoParam(TypedDict, total=False):
+    video_url: Required[VideoURL]
+
+    type: Required[Literal["video_url"]]
+    """The type of the content part."""
diff --git a/volcenginesdkarkruntime/types/multimodal_embedding/embedding_data.py b/volcenginesdkarkruntime/types/multimodal_embedding/embedding_data.py
@@ -3,7 +3,14 @@
 
 from ..._models import BaseModel
 
-__all__ = ["MultimodalEmbedding"]
+__all__ = ["MultimodalEmbedding", "SparseEmbedding"]
+
+
+class SparseEmbedding(BaseModel):
+    index: int
+    """The token index of the embedding."""
+    value: float
+    """The value of the embedding."""
 
 
 class MultimodalEmbedding(BaseModel):
@@ -12,3 +19,6 @@ class MultimodalEmbedding(BaseModel):
 
     object: Literal["embedding"]
     """The object type, which is always "embedding"."""
+
+    sparse_embedding: List[SparseEmbedding]
+    """The list of sparse embedding entries (index/value pairs) generated by the model."""
diff --git a/volcenginesdkarkruntime/types/multimodal_embedding/embedding_input.py b/volcenginesdkarkruntime/types/multimodal_embedding/embedding_input.py
@@ -6,9 +6,12 @@
 from .embedding_content_part_text_param import (
     MultimodalEmbeddingContentPartTextParam,
 )
+from .embedding_content_part_video_param import MultimodalEmbeddingContentPartVideoParam
 
 __all__ = ["EmbeddingInputParam"]
 
 EmbeddingInputParam = Union[
-    MultimodalEmbeddingContentPartImageParam, MultimodalEmbeddingContentPartTextParam
+    MultimodalEmbeddingContentPartImageParam,
+    MultimodalEmbeddingContentPartTextParam,
+    MultimodalEmbeddingContentPartVideoParam,
 ]
diff --git a/volcenginesdkarkruntime/types/multimodal_embedding/embedding_response.py b/volcenginesdkarkruntime/types/multimodal_embedding/embedding_response.py
@@ -1,4 +1,3 @@
-from typing import List
 from typing_extensions import Literal
 
 from .embedding_data import MultimodalEmbedding
@@ -15,8 +14,8 @@ class MultimodalEmbeddingResponse(BaseModel):
     created: int
     """The Unix timestamp (in seconds) of when the embeddings was created."""
 
-    data: List[MultimodalEmbedding]
-    """The list of embeddings generated by the model."""
+    data: MultimodalEmbedding
+    """The embeddings generated by the model."""
 
     model: str
     """The name of the model used to generate the embedding."""
diff --git a/volcenginesdkarkruntime/types/multimodal_embedding/sparse_embedding_input.py b/volcenginesdkarkruntime/types/multimodal_embedding/sparse_embedding_input.py
@@ -0,0 +1,7 @@
+from typing_extensions import Literal, Required, TypedDict
+
+__all__ = ["SparseEmbeddingInput"]
+
+
+class SparseEmbeddingInput(TypedDict, total=False):
+    type: Required[Literal["enabled", "disabled"]]
diff --git a/volcenginesdkexamples/volcenginesdkarkruntime/sparse_embeddings.py b/volcenginesdkexamples/volcenginesdkarkruntime/sparse_embeddings.py
@@ -0,0 +1,25 @@
+from volcenginesdkarkruntime import Ark
+from volcenginesdkarkruntime.types.multimodal_embedding import MultimodalEmbeddingResponse
+
+client = Ark()
+
+print("----- multimodal embeddings request -----")
+resp: MultimodalEmbeddingResponse = client.multimodal_embeddings.create(
+    model="doubao-embedding-vision-250615",
+    input=[
+        {
+            "type": "text",
+            "text": "花椰菜又称菜花、花菜，是一种常见的蔬菜。"
+        }
+    ],
+    sparse_embedding={"type": "enabled"},  # enable sparse embedding
+)
+# dense embeddings
+print("---- dense embeddings ----")
+print(resp.data.embedding)
+
+# sparse embeddings
+print("---- sparse embeddings ----")
+for item in resp.data.sparse_embedding:
+    print(item)
+

Original file line number	Diff line number	Diff line change
`@@ -1 +1,3 @@`
`1`	`1`	`from .images import ImagesResponse`
	`2`	`+`
	`3`	`+__all__ = ["ImagesResponse"]`
Original file line number	Diff line number	Diff line change
`@@ -6,9 +6,12 @@`
`6`	`6`	`from .embedding_content_part_text_param import (`
`7`	`7`	`MultimodalEmbeddingContentPartTextParam,`
`8`	`8`	`)`
	`9`	`+from .embedding_content_part_video_param import MultimodalEmbeddingContentPartVideoParam`
`9`	`10`
`10`	`11`	`__all__ = ["EmbeddingInputParam"]`
`11`	`12`
`12`	`13`	`EmbeddingInputParam = Union[`
`13`		`- MultimodalEmbeddingContentPartImageParam, MultimodalEmbeddingContentPartTextParam`
	`14`	`+ MultimodalEmbeddingContentPartImageParam,`
	`15`	`+ MultimodalEmbeddingContentPartTextParam,`
	`16`	`+ MultimodalEmbeddingContentPartVideoParam,`
`14`	`17`	`]`