
Commit b3a35c1

Support for new TaskTypes and output_dimensionality param (#285)
* Update Task Types for embeddings
* Add support for param
* format
* fix typo
* Add guard against negative output_dim
* Update docs
* async code match
* update tests
* format
* Update task types
1 parent 50f8c12 · commit b3a35c1

File tree: 3 files changed, +88 -6 lines

  google/generativeai/embedding.py
  tests/test_embedding.py
  tests/test_embedding_async.py


google/generativeai/embedding.py

Lines changed: 46 additions & 4 deletions
@@ -59,6 +59,14 @@
     EmbeddingTaskType.CLUSTERING: EmbeddingTaskType.CLUSTERING,
     5: EmbeddingTaskType.CLUSTERING,
     "clustering": EmbeddingTaskType.CLUSTERING,
+    6: EmbeddingTaskType.QUESTION_ANSWERING,
+    "question_answering": EmbeddingTaskType.QUESTION_ANSWERING,
+    "qa": EmbeddingTaskType.QUESTION_ANSWERING,
+    EmbeddingTaskType.QUESTION_ANSWERING: EmbeddingTaskType.QUESTION_ANSWERING,
+    7: EmbeddingTaskType.FACT_VERIFICATION,
+    "fact_verification": EmbeddingTaskType.FACT_VERIFICATION,
+    "verification": EmbeddingTaskType.FACT_VERIFICATION,
+    EmbeddingTaskType.FACT_VERIFICATION: EmbeddingTaskType.FACT_VERIFICATION,
 }


@@ -94,6 +102,7 @@ def embed_content(
     content: content_types.ContentType,
     task_type: EmbeddingTaskTypeOptions | None = None,
     title: str | None = None,
+    output_dimensionality: int | None = None,
     client: glm.GenerativeServiceClient | None = None,
     request_options: dict[str, Any] | None = None,
 ) -> text_types.EmbeddingDict: ...
@@ -105,6 +114,7 @@ def embed_content(
     content: Iterable[content_types.ContentType],
     task_type: EmbeddingTaskTypeOptions | None = None,
     title: str | None = None,
+    output_dimensionality: int | None = None,
     client: glm.GenerativeServiceClient | None = None,
     request_options: dict[str, Any] | None = None,
 ) -> text_types.BatchEmbeddingDict: ...
@@ -115,6 +125,7 @@ def embed_content(
     content: content_types.ContentType | Iterable[content_types.ContentType],
     task_type: EmbeddingTaskTypeOptions | None = None,
     title: str | None = None,
+    output_dimensionality: int | None = None,
     client: glm.GenerativeServiceClient = None,
     request_options: dict[str, Any] | None = None,
 ) -> text_types.EmbeddingDict | text_types.BatchEmbeddingDict:
@@ -135,6 +146,12 @@ def embed_content(
         title:
             An optional title for the text. Only applicable when task_type is
             `RETRIEVAL_DOCUMENT`.
+
+        output_dimensionality:
+            Optional reduced dimensionality for the output embeddings. If set,
+            excessive values from the output embeddings will be truncated from
+            the end.
+
         request_options:
             Options for the request.

@@ -155,14 +172,21 @@ def embed_content(
             "If a title is specified, the task must be a retrieval document type task."
         )

+    if output_dimensionality and output_dimensionality < 0:
+        raise ValueError("`output_dimensionality` must be a non-negative integer.")
+
     if task_type:
         task_type = to_task_type(task_type)

     if isinstance(content, Iterable) and not isinstance(content, (str, Mapping)):
         result = {"embedding": []}
         requests = (
             glm.EmbedContentRequest(
-                model=model, content=content_types.to_content(c), task_type=task_type, title=title
+                model=model,
+                content=content_types.to_content(c),
+                task_type=task_type,
+                title=title,
+                output_dimensionality=output_dimensionality,
             )
             for c in content
         )
@@ -177,7 +201,11 @@ def embed_content(
         return result
     else:
         embedding_request = glm.EmbedContentRequest(
-            model=model, content=content_types.to_content(content), task_type=task_type, title=title
+            model=model,
+            content=content_types.to_content(content),
+            task_type=task_type,
+            title=title,
+            output_dimensionality=output_dimensionality,
         )
         embedding_response = client.embed_content(
             embedding_request,
@@ -194,6 +222,7 @@ async def embed_content_async(
     content: content_types.ContentType,
     task_type: EmbeddingTaskTypeOptions | None = None,
     title: str | None = None,
+    output_dimensionality: int | None = None,
     client: glm.GenerativeServiceAsyncClient | None = None,
     request_options: dict[str, Any] | None = None,
 ) -> text_types.EmbeddingDict: ...
@@ -205,6 +234,7 @@ async def embed_content_async(
     content: Iterable[content_types.ContentType],
     task_type: EmbeddingTaskTypeOptions | None = None,
     title: str | None = None,
+    output_dimensionality: int | None = None,
     client: glm.GenerativeServiceAsyncClient | None = None,
     request_options: dict[str, Any] | None = None,
 ) -> text_types.BatchEmbeddingDict: ...
@@ -215,6 +245,7 @@ async def embed_content_async(
     content: content_types.ContentType | Iterable[content_types.ContentType],
     task_type: EmbeddingTaskTypeOptions | None = None,
     title: str | None = None,
+    output_dimensionality: int | None = None,
     client: glm.GenerativeServiceAsyncClient = None,
     request_options: dict[str, Any] | None = None,
 ) -> text_types.EmbeddingDict | text_types.BatchEmbeddingDict:
@@ -232,14 +263,21 @@ async def embed_content_async(
             "If a title is specified, the task must be a retrieval document type task."
         )

+    if output_dimensionality and output_dimensionality < 0:
+        raise ValueError("`output_dimensionality` must be a non-negative integer.")
+
     if task_type:
         task_type = to_task_type(task_type)

     if isinstance(content, Iterable) and not isinstance(content, (str, Mapping)):
         result = {"embedding": []}
         requests = (
             glm.EmbedContentRequest(
-                model=model, content=content_types.to_content(c), task_type=task_type, title=title
+                model=model,
+                content=content_types.to_content(c),
+                task_type=task_type,
+                title=title,
+                output_dimensionality=output_dimensionality,
             )
             for c in content
         )
@@ -254,7 +292,11 @@ async def embed_content_async(
         return result
     else:
         embedding_request = glm.EmbedContentRequest(
-            model=model, content=content_types.to_content(content), task_type=task_type, title=title
+            model=model,
+            content=content_types.to_content(content),
+            task_type=task_type,
+            title=title,
+            output_dimensionality=output_dimensionality,
         )
         embedding_response = await client.embed_content(
             embedding_request,
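
In practice, the new parameter and task-type aliases pass straight through `genai.embed_content`. A minimal usage sketch (the model name and the truncation expectation are assumptions for illustration; use an embedding model that supports dimensionality reduction):

import google.generativeai as genai

genai.configure(api_key="...")  # placeholder; assumes a valid API key

# Single input with a reduced embedding size and one of the new task types.
result = genai.embed_content(
    model="models/text-embedding-004",   # assumed model name for this example
    content="What is the airspeed of an unladen swallow?",
    task_type="question_answering",      # new alias mapped to EmbeddingTaskType.QUESTION_ANSWERING
    output_dimensionality=256,           # values beyond 256 are truncated from the end
)
print(len(result["embedding"]))          # expected 256 when the model honors the parameter

# An iterable of inputs returns one embedding per item.
batch = genai.embed_content(
    model="models/text-embedding-004",
    content=["claim one", "claim two"],
    task_type="fact_verification",       # also accepts "verification" or the integer 7
    output_dimensionality=256,
)
print(len(batch["embedding"]))           # 2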

tests/test_embedding.py

Lines changed: 34 additions & 1 deletion
@@ -122,9 +122,14 @@ def test_embed_content_title_and_task_2(self):
         text = "What are you?"
         with self.assertRaises(ValueError):
             embedding.embed_content(
-                model=DEFAULT_EMB_MODEL, content=text, task_type="unspecified", title="Exploring AI"
+                model=DEFAULT_EMB_MODEL, content=text, task_type="similarity", title="Exploring AI"
             )

+    def test_embed_content_with_negative_output_dimensionality(self):
+        text = "What are you?"
+        with self.assertRaises(ValueError):
+            embedding.embed_content(model=DEFAULT_EMB_MODEL, content=text, output_dimensionality=-1)
+
     def test_generate_answer_called_with_request_options(self):
         self.client.embed_content = mock.MagicMock()
         request = mock.ANY
@@ -174,6 +179,34 @@ def test_embed_content_called_with_request_options(self):

         self.client.embed_content.assert_called_once_with(request, **request_options)

+    @parameterized.named_parameters(
+        dict(
+            testcase_name="embedding.embed_content",
+            obj=embedding.embed_content,
+            aobj=embedding.embed_content_async,
+        ),
+    )
+    def test_async_code_match(self, obj, aobj):
+        import inspect
+        import re
+
+        source = inspect.getsource(obj)
+        asource = inspect.getsource(aobj)
+        source = re.sub('""".*"""', "", source, flags=re.DOTALL)
+        asource = re.sub('""".*"""', "", asource, flags=re.DOTALL)
+        asource = (
+            asource.replace("anext", "next")
+            .replace("aiter", "iter")
+            .replace("_async", "")
+            .replace("async ", "")
+            .replace("await ", "")
+            .replace("Async", "")
+            .replace("ASYNC_", "")
+        )
+
+        asource = re.sub(" *?# type: ignore", "", asource)
+        self.assertEqual(source, asource)
+

 if __name__ == "__main__":
     absltest.main()
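
The new test_async_code_match case keeps embed_content and embed_content_async in lockstep by comparing their source text after stripping docstrings and async-only syntax. A standalone sketch of the same idea, with illustrative function names that are not part of the SDK:

import inspect
import re


def normalized_source(fn) -> str:
    # Strip docstrings, async-only keywords, and "# type: ignore" comments,
    # mirroring the normalization done in test_async_code_match.
    src = inspect.getsource(fn)
    src = re.sub('""".*"""', "", src, flags=re.DOTALL)
    for needle in ("async ", "await ", "_async"):
        src = src.replace(needle, "")
    return re.sub(" *?# type: ignore", "", src)


def fetch(x):
    return x + 1


async def fetch_async(x):
    return x + 1


# The two definitions differ only in async syntax, so their normalized sources match.
assert normalized_source(fetch) == normalized_source(fetch_async)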

tests/test_embedding_async.py

Lines changed: 8 additions & 1 deletion
@@ -121,7 +121,14 @@ async def test_embed_content_async_title_and_task_2(self):
         text = "What are you?"
         with self.assertRaises(ValueError):
             await embedding.embed_content_async(
-                model=DEFAULT_EMB_MODEL, content=text, task_type="unspecified", title="Exploring AI"
+                model=DEFAULT_EMB_MODEL, content=text, task_type="similarity", title="Exploring AI"
+            )
+
+    async def test_embed_content_with_negative_output_dimensionality(self):
+        text = "What are you?"
+        with self.assertRaises(ValueError):
+            await embedding.embed_content_async(
+                model=DEFAULT_EMB_MODEL, content=text, output_dimensionality=-1
             )

     async def test_embed_content_called_with_request_options(self):
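
For completeness, a sketch of the async entry point with the new parameter; as above, the model name is an assumption for illustration, and genai.configure(...) is assumed to have been called already:

import asyncio

import google.generativeai as genai


async def main():
    result = await genai.embed_content_async(
        model="models/text-embedding-004",  # assumed model name
        content="What are you?",
        task_type="fact_verification",      # new alias added by this commit
        output_dimensionality=128,
    )
    print(len(result["embedding"]))


asyncio.run(main())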
