Skip to content

Commit 3072d38

Browse files
Shuowei Li and shuoweil authored
feat: Update llm.TextEmbeddingGenerator to 005 (#1186)
* docs(bigquery): update minor parts in base.py * docs(bigquery): update minor changes for bigframes/ml/base.py * update docs in semantics.py to match the text-embedding-005 update --------- Co-authored-by: Shuowei Li <[email protected]>
1 parent 0693a7d commit 3072d38

File tree

7 files changed

+19
-14
lines changed

7 files changed

+19
-14
lines changed

bigframes/ml/llm.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,9 +47,11 @@
4747
_EMBEDDING_GENERATOR_GECKO_MULTILINGUAL_ENDPOINT,
4848
)
4949

50+
_TEXT_EMBEDDING_005_ENDPOINT = "text-embedding-005"
5051
_TEXT_EMBEDDING_004_ENDPOINT = "text-embedding-004"
5152
_TEXT_MULTILINGUAL_EMBEDDING_002_ENDPOINT = "text-multilingual-embedding-002"
5253
_TEXT_EMBEDDING_ENDPOINTS = (
54+
_TEXT_EMBEDDING_005_ENDPOINT,
5355
_TEXT_EMBEDDING_004_ENDPOINT,
5456
_TEXT_MULTILINGUAL_EMBEDDING_002_ENDPOINT,
5557
)
@@ -606,8 +608,8 @@ class TextEmbeddingGenerator(base.BaseEstimator):
606608
607609
Args:
608610
model_name (str, Default to "text-embedding-004"):
609-
The model for text embedding. Possible values are "text-embedding-004" or "text-multilingual-embedding-002".
610-
text-embedding models returns model embeddings for text inputs.
611+
The model for text embedding. Possible values are "text-embedding-005", "text-embedding-004"
612+
or "text-multilingual-embedding-002". text-embedding models returns model embeddings for text inputs.
611613
text-multilingual-embedding models returns model embeddings for text inputs which support over 100 languages.
612614
Default to "text-embedding-004".
613615
session (bigframes.Session or None):
@@ -621,7 +623,9 @@ def __init__(
621623
self,
622624
*,
623625
model_name: Literal[
624-
"text-embedding-004", "text-multilingual-embedding-002"
626+
"text-embedding-005",
627+
"text-embedding-004",
628+
"text-multilingual-embedding-002",
625629
] = "text-embedding-004",
626630
session: Optional[bigframes.Session] = None,
627631
connection_name: Optional[str] = None,

bigframes/ml/loader.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@
7171
llm._CLAUDE_3_SONNET_ENDPOINT: llm.Claude3TextGenerator,
7272
llm._CLAUDE_3_5_SONNET_ENDPOINT: llm.Claude3TextGenerator,
7373
llm._CLAUDE_3_OPUS_ENDPOINT: llm.Claude3TextGenerator,
74+
llm._TEXT_EMBEDDING_005_ENDPOINT: llm.TextEmbeddingGenerator,
7475
llm._TEXT_EMBEDDING_004_ENDPOINT: llm.TextEmbeddingGenerator,
7576
llm._TEXT_MULTILINGUAL_EMBEDDING_002_ENDPOINT: llm.TextEmbeddingGenerator,
7677
}

bigframes/operations/semantics.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -647,12 +647,12 @@ def search(
647647
>>> bigframes.options.experiments.semantic_operators = True
648648
649649
>>> import bigframes.ml.llm as llm
650-
>>> model = llm.TextEmbeddingGenerator(model_name="text-embedding-004")
650+
>>> model = llm.TextEmbeddingGenerator(model_name="text-embedding-005")
651651
652652
>>> df = bpd.DataFrame({"creatures": ["salmon", "sea urchin", "frog", "chimpanzee"]})
653653
>>> df.semantics.search("creatures", "monkey", top_k=1, model=model, score_column='distance')
654654
creatures distance
655-
3 chimpanzee 0.781101
655+
3 chimpanzee 0.635844
656656
<BLANKLINE>
657657
[1 rows x 2 columns]
658658
@@ -945,7 +945,7 @@ def sim_join(
945945
>>> bigframes.options.experiments.semantic_operators = True
946946
947947
>>> import bigframes.ml.llm as llm
948-
>>> model = llm.TextEmbeddingGenerator(model_name="text-embedding-004")
948+
>>> model = llm.TextEmbeddingGenerator(model_name="text-embedding-005")
949949
950950
>>> df1 = bpd.DataFrame({'animal': ['monkey', 'spider']})
951951
>>> df2 = bpd.DataFrame({'animal': ['scorpion', 'baboon']})

notebooks/experimental/semantic_operators.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,7 @@
151151
"source": [
152152
"import bigframes.ml.llm as llm\n",
153153
"gemini_model = llm.GeminiTextGenerator(model_name=llm._GEMINI_1P5_FLASH_001_ENDPOINT)\n",
154-
"text_embedding_model = llm.TextEmbeddingGenerator(model_name=\"text-embedding-004\")"
154+
"text_embedding_model = llm.TextEmbeddingGenerator(model_name=\"text-embedding-005\")"
155155
]
156156
},
157157
{

owlbot.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@
104104

105105
# Use a custom table of contents since the default one isn't organized well
106106
# enough for the number of classes we have.
107-
assert 1 == s.replace( # publish-docs.sh
107+
assert 1 == s.replace( # publish-docs.sh
108108
[".kokoro/publish-docs.sh"],
109109
(
110110
re.escape("# upload docs")
@@ -122,14 +122,14 @@
122122
)
123123

124124
# Fixup the documentation.
125-
assert 1 == s.replace( # docs/conf.py
125+
assert 1 == s.replace( # docs/conf.py
126126
["docs/conf.py"],
127127
re.escape("Google Cloud Client Libraries for bigframes"),
128128
"BigQuery DataFrames provides DataFrame APIs on the BigQuery engine",
129129
)
130130

131131
# Don't omit `*/core/*.py` when counting test coverages
132-
assert 1 == s.replace( # .coveragerc
132+
assert 1 == s.replace( # .coveragerc
133133
[".coveragerc"],
134134
re.escape(" */core/*.py\n"),
135135
"",

tests/system/large/operations/conftest.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,5 +29,5 @@ def gemini_flash_model(session, bq_connection) -> llm.GeminiTextGenerator:
2929
@pytest.fixture(scope="session")
3030
def text_embedding_generator(session, bq_connection) -> llm.TextEmbeddingGenerator:
3131
return llm.TextEmbeddingGenerator(
32-
session=session, connection_name=bq_connection, model_name="text-embedding-004"
32+
session=session, connection_name=bq_connection, model_name="text-embedding-005"
3333
)

tests/system/small/ml/test_llm.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -196,7 +196,7 @@ def test_text_generator_predict_with_params_success(
196196

197197
@pytest.mark.parametrize(
198198
"model_name",
199-
("text-embedding-004", "text-multilingual-embedding-002"),
199+
("text-embedding-005", "text-embedding-004", "text-multilingual-embedding-002"),
200200
)
201201
def test_create_load_text_embedding_generator_model(
202202
dataset_id, model_name, session, bq_connection
@@ -218,7 +218,7 @@ def test_create_load_text_embedding_generator_model(
218218

219219
@pytest.mark.parametrize(
220220
"model_name",
221-
("text-embedding-004", "text-multilingual-embedding-002"),
221+
("text-embedding-005", "text-embedding-004", "text-multilingual-embedding-002"),
222222
)
223223
@pytest.mark.flaky(retries=2)
224224
def test_text_embedding_generator_predict_default_params_success(
@@ -236,7 +236,7 @@ def test_text_embedding_generator_predict_default_params_success(
236236

237237
@pytest.mark.parametrize(
238238
"model_name",
239-
("text-embedding-004", "text-multilingual-embedding-002"),
239+
("text-embedding-005", "text-embedding-004", "text-multilingual-embedding-002"),
240240
)
241241
@pytest.mark.flaky(retries=2)
242242
def test_text_embedding_generator_multi_cols_predict_success(

0 commit comments

Comments
 (0)