Skip to content

Commit aedcbb3

Browse files
authored
Reinstate binary-encoding of vectors in Collections write path (#338)
* reinstate binary encoding of vectors in collection write path (Data API fixed 1710) * adjust tests on collection bin-enc + docstring for APIOptions * Adapt tests to binenc in collections and docker lagging behind
1 parent 744f63a commit aedcbb3

9 files changed

+56
-35
lines changed

CHANGES

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ Spawner methods for databases/admins standardized; they don't issue DevOps API c
1616
Support for Astra DB "custom domain" endpoints for database
1717
- in which case: `.id`, `.region`, `.get_database_admin()`, `.info()` and `.name()` aren't available.
1818
Support for the `indexType` field to describe table indexes (for compatibility, said field is not mandatory).
19+
Collections write path now obeys the binary-encoding API Option, which in turn defaults to True (formerly, binary encoding was always turned off).
1920
DataAPITime: support for "hh:mm" no-seconds format.
2021
DataAPIDuration: improved parse performance by caching regexes.
2122
DataAPIDuration: support for "P4W"-type strings and for zeroes such as "P", "-PR".

astrapy/data/utils/collection_converters.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -59,10 +59,7 @@ def preprocess_collection_payload_value(
5959
_value = convert_vector_to_floats(_value)
6060
# now _value is either a list or a DataAPIVector.
6161
# can/should it be binary-encoded?
62-
can_bin_encode = False
63-
# TODO: reinstate the following condition once the Data API
64-
# correctly excludes $binary from indexing for collections:
65-
# can_bin_encode = path[0] in {"insertOne", "insertMany"}
62+
can_bin_encode = path[0] in {"insertOne", "insertMany"}
6663
# will it be bin-encoded?
6764
if isinstance(_value, DataAPIVector):
6865
# if I can, I will

astrapy/utils/api_options.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -281,7 +281,6 @@ class SerdesOptions:
281281
as instances of `DataAPIVector`, while for collections this affects
282282
the encoding of the quantity found in the "$vector" field, if present,
283283
regardless of its representation in the method argument. Defaults to True.
284-
*Note: For release `2.0.0-preview`, binary encoding in collections is OFF.*
285284
custom_datatypes_in_reading: Read-Path. This setting determines whether return
286285
values from read methods should use astrapy custom classes (default setting
287286
of True), or try to use only standard-library data types instead (False).

tests/base/integration/collections/test_collection_dml_async.py

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -212,21 +212,21 @@ async def test_collection_vector_insertion_options_async(
212212
)
213213

214214
# check how the documents are stored
215-
# TODO: reinstate the expectation once collection regains conditional binenc.
216215
expect_binaries = {
217-
"Yb_Yc_()": False, # True,
218-
"Yb_Yc_[]": False, # True,
219-
"Yb_Yc_DV": False, # True,
220-
"Nb_Yc_()": False, # False,
221-
"Nb_Yc_[]": False, # False,
222-
"Nb_Yc_DV": False, # False,
216+
"Yb_Yc_()": True,
217+
"Yb_Yc_[]": True,
218+
"Yb_Yc_DV": True,
219+
"Nb_Yc_()": False,
220+
"Nb_Yc_[]": False,
221+
"Nb_Yc_DV": False,
223222
#
224-
"Yb_Nc_[]": False, # True,
225-
"Yb_Nc_DV": False, # True,
223+
"Yb_Nc_[]": True,
224+
"Yb_Nc_DV": True,
226225
#
227-
"Nb_Nc_[]": False, # False,
228-
"Nb_Nc_DV": False, # False,
226+
"Nb_Nc_[]": False,
227+
"Nb_Nc_DV": False,
229228
}
229+
230230
raw_find_response = await async_empty_collection.command(
231231
body={"find": {"projection": {"_id": True, "$vector": True}}},
232232
)

tests/base/integration/collections/test_collection_dml_sync.py

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -183,21 +183,21 @@ def test_collection_vector_insertion_options_sync(
183183
)
184184

185185
# check how the documents are stored
186-
# TODO: reinstate the expectation once collection regains conditional binenc.
187186
expect_binaries = {
188-
"Yb_Yc_()": False, # True,
189-
"Yb_Yc_[]": False, # True,
190-
"Yb_Yc_DV": False, # True,
191-
"Nb_Yc_()": False, # False,
192-
"Nb_Yc_[]": False, # False,
193-
"Nb_Yc_DV": False, # False,
187+
"Yb_Yc_()": True,
188+
"Yb_Yc_[]": True,
189+
"Yb_Yc_DV": True,
190+
"Nb_Yc_()": False,
191+
"Nb_Yc_[]": False,
192+
"Nb_Yc_DV": False,
194193
#
195-
"Yb_Nc_[]": False, # True,
196-
"Yb_Nc_DV": False, # True,
194+
"Yb_Nc_[]": True,
195+
"Yb_Nc_DV": True,
197196
#
198-
"Nb_Nc_[]": False, # False,
199-
"Nb_Nc_DV": False, # False,
197+
"Nb_Nc_[]": False,
198+
"Nb_Nc_DV": False,
200199
}
200+
201201
raw_find_response = sync_empty_collection.command(
202202
body={"find": {"projection": {"_id": True, "$vector": True}}},
203203
)

tests/base/integration/collections/test_collection_vectorize_methods_async.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,18 @@
1919
import pytest
2020

2121
from astrapy import AsyncDatabase
22+
from astrapy.api_options import APIOptions, SerdesOptions
2223
from astrapy.constants import DefaultDocumentType
2324
from astrapy.cursors import AsyncCollectionFindCursor
2425
from astrapy.data_types import DataAPIVector
2526
from astrapy.exceptions import CollectionInsertManyException, DataAPIResponseException
2627
from astrapy.info import CollectionDefinition
2728

28-
from ..conftest import HEADER_EMBEDDING_API_KEY_OPENAI, DefaultAsyncCollection
29+
from ..conftest import (
30+
HEADER_EMBEDDING_API_KEY_OPENAI,
31+
IS_ASTRA_DB,
32+
DefaultAsyncCollection,
33+
)
2934

3035

3136
@pytest.mark.skipif(
@@ -42,13 +47,19 @@ async def test_collection_methods_vectorize_async(
4247
acol = async_empty_service_collection
4348
service_vector_dimension = service_collection_parameters["dimension"]
4449

50+
# TODO: binary encoding of vectors is disabled on non-Astra because docker image
51+
# 1.0.20-ct1 does not include fix 1738 yet (long non-indexed bin-encoded strings)
52+
binencoptions = APIOptions(
53+
serdes_options=SerdesOptions(binary_encode_vectors=IS_ASTRA_DB)
54+
)
55+
4556
await acol.insert_one({"t": "tower", "$vectorize": "How high is this tower?"})
4657
await acol.insert_one({"t": "vectorless"})
47-
await acol.insert_one(
58+
await acol.with_options(api_options=binencoptions).insert_one(
4859
{"t": "vectorful", "$vector": [0.01] * service_vector_dimension},
4960
)
5061

51-
await acol.insert_many(
62+
await acol.with_options(api_options=binencoptions).insert_many(
5263
[
5364
{
5465
"t": "guide",

tests/base/integration/collections/test_collection_vectorize_methods_sync.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,16 @@
1919
import pytest
2020

2121
from astrapy import Database
22+
from astrapy.api_options import APIOptions, SerdesOptions
2223
from astrapy.data_types import DataAPIVector
2324
from astrapy.exceptions import CollectionInsertManyException, DataAPIResponseException
2425
from astrapy.info import CollectionDefinition
2526

26-
from ..conftest import HEADER_EMBEDDING_API_KEY_OPENAI, DefaultCollection
27+
from ..conftest import (
28+
HEADER_EMBEDDING_API_KEY_OPENAI,
29+
IS_ASTRA_DB,
30+
DefaultCollection,
31+
)
2732

2833

2934
@pytest.mark.skipif(
@@ -40,11 +45,19 @@ def test_collection_methods_vectorize_sync(
4045
col = sync_empty_service_collection
4146
service_vector_dimension = service_collection_parameters["dimension"]
4247

48+
# TODO: binary encoding of vectors is disabled on non-Astra because docker image
49+
# 1.0.20-ct1 does not include fix 1738 yet (long non-indexed bin-encoded strings)
50+
binencoptions = APIOptions(
51+
serdes_options=SerdesOptions(binary_encode_vectors=IS_ASTRA_DB)
52+
)
53+
4354
col.insert_one({"t": "tower", "$vectorize": "How high is this tower?"})
4455
col.insert_one({"t": "vectorless"})
45-
col.insert_one({"t": "vectorful", "$vector": [0.01] * service_vector_dimension})
56+
col.with_options(api_options=binencoptions).insert_one(
57+
{"t": "vectorful", "$vector": [0.01] * service_vector_dimension}
58+
)
4659

47-
col.insert_many(
60+
col.with_options(api_options=binencoptions).insert_many(
4861
[
4962
{
5063
"t": "guide",

tests/base/integration/misc/test_vectorize_ops_async.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424

2525
class TestVectorizeOpsAsync:
2626
@pytest.mark.describe("test of find_embedding_providers, async")
27-
async def test_collection_methods_vectorize_async(
27+
async def test_findembeddingproviders_async(
2828
self,
2929
async_database: AsyncDatabase,
3030
) -> None:

tests/base/integration/misc/test_vectorize_ops_sync.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424

2525
class TestVectorizeOpsSync:
2626
@pytest.mark.describe("test of find_embedding_providers, sync")
27-
def test_collection_methods_vectorize_sync(
27+
def test_findembeddingproviders_sync(
2828
self,
2929
sync_database: Database,
3030
) -> None:

0 commit comments

Comments
 (0)