Skip to content

Commit 59922a3

Browse files
authored
Fix breaking changes from renaming Chunk to EmbeddableItem (#37651)
1 parent dfb3f87 commit 59922a3

File tree

3 files changed: 50 additions, 35 deletions
Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
11
{
22
"comment": "Modify this file in a trivial way to cause this test suite to run",
3-
"modification": 15
3+
"modification": 16
44
}

sdks/python/apache_beam/ml/rag/enrichment/bigquery_vector_search.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -241,7 +241,7 @@ def format_query(self, items: List[EmbeddableItem]) -> str:
241241
ARRAY_AGG(
242242
STRUCT({"distance, " if self.include_distance else ""}\
243243
{base_columns_str})
244-
) as embeddable_items
244+
) as chunks
245245
FROM VECTOR_SEARCH(
246246
(SELECT {columns_str}, {self.embedding_column}
247247
FROM `{self.table_name}`

sdks/python/apache_beam/ml/rag/ingestion/bigquery.py

Lines changed: 48 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -14,8 +14,8 @@
1414
# See the License for the specific language governing permissions and
1515
# limitations under the License.
1616

17+
import warnings
1718
from collections.abc import Callable
18-
from dataclasses import dataclass
1919
from typing import Any
2020
from typing import Dict
2121
from typing import Optional
@@ -28,41 +28,56 @@
2828
from apache_beam.typehints.row_type import RowTypeConstraint
2929

3030
EmbeddableToDictFn = Callable[[EmbeddableItem], Dict[str, any]]
31+
# Backward compatibility alias.
32+
ChunkToDictFn = EmbeddableToDictFn
3133

3234

33-
@dataclass
3435
class SchemaConfig:
35-
"""Configuration for custom BigQuery schema and row conversion.
36-
37-
Allows overriding the default schema and row conversion logic for BigQuery
38-
vector storage. This enables custom table schemas beyond the default
39-
id/embedding/content/metadata structure.
40-
41-
Attributes:
42-
schema: BigQuery TableSchema dict defining the table structure.
43-
Example:
44-
>>> {
45-
... 'fields': [
46-
... {'name': 'id', 'type': 'STRING'},
47-
... {'name': 'embedding', 'type': 'FLOAT64', 'mode': 'REPEATED'},
48-
... {'name': 'custom_field', 'type': 'STRING'}
49-
... ]
50-
... }
51-
embeddable_to_dict_fn: Function that converts an
52-
EmbeddableItem to a dict matching the schema.
53-
Takes an EmbeddableItem and returns
54-
Dict[str, Any] with keys matching
55-
schema fields.
56-
Example:
57-
>>> def embeddable_to_dict(item: EmbeddableItem) -> Dict[str, Any]:
58-
... return {
59-
... 'id': item.id,
60-
... 'embedding': item.embedding.dense_embedding,
61-
... 'custom_field': item.metadata.get('custom_field')
62-
... }
63-
"""
64-
schema: Dict
65-
embeddable_to_dict_fn: EmbeddableToDictFn
36+
def __init__(
37+
self,
38+
schema: Dict,
39+
embeddable_to_dict_fn: Optional[EmbeddableToDictFn] = None,
40+
**kwargs):
41+
"""Configuration for custom BigQuery schema and row conversion.
42+
43+
Allows overriding the default schema and row conversion logic for BigQuery
44+
vector storage. This enables custom table schemas beyond the default
45+
id/embedding/content/metadata structure.
46+
47+
Args:
48+
schema: BigQuery TableSchema dict defining the table structure.
49+
embeddable_to_dict_fn: Function that converts an EmbeddableItem to a
50+
dict matching the schema. Takes an EmbeddableItem and returns
51+
Dict[str, Any] with keys matching schema fields.
52+
53+
Example with custom schema:
54+
>>> schema_config = SchemaConfig(
55+
... schema={
56+
... 'fields': [
57+
... {'name': 'id', 'type': 'STRING'},
58+
... {'name': 'embedding', 'type': 'FLOAT64', 'mode': 'REPEATED'},
59+
... {'name': 'source_url', 'type': 'STRING'}
60+
... ]
61+
... },
62+
... embeddable_to_dict_fn=lambda item: {
63+
... 'id': item.id,
64+
... 'embedding': item.embedding.dense_embedding,
65+
... 'source_url': item.metadata.get('url')
66+
... }
67+
... )
68+
"""
69+
self.schema = schema
70+
if 'chunk_to_dict_fn' in kwargs:
71+
warnings.warn(
72+
"chunk_to_dict_fn is deprecated, use embeddable_to_dict_fn",
73+
DeprecationWarning,
74+
stacklevel=2)
75+
embeddable_to_dict_fn = kwargs.pop('chunk_to_dict_fn')
76+
if kwargs:
77+
raise TypeError(f"Unexpected keyword arguments: {', '.join(kwargs)}")
78+
if embeddable_to_dict_fn is None:
79+
raise TypeError("SchemaConfig requires embeddable_to_dict_fn")
80+
self.embeddable_to_dict_fn = embeddable_to_dict_fn
6681

6782

6883
class BigQueryVectorWriterConfig(VectorDatabaseWriteConfig):

0 commit comments

Comments
 (0)