|
14 | 14 | # See the License for the specific language governing permissions and |
15 | 15 | # limitations under the License. |
16 | 16 |
|
| 17 | +import warnings |
17 | 18 | from collections.abc import Callable |
18 | | -from dataclasses import dataclass |
19 | 19 | from typing import Any |
20 | 20 | from typing import Dict |
21 | 21 | from typing import Optional |
|
28 | 28 | from apache_beam.typehints.row_type import RowTypeConstraint |
29 | 29 |
|
# Signature of the row-conversion callback: takes an EmbeddableItem and
# returns a dict whose keys match the BigQuery schema fields.
EmbeddableToDictFn = Callable[[EmbeddableItem], Dict[str, Any]]
# Backward compatibility alias.
ChunkToDictFn = EmbeddableToDictFn
31 | 33 |
|
32 | 34 |
|
class SchemaConfig:
  """Configuration for custom BigQuery schema and row conversion.

  Allows overriding the default schema and row conversion logic for BigQuery
  vector storage. This enables custom table schemas beyond the default
  id/embedding/content/metadata structure.
  """
  def __init__(
      self,
      schema: Dict,
      embeddable_to_dict_fn: Optional["EmbeddableToDictFn"] = None,
      **kwargs):
    """Creates a schema configuration.

    Args:
      schema: BigQuery TableSchema dict defining the table structure.
      embeddable_to_dict_fn: Function that converts an EmbeddableItem to a
        dict matching the schema. Takes an EmbeddableItem and returns
        Dict[str, Any] with keys matching schema fields.
      **kwargs: Accepts only the deprecated ``chunk_to_dict_fn`` keyword,
        the former name of ``embeddable_to_dict_fn``. Using it emits a
        DeprecationWarning.

    Raises:
      TypeError: If both ``embeddable_to_dict_fn`` and ``chunk_to_dict_fn``
        are given, if any other unexpected keyword argument is passed, or
        if no conversion function is supplied at all.

    Example with custom schema:
      >>> schema_config = SchemaConfig(
      ...   schema={
      ...     'fields': [
      ...       {'name': 'id', 'type': 'STRING'},
      ...       {'name': 'embedding', 'type': 'FLOAT64', 'mode': 'REPEATED'},
      ...       {'name': 'source_url', 'type': 'STRING'}
      ...     ]
      ...   },
      ...   embeddable_to_dict_fn=lambda item: {
      ...     'id': item.id,
      ...     'embedding': item.embedding.dense_embedding,
      ...     'source_url': item.metadata.get('url')
      ...   }
      ... )
    """
    if 'chunk_to_dict_fn' in kwargs:
      legacy_fn = kwargs.pop('chunk_to_dict_fn')
      # Refuse ambiguous input rather than silently letting the deprecated
      # keyword override the new one.
      if embeddable_to_dict_fn is not None:
        raise TypeError(
            "SchemaConfig received both embeddable_to_dict_fn and the "
            "deprecated chunk_to_dict_fn; pass only embeddable_to_dict_fn")
      # stacklevel=2 attributes the warning to the caller's line.
      warnings.warn(
          "chunk_to_dict_fn is deprecated, use embeddable_to_dict_fn",
          DeprecationWarning,
          stacklevel=2)
      embeddable_to_dict_fn = legacy_fn
    if kwargs:
      raise TypeError(f"Unexpected keyword arguments: {', '.join(kwargs)}")
    if embeddable_to_dict_fn is None:
      raise TypeError("SchemaConfig requires embeddable_to_dict_fn")
    # Assign state only after every argument has been validated.
    self.schema = schema
    self.embeddable_to_dict_fn = embeddable_to_dict_fn

  def __repr__(self) -> str:
    # Restores the informative repr the former @dataclass generated.
    return (
        f"{type(self).__name__}(schema={self.schema!r}, "
        f"embeddable_to_dict_fn={self.embeddable_to_dict_fn!r})")
66 | 81 |
|
67 | 82 |
|
68 | 83 | class BigQueryVectorWriterConfig(VectorDatabaseWriteConfig): |
|
0 commit comments