Skip to content

Commit 542d3db

Browse files
gaudybGaudy Blanco
andauthored
Prefix vector store (#2106)
* add prefix to vector store configuration and removal of container name * docs updated * change prefix property name * change prefix property name * feedback implemented --------- Co-authored-by: Gaudy Blanco <[email protected]>
1 parent 1bb9fa8 commit 542d3db

File tree

7 files changed

+60
-14
lines changed

7 files changed

+60
-14
lines changed

docs/config/yaml.md

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -171,9 +171,45 @@ Where to put all vectors for the system. Configured for lancedb by default. This
171171
- `url` **str** (only for AI Search) - AI Search endpoint
172172
- `api_key` **str** (optional - only for AI Search) - The AI Search api key to use.
173173
- `audience` **str** (only for AI Search) - Audience for managed identity token if managed identity authentication is used.
174-
- `container_name` **str** - The name of a vector container. This stores all indexes (tables) for a given dataset ingest. Default=`default`
174+
- `index_prefix` **str** - (optional) A prefix added to the name of each index (table) created for embeddings. Use it to group all indexes for a given dataset ingest.
175175
- `database_name` **str** - (cosmosdb only) Name of the database.
176-
- `overwrite` **bool** (only used at index creation time) - Overwrite collection if it exists. Default=`True`
176+
- `embeddings_schema` **list[dict[str, str]]** (optional) - Enables customization for each of your embeddings.
177+
- `<supported_embedding>`:
178+
- `index_name` **str**: (optional) - Name for the specific embedding index table.
179+
- `id_field` **str**: (optional) - Field name to be used as id. Default=`id`
180+
- `vector_field` **str**: (optional) - Field name to be used as vector. Default=`vector`
181+
- `vector_size` **int**: (optional) - Vector size for the embeddings. Default=`3072`
182+
183+
The supported embeddings are:
184+
185+
- `text_unit.text`
186+
- `document.text`
187+
- `entity.title`
188+
- `entity.description`
189+
- `relationship.description`
190+
- `community.title`
191+
- `community.summary`
192+
- `community.full_content`
193+
194+
For example:
195+
196+
```yaml
197+
vector_store:
198+
type: lancedb
199+
db_uri: output/lancedb
200+
container_name: christmas-carol
201+
index_prefix: "christmas-carol"
202+
embeddings_schema:
203+
text_unit.text:
204+
index_name: "text-unit-embeddings"
205+
id_field: "id_custom"
206+
vector_field: "vector_custom"
207+
vector_size: 3072
208+
entity.description:
209+
id_field: "id_custom"
210+
211+
```
212+
177213

178214
## Workflow Configurations
179215

graphrag/config/defaults.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -371,8 +371,8 @@ class VectorStoreDefaults:
371371

372372
type: ClassVar[str] = VectorStoreType.LanceDB.value
373373
db_uri: str = str(Path(DEFAULT_OUTPUT_BASE_DIR) / "lancedb")
374-
container_name: str = "default"
375374
overwrite: bool = True
375+
index_prefix: None = None
376376
url: None = None
377377
api_key: None = None
378378
audience: None = None

graphrag/config/embeddings.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,13 +30,13 @@
3030

3131

3232
def create_index_name(
33-
container_name: str, embedding_name: str, validate: bool = True
33+
index_prefix: str, embedding_name: str, validate: bool = True
3434
) -> str:
3535
"""
3636
Create an index name for the embedding store.
3737
3838
Within any given vector store, we can have multiple sets of embeddings organized into projects.
39-
The `container` param is used for this partitioning, and is added as a prefix to the index name for differentiation.
39+
The `index_prefix` param is used for this partitioning, and is added as a prefix to the index name for differentiation.
4040
4141
The embedding name is fixed, with the available list defined in graphrag.index.config.embeddings
4242
@@ -45,4 +45,7 @@ def create_index_name(
4545
if validate and embedding_name not in all_embeddings:
4646
msg = f"Invalid embedding name: {embedding_name}"
4747
raise KeyError(msg)
48-
return f"{container_name}-{embedding_name}".replace(".", "-")
48+
49+
if index_prefix:
50+
return f"{index_prefix}-{embedding_name}".replace(".", "-")
51+
return embedding_name.replace(".", "-")

graphrag/config/init_content.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,6 @@
7777
vector_store:
7878
type: {vector_store_defaults.type}
7979
db_uri: {vector_store_defaults.db_uri}
80-
container_name: {vector_store_defaults.container_name}
8180
8281
### Workflow settings ###
8382

graphrag/config/models/vector_store_config.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -72,9 +72,9 @@ def _validate_url(self) -> None:
7272
default=vector_store_defaults.audience,
7373
)
7474

75-
container_name: str = Field(
76-
description="The container name to use.",
77-
default=vector_store_defaults.container_name,
75+
index_prefix: str | None = Field(
76+
description="The index prefix to use.",
77+
default=vector_store_defaults.index_prefix,
7878
)
7979

8080
database_name: str | None = Field(

graphrag/index/workflows/generate_text_embeddings.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -253,6 +253,14 @@ def _create_vector_store(
253253
else:
254254
single_embedding_config = raw_config
255255

256+
if (
257+
single_embedding_config.index_name is not None
258+
and vector_store_config.index_prefix
259+
):
260+
single_embedding_config.index_name = (
261+
f"{vector_store_config.index_prefix}-{single_embedding_config.index_name}"
262+
)
263+
256264
if single_embedding_config.index_name is None:
257265
single_embedding_config.index_name = index_name
258266

@@ -268,9 +276,9 @@ def _create_vector_store(
268276

269277

270278
def _get_index_name(vector_store_config: VectorStoreConfig, embedding_name: str) -> str:
271-
container_name = vector_store_config.container_name
272-
index_name = create_index_name(container_name, embedding_name)
279+
index_prefix = vector_store_config.index_prefix or ""
280+
index_name = create_index_name(index_prefix, embedding_name)
273281

274-
msg = f"using vector store {vector_store_config.type} with container_name {container_name} for embedding {embedding_name}: {index_name}"
282+
msg = f"using vector store {vector_store_config.type} with index prefix {index_prefix} for embedding {embedding_name}: {index_name}"
275283
logger.info(msg)
276284
return index_name

tests/unit/config/utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ def assert_vector_store_configs(
114114
assert actual.url == expected.url
115115
assert actual.api_key == expected.api_key
116116
assert actual.audience == expected.audience
117-
assert actual.container_name == expected.container_name
117+
assert actual.index_prefix == expected.index_prefix
118118
assert actual.database_name == expected.database_name
119119

120120

0 commit comments

Comments
 (0)