
Commit 79ad9b9

gaudyb and Gaudy Blanco authored
reduce schema fields (#2089)
* reduce schema fields
* fix launch.json

Co-authored-by: Gaudy Blanco <[email protected]>
1 parent 2b5284c commit 79ad9b9

12 files changed: +24 additions, -148 deletions


graphrag/config/models/vector_store_schema_config.py

Lines changed: 0 additions & 12 deletions
@@ -30,16 +30,6 @@ class VectorStoreSchemaConfig(BaseModel):
         default="vector",
     )
 
-    text_field: str = Field(
-        description="The text field to use.",
-        default="text",
-    )
-
-    attributes_field: str = Field(
-        description="The attributes field to use.",
-        default="attributes",
-    )
-
     vector_size: int = Field(
         description="The vector size to use.",
         default=DEFAULT_VECTOR_SIZE,
@@ -52,8 +42,6 @@ def _validate_schema(self) -> None:
         for field in [
             self.id_field,
             self.vector_field,
-            self.text_field,
-            self.attributes_field,
         ]:
             if not is_valid_field_name(field):
                 msg = f"Unsafe or invalid field name: {field}"

graphrag/index/operations/embed_text/embed_text.py

Lines changed: 1 addition & 6 deletions
@@ -152,7 +152,6 @@ async def _text_embed_with_vector_store(
             )
             batch = input.iloc[insert_batch_size * i : insert_batch_size * (i + 1)]
             texts: list[str] = batch[embed_column].tolist()
-            titles: list[str] = batch[title].tolist()
             ids: list[str] = batch[id_column].tolist()
             result = await strategy_exec(texts, callbacks, cache, strategy_config)
             if result.embeddings:
@@ -163,16 +162,12 @@
 
             vectors = result.embeddings or []
             documents: list[VectorStoreDocument] = []
-            for doc_id, doc_text, doc_title, doc_vector in zip(
-                ids, texts, titles, vectors, strict=True
-            ):
+            for doc_id, doc_vector in zip(ids, vectors, strict=True):
                 if type(doc_vector) is np.ndarray:
                     doc_vector = doc_vector.tolist()
                 document = VectorStoreDocument(
                     id=doc_id,
-                    text=doc_text,
                     vector=doc_vector,
-                    attributes={"title": doc_title},
                 )
                 documents.append(document)
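
Taken on its own, the new document-building loop reduces to the sketch below; the ids and embedding vectors are illustrative stand-ins for the batch produced by the embedding strategy.

import numpy as np

from graphrag.vector_stores.base import VectorStoreDocument

ids = ["chunk-1", "chunk-2"]                             # illustrative ids
vectors = [np.array([0.1, 0.2]), np.array([0.3, 0.4])]   # illustrative embeddings

documents: list[VectorStoreDocument] = []
for doc_id, doc_vector in zip(ids, vectors, strict=True):
    if type(doc_vector) is np.ndarray:
        doc_vector = doc_vector.tolist()
    # Only the id and the vector are persisted now; text and title
    # no longer travel with the embedding.
    documents.append(VectorStoreDocument(id=doc_id, vector=doc_vector))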

graphrag/query/structured_search/basic_search/basic_context.py

Lines changed: 8 additions & 7 deletions
@@ -59,14 +59,15 @@ def build_context(
                 text_embedder=lambda t: self.text_embedder.embed(t),
                 k=k,
             )
-            related_text_list = [
-                {
-                    text_id_col: self.text_id_map[f"{chunk.document.id}"],
-                    text_col: chunk.document.text,
-                }
-                for chunk in related_texts
+
+            text_unit_ids = {t.document.id for t in related_texts}
+            text_units_filtered = []
+            text_units_filtered = [
+                {text_id_col: t.id, text_col: t.text}
+                for t in self.text_units or []
+                if t.id in text_unit_ids
             ]
-            related_text_df = pd.DataFrame(related_text_list)
+            related_text_df = pd.DataFrame(text_units_filtered)
         else:
             related_text_df = pd.DataFrame({
                 text_id_col: [],
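
The query-side change follows the same idea: the vector store now returns only ids, and the chunk text is looked up from the text units already held in memory. A self-contained sketch of that lookup, with tiny stand-in types in place of graphrag's TextUnit and search-result classes:

from dataclasses import dataclass

import pandas as pd


@dataclass
class _TextUnit:      # stand-in for the in-memory text unit
    id: str
    text: str


@dataclass
class _Hit:           # stand-in for a vector store search result
    document: _TextUnit


text_units = [_TextUnit("a", "alpha chunk"), _TextUnit("b", "beta chunk")]
related_texts = [_Hit(document=_TextUnit("a", ""))]  # the hit now carries only the id

text_unit_ids = {hit.document.id for hit in related_texts}
related_text_df = pd.DataFrame([
    {"id": t.id, "text": t.text}
    for t in text_units
    if t.id in text_unit_ids
])
print(related_text_df)  # one row: id="a", text="alpha chunk"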

graphrag/vector_stores/azure_ai_search.py

Lines changed: 0 additions & 15 deletions
@@ -3,7 +3,6 @@
 
 """A package containing the Azure AI Search vector store implementation."""
 
-import json
 from typing import Any
 
 from azure.core.credentials import AzureKeyCredential
@@ -13,7 +12,6 @@
 from azure.search.documents.indexes.models import (
     HnswAlgorithmConfiguration,
     HnswParameters,
-    SearchableField,
     SearchField,
     SearchFieldDataType,
     SearchIndex,
@@ -121,13 +119,6 @@ def load_documents(
                     vector_search_dimensions=self.vector_size,
                     vector_search_profile_name=self.vector_search_profile_name,
                 ),
-                SearchableField(
-                    name=self.text_field, type=SearchFieldDataType.String
-                ),
-                SimpleField(
-                    name=self.attributes_field,
-                    type=SearchFieldDataType.String,
-                ),
             ],
             vector_search=vector_search,
         )
@@ -139,8 +130,6 @@ def load_documents(
             {
                 self.id_field: doc.id,
                 self.vector_field: doc.vector,
-                self.text_field: doc.text,
-                self.attributes_field: json.dumps(doc.attributes),
             }
             for doc in documents
             if doc.vector is not None
@@ -165,9 +154,7 @@ def similarity_search_by_vector(
             VectorStoreSearchResult(
                 document=VectorStoreDocument(
                     id=doc.get(self.id_field, ""),
-                    text=doc.get(self.text_field, ""),
                     vector=doc.get(self.vector_field, []),
-                    attributes=(json.loads(doc.get(self.attributes_field, "{}"))),
                 ),
                 # Cosine similarity between 0.333 and 1.000
                 # https://learn.microsoft.com/en-us/azure/search/hybrid-search-ranking#scores-in-a-hybrid-search-results
@@ -192,7 +179,5 @@ def search_by_id(self, id: str) -> VectorStoreDocument:
         response = self.db_connection.get_document(id)
         return VectorStoreDocument(
             id=response.get(self.id_field, ""),
-            text=response.get(self.text_field, ""),
             vector=response.get(self.vector_field, []),
-            attributes=(json.loads(response.get(self.attributes_field, "{}"))),
         )
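
The practical effect on the Azure AI Search side is that each uploaded document now carries only two fields; assuming the default field names, the per-document payload looks roughly like this.

# Illustrative payload per document after the change (default field names assumed).
doc_payload = {
    "id": "doc1",
    "vector": [0.1, 0.2, 0.3, 0.4, 0.5],
}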

graphrag/vector_stores/base.py

Lines changed: 1 addition & 7 deletions
@@ -4,7 +4,7 @@
 """Base classes for vector stores."""
 
 from abc import ABC, abstractmethod
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 from typing import Any
 
 from graphrag.config.models.vector_store_schema_config import VectorStoreSchemaConfig
@@ -18,12 +18,8 @@ class VectorStoreDocument:
     id: str | int
     """unique id for the document"""
 
-    text: str | None
     vector: list[float] | None
 
-    attributes: dict[str, Any] = field(default_factory=dict)
-    """store any additional metadata, e.g. title, date ranges, etc"""
-
 
 @dataclass
 class VectorStoreSearchResult:
@@ -54,9 +50,7 @@ def __init__(
 
         self.index_name = vector_store_schema_config.index_name
         self.id_field = vector_store_schema_config.id_field
-        self.text_field = vector_store_schema_config.text_field
         self.vector_field = vector_store_schema_config.vector_field
-        self.attributes_field = vector_store_schema_config.attributes_field
         self.vector_size = vector_store_schema_config.vector_size
 
     @abstractmethod
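
After the removals, the document payload is essentially an id paired with a vector; the sketch below shows roughly what the trimmed dataclass reduces to and how it is constructed (a sketch, not a verbatim copy of the module).

from dataclasses import dataclass


@dataclass
class VectorStoreDocument:
    """A stored document: a unique id plus its embedding; text and attributes are gone."""

    id: str | int
    vector: list[float] | None


doc = VectorStoreDocument(id="doc1", vector=[0.1, 0.2, 0.3])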

graphrag/vector_stores/cosmosdb.py

Lines changed: 2 additions & 9 deletions
@@ -3,7 +3,6 @@
 
 """A package containing the CosmosDB vector store implementation."""
 
-import json
 from typing import Any
 
 from azure.cosmos import ContainerProxy, CosmosClient, DatabaseProxy
@@ -171,8 +170,6 @@ def load_documents(
             doc_json = {
                 self.id_field: doc.id,
                 self.vector_field: doc.vector,
-                self.text_field: doc.text,
-                self.attributes_field: json.dumps(doc.attributes),
             }
             print("Storing document in CosmosDB:")  # noqa: T201
             print(doc_json)  # noqa: T201
@@ -187,7 +184,7 @@ def similarity_search_by_vector(
            raise ValueError(msg)
 
        try:
-            query = f"SELECT TOP {k} c.{self.id_field}, c.{self.text_field}, c.{self.vector_field}, c.{self.attributes_field}, VectorDistance(c.{self.vector_field}, @embedding) AS SimilarityScore FROM c ORDER BY VectorDistance(c.{self.vector_field}, @embedding)"  # noqa: S608
+            query = f"SELECT TOP {k} c.{self.id_field}, c.{self.vector_field}, VectorDistance(c.{self.vector_field}, @embedding) AS SimilarityScore FROM c ORDER BY VectorDistance(c.{self.vector_field}, @embedding)"  # noqa: S608
             query_params = [{"name": "@embedding", "value": query_embedding}]
             items = list(
                 self._container_client.query_items(
@@ -199,7 +196,7 @@
         except (CosmosHttpResponseError, ValueError):
             # Currently, the CosmosDB emulator does not support the VectorDistance function.
             # For emulator or test environments - fetch all items and calculate distance locally
-            query = f"SELECT c.{self.id_field}, c.{self.text_field}, c.{self.vector_field}, c.{self.attributes_field} FROM c"  # noqa: S608
+            query = f"SELECT c.{self.id_field}, c.{self.vector_field} FROM c"  # noqa: S608
             items = list(
                 self._container_client.query_items(
                     query=query,
@@ -231,9 +228,7 @@ def cosine_similarity(a, b):
             VectorStoreSearchResult(
                 document=VectorStoreDocument(
                     id=item.get(self.id_field, ""),
-                    text=item.get(self.text_field, ""),
                     vector=item.get(self.vector_field, []),
-                    attributes=(json.loads(item.get(self.attributes_field, "{}"))),
                 ),
                 score=item.get("SimilarityScore", 0.0),
             )
@@ -261,8 +256,6 @@ def search_by_id(self, id: str) -> VectorStoreDocument:
         return VectorStoreDocument(
             id=item.get(self.id_field, ""),
             vector=item.get(self.vector_field, []),
-            text=item.get(self.text_field, ""),
-            attributes=(json.loads(item.get(self.attributes_field, "{}"))),
         )
 
     def clear(self) -> None:
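
Rendered with concrete values, the trimmed CosmosDB query built by the new f-string looks like the following; the field names and k are illustrative defaults, not values fixed by this commit.

# Illustrative rendering of the new query string.
k, id_field, vector_field = 5, "id", "vector"
query = (
    f"SELECT TOP {k} c.{id_field}, c.{vector_field}, "
    f"VectorDistance(c.{vector_field}, @embedding) AS SimilarityScore "
    f"FROM c ORDER BY VectorDistance(c.{vector_field}, @embedding)"
)
print(query)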

graphrag/vector_stores/lancedb.py

Lines changed: 5 additions & 15 deletions
@@ -3,19 +3,19 @@
 
 """The LanceDB vector storage implementation package."""
 
-import json  # noqa: I001
 from typing import Any
-import pyarrow as pa
+
+import lancedb
 import numpy as np
+import pyarrow as pa
+
 from graphrag.config.models.vector_store_schema_config import VectorStoreSchemaConfig
 from graphrag.data_model.types import TextEmbedder
-
 from graphrag.vector_stores.base import (
     BaseVectorStore,
     VectorStoreDocument,
     VectorStoreSearchResult,
 )
-import lancedb
 
 
 class LanceDBVectorStore(BaseVectorStore):
@@ -41,19 +41,15 @@ def load_documents(
         """Load documents into vector storage."""
         # Step 1: Prepare data columns manually
         ids = []
-        texts = []
         vectors = []
-        attributes = []
 
         for document in documents:
             self.vector_size = (
                 len(document.vector) if document.vector else self.vector_size
             )
             if document.vector is not None and len(document.vector) == self.vector_size:
                 ids.append(document.id)
-                texts.append(document.text)
                 vectors.append(np.array(document.vector, dtype=np.float32))
-                attributes.append(json.dumps(document.attributes))
 
         # Step 2: Handle empty case
         if len(ids) == 0:
@@ -69,9 +65,7 @@
         # Step 4: Create PyArrow table (let schema be inferred)
         data = pa.table({
             self.id_field: pa.array(ids, type=pa.string()),
-            self.text_field: pa.array(texts, type=pa.string()),
             self.vector_field: vector_column,
-            self.attributes_field: pa.array(attributes, type=pa.string()),
         })
 
         # NOTE: If modifying the next section of code, ensure that the schema remains the same.
@@ -127,9 +121,7 @@ def similarity_search_by_vector(
             VectorStoreSearchResult(
                 document=VectorStoreDocument(
                     id=doc[self.id_field],
-                    text=doc[self.text_field],
                     vector=doc[self.vector_field],
-                    attributes=json.loads(doc[self.attributes_field]),
                 ),
                 score=1 - abs(float(doc["_distance"])),
             )
@@ -155,8 +147,6 @@ def search_by_id(self, id: str) -> VectorStoreDocument:
         if doc:
             return VectorStoreDocument(
                 id=doc[0][self.id_field],
-                text=doc[0][self.text_field],
                 vector=doc[0][self.vector_field],
-                attributes=json.loads(doc[0][self.attributes_field]),
             )
-        return VectorStoreDocument(id=id, text=None, vector=None)
+        return VectorStoreDocument(id=id, vector=None)
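
For LanceDB, the table written by load_documents now has just the id and vector columns. A rough, self-contained sketch of that two-column PyArrow table; the plain list-of-float32 array here stands in for the vector column the real code builds in its earlier steps.

import numpy as np
import pyarrow as pa

ids = ["doc1", "doc2"]
vectors = [
    np.array([0.1, 0.2], dtype=np.float32),
    np.array([0.3, 0.4], dtype=np.float32),
]

# Two columns only: the id and the embedding vector.
vector_column = pa.array([v.tolist() for v in vectors], type=pa.list_(pa.float32()))
data = pa.table({
    "id": pa.array(ids, type=pa.string()),
    "vector": vector_column,
})
print(data.schema)  # id: string, vector: list<item: float>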

tests/integration/vector_stores/test_azure_ai_search.py

Lines changed: 0 additions & 22 deletions
@@ -63,8 +63,6 @@ def vector_store_custom(self, mock_search_client, mock_index_client):
             vector_store_schema_config=VectorStoreSchemaConfig(
                 index_name="test_vectors",
                 id_field="id_custom",
-                text_field="text_custom",
-                attributes_field="attributes_custom",
                 vector_field="vector_custom",
                 vector_size=5,
             ),
@@ -86,15 +84,11 @@ def sample_documents(self):
         return [
             VectorStoreDocument(
                 id="doc1",
-                text="This is document 1",
                 vector=[0.1, 0.2, 0.3, 0.4, 0.5],
-                attributes={"title": "Doc 1", "category": "test"},
             ),
             VectorStoreDocument(
                 id="doc2",
-                text="This is document 2",
                 vector=[0.2, 0.3, 0.4, 0.5, 0.6],
-                attributes={"title": "Doc 2", "category": "test"},
             ),
         ]
 
@@ -110,26 +104,20 @@ async def test_vector_store_operations(
         search_results = [
             {
                 "id": "doc1",
-                "text": "This is document 1",
                 "vector": [0.1, 0.2, 0.3, 0.4, 0.5],
-                "attributes": '{"title": "Doc 1", "category": "test"}',
                 "@search.score": 0.9,
             },
             {
                 "id": "doc2",
-                "text": "This is document 2",
                 "vector": [0.2, 0.3, 0.4, 0.5, 0.6],
-                "attributes": '{"title": "Doc 2", "category": "test"}',
                 "@search.score": 0.8,
             },
         ]
         mock_search_client.search.return_value = search_results
 
         mock_search_client.get_document.return_value = {
             "id": "doc1",
-            "text": "This is document 1",
             "vector": [0.1, 0.2, 0.3, 0.4, 0.5],
-            "attributes": '{"title": "Doc 1", "category": "test"}',
         }
 
         vector_store.load_documents(sample_documents)
@@ -154,8 +142,6 @@ def mock_embedder(text: str) -> list[float]:
 
         doc = vector_store.search_by_id("doc1")
         assert doc.id == "doc1"
-        assert doc.text == "This is document 1"
-        assert doc.attributes["title"] == "Doc 1"
 
     async def test_empty_embedding(self, vector_store, mock_search_client):
         """Test similarity search by text with empty embedding."""
@@ -186,26 +172,20 @@ async def test_vector_store_customization(
         search_results = [
             {
                 vector_store_custom.id_field: "doc1",
-                vector_store_custom.text_field: "This is document 1",
                 vector_store_custom.vector_field: [0.1, 0.2, 0.3, 0.4, 0.5],
-                vector_store_custom.attributes_field: '{"title": "Doc 1", "category": "test"}',
                 "@search.score": 0.9,
             },
             {
                 vector_store_custom.id_field: "doc2",
-                vector_store_custom.text_field: "This is document 2",
                 vector_store_custom.vector_field: [0.2, 0.3, 0.4, 0.5, 0.6],
-                vector_store_custom.attributes_field: '{"title": "Doc 2", "category": "test"}',
                 "@search.score": 0.8,
             },
         ]
         mock_search_client.search.return_value = search_results
 
         mock_search_client.get_document.return_value = {
             vector_store_custom.id_field: "doc1",
-            vector_store_custom.text_field: "This is document 1",
             vector_store_custom.vector_field: [0.1, 0.2, 0.3, 0.4, 0.5],
-            vector_store_custom.attributes_field: '{"title": "Doc 1", "category": "test"}',
         }
 
         vector_store_custom.load_documents(sample_documents)
@@ -230,5 +210,3 @@ def mock_embedder(text: str) -> list[float]:
 
         doc = vector_store_custom.search_by_id("doc1")
         assert doc.id == "doc1"
-        assert doc.text == "This is document 1"
-        assert doc.attributes["title"] == "Doc 1"
