Skip to content

Commit 714b944

Browse files
masci and silvanocerza authored
chore: rename store to document_store for clarity (#5547)
* store -> document_store * fix leftovers * fix import name * moar leftovers * rebase on main, update MemoryDocumentStore to the new protocol * Update haystack/preview/pipeline.py Co-authored-by: Silvano Cerza <[email protected]> --------- Co-authored-by: Silvano Cerza <[email protected]>
1 parent e7532c4 commit 714b944

File tree

16 files changed

+392
-365
lines changed

16 files changed

+392
-365
lines changed

haystack/preview/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
from canals.component import component
2-
from haystack.preview.document_stores.decorator import store
2+
from haystack.preview.document_stores.decorator import document_store
33
from haystack.preview.dataclasses import Document
4-
from haystack.preview.pipeline import Pipeline, PipelineError, NoSuchStoreError, load_pipelines, save_pipelines
4+
from haystack.preview.pipeline import Pipeline, PipelineError, NoSuchDocumentStoreError, load_pipelines, save_pipelines

haystack/preview/components/retrievers/memory.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,18 @@
11
from typing import Dict, List, Any, Optional
22

33
from haystack.preview import component, Document
4-
from haystack.preview.document_stores import MemoryDocumentStore, StoreAwareMixin
4+
from haystack.preview.document_stores import MemoryDocumentStore, DocumentStoreAwareMixin
55

66

77
@component
8-
class MemoryRetriever(StoreAwareMixin):
8+
class MemoryRetriever(DocumentStoreAwareMixin):
99
"""
1010
A component for retrieving documents from a MemoryDocumentStore using the BM25 algorithm.
1111
1212
Needs to be connected to a MemoryDocumentStore to run.
1313
"""
1414

15-
supported_stores = [MemoryDocumentStore]
15+
supported_document_stores = [MemoryDocumentStore]
1616

1717
def __init__(self, filters: Optional[Dict[str, Any]] = None, top_k: int = 10, scale_score: bool = True):
1818
"""
@@ -46,14 +46,16 @@ def run(
4646
:param filters: A dictionary with filters to narrow down the search space.
4747
:param top_k: The maximum number of documents to return.
4848
:param scale_score: Whether to scale the BM25 scores or not.
49-
:param stores: A dictionary mapping document store names to instances.
49+
:param document_stores: A dictionary mapping DocumentStore names to instances.
5050
:return: The retrieved documents.
5151
52-
:raises ValueError: If the specified document store is not found or is not a MemoryDocumentStore instance.
52+
:raises ValueError: If the specified DocumentStore is not found or is not a MemoryDocumentStore instance.
5353
"""
54-
self.store: MemoryDocumentStore
55-
if not self.store:
56-
raise ValueError("MemoryRetriever needs a store to run: set the store instance to the self.store attribute")
54+
self.document_store: MemoryDocumentStore
55+
if not self.document_store:
56+
raise ValueError(
57+
"MemoryRetriever needs a DocumentStore to run: set the DocumentStore instance to the self.document_store attribute"
58+
)
5759

5860
if filters is None:
5961
filters = self.filters
@@ -64,5 +66,7 @@ def run(
6466

6567
docs = []
6668
for query in queries:
67-
docs.append(self.store.bm25_retrieval(query=query, filters=filters, top_k=top_k, scale_score=scale_score))
69+
docs.append(
70+
self.document_store.bm25_retrieval(query=query, filters=filters, top_k=top_k, scale_score=scale_score)
71+
)
6872
return {"documents": docs}
Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
from haystack.preview.document_stores.protocols import Store, DuplicatePolicy
2-
from haystack.preview.document_stores.mixins import StoreAwareMixin
1+
from haystack.preview.document_stores.protocols import DocumentStore, DuplicatePolicy
2+
from haystack.preview.document_stores.mixins import DocumentStoreAwareMixin
33
from haystack.preview.document_stores.memory.document_store import MemoryDocumentStore
4-
from haystack.preview.document_stores.errors import StoreError, DuplicateDocumentError, MissingDocumentError
5-
from haystack.preview.document_stores.decorator import store
4+
from haystack.preview.document_stores.errors import DocumentStoreError, DuplicateDocumentError, MissingDocumentError
5+
from haystack.preview.document_stores.decorator import document_store
Lines changed: 18 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,37 +1,34 @@
11
from typing import Dict, Any, Type
22
import logging
33

4-
from haystack.preview.document_stores.protocols import Store
5-
from haystack.preview.document_stores.errors import StoreDeserializationError
4+
from haystack.preview.document_stores.protocols import DocumentStore
5+
from haystack.preview.document_stores.errors import DocumentStoreDeserializationError
66

77
logger = logging.getLogger(__name__)
88

99

10-
class _Store:
10+
class _DocumentStore:
1111
"""
12-
Marks a class as an Haystack Store.
13-
All classes decorated with @store will be registered here and can be used in Haystack Pipelines.
12+
Marks a class as a Haystack DocumentStore.
13+
All classes decorated with @document_store will be registered here and can be used in Haystack Pipelines.
1414
"""
1515

1616
def __init__(self):
1717
self.registry = {}
1818

1919
def _decorate(self, cls):
20-
cls.__haystack_store__ = True
20+
cls.__haystack_document_store__ = True
2121

2222
if cls.__name__ in self.registry:
2323
logger.error(
24-
"Store %s is already registered. Previous imported from '%s', new imported from '%s'",
24+
"DocumentStore %s is already registered. Previous imported from '%s', new imported from '%s'",
2525
cls.__name__,
2626
self.registry[cls.__name__],
2727
cls,
2828
)
2929

3030
self.registry[cls.__name__] = cls
31-
logger.debug("Registered Store %s", cls)
32-
33-
cls.to_dict = _default_store_to_dict
34-
cls.from_dict = classmethod(_default_store_from_dict)
31+
logger.debug("Registered DocumentStore %s", cls)
3532

3633
return cls
3734

@@ -42,13 +39,13 @@ def __call__(self, cls=None):
4239
return self._decorate
4340

4441

45-
store = _Store()
42+
document_store = _DocumentStore()
4643

4744

48-
def _default_store_to_dict(store_: Store) -> Dict[str, Any]:
45+
def default_document_store_to_dict(store_: DocumentStore) -> Dict[str, Any]:
4946
"""
50-
Default store serializer.
51-
Serializes a store to a dictionary.
47+
Default DocumentStore serializer.
48+
Serializes a DocumentStore to a dictionary.
5249
"""
5350
return {
5451
"hash": id(store_),
@@ -57,14 +54,16 @@ def _default_store_to_dict(store_: Store) -> Dict[str, Any]:
5754
}
5855

5956

60-
def _default_store_from_dict(cls: Type[Store], data: Dict[str, Any]) -> Store:
57+
def default_document_store_from_dict(cls: Type[DocumentStore], data: Dict[str, Any]) -> DocumentStore:
6158
"""
62-
Default store deserializer.
59+
Default DocumentStore deserializer.
6360
The "type" field in `data` must match the class that is being deserialized into.
6461
"""
6562
init_params = data.get("init_parameters", {})
6663
if "type" not in data:
67-
raise StoreDeserializationError("Missing 'type' in store serialization data")
64+
raise DocumentStoreDeserializationError("Missing 'type' in DocumentStore serialization data")
6865
if data["type"] != cls.__name__:
69-
raise StoreDeserializationError(f"Store '{data['type']}' can't be deserialized as '{cls.__name__}'")
66+
raise DocumentStoreDeserializationError(
67+
f"DocumentStore '{data['type']}' can't be deserialized as '{cls.__name__}'"
68+
)
7069
return cls(**init_params)
Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,18 @@
1-
class StoreError(Exception):
1+
class DocumentStoreError(Exception):
22
pass
33

44

5-
class FilterError(StoreError):
5+
class FilterError(DocumentStoreError):
66
pass
77

88

9-
class DuplicateDocumentError(StoreError):
9+
class DuplicateDocumentError(DocumentStoreError):
1010
pass
1111

1212

13-
class MissingDocumentError(StoreError):
13+
class MissingDocumentError(DocumentStoreError):
1414
pass
1515

1616

17-
class StoreDeserializationError(StoreError):
17+
class DocumentStoreDeserializationError(DocumentStoreError):
1818
pass

haystack/preview/document_stores/memory/document_store.py

Lines changed: 27 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,13 @@
77
import rank_bm25
88
from tqdm.auto import tqdm
99

10-
from haystack.preview.document_stores.decorator import store
10+
from haystack.preview.document_stores.decorator import (
11+
document_store,
12+
default_document_store_to_dict,
13+
default_document_store_from_dict,
14+
)
1115
from haystack.preview.dataclasses import Document
12-
from haystack.preview.document_stores.protocols import DuplicatePolicy
16+
from haystack.preview.document_stores.protocols import DuplicatePolicy, DocumentStore
1317
from haystack.preview.document_stores.memory._filters import match
1418
from haystack.preview.document_stores.errors import DuplicateDocumentError, MissingDocumentError
1519
from haystack.utils.scipy_utils import expit
@@ -24,7 +28,7 @@
2428
SCALING_FACTOR = 8
2529

2630

27-
@store
31+
@document_store
2832
class MemoryDocumentStore:
2933
"""
3034
Stores data in-memory. It's ephemeral and cannot be saved to disk.
@@ -37,7 +41,7 @@ def __init__(
3741
bm25_parameters: Optional[Dict] = None,
3842
):
3943
"""
40-
Initializes the store.
44+
Initializes the DocumentStore.
4145
"""
4246
self.storage: Dict[str, Document] = {}
4347
self.tokenizer = re.compile(bm25_tokenization_regex).findall
@@ -54,9 +58,22 @@ def __init__(
5458
"bm25_parameters": self.bm25_parameters,
5559
}
5660

61+
def to_dict(self) -> Dict[str, Any]:
62+
"""
63+
Serializes this store to a dictionary.
64+
"""
65+
return default_document_store_to_dict(self)
66+
67+
@classmethod
68+
def from_dict(cls, data: Dict[str, Any]) -> "DocumentStore":
69+
"""
70+
Deserializes the store from a dictionary.
71+
"""
72+
return default_document_store_from_dict(cls, data)
73+
5774
def count_documents(self) -> int:
5875
"""
59-
Returns the number of how many documents are present in the document store.
76+
Returns the number of documents present in the DocumentStore.
6077
"""
6178
return len(self.storage.keys())
6279

@@ -137,11 +154,11 @@ def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Doc
137154

138155
def write_documents(self, documents: List[Document], policy: DuplicatePolicy = DuplicatePolicy.FAIL) -> None:
139156
"""
140-
Writes (or overwrites) documents into the store.
157+
Writes (or overwrites) documents into the DocumentStore.
141158
142159
:param documents: a list of documents.
143160
:param policy: documents with the same ID count as duplicates. When duplicates are met,
144-
the store can:
161+
the DocumentStore can:
145162
- skip: keep the existing document and ignore the new one.
146163
- overwrite: remove the old document and write the new one.
147164
- fail: an error is raised
@@ -165,8 +182,8 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D
165182

166183
def delete_documents(self, document_ids: List[str]) -> None:
167184
"""
168-
Deletes all documents with a matching document_ids from the document store.
169-
Fails with `MissingDocumentError` if no document with this id is present in the store.
185+
Deletes all documents with a matching document_ids from the DocumentStore.
186+
Fails with `MissingDocumentError` if no document with this id is present in the DocumentStore.
170187
171188
:param document_ids: the document_ids to delete
172189
"""
@@ -218,7 +235,7 @@ def bm25_retrieval(
218235
csv_content = str_content.to_csv(index=False)
219236
lower_case_documents.append(csv_content.lower())
220237

221-
# Tokenize the entire content of the document store
238+
# Tokenize the entire content of the DocumentStore
222239
tokenized_corpus = [
223240
self.tokenizer(doc) for doc in tqdm(lower_case_documents, unit=" docs", desc="Ranking by BM25...")
224241
]
Lines changed: 23 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,40 +1,40 @@
11
from typing import List, Optional, Type
22

33

4-
from haystack.preview.document_stores.protocols import Store
4+
from haystack.preview.document_stores.protocols import DocumentStore
55

66

7-
class StoreAwareMixin:
7+
class DocumentStoreAwareMixin:
88
"""
9-
Adds the capability of a component to use a single document store from the `self.store` property.
9+
Adds the capability of a component to use a single DocumentStore from the `self.document_store` property.
1010
11-
To use this mixin you must specify which document stores to support by setting a value to `supported_stores`.
12-
To support any document store, set it to `[Store]`.
11+
To use this mixin you must specify which DocumentStores to support by setting a value to `supported_document_stores`.
12+
To support any DocumentStore, set it to `[DocumentStore]`.
1313
"""
1414

15-
_store: Optional[Store] = None
15+
_document_store: Optional[DocumentStore] = None
1616
# This is necessary to ease serialisation when converting a Component that uses
17-
# a Store into a dictionary.
17+
# a DocumentStore into a dictionary.
1818
# This is only set when calling `Pipeline.add_component()`.
19-
_store_name: str = ""
20-
supported_stores: List[Type[Store]] # type: ignore # (see https://github.com/python/mypy/issues/4717)
19+
_document_store_name: str = ""
20+
supported_document_stores: List[Type[DocumentStore]] # type: ignore # (see https://github.com/python/mypy/issues/4717)
2121

2222
@property
23-
def store(self) -> Optional[Store]:
24-
return self._store
25-
26-
@store.setter
27-
def store(self, store: Store):
28-
if not getattr(store, "__haystack_store__", False):
29-
raise ValueError(f"'{type(store).__name__}' is not decorate with @store.")
30-
if not self._is_supported(store):
23+
def document_store(self) -> Optional[DocumentStore]:
24+
return self._document_store
25+
26+
@document_store.setter
27+
def document_store(self, document_store: DocumentStore):
28+
if not getattr(document_store, "__haystack_document_store__", False):
29+
raise ValueError(f"'{type(document_store).__name__}' is not decorate with @document_store.")
30+
if not self._is_supported(document_store):
3131
raise ValueError(
32-
f"Store type '{type(store).__name__}' is not compatible with this component. "
33-
f"Compatible store types: {[type_.__name__ for type_ in type(self).supported_stores]}"
32+
f"DocumentStore type '{type(document_store).__name__}' is not compatible with this component. "
33+
f"Compatible DocumentStore types: {[type_.__name__ for type_ in type(self).supported_document_stores]}"
3434
)
35-
self._store = store
35+
self._document_store = document_store
3636

37-
def _is_supported(self, store: Store):
38-
if Store in self.supported_stores:
37+
def _is_supported(self, document_store: DocumentStore):
38+
if DocumentStore in self.supported_document_stores:
3939
return True
40-
return any(isinstance(store, type_) for type_ in self.supported_stores)
40+
return any(isinstance(document_store, type_) for type_ in self.supported_document_stores)

haystack/preview/document_stores/protocols.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,14 +14,14 @@ class DuplicatePolicy(Enum):
1414
FAIL = "fail"
1515

1616

17-
class Store(Protocol):
17+
class DocumentStore(Protocol):
1818
"""
1919
Stores Documents to be used by the components of a Pipeline.
2020
2121
Classes implementing this protocol often store the documents permanently and allow specialized components to
2222
perform retrieval on them, either by embedding, by keyword, hybrid, and so on, depending on the backend used.
2323
24-
In order to retrieve documents, consider using a Retriever that supports the document store implementation that
24+
In order to retrieve documents, consider using a Retriever that supports the DocumentStore implementation that
2525
you're using.
2626
"""
2727

@@ -31,7 +31,7 @@ def to_dict(self) -> Dict[str, Any]:
3131
"""
3232

3333
@classmethod
34-
def from_dict(cls, data: Dict[str, Any]) -> "Store":
34+
def from_dict(cls, data: Dict[str, Any]) -> "DocumentStore":
3535
"""
3636
Deserializes the store from a dictionary.
3737
"""
@@ -115,11 +115,11 @@ def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Doc
115115

116116
def write_documents(self, documents: List[Document], policy: DuplicatePolicy = DuplicatePolicy.FAIL) -> None:
117117
"""
118-
Writes (or overwrites) documents into the store.
118+
Writes (or overwrites) documents into the DocumentStore.
119119
120120
:param documents: a list of documents.
121121
:param policy: documents with the same ID count as duplicates. When duplicates are met,
122-
the store can:
122+
the DocumentStore can:
123123
- skip: keep the existing document and ignore the new one.
124124
- overwrite: remove the old document and write the new one.
125125
- fail: an error is raised
@@ -129,8 +129,8 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D
129129

130130
def delete_documents(self, document_ids: List[str]) -> None:
131131
"""
132-
Deletes all documents with a matching document_ids from the document store.
133-
Fails with `MissingDocumentError` if no document with this id is present in the store.
132+
Deletes all documents with a matching document_ids from the DocumentStore.
133+
Fails with `MissingDocumentError` if no document with this id is present in the DocumentStore.
134134
135135
:param document_ids: the document_ids to delete
136136
"""

0 commit comments

Comments
 (0)