Skip to content

Commit adcc9bd

Browse files
committed
Merge branch 'master' of github.com:experimaestro/datamaestro_text
2 parents 46d4318 + 5f2ef08 commit adcc9bd

File tree

2 files changed

+32
-3
lines changed

2 files changed

+32
-3
lines changed

src/datamaestro_text/data/ir/__init__.py

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""Generic data types for information retrieval"""
22

33
from abc import ABC, abstractmethod
4+
from enum import Enum
45
from functools import cached_property
56
import logging
67
from pathlib import Path
@@ -88,6 +89,19 @@ def document_recordtype(self) -> Type[DocumentRecord]:
8889
...
8990

9091

92+
class FileAccess(Enum):
    """Defines how to access files (e.g. for document stores)

    A document store may ignore this setting: whether it has any effect
    depends on the underlying implementation.
    """

    FILE = 0
    """Direct file access"""

    MMAP = 1
    """Use mmap (memory-mapped file access)"""

    MEMORY = 2
    """Use memory"""
103+
104+
91105
class DocumentStore(Documents):
92106
"""A document store
93107
@@ -97,6 +111,10 @@ class DocumentStore(Documents):
97111
- return the number of documents
98112
"""
99113

114+
file_access: Meta[FileAccess] = FileAccess.MMAP
115+
"""How to access the file collection (might not have any impact, depends on
116+
the docstore)"""
117+
100118
def docid_internal2external(self, docid: int):
101119
"""Converts an internal collection ID (integer) to an external ID"""
102120
raise NotImplementedError(f"For class {self.__class__}")
@@ -327,5 +345,4 @@ class PairwiseSampleDataset(Base, ABC):
327345
"""Datasets where each record is a query with positive and negative samples"""
328346

329347
@abstractmethod
330-
def iter(self) -> Iterator[PairwiseSample]:
331-
...
348+
def iter(self) -> Iterator[PairwiseSample]: ...

src/datamaestro_text/datasets/irds/data.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -215,7 +215,19 @@ def documentcount(self):
215215

216216
@cached_property
def store(self):
    """Return the ir-datasets document store for this dataset.

    When the installed ir-datasets exposes docstore options, the
    ``file_access`` setting of this object is translated to the matching
    ``ir_datasets.indices.FileAccess`` member and passed to
    ``docs_store`` through the ``options`` keyword. Otherwise a warning
    is logged and ``docs_store`` is called without arguments.

    The result is cached on the instance by ``cached_property``.
    """
    kwargs = {}
    try:
        # Only the lookups that depend on the ir-datasets version are
        # guarded. The import alone can succeed on releases that do not
        # define FileAccess / DocstoreOptions, in which case the
        # attribute access raises AttributeError rather than ImportError.
        import ir_datasets.indices as ir_indices

        ir_file_access = ir_indices.FileAccess
        docstore_options = ir_indices.DocstoreOptions
    except (ImportError, AttributeError):
        logging.warning("This version of ir-datasets cannot handle docstore options")
    else:
        # Translate to ir datasets docstore options
        file_access = {
            ir.FileAccess.MMAP: ir_file_access.MMAP,
            ir.FileAccess.FILE: ir_file_access.FILE,
            ir.FileAccess.MEMORY: ir_file_access.MEMORY,
        }[self.file_access]
        kwargs = {"options": docstore_options(file_access=file_access)}

    return self.dataset.docs_store(**kwargs)
219231

220232
@cached_property
221233
def _docs(self):

0 commit comments

Comments
 (0)