Skip to content

Commit b98d938

Browse files
authored
Merge pull request #5 from MathVast/master
Add DprW100 converters, MsMarcoV2Passage and Trec-News
2 parents 868c75d + a9dc2bd commit b98d938

File tree

3 files changed

+97
-3
lines changed

3 files changed

+97
-3
lines changed

src/datamaestro_text/data/ir/base.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,12 @@ class IDItem(Item, ABC):
4343

4444
id: str
4545

46+
@define
class UrlItem(Item):
    """Item that carries a URL."""

    # The URL, stored as a plain string
    url: str
51+
4652

4753
@define
4854
class AdhocAssessment:

src/datamaestro_text/data/ir/formats.py

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ class NFCorpusDocument(TextItem):
5858

5959
@cached_property
6060
def text(self):
61-
return self.abstract
61+
return f"{self.title} {self.abstract}"
6262

6363

6464
@define
@@ -99,8 +99,8 @@ class WapoDocument(TextItem):
9999
body_media: Tuple[WapoDocMedia, ...]
100100

101101
@cached_property
102-
def text(self):
103-
return self.body
102+
def text(self):
103+
return f"{self.title} {self.body_paras_html}"
104104

105105

106106
@define
@@ -128,6 +128,15 @@ def text(self):
128128

129129

130130
@define
class DprW100Doc(TextItem):
    """Text item for a DPR-W100 document: a body text and a title."""

    # Body of the document
    text: str
    # Title of the document
    title: str
134+
135+
@define
class MsMarcoV2Passage(TextItem):
    """Text item for an MS MARCO v2 passage."""

    # Text of the passage
    text: str
    # Pairs of integers -- presumably (start, end) offsets of the passage
    # inside its source document; TODO confirm against ir_datasets
    spans: Tuple[Tuple[int, int], ...]
    # Identifier of the MS MARCO document the passage is attached to
    msmarco_document_id: str


# NOTE: the ``@define`` decorator must stay attached to Touche2020; without
# it the class is a plain class whose annotated fields are not turned into
# attrs attributes (no generated ``__init__``).
@define
class Touche2020(TextItem):
    text: str
    title: str
@@ -194,3 +203,17 @@ class TrecTopic(SimpleTextItem):
194203

195204

196205
TrecTopicRecord = record_type(IDItem, TrecTopic)
206+
207+
@define
class DprW100Query(TextItem):
    """Text item for a DPR-W100 query: the question text and its answers."""

    # Text of the query
    text: str
    # Any number of answer strings (``Tuple[str]`` would mean exactly one)
    answers: Tuple[str, ...]
211+
212+
@define
class TrecBackgroundLinkingQuery(IDItem):
    """Query item holding a query ID, a document ID and a URL.

    The item carries no text of its own: :meth:`get_text` always raises.
    """

    # Identifier of the query
    query_id: str
    # Identifier of the document the query refers to
    doc_id: str
    # URL associated with the query
    url: str

    def get_text(self):
        """Not supported for this item.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError()

src/datamaestro_text/datasets/irds/data.py

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
SimpleAdhocAssessment,
3838
SimpleTextItem,
3939
TopicRecord,
40+
UrlItem,
4041
create_record,
4142
)
4243

@@ -165,6 +166,19 @@ class Documents(ir.DocumentStore, IRDSId):
165166
"source",
166167
"source_content_type",
167168
),
169+
_irds.dpr_w100.DprW100Doc: tuple_constructor(
170+
formats.DprW100Doc,
171+
"doc_id",
172+
"text",
173+
"title",
174+
),
175+
_irds.msmarco_passage_v2.MsMarcoV2Passage: tuple_constructor(
176+
formats.MsMarcoV2Passage,
177+
"doc_id",
178+
"text",
179+
"spans",
180+
"msmarco_document_id",
181+
),
168182
}
169183

170184
"""Wraps an ir datasets collection -- and provide a default text
@@ -385,6 +399,12 @@ class Topics(ir.TopicsStore, IRDSId):
385399
"tweet_time",
386400
"description",
387401
),
402+
_irds.dpr_w100.DprW100Query: tuple_constructor(
403+
formats.DprW100Query,
404+
"query_id",
405+
"text",
406+
"answers"
407+
),
388408
}
389409

390410
HANDLERS = {
@@ -415,7 +435,52 @@ def topic_ext(self, external_topic_id: str) -> TopicRecord:
415435
def iter(self) -> Iterator[TopicRecord]:
416436
"""Returns an iterator over topics"""
417437
return self.handler.iter()
438+
439+
class TrecBackgroundLinkingTopicsHandler(TopicsHandler):
    """Topics handler for TREC background-linking queries.

    Each query points to a document; the title of that document is used as
    the topic text, and the query URL is stored alongside it.
    """

    def __init__(self, dataset):
        """Keeps a reference to the wrapping dataset.

        :param dataset: object whose ``dataset`` attribute exposes
            ``queries_iter()`` and ``docs_store()``
        """
        self.dataset = dataset

    @cached_property
    def ext2records(self):
        """Maps each external topic ID to its record (computed once)"""
        return {record[IDItem].id: record for record in self.records}

    def topic_int(self, internal_topic_id: int) -> TopicRecord:
        """Returns a topic given its internal ID (its index in ``records``)"""
        return self.records[internal_topic_id]

    def topic_ext(self, external_topic_id: str) -> TopicRecord:
        """Returns a topic given its external ID

        Raises ``KeyError`` when the ID is unknown.
        """
        return self.ext2records[external_topic_id]

    def iter(self) -> Iterator[TopicRecord]:
        """Returns an iterator over topics"""
        return iter(self.records)

    @cached_property
    def records(self):
        """Builds the list of topic records (computed once, then cached)

        Any exception raised while building the records is logged and
        re-raised.
        """
        try:
            irds_dataset = self.dataset.dataset

            # Fetch the document store once instead of once per query: the
            # lookup object does not depend on the query being processed.
            docs_store = irds_dataset.docs_store()

            records = [
                Record(
                    IDItem(query.query_id),
                    # Following BEIR documentation, we use title of documents as queries: https://github.com/beir-cellar/beir/blob/main/examples/dataset/README.md#queries-and-qrels
                    SimpleTextItem(docs_store.get(query.doc_id).title),
                    UrlItem(query.url),
                )
                for query in irds_dataset.queries_iter()
            ]
        except Exception:
            logging.exception("Error while computing topic records")
            raise

        return records
477+
478+
479+
# Route TREC background-linking queries to their dedicated topics handler
Topics.HANDLERS[
    _irds.wapo.TrecBackgroundLinkingQuery
] = TrecBackgroundLinkingTopicsHandler
419484

420485
class CastTopicsHandler(TopicsHandler):
421486
def __init__(self, dataset):

0 commit comments

Comments
 (0)