Skip to content

Commit 6aed294

Browse files
authored
Merge pull request #3 from yzong12138/beir_adding
Update the code: add the structures for adapting the BEIR datasets
2 parents ae4dc7c + aab294f commit 6aed294

File tree

2 files changed

+36
-1
lines changed

2 files changed

+36
-1
lines changed

src/datamaestro_text/data/ir/formats.py

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from functools import cached_property
2-
from typing import ClassVar, Tuple
2+
from typing import ClassVar, Tuple, List
33
from attrs import define
44
from datamaestro.record import record_type
55
from ir_datasets.datasets.wapo import WapoDocMedia
@@ -126,6 +126,22 @@ class OrConvQADocument(TextItem):
126126
def text(self):
127127
return f"{self.title} {self.body}"
128128

129+
@define
130+
class Touche2020(TextItem):
131+
text: str
132+
title: str
133+
stance: str
134+
url: str
135+
136+
@define
137+
class SciDocs(TextItem):
138+
text: str
139+
title: str
140+
authors: List[str]
141+
year: int
142+
cited_by: List[str]
143+
references: List[str]
144+
129145

130146
@define
131147
class UrlTopic(TextItem):
@@ -159,6 +175,13 @@ class TrecMb14Query(TextItem):
159175
def get_text(self):
160176
return f"{self.query}"
161177

178+
@define
179+
class SciDocsTopic(TextItem):
180+
text: str
181+
authors: List[str]
182+
year: int
183+
cited_by: List[str]
184+
references: List[str]
162185

163186
@define()
164187
class TrecTopic(SimpleTextItem):

src/datamaestro_text/datasets/irds/data.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,12 @@ class Documents(ir.DocumentStore, IRDSId):
110110
_irds.beir.BeirTitleUrlDoc: tuple_constructor(
111111
formats.TitleUrlDocument, "doc_id", "text", "title", "url"
112112
),
113+
_irds.beir.BeirToucheDoc: tuple_constructor(
114+
formats.Touche2020, "doc_id", "text", "title", "stance", "url"
115+
),
116+
_irds.beir.BeirSciDoc: tuple_constructor(
117+
formats.SciDocs, "doc_id", "text", "title", "authors", "year", "cited_by", "references"
118+
),
113119
_irds.msmarco_document.MsMarcoDocument: tuple_constructor(
114120
formats.MsMarcoDocument, "doc_id", "url", "title", "body"
115121
),
@@ -355,6 +361,12 @@ class Topics(ir.TopicsStore, IRDSId):
355361
TrecQuery: tuple_constructor(
356362
formats.TrecTopic, "query_id", "title", "description", "narrative"
357363
),
364+
_irds.beir.BeirToucheQuery: tuple_constructor(
365+
formats.TrecTopic, "query_id", "text", "description", "narrative"
366+
),
367+
_irds.beir.BeirSciQuery: tuple_constructor(
368+
formats.SciDocsTopic, "query_id", "text", "authors", "year", "cited_by", "references"
369+
),
358370
_irds.tweets2013_ia.TrecMb13Query: tuple_constructor(
359371
formats.TrecMb13Query, "query_id", "query", "time", "tweet_time"
360372
),

0 commit comments

Comments
 (0)