Skip to content

Commit ebf1881

Browse files
committed
Merge branch 'master' of github.com:experimaestro/datamaestro_text
2 parents 4853cb7 + 6aed294 commit ebf1881

File tree

2 files changed

+36
-1
lines changed

2 files changed

+36
-1
lines changed

src/datamaestro_text/data/ir/formats.py

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from functools import cached_property
2-
from typing import ClassVar, Tuple
2+
from typing import ClassVar, Tuple, List
33
from attrs import define
44
from datamaestro.record import record_type
55
from ir_datasets.datasets.wapo import WapoDocMedia
@@ -126,6 +126,22 @@ class OrConvQADocument(TextItem):
126126
def text(self):
    """Return the title and the body separated by a single space."""
    combined = f"{self.title} {self.body}"
    return combined
128128

129+
@define
class Touche2020(TextItem):
    """Text item holding the text, title, stance and URL of one record.

    Field order matters: instances are built positionally from source
    records, so new fields must be appended rather than inserted.
    """

    # Main textual content of the record.
    text: str
    # Title associated with the text.
    title: str
    # Stance label as given by the source record (value set not visible
    # here -- confirm against the upstream dataset definition).
    stance: str
    # URL associated with the record.
    url: str
135+
136+
@define
class SciDocs(TextItem):
    """Text item holding a text and title plus bibliographic fields.

    Field order matters: instances are built positionally from source
    records, so new fields must be appended rather than inserted.
    """

    # Main textual content of the record.
    text: str
    # Title associated with the text.
    title: str
    # Author entries, one string per author.
    authors: List[str]
    # Year attached to the record.
    year: int
    # Presumably identifiers of the documents citing this one -- confirm
    # against the upstream dataset definition.
    cited_by: List[str]
    # Presumably identifiers of the documents this one cites -- confirm
    # against the upstream dataset definition.
    references: List[str]
144+
129145

130146
@define
131147
class UrlTopic(TextItem):
@@ -159,6 +175,13 @@ class TrecMb14Query(TextItem):
159175
def get_text(self):
    """Return the ``query`` attribute formatted as a string."""
    rendered = f"{self.query}"
    return rendered
161177

178+
@define
class SciDocsTopic(TextItem):
    """Text item holding a topic text plus bibliographic fields.

    Field order matters: instances are built positionally from source
    records, so new fields must be appended rather than inserted.
    """

    # Textual content of the topic.
    text: str
    # Author entries, one string per author.
    authors: List[str]
    # Year attached to the record.
    year: int
    # Presumably identifiers of the documents citing this one -- confirm
    # against the upstream dataset definition.
    cited_by: List[str]
    # Presumably identifiers of the documents this one cites -- confirm
    # against the upstream dataset definition.
    references: List[str]
162185

163186
@define()
164187
class TrecTopic(SimpleTextItem):

src/datamaestro_text/datasets/irds/data.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,12 @@ class Documents(ir.DocumentStore, IRDSId):
117117
_irds.beir.BeirTitleUrlDoc: tuple_constructor(
118118
formats.TitleUrlDocument, "doc_id", "text", "title", "url"
119119
),
120+
_irds.beir.BeirToucheDoc: tuple_constructor(
121+
formats.Touche2020, "doc_id", "text", "title", "stance", "url"
122+
),
123+
_irds.beir.BeirSciDoc: tuple_constructor(
124+
formats.SciDocs, "doc_id", "text", "title", "authors", "year", "cited_by", "references"
125+
),
120126
_irds.msmarco_document.MsMarcoDocument: tuple_constructor(
121127
formats.MsMarcoDocument, "doc_id", "url", "title", "body"
122128
),
@@ -362,6 +368,12 @@ class Topics(ir.TopicsStore, IRDSId):
362368
TrecQuery: tuple_constructor(
363369
formats.TrecTopic, "query_id", "title", "description", "narrative"
364370
),
371+
_irds.beir.BeirToucheQuery: tuple_constructor(
372+
formats.TrecTopic, "query_id", "text", "description", "narrative"
373+
),
374+
_irds.beir.BeirSciQuery: tuple_constructor(
375+
formats.SciDocsTopic, "query_id", "text", "authors", "year", "cited_by", "references"
376+
),
365377
_irds.tweets2013_ia.TrecMb13Query: tuple_constructor(
366378
formats.TrecMb13Query, "query_id", "query", "time", "tweet_time"
367379
),

0 commit comments

Comments
 (0)