Skip to content

Commit 2881d26

Browse files
bpiwowar and claude
committed
refactor: migrate dataset definitions to new Dataset class pattern
- Convert all dataset definitions to use the new class-based Dataset pattern with `def config(self) -> DataType:` method
- Use relative IDs with leading dots (e.g., ".1.topics" instead of "1.topics")
- Handle dynamic resource creation in Clueweb22 and Content classes
- Bump datamaestro dependency to >=1.8.0 for Dataset class support

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 122f81b commit 2881d26

File tree

21 files changed

+563
-667
lines changed

21 files changed

+563
-667
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ classifiers = [
2626
requires-python = ">=3.10"
2727
dynamic = ["version"]
2828
dependencies = [
29-
"datamaestro>=1.7.3",
29+
"datamaestro>=1.8.0",
3030
"ir_datasets>=0.5.8",
3131
"attrs",
3232
"experimaestro",

src/datamaestro_text/config/com/github/aagohary/canard.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from datamaestro.definitions import datatasks, datatags, dataset
1+
from datamaestro.definitions import Dataset, datatasks, datatags, dataset
22
from datamaestro.download.single import FileDownloader
33
from datamaestro.utils import HashCheck
44

@@ -9,7 +9,7 @@
99
@datatags("conversation", "context", "query")
1010
@datatasks("query rewriting")
1111
@dataset(url="https://sites.google.com/view/qanta/projects/canard", id="")
12-
class Main(Supervised):
12+
class Main(Dataset):
1313
"""Question-in-context rewriting
1414
1515
CANARD is a dataset for question-in-context rewriting that consists of
@@ -38,10 +38,9 @@ class Main(Supervised):
3838
checker=HashCheck("3fc14d0078e7a5056f5da571728f024e"),
3939
)
4040

41-
@classmethod
42-
def __create_dataset__(cls, dataset):
43-
return cls.C(
44-
train=CanardDataset.C(path=cls.TRAIN.path),
45-
validation=CanardDataset.C(path=cls.DEV.path),
46-
test=CanardDataset.C(path=cls.TEST.path),
41+
def config(self) -> Supervised:
42+
return Supervised.C(
43+
train=CanardDataset.C(path=self.TRAIN.path),
44+
validation=CanardDataset.C(path=self.DEV.path),
45+
test=CanardDataset.C(path=self.TEST.path),
4746
)

src/datamaestro_text/config/com/github/apple/ml-qrecc.py

Lines changed: 21 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import re
44
import json
55
from pathlib import Path
6-
from datamaestro.definitions import datatasks, datatags, dataset
6+
from datamaestro.definitions import Dataset, datatasks, datatags, dataset
77
from datamaestro.data.ml import Supervised
88
from datamaestro.download import reference
99
from datamaestro.download.archive import ZipDownloader
@@ -24,7 +24,7 @@
2424
doi="https://doi.org/10.48550/arXiv.2010.04898",
2525
id="",
2626
)
27-
class Main(Supervised):
27+
class Main(Dataset):
2828
"""Open-Domain Question Answering Goes Conversational via Question Rewriting
2929
3030
We introduce QReCC (Question Rewriting in Conversational Context), an
@@ -41,36 +41,37 @@ class Main(Supervised):
4141
checker=HashCheck("f88fcc7ef3678cd6312080389c8abd67"),
4242
)
4343

44-
@classmethod
45-
def __create_dataset__(cls, dataset):
44+
def config(self) -> Supervised:
4645
return Supervised.C(
47-
train=QReCCDataset.C(path=cls.DATA.path / "qrecc_train.json"),
48-
test=QReCCDataset.C(path=cls.DATA.path / "qrecc_test.json"),
46+
train=QReCCDataset.C(path=self.DATA.path / "qrecc_train.json"),
47+
test=QReCCDataset.C(path=self.DATA.path / "qrecc_test.json"),
4948
)
5049

5150

5251
@dataset(
5352
url="https://github.com/apple/ml-qrecc",
5453
doi="https://doi.org/10.48550/arXiv.2010.04898",
5554
)
56-
class Content(LZ4JSONLDocumentStore):
55+
class Content(Dataset):
5756
"""QReCC mentionned URLs content"""
5857

59-
@staticmethod
60-
def __create_dataset__(dataset, options=None):
61-
ds = reference(reference=Main).setup(dataset, options)
62-
documents_path = wayback_documents(
63-
"20191127", lambda: Content._urls(ds), name="wayback.jsonl"
64-
).setup(dataset, options)
58+
MAIN = reference(reference=Main)
59+
60+
WAYBACK_DOCS = wayback_documents(
61+
"20191127",
62+
lambda: Content._urls(Content.MAIN.prepare()),
63+
name="wayback.jsonl",
64+
)
6565

66-
store_path = lz4docstore_builder(
67-
"store",
68-
lambda: Content._documents(documents_path),
69-
SimpleJsonDocument,
70-
"id",
71-
).setup(dataset, options)
66+
STORE = lz4docstore_builder(
67+
"store",
68+
lambda: Content._documents(Content.WAYBACK_DOCS.path),
69+
SimpleJsonDocument,
70+
"id",
71+
)
7272

73-
return Content.C(jsonl_path=store_path)
73+
def config(self) -> LZ4JSONLDocumentStore:
74+
return LZ4JSONLDocumentStore.C(jsonl_path=self.STORE.path)
7475

7576
@staticmethod
7677
def _documents(path: Path):
Lines changed: 33 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# See documentation on https://datamaestro.readthedocs.io
22

33
from datamaestro.download import reference
4-
from datamaestro.definitions import datatasks, datatags, dataset
4+
from datamaestro.definitions import Dataset, datatasks, datatags, dataset
55
from datamaestro_text.data.conversation.base import ConversationUserTopics
66
from datamaestro_text.data.ir import Adhoc
77

@@ -16,72 +16,73 @@
1616
from datamaestro_text.datasets.irds.helpers import lz4docstore_builder
1717

1818

19-
@dataset(as_prepare=True)
20-
def clueweb22(dataset, options=None) -> IKatClueWeb22DocumentStore:
19+
@dataset()
20+
class Clueweb22(Dataset):
2121
# Number of documents in the dataset
2222
count = 116_838_987
2323

24-
jsonl_folder = linkfolder(
24+
JSONL_FOLDER = linkfolder(
2525
"documents", [DatafolderPath("gov.nist.trec.ikat.clueweb22", "jsonl")]
26-
).setup(dataset, options)
27-
store_path = lz4docstore_builder(
26+
)
27+
28+
STORE_PATH = lz4docstore_builder(
2829
"store",
2930
IKatClueWeb22DocumentStore.generator(
30-
jsonl_folder,
31-
jsonl_folder / "ikat_2023_passages_jsonl.sha256sums",
32-
jsonl_folder / "ikat_2023_passages_hashes.tsv.bz2",
31+
JSONL_FOLDER,
32+
"ikat_2023_passages_jsonl.sha256sums",
33+
"ikat_2023_passages_hashes.tsv.bz2",
3334
),
3435
IKatClueWeb22DocumentStore.Document,
3536
"id",
3637
count_hint=count,
37-
).setup(dataset, options)
38+
)
3839

39-
return IKatClueWeb22DocumentStore.C(path=store_path, count=count)
40+
def config(self) -> IKatClueWeb22DocumentStore:
41+
return IKatClueWeb22DocumentStore.C(path=self.STORE_PATH.path, count=self.count)
4042

4143

4244
@datatags("conversation", "context", "query")
4345
@datatasks("conversational search", "query rewriting")
4446
@dataset(
45-
id="2025",
47+
id=".2025",
4648
url="https://github.com/irlabamsterdam/iKAT/tree/main/2025",
4749
)
48-
class Test2025(Adhoc):
50+
class Test2025(Dataset):
4951
"""Question-in-context rewriting
5052
5153
iKAT is a test dataset for question-in-context rewriting that consists of
5254
questions each given in a dialog context together with a context-independent
5355
rewriting of the question.
5456
"""
5557

56-
DOCUMENTS = reference(varname="documents", reference=clueweb22)
58+
DOCUMENTS = reference(varname="documents", reference=Clueweb22)
5759
TOPICS = FileDownloader(
5860
"topics.json",
5961
"https://raw.githubusercontent.com/irlabamsterdam/iKAT/refs/heads/main/2025/data/2025_test_topics.json",
6062
checker=HashCheck("16f8444a8d0a8dfe0090f478f185a63c"),
6163
)
6264

63-
@classmethod
64-
def __create_dataset__(cls, dataset):
65+
def config(self) -> Adhoc:
6566
return Adhoc.C(
6667
topics=ConversationUserTopics.C(
67-
conversations=IkatConversations.C(path=cls.TOPICS.path)
68+
conversations=IkatConversations.C(path=self.TOPICS.path)
6869
),
6970
# TODO: add when available
7071
assessments=TrecAdhocAssessments.C(path="/to/do"),
71-
documents=cls.DOCUMENTS.prepare(),
72+
documents=self.DOCUMENTS.prepare(),
7273
)
7374

7475

7576
@datatags("conversation", "context", "query")
7677
@datatasks("conversational search", "query rewriting")
7778
@dataset(
78-
id="2024",
79+
id=".2024",
7980
url="https://github.com/irlabamsterdam/iKAT/tree/main/2024",
8081
)
81-
class Test2024(Adhoc):
82+
class Test2024(Dataset):
8283
"""iKAT 2024 dataset"""
8384

84-
DOCUMENTS = reference(varname="documents", reference=clueweb22)
85+
DOCUMENTS = reference(varname="documents", reference=Clueweb22)
8586
QRELS = FileDownloader(
8687
"qrels",
8788
"https://trec.nist.gov/data/ikat/2024-qrels.txt",
@@ -93,27 +94,26 @@ class Test2024(Adhoc):
9394
checker=HashCheck("ad45bc6e7add2081d69ea60a0a4d1203"),
9495
)
9596

96-
@classmethod
97-
def __create_dataset__(cls, dataset):
97+
def config(self) -> Adhoc:
9898
return Adhoc.C(
9999
topics=ConversationUserTopics.C(
100-
conversations=IkatConversations.C(path=cls.TOPICS.path)
100+
conversations=IkatConversations.C(path=self.TOPICS.path)
101101
),
102-
assessments=TrecAdhocAssessments.C(path=cls.QRELS.path),
103-
documents=cls.DOCUMENTS.prepare(),
102+
assessments=TrecAdhocAssessments.C(path=self.QRELS.path),
103+
documents=self.DOCUMENTS.prepare(),
104104
)
105105

106106

107107
@datatags("conversation", "context", "query")
108108
@datatasks("conversational search", "query rewriting")
109109
@dataset(
110-
id="2023",
110+
id=".2023",
111111
url="https://github.com/irlabamsterdam/iKAT/tree/main/2023",
112112
)
113-
class Test2023(Adhoc):
113+
class Test2023(Dataset):
114114
"""iKAT 2023 dataset"""
115115

116-
DOCUMENTS = reference(varname="documents", reference=clueweb22)
116+
DOCUMENTS = reference(varname="documents", reference=Clueweb22)
117117
QRELS = FileDownloader(
118118
"qrels",
119119
"https://trec.nist.gov/data/ikat/2023-qrels.all-turns.txt",
@@ -125,12 +125,11 @@ class Test2023(Adhoc):
125125
checker=HashCheck("684fa0197cdec8c3cfb6a2e586ab83f6"),
126126
)
127127

128-
@classmethod
129-
def __create_dataset__(cls, dataset):
128+
def config(self) -> Adhoc:
130129
return Adhoc.C(
131130
topics=ConversationUserTopics.C(
132-
conversations=IkatConversations.C(path=cls.TOPICS.path)
131+
conversations=IkatConversations.C(path=self.TOPICS.path)
133132
),
134-
assessments=TrecAdhocAssessments.C(path=cls.QRELS.path),
135-
documents=cls.DOCUMENTS.prepare(),
133+
assessments=TrecAdhocAssessments.C(path=self.QRELS.path),
134+
documents=self.DOCUMENTS.prepare(),
136135
)

src/datamaestro_text/config/com/github/prdwb/orconvqa.py

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import json
55
from pathlib import Path
66
from typing import Iterator
7-
from datamaestro.definitions import datatasks, datatags, dataset
7+
from datamaestro.definitions import Dataset, datatasks, datatags, dataset
88
from datamaestro.download.single import FileDownloader
99
from datamaestro.utils import HashCheck
1010

@@ -21,7 +21,7 @@
2121
@dataset(
2222
url="https://github.com/prdwb/orconvqa-release",
2323
)
24-
class Preprocessed(Supervised):
24+
class Preprocessed(Dataset):
2525
"""Open-Retrieval Conversational Question Answering datasets
2626
2727
OrConvQA is an aggregation of three existing datasets:
@@ -49,12 +49,11 @@ class Preprocessed(Supervised):
4949
checker=HashCheck("0cf3a755f06297b9c02e7db45f8dc8be"),
5050
)
5151

52-
@classmethod
53-
def __create_dataset__(cls, dataset):
54-
return cls.C(
55-
train=OrConvQADataset.C(path=cls.TRAIN.path),
56-
validation=OrConvQADataset.C(path=cls.DEV.path),
57-
test=OrConvQADataset.C(path=cls.TEST.path),
52+
def config(self) -> Supervised:
53+
return Supervised.C(
54+
train=OrConvQADataset.C(path=self.TRAIN.path),
55+
validation=OrConvQADataset.C(path=self.DEV.path),
56+
test=OrConvQADataset.C(path=self.TEST.path),
5857
)
5958

6059

@@ -69,7 +68,7 @@ def orConvQADocumentReader(source: Path) -> Iterator[OrConvQADocumentStore.NAMED
6968
@dataset(
7069
url="https://github.com/prdwb/orconvqa-release",
7170
)
72-
class Passages(OrConvQADocumentStore):
71+
class Passages(Dataset):
7372
"""orConvQA wikipedia files
7473
7574
OrConvQA is an aggregation of three existing datasets:
@@ -90,6 +89,5 @@ class Passages(OrConvQADocumentStore):
9089
count_hint=11_377_951,
9190
)
9291

93-
@classmethod
94-
def __create_dataset__(cls, dataset):
95-
return cls.C(path=cls.ALL_BLOCKS.path, count=11_377_951)
92+
def config(self) -> OrConvQADocumentStore:
93+
return OrConvQADocumentStore.C(path=self.ALL_BLOCKS.path, count=11_377_951)

0 commit comments

Comments
 (0)