Skip to content

Commit 2881d26

Browse files
bpiwowar and claude
committed
refactor: migrate dataset definitions to new Dataset class pattern
- Convert all dataset definitions to use the new class-based Dataset pattern with `def config(self) -> DataType:` method
- Use relative IDs with leading dots (e.g., ".1.topics" instead of "1.topics")
- Handle dynamic resource creation in Clueweb22 and Content classes
- Bump datamaestro dependency to >=1.8.0 for Dataset class support

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 122f81b commit 2881d26

File tree

21 files changed

+563
-667
lines changed

21 files changed

+563
-667
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ classifiers = [
2626
requires-python = ">=3.10"
2727
dynamic = ["version"]
2828
dependencies = [
29-
"datamaestro>=1.7.3",
29+
"datamaestro>=1.8.0",
3030
"ir_datasets>=0.5.8",
3131
"attrs",
3232
"experimaestro",

src/datamaestro_text/config/com/github/aagohary/canard.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from datamaestro.definitions import datatasks, datatags, dataset
1+
from datamaestro.definitions import Dataset, datatasks, datatags, dataset
22
from datamaestro.download.single import FileDownloader
33
from datamaestro.utils import HashCheck
44

@@ -9,7 +9,7 @@
99
@datatags("conversation", "context", "query")
1010
@datatasks("query rewriting")
1111
@dataset(url="https://sites.google.com/view/qanta/projects/canard", id="")
12-
class Main(Supervised):
12+
class Main(Dataset):
1313
"""Question-in-context rewriting
1414
1515
CANARD is a dataset for question-in-context rewriting that consists of
@@ -38,10 +38,9 @@ class Main(Supervised):
3838
checker=HashCheck("3fc14d0078e7a5056f5da571728f024e"),
3939
)
4040

41-
@classmethod
42-
def __create_dataset__(cls, dataset):
43-
return cls.C(
44-
train=CanardDataset.C(path=cls.TRAIN.path),
45-
validation=CanardDataset.C(path=cls.DEV.path),
46-
test=CanardDataset.C(path=cls.TEST.path),
41+
def config(self) -> Supervised:
42+
return Supervised.C(
43+
train=CanardDataset.C(path=self.TRAIN.path),
44+
validation=CanardDataset.C(path=self.DEV.path),
45+
test=CanardDataset.C(path=self.TEST.path),
4746
)

src/datamaestro_text/config/com/github/apple/ml-qrecc.py

Lines changed: 21 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import re
44
import json
55
from pathlib import Path
6-
from datamaestro.definitions import datatasks, datatags, dataset
6+
from datamaestro.definitions import Dataset, datatasks, datatags, dataset
77
from datamaestro.data.ml import Supervised
88
from datamaestro.download import reference
99
from datamaestro.download.archive import ZipDownloader
@@ -24,7 +24,7 @@
2424
doi="https://doi.org/10.48550/arXiv.2010.04898",
2525
id="",
2626
)
27-
class Main(Supervised):
27+
class Main(Dataset):
2828
"""Open-Domain Question Answering Goes Conversational via Question Rewriting
2929
3030
We introduce QReCC (Question Rewriting in Conversational Context), an
@@ -41,36 +41,37 @@ class Main(Supervised):
4141
checker=HashCheck("f88fcc7ef3678cd6312080389c8abd67"),
4242
)
4343

44-
@classmethod
45-
def __create_dataset__(cls, dataset):
44+
def config(self) -> Supervised:
4645
return Supervised.C(
47-
train=QReCCDataset.C(path=cls.DATA.path / "qrecc_train.json"),
48-
test=QReCCDataset.C(path=cls.DATA.path / "qrecc_test.json"),
46+
train=QReCCDataset.C(path=self.DATA.path / "qrecc_train.json"),
47+
test=QReCCDataset.C(path=self.DATA.path / "qrecc_test.json"),
4948
)
5049

5150

5251
@dataset(
5352
url="https://github.com/apple/ml-qrecc",
5453
doi="https://doi.org/10.48550/arXiv.2010.04898",
5554
)
56-
class Content(LZ4JSONLDocumentStore):
55+
class Content(Dataset):
5756
"""QReCC mentionned URLs content"""
5857

59-
@staticmethod
60-
def __create_dataset__(dataset, options=None):
61-
ds = reference(reference=Main).setup(dataset, options)
62-
documents_path = wayback_documents(
63-
"20191127", lambda: Content._urls(ds), name="wayback.jsonl"
64-
).setup(dataset, options)
58+
MAIN = reference(reference=Main)
59+
60+
WAYBACK_DOCS = wayback_documents(
61+
"20191127",
62+
lambda: Content._urls(Content.MAIN.prepare()),
63+
name="wayback.jsonl",
64+
)
6565

66-
store_path = lz4docstore_builder(
67-
"store",
68-
lambda: Content._documents(documents_path),
69-
SimpleJsonDocument,
70-
"id",
71-
).setup(dataset, options)
66+
STORE = lz4docstore_builder(
67+
"store",
68+
lambda: Content._documents(Content.WAYBACK_DOCS.path),
69+
SimpleJsonDocument,
70+
"id",
71+
)
7272

73-
return Content.C(jsonl_path=store_path)
73+
def config(self) -> LZ4JSONLDocumentStore:
74+
return LZ4JSONLDocumentStore.C(jsonl_path=self.STORE.path)
7475

7576
@staticmethod
7677
def _documents(path: Path):
Lines changed: 33 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# See documentation on https://datamaestro.readthedocs.io
22

33
from datamaestro.download import reference
4-
from datamaestro.definitions import datatasks, datatags, dataset
4+
from datamaestro.definitions import Dataset, datatasks, datatags, dataset
55
from datamaestro_text.data.conversation.base import ConversationUserTopics
66
from datamaestro_text.data.ir import Adhoc
77

@@ -16,72 +16,73 @@
1616
from datamaestro_text.datasets.irds.helpers import lz4docstore_builder
1717

1818

19-
@dataset(as_prepare=True)
20-
def clueweb22(dataset, options=None) -> IKatClueWeb22DocumentStore:
19+
@dataset()
20+
class Clueweb22(Dataset):
2121
# Number of documents in the dataset
2222
count = 116_838_987
2323

24-
jsonl_folder = linkfolder(
24+
JSONL_FOLDER = linkfolder(
2525
"documents", [DatafolderPath("gov.nist.trec.ikat.clueweb22", "jsonl")]
26-
).setup(dataset, options)
27-
store_path = lz4docstore_builder(
26+
)
27+
28+
STORE_PATH = lz4docstore_builder(
2829
"store",
2930
IKatClueWeb22DocumentStore.generator(
30-
jsonl_folder,
31-
jsonl_folder / "ikat_2023_passages_jsonl.sha256sums",
32-
jsonl_folder / "ikat_2023_passages_hashes.tsv.bz2",
31+
JSONL_FOLDER,
32+
"ikat_2023_passages_jsonl.sha256sums",
33+
"ikat_2023_passages_hashes.tsv.bz2",
3334
),
3435
IKatClueWeb22DocumentStore.Document,
3536
"id",
3637
count_hint=count,
37-
).setup(dataset, options)
38+
)
3839

39-
return IKatClueWeb22DocumentStore.C(path=store_path, count=count)
40+
def config(self) -> IKatClueWeb22DocumentStore:
41+
return IKatClueWeb22DocumentStore.C(path=self.STORE_PATH.path, count=self.count)
4042

4143

4244
@datatags("conversation", "context", "query")
4345
@datatasks("conversational search", "query rewriting")
4446
@dataset(
45-
id="2025",
47+
id=".2025",
4648
url="https://github.com/irlabamsterdam/iKAT/tree/main/2025",
4749
)
48-
class Test2025(Adhoc):
50+
class Test2025(Dataset):
4951
"""Question-in-context rewriting
5052
5153
iKAT is a test dataset for question-in-context rewriting that consists of
5254
questions each given in a dialog context together with a context-independent
5355
rewriting of the question.
5456
"""
5557

56-
DOCUMENTS = reference(varname="documents", reference=clueweb22)
58+
DOCUMENTS = reference(varname="documents", reference=Clueweb22)
5759
TOPICS = FileDownloader(
5860
"topics.json",
5961
"https://raw.githubusercontent.com/irlabamsterdam/iKAT/refs/heads/main/2025/data/2025_test_topics.json",
6062
checker=HashCheck("16f8444a8d0a8dfe0090f478f185a63c"),
6163
)
6264

63-
@classmethod
64-
def __create_dataset__(cls, dataset):
65+
def config(self) -> Adhoc:
6566
return Adhoc.C(
6667
topics=ConversationUserTopics.C(
67-
conversations=IkatConversations.C(path=cls.TOPICS.path)
68+
conversations=IkatConversations.C(path=self.TOPICS.path)
6869
),
6970
# TODO: add when available
7071
assessments=TrecAdhocAssessments.C(path="/to/do"),
71-
documents=cls.DOCUMENTS.prepare(),
72+
documents=self.DOCUMENTS.prepare(),
7273
)
7374

7475

7576
@datatags("conversation", "context", "query")
7677
@datatasks("conversational search", "query rewriting")
7778
@dataset(
78-
id="2024",
79+
id=".2024",
7980
url="https://github.com/irlabamsterdam/iKAT/tree/main/2024",
8081
)
81-
class Test2024(Adhoc):
82+
class Test2024(Dataset):
8283
"""iKAT 2024 dataset"""
8384

84-
DOCUMENTS = reference(varname="documents", reference=clueweb22)
85+
DOCUMENTS = reference(varname="documents", reference=Clueweb22)
8586
QRELS = FileDownloader(
8687
"qrels",
8788
"https://trec.nist.gov/data/ikat/2024-qrels.txt",
@@ -93,27 +94,26 @@ class Test2024(Adhoc):
9394
checker=HashCheck("ad45bc6e7add2081d69ea60a0a4d1203"),
9495
)
9596

96-
@classmethod
97-
def __create_dataset__(cls, dataset):
97+
def config(self) -> Adhoc:
9898
return Adhoc.C(
9999
topics=ConversationUserTopics.C(
100-
conversations=IkatConversations.C(path=cls.TOPICS.path)
100+
conversations=IkatConversations.C(path=self.TOPICS.path)
101101
),
102-
assessments=TrecAdhocAssessments.C(path=cls.QRELS.path),
103-
documents=cls.DOCUMENTS.prepare(),
102+
assessments=TrecAdhocAssessments.C(path=self.QRELS.path),
103+
documents=self.DOCUMENTS.prepare(),
104104
)
105105

106106

107107
@datatags("conversation", "context", "query")
108108
@datatasks("conversational search", "query rewriting")
109109
@dataset(
110-
id="2023",
110+
id=".2023",
111111
url="https://github.com/irlabamsterdam/iKAT/tree/main/2023",
112112
)
113-
class Test2023(Adhoc):
113+
class Test2023(Dataset):
114114
"""iKAT 2023 dataset"""
115115

116-
DOCUMENTS = reference(varname="documents", reference=clueweb22)
116+
DOCUMENTS = reference(varname="documents", reference=Clueweb22)
117117
QRELS = FileDownloader(
118118
"qrels",
119119
"https://trec.nist.gov/data/ikat/2023-qrels.all-turns.txt",
@@ -125,12 +125,11 @@ class Test2023(Adhoc):
125125
checker=HashCheck("684fa0197cdec8c3cfb6a2e586ab83f6"),
126126
)
127127

128-
@classmethod
129-
def __create_dataset__(cls, dataset):
128+
def config(self) -> Adhoc:
130129
return Adhoc.C(
131130
topics=ConversationUserTopics.C(
132-
conversations=IkatConversations.C(path=cls.TOPICS.path)
131+
conversations=IkatConversations.C(path=self.TOPICS.path)
133132
),
134-
assessments=TrecAdhocAssessments.C(path=cls.QRELS.path),
135-
documents=cls.DOCUMENTS.prepare(),
133+
assessments=TrecAdhocAssessments.C(path=self.QRELS.path),
134+
documents=self.DOCUMENTS.prepare(),
136135
)

src/datamaestro_text/config/com/github/prdwb/orconvqa.py

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import json
55
from pathlib import Path
66
from typing import Iterator
7-
from datamaestro.definitions import datatasks, datatags, dataset
7+
from datamaestro.definitions import Dataset, datatasks, datatags, dataset
88
from datamaestro.download.single import FileDownloader
99
from datamaestro.utils import HashCheck
1010

@@ -21,7 +21,7 @@
2121
@dataset(
2222
url="https://github.com/prdwb/orconvqa-release",
2323
)
24-
class Preprocessed(Supervised):
24+
class Preprocessed(Dataset):
2525
"""Open-Retrieval Conversational Question Answering datasets
2626
2727
OrConvQA is an aggregation of three existing datasets:
@@ -49,12 +49,11 @@ class Preprocessed(Supervised):
4949
checker=HashCheck("0cf3a755f06297b9c02e7db45f8dc8be"),
5050
)
5151

52-
@classmethod
53-
def __create_dataset__(cls, dataset):
54-
return cls.C(
55-
train=OrConvQADataset.C(path=cls.TRAIN.path),
56-
validation=OrConvQADataset.C(path=cls.DEV.path),
57-
test=OrConvQADataset.C(path=cls.TEST.path),
52+
def config(self) -> Supervised:
53+
return Supervised.C(
54+
train=OrConvQADataset.C(path=self.TRAIN.path),
55+
validation=OrConvQADataset.C(path=self.DEV.path),
56+
test=OrConvQADataset.C(path=self.TEST.path),
5857
)
5958

6059

@@ -69,7 +68,7 @@ def orConvQADocumentReader(source: Path) -> Iterator[OrConvQADocumentStore.NAMED
6968
@dataset(
7069
url="https://github.com/prdwb/orconvqa-release",
7170
)
72-
class Passages(OrConvQADocumentStore):
71+
class Passages(Dataset):
7372
"""orConvQA wikipedia files
7473
7574
OrConvQA is an aggregation of three existing datasets:
@@ -90,6 +89,5 @@ class Passages(OrConvQADocumentStore):
9089
count_hint=11_377_951,
9190
)
9291

93-
@classmethod
94-
def __create_dataset__(cls, dataset):
95-
return cls.C(path=cls.ALL_BLOCKS.path, count=11_377_951)
92+
def config(self) -> OrConvQADocumentStore:
93+
return OrConvQADocumentStore.C(path=self.ALL_BLOCKS.path, count=11_377_951)

0 commit comments

Comments
 (0)