Updates for new datamaestro version

bpiwowar · bpiwowar · commit 429d754090f5 · 2025-04-03T15:57:52.000+02:00
diff --git a/src/datamaestro_text/config/com/microsoft/msmarco/passage.py b/src/datamaestro_text/config/com/microsoft/msmarco/passage.py
@@ -1,11 +1,11 @@
 """MS MARCO (Microsoft Machine Reading Comprehension) is a large scale dataset focused on machine reading comprehension, question answering, and passage ranking. A variant of this task will be the part of TREC and AFIRM 2019. For Updates about TREC 2019 please follow This Repository Passage Reranking task Task Given a query q and a the 1000 most relevant passages P = p1, p2, p3,... p1000, as retrieved by BM25 a succeful system is expected to rerank the most relevant passage as high as possible. For this task not all 1000 relevant items have a human labeled relevant passage. Evaluation will be done using MRR.
 
-  **Publication**:
-  Tri Nguyen, Mir Rosenberg, Xia Song, Jianfeng Gao, Saurabh Tiwary, RanganMajumder, and Li Deng. 2016.
-  MS MARCO: A Human Generated MAchineReading COmprehension Dataset. In CoCo@NIPS.
+**Publication**:
+Tri Nguyen, Mir Rosenberg, Xia Song, Jianfeng Gao, Saurabh Tiwary, RanganMajumder, and Li Deng. 2016.
+MS MARCO: A Human Generated MAchineReading COmprehension Dataset. In CoCo@NIPS.
 
 
-  See [https://github.com/microsoft/MSMARCO-Passage-Ranking](https://github.com/microsoft/MSMARCO-Passage-Ranking) for more details
+See [https://github.com/microsoft/MSMARCO-Passage-Ranking](https://github.com/microsoft/MSMARCO-Passage-Ranking) for more details
 """
 
 from datamaestro.annotations.agreement import useragreement
@@ -35,6 +35,7 @@
 
 # --- Document collection
 
+
 # TODO: Not ideal since it would be better to have small versions right away
 # instead of downloading again the MS Marco Collection
 @lua
@@ -43,10 +44,10 @@
     url="https://msmarco.blob.core.windows.net/msmarcoranking/collectionandqueries.tar.gz",
     checker=HashCheck("31644046b18952c1386cd4564ba2ae69", md5),
 )
-@dataset(Folder, url="https://github.com/microsoft/MSMARCO-Passage-Ranking")
-def collection_etc(data):
+@dataset(url="https://github.com/microsoft/MSMARCO-Passage-Ranking")
+def collection_etc(data) -> Folder:
     """Documents and some more files"""
-    return {"path": data}
+    return Folder(path=data)
 
 
 @lua
diff --git a/src/datamaestro_text/config/com/sentiment140.py b/src/datamaestro_text/config/com/sentiment140.py
@@ -4,10 +4,6 @@
 from datamaestro.data.ml import Supervised
 from datamaestro.utils import HashCheck
 
-# name: Sentiment140
-# web: http://help.sentiment140.com/for-students/
-
-# description: |
 
 
 @zipdownloader(
diff --git a/src/datamaestro_text/config/edu/upenn/ldc/aquaint.py b/src/datamaestro_text/config/edu/upenn/ldc/aquaint.py
@@ -1,10 +1,9 @@
 """The AQUAINT Corpus, Linguistic Data Consortium (LDC) catalog number LDC2002T31 and ISBN 1-58563-240-6 consists of newswire text data in English, drawn from three sources: the Xinhua News Service (People's Republic of China), the New York Times News Service, and the Associated Press Worldstream News Service. It was prepared by the LDC for the AQUAINT Project, and will be used in official benchmark evaluations conducted by National Institute of Standards and Technology (NIST)."""
 
-from datamaestro.definitions import DatafolderPath
-from datamaestro.data import Base
-from datamaestro_text.data.ir.trec import TipsterCollection
-from datamaestro.definitions import argument, datatasks, datatags, dataset
+from datamaestro.context import DatafolderPath
+from datamaestro.definitions import dataset
 from datamaestro.download.links import links, linkfolder
+from datamaestro_text.data.ir.trec import TipsterCollection
 
 
 URL = "https://catalog.ldc.upenn.edu/LDC2002T31"
diff --git a/src/datamaestro_text/config/gov/nist/trec/tipster.py b/src/datamaestro_text/config/gov/nist/trec/tipster.py
@@ -22,8 +22,8 @@
 from datamaestro.download.links import linkfolder
 from datamaestro.definitions import (
     dataset,
-    DatafolderPath,
 )
+from datamaestro.context import DatafolderPath
 
 # Store meta-information
 TIPSTER = dataset(TipsterCollection, url="https://catalog.ldc.upenn.edu/LDC93T3A")
diff --git a/src/datamaestro_text/data/embeddings.py b/src/datamaestro_text/data/embeddings.py
@@ -1,5 +1,5 @@
-from pathlib import Path
-from datamaestro.data import Base, File, argument
+from experimaestro import Meta
+from datamaestro.data import Base, File
 from datamaestro.definitions import datatags
 import numpy as np
 from typing import Tuple, List
@@ -18,9 +18,9 @@ def load(self) -> Tuple[List[str], np.matrix]:
         raise NotImplementedError()
 
 
-@argument("encoding", str, ignored=True, default="utf-8")
 class WordEmbeddingsText(WordEmbeddings, File):
     """Word embeddings as a text word / values"""
+    encoding: Meta[str] = "utf-8"
 
     def load(self):
         words = []
diff --git a/src/datamaestro_text/data/ir/cord19.py b/src/datamaestro_text/data/ir/cord19.py
@@ -1,7 +1,8 @@
 from csv import DictReader
 from typing import Iterator
 
-from datamaestro.data import File, documentation
+from experimaestro import documentation
+from datamaestro.data import File
 from datamaestro.record import Record
 from datamaestro_text.data.ir import Documents, TopicRecord, Topics, IDItem
 from datamaestro_text.data.ir.formats import (
diff --git a/src/datamaestro_text/data/recommendation.py b/src/datamaestro_text/data/recommendation.py
@@ -1,14 +1,13 @@
-from datamaestro.data import Base, File, argument
+from experimaestro import Param
+from datamaestro.data import Base, File
 import datamaestro.data.csv as csv
 
 
-@argument("ratings", type=File)
 class RatedItems(Base):
-    pass
+    ratings: Param[File]
 
 
-@argument("links", type=csv.Generic)
-@argument("movies", type=csv.Generic)
-@argument("tags", type=csv.Generic)
 class Movielens(RatedItems):
-    pass
+    links: Param[csv.Generic]
+    movies: Param[csv.Generic]
+    tags: Param[csv.Generic]
diff --git a/src/datamaestro_text/data/text.py b/src/datamaestro_text/data/text.py
@@ -1,15 +1,15 @@
-from pathlib import Path
-from datamaestro.data import Base, Folder, File, argument
+from typing import Optional
+from experimaestro import Param
+from datamaestro.data import Base, Folder, File
 from datamaestro.data.ml import Supervised
 
 
-@argument("train", type=Base)
-@argument("test", type=Base, required=False)
-@argument("validation", type=Base, required=False)
 class TrainingText(Supervised):
     """ "A dataset used for training with a train and a test"""
 
-    pass
+    train: Param[Base]
+    test: Param[Optional[Base]] = None
+    validation: Param[Optional[Base]] = None
 
 
 class TextFolder(Folder):

Original file line number	Diff line number	Diff line change
`@@ -22,8 +22,8 @@`
`22`	`22`	`from datamaestro.download.links import linkfolder`
`23`	`23`	`from datamaestro.definitions import (`
`24`	`24`	`dataset,`
`25`		`- DatafolderPath,`
`26`	`25`	`)`
	`26`	`+from datamaestro.context import DatafolderPath`
`27`	`27`
`28`	`28`	`# Store meta-information`
`29`	`29`	`TIPSTER = dataset(TipsterCollection, url="https://catalog.ldc.upenn.edu/LDC93T3A")`