Skip to content

Commit 429d754

Browse files
committed
Updates for new datamaestro version
1 parent ebf1881 commit 429d754

File tree

8 files changed

+29
-33
lines changed

8 files changed

+29
-33
lines changed

src/datamaestro_text/config/com/microsoft/msmarco/passage.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
"""MS MARCO (Microsoft Machine Reading Comprehension) is a large-scale dataset focused on machine reading comprehension, question answering, and passage ranking. A variant of this task will be part of TREC and AFIRM 2019. For updates about TREC 2019, please follow this repository. Passage Reranking task. Task: Given a query q and the 1000 most relevant passages P = p1, p2, p3, ..., p1000, as retrieved by BM25, a successful system is expected to rerank the most relevant passage as high as possible. For this task, not all 1000 relevant items have a human-labeled relevant passage. Evaluation will be done using MRR.
22
3-
**Publication**:
4-
Tri Nguyen, Mir Rosenberg, Xia Song, Jianfeng Gao, Saurabh Tiwary, RanganMajumder, and Li Deng. 2016.
5-
MS MARCO: A Human Generated MAchineReading COmprehension Dataset. In CoCo@NIPS.
3+
**Publication**:
4+
Tri Nguyen, Mir Rosenberg, Xia Song, Jianfeng Gao, Saurabh Tiwary, RanganMajumder, and Li Deng. 2016.
5+
MS MARCO: A Human Generated MAchineReading COmprehension Dataset. In CoCo@NIPS.
66
77
8-
See [https://github.com/microsoft/MSMARCO-Passage-Ranking](https://github.com/microsoft/MSMARCO-Passage-Ranking) for more details
8+
See [https://github.com/microsoft/MSMARCO-Passage-Ranking](https://github.com/microsoft/MSMARCO-Passage-Ranking) for more details
99
"""
1010

1111
from datamaestro.annotations.agreement import useragreement
@@ -35,6 +35,7 @@
3535

3636
# --- Document collection
3737

38+
3839
# TODO: Not ideal, since it would be better to have small versions right away
3940
# instead of downloading the MS MARCO collection again
4041
@lua
@@ -43,10 +44,10 @@
4344
url="https://msmarco.blob.core.windows.net/msmarcoranking/collectionandqueries.tar.gz",
4445
checker=HashCheck("31644046b18952c1386cd4564ba2ae69", md5),
4546
)
46-
@dataset(Folder, url="https://github.com/microsoft/MSMARCO-Passage-Ranking")
47-
def collection_etc(data):
47+
@dataset(url="https://github.com/microsoft/MSMARCO-Passage-Ranking")
48+
def collection_etc(data) -> Folder:
4849
"""Documents and some more files"""
49-
return {"path": data}
50+
return Folder(path=data)
5051

5152

5253
@lua

src/datamaestro_text/config/com/sentiment140.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,6 @@
44
from datamaestro.data.ml import Supervised
55
from datamaestro.utils import HashCheck
66

7-
# name: Sentiment140
8-
# web: http://help.sentiment140.com/for-students/
9-
10-
# description: |
117

128

139
@zipdownloader(

src/datamaestro_text/config/edu/upenn/ldc/aquaint.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,9 @@
11
"""The AQUAINT Corpus, Linguistic Data Consortium (LDC) catalog number LDC2002T31 and ISBN 1-58563-240-6 consists of newswire text data in English, drawn from three sources: the Xinhua News Service (People's Republic of China), the New York Times News Service, and the Associated Press Worldstream News Service. It was prepared by the LDC for the AQUAINT Project, and will be used in official benchmark evaluations conducted by National Institute of Standards and Technology (NIST)."""
22

3-
from datamaestro.definitions import DatafolderPath
4-
from datamaestro.data import Base
5-
from datamaestro_text.data.ir.trec import TipsterCollection
6-
from datamaestro.definitions import argument, datatasks, datatags, dataset
3+
from datamaestro.context import DatafolderPath
4+
from datamaestro.definitions import dataset
75
from datamaestro.download.links import links, linkfolder
6+
from datamaestro_text.data.ir.trec import TipsterCollection
87

98

109
URL = "https://catalog.ldc.upenn.edu/LDC2002T31"

src/datamaestro_text/config/gov/nist/trec/tipster.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,8 @@
2222
from datamaestro.download.links import linkfolder
2323
from datamaestro.definitions import (
2424
dataset,
25-
DatafolderPath,
2625
)
26+
from datamaestro.context import DatafolderPath
2727

2828
# Store meta-information
2929
TIPSTER = dataset(TipsterCollection, url="https://catalog.ldc.upenn.edu/LDC93T3A")

src/datamaestro_text/data/embeddings.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
from pathlib import Path
2-
from datamaestro.data import Base, File, argument
1+
from experimaestro import Meta
2+
from datamaestro.data import Base, File
33
from datamaestro.definitions import datatags
44
import numpy as np
55
from typing import Tuple, List
@@ -18,9 +18,9 @@ def load(self) -> Tuple[List[str], np.matrix]:
1818
raise NotImplementedError()
1919

2020

21-
@argument("encoding", str, ignored=True, default="utf-8")
2221
class WordEmbeddingsText(WordEmbeddings, File):
2322
"""Word embeddings as a text word / values"""
23+
encoding: Meta[str] = "utf-8"
2424

2525
def load(self):
2626
words = []

src/datamaestro_text/data/ir/cord19.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
from csv import DictReader
22
from typing import Iterator
33

4-
from datamaestro.data import File, documentation
4+
from experimaestro import documentation
5+
from datamaestro.data import File
56
from datamaestro.record import Record
67
from datamaestro_text.data.ir import Documents, TopicRecord, Topics, IDItem
78
from datamaestro_text.data.ir.formats import (
Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,13 @@
1-
from datamaestro.data import Base, File, argument
1+
from experimaestro import Param
2+
from datamaestro.data import Base, File
23
import datamaestro.data.csv as csv
34

45

5-
@argument("ratings", type=File)
66
class RatedItems(Base):
7-
pass
7+
ratings: Param[File]
88

99

10-
@argument("links", type=csv.Generic)
11-
@argument("movies", type=csv.Generic)
12-
@argument("tags", type=csv.Generic)
1310
class Movielens(RatedItems):
14-
pass
11+
links: Param[csv.Generic]
12+
movies: Param[csv.Generic]
13+
tags: Param[csv.Generic]

src/datamaestro_text/data/text.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,15 @@
1-
from pathlib import Path
2-
from datamaestro.data import Base, Folder, File, argument
1+
from typing import Optional
2+
from experimaestro import Param
3+
from datamaestro.data import Base, Folder, File
34
from datamaestro.data.ml import Supervised
45

56

6-
@argument("train", type=Base)
7-
@argument("test", type=Base, required=False)
8-
@argument("validation", type=Base, required=False)
97
class TrainingText(Supervised):
108
"""A dataset used for training with a train and a test"""
119

12-
pass
10+
train: Param[Base]
11+
test: Param[Optional[Base]] = None
12+
validation: Param[Optional[Base]] = None
1313

1414

1515
class TextFolder(Folder):

0 commit comments

Comments
 (0)