Skip to content

Commit 9c86bf2

Browse files
authored
Merge pull request #6 from experimaestro/adding_ikat
Adding ikat and fixing calls to `Config`
2 parents 9d2546e + f8892d6 commit 9c86bf2

File tree

4 files changed

+179
-5
lines changed

4 files changed

+179
-5
lines changed

docs/source/api/conversation.rst

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,13 @@ Contextual query reformulation
3434
.. autoclass:: ContextualizedRewrittenQuery
3535
:members:
3636

37+
CANARD Dataset
38+
3739
.. autoxpmconfig:: datamaestro_text.data.conversation.canard.CanardDataset
3840
:members: iter
3941

42+
OrConvQA Dataset
43+
4044
.. autoxpmconfig:: datamaestro_text.data.conversation.orconvqa.OrConvQADataset
4145
:members: iter
4246

@@ -46,10 +50,22 @@ Contextual query reformulation
4650
.. autoclass:: datamaestro_text.data.conversation.orconvqa.OrConvQADatasetHistoryEntry
4751
:members:
4852

53+
QReCC Dataset
4954

50-
51-
.. autoclass:: datamaestro_text.data.conversation.orconvqa.QReCCDatasetEntry
55+
.. autoclass:: datamaestro_text.data.conversation.qrecc.QReCCDatasetEntry
5256
:members:
5357

5458
.. autoxpmconfig:: datamaestro_text.data.conversation.qrecc.QReCCDataset
5559
:members: iter
60+
61+
62+
iKAT Dataset
63+
64+
.. autoclass:: datamaestro_text.data.conversation.ikat.IkatDatasetEntry
65+
:members:
66+
67+
.. autoclass:: datamaestro_text.data.conversation.ikat.IkatConversationEntry
68+
:members:
69+
70+
.. autoxpmconfig:: datamaestro_text.data.conversation.ikat.IkatDataset
71+
:members: iter

src/datamaestro_text/config/com/github/apple/ml-qrecc.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,9 +39,9 @@ def main(data: Path) -> Supervised[QReCCDataset, None, QReCCDataset]:
3939
answering that includes the individual subtasks of question rewriting,
4040
passage retrieval and reading comprehension
4141
"""
42-
return Supervised(
43-
train=QReCCDataset(path=data / "qrecc_train.json"),
44-
test=QReCCDataset(path=data / "qrecc_test.json"),
42+
return Supervised.C(
43+
train=QReCCDataset.C(path=data / "qrecc_train.json"),
44+
test=QReCCDataset.C(path=data / "qrecc_test.json"),
4545
)
4646

4747

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# See documentation on https://datamaestro.readthedocs.io
2+
3+
from datamaestro.definitions import datatasks, datatags, dataset
4+
from datamaestro.data.ml import Supervised
5+
from datamaestro.data import Base
6+
7+
from datamaestro.utils import HashCheck
8+
from datamaestro.download.single import filedownloader
9+
from datamaestro_text.data.conversation.ikat import IkatDatasetEntry, IkatDataset
10+
from datamaestro_text.datasets.irds.data import (
11+
SimpleJsonDocument,
12+
LZ4JSONLDocumentStore,
13+
)
14+
import logging
15+
16+
@datatags("conversation", "context", "query")
@datatasks("query rewriting")
@filedownloader(
    "test.json",
    "https://raw.githubusercontent.com/irlabamsterdam/iKAT/refs/heads/main/2025/data/2025_test_topics.json",
    checker=HashCheck("16f8444a8d0a8dfe0090f478f185a63c"),
)
@dataset(
    Base,
    url="https://github.com/irlabamsterdam/iKAT/tree/main/2025",
)
def main(test) -> IkatDataset:
    """Question-in-context rewriting

    iKAT is a test dataset for question-in-context rewriting that consists of
    questions each given in a dialog context together with a context-independent
    rewriting of the question.
    One of the special features of iKAT is that it includes a Personal Text
    Knowledge Base (PTKB) describing the user.
    """
    # `test` is the resource declared by the `filedownloader` decorator above
    # (the downloaded "test.json" topics file).
    # The annotation reflects what is actually returned: a single
    # `IkatDataset`, not a `Supervised` train/test container.
    logging.info("Creating iKAT dataset from %s", test)
    return IkatDataset.C(path=test)
Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
from typing import Iterator, List, Optional
2+
from attr import define, field
3+
import json
4+
import logging
5+
from datamaestro.data import File
6+
from datamaestro.record import Record
7+
8+
from datamaestro_text.data.ir.base import (
9+
IDItem,
10+
SimpleTextItem,
11+
)
12+
13+
14+
from .base import (
15+
AnswerDocumentURL,
16+
AnswerEntry,
17+
ConversationTree,
18+
EntryType,
19+
SimpleDecontextualizedItem,
20+
SingleConversationTree,
21+
)
22+
from . import ConversationDataset
23+
24+
25+
26+
@define(kw_only=True)
class IkatConversationEntry:
    """One turn of a conversation: a user query and the system answer"""

    turn_id: int
    """Number of this turn within its conversation"""

    user_utterance: str
    """The query as issued by the user at this turn"""

    resolved_utterance: str
    """The manually rewritten version of the query"""

    response: str
    """The answer given by the system to the query"""

    relevant_ptkbs: List[str]
    """The personal knowledge base items that are relevant for the query"""

    citations: List[str]
    """The citations supporting the response"""
47+
48+
49+
def _as_conversation_entries(items):
    """Builds `IkatConversationEntry` objects from raw turn dictionaries

    Items that are not dictionaries (e.g. entries that are already built) are
    kept as they are.
    """
    # Deliberately left without type annotations so that the signature attrs
    # derives for `IkatDatasetEntry.__init__` is the same as with an inline
    # lambda converter.
    return [
        IkatConversationEntry(**item) if isinstance(item, dict) else item
        for item in items
    ]


@define(kw_only=True)
class IkatDatasetEntry:
    """A conversation, i.e. a sequence of turns with its metadata"""

    number: str
    """Conversation ID"""

    title: str
    """Title of the conversation"""

    ptkb: str
    """The personal knowledge base associated with the user"""

    responses: List[IkatConversationEntry] = field(
        converter=_as_conversation_entries
    )
    """The turns of the conversation (raw dictionaries are converted into
    `IkatConversationEntry` objects)"""
66+
67+
68+
class IkatDataset(ConversationDataset, File):
    """Conversations stored in a single JSON file (a list of entries)"""

    def entries(self) -> Iterator[IkatDatasetEntry]:
        """Reads all conversation entries from the dataset file.

        The whole file is parsed before the iterator is returned, so that
        I/O and parsing errors are raised when this method is called.
        An empty file content (empty JSON list) yields an empty iterator.
        """
        # JSON files are UTF-8 encoded: do not depend on the platform default
        with self.path.open("rt", encoding="utf-8") as fp:
            raw_data = json.load(fp)

        logging.debug("Loaded %d entries from %s", len(raw_data), self.path)
        if raw_data:
            logging.debug("raw data has keys %s", raw_data[0].keys())

        # Top-level keys are lower-cased to match the attribute names of
        # `IkatDatasetEntry`
        processed_data = [
            IkatDatasetEntry(**{key.lower(): value for key, value in entry.items()})
            for entry in raw_data
        ]

        if processed_data:
            logging.debug("First parsed data sample: %s", processed_data[0])
        return iter(processed_data)

    def __iter__(self) -> Iterator[ConversationTree]:
        """Yields one conversation tree per dataset entry.

        Each turn contributes two records to the history: the user query
        (identified by ``<conversation number>#<turn id>``) followed by the
        system answer.
        """
        for entry in self.entries():
            history: List[Record] = []

            for turn in entry.responses:
                query_id = f"{entry.number}#{turn.turn_id}"

                # USER QUERY record
                history.append(
                    Record(
                        IDItem(query_id),
                        SimpleTextItem(turn.user_utterance),
                        SimpleDecontextualizedItem(turn.resolved_utterance),
                        EntryType.USER_QUERY,
                    )
                )

                # SYSTEM ANSWER record
                # NOTE: `turn.relevant_ptkbs` and `turn.citations` are not
                # exported in the records for now
                history.append(
                    Record(
                        AnswerEntry(turn.response),
                        EntryType.SYSTEM_ANSWER,
                    )
                )

            # The history is handed over with the most recent record first
            # (presumably what `SingleConversationTree` expects -- TODO confirm)
            history.reverse()
            yield SingleConversationTree(entry.number, history)
120+

0 commit comments

Comments
 (0)